Diffstat (limited to 'contrib/llvm-project/llvm/lib'): 360 files changed, 8762 insertions, 4120 deletions
diff --git a/contrib/llvm-project/llvm/lib/Analysis/BranchProbabilityInfo.cpp b/contrib/llvm-project/llvm/lib/Analysis/BranchProbabilityInfo.cpp index 33fdc8b628c5..856d7e90acb2 100644 --- a/contrib/llvm-project/llvm/lib/Analysis/BranchProbabilityInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Analysis/BranchProbabilityInfo.cpp @@ -104,12 +104,67 @@ static const uint32_t LBH_NONTAKEN_WEIGHT = 4; /// All reachable probability will proportionally share the remaining part. static const BranchProbability UR_TAKEN_PROB = BranchProbability::getRaw(1); +/// Heuristics and lookup tables for non-loop branches: +/// Pointer Heuristics (PH) static const uint32_t PH_TAKEN_WEIGHT = 20; static const uint32_t PH_NONTAKEN_WEIGHT = 12; +static const BranchProbability + PtrTakenProb(PH_TAKEN_WEIGHT, PH_TAKEN_WEIGHT + PH_NONTAKEN_WEIGHT); +static const BranchProbability + PtrUntakenProb(PH_NONTAKEN_WEIGHT, PH_TAKEN_WEIGHT + PH_NONTAKEN_WEIGHT); + +using ProbabilityList = SmallVector<BranchProbability>; +using ProbabilityTable = std::map<CmpInst::Predicate, ProbabilityList>; + +/// Pointer comparisons: +static const ProbabilityTable PointerTable{ + {ICmpInst::ICMP_NE, {PtrTakenProb, PtrUntakenProb}}, /// p != q -> Likely + {ICmpInst::ICMP_EQ, {PtrUntakenProb, PtrTakenProb}}, /// p == q -> Unlikely +}; +/// Zero Heuristics (ZH) static const uint32_t ZH_TAKEN_WEIGHT = 20; static const uint32_t ZH_NONTAKEN_WEIGHT = 12; +static const BranchProbability + ZeroTakenProb(ZH_TAKEN_WEIGHT, ZH_TAKEN_WEIGHT + ZH_NONTAKEN_WEIGHT); +static const BranchProbability + ZeroUntakenProb(ZH_NONTAKEN_WEIGHT, ZH_TAKEN_WEIGHT + ZH_NONTAKEN_WEIGHT); + +/// Integer compares with 0: +static const ProbabilityTable ICmpWithZeroTable{ + {CmpInst::ICMP_EQ, {ZeroUntakenProb, ZeroTakenProb}}, /// X == 0 -> Unlikely + {CmpInst::ICMP_NE, {ZeroTakenProb, ZeroUntakenProb}}, /// X != 0 -> Likely + {CmpInst::ICMP_SLT, {ZeroUntakenProb, ZeroTakenProb}}, /// X < 0 -> Unlikely + {CmpInst::ICMP_SGT, {ZeroTakenProb, ZeroUntakenProb}}, /// X > 0 -> Likely +}; + +/// Integer compares with -1: +static const ProbabilityTable ICmpWithMinusOneTable{ + {CmpInst::ICMP_EQ, {ZeroUntakenProb, ZeroTakenProb}}, /// X == -1 -> Unlikely + {CmpInst::ICMP_NE, {ZeroTakenProb, ZeroUntakenProb}}, /// X != -1 -> Likely + // InstCombine canonicalizes X >= 0 into X > -1 + {CmpInst::ICMP_SGT, {ZeroTakenProb, ZeroUntakenProb}}, /// X >= 0 -> Likely +}; + +/// Integer compares with 1: +static const ProbabilityTable ICmpWithOneTable{ + // InstCombine canonicalizes X <= 0 into X < 1 + {CmpInst::ICMP_SLT, {ZeroUntakenProb, ZeroTakenProb}}, /// X <= 0 -> Unlikely +}; + +/// strcmp and similar functions return zero, negative, or positive, if the +/// first string is equal, less, or greater than the second. We consider it +/// likely that the strings are not equal, so a comparison with zero is +/// probably false, but also a comparison with any other number is also +/// probably false given that what exactly is returned for nonzero values is +/// not specified. Any kind of comparison other than equality we know +/// nothing about. +static const ProbabilityTable ICmpWithLibCallTable{ + {CmpInst::ICMP_EQ, {ZeroUntakenProb, ZeroTakenProb}}, + {CmpInst::ICMP_NE, {ZeroTakenProb, ZeroUntakenProb}}, +}; +// Floating-Point Heuristics (FPH) static const uint32_t FPH_TAKEN_WEIGHT = 20; static const uint32_t FPH_NONTAKEN_WEIGHT = 12; @@ -120,6 +175,21 @@ static const uint32_t FPH_ORD_WEIGHT = 1024 * 1024 - 1; /// exceptional case, so the result is unlikely. 
static const uint32_t FPH_UNO_WEIGHT = 1; +static const BranchProbability FPOrdTakenProb(FPH_ORD_WEIGHT, + FPH_ORD_WEIGHT + FPH_UNO_WEIGHT); +static const BranchProbability + FPOrdUntakenProb(FPH_UNO_WEIGHT, FPH_ORD_WEIGHT + FPH_UNO_WEIGHT); +static const BranchProbability + FPTakenProb(FPH_TAKEN_WEIGHT, FPH_TAKEN_WEIGHT + FPH_NONTAKEN_WEIGHT); +static const BranchProbability + FPUntakenProb(FPH_NONTAKEN_WEIGHT, FPH_TAKEN_WEIGHT + FPH_NONTAKEN_WEIGHT); + +/// Floating-Point compares: +static const ProbabilityTable FCmpTable{ + {FCmpInst::FCMP_ORD, {FPOrdTakenProb, FPOrdUntakenProb}}, /// !isnan -> Likely + {FCmpInst::FCMP_UNO, {FPOrdUntakenProb, FPOrdTakenProb}}, /// isnan -> Unlikely +}; + /// Set of dedicated "absolute" execution weights for a block. These weights are /// meaningful relative to each other and their derivatives only. enum class BlockExecWeight : std::uint32_t { @@ -468,21 +538,10 @@ bool BranchProbabilityInfo::calcPointerHeuristics(const BasicBlock *BB) { assert(CI->getOperand(1)->getType()->isPointerTy()); - BranchProbability TakenProb(PH_TAKEN_WEIGHT, - PH_TAKEN_WEIGHT + PH_NONTAKEN_WEIGHT); - BranchProbability UntakenProb(PH_NONTAKEN_WEIGHT, - PH_TAKEN_WEIGHT + PH_NONTAKEN_WEIGHT); - - // p != 0 -> isProb = true - // p == 0 -> isProb = false - // p != q -> isProb = true - // p == q -> isProb = false; - bool isProb = CI->getPredicate() == ICmpInst::ICMP_NE; - if (!isProb) - std::swap(TakenProb, UntakenProb); - - setEdgeProbability( - BB, SmallVector<BranchProbability, 2>({TakenProb, UntakenProb})); + auto Search = PointerTable.find(CI->getPredicate()); + if (Search == PointerTable.end()) + return false; + setEdgeProbability(BB, Search->second); return true; } @@ -949,86 +1008,33 @@ bool BranchProbabilityInfo::calcZeroHeuristics(const BasicBlock *BB, if (Function *CalledFn = Call->getCalledFunction()) TLI->getLibFunc(*CalledFn, Func); - bool isProb; + ProbabilityTable::const_iterator Search; if (Func == LibFunc_strcasecmp || Func == LibFunc_strcmp || Func == LibFunc_strncasecmp || Func == LibFunc_strncmp || Func == LibFunc_memcmp || Func == LibFunc_bcmp) { - // strcmp and similar functions return zero, negative, or positive, if the - // first string is equal, less, or greater than the second. We consider it - // likely that the strings are not equal, so a comparison with zero is - // probably false, but also a comparison with any other number is also - // probably false given that what exactly is returned for nonzero values is - // not specified. Any kind of comparison other than equality we know - // nothing about. 
- switch (CI->getPredicate()) { - case CmpInst::ICMP_EQ: - isProb = false; - break; - case CmpInst::ICMP_NE: - isProb = true; - break; - default: + Search = ICmpWithLibCallTable.find(CI->getPredicate()); + if (Search == ICmpWithLibCallTable.end()) return false; - } } else if (CV->isZero()) { - switch (CI->getPredicate()) { - case CmpInst::ICMP_EQ: - // X == 0 -> Unlikely - isProb = false; - break; - case CmpInst::ICMP_NE: - // X != 0 -> Likely - isProb = true; - break; - case CmpInst::ICMP_SLT: - // X < 0 -> Unlikely - isProb = false; - break; - case CmpInst::ICMP_SGT: - // X > 0 -> Likely - isProb = true; - break; - default: + Search = ICmpWithZeroTable.find(CI->getPredicate()); + if (Search == ICmpWithZeroTable.end()) + return false; + } else if (CV->isOne()) { + Search = ICmpWithOneTable.find(CI->getPredicate()); + if (Search == ICmpWithOneTable.end()) return false; - } - } else if (CV->isOne() && CI->getPredicate() == CmpInst::ICMP_SLT) { - // InstCombine canonicalizes X <= 0 into X < 1. - // X <= 0 -> Unlikely - isProb = false; } else if (CV->isMinusOne()) { - switch (CI->getPredicate()) { - case CmpInst::ICMP_EQ: - // X == -1 -> Unlikely - isProb = false; - break; - case CmpInst::ICMP_NE: - // X != -1 -> Likely - isProb = true; - break; - case CmpInst::ICMP_SGT: - // InstCombine canonicalizes X >= 0 into X > -1. - // X >= 0 -> Likely - isProb = true; - break; - default: + Search = ICmpWithMinusOneTable.find(CI->getPredicate()); + if (Search == ICmpWithMinusOneTable.end()) return false; - } } else { return false; } - BranchProbability TakenProb(ZH_TAKEN_WEIGHT, - ZH_TAKEN_WEIGHT + ZH_NONTAKEN_WEIGHT); - BranchProbability UntakenProb(ZH_NONTAKEN_WEIGHT, - ZH_TAKEN_WEIGHT + ZH_NONTAKEN_WEIGHT); - if (!isProb) - std::swap(TakenProb, UntakenProb); - - setEdgeProbability( - BB, SmallVector<BranchProbability, 2>({TakenProb, UntakenProb})); + setEdgeProbability(BB, Search->second); return true; } @@ -1042,34 +1048,21 @@ bool BranchProbabilityInfo::calcFloatingPointHeuristics(const BasicBlock *BB) { if (!FCmp) return false; - uint32_t TakenWeight = FPH_TAKEN_WEIGHT; - uint32_t NontakenWeight = FPH_NONTAKEN_WEIGHT; - bool isProb; + ProbabilityList ProbList; if (FCmp->isEquality()) { - // f1 == f2 -> Unlikely - // f1 != f2 -> Likely - isProb = !FCmp->isTrueWhenEqual(); - } else if (FCmp->getPredicate() == FCmpInst::FCMP_ORD) { - // !isnan -> Likely - isProb = true; - TakenWeight = FPH_ORD_WEIGHT; - NontakenWeight = FPH_UNO_WEIGHT; - } else if (FCmp->getPredicate() == FCmpInst::FCMP_UNO) { - // isnan -> Unlikely - isProb = false; - TakenWeight = FPH_ORD_WEIGHT; - NontakenWeight = FPH_UNO_WEIGHT; + ProbList = !FCmp->isTrueWhenEqual() ? 
+ // f1 == f2 -> Unlikely + ProbabilityList({FPTakenProb, FPUntakenProb}) : + // f1 != f2 -> Likely + ProbabilityList({FPUntakenProb, FPTakenProb}); } else { - return false; + auto Search = FCmpTable.find(FCmp->getPredicate()); + if (Search == FCmpTable.end()) + return false; + ProbList = Search->second; } - BranchProbability TakenProb(TakenWeight, TakenWeight + NontakenWeight); - BranchProbability UntakenProb(NontakenWeight, TakenWeight + NontakenWeight); - if (!isProb) - std::swap(TakenProb, UntakenProb); - - setEdgeProbability( - BB, SmallVector<BranchProbability, 2>({TakenProb, UntakenProb})); + setEdgeProbability(BB, ProbList); return true; } diff --git a/contrib/llvm-project/llvm/lib/Analysis/DivergenceAnalysis.cpp b/contrib/llvm-project/llvm/lib/Analysis/DivergenceAnalysis.cpp index 3634526370f5..7426d0c07592 100644 --- a/contrib/llvm-project/llvm/lib/Analysis/DivergenceAnalysis.cpp +++ b/contrib/llvm-project/llvm/lib/Analysis/DivergenceAnalysis.cpp @@ -24,12 +24,12 @@ // divergent can help the compiler to selectively run these optimizations. // // This implementation is derived from the Vectorization Analysis of the -// Region Vectorizer (RV). That implementation in turn is based on the approach -// described in +// Region Vectorizer (RV). The analysis is based on the approach described in // -// Improving Performance of OpenCL on CPUs -// Ralf Karrenberg and Sebastian Hack -// CC '12 +// An abstract interpretation for SPMD divergence +// on reducible control flow graphs. +// Julian Rosemann, Simon Moll and Sebastian Hack +// POPL '21 // // This implementation is generic in the sense that it does // not itself identify original sources of divergence. diff --git a/contrib/llvm-project/llvm/lib/Analysis/IRSimilarityIdentifier.cpp b/contrib/llvm-project/llvm/lib/Analysis/IRSimilarityIdentifier.cpp index f22c6aa04f5e..2ec6cbeabda2 100644 --- a/contrib/llvm-project/llvm/lib/Analysis/IRSimilarityIdentifier.cpp +++ b/contrib/llvm-project/llvm/lib/Analysis/IRSimilarityIdentifier.cpp @@ -820,7 +820,7 @@ void IRSimilarityIdentifier::populateMapper( /// subsequence from the \p InstrList, and create an IRSimilarityCandidate from /// the IRInstructionData in subsequence. /// -/// \param [in] Mapper - The instruction mapper for sanity checks. +/// \param [in] Mapper - The instruction mapper for basic correctness checks. /// \param [in] InstrList - The vector that holds the instruction data. /// \param [in] IntegerMapping - The vector that holds the mapped integers. /// \param [out] CandsForRepSubstring - The vector to store the generated diff --git a/contrib/llvm-project/llvm/lib/Analysis/IVDescriptors.cpp b/contrib/llvm-project/llvm/lib/Analysis/IVDescriptors.cpp index c4b7239b43ab..cfe910df4e91 100644 --- a/contrib/llvm-project/llvm/lib/Analysis/IVDescriptors.cpp +++ b/contrib/llvm-project/llvm/lib/Analysis/IVDescriptors.cpp @@ -81,6 +81,7 @@ bool RecurrenceDescriptor::isArithmeticRecurrenceKind(RecurKind Kind) { case RecurKind::Mul: case RecurKind::FAdd: case RecurKind::FMul: + case RecurKind::FMulAdd: return true; } return false; @@ -194,21 +195,28 @@ static void collectCastsToIgnore(Loop *TheLoop, Instruction *Exit, // vectorizing floating point operations without unsafe math. static bool checkOrderedReduction(RecurKind Kind, Instruction *ExactFPMathInst, Instruction *Exit, PHINode *Phi) { - // Currently only FAdd is supported - if (Kind != RecurKind::FAdd) + // Currently only FAdd and FMulAdd are supported. 
+ if (Kind != RecurKind::FAdd && Kind != RecurKind::FMulAdd) return false; - // Ensure the exit instruction is an FAdd, and that it only has one user - // other than the reduction PHI - if (Exit->getOpcode() != Instruction::FAdd || Exit->hasNUsesOrMore(3) || - Exit != ExactFPMathInst) + if (Kind == RecurKind::FAdd && Exit->getOpcode() != Instruction::FAdd) + return false; + + if (Kind == RecurKind::FMulAdd && + !RecurrenceDescriptor::isFMulAddIntrinsic(Exit)) + return false; + + // Ensure the exit instruction has only one user other than the reduction PHI + if (Exit != ExactFPMathInst || Exit->hasNUsesOrMore(3)) return false; // The only pattern accepted is the one in which the reduction PHI // is used as one of the operands of the exit instruction - auto *LHS = Exit->getOperand(0); - auto *RHS = Exit->getOperand(1); - if (LHS != Phi && RHS != Phi) + auto *Op0 = Exit->getOperand(0); + auto *Op1 = Exit->getOperand(1); + if (Kind == RecurKind::FAdd && Op0 != Phi && Op1 != Phi) + return false; + if (Kind == RecurKind::FMulAdd && Exit->getOperand(2) != Phi) return false; LLVM_DEBUG(dbgs() << "LV: Found an ordered reduction: Phi: " << *Phi @@ -389,6 +397,12 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind, for (User *U : Cur->users()) { Instruction *UI = cast<Instruction>(U); + // If the user is a call to llvm.fmuladd then the instruction can only be + // the final operand. + if (isFMulAddIntrinsic(UI)) + if (Cur == UI->getOperand(0) || Cur == UI->getOperand(1)) + return false; + // Check if we found the exit user. BasicBlock *Parent = UI->getParent(); if (!TheLoop->contains(Parent)) { @@ -710,6 +724,9 @@ RecurrenceDescriptor::isRecurrenceInstr(Loop *L, PHINode *OrigPhi, I->hasNoSignedZeros())) && isFPMinMaxRecurrenceKind(Kind))) return isMinMaxPattern(I, Kind, Prev); + else if (isFMulAddIntrinsic(I)) + return InstDesc(Kind == RecurKind::FMulAdd, I, + I->hasAllowReassoc() ? nullptr : I); return InstDesc(false, I); } } @@ -804,6 +821,11 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop, << " PHI." << *Phi << "\n"); return true; } + if (AddReductionVar(Phi, RecurKind::FMulAdd, TheLoop, FMF, RedDes, DB, AC, + DT)) { + LLVM_DEBUG(dbgs() << "Found an FMulAdd reduction PHI." << *Phi << "\n"); + return true; + } // Not a reduction of known type. return false; } @@ -927,6 +949,7 @@ Value *RecurrenceDescriptor::getRecurrenceIdentity(RecurKind K, Type *Tp, case RecurKind::FMul: // Multiplying a number by 1 does not change it. return ConstantFP::get(Tp, 1.0L); + case RecurKind::FMulAdd: case RecurKind::FAdd: // Adding zero to a number does not change it. // FIXME: Ideally we should not need to check FMF for FAdd and should always @@ -974,6 +997,7 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) { return Instruction::Xor; case RecurKind::FMul: return Instruction::FMul; + case RecurKind::FMulAdd: case RecurKind::FAdd: return Instruction::FAdd; case RecurKind::SMax: @@ -1032,6 +1056,10 @@ RecurrenceDescriptor::getReductionOpChain(PHINode *Phi, Loop *L) const { return SelectPatternResult::isMinOrMax( matchSelectPattern(Cur, LHS, RHS).Flavor); } + // Recognize a call to the llvm.fmuladd intrinsic. 
+ if (isFMulAddIntrinsic(Cur)) + return true; + return Cur->getOpcode() == RedOp; }; diff --git a/contrib/llvm-project/llvm/lib/Analysis/InstructionSimplify.cpp b/contrib/llvm-project/llvm/lib/Analysis/InstructionSimplify.cpp index 864eeea4f8bf..22d2ce11cc90 100644 --- a/contrib/llvm-project/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/contrib/llvm-project/llvm/lib/Analysis/InstructionSimplify.cpp @@ -2180,6 +2180,55 @@ Value *llvm::SimplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { return ::SimplifyAndInst(Op0, Op1, Q, RecursionLimit); } +static Value *simplifyOrLogic(Value *X, Value *Y) { + assert(X->getType() == Y->getType() && "Expected same type for 'or' ops"); + Type *Ty = X->getType(); + + // X | ~X --> -1 + if (match(Y, m_Not(m_Specific(X)))) + return ConstantInt::getAllOnesValue(Ty); + + // X | ~(X & ?) = -1 + if (match(Y, m_Not(m_c_And(m_Specific(X), m_Value())))) + return ConstantInt::getAllOnesValue(Ty); + + // X | (X & ?) --> X + if (match(Y, m_c_And(m_Specific(X), m_Value()))) + return X; + + Value *A, *B; + + // (A & ~B) | (A ^ B) --> A ^ B + // (~B & A) | (A ^ B) --> A ^ B + // (A & ~B) | (B ^ A) --> B ^ A + // (~B & A) | (B ^ A) --> B ^ A + if (match(X, m_c_And(m_Value(A), m_Not(m_Value(B)))) && + match(Y, m_c_Xor(m_Specific(A), m_Specific(B)))) + return Y; + + // (~A ^ B) | (A & B) --> ~A ^ B + // (B ^ ~A) | (A & B) --> B ^ ~A + // (~A ^ B) | (B & A) --> ~A ^ B + // (B ^ ~A) | (B & A) --> B ^ ~A + if (match(X, m_c_Xor(m_Not(m_Value(A)), m_Value(B))) && + match(Y, m_c_And(m_Specific(A), m_Specific(B)))) + return X; + + // (A ^ B) | (A | B) --> A | B + // (A ^ B) | (B | A) --> B | A + if (match(X, m_Xor(m_Value(A), m_Value(B))) && + match(Y, m_c_Or(m_Specific(A), m_Specific(B)))) + return Y; + + // ~(A ^ B) | (A | B) --> -1 + // ~(A ^ B) | (B | A) --> -1 + if (match(X, m_Not(m_Xor(m_Value(A), m_Value(B)))) && + match(Y, m_c_Or(m_Specific(A), m_Specific(B)))) + return ConstantInt::getAllOnesValue(Ty); + + return nullptr; +} + /// Given operands for an Or, see if we can fold the result. /// If not, this returns null. static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, @@ -2202,81 +2251,15 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, if (Op0 == Op1 || match(Op1, m_Zero())) return Op0; - // A | ~A = ~A | A = -1 - if (match(Op0, m_Not(m_Specific(Op1))) || - match(Op1, m_Not(m_Specific(Op0)))) - return Constant::getAllOnesValue(Op0->getType()); - - // (A & ?) | A = A - if (match(Op0, m_c_And(m_Specific(Op1), m_Value()))) - return Op1; - - // A | (A & ?) = A - if (match(Op1, m_c_And(m_Specific(Op0), m_Value()))) - return Op0; - - // ~(A & ?) | A = -1 - if (match(Op0, m_Not(m_c_And(m_Specific(Op1), m_Value())))) - return Constant::getAllOnesValue(Op1->getType()); - - // A | ~(A & ?) = -1 - if (match(Op1, m_Not(m_c_And(m_Specific(Op0), m_Value())))) - return Constant::getAllOnesValue(Op0->getType()); + if (Value *R = simplifyOrLogic(Op0, Op1)) + return R; + if (Value *R = simplifyOrLogic(Op1, Op0)) + return R; if (Value *V = simplifyLogicOfAddSub(Op0, Op1, Instruction::Or)) return V; Value *A, *B, *NotA; - // (A & ~B) | (A ^ B) -> (A ^ B) - // (~B & A) | (A ^ B) -> (A ^ B) - // (A & ~B) | (B ^ A) -> (B ^ A) - // (~B & A) | (B ^ A) -> (B ^ A) - if (match(Op1, m_Xor(m_Value(A), m_Value(B))) && - (match(Op0, m_c_And(m_Specific(A), m_Not(m_Specific(B)))) || - match(Op0, m_c_And(m_Not(m_Specific(A)), m_Specific(B))))) - return Op1; - - // Commute the 'or' operands. 
- // (A ^ B) | (A & ~B) -> (A ^ B) - // (A ^ B) | (~B & A) -> (A ^ B) - // (B ^ A) | (A & ~B) -> (B ^ A) - // (B ^ A) | (~B & A) -> (B ^ A) - if (match(Op0, m_Xor(m_Value(A), m_Value(B))) && - (match(Op1, m_c_And(m_Specific(A), m_Not(m_Specific(B)))) || - match(Op1, m_c_And(m_Not(m_Specific(A)), m_Specific(B))))) - return Op0; - - // (A & B) | (~A ^ B) -> (~A ^ B) - // (B & A) | (~A ^ B) -> (~A ^ B) - // (A & B) | (B ^ ~A) -> (B ^ ~A) - // (B & A) | (B ^ ~A) -> (B ^ ~A) - if (match(Op0, m_And(m_Value(A), m_Value(B))) && - (match(Op1, m_c_Xor(m_Specific(A), m_Not(m_Specific(B)))) || - match(Op1, m_c_Xor(m_Not(m_Specific(A)), m_Specific(B))))) - return Op1; - - // Commute the 'or' operands. - // (~A ^ B) | (A & B) -> (~A ^ B) - // (~A ^ B) | (B & A) -> (~A ^ B) - // (B ^ ~A) | (A & B) -> (B ^ ~A) - // (B ^ ~A) | (B & A) -> (B ^ ~A) - if (match(Op1, m_And(m_Value(A), m_Value(B))) && - (match(Op0, m_c_Xor(m_Specific(A), m_Not(m_Specific(B)))) || - match(Op0, m_c_Xor(m_Not(m_Specific(A)), m_Specific(B))))) - return Op0; - - // (A | B) | (A ^ B) --> A | B - // (B | A) | (A ^ B) --> B | A - if (match(Op1, m_Xor(m_Value(A), m_Value(B))) && - match(Op0, m_c_Or(m_Specific(A), m_Specific(B)))) - return Op0; - - // Commute the outer 'or' operands. - // (A ^ B) | (A | B) --> A | B - // (A ^ B) | (B | A) --> B | A - if (match(Op0, m_Xor(m_Value(A), m_Value(B))) && - match(Op1, m_c_Or(m_Specific(A), m_Specific(B)))) - return Op1; // (~A & B) | ~(A | B) --> ~A // (~A & B) | ~(B | A) --> ~A @@ -2414,6 +2397,30 @@ static Value *SimplifyXorInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, match(Op1, m_Not(m_Specific(Op0)))) return Constant::getAllOnesValue(Op0->getType()); + auto foldAndOrNot = [](Value *X, Value *Y) -> Value * { + Value *A, *B; + // (~A & B) ^ (A | B) --> A -- There are 8 commuted variants. + if (match(X, m_c_And(m_Not(m_Value(A)), m_Value(B))) && + match(Y, m_c_Or(m_Specific(A), m_Specific(B)))) + return A; + + // (~A | B) ^ (A & B) --> ~A -- There are 8 commuted variants. + // The 'not' op must contain a complete -1 operand (no undef elements for + // vector) for the transform to be safe. + Value *NotA; + if (match(X, + m_c_Or(m_CombineAnd(m_NotForbidUndef(m_Value(A)), m_Value(NotA)), + m_Value(B))) && + match(Y, m_c_And(m_Specific(A), m_Specific(B)))) + return NotA; + + return nullptr; + }; + if (Value *R = foldAndOrNot(Op0, Op1)) + return R; + if (Value *R = foldAndOrNot(Op1, Op0)) + return R; + if (Value *V = simplifyLogicOfAddSub(Op0, Op1, Instruction::Xor)) return V; @@ -2935,8 +2942,10 @@ static Value *simplifyICmpWithBinOpOnLHS( return getFalse(ITy); } - // x >> y <=u x - // x udiv y <=u x. + // x >>u y <=u x --> true. + // x >>u y >u x --> false. + // x udiv y <=u x --> true. + // x udiv y >u x --> false. if (match(LBO, m_LShr(m_Specific(RHS), m_Value())) || match(LBO, m_UDiv(m_Specific(RHS), m_Value()))) { // icmp pred (X op Y), X @@ -2946,6 +2955,37 @@ static Value *simplifyICmpWithBinOpOnLHS( return getTrue(ITy); } + // If x is nonzero: + // x >>u C <u x --> true for C != 0. + // x >>u C != x --> true for C != 0. + // x >>u C >=u x --> false for C != 0. + // x >>u C == x --> false for C != 0. + // x udiv C <u x --> true for C != 1. + // x udiv C != x --> true for C != 1. + // x udiv C >=u x --> false for C != 1. + // x udiv C == x --> false for C != 1. 
+ // TODO: allow non-constant shift amount/divisor + const APInt *C; + if ((match(LBO, m_LShr(m_Specific(RHS), m_APInt(C))) && *C != 0) || + (match(LBO, m_UDiv(m_Specific(RHS), m_APInt(C))) && *C != 1)) { + if (isKnownNonZero(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) { + switch (Pred) { + default: + break; + case ICmpInst::ICMP_EQ: + case ICmpInst::ICMP_UGE: + return getFalse(ITy); + case ICmpInst::ICMP_NE: + case ICmpInst::ICMP_ULT: + return getTrue(ITy); + case ICmpInst::ICMP_UGT: + case ICmpInst::ICMP_ULE: + // UGT/ULE are handled by the more general case just above + llvm_unreachable("Unexpected UGT/ULE, should have been handled"); + } + } + } + // (x*C1)/C2 <= x for C1 <= C2. // This holds even if the multiplication overflows: Assume that x != 0 and // arithmetic is modulo M. For overflow to occur we must have C1 >= M/x and diff --git a/contrib/llvm-project/llvm/lib/Analysis/IntervalPartition.cpp b/contrib/llvm-project/llvm/lib/Analysis/IntervalPartition.cpp index 23ff4fd6f85e..d9620fd405bc 100644 --- a/contrib/llvm-project/llvm/lib/Analysis/IntervalPartition.cpp +++ b/contrib/llvm-project/llvm/lib/Analysis/IntervalPartition.cpp @@ -36,16 +36,16 @@ INITIALIZE_PASS(IntervalPartition, "intervals", // releaseMemory - Reset state back to before function was analyzed void IntervalPartition::releaseMemory() { - for (unsigned i = 0, e = Intervals.size(); i != e; ++i) - delete Intervals[i]; + for (Interval *I : Intervals) + delete I; IntervalMap.clear(); Intervals.clear(); RootInterval = nullptr; } void IntervalPartition::print(raw_ostream &O, const Module*) const { - for(unsigned i = 0, e = Intervals.size(); i != e; ++i) - Intervals[i]->print(O); + for (const Interval *I : Intervals) + I->print(O); } // addIntervalToPartition - Add an interval to the internal list of intervals, @@ -87,8 +87,8 @@ bool IntervalPartition::runOnFunction(Function &F) { // Now that we know all of the successor information, propagate this to the // predecessors for each block. - for (unsigned i = 0, e = Intervals.size(); i != e; ++i) - updatePredecessors(Intervals[i]); + for (Interval *I : Intervals) + updatePredecessors(I); return false; } @@ -113,6 +113,6 @@ IntervalPartition::IntervalPartition(IntervalPartition &IP, bool) // Now that we know all of the successor information, propagate this to the // predecessors for each block. - for (unsigned i = 0, e = Intervals.size(); i != e; ++i) - updatePredecessors(Intervals[i]); + for (Interval *I : Intervals) + updatePredecessors(I); } diff --git a/contrib/llvm-project/llvm/lib/Analysis/LazyValueInfo.cpp b/contrib/llvm-project/llvm/lib/Analysis/LazyValueInfo.cpp index 50fa169c2081..5b5d48bf6fe5 100644 --- a/contrib/llvm-project/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Analysis/LazyValueInfo.cpp @@ -1095,7 +1095,8 @@ static ValueLatticeElement getValueFromICmpCondition(Value *Val, ICmpInst *ICI, if (!Ty->isIntegerTy()) return ValueLatticeElement::getOverdefined(); - APInt Offset(Ty->getScalarSizeInBits(), 0); + unsigned BitWidth = Ty->getScalarSizeInBits(); + APInt Offset(BitWidth, 0); if (matchICmpOperand(Offset, LHS, Val, EdgePred)) return getValueFromSimpleICmpCondition(EdgePred, RHS, Offset); @@ -1118,13 +1119,23 @@ static ValueLatticeElement getValueFromICmpCondition(Value *Val, ICmpInst *ICI, // If (Val & Mask) != 0 then the value must be larger than the lowest set // bit of Mask. 
if (EdgePred == ICmpInst::ICMP_NE && !Mask->isZero() && C->isZero()) { - unsigned BitWidth = Ty->getIntegerBitWidth(); return ValueLatticeElement::getRange(ConstantRange::getNonEmpty( APInt::getOneBitSet(BitWidth, Mask->countTrailingZeros()), APInt::getZero(BitWidth))); } } + // If (X urem Modulus) >= C, then X >= C. + // TODO: An upper bound could be computed as well. + if (match(LHS, m_URem(m_Specific(Val), m_Value())) && + match(RHS, m_APInt(C))) { + // Use the icmp region so we don't have to deal with different predicates. + ConstantRange CR = ConstantRange::makeExactICmpRegion(EdgePred, *C); + if (!CR.isEmptySet()) + return ValueLatticeElement::getRange(ConstantRange::getNonEmpty( + CR.getUnsignedMin(), APInt(BitWidth, 0))); + } + return ValueLatticeElement::getOverdefined(); } diff --git a/contrib/llvm-project/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/contrib/llvm-project/llvm/lib/Analysis/LoopAccessAnalysis.cpp index f9bd7167317f..19a24ac6a484 100644 --- a/contrib/llvm-project/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/contrib/llvm-project/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -666,6 +666,29 @@ static bool isNoWrap(PredicatedScalarEvolution &PSE, return false; } +static void visitPointers(Value *StartPtr, const Loop &InnermostLoop, + function_ref<void(Value *)> AddPointer) { + SmallPtrSet<Value *, 8> Visited; + SmallVector<Value *> WorkList; + WorkList.push_back(StartPtr); + + while (!WorkList.empty()) { + Value *Ptr = WorkList.pop_back_val(); + if (!Visited.insert(Ptr).second) + continue; + auto *PN = dyn_cast<PHINode>(Ptr); + // SCEV does not look through non-header PHIs inside the loop. Such phis + // can be analyzed by adding separate accesses for each incoming pointer + // value. + if (PN && InnermostLoop.contains(PN->getParent()) && + PN->getParent() != InnermostLoop.getHeader()) { + for (const Use &Inc : PN->incoming_values()) + WorkList.push_back(Inc); + } else + AddPointer(Ptr); + } +} + bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck, MemAccessInfo Access, const ValueToValueMap &StridesMap, @@ -1032,13 +1055,11 @@ int64_t llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, bool ShouldCheckWrap) { Type *Ty = Ptr->getType(); assert(Ty->isPointerTy() && "Unexpected non-ptr"); - unsigned AddrSpace = Ty->getPointerAddressSpace(); + assert(!AccessTy->isAggregateType() && "Bad stride - Not a pointer to a scalar type"); - // Make sure we're not accessing an aggregate type. - // TODO: Why? This doesn't make any sense. - if (AccessTy->isAggregateType()) { - LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not a pointer to a scalar type" - << *Ptr << "\n"); + if (isa<ScalableVectorType>(AccessTy)) { + LLVM_DEBUG(dbgs() << "LAA: Bad stride - Scalable object: " << *AccessTy + << "\n"); return 0; } @@ -1068,6 +1089,7 @@ int64_t llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, // An getelementptr without an inbounds attribute and unit stride would have // to access the pointer value "0" which is undefined behavior in address // space 0, therefore we can also vectorize this case. 
+ unsigned AddrSpace = Ty->getPointerAddressSpace(); bool IsInBoundsGEP = isInBoundsGep(Ptr); bool IsNoWrapAddRec = !ShouldCheckWrap || PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW) || @@ -1101,7 +1123,8 @@ int64_t llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, } auto &DL = Lp->getHeader()->getModule()->getDataLayout(); - int64_t Size = DL.getTypeAllocSize(AccessTy); + TypeSize AllocSize = DL.getTypeAllocSize(AccessTy); + int64_t Size = AllocSize.getFixedSize(); const APInt &APStepVal = C->getAPInt(); // Huge step value - give up. @@ -1263,29 +1286,6 @@ bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, return Diff && *Diff == 1; } -static void visitPointers(Value *StartPtr, const Loop &InnermostLoop, - function_ref<void(Value *)> AddPointer) { - SmallPtrSet<Value *, 8> Visited; - SmallVector<Value *> WorkList; - WorkList.push_back(StartPtr); - - while (!WorkList.empty()) { - Value *Ptr = WorkList.pop_back_val(); - if (!Visited.insert(Ptr).second) - continue; - auto *PN = dyn_cast<PHINode>(Ptr); - // SCEV does not look through non-header PHIs inside the loop. Such phis - // can be analyzed by adding separate accesses for each incoming pointer - // value. - if (PN && InnermostLoop.contains(PN->getParent()) && - PN->getParent() != InnermostLoop.getHeader()) { - for (const Use &Inc : PN->incoming_values()) - WorkList.push_back(Inc); - } else - AddPointer(Ptr); - } -} - void MemoryDepChecker::addAccess(StoreInst *SI) { visitPointers(SI->getPointerOperand(), *InnermostLoop, [this, SI](Value *Ptr) { diff --git a/contrib/llvm-project/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/contrib/llvm-project/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp index b44d15e71556..da6bb4c49cba 100644 --- a/contrib/llvm-project/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/contrib/llvm-project/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -1481,11 +1481,11 @@ void MemoryDependenceResults::removeCachedNonLocalPointerDependencies( // instructions from the reverse map. NonLocalDepInfo &PInfo = It->second.NonLocalDeps; - for (unsigned i = 0, e = PInfo.size(); i != e; ++i) { - Instruction *Target = PInfo[i].getResult().getInst(); + for (const NonLocalDepEntry &DE : PInfo) { + Instruction *Target = DE.getResult().getInst(); if (!Target) continue; // Ignore non-local dep results. - assert(Target->getParent() == PInfo[i].getBB()); + assert(Target->getParent() == DE.getBB()); // Eliminating the dirty entry from 'Cache', so update the reverse info. RemoveFromReverseMap(ReverseNonLocalPtrDeps, Target, P); diff --git a/contrib/llvm-project/llvm/lib/Analysis/MemoryLocation.cpp b/contrib/llvm-project/llvm/lib/Analysis/MemoryLocation.cpp index 7f2d04c49565..854ba83bd34a 100644 --- a/contrib/llvm-project/llvm/lib/Analysis/MemoryLocation.cpp +++ b/contrib/llvm-project/llvm/lib/Analysis/MemoryLocation.cpp @@ -213,6 +213,28 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call, LibFunc F; if (TLI && TLI->getLibFunc(*Call, F) && TLI->has(F)) { switch (F) { + case LibFunc_memset_chk: { + assert(ArgIdx == 0 && "Invalid argument index for memset_chk"); + LocationSize Size = LocationSize::afterPointer(); + if (const auto *Len = dyn_cast<ConstantInt>(Call->getArgOperand(2))) { + // memset_chk writes at most Len bytes. It may write less, if Len + // exceeds the specified max size and aborts. 
+ Size = LocationSize::upperBound(Len->getZExtValue()); + } + return MemoryLocation(Arg, Size, AATags); + } + case LibFunc_strncpy: { + assert((ArgIdx == 0 || ArgIdx == 1) && + "Invalid argument index for strncpy"); + LocationSize Size = LocationSize::afterPointer(); + if (const auto *Len = dyn_cast<ConstantInt>(Call->getArgOperand(2))) { + // strncpy is guaranteed to write Len bytes, but only reads up to Len + // bytes. + Size = ArgIdx == 0 ? LocationSize::precise(Len->getZExtValue()) + : LocationSize::upperBound(Len->getZExtValue()); + } + return MemoryLocation(Arg, Size, AATags); + } case LibFunc_memset_pattern16: assert((ArgIdx == 0 || ArgIdx == 1) && "Invalid argument index for memset_pattern16"); diff --git a/contrib/llvm-project/llvm/lib/Analysis/PHITransAddr.cpp b/contrib/llvm-project/llvm/lib/Analysis/PHITransAddr.cpp index c73e1fd82915..4c80f6743411 100644 --- a/contrib/llvm-project/llvm/lib/Analysis/PHITransAddr.cpp +++ b/contrib/llvm-project/llvm/lib/Analysis/PHITransAddr.cpp @@ -69,7 +69,7 @@ static bool VerifySubExpr(Value *Expr, } // If it isn't in the InstInputs list it is a subexpr incorporated into the - // address. Sanity check that it is phi translatable. + // address. Validate that it is phi translatable. if (!CanPHITrans(I)) { errs() << "Instruction in PHITransAddr is not phi-translatable:\n"; errs() << *I << '\n'; diff --git a/contrib/llvm-project/llvm/lib/Analysis/RegionPass.cpp b/contrib/llvm-project/llvm/lib/Analysis/RegionPass.cpp index a73607dbef61..c20ecff5f912 100644 --- a/contrib/llvm-project/llvm/lib/Analysis/RegionPass.cpp +++ b/contrib/llvm-project/llvm/lib/Analysis/RegionPass.cpp @@ -15,6 +15,7 @@ #include "llvm/Analysis/RegionPass.h" #include "llvm/IR/OptBisect.h" #include "llvm/IR/PassTimingInfo.h" +#include "llvm/IR/PrintPasses.h" #include "llvm/IR/StructuralHash.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Timer.h" @@ -187,6 +188,8 @@ public: } bool runOnRegion(Region *R, RGPassManager &RGM) override { + if (!isFunctionInPrintList(R->getEntry()->getParent()->getName())) + return false; Out << Banner; for (const auto *BB : R->blocks()) { if (BB) diff --git a/contrib/llvm-project/llvm/lib/Analysis/ScalarEvolution.cpp b/contrib/llvm-project/llvm/lib/Analysis/ScalarEvolution.cpp index f7c22cfb0310..7dc7f9904c70 100644 --- a/contrib/llvm-project/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/contrib/llvm-project/llvm/lib/Analysis/ScalarEvolution.cpp @@ -2915,8 +2915,8 @@ ScalarEvolution::getOrCreateAddRecExpr(ArrayRef<const SCEV *> Ops, const Loop *L, SCEV::NoWrapFlags Flags) { FoldingSetNodeID ID; ID.AddInteger(scAddRecExpr); - for (unsigned i = 0, e = Ops.size(); i != e; ++i) - ID.AddPointer(Ops[i]); + for (const SCEV *Op : Ops) + ID.AddPointer(Op); ID.AddPointer(L); void *IP = nullptr; SCEVAddRecExpr *S = @@ -2939,8 +2939,8 @@ ScalarEvolution::getOrCreateMulExpr(ArrayRef<const SCEV *> Ops, SCEV::NoWrapFlags Flags) { FoldingSetNodeID ID; ID.AddInteger(scMulExpr); - for (unsigned i = 0, e = Ops.size(); i != e; ++i) - ID.AddPointer(Ops[i]); + for (const SCEV *Op : Ops) + ID.AddPointer(Op); void *IP = nullptr; SCEVMulExpr *S = static_cast<SCEVMulExpr *>(UniqueSCEVs.FindNodeOrInsertPos(ID, IP)); @@ -3708,8 +3708,8 @@ SCEV *ScalarEvolution::findExistingSCEVInCache(SCEVTypes SCEVType, ArrayRef<const SCEV *> Ops) { FoldingSetNodeID ID; ID.AddInteger(SCEVType); - for (unsigned i = 0, e = Ops.size(); i != e; ++i) - ID.AddPointer(Ops[i]); + for (const SCEV *Op : Ops) + ID.AddPointer(Op); void *IP = nullptr; return UniqueSCEVs.FindNodeOrInsertPos(ID, IP); 
} @@ -4094,6 +4094,17 @@ void ScalarEvolution::eraseValueFromMap(Value *V) { } } +void ScalarEvolution::insertValueToMap(Value *V, const SCEV *S) { + // A recursive query may have already computed the SCEV. It should be + // equivalent, but may not necessarily be exactly the same, e.g. due to lazily + // inferred nowrap flags. + auto It = ValueExprMap.find_as(V); + if (It == ValueExprMap.end()) { + ValueExprMap.insert({SCEVCallbackVH(V, this), S}); + ExprValueMap[S].insert({V, nullptr}); + } +} + /// Return an existing SCEV if it exists, otherwise analyze the expression and /// create a new one. const SCEV *ScalarEvolution::getSCEV(Value *V) { @@ -4134,10 +4145,9 @@ const SCEV *ScalarEvolution::getExistingSCEV(Value *V) { ValueExprMapType::iterator I = ValueExprMap.find_as(V); if (I != ValueExprMap.end()) { const SCEV *S = I->second; - if (checkValidity(S)) - return S; - eraseValueFromMap(V); - forgetMemoizedResults(S); + assert(checkValidity(S) && + "existing SCEV has not been properly invalidated"); + return S; } return nullptr; } @@ -4430,44 +4440,6 @@ static void PushDefUseChildren(Instruction *I, } } -void ScalarEvolution::forgetSymbolicName(Instruction *PN, const SCEV *SymName) { - SmallVector<Instruction *, 16> Worklist; - SmallPtrSet<Instruction *, 8> Visited; - SmallVector<const SCEV *, 8> ToForget; - Visited.insert(PN); - Worklist.push_back(PN); - while (!Worklist.empty()) { - Instruction *I = Worklist.pop_back_val(); - - auto It = ValueExprMap.find_as(static_cast<Value *>(I)); - if (It != ValueExprMap.end()) { - const SCEV *Old = It->second; - - // Short-circuit the def-use traversal if the symbolic name - // ceases to appear in expressions. - if (Old != SymName && !hasOperand(Old, SymName)) - continue; - - // SCEVUnknown for a PHI either means that it has an unrecognized - // structure, it's a PHI that's in the progress of being computed - // by createNodeForPHI, or it's a single-value PHI. In the first case, - // additional loop trip count information isn't going to change anything. - // In the second case, createNodeForPHI will perform the necessary - // updates on its own when it gets to that point. In the third, we do - // want to forget the SCEVUnknown. - if (!isa<PHINode>(I) || - !isa<SCEVUnknown>(Old) || - (I != PN && Old == SymName)) { - eraseValueFromMap(It->first); - ToForget.push_back(Old); - } - } - - PushDefUseChildren(I, Worklist, Visited); - } - forgetMemoizedResults(ToForget); -} - namespace { /// Takes SCEV S and Loop L. For each AddRec sub-expression, use its start @@ -5335,15 +5307,17 @@ const SCEV *ScalarEvolution::createSimpleAffineAddRec(PHINode *PN, const SCEV *StartVal = getSCEV(StartValueV); const SCEV *PHISCEV = getAddRecExpr(StartVal, Accum, L, Flags); - - ValueExprMap[SCEVCallbackVH(PN, this)] = PHISCEV; + insertValueToMap(PN, PHISCEV); // We can add Flags to the post-inc expression only if we // know that it is *undefined behavior* for BEValueV to // overflow. - if (auto *BEInst = dyn_cast<Instruction>(BEValueV)) - if (isLoopInvariant(Accum, L) && isAddRecNeverPoison(BEInst, L)) + if (auto *BEInst = dyn_cast<Instruction>(BEValueV)) { + assert(isLoopInvariant(Accum, L) && + "Accum is defined outside L, but is not invariant?"); + if (isAddRecNeverPoison(BEInst, L)) (void)getAddRecExpr(getAddExpr(StartVal, Accum), Accum, L, Flags); + } return PHISCEV; } @@ -5386,7 +5360,7 @@ const SCEV *ScalarEvolution::createAddRecFromPHI(PHINode *PN) { // Handle PHI node value symbolically. 
const SCEV *SymbolicName = getUnknown(PN); - ValueExprMap.insert({SCEVCallbackVH(PN, this), SymbolicName}); + insertValueToMap(PN, SymbolicName); // Using this symbolic name for the PHI, analyze the value coming around // the back-edge. @@ -5457,8 +5431,8 @@ const SCEV *ScalarEvolution::createAddRecFromPHI(PHINode *PN) { // Okay, for the entire analysis of this edge we assumed the PHI // to be symbolic. We now need to go back and purge all of the // entries for the scalars that use the symbolic expression. - forgetSymbolicName(PN, SymbolicName); - ValueExprMap[SCEVCallbackVH(PN, this)] = PHISCEV; + forgetMemoizedResults(SymbolicName); + insertValueToMap(PN, PHISCEV); // We can add Flags to the post-inc expression only if we // know that it is *undefined behavior* for BEValueV to @@ -5489,8 +5463,8 @@ const SCEV *ScalarEvolution::createAddRecFromPHI(PHINode *PN) { // Okay, for the entire analysis of this edge we assumed the PHI // to be symbolic. We now need to go back and purge all of the // entries for the scalars that use the symbolic expression. - forgetSymbolicName(PN, SymbolicName); - ValueExprMap[SCEVCallbackVH(PN, this)] = Shifted; + forgetMemoizedResults(SymbolicName); + insertValueToMap(PN, Shifted); return Shifted; } } @@ -7598,62 +7572,19 @@ ScalarEvolution::getBackedgeTakenInfo(const Loop *L) { // Now that we know more about the trip count for this loop, forget any // existing SCEV values for PHI nodes in this loop since they are only // conservative estimates made without the benefit of trip count - // information. This is similar to the code in forgetLoop, except that - // it handles SCEVUnknown PHI nodes specially. + // information. This invalidation is not necessary for correctness, and is + // only done to produce more precise results. if (Result.hasAnyInfo()) { - SmallVector<Instruction *, 16> Worklist; - SmallPtrSet<Instruction *, 8> Discovered; + // Invalidate any expression using an addrec in this loop. SmallVector<const SCEV *, 8> ToForget; - PushLoopPHIs(L, Worklist, Discovered); - while (!Worklist.empty()) { - Instruction *I = Worklist.pop_back_val(); - - ValueExprMapType::iterator It = - ValueExprMap.find_as(static_cast<Value *>(I)); - if (It != ValueExprMap.end()) { - const SCEV *Old = It->second; - - // SCEVUnknown for a PHI either means that it has an unrecognized - // structure, or it's a PHI that's in the progress of being computed - // by createNodeForPHI. In the former case, additional loop trip - // count information isn't going to change anything. In the later - // case, createNodeForPHI will perform the necessary updates on its - // own when it gets to that point. - if (!isa<PHINode>(I) || !isa<SCEVUnknown>(Old)) { - eraseValueFromMap(It->first); - ToForget.push_back(Old); - } - if (PHINode *PN = dyn_cast<PHINode>(I)) - ConstantEvolutionLoopExitValue.erase(PN); - } - - // Since we don't need to invalidate anything for correctness and we're - // only invalidating to make SCEV's results more precise, we get to stop - // early to avoid invalidating too much. This is especially important in - // cases like: - // - // %v = f(pn0, pn1) // pn0 and pn1 used through some other phi node - // loop0: - // %pn0 = phi - // ... - // loop1: - // %pn1 = phi - // ... - // - // where both loop0 and loop1's backedge taken count uses the SCEV - // expression for %v. 
If we don't have the early stop below then in cases - // like the above, getBackedgeTakenInfo(loop1) will clear out the trip - // count for loop0 and getBackedgeTakenInfo(loop0) will clear out the trip - // count for loop1, effectively nullifying SCEV's trip count cache. - for (auto *U : I->users()) - if (auto *I = dyn_cast<Instruction>(U)) { - auto *LoopForUser = LI.getLoopFor(I->getParent()); - if (LoopForUser && L->contains(LoopForUser) && - Discovered.insert(I).second) - Worklist.push_back(I); - } - } + auto LoopUsersIt = LoopUsers.find(L); + if (LoopUsersIt != LoopUsers.end()) + append_range(ToForget, LoopUsersIt->second); forgetMemoizedResults(ToForget); + + // Invalidate constant-evolved loop header phis. + for (PHINode &PN : L->getHeader()->phis()) + ConstantEvolutionLoopExitValue.erase(&PN); } // Re-lookup the insert position, since the call to @@ -7672,10 +7603,12 @@ void ScalarEvolution::forgetAllLoops() { // result. BackedgeTakenCounts.clear(); PredicatedBackedgeTakenCounts.clear(); + BECountUsers.clear(); LoopPropertiesCache.clear(); ConstantEvolutionLoopExitValue.clear(); ValueExprMap.clear(); ValuesAtScopes.clear(); + ValuesAtScopesUsers.clear(); LoopDispositions.clear(); BlockDispositions.clear(); UnsignedRanges.clear(); @@ -7697,8 +7630,8 @@ void ScalarEvolution::forgetLoop(const Loop *L) { auto *CurrL = LoopWorklist.pop_back_val(); // Drop any stored trip count value. - BackedgeTakenCounts.erase(CurrL); - PredicatedBackedgeTakenCounts.erase(CurrL); + forgetBackedgeTakenCounts(CurrL, /* Predicated */ false); + forgetBackedgeTakenCounts(CurrL, /* Predicated */ true); // Drop information about predicated SCEV rewrites for this loop. for (auto I = PredicatedSCEVRewrites.begin(); @@ -7872,10 +7805,6 @@ bool ScalarEvolution::BackedgeTakenInfo::isConstantMaxOrZero( return MaxOrZero && !any_of(ExitNotTaken, PredicateNotAlwaysTrue); } -bool ScalarEvolution::BackedgeTakenInfo::hasOperand(const SCEV *S) const { - return Operands.contains(S); -} - ScalarEvolution::ExitLimit::ExitLimit(const SCEV *E) : ExitLimit(E, E, false, None) { } @@ -7916,19 +7845,6 @@ ScalarEvolution::ExitLimit::ExitLimit(const SCEV *E, const SCEV *M, : ExitLimit(E, M, MaxOrZero, None) { } -class SCEVRecordOperands { - SmallPtrSetImpl<const SCEV *> &Operands; - -public: - SCEVRecordOperands(SmallPtrSetImpl<const SCEV *> &Operands) - : Operands(Operands) {} - bool follow(const SCEV *S) { - Operands.insert(S); - return true; - } - bool isDone() { return false; } -}; - /// Allocate memory for BackedgeTakenInfo and copy the not-taken count of each /// computable exit into a persistent ExitNotTakenInfo array. ScalarEvolution::BackedgeTakenInfo::BackedgeTakenInfo( @@ -7957,14 +7873,6 @@ ScalarEvolution::BackedgeTakenInfo::BackedgeTakenInfo( assert((isa<SCEVCouldNotCompute>(ConstantMax) || isa<SCEVConstant>(ConstantMax)) && "No point in having a non-constant max backedge taken count!"); - - SCEVRecordOperands RecordOperands(Operands); - SCEVTraversal<SCEVRecordOperands> ST(RecordOperands); - if (!isa<SCEVCouldNotCompute>(ConstantMax)) - ST.visitAll(ConstantMax); - for (auto &ENT : ExitNotTaken) - if (!isa<SCEVCouldNotCompute>(ENT.ExactNotTaken)) - ST.visitAll(ENT.ExactNotTaken); } /// Compute the number of times the backedge of the specified loop will execute. @@ -8046,6 +7954,13 @@ ScalarEvolution::computeBackedgeTakenCount(const Loop *L, // The loop backedge will be taken the maximum or zero times if there's // a single exit that must be taken the maximum or zero times. 
bool MaxOrZero = (MustExitMaxOrZero && ExitingBlocks.size() == 1); + + // Remember which SCEVs are used in exit limits for invalidation purposes. + // We only care about non-constant SCEVs here, so we can ignore EL.MaxNotTaken + // and MaxBECount, which must be SCEVConstant. + for (const auto &Pair : ExitCounts) + if (!isa<SCEVConstant>(Pair.second.ExactNotTaken)) + BECountUsers[Pair.second.ExactNotTaken].insert({L, AllowPredicates}); return BackedgeTakenInfo(std::move(ExitCounts), CouldComputeBECount, MaxBECount, MaxOrZero); } @@ -8916,6 +8831,9 @@ const SCEV *ScalarEvolution::getSCEVAtScope(const SCEV *V, const Loop *L) { LS.second = C; break; } + + if (!isa<SCEVConstant>(C)) + ValuesAtScopesUsers[C].push_back({L, V}); return C; } @@ -12387,7 +12305,7 @@ const SCEV *SCEVAddRecExpr::getNumIterationsInRange(const ConstantRange &Range, if (Range.contains(Val->getValue())) return SE.getCouldNotCompute(); // Something strange happened - // Ensure that the previous value is in the range. This is a sanity check. + // Ensure that the previous value is in the range. assert(Range.contains( EvaluateConstantChrecAtConstant(this, ConstantInt::get(SE.getContext(), ExitVal - 1), SE)->getValue()) && @@ -12531,9 +12449,11 @@ ScalarEvolution::ScalarEvolution(ScalarEvolution &&Arg) BackedgeTakenCounts(std::move(Arg.BackedgeTakenCounts)), PredicatedBackedgeTakenCounts( std::move(Arg.PredicatedBackedgeTakenCounts)), + BECountUsers(std::move(Arg.BECountUsers)), ConstantEvolutionLoopExitValue( std::move(Arg.ConstantEvolutionLoopExitValue)), ValuesAtScopes(std::move(Arg.ValuesAtScopes)), + ValuesAtScopesUsers(std::move(Arg.ValuesAtScopesUsers)), LoopDispositions(std::move(Arg.LoopDispositions)), LoopPropertiesCache(std::move(Arg.LoopPropertiesCache)), BlockDispositions(std::move(Arg.BlockDispositions)), @@ -12946,6 +12866,23 @@ bool ScalarEvolution::hasOperand(const SCEV *S, const SCEV *Op) const { return SCEVExprContains(S, [&](const SCEV *Expr) { return Expr == Op; }); } +void ScalarEvolution::forgetBackedgeTakenCounts(const Loop *L, + bool Predicated) { + auto &BECounts = + Predicated ? 
PredicatedBackedgeTakenCounts : BackedgeTakenCounts; + auto It = BECounts.find(L); + if (It != BECounts.end()) { + for (const ExitNotTakenInfo &ENT : It->second.ExitNotTaken) { + if (!isa<SCEVConstant>(ENT.ExactNotTaken)) { + auto UserIt = BECountUsers.find(ENT.ExactNotTaken); + assert(UserIt != BECountUsers.end()); + UserIt->second.erase({L, Predicated}); + } + } + BECounts.erase(It); + } +} + void ScalarEvolution::forgetMemoizedResults(ArrayRef<const SCEV *> SCEVs) { SmallPtrSet<const SCEV *, 8> ToForget(SCEVs.begin(), SCEVs.end()); SmallVector<const SCEV *, 8> Worklist(ToForget.begin(), ToForget.end()); @@ -12970,32 +12907,52 @@ void ScalarEvolution::forgetMemoizedResults(ArrayRef<const SCEV *> SCEVs) { else ++I; } - - auto RemoveSCEVFromBackedgeMap = [&ToForget]( - DenseMap<const Loop *, BackedgeTakenInfo> &Map) { - for (auto I = Map.begin(), E = Map.end(); I != E;) { - BackedgeTakenInfo &BEInfo = I->second; - if (any_of(ToForget, - [&BEInfo](const SCEV *S) { return BEInfo.hasOperand(S); })) - Map.erase(I++); - else - ++I; - } - }; - - RemoveSCEVFromBackedgeMap(BackedgeTakenCounts); - RemoveSCEVFromBackedgeMap(PredicatedBackedgeTakenCounts); } void ScalarEvolution::forgetMemoizedResultsImpl(const SCEV *S) { - ValuesAtScopes.erase(S); LoopDispositions.erase(S); BlockDispositions.erase(S); UnsignedRanges.erase(S); SignedRanges.erase(S); - ExprValueMap.erase(S); HasRecMap.erase(S); MinTrailingZerosCache.erase(S); + + auto ExprIt = ExprValueMap.find(S); + if (ExprIt != ExprValueMap.end()) { + for (auto &ValueAndOffset : ExprIt->second) { + if (ValueAndOffset.second == nullptr) { + auto ValueIt = ValueExprMap.find_as(ValueAndOffset.first); + if (ValueIt != ValueExprMap.end()) + ValueExprMap.erase(ValueIt); + } + } + ExprValueMap.erase(ExprIt); + } + + auto ScopeIt = ValuesAtScopes.find(S); + if (ScopeIt != ValuesAtScopes.end()) { + for (const auto &Pair : ScopeIt->second) + if (!isa_and_nonnull<SCEVConstant>(Pair.second)) + erase_value(ValuesAtScopesUsers[Pair.second], + std::make_pair(Pair.first, S)); + ValuesAtScopes.erase(ScopeIt); + } + + auto ScopeUserIt = ValuesAtScopesUsers.find(S); + if (ScopeUserIt != ValuesAtScopesUsers.end()) { + for (const auto &Pair : ScopeUserIt->second) + erase_value(ValuesAtScopes[Pair.second], std::make_pair(Pair.first, S)); + ValuesAtScopesUsers.erase(ScopeUserIt); + } + + auto BEUsersIt = BECountUsers.find(S); + if (BEUsersIt != BECountUsers.end()) { + // Work on a copy, as forgetBackedgeTakenCounts() will modify the original. + auto Copy = BEUsersIt->second; + for (const auto &Pair : Copy) + forgetBackedgeTakenCounts(Pair.getPointer(), Pair.getInt()); + BECountUsers.erase(BEUsersIt); + } } void @@ -13100,16 +13057,43 @@ void ScalarEvolution::verify() const { ValidLoops.insert(L); Worklist.append(L->begin(), L->end()); } - // Check for SCEV expressions referencing invalid/deleted loops. for (auto &KV : ValueExprMap) { - auto *AR = dyn_cast<SCEVAddRecExpr>(KV.second); - if (!AR) - continue; - assert(ValidLoops.contains(AR->getLoop()) && - "AddRec references invalid loop"); + // Check for SCEV expressions referencing invalid/deleted loops. + if (auto *AR = dyn_cast<SCEVAddRecExpr>(KV.second)) { + assert(ValidLoops.contains(AR->getLoop()) && + "AddRec references invalid loop"); + } + + // Check that the value is also part of the reverse map. 
+ auto It = ExprValueMap.find(KV.second); + if (It == ExprValueMap.end() || !It->second.contains({KV.first, nullptr})) { + dbgs() << "Value " << *KV.first + << " is in ValueExprMap but not in ExprValueMap\n"; + std::abort(); + } + } + + for (const auto &KV : ExprValueMap) { + for (const auto &ValueAndOffset : KV.second) { + if (ValueAndOffset.second != nullptr) + continue; + + auto It = ValueExprMap.find_as(ValueAndOffset.first); + if (It == ValueExprMap.end()) { + dbgs() << "Value " << *ValueAndOffset.first + << " is in ExprValueMap but not in ValueExprMap\n"; + std::abort(); + } + if (It->second != KV.first) { + dbgs() << "Value " << *ValueAndOffset.first + << " mapped to " << *It->second + << " rather than " << *KV.first << "\n"; + std::abort(); + } + } } - // Verify intergity of SCEV users. + // Verify integrity of SCEV users. for (const auto &S : UniqueSCEVs) { SmallVector<const SCEV *, 4> Ops; collectUniqueOps(&S, Ops); @@ -13125,6 +13109,61 @@ void ScalarEvolution::verify() const { std::abort(); } } + + // Verify integrity of ValuesAtScopes users. + for (const auto &ValueAndVec : ValuesAtScopes) { + const SCEV *Value = ValueAndVec.first; + for (const auto &LoopAndValueAtScope : ValueAndVec.second) { + const Loop *L = LoopAndValueAtScope.first; + const SCEV *ValueAtScope = LoopAndValueAtScope.second; + if (!isa<SCEVConstant>(ValueAtScope)) { + auto It = ValuesAtScopesUsers.find(ValueAtScope); + if (It != ValuesAtScopesUsers.end() && + is_contained(It->second, std::make_pair(L, Value))) + continue; + dbgs() << "Value: " << *Value << ", Loop: " << *L << ", ValueAtScope: " + << ValueAtScope << " missing in ValuesAtScopesUsers\n"; + std::abort(); + } + } + } + + for (const auto &ValueAtScopeAndVec : ValuesAtScopesUsers) { + const SCEV *ValueAtScope = ValueAtScopeAndVec.first; + for (const auto &LoopAndValue : ValueAtScopeAndVec.second) { + const Loop *L = LoopAndValue.first; + const SCEV *Value = LoopAndValue.second; + assert(!isa<SCEVConstant>(Value)); + auto It = ValuesAtScopes.find(Value); + if (It != ValuesAtScopes.end() && + is_contained(It->second, std::make_pair(L, ValueAtScope))) + continue; + dbgs() << "Value: " << *Value << ", Loop: " << *L << ", ValueAtScope: " + << ValueAtScope << " missing in ValuesAtScopes\n"; + std::abort(); + } + } + + // Verify integrity of BECountUsers. + auto VerifyBECountUsers = [&](bool Predicated) { + auto &BECounts = + Predicated ? 
PredicatedBackedgeTakenCounts : BackedgeTakenCounts; + for (const auto &LoopAndBEInfo : BECounts) { + for (const ExitNotTakenInfo &ENT : LoopAndBEInfo.second.ExitNotTaken) { + if (!isa<SCEVConstant>(ENT.ExactNotTaken)) { + auto UserIt = BECountUsers.find(ENT.ExactNotTaken); + if (UserIt != BECountUsers.end() && + UserIt->second.contains({ LoopAndBEInfo.first, Predicated })) + continue; + dbgs() << "Value " << *ENT.ExactNotTaken << " for loop " + << *LoopAndBEInfo.first << " missing from BECountUsers\n"; + std::abort(); + } + } + } + }; + VerifyBECountUsers(/* Predicated */ false); + VerifyBECountUsers(/* Predicated */ true); } bool ScalarEvolution::invalidate( diff --git a/contrib/llvm-project/llvm/lib/Analysis/StackSafetyAnalysis.cpp b/contrib/llvm-project/llvm/lib/Analysis/StackSafetyAnalysis.cpp index 74cc39b7f2c0..54f3605ee033 100644 --- a/contrib/llvm-project/llvm/lib/Analysis/StackSafetyAnalysis.cpp +++ b/contrib/llvm-project/llvm/lib/Analysis/StackSafetyAnalysis.cpp @@ -14,12 +14,14 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ModuleSummaryAnalysis.h" +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/StackLifetime.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/ModuleSummaryIndex.h" @@ -117,7 +119,7 @@ template <typename CalleeTy> struct UseInfo { // Access range if the address (alloca or parameters). // It is allowed to be empty-set when there are no known accesses. ConstantRange Range; - std::map<const Instruction *, ConstantRange> Accesses; + std::set<const Instruction *> UnsafeAccesses; // List of calls which pass address as an argument. 
// Value is offset range of address from base address (alloca or calling @@ -131,10 +133,9 @@ template <typename CalleeTy> struct UseInfo { UseInfo(unsigned PointerSize) : Range{PointerSize, false} {} void updateRange(const ConstantRange &R) { Range = unionNoWrap(Range, R); } - void addRange(const Instruction *I, const ConstantRange &R) { - auto Ins = Accesses.emplace(I, R); - if (!Ins.second) - Ins.first->second = unionNoWrap(Ins.first->second, R); + void addRange(const Instruction *I, const ConstantRange &R, bool IsSafe) { + if (!IsSafe) + UnsafeAccesses.insert(I); updateRange(R); } }; @@ -230,7 +231,7 @@ struct StackSafetyInfo::InfoTy { struct StackSafetyGlobalInfo::InfoTy { GVToSSI Info; SmallPtrSet<const AllocaInst *, 8> SafeAllocas; - std::map<const Instruction *, bool> AccessIsUnsafe; + std::set<const Instruction *> UnsafeAccesses; }; namespace { @@ -253,6 +254,11 @@ class StackSafetyLocalAnalysis { void analyzeAllUses(Value *Ptr, UseInfo<GlobalValue> &AS, const StackLifetime &SL); + + bool isSafeAccess(const Use &U, AllocaInst *AI, const SCEV *AccessSize); + bool isSafeAccess(const Use &U, AllocaInst *AI, Value *V); + bool isSafeAccess(const Use &U, AllocaInst *AI, TypeSize AccessSize); + public: StackSafetyLocalAnalysis(Function &F, ScalarEvolution &SE) : F(F), DL(F.getParent()->getDataLayout()), SE(SE), @@ -333,6 +339,56 @@ ConstantRange StackSafetyLocalAnalysis::getMemIntrinsicAccessRange( return getAccessRange(U, Base, SizeRange); } +bool StackSafetyLocalAnalysis::isSafeAccess(const Use &U, AllocaInst *AI, + Value *V) { + return isSafeAccess(U, AI, SE.getSCEV(V)); +} + +bool StackSafetyLocalAnalysis::isSafeAccess(const Use &U, AllocaInst *AI, + TypeSize TS) { + if (TS.isScalable()) + return false; + auto *CalculationTy = IntegerType::getIntNTy(SE.getContext(), PointerSize); + const SCEV *SV = SE.getConstant(CalculationTy, TS.getFixedSize()); + return isSafeAccess(U, AI, SV); +} + +bool StackSafetyLocalAnalysis::isSafeAccess(const Use &U, AllocaInst *AI, + const SCEV *AccessSize) { + + if (!AI) + return true; + if (isa<SCEVCouldNotCompute>(AccessSize)) + return false; + + const auto *I = cast<Instruction>(U.getUser()); + + auto ToCharPtr = [&](const SCEV *V) { + auto *PtrTy = IntegerType::getInt8PtrTy(SE.getContext()); + return SE.getTruncateOrZeroExtend(V, PtrTy); + }; + + const SCEV *AddrExp = ToCharPtr(SE.getSCEV(U.get())); + const SCEV *BaseExp = ToCharPtr(SE.getSCEV(AI)); + const SCEV *Diff = SE.getMinusSCEV(AddrExp, BaseExp); + if (isa<SCEVCouldNotCompute>(Diff)) + return false; + + auto Size = getStaticAllocaSizeRange(*AI); + + auto *CalculationTy = IntegerType::getIntNTy(SE.getContext(), PointerSize); + auto ToDiffTy = [&](const SCEV *V) { + return SE.getTruncateOrZeroExtend(V, CalculationTy); + }; + const SCEV *Min = ToDiffTy(SE.getConstant(Size.getLower())); + const SCEV *Max = SE.getMinusSCEV(ToDiffTy(SE.getConstant(Size.getUpper())), + ToDiffTy(AccessSize)); + return SE.evaluatePredicateAt(ICmpInst::Predicate::ICMP_SGE, Diff, Min, I) + .getValueOr(false) && + SE.evaluatePredicateAt(ICmpInst::Predicate::ICMP_SLE, Diff, Max, I) + .getValueOr(false); +} + /// The function analyzes all local uses of Ptr (alloca or argument) and /// calculates local access range and all function calls where it was used. 
void StackSafetyLocalAnalysis::analyzeAllUses(Value *Ptr, @@ -341,7 +397,7 @@ void StackSafetyLocalAnalysis::analyzeAllUses(Value *Ptr, SmallPtrSet<const Value *, 16> Visited; SmallVector<const Value *, 8> WorkList; WorkList.push_back(Ptr); - const AllocaInst *AI = dyn_cast<AllocaInst>(Ptr); + AllocaInst *AI = dyn_cast<AllocaInst>(Ptr); // A DFS search through all uses of the alloca in bitcasts/PHI/GEPs/etc. while (!WorkList.empty()) { @@ -356,11 +412,13 @@ void StackSafetyLocalAnalysis::analyzeAllUses(Value *Ptr, switch (I->getOpcode()) { case Instruction::Load: { if (AI && !SL.isAliveAfter(AI, I)) { - US.addRange(I, UnknownRange); + US.addRange(I, UnknownRange, /*IsSafe=*/false); break; } - US.addRange(I, - getAccessRange(UI, Ptr, DL.getTypeStoreSize(I->getType()))); + auto TypeSize = DL.getTypeStoreSize(I->getType()); + auto AccessRange = getAccessRange(UI, Ptr, TypeSize); + bool Safe = isSafeAccess(UI, AI, TypeSize); + US.addRange(I, AccessRange, Safe); break; } @@ -370,16 +428,17 @@ void StackSafetyLocalAnalysis::analyzeAllUses(Value *Ptr, case Instruction::Store: { if (V == I->getOperand(0)) { // Stored the pointer - conservatively assume it may be unsafe. - US.addRange(I, UnknownRange); + US.addRange(I, UnknownRange, /*IsSafe=*/false); break; } if (AI && !SL.isAliveAfter(AI, I)) { - US.addRange(I, UnknownRange); + US.addRange(I, UnknownRange, /*IsSafe=*/false); break; } - US.addRange( - I, getAccessRange( - UI, Ptr, DL.getTypeStoreSize(I->getOperand(0)->getType()))); + auto TypeSize = DL.getTypeStoreSize(I->getOperand(0)->getType()); + auto AccessRange = getAccessRange(UI, Ptr, TypeSize); + bool Safe = isSafeAccess(UI, AI, TypeSize); + US.addRange(I, AccessRange, Safe); break; } @@ -387,7 +446,7 @@ void StackSafetyLocalAnalysis::analyzeAllUses(Value *Ptr, // Information leak. // FIXME: Process parameters correctly. This is a leak only if we return // alloca. 
- US.addRange(I, UnknownRange); + US.addRange(I, UnknownRange, /*IsSafe=*/false); break; case Instruction::Call: @@ -396,12 +455,20 @@ void StackSafetyLocalAnalysis::analyzeAllUses(Value *Ptr, break; if (AI && !SL.isAliveAfter(AI, I)) { - US.addRange(I, UnknownRange); + US.addRange(I, UnknownRange, /*IsSafe=*/false); break; } - if (const MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) { - US.addRange(I, getMemIntrinsicAccessRange(MI, UI, Ptr)); + auto AccessRange = getMemIntrinsicAccessRange(MI, UI, Ptr); + bool Safe = false; + if (const auto *MTI = dyn_cast<MemTransferInst>(MI)) { + if (MTI->getRawSource() != UI && MTI->getRawDest() != UI) + Safe = true; + } else if (MI->getRawDest() != UI) { + Safe = true; + } + Safe = Safe || isSafeAccess(UI, AI, MI->getLength()); + US.addRange(I, AccessRange, Safe); break; } @@ -412,15 +479,16 @@ void StackSafetyLocalAnalysis::analyzeAllUses(Value *Ptr, } if (!CB.isArgOperand(&UI)) { - US.addRange(I, UnknownRange); + US.addRange(I, UnknownRange, /*IsSafe=*/false); break; } unsigned ArgNo = CB.getArgOperandNo(&UI); if (CB.isByValArgument(ArgNo)) { - US.addRange(I, getAccessRange( - UI, Ptr, - DL.getTypeStoreSize(CB.getParamByValType(ArgNo)))); + auto TypeSize = DL.getTypeStoreSize(CB.getParamByValType(ArgNo)); + auto AccessRange = getAccessRange(UI, Ptr, TypeSize); + bool Safe = isSafeAccess(UI, AI, TypeSize); + US.addRange(I, AccessRange, Safe); break; } @@ -430,7 +498,7 @@ void StackSafetyLocalAnalysis::analyzeAllUses(Value *Ptr, const GlobalValue *Callee = dyn_cast<GlobalValue>(CB.getCalledOperand()->stripPointerCasts()); if (!Callee) { - US.addRange(I, UnknownRange); + US.addRange(I, UnknownRange, /*IsSafe=*/false); break; } @@ -827,8 +895,8 @@ const StackSafetyGlobalInfo::InfoTy &StackSafetyGlobalInfo::getInfo() const { Info->SafeAllocas.insert(AI); ++NumAllocaStackSafe; } - for (const auto &A : KV.second.Accesses) - Info->AccessIsUnsafe[A.first] |= !AIRange.contains(A.second); + Info->UnsafeAccesses.insert(KV.second.UnsafeAccesses.begin(), + KV.second.UnsafeAccesses.end()); } } @@ -903,11 +971,7 @@ bool StackSafetyGlobalInfo::isSafe(const AllocaInst &AI) const { bool StackSafetyGlobalInfo::stackAccessIsSafe(const Instruction &I) const { const auto &Info = getInfo(); - auto It = Info.AccessIsUnsafe.find(&I); - if (It == Info.AccessIsUnsafe.end()) { - return true; - } - return !It->second; + return Info.UnsafeAccesses.find(&I) == Info.UnsafeAccesses.end(); } void StackSafetyGlobalInfo::print(raw_ostream &O) const { diff --git a/contrib/llvm-project/llvm/lib/Analysis/SyncDependenceAnalysis.cpp b/contrib/llvm-project/llvm/lib/Analysis/SyncDependenceAnalysis.cpp index 59582cd3a198..ff833b55bbce 100644 --- a/contrib/llvm-project/llvm/lib/Analysis/SyncDependenceAnalysis.cpp +++ b/contrib/llvm-project/llvm/lib/Analysis/SyncDependenceAnalysis.cpp @@ -15,21 +15,18 @@ // The SyncDependenceAnalysis is used in the DivergenceAnalysis to model // control-induced divergence in phi nodes. // -// -- Summary -- -// The SyncDependenceAnalysis lazily computes sync dependences [3]. -// The analysis evaluates the disjoint path criterion [2] by a reduction -// to SSA construction. The SSA construction algorithm is implemented as -// a simple data-flow analysis [1]. 
// -// [1] "A Simple, Fast Dominance Algorithm", SPI '01, Cooper, Harvey and Kennedy -// [2] "Efficiently Computing Static Single Assignment Form -// and the Control Dependence Graph", TOPLAS '91, -// Cytron, Ferrante, Rosen, Wegman and Zadeck -// [3] "Improving Performance of OpenCL on CPUs", CC '12, Karrenberg and Hack -// [4] "Divergence Analysis", TOPLAS '13, Sampaio, Souza, Collange and Pereira +// -- Reference -- +// The algorithm is presented in Section 5 of +// +// An abstract interpretation for SPMD divergence +// on reducible control flow graphs. +// Julian Rosemann, Simon Moll and Sebastian Hack +// POPL '21 +// // // -- Sync dependence -- -// Sync dependence [4] characterizes the control flow aspect of the +// Sync dependence characterizes the control flow aspect of the // propagation of branch divergence. For example, // // %cond = icmp slt i32 %tid, 10 @@ -46,9 +43,10 @@ // because the branch "br i1 %cond" depends on %tid and affects which value %a // is assigned to. // +// // -- Reduction to SSA construction -- // There are two disjoint paths from A to X, if a certain variant of SSA -// construction places a phi node in X under the following set-up scheme [2]. +// construction places a phi node in X under the following set-up scheme. // // This variant of SSA construction ignores incoming undef values. // That is paths from the entry without a definition do not result in @@ -63,6 +61,7 @@ // D E // \ / // F +// // Assume that A contains a divergent branch. We are interested // in the set of all blocks where each block is reachable from A // via two disjoint paths. This would be the set {D, F} in this @@ -70,6 +69,7 @@ // To generally reduce this query to SSA construction we introduce // a virtual variable x and assign to x different values in each // successor block of A. +// // entry // / \ // A \ @@ -79,23 +79,41 @@ // D E // \ / // F +// // Our flavor of SSA construction for x will construct the following +// // entry // / \ // A \ // / \ Y // x0 = 0 x1 = 1 / // \ / \ / -// x2=phi E +// x2 = phi E // \ / -// x3=phi +// x3 = phi +// // The blocks D and F contain phi nodes and are thus each reachable // by two disjoins paths from A. // // -- Remarks -- -// In case of loop exits we need to check the disjoint path criterion for loops -// [2]. To this end, we check whether the definition of x differs between the -// loop exit and the loop header (_after_ SSA construction). +// * In case of loop exits we need to check the disjoint path criterion for loops. +// To this end, we check whether the definition of x differs between the +// loop exit and the loop header (_after_ SSA construction). +// +// -- Known Limitations & Future Work -- +// * The algorithm requires reducible loops because the implementation +// implicitly performs a single iteration of the underlying data flow analysis. +// This was done for pragmatism, simplicity and speed. +// +// Relevant related work for extending the algorithm to irreducible control: +// A simple algorithm for global data flow analysis problems. +// Matthew S. Hecht and Jeffrey D. Ullman. +// SIAM Journal on Computing, 4(4):519–532, December 1975. +// +// * Another reason for requiring reducible loops is that points of +// synchronization in irreducible loops aren't 'obvious' - there is no unique +// header where threads 'should' synchronize when entering or coming back +// around from the latch. 
// //===----------------------------------------------------------------------===// #include "llvm/Analysis/SyncDependenceAnalysis.h" @@ -128,8 +146,9 @@ using namespace llvm; // // We cannot use the vanilla (R)PO computation of LLVM because: // * We (virtually) modify the CFG. -// * We want a loop-compact block enumeration, that is the numbers assigned by -// the traveral to the blocks of a loop are an interval. +// * We want a loop-compact block enumeration, that is the numbers assigned to +// blocks of a loop form an interval +// using POCB = std::function<void(const BasicBlock &)>; using VisitedSet = std::set<const BasicBlock *>; using BlockStack = std::vector<const BasicBlock *>; diff --git a/contrib/llvm-project/llvm/lib/Analysis/TargetLibraryInfo.cpp b/contrib/llvm-project/llvm/lib/Analysis/TargetLibraryInfo.cpp index 7326ba74c071..72fbd5ad3f68 100644 --- a/contrib/llvm-project/llvm/lib/Analysis/TargetLibraryInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Analysis/TargetLibraryInfo.cpp @@ -166,8 +166,8 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, return; } - // memset_pattern16 is only available on iOS 3.0 and Mac OS X 10.5 and later. - // All versions of watchOS support it. + // memset_pattern{4,8,16} is only available on iOS 3.0 and Mac OS X 10.5 and + // later. All versions of watchOS support it. if (T.isMacOSX()) { // available IO unlocked variants on Mac OS X TLI.setAvailable(LibFunc_getc_unlocked); @@ -175,12 +175,20 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, TLI.setAvailable(LibFunc_putc_unlocked); TLI.setAvailable(LibFunc_putchar_unlocked); - if (T.isMacOSXVersionLT(10, 5)) + if (T.isMacOSXVersionLT(10, 5)) { + TLI.setUnavailable(LibFunc_memset_pattern4); + TLI.setUnavailable(LibFunc_memset_pattern8); TLI.setUnavailable(LibFunc_memset_pattern16); + } } else if (T.isiOS()) { - if (T.isOSVersionLT(3, 0)) + if (T.isOSVersionLT(3, 0)) { + TLI.setUnavailable(LibFunc_memset_pattern4); + TLI.setUnavailable(LibFunc_memset_pattern8); TLI.setUnavailable(LibFunc_memset_pattern16); + } } else if (!T.isWatchOS()) { + TLI.setUnavailable(LibFunc_memset_pattern4); + TLI.setUnavailable(LibFunc_memset_pattern8); TLI.setUnavailable(LibFunc_memset_pattern16); } @@ -684,7 +692,6 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, TLI.setUnavailable(LibFunc_strcat_chk); TLI.setUnavailable(LibFunc_strcpy_chk); TLI.setUnavailable(LibFunc_strlcat_chk); - TLI.setUnavailable(LibFunc_strlcat_chk); TLI.setUnavailable(LibFunc_strlcpy_chk); TLI.setUnavailable(LibFunc_strlen_chk); TLI.setUnavailable(LibFunc_strncat_chk); @@ -1523,6 +1530,8 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy, FTy.getParamType(2)->isPointerTy() && FTy.getParamType(3)->isIntegerTy()); + case LibFunc_memset_pattern4: + case LibFunc_memset_pattern8: case LibFunc_memset_pattern16: return (!FTy.isVarArg() && NumParams == 3 && FTy.getParamType(0)->isPointerTy() && diff --git a/contrib/llvm-project/llvm/lib/Analysis/VFABIDemangling.cpp b/contrib/llvm-project/llvm/lib/Analysis/VFABIDemangling.cpp index 8a34a34eb307..7573975a3dd3 100644 --- a/contrib/llvm-project/llvm/lib/Analysis/VFABIDemangling.cpp +++ b/contrib/llvm-project/llvm/lib/Analysis/VFABIDemangling.cpp @@ -445,7 +445,6 @@ Optional<VFInfo> VFABI::tryDemangleForVFABI(StringRef MangledName, VF = EC.getKnownMinValue(); } - // Sanity checks. // 1. We don't accept a zero lanes vectorization factor. // 2. 
We don't accept the demangling if the vector function is not // present in the module. diff --git a/contrib/llvm-project/llvm/lib/AsmParser/LLParser.cpp b/contrib/llvm-project/llvm/lib/AsmParser/LLParser.cpp index 5bce1eaa59a0..5feabd876e3a 100644 --- a/contrib/llvm-project/llvm/lib/AsmParser/LLParser.cpp +++ b/contrib/llvm-project/llvm/lib/AsmParser/LLParser.cpp @@ -124,8 +124,8 @@ void LLParser::restoreParsingState(const SlotMapping *Slots) { std::make_pair(I.first, std::make_pair(I.second, LocTy()))); } -/// validateEndOfModule - Do final validity and sanity checks at the end of the -/// module. +/// validateEndOfModule - Do final validity and basic correctness checks at the +/// end of the module. bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) { if (!M) return false; @@ -271,7 +271,7 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) { return false; } -/// Do final validity and sanity checks at the end of the index. +/// Do final validity and basic correctness checks at the end of the index. bool LLParser::validateEndOfIndex() { if (!Index) return false; @@ -2989,9 +2989,10 @@ BasicBlock *LLParser::PerFunctionState::defineBB(const std::string &Name, /// parseValID - parse an abstract value that doesn't necessarily have a /// type implied. For example, if we parse "4" we don't know what integer type /// it has. The value will later be combined with its type and checked for -/// sanity. PFS is used to convert function-local operands of metadata (since -/// metadata operands are not just parsed here but also converted to values). -/// PFS can be null when we are not parsing metadata values inside a function. +/// basic correctness. PFS is used to convert function-local operands of +/// metadata (since metadata operands are not just parsed here but also +/// converted to values). PFS can be null when we are not parsing metadata +/// values inside a function. bool LLParser::parseValID(ValID &ID, PerFunctionState *PFS, Type *ExpectedTy) { ID.Loc = Lex.getLoc(); switch (Lex.getKind()) { diff --git a/contrib/llvm-project/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/contrib/llvm-project/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp index 2723105b092f..d7bcb0d7f575 100644 --- a/contrib/llvm-project/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp +++ b/contrib/llvm-project/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp @@ -957,8 +957,8 @@ Error BitcodeAnalyzer::parseBlock(unsigned BlockID, unsigned IndentLevel, O->OS.write_escaped(Blob, /*hex=*/true) << "'"; } else { bool BlobIsPrintable = true; - for (unsigned i = 0, e = Blob.size(); i != e; ++i) - if (!isPrint(static_cast<unsigned char>(Blob[i]))) { + for (char C : Blob) + if (!isPrint(static_cast<unsigned char>(C))) { BlobIsPrintable = false; break; } diff --git a/contrib/llvm-project/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/contrib/llvm-project/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index c568461e62b0..993cb1de8c02 100644 --- a/contrib/llvm-project/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/contrib/llvm-project/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -3996,8 +3996,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { // See if anything took the address of blocks in this function. 
auto BBFRI = BasicBlockFwdRefs.find(F); if (BBFRI == BasicBlockFwdRefs.end()) { - for (unsigned i = 0, e = FunctionBBs.size(); i != e; ++i) - FunctionBBs[i] = BasicBlock::Create(Context, "", F); + for (BasicBlock *&BB : FunctionBBs) + BB = BasicBlock::Create(Context, "", F); } else { auto &BBRefs = BBFRI->second; // Check for invalid basic block references. @@ -4605,9 +4605,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { CaseVals.push_back(ConstantInt::get(Context, Low)); } BasicBlock *DestBB = getBasicBlock(Record[CurIdx++]); - for (SmallVector<ConstantInt*, 1>::iterator cvi = CaseVals.begin(), - cve = CaseVals.end(); cvi != cve; ++cvi) - SI->addCase(*cvi, DestBB); + for (ConstantInt *Cst : CaseVals) + SI->addCase(Cst, DestBB); } I = SI; break; diff --git a/contrib/llvm-project/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/contrib/llvm-project/llvm/lib/Bitcode/Reader/MetadataLoader.cpp index 6df5a4a64d51..60530d7f7a00 100644 --- a/contrib/llvm-project/llvm/lib/Bitcode/Reader/MetadataLoader.cpp +++ b/contrib/llvm-project/llvm/lib/Bitcode/Reader/MetadataLoader.cpp @@ -444,7 +444,8 @@ class MetadataLoader::MetadataLoaderImpl { uint64_t GlobalDeclAttachmentPos = 0; #ifndef NDEBUG - /// Sanity check that we end up parsing all of the global decl attachments. + /// Basic correctness check that we end up parsing all of the global decl + /// attachments. unsigned NumGlobalDeclAttachSkipped = 0; unsigned NumGlobalDeclAttachParsed = 0; #endif @@ -917,7 +918,7 @@ Expected<bool> MetadataLoader::MetadataLoaderImpl::loadGlobalDeclAttachments() { case BitstreamEntry::Error: return error("Malformed block"); case BitstreamEntry::EndBlock: - // Sanity check that we parsed them all. + // Check that we parsed them all. assert(NumGlobalDeclAttachSkipped == NumGlobalDeclAttachParsed); return true; case BitstreamEntry::Record: @@ -929,7 +930,7 @@ Expected<bool> MetadataLoader::MetadataLoaderImpl::loadGlobalDeclAttachments() { return MaybeCode.takeError(); if (MaybeCode.get() != bitc::METADATA_GLOBAL_DECL_ATTACHMENT) { // Anything other than a global decl attachment signals the end of - // these records. Check that we parsed them all. + // these records. Check that we parsed them all. assert(NumGlobalDeclAttachSkipped == NumGlobalDeclAttachParsed); return true; } diff --git a/contrib/llvm-project/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/contrib/llvm-project/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 1e9a9197aed7..e2354c40844a 100644 --- a/contrib/llvm-project/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/contrib/llvm-project/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -596,10 +596,10 @@ static void writeStringRecord(BitstreamWriter &Stream, unsigned Code, SmallVector<unsigned, 64> Vals; // Code: [strchar x N] - for (unsigned i = 0, e = Str.size(); i != e; ++i) { - if (AbbrevToUse && !BitCodeAbbrevOp::isChar6(Str[i])) + for (char C : Str) { + if (AbbrevToUse && !BitCodeAbbrevOp::isChar6(C)) AbbrevToUse = 0; - Vals.push_back(Str[i]); + Vals.push_back(C); } // Emit the finished record. @@ -914,8 +914,7 @@ void ModuleBitcodeWriter::writeTypeTable() { TypeVals.clear(); // Loop over all of the types, emitting each in turn. - for (unsigned i = 0, e = TypeList.size(); i != e; ++i) { - Type *T = TypeList[i]; + for (Type *T : TypeList) { int AbbrevToUse = 0; unsigned Code = 0; @@ -3343,19 +3342,18 @@ void ModuleBitcodeWriter::writeFunction( DILocation *LastDL = nullptr; // Finally, emit all the instructions, in order.
- for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB) - for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); - I != E; ++I) { - writeInstruction(*I, InstID, Vals); + for (const BasicBlock &BB : F) + for (const Instruction &I : BB) { + writeInstruction(I, InstID, Vals); - if (!I->getType()->isVoidTy()) + if (!I.getType()->isVoidTy()) ++InstID; // If the instruction has metadata, write a metadata attachment later. - NeedsMetadataAttachment |= I->hasMetadataOtherThanDebugLoc(); + NeedsMetadataAttachment |= I.hasMetadataOtherThanDebugLoc(); // If the instruction has a debug location, emit it. - DILocation *DL = I->getDebugLoc(); + DILocation *DL = I.getDebugLoc(); if (!DL) continue; @@ -4429,9 +4427,9 @@ void ModuleBitcodeWriter::write() { // Emit function bodies. DenseMap<const Function *, uint64_t> FunctionToBitcodeIndex; - for (Module::const_iterator F = M.begin(), E = M.end(); F != E; ++F) - if (!F->isDeclaration()) - writeFunction(*F, FunctionToBitcodeIndex); + for (const Function &F : M) + if (!F.isDeclaration()) + writeFunction(F, FunctionToBitcodeIndex); // Need to write after the above call to WriteFunction which populates // the summary information in the index. diff --git a/contrib/llvm-project/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp b/contrib/llvm-project/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp index 9465a3b11c8f..07e0708e68c3 100644 --- a/contrib/llvm-project/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp +++ b/contrib/llvm-project/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp @@ -1148,8 +1148,8 @@ void ValueEnumerator::purgeFunction() { ValueMap.erase(Values[i].first); for (unsigned i = NumModuleMDs, e = MDs.size(); i != e; ++i) MetadataMap.erase(MDs[i]); - for (unsigned i = 0, e = BasicBlocks.size(); i != e; ++i) - ValueMap.erase(BasicBlocks[i]); + for (const BasicBlock *BB : BasicBlocks) + ValueMap.erase(BB); Values.resize(NumModuleValues); MDs.resize(NumModuleMDs); diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp index 87a3cede601b..5984063627b0 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp @@ -354,8 +354,7 @@ void AggressiveAntiDepBreaker::PrescanInstruction( // dead, or because only a subregister is live at the def. If we // don't do this the dead def will be incorrectly merged into the // previous def. - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg() || !MO.isDef()) continue; Register Reg = MO.getReg(); if (Reg == 0) continue; @@ -407,8 +406,7 @@ void AggressiveAntiDepBreaker::PrescanInstruction( // Scan the register defs for this instruction and update // live-ranges. 
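This hunk and many of the later CodeGen hunks in this change are the same mechanical cleanup: index-based operand loops are replaced with the MachineInstr::operands() range. A tiny hypothetical helper, not taken from the patch, showing the idiom these hunks converge on:

  #include "llvm/CodeGen/MachineInstr.h"
  #include "llvm/CodeGen/MachineOperand.h"

  // Count the register defs of MI using range-based operand iteration.
  static unsigned countRegDefs(const llvm::MachineInstr &MI) {
    unsigned NumDefs = 0;
    for (const llvm::MachineOperand &MO : MI.operands())
      if (MO.isReg() && MO.isDef())
        ++NumDefs;
    return NumDefs;
  }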
- for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg() || !MO.isDef()) continue; Register Reg = MO.getReg(); if (Reg == 0) continue; @@ -495,8 +493,7 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr &MI, LLVM_DEBUG(dbgs() << "\tKill Group:"); unsigned FirstReg = 0; - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; Register Reg = MO.getReg(); if (Reg == 0) continue; @@ -762,11 +759,8 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies( // ...need a map from MI to SUnit. std::map<MachineInstr *, const SUnit *> MISUnitMap; - for (unsigned i = 0, e = SUnits.size(); i != e; ++i) { - const SUnit *SU = &SUnits[i]; - MISUnitMap.insert(std::pair<MachineInstr *, const SUnit *>(SU->getInstr(), - SU)); - } + for (const SUnit &SU : SUnits) + MISUnitMap.insert(std::make_pair(SU.getInstr(), &SU)); // Track progress along the critical path through the SUnit graph as // we walk the instructions. This is needed for regclasses that only @@ -774,12 +768,11 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies( const SUnit *CriticalPathSU = nullptr; MachineInstr *CriticalPathMI = nullptr; if (CriticalPathSet.any()) { - for (unsigned i = 0, e = SUnits.size(); i != e; ++i) { - const SUnit *SU = &SUnits[i]; + for (const SUnit &SU : SUnits) { if (!CriticalPathSU || - ((SU->getDepth() + SU->Latency) > + ((SU.getDepth() + SU.Latency) > (CriticalPathSU->getDepth() + CriticalPathSU->Latency))) { - CriticalPathSU = SU; + CriticalPathSU = &SU; } } assert(CriticalPathSU && "Failed to find SUnit critical path"); @@ -839,8 +832,7 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies( // but don't cause any anti-dependence breaking themselves) if (!MI.isKill()) { // Attempt to break each anti-dependency... - for (unsigned i = 0, e = Edges.size(); i != e; ++i) { - const SDep *Edge = Edges[i]; + for (const SDep *Edge : Edges) { SUnit *NextSU = Edge->getSUnit(); if ((Edge->getKind() != SDep::Anti) && diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index cc848d28a9a7..828cb760b82e 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -809,9 +809,9 @@ void AsmPrinter::emitFunctionHeader() { // so that we don't get references to undefined symbols. std::vector<MCSymbol*> DeadBlockSyms; MMI->takeDeletedSymbolsForFunction(&F, DeadBlockSyms); - for (unsigned i = 0, e = DeadBlockSyms.size(); i != e; ++i) { + for (MCSymbol *DeadBlockSym : DeadBlockSyms) { OutStreamer->AddComment("Address taken block that was later removed"); - OutStreamer->emitLabel(DeadBlockSyms[i]); + OutStreamer->emitLabel(DeadBlockSym); } if (CurrentFnBegin) { @@ -910,8 +910,7 @@ static void emitKill(const MachineInstr *MI, AsmPrinter &AP) { std::string Str; raw_string_ostream OS(Str); OS << "kill:"; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &Op = MI->getOperand(i); + for (const MachineOperand &Op : MI->operands()) { assert(Op.isReg() && "KILL instruction must have only register operands"); OS << ' ' << (Op.isDef() ? 
"def " : "killed ") << printReg(Op.getReg(), AP.MF->getSubtarget().getRegisterInfo()); @@ -2150,8 +2149,7 @@ void AsmPrinter::emitJumpTableInfo() { SmallPtrSet<const MachineBasicBlock*, 16> EmittedSets; const TargetLowering *TLI = MF->getSubtarget().getTargetLowering(); const MCExpr *Base = TLI->getPICJumpTableRelocBaseExpr(MF,JTI,OutContext); - for (unsigned ii = 0, ee = JTBBs.size(); ii != ee; ++ii) { - const MachineBasicBlock *MBB = JTBBs[ii]; + for (const MachineBasicBlock *MBB : JTBBs) { if (!EmittedSets.insert(MBB).second) continue; @@ -2177,8 +2175,8 @@ void AsmPrinter::emitJumpTableInfo() { MCSymbol* JTISymbol = GetJTISymbol(JTI); OutStreamer->emitLabel(JTISymbol); - for (unsigned ii = 0, ee = JTBBs.size(); ii != ee; ++ii) - emitJumpTableEntry(MJTI, JTBBs[ii], JTI); + for (const MachineBasicBlock *MBB : JTBBs) + emitJumpTableEntry(MJTI, MBB, JTI); } if (!JTInDiffSection) OutStreamer->emitDataRegion(MCDR_DataRegionEnd); diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp index ef1abc47701a..5d0cadefdbf7 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp @@ -128,191 +128,29 @@ void AsmPrinter::emitInlineAsm(StringRef Str, const MCSubtargetInfo &STI, emitInlineAsmEnd(STI, &TAP->getSTI()); } -static void EmitMSInlineAsmStr(const char *AsmStr, const MachineInstr *MI, - MachineModuleInfo *MMI, const MCAsmInfo *MAI, - AsmPrinter *AP, uint64_t LocCookie, - raw_ostream &OS) { - // Switch to the inline assembly variant. - OS << "\t.intel_syntax\n\t"; - - int CurVariant = -1; // The number of the {.|.|.} region we are in. - const char *LastEmitted = AsmStr; // One past the last character emitted. - unsigned NumOperands = MI->getNumOperands(); - int AsmPrinterVariant = 1; // X86MCAsmInfo.cpp's AsmWriterFlavorTy::Intel. - - while (*LastEmitted) { - switch (*LastEmitted) { - default: { - // Not a special case, emit the string section literally. - const char *LiteralEnd = LastEmitted+1; - while (*LiteralEnd && *LiteralEnd != '{' && *LiteralEnd != '|' && - *LiteralEnd != '}' && *LiteralEnd != '$' && *LiteralEnd != '\n') - ++LiteralEnd; - if (CurVariant == -1 || CurVariant == AsmPrinterVariant) - OS.write(LastEmitted, LiteralEnd - LastEmitted); - LastEmitted = LiteralEnd; - break; - } - case '\n': - ++LastEmitted; // Consume newline character. - OS << '\n'; // Indent code with newline. - break; - case '$': { - ++LastEmitted; // Consume '$' character. - bool Done = true; - - // Handle escapes. - switch (*LastEmitted) { - default: Done = false; break; - case '$': - ++LastEmitted; // Consume second '$' character. - break; - case '(': // $( -> same as GCC's { character. - ++LastEmitted; // Consume '(' character. - if (CurVariant != -1) - report_fatal_error("Nested variants found in inline asm string: '" + - Twine(AsmStr) + "'"); - CurVariant = 0; // We're in the first variant now. - break; - case '|': - ++LastEmitted; // Consume '|' character. - if (CurVariant == -1) - OS << '|'; // This is gcc's behavior for | outside a variant. - else - ++CurVariant; // We're in the next variant. - break; - case ')': // $) -> same as GCC's } char. - ++LastEmitted; // Consume ')' character. - if (CurVariant == -1) - OS << '}'; // This is gcc's behavior for } outside a variant. 
- else - CurVariant = -1; - break; - } - if (Done) break; - - bool HasCurlyBraces = false; - if (*LastEmitted == '{') { // ${variable} - ++LastEmitted; // Consume '{' character. - HasCurlyBraces = true; - } - - // If we have ${:foo}, then this is not a real operand reference, it is a - // "magic" string reference, just like in .td files. Arrange to call - // PrintSpecial. - if (HasCurlyBraces && *LastEmitted == ':') { - ++LastEmitted; - const char *StrStart = LastEmitted; - const char *StrEnd = strchr(StrStart, '}'); - if (!StrEnd) - report_fatal_error("Unterminated ${:foo} operand in inline asm" - " string: '" + Twine(AsmStr) + "'"); - if (CurVariant == -1 || CurVariant == AsmPrinterVariant) - AP->PrintSpecial(MI, OS, StringRef(StrStart, StrEnd - StrStart)); - LastEmitted = StrEnd+1; - break; - } - - const char *IDStart = LastEmitted; - const char *IDEnd = IDStart; - while (isDigit(*IDEnd)) - ++IDEnd; - - unsigned Val; - if (StringRef(IDStart, IDEnd-IDStart).getAsInteger(10, Val)) - report_fatal_error("Bad $ operand number in inline asm string: '" + - Twine(AsmStr) + "'"); - LastEmitted = IDEnd; - - if (Val >= NumOperands - 1) - report_fatal_error("Invalid $ operand number in inline asm string: '" + - Twine(AsmStr) + "'"); - - char Modifier[2] = { 0, 0 }; - - if (HasCurlyBraces) { - // If we have curly braces, check for a modifier character. This - // supports syntax like ${0:u}, which correspond to "%u0" in GCC asm. - if (*LastEmitted == ':') { - ++LastEmitted; // Consume ':' character. - if (*LastEmitted == 0) - report_fatal_error("Bad ${:} expression in inline asm string: '" + - Twine(AsmStr) + "'"); - - Modifier[0] = *LastEmitted; - ++LastEmitted; // Consume modifier character. - } - - if (*LastEmitted != '}') - report_fatal_error("Bad ${} expression in inline asm string: '" + - Twine(AsmStr) + "'"); - ++LastEmitted; // Consume '}' character. - } - - // Okay, we finally have a value number. Ask the target to print this - // operand! - if (CurVariant == -1 || CurVariant == AsmPrinterVariant) { - unsigned OpNo = InlineAsm::MIOp_FirstOperand; - - bool Error = false; - - // Scan to find the machine operand number for the operand. - for (; Val; --Val) { - if (OpNo >= MI->getNumOperands()) - break; - unsigned OpFlags = MI->getOperand(OpNo).getImm(); - OpNo += InlineAsm::getNumOperandRegisters(OpFlags) + 1; - } - - // We may have a location metadata attached to the end of the - // instruction, and at no point should see metadata at any - // other point while processing. It's an error if so. - if (OpNo >= MI->getNumOperands() || MI->getOperand(OpNo).isMetadata()) { - Error = true; - } else { - unsigned OpFlags = MI->getOperand(OpNo).getImm(); - ++OpNo; // Skip over the ID number. - - // FIXME: Shouldn't arch-independent output template handling go into - // PrintAsmOperand? - // Labels are target independent. - if (MI->getOperand(OpNo).isBlockAddress()) { - const BlockAddress *BA = MI->getOperand(OpNo).getBlockAddress(); - MCSymbol *Sym = AP->GetBlockAddressSymbol(BA); - Sym->print(OS, AP->MAI); - MMI->getContext().registerInlineAsmLabel(Sym); - } else if (InlineAsm::isMemKind(OpFlags)) { - Error = AP->PrintAsmMemoryOperand( - MI, OpNo, Modifier[0] ? Modifier : nullptr, OS); - } else { - Error = AP->PrintAsmOperand(MI, OpNo, - Modifier[0] ? 
Modifier : nullptr, OS); - } - } - if (Error) { - std::string msg; - raw_string_ostream Msg(msg); - Msg << "invalid operand in inline asm: '" << AsmStr << "'"; - MMI->getModule()->getContext().emitError(LocCookie, Msg.str()); - } - } - break; - } - } +static void EmitInlineAsmStr(const char *AsmStr, const MachineInstr *MI, + MachineModuleInfo *MMI, const MCAsmInfo *MAI, + AsmPrinter *AP, uint64_t LocCookie, + raw_ostream &OS) { + bool InputIsIntelDialect = MI->getInlineAsmDialect() == InlineAsm::AD_Intel; + + if (InputIsIntelDialect) { + // Switch to the inline assembly variant. + OS << "\t.intel_syntax\n\t"; } - OS << "\n\t.att_syntax\n" << (char)0; // null terminate string. -} -static void EmitGCCInlineAsmStr(const char *AsmStr, const MachineInstr *MI, - MachineModuleInfo *MMI, const MCAsmInfo *MAI, - AsmPrinter *AP, uint64_t LocCookie, - raw_ostream &OS) { int CurVariant = -1; // The number of the {.|.|.} region we are in. const char *LastEmitted = AsmStr; // One past the last character emitted. unsigned NumOperands = MI->getNumOperands(); - int AsmPrinterVariant = MMI->getTarget().unqualifiedInlineAsmVariant(); - if (MAI->getEmitGNUAsmStartIndentationMarker()) + int AsmPrinterVariant; + if (InputIsIntelDialect) + AsmPrinterVariant = 1; // X86MCAsmInfo.cpp's AsmWriterFlavorTy::Intel. + else + AsmPrinterVariant = MMI->getTarget().unqualifiedInlineAsmVariant(); + + // FIXME: Should this happen for `asm inteldialect` as well? + if (!InputIsIntelDialect && MAI->getEmitGNUAsmStartIndentationMarker()) OS << '\t'; while (*LastEmitted) { @@ -340,8 +178,9 @@ static void EmitGCCInlineAsmStr(const char *AsmStr, const MachineInstr *MI, switch (*LastEmitted) { default: Done = false; break; case '$': // $$ -> $ - if (CurVariant == -1 || CurVariant == AsmPrinterVariant) - OS << '$'; + if (!InputIsIntelDialect) + if (CurVariant == -1 || CurVariant == AsmPrinterVariant) + OS << '$'; ++LastEmitted; // Consume second '$' character. break; case '(': // $( -> same as GCC's { character. @@ -480,6 +319,8 @@ static void EmitGCCInlineAsmStr(const char *AsmStr, const MachineInstr *MI, } } } + if (InputIsIntelDialect) + OS << "\n\t.att_syntax"; OS << '\n' << (char)0; // null terminate string. } @@ -515,9 +356,8 @@ void AsmPrinter::emitInlineAsm(const MachineInstr *MI) const { // it. uint64_t LocCookie = 0; const MDNode *LocMD = nullptr; - for (unsigned i = MI->getNumOperands(); i != 0; --i) { - if (MI->getOperand(i-1).isMetadata() && - (LocMD = MI->getOperand(i-1).getMetadata()) && + for (const MachineOperand &MO : llvm::reverse(MI->operands())) { + if (MO.isMetadata() && (LocMD = MO.getMetadata()) && LocMD->getNumOperands() != 0) { if (const ConstantInt *CI = mdconst::dyn_extract<ConstantInt>(LocMD->getOperand(0))) { @@ -533,10 +373,7 @@ void AsmPrinter::emitInlineAsm(const MachineInstr *MI) const { raw_svector_ostream OS(StringData); AsmPrinter *AP = const_cast<AsmPrinter*>(this); - if (MI->getInlineAsmDialect() == InlineAsm::AD_ATT) - EmitGCCInlineAsmStr(AsmStr, MI, MMI, MAI, AP, LocCookie, OS); - else - EmitMSInlineAsmStr(AsmStr, MI, MMI, MAI, AP, LocCookie, OS); + EmitInlineAsmStr(AsmStr, MI, MMI, MAI, AP, LocCookie, OS); // Emit warnings if we use reserved registers on the clobber list, as // that might lead to undefined behaviour. 
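The net effect of folding EmitMSInlineAsmStr and EmitGCCInlineAsmStr into the single EmitInlineAsmStr above is that dialect-specific behaviour is now selected per instruction: Intel-dialect blobs are bracketed by .intel_syntax/.att_syntax and their '$$' escapes emit nothing, while AT&T blobs get the optional GNU indentation marker and unescape '$$' to '$'. A minimal sketch of just the bracketing step, as a hypothetical free function rather than the AsmPrinter interface, with template and operand expansion elided:

  #include <string>

  std::string wrapInlineAsm(const std::string &ExpandedBody, bool IsIntelDialect,
                            bool EmitGNUIndentMarker) {
    std::string Out;
    if (IsIntelDialect)
      Out += "\t.intel_syntax\n\t";  // switch to the inline assembly variant
    else if (EmitGNUIndentMarker)
      Out += "\t";                   // GNU-style start-of-asm indentation
    Out += ExpandedBody;             // operand/variant expansion elided here
    if (IsIntelDialect)
      Out += "\n\t.att_syntax";      // switch back after the Intel block
    Out += "\n";
    return Out;
  }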
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index a36d2966d44a..9b73f0ab2f05 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -521,8 +521,8 @@ DIE &DwarfCompileUnit::updateSubprogramScopeDIE(const DISubprogram *SP) { } // Construct a DIE for this scope. -void DwarfCompileUnit::constructScopeDIE( - LexicalScope *Scope, SmallVectorImpl<DIE *> &FinalChildren) { +void DwarfCompileUnit::constructScopeDIE(LexicalScope *Scope, + DIE &ParentScopeDIE) { if (!Scope || !Scope->getScopeNode()) return; @@ -533,46 +533,27 @@ void DwarfCompileUnit::constructScopeDIE( "constructSubprogramScopeDIE for non-inlined " "subprograms"); - SmallVector<DIE *, 8> Children; - - // We try to create the scope DIE first, then the children DIEs. This will - // avoid creating un-used children then removing them later when we find out - // the scope DIE is null. - DIE *ScopeDIE; + // Emit inlined subprograms. if (Scope->getParent() && isa<DISubprogram>(DS)) { - ScopeDIE = constructInlinedScopeDIE(Scope); + DIE *ScopeDIE = constructInlinedScopeDIE(Scope); if (!ScopeDIE) return; - // We create children when the scope DIE is not null. - createScopeChildrenDIE(Scope, Children); - } else { - // Early exit when we know the scope DIE is going to be null. - if (DD->isLexicalScopeDIENull(Scope)) - return; - - bool HasNonScopeChildren = false; - // We create children here when we know the scope DIE is not going to be - // null and the children will be added to the scope DIE. - createScopeChildrenDIE(Scope, Children, &HasNonScopeChildren); - - // If there are only other scopes as children, put them directly in the - // parent instead, as this scope would serve no purpose. - if (!HasNonScopeChildren) { - FinalChildren.insert(FinalChildren.end(), - std::make_move_iterator(Children.begin()), - std::make_move_iterator(Children.end())); - return; - } - ScopeDIE = constructLexicalScopeDIE(Scope); - assert(ScopeDIE && "Scope DIE should not be null."); + ParentScopeDIE.addChild(ScopeDIE); + createAndAddScopeChildren(Scope, *ScopeDIE); + return; } - // Add children - for (auto &I : Children) - ScopeDIE->addChild(std::move(I)); + // Early exit when we know the scope DIE is going to be null. + if (DD->isLexicalScopeDIENull(Scope)) + return; + + // Emit lexical blocks. + DIE *ScopeDIE = constructLexicalScopeDIE(Scope); + assert(ScopeDIE && "Scope DIE should not be null."); - FinalChildren.push_back(std::move(ScopeDIE)); + ParentScopeDIE.addChild(ScopeDIE); + createAndAddScopeChildren(Scope, *ScopeDIE); } void DwarfCompileUnit::addScopeRangeList(DIE &ScopeDIE, @@ -1022,42 +1003,6 @@ sortLocalVars(SmallVectorImpl<DbgVariable *> &Input) { return Result; } -DIE *DwarfCompileUnit::createScopeChildrenDIE(LexicalScope *Scope, - SmallVectorImpl<DIE *> &Children, - bool *HasNonScopeChildren) { - assert(Children.empty()); - DIE *ObjectPointer = nullptr; - - // Emit function arguments (order is significant). - auto Vars = DU->getScopeVariables().lookup(Scope); - for (auto &DV : Vars.Args) - Children.push_back(constructVariableDIE(*DV.second, *Scope, ObjectPointer)); - - // Emit local variables. - auto Locals = sortLocalVars(Vars.Locals); - for (DbgVariable *DV : Locals) - Children.push_back(constructVariableDIE(*DV, *Scope, ObjectPointer)); - - // Skip imported directives in gmlt-like data. 
- if (!includeMinimalInlineScopes()) { - // There is no need to emit empty lexical block DIE. - for (const auto *IE : ImportedEntities[Scope->getScopeNode()]) - Children.push_back( - constructImportedEntityDIE(cast<DIImportedEntity>(IE))); - } - - if (HasNonScopeChildren) - *HasNonScopeChildren = !Children.empty(); - - for (DbgLabel *DL : DU->getScopeLabels().lookup(Scope)) - Children.push_back(constructLabelDIE(*DL, *Scope)); - - for (LexicalScope *LS : Scope->getChildren()) - constructScopeDIE(LS, Children); - - return ObjectPointer; -} - DIE &DwarfCompileUnit::constructSubprogramScopeDIE(const DISubprogram *Sub, LexicalScope *Scope) { DIE &ScopeDIE = updateSubprogramScopeDIE(Sub); @@ -1088,13 +1033,48 @@ DIE &DwarfCompileUnit::constructSubprogramScopeDIE(const DISubprogram *Sub, DIE *DwarfCompileUnit::createAndAddScopeChildren(LexicalScope *Scope, DIE &ScopeDIE) { - // We create children when the scope DIE is not null. - SmallVector<DIE *, 8> Children; - DIE *ObjectPointer = createScopeChildrenDIE(Scope, Children); + DIE *ObjectPointer = nullptr; + + // Emit function arguments (order is significant). + auto Vars = DU->getScopeVariables().lookup(Scope); + for (auto &DV : Vars.Args) + ScopeDIE.addChild(constructVariableDIE(*DV.second, *Scope, ObjectPointer)); + + // Emit local variables. + auto Locals = sortLocalVars(Vars.Locals); + for (DbgVariable *DV : Locals) + ScopeDIE.addChild(constructVariableDIE(*DV, *Scope, ObjectPointer)); + + // Emit imported entities (skipped in gmlt-like data). + if (!includeMinimalInlineScopes()) { + for (const auto *IE : ImportedEntities[Scope->getScopeNode()]) + ScopeDIE.addChild(constructImportedEntityDIE(cast<DIImportedEntity>(IE))); + } + + // Emit labels. + for (DbgLabel *DL : DU->getScopeLabels().lookup(Scope)) + ScopeDIE.addChild(constructLabelDIE(*DL, *Scope)); - // Add children - for (auto &I : Children) - ScopeDIE.addChild(std::move(I)); + // Emit inner lexical scopes. + auto needToEmitLexicalScope = [this](LexicalScope *LS) { + if (isa<DISubprogram>(LS->getScopeNode())) + return true; + auto Vars = DU->getScopeVariables().lookup(LS); + if (!Vars.Args.empty() || !Vars.Locals.empty()) + return true; + if (!includeMinimalInlineScopes() && + !ImportedEntities[LS->getScopeNode()].empty()) + return true; + return false; + }; + for (LexicalScope *LS : Scope->getChildren()) { + // If the lexical block doesn't have non-scope children, skip + // its emission and put its children directly to the parent scope. + if (needToEmitLexicalScope(LS)) + constructScopeDIE(LS, ScopeDIE); + else + createAndAddScopeChildren(LS, ScopeDIE); + } return ObjectPointer; } diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h index 6e9261087686..fb03982b5e4a 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h +++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h @@ -191,8 +191,7 @@ public: /// variables. DIE &updateSubprogramScopeDIE(const DISubprogram *SP); - void constructScopeDIE(LexicalScope *Scope, - SmallVectorImpl<DIE *> &FinalChildren); + void constructScopeDIE(LexicalScope *Scope, DIE &ParentScopeDIE); /// A helper function to construct a RangeSpanList for a given /// lexical scope. @@ -220,11 +219,6 @@ public: /// Construct a DIE for the given DbgLabel. DIE *constructLabelDIE(DbgLabel &DL, const LexicalScope &Scope); - /// A helper function to create children of a Scope DIE. 
- DIE *createScopeChildrenDIE(LexicalScope *Scope, - SmallVectorImpl<DIE *> &Children, - bool *HasNonScopeChildren = nullptr); - void createBaseTypeDIEs(); /// Construct a DIE for this subprogram scope. diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp index 150f19324834..39f40b172c1b 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp @@ -162,9 +162,7 @@ bool EHStreamer::callToNoUnwindFunction(const MachineInstr *MI) { bool MarkedNoUnwind = false; bool SawFunc = false; - for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) { - const MachineOperand &MO = MI->getOperand(I); - + for (const MachineOperand &MO : MI->operands()) { if (!MO.isGlobal()) continue; const Function *F = dyn_cast<Function>(MO.getGlobal()); @@ -386,8 +384,8 @@ MCSymbol *EHStreamer::emitExceptionTable() { SmallVector<const LandingPadInfo *, 64> LandingPads; LandingPads.reserve(PadInfos.size()); - for (unsigned i = 0, N = PadInfos.size(); i != N; ++i) - LandingPads.push_back(&PadInfos[i]); + for (const LandingPadInfo &LPI : PadInfos) + LandingPads.push_back(&LPI); // Order landing pads lexicographically by type id. llvm::sort(LandingPads, [](const LandingPadInfo *L, const LandingPadInfo *R) { diff --git a/contrib/llvm-project/llvm/lib/CodeGen/BranchFolding.cpp b/contrib/llvm-project/llvm/lib/CodeGen/BranchFolding.cpp index 5ac8f49a9522..64dadc82b48b 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/BranchFolding.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/BranchFolding.cpp @@ -1013,8 +1013,8 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { // If this is a large problem, avoid visiting the same basic blocks // multiple times. if (MergePotentials.size() == TailMergeThreshold) - for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i) - TriedMerging.insert(MergePotentials[i].getBlock()); + for (const MergePotentialsElt &Elt : MergePotentials) + TriedMerging.insert(Elt.getBlock()); // See if we can do any tail merging on those. if (MergePotentials.size() >= 2) diff --git a/contrib/llvm-project/llvm/lib/CodeGen/BranchRelaxation.cpp b/contrib/llvm-project/llvm/lib/CodeGen/BranchRelaxation.cpp index 50825ccf9bac..eda0f37fdeb7 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/BranchRelaxation.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/BranchRelaxation.cpp @@ -513,9 +513,7 @@ bool BranchRelaxation::relaxBranchInstructions() { // Relaxing branches involves creating new basic blocks, so re-eval // end() for termination. - for (MachineFunction::iterator I = MF->begin(); I != MF->end(); ++I) { - MachineBasicBlock &MBB = *I; - + for (MachineBasicBlock &MBB : *MF) { // Empty block? 
MachineBasicBlock::iterator Last = MBB.getLastNonDebugInstr(); if (Last == MBB.end()) diff --git a/contrib/llvm-project/llvm/lib/CodeGen/CodeGen.cpp b/contrib/llvm-project/llvm/lib/CodeGen/CodeGen.cpp index e0e2db9f4725..bbdd8aab502e 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/CodeGen.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/CodeGen.cpp @@ -58,8 +58,10 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeLiveVariablesPass(Registry); initializeLocalStackSlotPassPass(Registry); initializeLowerIntrinsicsPass(Registry); + initializeMIRAddFSDiscriminatorsPass(Registry); initializeMIRCanonicalizerPass(Registry); initializeMIRNamerPass(Registry); + initializeMIRProfileLoaderPassPass(Registry); initializeMachineBlockFrequencyInfoPass(Registry); initializeMachineBlockPlacementPass(Registry); initializeMachineBlockPlacementStatsPass(Registry); diff --git a/contrib/llvm-project/llvm/lib/CodeGen/CommandFlags.cpp b/contrib/llvm-project/llvm/lib/CodeGen/CommandFlags.cpp index a1ff02178ffa..3bed81d5841d 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/CommandFlags.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/CommandFlags.cpp @@ -90,7 +90,7 @@ CGOPT(bool, EnableAddrsig) CGOPT(bool, EmitCallSiteInfo) CGOPT(bool, EnableMachineFunctionSplitter) CGOPT(bool, EnableDebugEntryValues) -CGOPT(bool, ValueTrackingVariableLocations) +CGOPT_EXP(bool, ValueTrackingVariableLocations) CGOPT(bool, ForceDwarfFrameSection) CGOPT(bool, XRayOmitFunctionIndex) CGOPT(bool, DebugStrictDwarf) @@ -534,12 +534,17 @@ codegen::InitTargetOptionsFromCodeGenFlags(const Triple &TheTriple) { Options.EmitAddrsig = getEnableAddrsig(); Options.EmitCallSiteInfo = getEmitCallSiteInfo(); Options.EnableDebugEntryValues = getEnableDebugEntryValues(); - Options.ValueTrackingVariableLocations = getValueTrackingVariableLocations(); Options.ForceDwarfFrameSection = getForceDwarfFrameSection(); Options.XRayOmitFunctionIndex = getXRayOmitFunctionIndex(); Options.DebugStrictDwarf = getDebugStrictDwarf(); Options.LoopAlignment = getAlignLoops(); + if (auto Opt = getExplicitValueTrackingVariableLocations()) + Options.ValueTrackingVariableLocations = *Opt; + else + Options.ValueTrackingVariableLocations = + getDefaultValueTrackingVariableLocations(TheTriple); + Options.MCOptions = mc::InitMCTargetOptionsFromFlags(); Options.ThreadModel = getThreadModel(); @@ -692,3 +697,9 @@ void codegen::setFunctionAttributes(StringRef CPU, StringRef Features, for (Function &F : M) setFunctionAttributes(CPU, Features, F); } + +bool codegen::getDefaultValueTrackingVariableLocations(const llvm::Triple &T) { + if (T.getArch() == llvm::Triple::x86_64) + return true; + return false; +} diff --git a/contrib/llvm-project/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp b/contrib/llvm-project/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp index 981f5973fee8..4e98d49206b5 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp @@ -370,9 +370,7 @@ CriticalAntiDepBreaker::isNewRegClobberedByRefs(RegRefIter RegRefBegin, // Handle cases in which this instruction defines NewReg. 
MachineInstr *MI = RefOper->getParent(); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &CheckOper = MI->getOperand(i); - + for (const MachineOperand &CheckOper : MI->operands()) { if (CheckOper.isRegMask() && CheckOper.clobbersPhysReg(NewReg)) return true; @@ -462,11 +460,10 @@ BreakAntiDependencies(const std::vector<SUnit> &SUnits, // Find the node at the bottom of the critical path. const SUnit *Max = nullptr; - for (unsigned i = 0, e = SUnits.size(); i != e; ++i) { - const SUnit *SU = &SUnits[i]; - MISUnitMap[SU->getInstr()] = SU; - if (!Max || SU->getDepth() + SU->Latency > Max->getDepth() + Max->Latency) - Max = SU; + for (const SUnit &SU : SUnits) { + MISUnitMap[SU.getInstr()] = &SU; + if (!Max || SU.getDepth() + SU.Latency > Max->getDepth() + Max->Latency) + Max = &SU; } assert(Max && "Failed to find bottom of the critical path"); @@ -621,8 +618,7 @@ BreakAntiDependencies(const std::vector<SUnit> &SUnits, // is invalid. If the instruction defines other registers, // save a list of them so that we don't pick a new register // that overlaps any of them. - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; Register Reg = MO.getReg(); if (Reg == 0) continue; diff --git a/contrib/llvm-project/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp b/contrib/llvm-project/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp index c6c0b79cd7e7..0bb186a02416 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp @@ -76,8 +76,7 @@ bool DeadMachineInstructionElim::isDead(const MachineInstr *MI) const { return false; // Examine each operand. - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (MO.isReg() && MO.isDef()) { Register Reg = MO.getReg(); if (Register::isPhysicalRegister(Reg)) { @@ -87,7 +86,7 @@ bool DeadMachineInstructionElim::isDead(const MachineInstr *MI) const { } else { if (MO.isDead()) { #ifndef NDEBUG - // Sanity check on uses of this dead register. All of them should be + // Basic check on uses of this dead register. All of them should be // 'undef'. for (auto &U : MRI->use_nodbg_operands(Reg)) assert(U.isUndef() && "'Undef' use on a 'dead' register is found!"); @@ -152,8 +151,7 @@ bool DeadMachineInstructionElim::eliminateDeadMI(MachineFunction &MF) { } // Record the physreg defs. - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { if (MO.isReg() && MO.isDef()) { Register Reg = MO.getReg(); if (Register::isPhysicalRegister(Reg)) { @@ -171,8 +169,7 @@ bool DeadMachineInstructionElim::eliminateDeadMI(MachineFunction &MF) { } // Record the physreg uses, after the defs, in case a physreg is // both defined and used in the same instruction.
- for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { if (MO.isReg() && MO.isUse()) { Register Reg = MO.getReg(); if (Register::isPhysicalRegister(Reg)) { diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 3a52959d54bf..755b3b844570 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -27,6 +27,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetLowering.h" +#include "llvm/Target/TargetMachine.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/IR/DataLayout.h" #include "llvm/Support/Casting.h" @@ -3732,8 +3733,7 @@ void CombinerHelper::applyExtendThroughPhis(MachineInstr &MI, Builder.setInstrAndDebugLoc(MI); auto NewPhi = Builder.buildInstrNoInsert(TargetOpcode::G_PHI); NewPhi.addDef(DstReg); - for (unsigned SrcIdx = 1; SrcIdx < MI.getNumOperands(); ++SrcIdx) { - auto &MO = MI.getOperand(SrcIdx); + for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) { if (!MO.isReg()) { NewPhi.addMBB(MO.getMBB()); continue; @@ -3825,8 +3825,7 @@ bool CombinerHelper::matchExtractAllEltsFromBuildVector( unsigned NumElts = DstTy.getNumElements(); SmallBitVector ExtractedElts(NumElts); - for (auto &II : make_range(MRI.use_instr_nodbg_begin(DstReg), - MRI.use_instr_nodbg_end())) { + for (MachineInstr &II : MRI.use_nodbg_instructions(DstReg)) { if (II.getOpcode() != TargetOpcode::G_EXTRACT_VECTOR_ELT) return false; auto Cst = getIConstantVRegVal(II.getOperand(2).getReg(), MRI); @@ -3868,6 +3867,51 @@ void CombinerHelper::applyBuildFnNoErase( MatchInfo(Builder); } +bool CombinerHelper::matchOrShiftToFunnelShift(MachineInstr &MI, + BuildFnTy &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_OR); + + Register Dst = MI.getOperand(0).getReg(); + LLT Ty = MRI.getType(Dst); + unsigned BitWidth = Ty.getScalarSizeInBits(); + + Register ShlSrc, ShlAmt, LShrSrc, LShrAmt; + unsigned FshOpc = 0; + + // Match (or (shl x, amt), (lshr y, sub(bw, amt))). + if (mi_match( + Dst, MRI, + // m_GOr() handles the commuted version as well. + m_GOr(m_GShl(m_Reg(ShlSrc), m_Reg(ShlAmt)), + m_GLShr(m_Reg(LShrSrc), m_GSub(m_SpecificICstOrSplat(BitWidth), + m_Reg(LShrAmt)))))) { + FshOpc = TargetOpcode::G_FSHL; + + // Match (or (shl x, sub(bw, amt)), (lshr y, amt)). + } else if (mi_match(Dst, MRI, + m_GOr(m_GLShr(m_Reg(LShrSrc), m_Reg(LShrAmt)), + m_GShl(m_Reg(ShlSrc), + m_GSub(m_SpecificICstOrSplat(BitWidth), + m_Reg(ShlAmt)))))) { + FshOpc = TargetOpcode::G_FSHR; + + } else { + return false; + } + + if (ShlAmt != LShrAmt) + return false; + + LLT AmtTy = MRI.getType(ShlAmt); + if (!isLegalOrBeforeLegalizer({FshOpc, {Ty, AmtTy}})) + return false; + + MatchInfo = [=](MachineIRBuilder &B) { + B.buildInstr(FshOpc, {Dst}, {ShlSrc, LShrSrc, ShlAmt}); + }; + return true; +} + /// Match an FSHL or FSHR that can be combined to a ROTR or ROTL rotate. bool CombinerHelper::matchFunnelShiftToRotate(MachineInstr &MI) { unsigned Opc = MI.getOpcode(); @@ -4499,20 +4543,9 @@ bool CombinerHelper::matchNarrowBinopFeedingAnd( bool CombinerHelper::matchMulOBy2(MachineInstr &MI, BuildFnTy &MatchInfo) { unsigned Opc = MI.getOpcode(); assert(Opc == TargetOpcode::G_UMULO || Opc == TargetOpcode::G_SMULO); - // Check for a constant 2 or a splat of 2 on the RHS. 
- auto RHS = MI.getOperand(3).getReg(); - bool IsVector = MRI.getType(RHS).isVector(); - if (!IsVector && !mi_match(MI.getOperand(3).getReg(), MRI, m_SpecificICst(2))) - return false; - if (IsVector) { - // FIXME: There's no mi_match pattern for this yet. - auto *RHSDef = getDefIgnoringCopies(RHS, MRI); - if (!RHSDef) - return false; - auto Splat = getBuildVectorConstantSplat(*RHSDef, MRI); - if (!Splat || *Splat != 2) - return false; - } + + if (!mi_match(MI.getOperand(3).getReg(), MRI, m_SpecificICstOrSplat(2))) + return false; MatchInfo = [=, &MI](MachineIRBuilder &B) { Observer.changingInstr(MI); @@ -4760,6 +4793,556 @@ bool CombinerHelper::matchRedundantNegOperands(MachineInstr &MI, return true; } +/// Checks if \p MI is TargetOpcode::G_FMUL and contractable either +/// due to global flags or MachineInstr flags. +static bool isContractableFMul(MachineInstr &MI, bool AllowFusionGlobally) { + if (MI.getOpcode() != TargetOpcode::G_FMUL) + return false; + return AllowFusionGlobally || MI.getFlag(MachineInstr::MIFlag::FmContract); +} + +static bool hasMoreUses(const MachineInstr &MI0, const MachineInstr &MI1, + const MachineRegisterInfo &MRI) { + return std::distance(MRI.use_instr_nodbg_begin(MI0.getOperand(0).getReg()), + MRI.use_instr_nodbg_end()) > + std::distance(MRI.use_instr_nodbg_begin(MI1.getOperand(0).getReg()), + MRI.use_instr_nodbg_end()); +} + +bool CombinerHelper::canCombineFMadOrFMA(MachineInstr &MI, + bool &AllowFusionGlobally, + bool &HasFMAD, bool &Aggressive, + bool CanReassociate) { + + auto *MF = MI.getMF(); + const auto &TLI = *MF->getSubtarget().getTargetLowering(); + const TargetOptions &Options = MF->getTarget().Options; + LLT DstType = MRI.getType(MI.getOperand(0).getReg()); + + if (CanReassociate && + !(Options.UnsafeFPMath || MI.getFlag(MachineInstr::MIFlag::FmReassoc))) + return false; + + // Floating-point multiply-add with intermediate rounding. + HasFMAD = (LI && TLI.isFMADLegal(MI, DstType)); + // Floating-point multiply-add without intermediate rounding. + bool HasFMA = TLI.isFMAFasterThanFMulAndFAdd(*MF, DstType) && + isLegalOrBeforeLegalizer({TargetOpcode::G_FMA, {DstType}}); + // No valid opcode, do not combine. + if (!HasFMAD && !HasFMA) + return false; + + AllowFusionGlobally = Options.AllowFPOpFusion == FPOpFusion::Fast || + Options.UnsafeFPMath || HasFMAD; + // If the addition is not contractable, do not combine. + if (!AllowFusionGlobally && !MI.getFlag(MachineInstr::MIFlag::FmContract)) + return false; + + Aggressive = TLI.enableAggressiveFMAFusion(DstType); + return true; +} + +bool CombinerHelper::matchCombineFAddFMulToFMadOrFMA( + MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_FADD); + + bool AllowFusionGlobally, HasFMAD, Aggressive; + if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive)) + return false; + + MachineInstr *LHS = MRI.getVRegDef(MI.getOperand(1).getReg()); + MachineInstr *RHS = MRI.getVRegDef(MI.getOperand(2).getReg()); + unsigned PreferredFusedOpcode = + HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA; + + // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)), + // prefer to fold the multiply with fewer uses. 
+ if (Aggressive && isContractableFMul(*LHS, AllowFusionGlobally) && + isContractableFMul(*RHS, AllowFusionGlobally)) { + if (hasMoreUses(*LHS, *RHS, MRI)) + std::swap(LHS, RHS); + } + + // fold (fadd (fmul x, y), z) -> (fma x, y, z) + if (isContractableFMul(*LHS, AllowFusionGlobally) && + (Aggressive || MRI.hasOneNonDBGUse(LHS->getOperand(0).getReg()))) { + MatchInfo = [=, &MI](MachineIRBuilder &B) { + B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()}, + {LHS->getOperand(1).getReg(), LHS->getOperand(2).getReg(), + RHS->getOperand(0).getReg()}); + }; + return true; + } + + // fold (fadd x, (fmul y, z)) -> (fma y, z, x) + if (isContractableFMul(*RHS, AllowFusionGlobally) && + (Aggressive || MRI.hasOneNonDBGUse(RHS->getOperand(0).getReg()))) { + MatchInfo = [=, &MI](MachineIRBuilder &B) { + B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()}, + {RHS->getOperand(1).getReg(), RHS->getOperand(2).getReg(), + LHS->getOperand(0).getReg()}); + }; + return true; + } + + return false; +} + +bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMA( + MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_FADD); + + bool AllowFusionGlobally, HasFMAD, Aggressive; + if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive)) + return false; + + const auto &TLI = *MI.getMF()->getSubtarget().getTargetLowering(); + MachineInstr *LHS = MRI.getVRegDef(MI.getOperand(1).getReg()); + MachineInstr *RHS = MRI.getVRegDef(MI.getOperand(2).getReg()); + LLT DstType = MRI.getType(MI.getOperand(0).getReg()); + + unsigned PreferredFusedOpcode = + HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA; + + // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)), + // prefer to fold the multiply with fewer uses. + if (Aggressive && isContractableFMul(*LHS, AllowFusionGlobally) && + isContractableFMul(*RHS, AllowFusionGlobally)) { + if (hasMoreUses(*LHS, *RHS, MRI)) + std::swap(LHS, RHS); + } + + // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z) + MachineInstr *FpExtSrc; + if (mi_match(LHS->getOperand(0).getReg(), MRI, + m_GFPExt(m_MInstr(FpExtSrc))) && + isContractableFMul(*FpExtSrc, AllowFusionGlobally) && + TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType, + MRI.getType(FpExtSrc->getOperand(1).getReg()))) { + MatchInfo = [=, &MI](MachineIRBuilder &B) { + auto FpExtX = B.buildFPExt(DstType, FpExtSrc->getOperand(1).getReg()); + auto FpExtY = B.buildFPExt(DstType, FpExtSrc->getOperand(2).getReg()); + B.buildInstr( + PreferredFusedOpcode, {MI.getOperand(0).getReg()}, + {FpExtX.getReg(0), FpExtY.getReg(0), RHS->getOperand(0).getReg()}); + }; + return true; + } + + // fold (fadd z, (fpext (fmul x, y))) -> (fma (fpext x), (fpext y), z) + // Note: Commutes FADD operands. 
+ if (mi_match(RHS->getOperand(0).getReg(), MRI, + m_GFPExt(m_MInstr(FpExtSrc))) && + isContractableFMul(*FpExtSrc, AllowFusionGlobally) && + TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType, + MRI.getType(FpExtSrc->getOperand(1).getReg()))) { + MatchInfo = [=, &MI](MachineIRBuilder &B) { + auto FpExtX = B.buildFPExt(DstType, FpExtSrc->getOperand(1).getReg()); + auto FpExtY = B.buildFPExt(DstType, FpExtSrc->getOperand(2).getReg()); + B.buildInstr( + PreferredFusedOpcode, {MI.getOperand(0).getReg()}, + {FpExtX.getReg(0), FpExtY.getReg(0), LHS->getOperand(0).getReg()}); + }; + return true; + } + + return false; +} + +bool CombinerHelper::matchCombineFAddFMAFMulToFMadOrFMA( + MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_FADD); + + bool AllowFusionGlobally, HasFMAD, Aggressive; + if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive, true)) + return false; + + MachineInstr *LHS = MRI.getVRegDef(MI.getOperand(1).getReg()); + MachineInstr *RHS = MRI.getVRegDef(MI.getOperand(2).getReg()); + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + + unsigned PreferredFusedOpcode = + HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA; + + // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)), + // prefer to fold the multiply with fewer uses. + if (Aggressive && isContractableFMul(*LHS, AllowFusionGlobally) && + isContractableFMul(*RHS, AllowFusionGlobally)) { + if (hasMoreUses(*LHS, *RHS, MRI)) + std::swap(LHS, RHS); + } + + MachineInstr *FMA = nullptr; + Register Z; + // fold (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, z)) + if (LHS->getOpcode() == PreferredFusedOpcode && + (MRI.getVRegDef(LHS->getOperand(3).getReg())->getOpcode() == + TargetOpcode::G_FMUL) && + MRI.hasOneNonDBGUse(LHS->getOperand(0).getReg()) && + MRI.hasOneNonDBGUse(LHS->getOperand(3).getReg())) { + FMA = LHS; + Z = RHS->getOperand(0).getReg(); + } + // fold (fadd z, (fma x, y, (fmul u, v))) -> (fma x, y, (fma u, v, z)) + else if (RHS->getOpcode() == PreferredFusedOpcode && + (MRI.getVRegDef(RHS->getOperand(3).getReg())->getOpcode() == + TargetOpcode::G_FMUL) && + MRI.hasOneNonDBGUse(RHS->getOperand(0).getReg()) && + MRI.hasOneNonDBGUse(RHS->getOperand(3).getReg())) { + Z = LHS->getOperand(0).getReg(); + FMA = RHS; + } + + if (FMA) { + MachineInstr *FMulMI = MRI.getVRegDef(FMA->getOperand(3).getReg()); + Register X = FMA->getOperand(1).getReg(); + Register Y = FMA->getOperand(2).getReg(); + Register U = FMulMI->getOperand(1).getReg(); + Register V = FMulMI->getOperand(2).getReg(); + + MatchInfo = [=, &MI](MachineIRBuilder &B) { + Register InnerFMA = MRI.createGenericVirtualRegister(DstTy); + B.buildInstr(PreferredFusedOpcode, {InnerFMA}, {U, V, Z}); + B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()}, + {X, Y, InnerFMA}); + }; + return true; + } + + return false; +} + +bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMAAggressive( + MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_FADD); + + bool AllowFusionGlobally, HasFMAD, Aggressive; + if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive)) + return false; + + if (!Aggressive) + return false; + + const auto &TLI = *MI.getMF()->getSubtarget().getTargetLowering(); + LLT DstType = MRI.getType(MI.getOperand(0).getReg()); + MachineInstr *LHS = MRI.getVRegDef(MI.getOperand(1).getReg()); + MachineInstr *RHS = MRI.getVRegDef(MI.getOperand(2).getReg()); + + 
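Source-level shape of the fadd/fmul contraction family added in these hunks, as a sketch (assuming the contract/unsafe-fp-math conditions checked by canCombineFMadOrFMA are met; the function names are illustrative):

    #include <cmath>

    // a*b + c is the (fadd (fmul a, b), c) shape folded to (fma a, b, c).
    double mad(double a, double b, double c) {
      return a * b + c;
    }

    // The fpext variants: a float multiply feeding a double add becomes an
    // fma on double once both factors are extended.
    double mad_ext(float a, float b, double c) {
      return static_cast<double>(a) * static_cast<double>(b) + c;
    }

    // std::fma is the explicitly fused form with a single rounding, which is
    // what G_FMA models; G_FMAD is the variant with intermediate rounding.
    double mad_fused(double a, double b, double c) {
      return std::fma(a, b, c);
    }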
unsigned PreferredFusedOpcode = + HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA; + + // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)), + // prefer to fold the multiply with fewer uses. + if (Aggressive && isContractableFMul(*LHS, AllowFusionGlobally) && + isContractableFMul(*RHS, AllowFusionGlobally)) { + if (hasMoreUses(*LHS, *RHS, MRI)) + std::swap(LHS, RHS); + } + + // Builds: (fma x, y, (fma (fpext u), (fpext v), z)) + auto buildMatchInfo = [=, &MI](Register U, Register V, Register Z, Register X, + Register Y, MachineIRBuilder &B) { + Register FpExtU = B.buildFPExt(DstType, U).getReg(0); + Register FpExtV = B.buildFPExt(DstType, V).getReg(0); + Register InnerFMA = + B.buildInstr(PreferredFusedOpcode, {DstType}, {FpExtU, FpExtV, Z}) + .getReg(0); + B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()}, + {X, Y, InnerFMA}); + }; + + MachineInstr *FMulMI, *FMAMI; + // fold (fadd (fma x, y, (fpext (fmul u, v))), z) + // -> (fma x, y, (fma (fpext u), (fpext v), z)) + if (LHS->getOpcode() == PreferredFusedOpcode && + mi_match(LHS->getOperand(3).getReg(), MRI, m_GFPExt(m_MInstr(FMulMI))) && + isContractableFMul(*FMulMI, AllowFusionGlobally) && + TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType, + MRI.getType(FMulMI->getOperand(0).getReg()))) { + MatchInfo = [=](MachineIRBuilder &B) { + buildMatchInfo(FMulMI->getOperand(1).getReg(), + FMulMI->getOperand(2).getReg(), + RHS->getOperand(0).getReg(), LHS->getOperand(1).getReg(), + LHS->getOperand(2).getReg(), B); + }; + return true; + } + + // fold (fadd (fpext (fma x, y, (fmul u, v))), z) + // -> (fma (fpext x), (fpext y), (fma (fpext u), (fpext v), z)) + // FIXME: This turns two single-precision and one double-precision + // operation into two double-precision operations, which might not be + // interesting for all targets, especially GPUs. + if (mi_match(LHS->getOperand(0).getReg(), MRI, m_GFPExt(m_MInstr(FMAMI))) && + FMAMI->getOpcode() == PreferredFusedOpcode) { + MachineInstr *FMulMI = MRI.getVRegDef(FMAMI->getOperand(3).getReg()); + if (isContractableFMul(*FMulMI, AllowFusionGlobally) && + TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType, + MRI.getType(FMAMI->getOperand(0).getReg()))) { + MatchInfo = [=](MachineIRBuilder &B) { + Register X = FMAMI->getOperand(1).getReg(); + Register Y = FMAMI->getOperand(2).getReg(); + X = B.buildFPExt(DstType, X).getReg(0); + Y = B.buildFPExt(DstType, Y).getReg(0); + buildMatchInfo(FMulMI->getOperand(1).getReg(), + FMulMI->getOperand(2).getReg(), + RHS->getOperand(0).getReg(), X, Y, B); + }; + + return true; + } + } + + // fold (fadd z, (fma x, y, (fpext (fmul u, v))) + // -> (fma x, y, (fma (fpext u), (fpext v), z)) + if (RHS->getOpcode() == PreferredFusedOpcode && + mi_match(RHS->getOperand(3).getReg(), MRI, m_GFPExt(m_MInstr(FMulMI))) && + isContractableFMul(*FMulMI, AllowFusionGlobally) && + TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType, + MRI.getType(FMulMI->getOperand(0).getReg()))) { + MatchInfo = [=](MachineIRBuilder &B) { + buildMatchInfo(FMulMI->getOperand(1).getReg(), + FMulMI->getOperand(2).getReg(), + LHS->getOperand(0).getReg(), RHS->getOperand(1).getReg(), + RHS->getOperand(2).getReg(), B); + }; + return true; + } + + // fold (fadd z, (fpext (fma x, y, (fmul u, v))) + // -> (fma (fpext x), (fpext y), (fma (fpext u), (fpext v), z)) + // FIXME: This turns two single-precision and one double-precision + // operation into two double-precision operations, which might not be + // interesting for all targets, especially GPUs. 
+ if (mi_match(RHS->getOperand(0).getReg(), MRI, m_GFPExt(m_MInstr(FMAMI))) && + FMAMI->getOpcode() == PreferredFusedOpcode) { + MachineInstr *FMulMI = MRI.getVRegDef(FMAMI->getOperand(3).getReg()); + if (isContractableFMul(*FMulMI, AllowFusionGlobally) && + TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType, + MRI.getType(FMAMI->getOperand(0).getReg()))) { + MatchInfo = [=](MachineIRBuilder &B) { + Register X = FMAMI->getOperand(1).getReg(); + Register Y = FMAMI->getOperand(2).getReg(); + X = B.buildFPExt(DstType, X).getReg(0); + Y = B.buildFPExt(DstType, Y).getReg(0); + buildMatchInfo(FMulMI->getOperand(1).getReg(), + FMulMI->getOperand(2).getReg(), + LHS->getOperand(0).getReg(), X, Y, B); + }; + return true; + } + } + + return false; +} + +bool CombinerHelper::matchCombineFSubFMulToFMadOrFMA( + MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_FSUB); + + bool AllowFusionGlobally, HasFMAD, Aggressive; + if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive)) + return false; + + MachineInstr *LHS = MRI.getVRegDef(MI.getOperand(1).getReg()); + MachineInstr *RHS = MRI.getVRegDef(MI.getOperand(2).getReg()); + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + + // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)), + // prefer to fold the multiply with fewer uses. + int FirstMulHasFewerUses = true; + if (isContractableFMul(*LHS, AllowFusionGlobally) && + isContractableFMul(*RHS, AllowFusionGlobally) && + hasMoreUses(*LHS, *RHS, MRI)) + FirstMulHasFewerUses = false; + + unsigned PreferredFusedOpcode = + HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA; + + // fold (fsub (fmul x, y), z) -> (fma x, y, -z) + if (FirstMulHasFewerUses && + (isContractableFMul(*LHS, AllowFusionGlobally) && + (Aggressive || MRI.hasOneNonDBGUse(LHS->getOperand(0).getReg())))) { + MatchInfo = [=, &MI](MachineIRBuilder &B) { + Register NegZ = B.buildFNeg(DstTy, RHS->getOperand(0).getReg()).getReg(0); + B.buildInstr( + PreferredFusedOpcode, {MI.getOperand(0).getReg()}, + {LHS->getOperand(1).getReg(), LHS->getOperand(2).getReg(), NegZ}); + }; + return true; + } + // fold (fsub x, (fmul y, z)) -> (fma -y, z, x) + else if ((isContractableFMul(*RHS, AllowFusionGlobally) && + (Aggressive || MRI.hasOneNonDBGUse(RHS->getOperand(0).getReg())))) { + MatchInfo = [=, &MI](MachineIRBuilder &B) { + Register NegY = B.buildFNeg(DstTy, RHS->getOperand(1).getReg()).getReg(0); + B.buildInstr( + PreferredFusedOpcode, {MI.getOperand(0).getReg()}, + {NegY, RHS->getOperand(2).getReg(), LHS->getOperand(0).getReg()}); + }; + return true; + } + + return false; +} + +bool CombinerHelper::matchCombineFSubFNegFMulToFMadOrFMA( + MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_FSUB); + + bool AllowFusionGlobally, HasFMAD, Aggressive; + if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive)) + return false; + + Register LHSReg = MI.getOperand(1).getReg(); + Register RHSReg = MI.getOperand(2).getReg(); + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + + unsigned PreferredFusedOpcode = + HasFMAD ? 
TargetOpcode::G_FMAD : TargetOpcode::G_FMA; + + MachineInstr *FMulMI; + // fold (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) + if (mi_match(LHSReg, MRI, m_GFNeg(m_MInstr(FMulMI))) && + (Aggressive || (MRI.hasOneNonDBGUse(LHSReg) && + MRI.hasOneNonDBGUse(FMulMI->getOperand(0).getReg()))) && + isContractableFMul(*FMulMI, AllowFusionGlobally)) { + MatchInfo = [=, &MI](MachineIRBuilder &B) { + Register NegX = + B.buildFNeg(DstTy, FMulMI->getOperand(1).getReg()).getReg(0); + Register NegZ = B.buildFNeg(DstTy, RHSReg).getReg(0); + B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()}, + {NegX, FMulMI->getOperand(2).getReg(), NegZ}); + }; + return true; + } + + // fold (fsub x, (fneg (fmul, y, z))) -> (fma y, z, x) + if (mi_match(RHSReg, MRI, m_GFNeg(m_MInstr(FMulMI))) && + (Aggressive || (MRI.hasOneNonDBGUse(RHSReg) && + MRI.hasOneNonDBGUse(FMulMI->getOperand(0).getReg()))) && + isContractableFMul(*FMulMI, AllowFusionGlobally)) { + MatchInfo = [=, &MI](MachineIRBuilder &B) { + B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()}, + {FMulMI->getOperand(1).getReg(), + FMulMI->getOperand(2).getReg(), LHSReg}); + }; + return true; + } + + return false; +} + +bool CombinerHelper::matchCombineFSubFpExtFMulToFMadOrFMA( + MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_FSUB); + + bool AllowFusionGlobally, HasFMAD, Aggressive; + if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive)) + return false; + + Register LHSReg = MI.getOperand(1).getReg(); + Register RHSReg = MI.getOperand(2).getReg(); + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + + unsigned PreferredFusedOpcode = + HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA; + + MachineInstr *FMulMI; + // fold (fsub (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), (fneg z)) + if (mi_match(LHSReg, MRI, m_GFPExt(m_MInstr(FMulMI))) && + isContractableFMul(*FMulMI, AllowFusionGlobally) && + (Aggressive || MRI.hasOneNonDBGUse(LHSReg))) { + MatchInfo = [=, &MI](MachineIRBuilder &B) { + Register FpExtX = + B.buildFPExt(DstTy, FMulMI->getOperand(1).getReg()).getReg(0); + Register FpExtY = + B.buildFPExt(DstTy, FMulMI->getOperand(2).getReg()).getReg(0); + Register NegZ = B.buildFNeg(DstTy, RHSReg).getReg(0); + B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()}, + {FpExtX, FpExtY, NegZ}); + }; + return true; + } + + // fold (fsub x, (fpext (fmul y, z))) -> (fma (fneg (fpext y)), (fpext z), x) + if (mi_match(RHSReg, MRI, m_GFPExt(m_MInstr(FMulMI))) && + isContractableFMul(*FMulMI, AllowFusionGlobally) && + (Aggressive || MRI.hasOneNonDBGUse(RHSReg))) { + MatchInfo = [=, &MI](MachineIRBuilder &B) { + Register FpExtY = + B.buildFPExt(DstTy, FMulMI->getOperand(1).getReg()).getReg(0); + Register NegY = B.buildFNeg(DstTy, FpExtY).getReg(0); + Register FpExtZ = + B.buildFPExt(DstTy, FMulMI->getOperand(2).getReg()).getReg(0); + B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()}, + {NegY, FpExtZ, LHSReg}); + }; + return true; + } + + return false; +} + +bool CombinerHelper::matchCombineFSubFpExtFNegFMulToFMadOrFMA( + MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_FSUB); + + bool AllowFusionGlobally, HasFMAD, Aggressive; + if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive)) + return false; + + const auto &TLI = *MI.getMF()->getSubtarget().getTargetLowering(); + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + Register LHSReg = 
MI.getOperand(1).getReg(); + Register RHSReg = MI.getOperand(2).getReg(); + + unsigned PreferredFusedOpcode = + HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA; + + auto buildMatchInfo = [=](Register Dst, Register X, Register Y, Register Z, + MachineIRBuilder &B) { + Register FpExtX = B.buildFPExt(DstTy, X).getReg(0); + Register FpExtY = B.buildFPExt(DstTy, Y).getReg(0); + B.buildInstr(PreferredFusedOpcode, {Dst}, {FpExtX, FpExtY, Z}); + }; + + MachineInstr *FMulMI; + // fold (fsub (fpext (fneg (fmul x, y))), z) -> + // (fneg (fma (fpext x), (fpext y), z)) + // fold (fsub (fneg (fpext (fmul x, y))), z) -> + // (fneg (fma (fpext x), (fpext y), z)) + if ((mi_match(LHSReg, MRI, m_GFPExt(m_GFNeg(m_MInstr(FMulMI)))) || + mi_match(LHSReg, MRI, m_GFNeg(m_GFPExt(m_MInstr(FMulMI))))) && + isContractableFMul(*FMulMI, AllowFusionGlobally) && + TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstTy, + MRI.getType(FMulMI->getOperand(0).getReg()))) { + MatchInfo = [=, &MI](MachineIRBuilder &B) { + Register FMAReg = MRI.createGenericVirtualRegister(DstTy); + buildMatchInfo(FMAReg, FMulMI->getOperand(1).getReg(), + FMulMI->getOperand(2).getReg(), RHSReg, B); + B.buildFNeg(MI.getOperand(0).getReg(), FMAReg); + }; + return true; + } + + // fold (fsub x, (fpext (fneg (fmul y, z)))) -> (fma (fpext y), (fpext z), x) + // fold (fsub x, (fneg (fpext (fmul y, z)))) -> (fma (fpext y), (fpext z), x) + if ((mi_match(RHSReg, MRI, m_GFPExt(m_GFNeg(m_MInstr(FMulMI)))) || + mi_match(RHSReg, MRI, m_GFNeg(m_GFPExt(m_MInstr(FMulMI))))) && + isContractableFMul(*FMulMI, AllowFusionGlobally) && + TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstTy, + MRI.getType(FMulMI->getOperand(0).getReg()))) { + MatchInfo = [=, &MI](MachineIRBuilder &B) { + buildMatchInfo(MI.getOperand(0).getReg(), FMulMI->getOperand(1).getReg(), + FMulMI->getOperand(2).getReg(), LHSReg, B); + }; + return true; + } + + return false; +} + bool CombinerHelper::tryCombine(MachineInstr &MI) { if (tryCombineCopy(MI)) return true; diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index c74bec7dfc0d..e09cd26eb0c1 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -585,8 +585,8 @@ simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size, // FIXME: What does the original arg index mean here? SmallVector<CallLowering::ArgInfo, 3> Args; - for (unsigned i = 1; i < MI.getNumOperands(); i++) - Args.push_back({MI.getOperand(i).getReg(), OpType, 0}); + for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) + Args.push_back({MO.getReg(), OpType, 0}); return createLibcall(MIRBuilder, Libcall, {MI.getOperand(0).getReg(), OpType, 0}, Args); } @@ -1500,8 +1500,8 @@ LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx, LLT WideDstTy = LLT::scalar(NumMerge * WideSize); // Decompose the original operands if they don't evenly divide. 
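Several hunks here and below replace manual index-from-1 operand loops with llvm::drop_begin; a small sketch of the equivalence (the `use` callback is a hypothetical stand-in for whatever the loop body does):

    #include "llvm/ADT/STLExtras.h"
    #include "llvm/CodeGen/MachineInstr.h"

    // Visit every operand except operand 0 (typically the def).
    void forEachSource(const llvm::MachineInstr &MI,
                       llvm::function_ref<void(const llvm::MachineOperand &)> use) {
      // Old form:
      //   for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
      //     use(MI.getOperand(I));
      // Form used throughout this patch; drop_begin skips one element by default.
      for (const llvm::MachineOperand &MO : llvm::drop_begin(MI.operands()))
        use(MO);
    }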
- for (int I = 1, E = MI.getNumOperands(); I != E; ++I) { - Register SrcReg = MI.getOperand(I).getReg(); + for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) { + Register SrcReg = MO.getReg(); if (GCD == SrcSize) { Unmerges.push_back(SrcReg); } else { @@ -4037,8 +4037,8 @@ LegalizerHelper::fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx, // Break into a common type SmallVector<Register, 16> Parts; - for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) - extractGCDType(Parts, GCDTy, MI.getOperand(I).getReg()); + for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) + extractGCDType(Parts, GCDTy, MO.getReg()); // Build the requested new merge, padding with undef. LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts, @@ -7782,7 +7782,6 @@ LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src, // of that value loaded. This can result in a sequence of loads and stores // mixed types, depending on what the target specifies as good types to use. unsigned CurrOffset = 0; - LLT PtrTy = MRI.getType(Src); unsigned Size = KnownLen; for (auto CopyTy : MemOps) { // Issuing an unaligned load / store pair that overlaps with the previous @@ -7800,15 +7799,19 @@ LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src, Register LoadPtr = Src; Register Offset; if (CurrOffset != 0) { - Offset = MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset) + LLT SrcTy = MRI.getType(Src); + Offset = MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset) .getReg(0); - LoadPtr = MIB.buildPtrAdd(PtrTy, Src, Offset).getReg(0); + LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0); } auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO); // Create the store. - Register StorePtr = - CurrOffset == 0 ? Dst : MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0); + Register StorePtr = Dst; + if (CurrOffset != 0) { + LLT DstTy = MRI.getType(Dst); + StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0); + } MIB.buildStore(LdVal, StorePtr, *StoreMMO); CurrOffset += CopyTy.getSizeInBytes(); Size -= CopyTy.getSizeInBytes(); @@ -7885,7 +7888,6 @@ LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src, // Apart from that, this loop is pretty much doing the same thing as the // memcpy codegen function. unsigned CurrOffset = 0; - LLT PtrTy = MRI.getType(Src); SmallVector<Register, 16> LoadVals; for (auto CopyTy : MemOps) { // Construct MMO for the load. @@ -7895,9 +7897,10 @@ LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src, // Create the load. 
Register LoadPtr = Src; if (CurrOffset != 0) { + LLT SrcTy = MRI.getType(Src); auto Offset = - MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset); - LoadPtr = MIB.buildPtrAdd(PtrTy, Src, Offset).getReg(0); + MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset); + LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0); } LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0)); CurrOffset += CopyTy.getSizeInBytes(); @@ -7912,9 +7915,10 @@ LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src, Register StorePtr = Dst; if (CurrOffset != 0) { + LLT DstTy = MRI.getType(Dst); auto Offset = - MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset); - StorePtr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0); + MIB.buildConstant(LLT::scalar(DstTy.getSizeInBits()), CurrOffset); + StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0); } MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO); CurrOffset += CopyTy.getSizeInBytes(); diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp index 1a2102e3ef21..650500c7eb31 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp @@ -123,7 +123,7 @@ const RegisterBank *RegisterBankInfo::getRegBankFromConstraints( Register Reg = MI.getOperand(OpIdx).getReg(); const RegisterBank &RegBank = getRegBankFromRegClass(*RC, MRI.getType(Reg)); - // Sanity check that the target properly implemented getRegBankFromRegClass. + // Check that the target properly implemented getRegBankFromRegClass. assert(RegBank.covers(*RC) && "The mapping of the register bank does not make sense"); return &RegBank; diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/Utils.cpp index 1a440c064a59..b0b84763e922 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -834,10 +834,9 @@ bool llvm::isKnownToBeAPowerOfTwo(Register Reg, const MachineRegisterInfo &MRI, case TargetOpcode::G_BUILD_VECTOR: { // TODO: Probably should have a recursion depth guard since you could have // bitcasted vector elements. - for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) { - if (!isKnownToBeAPowerOfTwo(MI.getOperand(I).getReg(), MRI, KB)) + for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) + if (!isKnownToBeAPowerOfTwo(MO.getReg(), MRI, KB)) return false; - } return true; } @@ -845,8 +844,8 @@ bool llvm::isKnownToBeAPowerOfTwo(Register Reg, const MachineRegisterInfo &MRI, // Only handle constants since we would need to know if number of leading // zeros is greater than the truncation amount. 
const unsigned BitWidth = Ty.getScalarSizeInBits(); - for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) { - auto Const = getIConstantVRegVal(MI.getOperand(I).getReg(), MRI); + for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) { + auto Const = getIConstantVRegVal(MO.getReg(), MRI); if (!Const || !Const->zextOrTrunc(BitWidth).isPowerOf2()) return false; } @@ -1031,16 +1030,22 @@ Optional<ValueAndVReg> getAnyConstantSplat(Register VReg, return SplatValAndReg; } -bool isBuildVectorConstantSplat(const MachineInstr &MI, - const MachineRegisterInfo &MRI, - int64_t SplatValue, bool AllowUndef) { - if (auto SplatValAndReg = - getAnyConstantSplat(MI.getOperand(0).getReg(), MRI, AllowUndef)) +} // end anonymous namespace + +bool llvm::isBuildVectorConstantSplat(const Register Reg, + const MachineRegisterInfo &MRI, + int64_t SplatValue, bool AllowUndef) { + if (auto SplatValAndReg = getAnyConstantSplat(Reg, MRI, AllowUndef)) return mi_match(SplatValAndReg->VReg, MRI, m_SpecificICst(SplatValue)); return false; } -} // end anonymous namespace +bool llvm::isBuildVectorConstantSplat(const MachineInstr &MI, + const MachineRegisterInfo &MRI, + int64_t SplatValue, bool AllowUndef) { + return isBuildVectorConstantSplat(MI.getOperand(0).getReg(), MRI, SplatValue, + AllowUndef); +} Optional<int64_t> llvm::getBuildVectorConstantSplat(const MachineInstr &MI, diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalMerge.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalMerge.cpp index 6c1ce4c1efb0..bbd9006a5d8c 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/GlobalMerge.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalMerge.cpp @@ -399,8 +399,7 @@ bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, // having a single global, but is aggressive enough for any other case. if (GlobalMergeIgnoreSingleUse) { BitVector AllGlobals(Globals.size()); - for (size_t i = 0, e = UsedGlobalSets.size(); i != e; ++i) { - const UsedGlobalSet &UGS = UsedGlobalSets[e - i - 1]; + for (const UsedGlobalSet &UGS : llvm::reverse(UsedGlobalSets)) { if (UGS.UsageCount == 0) continue; if (UGS.Globals.count() > 1) @@ -418,8 +417,7 @@ bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, BitVector PickedGlobals(Globals.size()); bool Changed = false; - for (size_t i = 0, e = UsedGlobalSets.size(); i != e; ++i) { - const UsedGlobalSet &UGS = UsedGlobalSets[e - i - 1]; + for (const UsedGlobalSet &UGS : llvm::reverse(UsedGlobalSets)) { if (UGS.UsageCount == 0) continue; if (PickedGlobals.anyCommon(UGS.Globals)) diff --git a/contrib/llvm-project/llvm/lib/CodeGen/IndirectBrExpandPass.cpp b/contrib/llvm-project/llvm/lib/CodeGen/IndirectBrExpandPass.cpp index e4606daba352..2d38a44d5a33 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/IndirectBrExpandPass.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/IndirectBrExpandPass.cpp @@ -260,10 +260,12 @@ bool IndirectBrExpandPass::runOnFunction(Function &F) { if (DTU) { // If there were multiple indirectbr's, they may have common successors, // but in the dominator tree, we only track unique edges. 
- SmallPtrSet<BasicBlock *, 8> UniqueSuccessors(BBs.begin(), BBs.end()); - Updates.reserve(Updates.size() + UniqueSuccessors.size()); - for (BasicBlock *BB : UniqueSuccessors) - Updates.push_back({DominatorTree::Insert, SwitchBB, BB}); + SmallPtrSet<BasicBlock *, 8> UniqueSuccessors; + Updates.reserve(Updates.size() + BBs.size()); + for (BasicBlock *BB : BBs) { + if (UniqueSuccessors.insert(BB).second) + Updates.push_back({DominatorTree::Insert, SwitchBB, BB}); + } DTU->applyUpdates(Updates); } diff --git a/contrib/llvm-project/llvm/lib/CodeGen/InlineSpiller.cpp b/contrib/llvm-project/llvm/lib/CodeGen/InlineSpiller.cpp index 64e1f4351456..fc5ac45752ca 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/InlineSpiller.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/InlineSpiller.cpp @@ -274,11 +274,9 @@ static Register isFullCopyOf(const MachineInstr &MI, Register Reg) { } static void getVDefInterval(const MachineInstr &MI, LiveIntervals &LIS) { - for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { - const MachineOperand &MO = MI.getOperand(I); + for (const MachineOperand &MO : MI.operands()) if (MO.isReg() && MO.isDef() && Register::isVirtualRegister(MO.getReg())) LIS.getInterval(MO.getReg()); - } } /// isSnippet - Identify if a live interval is a snippet that should be spilled. @@ -583,11 +581,9 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) { if (!ParentVNI) { LLVM_DEBUG(dbgs() << "\tadding <undef> flags: "); - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI.getOperand(i); + for (MachineOperand &MO : MI.operands()) if (MO.isReg() && MO.isUse() && MO.getReg() == VirtReg.reg()) MO.setIsUndef(); - } LLVM_DEBUG(dbgs() << UseIdx << '\t' << MI); return true; } diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LatencyPriorityQueue.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LatencyPriorityQueue.cpp index c3e0553418a5..fab6b8d10a33 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/LatencyPriorityQueue.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/LatencyPriorityQueue.cpp @@ -73,11 +73,9 @@ void LatencyPriorityQueue::push(SUnit *SU) { // Look at all of the successors of this node. Count the number of nodes that // this node is the sole unscheduled node for. unsigned NumNodesBlocking = 0; - for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); - I != E; ++I) { - if (getSingleUnscheduledPred(I->getSUnit()) == SU) + for (const SDep &Succ : SU->Succs) + if (getSingleUnscheduledPred(Succ.getSUnit()) == SU) ++NumNodesBlocking; - } NumNodesSolelyBlocking[SU->NodeNum] = NumNodesBlocking; Queue.push_back(SU); diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp index a4eb3094612b..cf62b0e5d7e8 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp @@ -544,8 +544,7 @@ public: // Re-state the variable location: if there's no replacement then NewLoc // is None and a $noreg DBG_VALUE will be created. Otherwise, a DBG_VALUE // identifying the alternative location will be emitted. 
- const DIExpression *Expr = ActiveVLocIt->second.Properties.DIExpr; - DbgValueProperties Properties(Expr, false); + const DbgValueProperties &Properties = ActiveVLocIt->second.Properties; PendingDbgValues.push_back(MTracker->emitLoc(NewLoc, Var, Properties)); // Update machine locations <=> variable locations maps. Defer updating @@ -836,6 +835,15 @@ MachineInstrBuilder MLocTracker::emitLoc(Optional<LocIdx> MLoc, unsigned Base = Spill.SpillBase; MIB.addReg(Base); MIB.addImm(0); + + // Being on the stack makes this location indirect; if it was _already_ + // indirect though, we need to add extra indirection. See this test for + // a scenario where this happens: + // llvm/test/DebugInfo/X86/spill-nontrivial-param.ll + if (Properties.Indirect) { + std::vector<uint64_t> Elts = {dwarf::DW_OP_deref}; + Expr = DIExpression::append(Expr, Elts); + } } else { // This is a stack location with a weird subregister offset: emit an undef // DBG_VALUE instead. @@ -1288,6 +1296,24 @@ void InstrRefBasedLDV::transferRegisterDef(MachineInstr &MI) { } else if (MI.isMetaInstruction()) return; + // We always ignore SP defines on call instructions, they don't actually + // change the value of the stack pointer... except for win32's _chkstk. This + // is rare: filter quickly for the common case (no stack adjustments, not a + // call, etc). If it is a call that modifies SP, recognise the SP register + // defs. + bool CallChangesSP = false; + if (AdjustsStackInCalls && MI.isCall() && MI.getOperand(0).isSymbol() && + !strcmp(MI.getOperand(0).getSymbolName(), StackProbeSymbolName.data())) + CallChangesSP = true; + + // Test whether we should ignore a def of this register due to it being part + // of the stack pointer. + auto IgnoreSPAlias = [this, &MI, CallChangesSP](Register R) -> bool { + if (CallChangesSP) + return false; + return MI.isCall() && MTracker->SPAliases.count(R); + }; + // Find the regs killed by MI, and find regmasks of preserved regs. // Max out the number of statically allocated elements in `DeadRegs`, as this // prevents fallback to std::set::count() operations. @@ -1298,7 +1324,7 @@ void InstrRefBasedLDV::transferRegisterDef(MachineInstr &MI) { // Determine whether the operand is a register def. if (MO.isReg() && MO.isDef() && MO.getReg() && Register::isPhysicalRegister(MO.getReg()) && - !(MI.isCall() && MTracker->SPAliases.count(MO.getReg()))) { + !IgnoreSPAlias(MO.getReg())) { // Remove ranges of all aliased registers. for (MCRegAliasIterator RAI(MO.getReg(), TRI, true); RAI.isValid(); ++RAI) // FIXME: Can we break out of this loop early if no insertion occurs? @@ -1347,6 +1373,9 @@ void InstrRefBasedLDV::transferRegisterDef(MachineInstr &MI) { continue; Register Reg = MTracker->LocIdxToLocID[L.Idx]; + if (IgnoreSPAlias(Reg)) + continue; + for (auto *MO : RegMaskPtrs) if (MO->clobbersPhysReg(Reg)) TTracker->clobberMloc(L.Idx, MI.getIterator(), false); @@ -1628,9 +1657,10 @@ bool InstrRefBasedLDV::transferRegisterCopy(MachineInstr &MI) { /// fragments of that DILocalVariable which overlap. This reduces work during /// the data-flow stage from "Find any overlapping fragments" to "Check if the /// known-to-overlap fragments are present". -/// \param MI A previously unprocessed DEBUG_VALUE instruction to analyze for +/// \param MI A previously unprocessed debug instruction to analyze for /// fragment usage. 
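The spill path above makes the location indirect by appending DW_OP_deref to the variable's expression; a minimal sketch of just that append, with nothing about the surrounding tracker state:

    #include "llvm/BinaryFormat/Dwarf.h"
    #include "llvm/IR/DebugInfoMetadata.h"
    #include <cstdint>
    #include <vector>

    // Add one extra level of dereference to an existing debug expression, as
    // done when an already-indirect value is moved into a spill slot.
    const llvm::DIExpression *addSpillDeref(const llvm::DIExpression *Expr) {
      std::vector<uint64_t> Ops = {llvm::dwarf::DW_OP_deref};
      return llvm::DIExpression::append(Expr, Ops);
    }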
void InstrRefBasedLDV::accumulateFragmentMap(MachineInstr &MI) { + assert(MI.isDebugValue() || MI.isDebugRef()); DebugVariable MIVar(MI.getDebugVariable(), MI.getDebugExpression(), MI.getDebugLoc()->getInlinedAt()); FragmentInfo ThisFragment = MIVar.getFragmentOrDefault(); @@ -1732,7 +1762,7 @@ void InstrRefBasedLDV::produceMLocTransferFunction( for (auto &MI : MBB) { process(MI); // Also accumulate fragment map. - if (MI.isDebugValue()) + if (MI.isDebugValue() || MI.isDebugRef()) accumulateFragmentMap(MI); // Create a map from the instruction number (if present) to the @@ -2322,15 +2352,8 @@ Optional<ValueIDNum> InstrRefBasedLDV::pickVPHILoc( bool InstrRefBasedLDV::vlocJoin( MachineBasicBlock &MBB, LiveIdxT &VLOCOutLocs, - SmallPtrSet<const MachineBasicBlock *, 8> &InScopeBlocks, SmallPtrSet<const MachineBasicBlock *, 8> &BlocksToExplore, DbgValue &LiveIn) { - // To emulate VarLocBasedImpl, process this block if it's not in scope but - // _does_ assign a variable value. No live-ins for this scope are transferred - // in though, so we can return immediately. - if (InScopeBlocks.count(&MBB) == 0 && !ArtificialBlocks.count(&MBB)) - return false; - LLVM_DEBUG(dbgs() << "join MBB: " << MBB.getNumber() << "\n"); bool Changed = false; @@ -2466,11 +2489,10 @@ void InstrRefBasedLDV::buildVLocValueMap(const DILocation *DILoc, // "blocks that are potentially in scope. See comment at start of vlocJoin. SmallPtrSet<const MachineBasicBlock *, 8> InScopeBlocks = BlocksToExplore; - // Old LiveDebugValues tracks variable locations that come out of blocks - // not in scope, where DBG_VALUEs occur. This is something we could - // legitimately ignore, but lets allow it for now. - if (EmulateOldLDV) - BlocksToExplore.insert(AssignBlocks.begin(), AssignBlocks.end()); + // VarLoc LiveDebugValues tracks variable locations that are defined in + // blocks not in scope. This is something we could legitimately ignore, but + // lets allow it for now for the sake of coverage. + BlocksToExplore.insert(AssignBlocks.begin(), AssignBlocks.end()); // We also need to propagate variable values through any artificial blocks // that immediately follow blocks in scope. @@ -2635,7 +2657,7 @@ void InstrRefBasedLDV::buildVLocValueMap(const DILocation *DILoc, // Join values from predecessors. Updates LiveInIdx, and writes output // into JoinedInLocs. bool InLocsChanged = - vlocJoin(*MBB, LiveOutIdx, InScopeBlocks, BlocksToExplore, *LiveIn); + vlocJoin(*MBB, LiveOutIdx, BlocksToExplore, *LiveIn); SmallVector<const MachineBasicBlock *, 8> Preds; for (const auto *Pred : MBB->predecessors()) @@ -2730,6 +2752,8 @@ void InstrRefBasedLDV::buildVLocValueMap(const DILocation *DILoc, continue; if (BlockLiveIn->Kind == DbgValue::VPHI) BlockLiveIn->Kind = DbgValue::Def; + assert(BlockLiveIn->Properties.DIExpr->getFragmentInfo() == + Var.getFragment() && "Fragment info missing during value prop"); Output[MBB->getNumber()].push_back(std::make_pair(Var, *BlockLiveIn)); } } // Per-variable loop. 
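The fragment bookkeeping above (accumulateFragmentMap, the fragment-info assert in the per-variable loop) hinges on knowing when two fragments of the same variable overlap; a standalone illustration of that condition over DIExpression::FragmentInfo, not the helper the pass itself uses:

    #include "llvm/IR/DebugInfoMetadata.h"
    #include <cstdint>

    // Two fragments of one variable overlap when their bit ranges intersect.
    bool fragmentsIntersect(const llvm::DIExpression::FragmentInfo &A,
                            const llvm::DIExpression::FragmentInfo &B) {
      uint64_t AEnd = A.OffsetInBits + A.SizeInBits;
      uint64_t BEnd = B.OffsetInBits + B.SizeInBits;
      return A.OffsetInBits < BEnd && B.OffsetInBits < AEnd;
    }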
@@ -2879,6 +2903,12 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, MFI = &MF.getFrameInfo(); LS.initialize(MF); + const auto &STI = MF.getSubtarget(); + AdjustsStackInCalls = MFI->adjustsStack() && + STI.getFrameLowering()->stackProbeFunctionModifiesSP(); + if (AdjustsStackInCalls) + StackProbeSymbolName = STI.getTargetLowering()->getStackProbeSymbolName(MF); + MTracker = new MLocTracker(MF, *TII, *TRI, *MF.getSubtarget().getTargetLowering()); VTracker = nullptr; @@ -2895,7 +2925,7 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, ++MaxNumBlocks; MLocTransfer.resize(MaxNumBlocks); - vlocs.resize(MaxNumBlocks); + vlocs.resize(MaxNumBlocks, VLocTracker(OverlapFragments, EmptyExpr)); SavedLiveIns.resize(MaxNumBlocks); initialSetup(MF); @@ -3040,6 +3070,8 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, BBNumToRPO.clear(); DebugInstrNumToInstr.clear(); DebugPHINumToValue.clear(); + OverlapFragments.clear(); + SeenFragments.clear(); return Changed; } diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h b/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h index d96ef6d4f6e5..789205e61cdb 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h +++ b/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h @@ -655,6 +655,14 @@ public: const DbgValueProperties &Properties); }; +/// Types for recording sets of variable fragments that overlap. For a given +/// local variable, we record all other fragments of that variable that could +/// overlap it, to reduce search time. +using FragmentOfVar = + std::pair<const DILocalVariable *, DIExpression::FragmentInfo>; +using OverlapMap = + DenseMap<FragmentOfVar, SmallVector<DIExpression::FragmentInfo, 1>>; + /// Collection of DBG_VALUEs observed when traversing a block. Records each /// variable and the value the DBG_VALUE refers to. Requires the machine value /// location dataflow algorithm to have run already, so that values can be @@ -672,9 +680,12 @@ public: MapVector<DebugVariable, DbgValue> Vars; DenseMap<DebugVariable, const DILocation *> Scopes; MachineBasicBlock *MBB = nullptr; + const OverlapMap &OverlappingFragments; + DbgValueProperties EmptyProperties; public: - VLocTracker() {} + VLocTracker(const OverlapMap &O, const DIExpression *EmptyExpr) + : OverlappingFragments(O), EmptyProperties(EmptyExpr, false) {} void defVar(const MachineInstr &MI, const DbgValueProperties &Properties, Optional<ValueIDNum> ID) { @@ -689,6 +700,8 @@ public: if (!Result.second) Result.first->second = Rec; Scopes[Var] = MI.getDebugLoc().get(); + + considerOverlaps(Var, MI.getDebugLoc().get()); } void defVar(const MachineInstr &MI, const MachineOperand &MO) { @@ -704,16 +717,37 @@ public: if (!Result.second) Result.first->second = Rec; Scopes[Var] = MI.getDebugLoc().get(); + + considerOverlaps(Var, MI.getDebugLoc().get()); } -}; -/// Types for recording sets of variable fragments that overlap. For a given -/// local variable, we record all other fragments of that variable that could -/// overlap it, to reduce search time. 
-using FragmentOfVar = - std::pair<const DILocalVariable *, DIExpression::FragmentInfo>; -using OverlapMap = - DenseMap<FragmentOfVar, SmallVector<DIExpression::FragmentInfo, 1>>; + void considerOverlaps(const DebugVariable &Var, const DILocation *Loc) { + auto Overlaps = OverlappingFragments.find( + {Var.getVariable(), Var.getFragmentOrDefault()}); + if (Overlaps == OverlappingFragments.end()) + return; + + // Otherwise: terminate any overlapped variable locations. + for (auto FragmentInfo : Overlaps->second) { + // The "empty" fragment is stored as DebugVariable::DefaultFragment, so + // that it overlaps with everything, however its cannonical representation + // in a DebugVariable is as "None". + Optional<DIExpression::FragmentInfo> OptFragmentInfo = FragmentInfo; + if (DebugVariable::isDefaultFragment(FragmentInfo)) + OptFragmentInfo = None; + + DebugVariable Overlapped(Var.getVariable(), OptFragmentInfo, + Var.getInlinedAt()); + DbgValue Rec = DbgValue(EmptyProperties, DbgValue::Undef); + + // Attempt insertion; overwrite if it's already mapped. + auto Result = Vars.insert(std::make_pair(Overlapped, Rec)); + if (!Result.second) + Result.first->second = Rec; + Scopes[Overlapped] = Loc; + } + } +}; // XXX XXX docs class InstrRefBasedLDV : public LDVImpl { @@ -817,6 +851,16 @@ private: OverlapMap OverlapFragments; VarToFragments SeenFragments; + /// True if we need to examine call instructions for stack clobbers. We + /// normally assume that they don't clobber SP, but stack probes on Windows + /// do. + bool AdjustsStackInCalls = false; + + /// If AdjustsStackInCalls is true, this holds the name of the target's stack + /// probe function, which is the function we expect will alter the stack + /// pointer. + StringRef StackProbeSymbolName; + /// Tests whether this instruction is a spill to a stack slot. bool isSpillInstruction(const MachineInstr &MI, MachineFunction *MF); @@ -962,7 +1006,6 @@ private: /// \returns true if any live-ins change value, either from value propagation /// or PHI elimination. bool vlocJoin(MachineBasicBlock &MBB, LiveIdxT &VLOCOutLocs, - SmallPtrSet<const MachineBasicBlock *, 8> &InScopeBlocks, SmallPtrSet<const MachineBasicBlock *, 8> &BlocksToExplore, DbgValue &LiveIn); diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugVariables.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugVariables.cpp index dcd546f9c6db..5f976bf43c5b 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugVariables.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugVariables.cpp @@ -1875,34 +1875,57 @@ void LDVImpl::emitDebugValues(VirtRegMap *VRM) { LLVM_DEBUG(dbgs() << "********** EMITTING INSTR REFERENCES **********\n"); - // Re-insert any debug instrs back in the position they were. Ordering - // is preserved by vector. We must re-insert in the same order to ensure that - // debug instructions don't swap, which could re-order assignments. - for (auto &P : StashedDebugInstrs) { - SlotIndex Idx = P.Idx; + // Re-insert any debug instrs back in the position they were. We must + // re-insert in the same order to ensure that debug instructions don't swap, + // which could re-order assignments. Do so in a batch -- once we find the + // insert position, insert all instructions at the same SlotIdx. They are + // guaranteed to appear in-sequence in StashedDebugInstrs because we insert + // them in order. 
+ for (auto StashIt = StashedDebugInstrs.begin(); + StashIt != StashedDebugInstrs.end(); ++StashIt) { + SlotIndex Idx = StashIt->Idx; + MachineBasicBlock *MBB = StashIt->MBB; + MachineInstr *MI = StashIt->MI; + + auto EmitInstsHere = [this, &StashIt, MBB, Idx, + MI](MachineBasicBlock::iterator InsertPos) { + // Insert this debug instruction. + MBB->insert(InsertPos, MI); + + // Look at subsequent stashed debug instructions: if they're at the same + // index, insert those too. + auto NextItem = std::next(StashIt); + while (NextItem != StashedDebugInstrs.end() && NextItem->Idx == Idx) { + assert(NextItem->MBB == MBB && "Instrs with same slot index should be" + "in the same block"); + MBB->insert(InsertPos, NextItem->MI); + StashIt = NextItem; + NextItem = std::next(StashIt); + }; + }; // Start block index: find the first non-debug instr in the block, and // insert before it. - if (Idx == Slots->getMBBStartIdx(P.MBB)) { + if (Idx == Slots->getMBBStartIdx(MBB)) { MachineBasicBlock::iterator InsertPos = - findInsertLocation(P.MBB, Idx, *LIS, BBSkipInstsMap); - P.MBB->insert(InsertPos, P.MI); + findInsertLocation(MBB, Idx, *LIS, BBSkipInstsMap); + EmitInstsHere(InsertPos); continue; } if (MachineInstr *Pos = Slots->getInstructionFromIndex(Idx)) { // Insert at the end of any debug instructions. auto PostDebug = std::next(Pos->getIterator()); - PostDebug = skipDebugInstructionsForward(PostDebug, P.MBB->instr_end()); - P.MBB->insert(PostDebug, P.MI); + PostDebug = skipDebugInstructionsForward(PostDebug, MBB->instr_end()); + EmitInstsHere(PostDebug); } else { // Insert position disappeared; walk forwards through slots until we // find a new one. - SlotIndex End = Slots->getMBBEndIdx(P.MBB); + SlotIndex End = Slots->getMBBEndIdx(MBB); for (; Idx < End; Idx = Slots->getNextNonNullIndex(Idx)) { Pos = Slots->getInstructionFromIndex(Idx); if (Pos) { - P.MBB->insert(Pos->getIterator(), P.MI); + EmitInstsHere(Pos->getIterator()); break; } } @@ -1911,8 +1934,8 @@ void LDVImpl::emitDebugValues(VirtRegMap *VRM) { // insert! It's not safe to discard any debug instructions; place them // in front of the first terminator, or in front of end(). if (Idx >= End) { - auto TermIt = P.MBB->getFirstTerminator(); - P.MBB->insert(TermIt, P.MI); + auto TermIt = MBB->getFirstTerminator(); + EmitInstsHere(TermIt); } } } diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LiveRangeEdit.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LiveRangeEdit.cpp index d91ff734ad8f..6380c4bfd6e6 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/LiveRangeEdit.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/LiveRangeEdit.cpp @@ -108,8 +108,7 @@ bool LiveRangeEdit::allUsesAvailableAt(const MachineInstr *OrigMI, SlotIndex UseIdx) const { OrigIdx = OrigIdx.getRegSlot(true); UseIdx = std::max(UseIdx, UseIdx.getRegSlot(true)); - for (unsigned i = 0, e = OrigMI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = OrigMI->getOperand(i); + for (const MachineOperand &MO : OrigMI->operands()) { if (!MO.isReg() || !MO.getReg() || !MO.readsReg()) continue; @@ -425,15 +424,8 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr *> &Dead, // The new intervals would have to be spilled anyway so its not worth it. // Also they currently aren't spilled so creating them and not spilling // them results in incorrect code. 
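The surrounding hunks lean on two more ADT helpers, llvm::is_contained (LiveRangeEdit, next hunk) and llvm::reverse (GlobalMerge, earlier); a small sketch of both over a plain std::vector:

    #include "llvm/ADT/STLExtras.h"
    #include <vector>

    // llvm::reverse walks a range back-to-front without index arithmetic.
    int lastNonZero(const std::vector<int> &V) {
      for (int X : llvm::reverse(V))
        if (X != 0)
          return X;
      return 0;
    }

    // llvm::is_contained is the linear membership test the removed loop in
    // eliminateDeadDefs spelled out by hand.
    bool hasValue(const std::vector<int> &V, int X) {
      return llvm::is_contained(V, X);
    }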
- bool BeingSpilled = false; - for (unsigned i = 0, e = RegsBeingSpilled.size(); i != e; ++i) { - if (VReg == RegsBeingSpilled[i]) { - BeingSpilled = true; - break; - } - } - - if (BeingSpilled) continue; + if (llvm::is_contained(RegsBeingSpilled, VReg)) + continue; // LI may have been separated, create new intervals. LI->RenumberValues(); diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LiveRangeUtils.h b/contrib/llvm-project/llvm/lib/CodeGen/LiveRangeUtils.h index dace05f1ad95..ada5c5be484a 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/LiveRangeUtils.h +++ b/contrib/llvm-project/llvm/lib/CodeGen/LiveRangeUtils.h @@ -18,7 +18,7 @@ namespace llvm { /// Helper function that distributes live range value numbers and the -/// corresponding segments of a master live range \p LR to a list of newly +/// corresponding segments of a primary live range \p LR to a list of newly /// created live ranges \p SplitLRs. \p VNIClasses maps each value number in \p /// LR to 0 meaning it should stay or to 1..N meaning it should go to a specific /// live range in the \p SplitLRs array. diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LiveVariables.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LiveVariables.cpp index 51ba4b7e53eb..e8744797707b 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/LiveVariables.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/LiveVariables.cpp @@ -58,9 +58,9 @@ void LiveVariables::getAnalysisUsage(AnalysisUsage &AU) const { MachineInstr * LiveVariables::VarInfo::findKill(const MachineBasicBlock *MBB) const { - for (unsigned i = 0, e = Kills.size(); i != e; ++i) - if (Kills[i]->getParent() == MBB) - return Kills[i]; + for (MachineInstr *MI : Kills) + if (MI->getParent() == MBB) + return MI; return nullptr; } @@ -811,8 +811,8 @@ bool LiveVariables::isLiveOut(Register Reg, const MachineBasicBlock &MBB) { LiveVariables::VarInfo &VI = getVarInfo(Reg); SmallPtrSet<const MachineBasicBlock *, 8> Kills; - for (unsigned i = 0, e = VI.Kills.size(); i != e; ++i) - Kills.insert(VI.Kills[i]->getParent()); + for (MachineInstr *MI : VI.Kills) + Kills.insert(MI->getParent()); // Loop over all of the successors of the basic block, checking to see if // the value is either live in the block, or if it is killed in the block. diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp index 2e99c8595cbd..ee2387d1e8e6 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp @@ -316,14 +316,14 @@ bool LocalStackSlotPass::insertFrameReferenceRegisters(MachineFunction &Fn) { // than that, but the increased register pressure makes that a // tricky thing to balance. Investigate if re-materializing these // becomes an issue. - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + for (const MachineOperand &MO : MI.operands()) { // Consider replacing all frame index operands that reference // an object allocated in the local block. - if (MI.getOperand(i).isFI()) { + if (MO.isFI()) { // Don't try this with values not in the local block. 
- if (!MFI.isObjectPreAllocated(MI.getOperand(i).getIndex())) + if (!MFI.isObjectPreAllocated(MO.getIndex())) break; - int Idx = MI.getOperand(i).getIndex(); + int Idx = MO.getIndex(); int64_t LocalOffset = LocalOffsets[Idx]; if (!TRI->needsFrameBaseReg(&MI, LocalOffset)) break; diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MIRSampleProfile.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MIRSampleProfile.cpp index 90ecc6fc68fc..b742ad9823c9 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/MIRSampleProfile.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/MIRSampleProfile.cpp @@ -314,6 +314,8 @@ bool MIRProfileLoaderPass::runOnMachineFunction(MachineFunction &MF) { } bool Changed = MIRSampleLoader->runOnFunction(MF); + if (Changed) + MBFI->calculate(MF, *MBFI->getMBPI(), *&getAnalysis<MachineLoopInfo>()); if (ViewBFIAfter && ViewBlockLayoutWithBFI != GVDT_None && (ViewBlockFreqFuncName.empty() || diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineFunction.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineFunction.cpp index 366d06871245..310c2721c3bd 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/MachineFunction.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineFunction.cpp @@ -1170,9 +1170,10 @@ auto MachineFunction::salvageCopySSA(MachineInstr &MI) void MachineFunction::finalizeDebugInstrRefs() { auto *TII = getSubtarget().getInstrInfo(); - auto MakeDbgValue = [&](MachineInstr &MI) { + auto MakeUndefDbgValue = [&](MachineInstr &MI) { const MCInstrDesc &RefII = TII->get(TargetOpcode::DBG_VALUE); MI.setDesc(RefII); + MI.getOperand(0).setReg(0); MI.getOperand(1).ChangeToRegister(0, false); }; @@ -1187,15 +1188,15 @@ void MachineFunction::finalizeDebugInstrRefs() { Register Reg = MI.getOperand(0).getReg(); // Some vregs can be deleted as redundant in the meantime. Mark those - // as DBG_VALUE $noreg. - if (Reg == 0) { - MakeDbgValue(MI); + // as DBG_VALUE $noreg. Additionally, some normal instructions are + // quickly deleted, leaving dangling references to vregs with no def. 
+ if (Reg == 0 || !RegInfo->hasOneDef(Reg)) { + MakeUndefDbgValue(MI); continue; } assert(Reg.isVirtual()); MachineInstr &DefMI = *RegInfo->def_instr_begin(Reg); - assert(RegInfo->hasOneDef(Reg)); // If we've found a copy-like instruction, follow it back to the // instruction that defines the source value, see salvageCopySSA docs @@ -1327,9 +1328,9 @@ bool MachineJumpTableInfo::ReplaceMBBInJumpTable(unsigned Idx, assert(Old != New && "Not making a change?"); bool MadeChange = false; MachineJumpTableEntry &JTE = JumpTables[Idx]; - for (size_t j = 0, e = JTE.MBBs.size(); j != e; ++j) - if (JTE.MBBs[j] == Old) { - JTE.MBBs[j] = New; + for (MachineBasicBlock *&MBB : JTE.MBBs) + if (MBB == Old) { + MBB = New; MadeChange = true; } return MadeChange; @@ -1342,8 +1343,8 @@ void MachineJumpTableInfo::print(raw_ostream &OS) const { for (unsigned i = 0, e = JumpTables.size(); i != e; ++i) { OS << printJumpTableEntryReference(i) << ':'; - for (unsigned j = 0, f = JumpTables[i].MBBs.size(); j != f; ++j) - OS << ' ' << printMBBReference(*JumpTables[i].MBBs[j]); + for (const MachineBasicBlock *MBB : JumpTables[i].MBBs) + OS << ' ' << printMBBReference(*MBB); if (i != e) OS << '\n'; } diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineInstr.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineInstr.cpp index 5c4f75e9ceb9..aaa80432d2f2 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/MachineInstr.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineInstr.cpp @@ -1490,12 +1490,10 @@ bool MachineInstr::allDefsAreDead() const { /// instruction to this instruction. void MachineInstr::copyImplicitOps(MachineFunction &MF, const MachineInstr &MI) { - for (unsigned i = MI.getDesc().getNumOperands(), e = MI.getNumOperands(); - i != e; ++i) { - const MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : + llvm::drop_begin(MI.operands(), MI.getDesc().getNumOperands())) if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask()) addOperand(MF, MO); - } } bool MachineInstr::hasComplexRegisterTies() const { diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineOperand.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineOperand.cpp index 4d080e1a4f82..680dbe54ffaf 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/MachineOperand.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineOperand.cpp @@ -1071,7 +1071,9 @@ void MachineMemOperand::refineAlignment(const MachineMemOperand *MMO) { // The Value and Offset may differ due to CSE. But the flags and size // should be the same. assert(MMO->getFlags() == getFlags() && "Flags mismatch!"); - assert(MMO->getSize() == getSize() && "Size mismatch!"); + assert((MMO->getSize() == ~UINT64_C(0) || getSize() == ~UINT64_C(0) || + MMO->getSize() == getSize()) && + "Size mismatch!"); if (MMO->getBaseAlign() >= getBaseAlign()) { // Update the alignment value. diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineOutliner.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineOutliner.cpp index cfbccebaff3e..7783b5e0d3cc 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/MachineOutliner.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineOutliner.cpp @@ -617,20 +617,11 @@ MachineFunction *MachineOutliner::createOutlinedFunction( F->addFnAttr(Attribute::OptimizeForSize); F->addFnAttr(Attribute::MinSize); - // Include target features from an arbitrary candidate for the outlined - // function. This makes sure the outlined function knows what kinds of - // instructions are going into it. 
This is fine, since all parent functions - // must necessarily support the instructions that are in the outlined region. Candidate &FirstCand = OF.Candidates.front(); - const Function &ParentFn = FirstCand.getMF()->getFunction(); - if (ParentFn.hasFnAttribute("target-features")) - F->addFnAttr(ParentFn.getFnAttribute("target-features")); + const TargetInstrInfo &TII = + *FirstCand.getMF()->getSubtarget().getInstrInfo(); - // Set nounwind, so we don't generate eh_frame. - if (llvm::all_of(OF.Candidates, [](const outliner::Candidate &C) { - return C.getMF()->getFunction().hasFnAttribute(Attribute::NoUnwind); - })) - F->addFnAttr(Attribute::NoUnwind); + TII.mergeOutliningCandidateAttributes(*F, OF.Candidates); BasicBlock *EntryBB = BasicBlock::Create(C, "entry", F); IRBuilder<> Builder(EntryBB); @@ -639,8 +630,6 @@ MachineFunction *MachineOutliner::createOutlinedFunction( MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI(); MachineFunction &MF = MMI.getOrCreateMachineFunction(*F); MachineBasicBlock &MBB = *MF.CreateMachineBasicBlock(); - const TargetSubtargetInfo &STI = MF.getSubtarget(); - const TargetInstrInfo &TII = *STI.getInstrInfo(); // Insert the new function into the module. MF.insert(MF.begin(), &MBB); diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachinePipeliner.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachinePipeliner.cpp index e18318386def..8d6459a627fa 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/MachinePipeliner.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/MachinePipeliner.cpp @@ -1455,17 +1455,15 @@ void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) { int asap = 0; int zeroLatencyDepth = 0; SUnit *SU = &SUnits[I]; - for (SUnit::const_pred_iterator IP = SU->Preds.begin(), - EP = SU->Preds.end(); - IP != EP; ++IP) { - SUnit *pred = IP->getSUnit(); - if (IP->getLatency() == 0) + for (const SDep &P : SU->Preds) { + SUnit *pred = P.getSUnit(); + if (P.getLatency() == 0) zeroLatencyDepth = std::max(zeroLatencyDepth, getZeroLatencyDepth(pred) + 1); - if (ignoreDependence(*IP, true)) + if (ignoreDependence(P, true)) continue; - asap = std::max(asap, (int)(getASAP(pred) + IP->getLatency() - - getDistance(pred, SU, *IP) * MII)); + asap = std::max(asap, (int)(getASAP(pred) + P.getLatency() - + getDistance(pred, SU, P) * MII)); } maxASAP = std::max(maxASAP, asap); ScheduleInfo[I].ASAP = asap; @@ -1521,9 +1519,8 @@ static bool pred_L(SetVector<SUnit *> &NodeOrder, SmallSetVector<SUnit *, 8> &Preds, const NodeSet *S = nullptr) { Preds.clear(); - for (SetVector<SUnit *>::iterator I = NodeOrder.begin(), E = NodeOrder.end(); - I != E; ++I) { - for (const SDep &Pred : (*I)->Preds) { + for (const SUnit *SU : NodeOrder) { + for (const SDep &Pred : SU->Preds) { if (S && S->count(Pred.getSUnit()) == 0) continue; if (ignoreDependence(Pred, true)) @@ -1532,7 +1529,7 @@ static bool pred_L(SetVector<SUnit *> &NodeOrder, Preds.insert(Pred.getSUnit()); } // Back-edges are predecessors with an anti-dependence. 
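For the ASAP recurrence in computeNodeFunctions above, asap(SU) is the maximum over non-ignored predecessor edges of ASAP(pred) + latency - distance * MII. With illustrative numbers and MII = 2: a predecessor at ASAP 4 with latency 3 and distance 0 contributes 4 + 3 - 0 = 7, while a loop-carried predecessor at ASAP 6 with latency 1 and distance 1 contributes 6 + 1 - 2 = 5, so the node's ASAP is 7.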
- for (const SDep &Succ : (*I)->Succs) { + for (const SDep &Succ : SU->Succs) { if (Succ.getKind() != SDep::Anti) continue; if (S && S->count(Succ.getSUnit()) == 0) @@ -2546,8 +2543,7 @@ void SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU, unsigned Pos = 0; for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E; ++I, ++Pos) { - for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) { - MachineOperand &MO = MI->getOperand(i); + for (MachineOperand &MO : MI->operands()) { if (!MO.isReg() || !Register::isVirtualRegister(MO.getReg())) continue; diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineSink.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineSink.cpp index 30745c7a5583..54c478645dcf 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/MachineSink.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineSink.cpp @@ -596,8 +596,7 @@ bool MachineSinking::isWorthBreakingCriticalEdge(MachineInstr &MI, // MI is cheap, we probably don't want to break the critical edge for it. // However, if this would allow some definitions of its source operands // to be sunk then it's probably worth it. - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg() || !MO.isUse()) continue; Register Reg = MO.getReg(); @@ -789,8 +788,7 @@ bool MachineSinking::isProfitableToSinkTo(Register Reg, MachineInstr &MI, // If this instruction is inside a loop and sinking this instruction can make // more registers live range shorten, it is still prifitable. - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { // Ignore non-register operands. if (!MO.isReg()) continue; @@ -889,8 +887,7 @@ MachineSinking::FindSuccToSinkTo(MachineInstr &MI, MachineBasicBlock *MBB, // SuccToSinkTo - This is the successor to sink this instruction to, once we // decide. MachineBasicBlock *SuccToSinkTo = nullptr; - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; // Ignore non-register operands. Register Reg = MO.getReg(); @@ -1322,8 +1319,7 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore, // If the instruction to move defines a dead physical register which is live // when leaving the basic block, don't move it because it could turn into a // "zombie" define of that preg. E.g., EFLAGS. 
(<rdar://problem/8030636>) - for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { - const MachineOperand &MO = MI.getOperand(I); + for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg() || MO.isUse()) continue; Register Reg = MO.getReg(); diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineVerifier.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineVerifier.cpp index d6bb3e7c9e58..32078db76cf3 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineVerifier.cpp @@ -1276,11 +1276,9 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) { if (DstTy.getNumElements() != MI->getNumOperands() - 1) report("G_BUILD_VECTOR must have an operand for each elemement", MI); - for (unsigned i = 2; i < MI->getNumOperands(); ++i) { - if (MRI->getType(MI->getOperand(1).getReg()) != - MRI->getType(MI->getOperand(i).getReg())) + for (const MachineOperand &MO : llvm::drop_begin(MI->operands(), 2)) + if (MRI->getType(MI->getOperand(1).getReg()) != MRI->getType(MO.getReg())) report("G_BUILD_VECTOR source operand types are not homogeneous", MI); - } break; } @@ -1292,12 +1290,10 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) { if (!DstTy.isVector() || SrcEltTy.isVector()) report("G_BUILD_VECTOR_TRUNC must produce a vector from scalar operands", MI); - for (unsigned i = 2; i < MI->getNumOperands(); ++i) { - if (MRI->getType(MI->getOperand(1).getReg()) != - MRI->getType(MI->getOperand(i).getReg())) + for (const MachineOperand &MO : llvm::drop_begin(MI->operands(), 2)) + if (MRI->getType(MI->getOperand(1).getReg()) != MRI->getType(MO.getReg())) report("G_BUILD_VECTOR_TRUNC source operand types are not homogeneous", MI); - } if (SrcEltTy.getSizeInBits() <= DstTy.getElementType().getSizeInBits()) report("G_BUILD_VECTOR_TRUNC source operand types are not larger than " "dest elt type", @@ -1316,11 +1312,9 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) { if (MI->getNumOperands() < 3) report("G_CONCAT_VECTOR requires at least 2 source operands", MI); - for (unsigned i = 2; i < MI->getNumOperands(); ++i) { - if (MRI->getType(MI->getOperand(1).getReg()) != - MRI->getType(MI->getOperand(i).getReg())) + for (const MachineOperand &MO : llvm::drop_begin(MI->operands(), 2)) + if (MRI->getType(MI->getOperand(1).getReg()) != MRI->getType(MO.getReg())) report("G_CONCAT_VECTOR source operand types are not homogeneous", MI); - } if (DstTy.getNumElements() != SrcTy.getNumElements() * (MI->getNumOperands() - 1)) report("G_CONCAT_VECTOR num dest and source elements should match", MI); @@ -3063,9 +3057,9 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, SlotIndex PEnd = LiveInts->getMBBEndIdx(Pred); // Predecessor of landing pad live-out on last call. 
if (MFI->isEHPad()) { - for (auto I = Pred->rbegin(), E = Pred->rend(); I != E; ++I) { - if (I->isCall()) { - PEnd = Indexes->getInstructionIndex(*I).getBoundaryIndex(); + for (const MachineInstr &MI : llvm::reverse(*Pred)) { + if (MI.isCall()) { + PEnd = Indexes->getInstructionIndex(MI).getBoundaryIndex(); break; } } diff --git a/contrib/llvm-project/llvm/lib/CodeGen/ModuloSchedule.cpp b/contrib/llvm-project/llvm/lib/CodeGen/ModuloSchedule.cpp index 8b3cdfab4d42..aaa6403cc978 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/ModuloSchedule.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/ModuloSchedule.cpp @@ -73,8 +73,7 @@ void ModuloScheduleExpander::expand() { // stage difference for each use. Keep the maximum value. for (MachineInstr *MI : Schedule.getInstructions()) { int DefStage = Schedule.getStage(MI); - for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) { - MachineOperand &Op = MI->getOperand(i); + for (const MachineOperand &Op : MI->operands()) { if (!Op.isReg() || !Op.isDef()) continue; @@ -1006,8 +1005,7 @@ void ModuloScheduleExpander::updateInstruction(MachineInstr *NewMI, unsigned CurStageNum, unsigned InstrStageNum, ValueMapTy *VRMap) { - for (unsigned i = 0, e = NewMI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = NewMI->getOperand(i); + for (MachineOperand &MO : NewMI->operands()) { if (!MO.isReg() || !Register::isVirtualRegister(MO.getReg())) continue; Register reg = MO.getReg(); diff --git a/contrib/llvm-project/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/contrib/llvm-project/llvm/lib/CodeGen/PrologEpilogInserter.cpp index 9a4f70a6070f..29a88480fd9f 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -527,9 +527,9 @@ static void updateLiveness(MachineFunction &MF) { const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); - for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + for (const CalleeSavedInfo &I : CSI) { for (MachineBasicBlock *MBB : Visited) { - MCPhysReg Reg = CSI[i].getReg(); + MCPhysReg Reg = I.getReg(); // Add the callee-saved register as live-in. // It's killed at the spill. if (!MRI.isReserved(Reg) && !MBB->isLiveIn(Reg)) @@ -540,17 +540,16 @@ static void updateLiveness(MachineFunction &MF) { // each MBB between the prologue and epilogue so that it is not clobbered // before it is reloaded in the epilogue. The Visited set contains all // blocks outside of the region delimited by prologue/epilogue. - if (CSI[i].isSpilledToReg()) { + if (I.isSpilledToReg()) { for (MachineBasicBlock &MBB : MF) { if (Visited.count(&MBB)) continue; - MCPhysReg DstReg = CSI[i].getDstReg(); + MCPhysReg DstReg = I.getDstReg(); if (!MBB.isLiveIn(DstReg)) MBB.addLiveIn(DstReg); } } } - } /// Insert restore code for the callee-saved registers used in the function. @@ -902,9 +901,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &MF) { // incoming stack pointer if a frame pointer is required and is closer // to the incoming rather than the final stack pointer. 
const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); - bool EarlyScavengingSlots = (TFI.hasFP(MF) && TFI.isFPCloseToIncomingSP() && - RegInfo->useFPForScavengingIndex(MF) && - !RegInfo->hasStackRealignment(MF)); + bool EarlyScavengingSlots = TFI.allocateScavengingFrameIndexesNearIncomingSP(MF); if (RS && EarlyScavengingSlots) { SmallVector<int, 2> SFIs; RS->getScavengingFrameIndices(SFIs); diff --git a/contrib/llvm-project/llvm/lib/CodeGen/RegAllocFast.cpp b/contrib/llvm-project/llvm/lib/CodeGen/RegAllocFast.cpp index 68920e2e50df..6653145d3d2a 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/RegAllocFast.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/RegAllocFast.cpp @@ -1258,8 +1258,7 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) { // Free registers occupied by defs. // Iterate operands in reverse order, so we see the implicit super register // defs first (we added them earlier in case of <def,read-undef>). - for (unsigned I = MI.getNumOperands(); I-- > 0;) { - MachineOperand &MO = MI.getOperand(I); + for (MachineOperand &MO : llvm::reverse(MI.operands())) { if (!MO.isReg() || !MO.isDef()) continue; @@ -1362,8 +1361,7 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) { // Free early clobbers. if (HasEarlyClobber) { - for (unsigned I = MI.getNumOperands(); I-- > 0; ) { - MachineOperand &MO = MI.getOperand(I); + for (MachineOperand &MO : llvm::reverse(MI.operands())) { if (!MO.isReg() || !MO.isDef() || !MO.isEarlyClobber()) continue; // subreg defs don't free the full register. We left the subreg number @@ -1440,8 +1438,7 @@ void RegAllocFast::handleBundle(MachineInstr &MI) { MachineBasicBlock::instr_iterator BundledMI = MI.getIterator(); ++BundledMI; while (BundledMI->isBundledWithPred()) { - for (unsigned I = 0; I < BundledMI->getNumOperands(); ++I) { - MachineOperand &MO = BundledMI->getOperand(I); + for (MachineOperand &MO : BundledMI->operands()) { if (!MO.isReg()) continue; diff --git a/contrib/llvm-project/llvm/lib/CodeGen/RegAllocGreedy.cpp b/contrib/llvm-project/llvm/lib/CodeGen/RegAllocGreedy.cpp index 5a93b58e0baf..50411c177007 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -199,7 +199,8 @@ class RAGreedy : public MachineFunctionPass, struct RegInfo { LiveRangeStage Stage = RS_New; - // Cascade - Eviction loop prevention. See canEvictInterference(). + // Cascade - Eviction loop prevention. See + // canEvictInterferenceBasedOnCost(). unsigned Cascade = 0; RegInfo() = default; @@ -207,13 +208,51 @@ class RAGreedy : public MachineFunctionPass, IndexedMap<RegInfo, VirtReg2IndexFunctor> ExtraRegInfo; + LiveRangeStage getStage(Register Reg) const { + return ExtraRegInfo[Reg].Stage; + } + LiveRangeStage getStage(const LiveInterval &VirtReg) const { - return ExtraRegInfo[VirtReg.reg()].Stage; + return getStage(VirtReg.reg()); + } + + void setStage(Register Reg, LiveRangeStage Stage) { + ExtraRegInfo.resize(MRI->getNumVirtRegs()); + ExtraRegInfo[Reg].Stage = Stage; } void setStage(const LiveInterval &VirtReg, LiveRangeStage Stage) { + setStage(VirtReg.reg(), Stage); + } + + /// Return the current stage of the register, if present, otherwise initialize + /// it and return that. 
+ LiveRangeStage getOrInitStage(Register Reg) { + ExtraRegInfo.grow(Reg); + return getStage(Reg); + } + + unsigned getCascade(Register Reg) const { return ExtraRegInfo[Reg].Cascade; } + + void setCascade(Register Reg, unsigned Cascade) { ExtraRegInfo.resize(MRI->getNumVirtRegs()); - ExtraRegInfo[VirtReg.reg()].Stage = Stage; + ExtraRegInfo[Reg].Cascade = Cascade; + } + + unsigned getOrAssignNewCascade(Register Reg) { + unsigned Cascade = getCascade(Reg); + if (!Cascade) { + Cascade = NextCascade++; + setCascade(Reg, Cascade); + } + return Cascade; + } + + unsigned getCascadeOrCurrentNext(Register Reg) const { + unsigned Cascade = getCascade(Reg); + if (!Cascade) + Cascade = NextCascade; + return Cascade; } template<typename Iterator> @@ -410,8 +449,11 @@ private: void calcGapWeights(MCRegister, SmallVectorImpl<float> &); Register canReassign(LiveInterval &VirtReg, Register PrevReg) const; bool shouldEvict(LiveInterval &A, bool, LiveInterval &B, bool) const; - bool canEvictInterference(LiveInterval &, MCRegister, bool, EvictionCost &, - const SmallVirtRegSet &) const; + bool canEvictInterferenceBasedOnCost(LiveInterval &, MCRegister, bool, + EvictionCost &, + const SmallVirtRegSet &) const; + bool canEvictHintInterference(LiveInterval &, MCRegister, + const SmallVirtRegSet &) const; bool canEvictInterferenceInRange(const LiveInterval &VirtReg, MCRegister PhysReg, SlotIndex Start, SlotIndex End, EvictionCost &MaxCost) const; @@ -683,15 +725,16 @@ void RAGreedy::enqueue(PQueue &CurQueue, LiveInterval *LI) { assert(Reg.isVirtual() && "Can only enqueue virtual registers"); unsigned Prio; - ExtraRegInfo.grow(Reg); - if (ExtraRegInfo[Reg].Stage == RS_New) - ExtraRegInfo[Reg].Stage = RS_Assign; - - if (ExtraRegInfo[Reg].Stage == RS_Split) { + auto Stage = getOrInitStage(Reg); + if (Stage == RS_New) { + Stage = RS_Assign; + setStage(Reg, Stage); + } + if (Stage == RS_Split) { // Unsplit ranges that couldn't be allocated immediately are deferred until // everything else has been allocated. Prio = Size; - } else if (ExtraRegInfo[Reg].Stage == RS_Memory) { + } else if (Stage == RS_Memory) { // Memory operand should be considered last. // Change the priority such that Memory operand are assigned in // the reverse order that they came in. @@ -706,7 +749,7 @@ void RAGreedy::enqueue(PQueue &CurQueue, LiveInterval *LI) { bool ForceGlobal = !ReverseLocal && (Size / SlotIndex::InstrDist) > (2 * RCI.getNumAllocatableRegs(&RC)); - if (ExtraRegInfo[Reg].Stage == RS_Assign && !ForceGlobal && !LI->empty() && + if (Stage == RS_Assign && !ForceGlobal && !LI->empty() && LIS->intervalIsInOneMBB(*LI)) { // Allocate original local ranges in linear instruction order. Since they // are singly defined, this produces optimal coloring in the absence of @@ -780,10 +823,8 @@ MCRegister RAGreedy::tryAssign(LiveInterval &VirtReg, if (Order.isHint(Hint)) { MCRegister PhysHint = Hint.asMCReg(); LLVM_DEBUG(dbgs() << "missed hint " << printReg(PhysHint, TRI) << '\n'); - EvictionCost MaxCost; - MaxCost.setBrokenHints(1); - if (canEvictInterference(VirtReg, PhysHint, true, MaxCost, - FixedRegisters)) { + + if (canEvictHintInterference(VirtReg, PhysHint, FixedRegisters)) { evictInterference(VirtReg, PhysHint, NewVRegs); return PhysHint; } @@ -864,8 +905,19 @@ bool RAGreedy::shouldEvict(LiveInterval &A, bool IsHint, return false; } -/// canEvictInterference - Return true if all interferences between VirtReg and -/// PhysReg can be evicted. 
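// Editor's illustrative aside (not part of the patch): a minimal standalone
// rendering of the cascade bookkeeping the new accessors wrap. Roughly, a
// register keeps its cascade number once assigned, and an interference is
// only evicted by a strictly newer (larger) cascade, which is what breaks
// eviction cycles (the real check also weighs spillability). CascadeMap is a
// hypothetical name.
#include <unordered_map>

struct CascadeMap {
  std::unordered_map<unsigned, unsigned> Cascade; // virtual register -> cascade id (0 = none)
  unsigned NextCascade = 1;

  unsigned getOrAssignNewCascade(unsigned Reg) {
    unsigned &C = Cascade[Reg]; // default-constructs to 0 on first use
    if (!C)
      C = NextCascade++;
    return C;
  }

  // The evictor must carry a newer cascade than the live range it displaces.
  bool mayEvict(unsigned EvictorCascade, unsigned IntfReg) {
    return Cascade[IntfReg] < EvictorCascade;
  }
};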
+/// canEvictHintInterference - return true if the interference for VirtReg +/// on the PhysReg, which is VirtReg's hint, can be evicted in favor of VirtReg. +bool RAGreedy::canEvictHintInterference( + LiveInterval &VirtReg, MCRegister PhysReg, + const SmallVirtRegSet &FixedRegisters) const { + EvictionCost MaxCost; + MaxCost.setBrokenHints(1); + return canEvictInterferenceBasedOnCost(VirtReg, PhysReg, true, MaxCost, + FixedRegisters); +} + +/// canEvictInterferenceBasedOnCost - Return true if all interferences between +/// VirtReg and PhysReg can be evicted. /// /// @param VirtReg Live range that is about to be assigned. /// @param PhysReg Desired register for assignment. @@ -873,7 +925,7 @@ bool RAGreedy::shouldEvict(LiveInterval &A, bool IsHint, /// @param MaxCost Only look for cheaper candidates and update with new cost /// when returning true. /// @returns True when interference can be evicted cheaper than MaxCost. -bool RAGreedy::canEvictInterference( +bool RAGreedy::canEvictInterferenceBasedOnCost( LiveInterval &VirtReg, MCRegister PhysReg, bool IsHint, EvictionCost &MaxCost, const SmallVirtRegSet &FixedRegisters) const { // It is only possible to evict virtual register interference. @@ -1054,9 +1106,7 @@ void RAGreedy::evictInterference(LiveInterval &VirtReg, MCRegister PhysReg, // Make sure that VirtReg has a cascade number, and assign that cascade // number to every evicted register. These live ranges than then only be // evicted by a newer cascade, preventing infinite loops. - unsigned Cascade = ExtraRegInfo[VirtReg.reg()].Cascade; - if (!Cascade) - Cascade = ExtraRegInfo[VirtReg.reg()].Cascade = NextCascade++; + unsigned Cascade = getOrAssignNewCascade(VirtReg.reg()); LLVM_DEBUG(dbgs() << "evicting " << printReg(PhysReg, TRI) << " interference: Cascade " << Cascade << '\n'); @@ -1082,10 +1132,10 @@ void RAGreedy::evictInterference(LiveInterval &VirtReg, MCRegister PhysReg, LastEvicted.addEviction(PhysReg, VirtReg.reg(), Intf->reg()); Matrix->unassign(*Intf); - assert((ExtraRegInfo[Intf->reg()].Cascade < Cascade || + assert((getCascade(Intf->reg()) < Cascade || VirtReg.isSpillable() < Intf->isSpillable()) && "Cannot decrease cascade number, illegal eviction"); - ExtraRegInfo[Intf->reg()].Cascade = Cascade; + setCascade(Intf->reg(), Cascade); ++NumEvicted; NewVRegs.push_back(Intf->reg()); } @@ -1150,8 +1200,8 @@ MCRegister RAGreedy::tryFindEvictionCandidate( continue; } - if (!canEvictInterference(VirtReg, PhysReg, false, BestCost, - FixedRegisters)) + if (!canEvictInterferenceBasedOnCost(VirtReg, PhysReg, false, BestCost, + FixedRegisters)) continue; // Best so far. @@ -1756,7 +1806,6 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit, SE->finish(&IntvMap); DebugVars->splitRegister(Reg, LREdit.regs(), *LIS); - ExtraRegInfo.resize(MRI->getNumVirtRegs()); unsigned OrigBlocks = SA->getNumLiveBlocks(); // Sort out the new intervals created by splitting. We get four kinds: @@ -1765,10 +1814,10 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit, // - Block-local splits are candidates for local splitting. // - DCE leftovers should go back on the queue. for (unsigned I = 0, E = LREdit.size(); I != E; ++I) { - LiveInterval &Reg = LIS->getInterval(LREdit.get(I)); + const LiveInterval &Reg = LIS->getInterval(LREdit.get(I)); // Ignore old intervals from DCE. - if (getStage(Reg) != RS_New) + if (getOrInitStage(Reg.reg()) != RS_New) continue; // Remainder interval. 
Don't try splitting again, spill if it doesn't @@ -2012,13 +2061,11 @@ unsigned RAGreedy::tryBlockSplit(LiveInterval &VirtReg, AllocationOrder &Order, // Tell LiveDebugVariables about the new ranges. DebugVars->splitRegister(Reg, LREdit.regs(), *LIS); - ExtraRegInfo.resize(MRI->getNumVirtRegs()); - // Sort out the new intervals created by splitting. The remainder interval // goes straight to spilling, the new local ranges get to stay RS_New. for (unsigned I = 0, E = LREdit.size(); I != E; ++I) { - LiveInterval &LI = LIS->getInterval(LREdit.get(I)); - if (getStage(LI) == RS_New && IntvMap[I] == 0) + const LiveInterval &LI = LIS->getInterval(LREdit.get(I)); + if (getOrInitStage(LI.reg()) == RS_New && IntvMap[I] == 0) setStage(LI, RS_Spill); } @@ -2104,8 +2151,6 @@ RAGreedy::tryInstructionSplit(LiveInterval &VirtReg, AllocationOrder &Order, SmallVector<unsigned, 8> IntvMap; SE->finish(&IntvMap); DebugVars->splitRegister(VirtReg.reg(), LREdit.regs(), *LIS); - ExtraRegInfo.resize(MRI->getNumVirtRegs()); - // Assign all new registers to RS_Spill. This was the last chance. setStage(LREdit.begin(), LREdit.end(), RS_Spill); return 0; @@ -2400,7 +2445,6 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order, SmallVector<unsigned, 8> IntvMap; SE->finish(&IntvMap); DebugVars->splitRegister(VirtReg.reg(), LREdit.regs(), *LIS); - // If the new range has the same number of instructions as before, mark it as // RS_Split2 so the next split will be forced to make progress. Otherwise, // leave the new intervals as RS_New so they can compete. @@ -3021,7 +3065,7 @@ MCRegister RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, LiveRangeStage Stage = getStage(VirtReg); LLVM_DEBUG(dbgs() << StageName[Stage] << " Cascade " - << ExtraRegInfo[VirtReg.reg()].Cascade << '\n'); + << getCascade(VirtReg.reg()) << '\n'); // Try to evict a less worthy live range, but only for ranges from the primary // queue. The RS_Split ranges already failed to do this, and they should not @@ -3311,7 +3355,6 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { SA.reset(new SplitAnalysis(*VRM, *LIS, *Loops)); SE.reset(new SplitEditor(*SA, *AA, *LIS, *VRM, *DomTree, *MBFI, *VRAI)); ExtraRegInfo.clear(); - ExtraRegInfo.resize(MRI->getNumVirtRegs()); NextCascade = 1; IntfCache.init(MF, Matrix->getLiveUnions(), Indexes, LIS, TRI); GlobalCand.resize(32); // This will grow as needed. diff --git a/contrib/llvm-project/llvm/lib/CodeGen/RegisterCoalescer.cpp b/contrib/llvm-project/llvm/lib/CodeGen/RegisterCoalescer.cpp index c847068bca90..4c8534cf2d01 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -3908,20 +3908,20 @@ void RegisterCoalescer::lateLiveIntervalUpdate() { bool RegisterCoalescer:: copyCoalesceWorkList(MutableArrayRef<MachineInstr*> CurrList) { bool Progress = false; - for (unsigned i = 0, e = CurrList.size(); i != e; ++i) { - if (!CurrList[i]) + for (MachineInstr *&MI : CurrList) { + if (!MI) continue; // Skip instruction pointers that have already been erased, for example by // dead code elimination. 
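// Editor's illustrative aside (not part of the patch): the by-reference
// element iteration this hunk switches to (MachineInstr *&MI) lets the loop
// clear work-list slots in place, exactly like the old CurrList[i] = nullptr
// in the lines that follow. A plain standalone sketch; clearProcessed is a
// hypothetical name.
#include <vector>

void clearProcessed(std::vector<int *> &WorkList) {
  for (int *&P : WorkList) {
    if (!P)
      continue;    // slot already cleared
    // ... process *P here ...
    P = nullptr;   // writes through to the container element
  }
}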
- if (ErasedInstrs.count(CurrList[i])) { - CurrList[i] = nullptr; + if (ErasedInstrs.count(MI)) { + MI = nullptr; continue; } bool Again = false; - bool Success = joinCopy(CurrList[i], Again); + bool Success = joinCopy(MI, Again); Progress |= Success; if (Success || !Again) - CurrList[i] = nullptr; + MI = nullptr; } return Progress; } diff --git a/contrib/llvm-project/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/contrib/llvm-project/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp index 3f013eb6024e..0e8e8338b46d 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -406,11 +406,10 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) { // register in later operands. The lanes of other defs will now be live // after this instruction, so these should not be treated as killed by the // instruction even though they appear to be killed in this one operand. - for (int I = OperIdx + 1, E = MI->getNumOperands(); I != E; ++I) { - const MachineOperand &OtherMO = MI->getOperand(I); + for (const MachineOperand &OtherMO : + llvm::drop_begin(MI->operands(), OperIdx + 1)) if (OtherMO.isReg() && OtherMO.isDef() && OtherMO.getReg() == Reg) KillLaneMask &= ~getLaneMaskForMO(OtherMO); - } } // Clear undef flag, we'll re-add it later once we know which subregister diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index ce400ea43f29..df5a041b87cd 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -4436,7 +4436,7 @@ SDValue DAGCombiner::visitREM(SDNode *N) { if (DAG.isKnownNeverZero(N1) && !TLI.isIntDivCheap(VT, Attr)) { SDValue OptimizedDiv = isSigned ? visitSDIVLike(N0, N1, N) : visitUDIVLike(N0, N1, N); - if (OptimizedDiv.getNode()) { + if (OptimizedDiv.getNode() && OptimizedDiv.getNode() != N) { // If the equivalent Div node also exists, update its users. unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV; if (SDNode *DivNode = DAG.getNodeIfExists(DivOpcode, N->getVTList(), @@ -4464,6 +4464,9 @@ SDValue DAGCombiner::visitMULHS(SDNode *N) { SDLoc DL(N); if (VT.isVector()) { + if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) + return FoldedVOp; + // fold (mulhs x, 0) -> 0 // do not return N0/N1, because undef node may exist. if (ISD::isConstantSplatVectorAllZeros(N0.getNode()) || @@ -4521,6 +4524,9 @@ SDValue DAGCombiner::visitMULHU(SDNode *N) { SDLoc DL(N); if (VT.isVector()) { + if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) + return FoldedVOp; + // fold (mulhu x, 0) -> 0 // do not return N0/N1, because undef node may exist. if (ISD::isConstantSplatVectorAllZeros(N0.getNode()) || @@ -4779,6 +4785,106 @@ SDValue DAGCombiner::visitMULO(SDNode *N) { return SDValue(); } +// Function to calculate whether the Min/Max pair of SDNodes (potentially +// swapped around) make a signed saturate pattern, clamping to between -2^(BW-1) +// and 2^(BW-1)-1. Returns the node being clamped and the bitwidth of the clamp +// in BW. Should work with both SMIN/SMAX nodes and setcc/select combo. The +// operands are the same as SimplifySelectCC. N0<N1 ? 
N2 : N3 +static SDValue isSaturatingMinMax(SDValue N0, SDValue N1, SDValue N2, + SDValue N3, ISD::CondCode CC, unsigned &BW) { + auto isSignedMinMax = [&](SDValue N0, SDValue N1, SDValue N2, SDValue N3, + ISD::CondCode CC) { + // The compare and select operand should be the same or the select operands + // should be truncated versions of the comparison. + if (N0 != N2 && (N2.getOpcode() != ISD::TRUNCATE || N0 != N2.getOperand(0))) + return 0; + // The constants need to be the same or a truncated version of each other. + ConstantSDNode *N1C = isConstOrConstSplat(N1); + ConstantSDNode *N3C = isConstOrConstSplat(N3); + if (!N1C || !N3C) + return 0; + const APInt &C1 = N1C->getAPIntValue(); + const APInt &C2 = N3C->getAPIntValue(); + if (C1.getBitWidth() < C2.getBitWidth() || + C1 != C2.sextOrSelf(C1.getBitWidth())) + return 0; + return CC == ISD::SETLT ? ISD::SMIN : (CC == ISD::SETGT ? ISD::SMAX : 0); + }; + + // Check the initial value is a SMIN/SMAX equivalent. + unsigned Opcode0 = isSignedMinMax(N0, N1, N2, N3, CC); + if (!Opcode0) + return SDValue(); + + SDValue N00, N01, N02, N03; + ISD::CondCode N0CC; + switch (N0.getOpcode()) { + case ISD::SMIN: + case ISD::SMAX: + N00 = N02 = N0.getOperand(0); + N01 = N03 = N0.getOperand(1); + N0CC = N0.getOpcode() == ISD::SMIN ? ISD::SETLT : ISD::SETGT; + break; + case ISD::SELECT_CC: + N00 = N0.getOperand(0); + N01 = N0.getOperand(1); + N02 = N0.getOperand(2); + N03 = N0.getOperand(3); + N0CC = cast<CondCodeSDNode>(N0.getOperand(4))->get(); + break; + case ISD::SELECT: + case ISD::VSELECT: + if (N0.getOperand(0).getOpcode() != ISD::SETCC) + return SDValue(); + N00 = N0.getOperand(0).getOperand(0); + N01 = N0.getOperand(0).getOperand(1); + N02 = N0.getOperand(1); + N03 = N0.getOperand(2); + N0CC = cast<CondCodeSDNode>(N0.getOperand(0).getOperand(2))->get(); + break; + default: + return SDValue(); + } + + unsigned Opcode1 = isSignedMinMax(N00, N01, N02, N03, N0CC); + if (!Opcode1 || Opcode0 == Opcode1) + return SDValue(); + + ConstantSDNode *MinCOp = isConstOrConstSplat(Opcode0 == ISD::SMIN ? N1 : N01); + ConstantSDNode *MaxCOp = isConstOrConstSplat(Opcode0 == ISD::SMIN ? 
N01 : N1); + if (!MinCOp || !MaxCOp || MinCOp->getValueType(0) != MaxCOp->getValueType(0)) + return SDValue(); + + const APInt &MinC = MinCOp->getAPIntValue(); + const APInt &MaxC = MaxCOp->getAPIntValue(); + APInt MinCPlus1 = MinC + 1; + if (-MaxC != MinCPlus1 || !MinCPlus1.isPowerOf2()) + return SDValue(); + BW = MinCPlus1.exactLogBase2() + 1; + return N02; +} + +static SDValue PerformMinMaxFpToSatCombine(SDValue N0, SDValue N1, SDValue N2, + SDValue N3, ISD::CondCode CC, + SelectionDAG &DAG) { + unsigned BW; + SDValue Fp = isSaturatingMinMax(N0, N1, N2, N3, CC, BW); + if (!Fp || Fp.getOpcode() != ISD::FP_TO_SINT) + return SDValue(); + EVT FPVT = Fp.getOperand(0).getValueType(); + EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), BW); + if (FPVT.isVector()) + NewVT = EVT::getVectorVT(*DAG.getContext(), NewVT, + FPVT.getVectorElementCount()); + if (!DAG.getTargetLoweringInfo().shouldConvertFpToSat( + ISD::FP_TO_SINT_SAT, Fp.getOperand(0).getValueType(), NewVT)) + return SDValue(); + SDLoc DL(Fp); + SDValue Sat = DAG.getNode(ISD::FP_TO_SINT_SAT, DL, NewVT, Fp.getOperand(0), + DAG.getValueType(NewVT.getScalarType())); + return DAG.getSExtOrTrunc(Sat, DL, N2->getValueType(0)); +} + SDValue DAGCombiner::visitIMINMAX(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -4817,6 +4923,11 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) { return DAG.getNode(AltOpcode, DL, VT, N0, N1); } + if (Opcode == ISD::SMIN || Opcode == ISD::SMAX) + if (SDValue S = PerformMinMaxFpToSatCombine( + N0, N1, N0, N1, Opcode == ISD::SMIN ? ISD::SETLT : ISD::SETGT, DAG)) + return S; + // Simplify the operands using demanded-bits information. if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); @@ -9940,9 +10051,8 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) { // If this is a masked load with an all ones mask, we can use a unmasked load. // FIXME: Can we do this for indexed, compressing, or truncating stores? - if (ISD::isConstantSplatVectorAllOnes(Mask.getNode()) && - MST->isUnindexed() && !MST->isCompressingStore() && - !MST->isTruncatingStore()) + if (ISD::isConstantSplatVectorAllOnes(Mask.getNode()) && MST->isUnindexed() && + !MST->isCompressingStore() && !MST->isTruncatingStore()) return DAG.getStore(MST->getChain(), SDLoc(N), MST->getValue(), MST->getBasePtr(), MST->getMemOperand()); @@ -9997,9 +10107,8 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) { // If this is a masked load with an all ones mask, we can use a unmasked load. // FIXME: Can we do this for indexed, expanding, or extending loads? - if (ISD::isConstantSplatVectorAllOnes(Mask.getNode()) && - MLD->isUnindexed() && !MLD->isExpandingLoad() && - MLD->getExtensionType() == ISD::NON_EXTLOAD) { + if (ISD::isConstantSplatVectorAllOnes(Mask.getNode()) && MLD->isUnindexed() && + !MLD->isExpandingLoad() && MLD->getExtensionType() == ISD::NON_EXTLOAD) { SDValue NewLd = DAG.getLoad(N->getValueType(0), SDLoc(N), MLD->getChain(), MLD->getBasePtr(), MLD->getMemOperand()); return CombineTo(N, NewLd, NewLd.getValue(1)); @@ -10138,6 +10247,9 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) { return FMinMax; } + if (SDValue S = PerformMinMaxFpToSatCombine(LHS, RHS, N1, N2, CC, DAG)) + return S; + // If this select has a condition (setcc) with narrower operands than the // select, try to widen the compare to match the select width. // TODO: This should be extended to handle any constant. 
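// Editor's illustrative aside (not part of the patch): the scalar source
// shape that isSaturatingMinMax/PerformMinMaxFpToSatCombine recognize above --
// an fp_to_sint whose result is clamped to [-2^(BW-1), 2^(BW-1)-1] and then
// truncated -- which the combine rewrites to FP_TO_SINT_SAT. The function
// name is hypothetical, and the usual out-of-range caveats of a raw
// float-to-int cast are ignored since only the pattern shape matters.
#include <algorithm>
#include <cstdint>

int16_t to_i16_saturating(float F) {
  int32_t V = static_cast<int32_t>(F);               // fp_to_sint
  V = std::max(V, static_cast<int32_t>(INT16_MIN));  // smax with -2^(BW-1)
  V = std::min(V, static_cast<int32_t>(INT16_MAX));  // smin with  2^(BW-1)-1
  return static_cast<int16_t>(V);                    // truncate to BW = 16 bits
}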
@@ -15007,7 +15119,7 @@ SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) { // fold (fpext (load x)) -> (fpext (fptrunc (extload x))) if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && - TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) { + TLI.isLoadExtLegalOrCustom(ISD::EXTLOAD, VT, N0.getValueType())) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT, LN0->getChain(), @@ -23034,6 +23146,9 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1, DAG.getSExtOrTrunc(CC == ISD::SETLT ? N3 : N2, DL, VT)); } + if (SDValue S = PerformMinMaxFpToSatCombine(N0, N1, N2, N3, CC, DAG)) + return S; + return SDValue(); } diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index c1bb65409282..331e0325aea3 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -765,7 +765,7 @@ InstrEmitter::EmitDbgInstrRef(SDDbgValue *SD, assert(!SD->isVariadic()); SDDbgOperand DbgOperand = SD->getLocationOps()[0]; MDNode *Var = SD->getVariable(); - MDNode *Expr = SD->getExpression(); + DIExpression *Expr = (DIExpression*)SD->getExpression(); DebugLoc DL = SD->getDebugLoc(); const MCInstrDesc &RefII = TII->get(TargetOpcode::DBG_INSTR_REF); @@ -775,6 +775,13 @@ InstrEmitter::EmitDbgInstrRef(SDDbgValue *SD, DbgOperand.getKind() == SDDbgOperand::CONST) return EmitDbgValueFromSingleOp(SD, VRBaseMap); + // Immediately fold any indirectness from the LLVM-IR intrinsic into the + // expression: + if (SD->isIndirect()) { + std::vector<uint64_t> Elts = {dwarf::DW_OP_deref}; + Expr = DIExpression::append(Expr, Elts); + } + // It may not be immediately possible to identify the MachineInstr that // defines a VReg, it can depend for example on the order blocks are // emitted in. When this happens, or when further analysis is needed later, diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index eb9d2286aeb4..08598eeded7a 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -3553,9 +3553,10 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { // Node. Tmp1 = Node->getOperand(0); Tmp2 = Node->getOperand(1); - if (Tmp2.getOpcode() == ISD::SETCC) { - Tmp1 = DAG.getNode(ISD::BR_CC, dl, MVT::Other, - Tmp1, Tmp2.getOperand(2), + if (Tmp2.getOpcode() == ISD::SETCC && + TLI.isOperationLegalOrCustom(ISD::BR_CC, + Tmp2.getOperand(0).getValueType())) { + Tmp1 = DAG.getNode(ISD::BR_CC, dl, MVT::Other, Tmp1, Tmp2.getOperand(2), Tmp2.getOperand(0), Tmp2.getOperand(1), Node->getOperand(2)); } else { diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp index 1f73c9eea104..98312f91d8c0 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp @@ -28,7 +28,7 @@ using namespace llvm; static cl::opt<bool> EnableExpensiveChecks("enable-legalize-types-checking", cl::Hidden); -/// Do extensive, expensive, sanity checking. +/// Do extensive, expensive, basic correctness checking. 
void DAGTypeLegalizer::PerformExpensiveChecks() { // If a node is not processed, then none of its values should be mapped by any // of PromotedIntegers, ExpandedIntegers, ..., ReplacedValues. @@ -534,7 +534,8 @@ SDNode *DAGTypeLegalizer::AnalyzeNewNode(SDNode *N) { // The node morphed into a different node. Normally for this to happen // the original node would have to be marked NewNode. However this can // in theory momentarily not be the case while ReplaceValueWith is doing - // its stuff. Mark the original node NewNode to help sanity checking. + // its stuff. Mark the original node NewNode to help basic correctness + // checking. N->setNodeId(NewNode); if (M->getNodeId() != NewNode && M->getNodeId() != Unanalyzed) // It morphed into a previously analyzed node - nothing more to do. diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 539c9cb9c256..7ec2638b1e71 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1820,10 +1820,10 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, else std::tie(PassThruLo, PassThruHi) = DAG.SplitVector(PassThru, dl); - unsigned LoSize = MemoryLocation::getSizeOrUnknown(LoMemVT.getStoreSize()); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( - MLD->getPointerInfo(), MachineMemOperand::MOLoad, LoSize, Alignment, - MLD->getAAInfo(), MLD->getRanges()); + MLD->getPointerInfo(), MachineMemOperand::MOLoad, + MemoryLocation::UnknownSize, Alignment, MLD->getAAInfo(), + MLD->getRanges()); Lo = DAG.getMaskedLoad(LoVT, dl, Ch, Ptr, Offset, MaskLo, PassThruLo, LoMemVT, MMO, MLD->getAddressingMode(), ExtType, @@ -1837,7 +1837,6 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, // Generate hi masked load. 
Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, dl, LoMemVT, DAG, MLD->isExpandingLoad()); - unsigned HiSize = MemoryLocation::getSizeOrUnknown(HiMemVT.getStoreSize()); MachinePointerInfo MPI; if (LoMemVT.isScalableVector()) @@ -1847,8 +1846,8 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, LoMemVT.getStoreSize().getFixedSize()); MMO = DAG.getMachineFunction().getMachineMemOperand( - MPI, MachineMemOperand::MOLoad, HiSize, Alignment, MLD->getAAInfo(), - MLD->getRanges()); + MPI, MachineMemOperand::MOLoad, MemoryLocation::UnknownSize, Alignment, + MLD->getAAInfo(), MLD->getRanges()); Hi = DAG.getMaskedLoad(HiVT, dl, Ch, Ptr, Offset, MaskHi, PassThruHi, HiMemVT, MMO, MLD->getAddressingMode(), ExtType, @@ -2662,10 +2661,9 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N, DAG.GetDependentSplitDestVTs(MemoryVT, DataLo.getValueType(), &HiIsEmpty); SDValue Lo, Hi, Res; - unsigned LoSize = MemoryLocation::getSizeOrUnknown(LoMemVT.getStoreSize()); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( - N->getPointerInfo(), MachineMemOperand::MOStore, LoSize, Alignment, - N->getAAInfo(), N->getRanges()); + N->getPointerInfo(), MachineMemOperand::MOStore, + MemoryLocation::UnknownSize, Alignment, N->getAAInfo(), N->getRanges()); Lo = DAG.getMaskedStore(Ch, DL, DataLo, Ptr, Offset, MaskLo, LoMemVT, MMO, N->getAddressingMode(), N->isTruncatingStore(), @@ -2689,10 +2687,9 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N, MPI = N->getPointerInfo().getWithOffset( LoMemVT.getStoreSize().getFixedSize()); - unsigned HiSize = MemoryLocation::getSizeOrUnknown(HiMemVT.getStoreSize()); MMO = DAG.getMachineFunction().getMachineMemOperand( - MPI, MachineMemOperand::MOStore, HiSize, Alignment, N->getAAInfo(), - N->getRanges()); + MPI, MachineMemOperand::MOStore, MemoryLocation::UnknownSize, Alignment, + N->getAAInfo(), N->getRanges()); Hi = DAG.getMaskedStore(Ch, DL, DataHi, Ptr, Offset, MaskHi, HiMemVT, MMO, N->getAddressingMode(), N->isTruncatingStore(), diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp index 55fe26eb64cd..2695ed36991c 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp @@ -268,8 +268,8 @@ bool ResourcePriorityQueue::isResourceAvailable(SUnit *SU) { // Now see if there are no other dependencies // to instructions already in the packet. - for (unsigned i = 0, e = Packet.size(); i != e; ++i) - for (const SDep &Succ : Packet[i]->Succs) { + for (const SUnit *S : Packet) + for (const SDep &Succ : S->Succs) { // Since we do not add pseudos to packets, might as well // ignore order deps. 
if (Succ.isCtrl()) diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index 95f7e43b151d..84e6d2a16422 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -706,8 +706,8 @@ void ScheduleDAGSDNodes::dump() const { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void ScheduleDAGSDNodes::dumpSchedule() const { - for (unsigned i = 0, e = Sequence.size(); i != e; i++) { - if (SUnit *SU = Sequence[i]) + for (const SUnit *SU : Sequence) { + if (SU) dumpNode(*SU); else dbgs() << "**** NOOP ****\n"; diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 008665d50233..c282e03387dd 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -406,8 +406,8 @@ bool ISD::isVPOpcode(unsigned Opcode) { switch (Opcode) { default: return false; -#define BEGIN_REGISTER_VP_SDNODE(SDOPC, ...) \ - case ISD::SDOPC: \ +#define BEGIN_REGISTER_VP_SDNODE(VPSD, ...) \ + case ISD::VPSD: \ return true; #include "llvm/IR/VPIntrinsics.def" } @@ -416,23 +416,25 @@ bool ISD::isVPOpcode(unsigned Opcode) { bool ISD::isVPBinaryOp(unsigned Opcode) { switch (Opcode) { default: - return false; -#define PROPERTY_VP_BINARYOP_SDNODE(SDOPC) \ - case ISD::SDOPC: \ - return true; + break; +#define BEGIN_REGISTER_VP_SDNODE(VPSD, ...) case ISD::VPSD: +#define VP_PROPERTY_BINARYOP return true; +#define END_REGISTER_VP_SDNODE(VPSD) break; #include "llvm/IR/VPIntrinsics.def" } + return false; } bool ISD::isVPReduction(unsigned Opcode) { switch (Opcode) { default: - return false; -#define PROPERTY_VP_REDUCTION_SDNODE(SDOPC) \ - case ISD::SDOPC: \ - return true; + break; +#define BEGIN_REGISTER_VP_SDNODE(VPSD, ...) case ISD::VPSD: +#define VP_PROPERTY_REDUCTION(STARTPOS, ...) return true; +#define END_REGISTER_VP_SDNODE(VPSD) break; #include "llvm/IR/VPIntrinsics.def" } + return false; } /// The operand position of the vector mask. @@ -440,8 +442,8 @@ Optional<unsigned> ISD::getVPMaskIdx(unsigned Opcode) { switch (Opcode) { default: return None; -#define BEGIN_REGISTER_VP_SDNODE(SDOPC, LEGALPOS, TDNAME, MASKPOS, ...) \ - case ISD::SDOPC: \ +#define BEGIN_REGISTER_VP_SDNODE(VPSD, LEGALPOS, TDNAME, MASKPOS, ...) \ + case ISD::VPSD: \ return MASKPOS; #include "llvm/IR/VPIntrinsics.def" } @@ -452,8 +454,8 @@ Optional<unsigned> ISD::getVPExplicitVectorLengthIdx(unsigned Opcode) { switch (Opcode) { default: return None; -#define BEGIN_REGISTER_VP_SDNODE(SDOPC, LEGALPOS, TDNAME, MASKPOS, EVLPOS) \ - case ISD::SDOPC: \ +#define BEGIN_REGISTER_VP_SDNODE(VPSD, LEGALPOS, TDNAME, MASKPOS, EVLPOS) \ + case ISD::VPSD: \ return EVLPOS; #include "llvm/IR/VPIntrinsics.def" } @@ -974,7 +976,7 @@ void SelectionDAG::DeallocateNode(SDNode *N) { } #ifndef NDEBUG -/// VerifySDNode - Sanity check the given SDNode. Aborts if it is invalid. +/// VerifySDNode - Check the given SDNode. Aborts if it is invalid. static void VerifySDNode(SDNode *N) { switch (N->getOpcode()) { default: @@ -4540,10 +4542,25 @@ bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const { } // FIXME: unify with llvm::haveNoCommonBitsSet. 
-// FIXME: could also handle masked merge pattern (X & ~M) op (Y & M) bool SelectionDAG::haveNoCommonBitsSet(SDValue A, SDValue B) const { assert(A.getValueType() == B.getValueType() && "Values must have the same type"); + // Match masked merge pattern (X & ~M) op (Y & M) + if (A->getOpcode() == ISD::AND && B->getOpcode() == ISD::AND) { + auto MatchNoCommonBitsPattern = [&](SDValue NotM, SDValue And) { + if (isBitwiseNot(NotM, true)) { + SDValue NotOperand = NotM->getOperand(0); + return NotOperand == And->getOperand(0) || + NotOperand == And->getOperand(1); + } + return false; + }; + if (MatchNoCommonBitsPattern(A->getOperand(0), B) || + MatchNoCommonBitsPattern(A->getOperand(1), B) || + MatchNoCommonBitsPattern(B->getOperand(0), A) || + MatchNoCommonBitsPattern(B->getOperand(1), A)) + return true; + } return KnownBits::haveNoCommonBitsSet(computeKnownBits(A), computeKnownBits(B)); } @@ -5070,7 +5087,6 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, return getUNDEF(VT); break; case ISD::BITCAST: - // Basic sanity checking. assert(VT.getSizeInBits() == Operand.getValueSizeInBits() && "Cannot BITCAST between types of different sizes!"); if (VT == Operand.getValueType()) return Operand; // noop conversion. diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 5d911c165293..7726a0007e44 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -4336,9 +4336,7 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I, MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo(PtrOperand), MachineMemOperand::MOStore, - // TODO: Make MachineMemOperands aware of scalable - // vectors. - VT.getStoreSize().getKnownMinSize(), *Alignment, I.getAAMetadata()); + MemoryLocation::UnknownSize, *Alignment, I.getAAMetadata()); SDValue StoreNode = DAG.getMaskedStore(getMemoryRoot(), sdl, Src0, Ptr, Offset, Mask, VT, MMO, ISD::UNINDEXED, false /* Truncating */, IsCompressing); @@ -4496,22 +4494,14 @@ void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I, bool IsExpanding) { const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range); // Do not serialize masked loads of constant memory with anything. - MemoryLocation ML; - if (VT.isScalableVector()) - ML = MemoryLocation::getAfter(PtrOperand); - else - ML = MemoryLocation(PtrOperand, LocationSize::precise( - DAG.getDataLayout().getTypeStoreSize(I.getType())), - AAInfo); + MemoryLocation ML = MemoryLocation::getAfter(PtrOperand, AAInfo); bool AddToChain = !AA || !AA->pointsToConstantMemory(ML); SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode(); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo(PtrOperand), MachineMemOperand::MOLoad, - // TODO: Make MachineMemOperands aware of scalable - // vectors. 
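// Editor's illustrative aside (not part of the patch): why the masked-merge
// operands matched in haveNoCommonBitsSet above never share a set bit.
// (X & ~M) and (Y & M) select complementary bit positions, so their OR, ADD
// and XOR all coincide, which is what callers of haveNoCommonBitsSet rely on.
// maskedMergeDemo is a hypothetical name.
#include <cassert>
#include <cstdint>

void maskedMergeDemo(uint32_t X, uint32_t Y, uint32_t M) {
  uint32_t A = X & ~M;
  uint32_t B = Y & M;
  assert((A & B) == 0);      // no common bits set
  assert((A | B) == A + B);  // hence add, or and xor are interchangeable
  assert((A | B) == (A ^ B));
}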
- VT.getStoreSize().getKnownMinSize(), *Alignment, AAInfo, Ranges); + MemoryLocation::UnknownSize, *Alignment, AAInfo, Ranges); SDValue Load = DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Offset, Mask, Src0, VT, MMO, @@ -5807,8 +5797,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, case Intrinsic::vscale: { match(&I, m_VScale(DAG.getDataLayout())); EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType()); - setValue(&I, - DAG.getVScale(getCurSDLoc(), VT, APInt(VT.getSizeInBits(), 1))); + setValue(&I, DAG.getVScale(sdl, VT, APInt(VT.getSizeInBits(), 1))); return; } case Intrinsic::vastart: visitVAStart(I); return; @@ -6942,10 +6931,9 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); const TargetRegisterClass *PtrRC = TLI.getRegClassFor(PtrVT); unsigned VReg = FuncInfo.getCatchPadExceptionPointerVReg(CPI, PtrRC); - SDValue N = - DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(), VReg, PtrVT); + SDValue N = DAG.getCopyFromReg(DAG.getEntryNode(), sdl, VReg, PtrVT); if (Intrinsic == Intrinsic::eh_exceptioncode) - N = DAG.getZExtOrTrunc(N, getCurSDLoc(), MVT::i32); + N = DAG.getZExtOrTrunc(N, sdl, MVT::i32); setValue(&I, N); return; } @@ -6957,7 +6945,6 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, if (Triple.getArch() != Triple::x86_64) return; - SDLoc DL = getCurSDLoc(); SmallVector<SDValue, 8> Ops; // We want to say that we always want the arguments in registers. @@ -6974,7 +6961,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, // see that some registers may be assumed clobbered and have to preserve // them across calls to the intrinsic. MachineSDNode *MN = DAG.getMachineNode(TargetOpcode::PATCHABLE_EVENT_CALL, - DL, NodeTys, Ops); + sdl, NodeTys, Ops); SDValue patchableNode = SDValue(MN, 0); DAG.setRoot(patchableNode); setValue(&I, patchableNode); @@ -6988,7 +6975,6 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, if (Triple.getArch() != Triple::x86_64) return; - SDLoc DL = getCurSDLoc(); SmallVector<SDValue, 8> Ops; // We want to say that we always want the arguments in registers. @@ -7009,7 +6995,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, // see that some registers may be assumed clobbered and have to preserve // them across calls to the intrinsic. 
MachineSDNode *MN = DAG.getMachineNode( - TargetOpcode::PATCHABLE_TYPED_EVENT_CALL, DL, NodeTys, Ops); + TargetOpcode::PATCHABLE_TYPED_EVENT_CALL, sdl, NodeTys, Ops); SDValue patchableNode = SDValue(MN, 0); DAG.setRoot(patchableNode); setValue(&I, patchableNode); @@ -7047,7 +7033,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, if (!Base) report_fatal_error( "llvm.icall.branch.funnel operand must be a GlobalValue"); - Ops.push_back(DAG.getTargetGlobalAddress(Base, getCurSDLoc(), MVT::i64, 0)); + Ops.push_back(DAG.getTargetGlobalAddress(Base, sdl, MVT::i64, 0)); struct BranchFunnelTarget { int64_t Offset; @@ -7068,8 +7054,8 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, report_fatal_error( "llvm.icall.branch.funnel operand must be a GlobalValue"); Targets.push_back({Offset, DAG.getTargetGlobalAddress( - GA->getGlobal(), getCurSDLoc(), - Val.getValueType(), GA->getOffset())}); + GA->getGlobal(), sdl, Val.getValueType(), + GA->getOffset())}); } llvm::sort(Targets, [](const BranchFunnelTarget &T1, const BranchFunnelTarget &T2) { @@ -7077,13 +7063,13 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, }); for (auto &T : Targets) { - Ops.push_back(DAG.getTargetConstant(T.Offset, getCurSDLoc(), MVT::i32)); + Ops.push_back(DAG.getTargetConstant(T.Offset, sdl, MVT::i32)); Ops.push_back(T.Target); } Ops.push_back(DAG.getRoot()); // Chain - SDValue N(DAG.getMachineNode(TargetOpcode::ICALL_BRANCH_FUNNEL, - getCurSDLoc(), MVT::Other, Ops), + SDValue N(DAG.getMachineNode(TargetOpcode::ICALL_BRANCH_FUNNEL, sdl, + MVT::Other, Ops), 0); DAG.setRoot(N); setValue(&I, N); @@ -7102,7 +7088,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo(); bool ZeroMemory = Intrinsic == Intrinsic::aarch64_settag_zero; SDValue Val = TSI.EmitTargetCodeForSetTag( - DAG, getCurSDLoc(), getRoot(), getValue(I.getArgOperand(0)), + DAG, sdl, getRoot(), getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)), MachinePointerInfo(I.getArgOperand(0)), ZeroMemory); DAG.setRoot(Val); @@ -7114,46 +7100,42 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, SDValue Const = getValue(I.getOperand(1)); EVT PtrVT = Ptr.getValueType(); - setValue(&I, DAG.getNode(ISD::AND, getCurSDLoc(), PtrVT, Ptr, - DAG.getZExtOrTrunc(Const, getCurSDLoc(), PtrVT))); + setValue(&I, DAG.getNode(ISD::AND, sdl, PtrVT, Ptr, + DAG.getZExtOrTrunc(Const, sdl, PtrVT))); return; } case Intrinsic::get_active_lane_mask: { - auto DL = getCurSDLoc(); + EVT CCVT = TLI.getValueType(DAG.getDataLayout(), I.getType()); SDValue Index = getValue(I.getOperand(0)); - SDValue TripCount = getValue(I.getOperand(1)); - Type *ElementTy = I.getOperand(0)->getType(); - EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType()); - unsigned VecWidth = VT.getVectorNumElements(); + EVT ElementVT = Index.getValueType(); - SmallVector<SDValue, 16> OpsTripCount; - SmallVector<SDValue, 16> OpsIndex; - SmallVector<SDValue, 16> OpsStepConstants; - for (unsigned i = 0; i < VecWidth; i++) { - OpsTripCount.push_back(TripCount); - OpsIndex.push_back(Index); - OpsStepConstants.push_back( - DAG.getConstant(i, DL, EVT::getEVT(ElementTy))); + if (!TLI.shouldExpandGetActiveLaneMask(CCVT, ElementVT)) { + visitTargetIntrinsic(I, Intrinsic); + return; } - EVT CCVT = EVT::getVectorVT(I.getContext(), MVT::i1, VecWidth); + SDValue TripCount = getValue(I.getOperand(1)); + auto VecTy = CCVT.changeVectorElementType(ElementVT); - auto VecTy = 
EVT::getEVT(FixedVectorType::get(ElementTy, VecWidth)); - SDValue VectorIndex = DAG.getBuildVector(VecTy, DL, OpsIndex); - SDValue VectorStep = DAG.getBuildVector(VecTy, DL, OpsStepConstants); + SDValue VectorIndex, VectorTripCount; + if (VecTy.isScalableVector()) { + VectorIndex = DAG.getSplatVector(VecTy, sdl, Index); + VectorTripCount = DAG.getSplatVector(VecTy, sdl, TripCount); + } else { + VectorIndex = DAG.getSplatBuildVector(VecTy, sdl, Index); + VectorTripCount = DAG.getSplatBuildVector(VecTy, sdl, TripCount); + } + SDValue VectorStep = DAG.getStepVector(sdl, VecTy); SDValue VectorInduction = DAG.getNode( - ISD::UADDO, DL, DAG.getVTList(VecTy, CCVT), VectorIndex, VectorStep); - SDValue VectorTripCount = DAG.getBuildVector(VecTy, DL, OpsTripCount); - SDValue SetCC = DAG.getSetCC(DL, CCVT, VectorInduction.getValue(0), + ISD::UADDO, sdl, DAG.getVTList(VecTy, CCVT), VectorIndex, VectorStep); + SDValue SetCC = DAG.getSetCC(sdl, CCVT, VectorInduction.getValue(0), VectorTripCount, ISD::CondCode::SETULT); - setValue(&I, DAG.getNode(ISD::AND, DL, CCVT, - DAG.getNOT(DL, VectorInduction.getValue(1), CCVT), + setValue(&I, DAG.getNode(ISD::AND, sdl, CCVT, + DAG.getNOT(sdl, VectorInduction.getValue(1), CCVT), SetCC)); return; } case Intrinsic::experimental_vector_insert: { - auto DL = getCurSDLoc(); - SDValue Vec = getValue(I.getOperand(0)); SDValue SubVec = getValue(I.getOperand(1)); SDValue Index = getValue(I.getOperand(2)); @@ -7163,16 +7145,14 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, MVT VectorIdxTy = TLI.getVectorIdxTy(DAG.getDataLayout()); if (Index.getValueType() != VectorIdxTy) Index = DAG.getVectorIdxConstant( - cast<ConstantSDNode>(Index)->getZExtValue(), DL); + cast<ConstantSDNode>(Index)->getZExtValue(), sdl); EVT ResultVT = TLI.getValueType(DAG.getDataLayout(), I.getType()); - setValue(&I, DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ResultVT, Vec, SubVec, + setValue(&I, DAG.getNode(ISD::INSERT_SUBVECTOR, sdl, ResultVT, Vec, SubVec, Index)); return; } case Intrinsic::experimental_vector_extract: { - auto DL = getCurSDLoc(); - SDValue Vec = getValue(I.getOperand(0)); SDValue Index = getValue(I.getOperand(1)); EVT ResultVT = TLI.getValueType(DAG.getDataLayout(), I.getType()); @@ -7182,9 +7162,10 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, MVT VectorIdxTy = TLI.getVectorIdxTy(DAG.getDataLayout()); if (Index.getValueType() != VectorIdxTy) Index = DAG.getVectorIdxConstant( - cast<ConstantSDNode>(Index)->getZExtValue(), DL); + cast<ConstantSDNode>(Index)->getZExtValue(), sdl); - setValue(&I, DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResultVT, Vec, Index)); + setValue(&I, + DAG.getNode(ISD::EXTRACT_SUBVECTOR, sdl, ResultVT, Vec, Index)); return; } case Intrinsic::experimental_vector_reverse: @@ -7314,9 +7295,9 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic( static unsigned getISDForVPIntrinsic(const VPIntrinsic &VPIntrin) { Optional<unsigned> ResOPC; switch (VPIntrin.getIntrinsicID()) { -#define BEGIN_REGISTER_VP_INTRINSIC(INTRIN, ...) case Intrinsic::INTRIN: -#define BEGIN_REGISTER_VP_SDNODE(VPSDID, ...) ResOPC = ISD::VPSDID; -#define END_REGISTER_VP_INTRINSIC(...) break; +#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID: +#define BEGIN_REGISTER_VP_SDNODE(VPSD, ...) 
ResOPC = ISD::VPSD; +#define END_REGISTER_VP_INTRINSIC(VPID) break; #include "llvm/IR/VPIntrinsics.def" } diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index e4a69adff05b..737695b5eabe 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -645,6 +645,7 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits( if (DemandedBits == 0 || DemandedElts == 0) return DAG.getUNDEF(Op.getValueType()); + bool IsLE = DAG.getDataLayout().isLittleEndian(); unsigned NumElts = DemandedElts.getBitWidth(); unsigned BitWidth = DemandedBits.getBitWidth(); KnownBits LHSKnown, RHSKnown; @@ -663,16 +664,15 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits( Src, DemandedBits, DemandedElts, DAG, Depth + 1)) return DAG.getBitcast(DstVT, V); - // TODO - bigendian once we have test coverage. - if (SrcVT.isVector() && (NumDstEltBits % NumSrcEltBits) == 0 && - DAG.getDataLayout().isLittleEndian()) { + if (SrcVT.isVector() && (NumDstEltBits % NumSrcEltBits) == 0) { unsigned Scale = NumDstEltBits / NumSrcEltBits; unsigned NumSrcElts = SrcVT.getVectorNumElements(); APInt DemandedSrcBits = APInt::getZero(NumSrcEltBits); APInt DemandedSrcElts = APInt::getZero(NumSrcElts); for (unsigned i = 0; i != Scale; ++i) { - unsigned Offset = i * NumSrcEltBits; - APInt Sub = DemandedBits.extractBits(NumSrcEltBits, Offset); + unsigned EltOffset = IsLE ? i : (Scale - 1 - i); + unsigned BitOffset = EltOffset * NumSrcEltBits; + APInt Sub = DemandedBits.extractBits(NumSrcEltBits, BitOffset); if (!Sub.isZero()) { DemandedSrcBits |= Sub; for (unsigned j = 0; j != NumElts; ++j) @@ -687,8 +687,7 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits( } // TODO - bigendian once we have test coverage. - if ((NumSrcEltBits % NumDstEltBits) == 0 && - DAG.getDataLayout().isLittleEndian()) { + if (IsLE && (NumSrcEltBits % NumDstEltBits) == 0) { unsigned Scale = NumSrcEltBits / NumDstEltBits; unsigned NumSrcElts = SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1; APInt DemandedSrcBits = APInt::getZero(NumSrcEltBits); @@ -802,8 +801,8 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits( SDValue Src = Op.getOperand(0); EVT SrcVT = Src.getValueType(); EVT DstVT = Op.getValueType(); - if (DemandedElts == 1 && DstVT.getSizeInBits() == SrcVT.getSizeInBits() && - DAG.getDataLayout().isLittleEndian() && + if (IsLE && DemandedElts == 1 && + DstVT.getSizeInBits() == SrcVT.getSizeInBits() && DemandedBits.getActiveBits() <= SrcVT.getScalarSizeInBits()) { return DAG.getBitcast(DstVT, Src); } @@ -913,6 +912,7 @@ bool TargetLowering::SimplifyDemandedBits( if (Op.getValueType().isScalableVector()) return false; + bool IsLE = TLO.DAG.getDataLayout().isLittleEndian(); unsigned NumElts = OriginalDemandedElts.getBitWidth(); assert((!Op.getValueType().isVector() || NumElts == Op.getValueType().getVectorNumElements()) && @@ -1725,11 +1725,40 @@ bool TargetLowering::SimplifyDemandedBits( case ISD::ROTR: { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); + bool IsROTL = (Op.getOpcode() == ISD::ROTL); // If we're rotating an 0/-1 value, then it stays an 0/-1 value. 
if (BitWidth == TLO.DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1)) return TLO.CombineTo(Op, Op0); + if (ConstantSDNode *SA = isConstOrConstSplat(Op1, DemandedElts)) { + unsigned Amt = SA->getAPIntValue().urem(BitWidth); + unsigned RevAmt = BitWidth - Amt; + + // rotl: (Op0 << Amt) | (Op0 >> (BW - Amt)) + // rotr: (Op0 << (BW - Amt)) | (Op0 >> Amt) + APInt Demanded0 = DemandedBits.rotr(IsROTL ? Amt : RevAmt); + if (SimplifyDemandedBits(Op0, Demanded0, DemandedElts, Known2, TLO, + Depth + 1)) + return true; + + // rot*(x, 0) --> x + if (Amt == 0) + return TLO.CombineTo(Op, Op0); + + // See if we don't demand either half of the rotated bits. + if ((!TLO.LegalOperations() || isOperationLegal(ISD::SHL, VT)) && + DemandedBits.countTrailingZeros() >= (IsROTL ? Amt : RevAmt)) { + Op1 = TLO.DAG.getConstant(IsROTL ? Amt : RevAmt, dl, Op1.getValueType()); + return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, dl, VT, Op0, Op1)); + } + if ((!TLO.LegalOperations() || isOperationLegal(ISD::SRL, VT)) && + DemandedBits.countLeadingZeros() >= (IsROTL ? RevAmt : Amt)) { + Op1 = TLO.DAG.getConstant(IsROTL ? RevAmt : Amt, dl, Op1.getValueType()); + return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT, Op0, Op1)); + } + } + // For pow-2 bitwidths we only demand the bottom modulo amt bits. if (isPowerOf2_32(BitWidth)) { APInt DemandedAmtBits(Op1.getScalarValueSizeInBits(), BitWidth - 1); @@ -1887,9 +1916,8 @@ bool TargetLowering::SimplifyDemandedBits( if (DemandedBits.getActiveBits() <= InBits) { // If we only need the non-extended bits of the bottom element // then we can just bitcast to the result. - if (IsVecInReg && DemandedElts == 1 && - VT.getSizeInBits() == SrcVT.getSizeInBits() && - TLO.DAG.getDataLayout().isLittleEndian()) + if (IsLE && IsVecInReg && DemandedElts == 1 && + VT.getSizeInBits() == SrcVT.getSizeInBits()) return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Src)); unsigned Opc = @@ -1925,9 +1953,8 @@ bool TargetLowering::SimplifyDemandedBits( if (DemandedBits.getActiveBits() <= InBits) { // If we only need the non-extended bits of the bottom element // then we can just bitcast to the result. - if (IsVecInReg && DemandedElts == 1 && - VT.getSizeInBits() == SrcVT.getSizeInBits() && - TLO.DAG.getDataLayout().isLittleEndian()) + if (IsLE && IsVecInReg && DemandedElts == 1 && + VT.getSizeInBits() == SrcVT.getSizeInBits()) return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Src)); unsigned Opc = @@ -1976,9 +2003,8 @@ bool TargetLowering::SimplifyDemandedBits( // If we only need the bottom element then we can just bitcast. // TODO: Handle ANY_EXTEND? - if (IsVecInReg && DemandedElts == 1 && - VT.getSizeInBits() == SrcVT.getSizeInBits() && - TLO.DAG.getDataLayout().isLittleEndian()) + if (IsLE && IsVecInReg && DemandedElts == 1 && + VT.getSizeInBits() == SrcVT.getSizeInBits()) return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Src)); APInt InDemandedBits = DemandedBits.trunc(InBits); @@ -2140,16 +2166,15 @@ bool TargetLowering::SimplifyDemandedBits( // Bitcast from a vector using SimplifyDemanded Bits/VectorElts. // Demand the elt/bit if any of the original elts/bits are demanded. - // TODO - bigendian once we have test coverage. 
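The new ISD::ROTL/ISD::ROTR handling in SimplifyDemandedBits above relies on a simple identity: the only input bits that can reach the demanded bits of rotl(X, Amt) are DemandedBits rotated right by Amt (and symmetrically for rotr), which is also why the rotate collapses to a single shl or srl once one half of the rotated value is known to be undemanded. The following is a minimal standalone sketch, not LLVM code, that checks that identity on 32-bit values:

#include <cassert>
#include <cstdint>

static uint32_t rotl32(uint32_t X, unsigned S) {
  S &= 31;
  return (X << S) | (X >> ((32 - S) & 31));
}
static uint32_t rotr32(uint32_t X, unsigned S) {
  S &= 31;
  return (X >> S) | (X << ((32 - S) & 31));
}

int main() {
  // Only the rotr(Demanded, S) bits of X can reach the Demanded bits of
  // rotl(X, S); masking the other input bits away must not change the result.
  const uint32_t X = 0xDEADBEEF, Demanded = 0x00FF0003;
  for (unsigned S = 0; S != 32; ++S)
    assert((rotl32(X, S) & Demanded) ==
           (rotl32(X & rotr32(Demanded, S), S) & Demanded));
  return 0;
}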
- if (SrcVT.isVector() && (BitWidth % NumSrcEltBits) == 0 && - TLO.DAG.getDataLayout().isLittleEndian()) { + if (SrcVT.isVector() && (BitWidth % NumSrcEltBits) == 0) { unsigned Scale = BitWidth / NumSrcEltBits; unsigned NumSrcElts = SrcVT.getVectorNumElements(); APInt DemandedSrcBits = APInt::getZero(NumSrcEltBits); APInt DemandedSrcElts = APInt::getZero(NumSrcElts); for (unsigned i = 0; i != Scale; ++i) { - unsigned Offset = i * NumSrcEltBits; - APInt Sub = DemandedBits.extractBits(NumSrcEltBits, Offset); + unsigned EltOffset = IsLE ? i : (Scale - 1 - i); + unsigned BitOffset = EltOffset * NumSrcEltBits; + APInt Sub = DemandedBits.extractBits(NumSrcEltBits, BitOffset); if (!Sub.isZero()) { DemandedSrcBits |= Sub; for (unsigned j = 0; j != NumElts; ++j) @@ -2167,8 +2192,8 @@ bool TargetLowering::SimplifyDemandedBits( if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedSrcElts, KnownSrcBits, TLO, Depth + 1)) return true; - } else if ((NumSrcEltBits % BitWidth) == 0 && - TLO.DAG.getDataLayout().isLittleEndian()) { + } else if (IsLE && (NumSrcEltBits % BitWidth) == 0) { + // TODO - bigendian once we have test coverage. unsigned Scale = NumSrcEltBits / BitWidth; unsigned NumSrcElts = SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1; APInt DemandedSrcBits = APInt::getZero(NumSrcEltBits); @@ -2409,6 +2434,7 @@ bool TargetLowering::SimplifyDemandedVectorElts( SDLoc DL(Op); unsigned EltSizeInBits = VT.getScalarSizeInBits(); + bool IsLE = TLO.DAG.getDataLayout().isLittleEndian(); // Helper for demanding the specified elements and all the bits of both binary // operands. @@ -2484,7 +2510,7 @@ bool TargetLowering::SimplifyDemandedVectorElts( // Try calling SimplifyDemandedBits, converting demanded elts to the bits // of the large element. // TODO - bigendian once we have test coverage. - if (TLO.DAG.getDataLayout().isLittleEndian()) { + if (IsLE) { unsigned SrcEltSizeInBits = SrcVT.getScalarSizeInBits(); APInt SrcDemandedBits = APInt::getZero(SrcEltSizeInBits); for (unsigned i = 0; i != NumElts; ++i) @@ -2797,9 +2823,9 @@ bool TargetLowering::SimplifyDemandedVectorElts( KnownZero = SrcZero.zextOrTrunc(NumElts); KnownUndef = SrcUndef.zextOrTrunc(NumElts); - if (Op.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG && + if (IsLE && Op.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG && Op.getValueSizeInBits() == Src.getValueSizeInBits() && - DemandedSrcElts == 1 && TLO.DAG.getDataLayout().isLittleEndian()) { + DemandedSrcElts == 1) { // aext - if we just need the bottom element then we can bitcast. return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Src)); } @@ -2812,8 +2838,8 @@ bool TargetLowering::SimplifyDemandedVectorElts( // zext - if we just need the bottom element then we can mask: // zext(and(x,c)) -> and(x,c') iff the zext is the only user of the and. - if (DemandedSrcElts == 1 && TLO.DAG.getDataLayout().isLittleEndian() && - Src.getOpcode() == ISD::AND && Op->isOnlyUserOf(Src.getNode()) && + if (IsLE && DemandedSrcElts == 1 && Src.getOpcode() == ISD::AND && + Op->isOnlyUserOf(Src.getNode()) && Op.getValueSizeInBits() == Src.getValueSizeInBits()) { SDLoc DL(Op); EVT SrcVT = Src.getValueType(); @@ -2834,9 +2860,19 @@ bool TargetLowering::SimplifyDemandedVectorElts( // TODO: There are more binop opcodes that could be handled here - MIN, // MAX, saturated math, etc. 
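The bitcast handling above now indexes sub-elements as EltOffset = IsLE ? i : (Scale - 1 - i) instead of bailing out on big-endian targets, because the lane order inside the wider scalar flips with endianness. A small host-side illustration of that layout difference (illustrative only, independent of the LLVM code):

#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  // Reinterpreting four i8 lanes as one i32 places lane 0 in the low byte on
  // little-endian hosts and in the high byte on big-endian hosts.
  uint8_t Lanes[4] = {0x11, 0x22, 0x33, 0x44}; // lanes 0..3
  uint32_t Word;
  std::memcpy(&Word, Lanes, sizeof(Word));
  // Little-endian: 0x44332211 (lane 0 is bits [7:0]).
  // Big-endian:    0x11223344 (lane 0 is bits [31:24]).
  std::printf("0x%08" PRIx32 "\n", Word);
  return 0;
}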
+ case ISD::ADD: { + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + if (Op0 == Op1 && Op->isOnlyUserOf(Op0.getNode())) { + APInt UndefLHS, ZeroLHS; + if (SimplifyDemandedVectorElts(Op0, DemandedElts, UndefLHS, ZeroLHS, TLO, + Depth + 1, /*AssumeSingleUse*/ true)) + return true; + } + LLVM_FALLTHROUGH; + } case ISD::OR: case ISD::XOR: - case ISD::ADD: case ISD::SUB: case ISD::FADD: case ISD::FSUB: @@ -5586,7 +5622,7 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode, .multiplicativeInverse(APInt::getSignedMinValue(W + 1)) .trunc(W); assert(!P.isZero() && "No multiplicative inverse!"); // unreachable - assert((D0 * P).isOne() && "Multiplicative inverse sanity check."); + assert((D0 * P).isOne() && "Multiplicative inverse basic check failed."); // Q = floor((2^W - 1) u/ D) // R = ((2^W - 1) u% D) @@ -5832,7 +5868,7 @@ TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode, .multiplicativeInverse(APInt::getSignedMinValue(W + 1)) .trunc(W); assert(!P.isZero() && "No multiplicative inverse!"); // unreachable - assert((D0 * P).isOne() && "Multiplicative inverse sanity check."); + assert((D0 * P).isOne() && "Multiplicative inverse basic check failed."); // A = floor((2^(W - 1) - 1) / D0) & -2^K APInt A = APInt::getSignedMaxValue(W).udiv(D0); diff --git a/contrib/llvm-project/llvm/lib/CodeGen/StackSlotColoring.cpp b/contrib/llvm-project/llvm/lib/CodeGen/StackSlotColoring.cpp index 9aea5a7a8853..f49ba5ccd447 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/StackSlotColoring.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/StackSlotColoring.cpp @@ -159,8 +159,7 @@ void StackSlotColoring::ScanForSpillSlotRefs(MachineFunction &MF) { // FIXME: Need the equivalent of MachineRegisterInfo for frameindex operands. for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { if (!MO.isFI()) continue; int FI = MO.getIndex(); @@ -394,8 +393,7 @@ void StackSlotColoring::RewriteInstruction(MachineInstr &MI, SmallVectorImpl<int> &SlotMapping, MachineFunction &MF) { // Update the operands. - for (unsigned i = 0, ee = MI.getNumOperands(); i != ee; ++i) { - MachineOperand &MO = MI.getOperand(i); + for (MachineOperand &MO : MI.operands()) { if (!MO.isFI()) continue; int OldFI = MO.getIndex(); diff --git a/contrib/llvm-project/llvm/lib/CodeGen/TailDuplicator.cpp b/contrib/llvm-project/llvm/lib/CodeGen/TailDuplicator.cpp index 943bd18c6c8b..54fc6ee45d00 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/TailDuplicator.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/TailDuplicator.cpp @@ -70,12 +70,6 @@ static cl::opt<unsigned> TailDupIndirectBranchSize( "end with indirect branches."), cl::init(20), cl::Hidden); -static cl::opt<unsigned> TailDupJmpTableLoopSize( - "tail-dup-jmptable-loop-size", - cl::desc("Maximum loop latches to consider tail duplication that are " - "successors of loop header."), - cl::init(128), cl::Hidden); - static cl::opt<bool> TailDupVerify("tail-dup-verify", cl::desc("Verify sanity of PHI instructions during taildup"), @@ -569,29 +563,6 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple, if (TailBB.isSuccessor(&TailBB)) return false; - // When doing tail-duplication with jumptable loops like: - // 1 -> 2 <-> 3 | - // \ <-> 4 | - // \ <-> 5 | - // \ <-> ... | - // \---> rest | - // quadratic number of edges and much more loops are added to CFG. 
This - // may cause compile time regression when jumptable is quiet large. - // So set the limit on jumptable cases. - auto isLargeJumpTableLoop = [](const MachineBasicBlock &TailBB) { - const SmallPtrSet<const MachineBasicBlock *, 8> Preds(TailBB.pred_begin(), - TailBB.pred_end()); - // Check the basic block has large number of successors, all of them only - // have one successor which is the basic block itself. - return llvm::count_if( - TailBB.successors(), [&](const MachineBasicBlock *SuccBB) { - return Preds.count(SuccBB) && SuccBB->succ_size() == 1; - }) > TailDupJmpTableLoopSize; - }; - - if (isLargeJumpTableLoop(TailBB)) - return false; - // Set the limit on the cost to duplicate. When optimizing for size, // duplicate only one, because one branch instruction can be eliminated to // compensate for the duplication. diff --git a/contrib/llvm-project/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp b/contrib/llvm-project/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp index b0594ec086b2..fbf190a52585 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp @@ -136,6 +136,16 @@ unsigned TargetFrameLowering::getStackAlignmentSkew( return 0; } +bool TargetFrameLowering::allocateScavengingFrameIndexesNearIncomingSP( + const MachineFunction &MF) const { + if (!hasFP(MF)) + return false; + + const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); + return RegInfo->useFPForScavengingIndex(MF) && + !RegInfo->hasStackRealignment(MF); +} + bool TargetFrameLowering::isSafeForNoCSROpt(const Function &F) { if (!F.hasLocalLinkage() || F.hasAddressTaken() || !F.hasFnAttribute(Attribute::NoRecurse)) diff --git a/contrib/llvm-project/llvm/lib/CodeGen/TargetInstrInfo.cpp b/contrib/llvm-project/llvm/lib/CodeGen/TargetInstrInfo.cpp index e74b3195a130..5119dac36713 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -957,8 +957,7 @@ bool TargetInstrInfo::isReallyTriviallyReMaterializableGeneric( // If any of the registers accessed are non-constant, conservatively assume // the instruction is not rematerializable. - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; Register Reg = MO.getReg(); if (Reg == 0) @@ -1401,3 +1400,21 @@ std::string TargetInstrInfo::createMIROperandComment( } TargetInstrInfo::PipelinerLoopInfo::~PipelinerLoopInfo() {} + +void TargetInstrInfo::mergeOutliningCandidateAttributes( + Function &F, std::vector<outliner::Candidate> &Candidates) const { + // Include target features from an arbitrary candidate for the outlined + // function. This makes sure the outlined function knows what kinds of + // instructions are going into it. This is fine, since all parent functions + // must necessarily support the instructions that are in the outlined region. + outliner::Candidate &FirstCand = Candidates.front(); + const Function &ParentFn = FirstCand.getMF()->getFunction(); + if (ParentFn.hasFnAttribute("target-features")) + F.addFnAttr(ParentFn.getFnAttribute("target-features")); + + // Set nounwind, so we don't generate eh_frame. 
+ if (llvm::all_of(Candidates, [](const outliner::Candidate &C) { + return C.getMF()->getFunction().hasFnAttribute(Attribute::NoUnwind); + })) + F.addFnAttr(Attribute::NoUnwind); +} diff --git a/contrib/llvm-project/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/contrib/llvm-project/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index 1d3bb286c882..d1c2cdeb133b 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -1082,7 +1082,7 @@ const MCExpr *TargetLoweringObjectFileELF::lowerRelativeReference( if (!LHS->hasGlobalUnnamedAddr() || !LHS->getValueType()->isFunctionTy()) return nullptr; - // Basic sanity checks. + // Basic correctness checks. if (LHS->getType()->getPointerAddressSpace() != 0 || RHS->getType()->getPointerAddressSpace() != 0 || LHS->isThreadLocal() || RHS->isThreadLocal()) @@ -2135,7 +2135,7 @@ const MCExpr *TargetLoweringObjectFileWasm::lowerRelativeReference( if (!LHS->hasGlobalUnnamedAddr() || !LHS->getValueType()->isFunctionTy()) return nullptr; - // Basic sanity checks. + // Basic correctness checks. if (LHS->getType()->getPointerAddressSpace() != 0 || RHS->getType()->getPointerAddressSpace() != 0 || LHS->isThreadLocal() || RHS->isThreadLocal()) diff --git a/contrib/llvm-project/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/contrib/llvm-project/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp index 46cec5407565..dfd962be2882 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -373,19 +373,25 @@ static bool isTwoAddrUse(MachineInstr &MI, Register Reg, Register &DstReg) { return false; } -/// Given a register, if has a single in-basic block use, return the use -/// instruction if it's a copy or a two-address use. +/// Given a register, if all its uses are in the same basic block, return the +/// last use instruction if it's a copy or a two-address use. static MachineInstr * findOnlyInterestingUse(Register Reg, MachineBasicBlock *MBB, MachineRegisterInfo *MRI, const TargetInstrInfo *TII, - bool &IsCopy, Register &DstReg, bool &IsDstPhys) { - if (!MRI->hasOneNonDBGUse(Reg)) - // None or more than one use. 
- return nullptr; - MachineOperand &UseOp = *MRI->use_nodbg_begin(Reg); - MachineInstr &UseMI = *UseOp.getParent(); - if (UseMI.getParent() != MBB) + bool &IsCopy, Register &DstReg, bool &IsDstPhys, + LiveIntervals *LIS) { + MachineOperand *UseOp = nullptr; + for (MachineOperand &MO : MRI->use_nodbg_operands(Reg)) { + MachineInstr *MI = MO.getParent(); + if (MI->getParent() != MBB) + return nullptr; + if (isPlainlyKilled(MI, Reg, LIS)) + UseOp = &MO; + } + if (!UseOp) return nullptr; + MachineInstr &UseMI = *UseOp->getParent(); + Register SrcReg; bool IsSrcPhys; if (isCopyToReg(UseMI, TII, SrcReg, DstReg, IsSrcPhys, IsDstPhys)) { @@ -399,7 +405,7 @@ findOnlyInterestingUse(Register Reg, MachineBasicBlock *MBB, } if (UseMI.isCommutable()) { unsigned Src1 = TargetInstrInfo::CommuteAnyOperandIndex; - unsigned Src2 = UseMI.getOperandNo(&UseOp); + unsigned Src2 = UseMI.getOperandNo(UseOp); if (TII->findCommutedOpIndices(UseMI, Src1, Src2)) { MachineOperand &MO = UseMI.getOperand(Src1); if (MO.isReg() && MO.isUse() && @@ -492,8 +498,7 @@ void TwoAddressInstructionPass::removeClobberedSrcRegMap(MachineInstr *MI) { return; } - for (unsigned i = 0, NumOps = MI->getNumOperands(); i != NumOps; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (MO.isRegMask()) { removeMapRegEntry(MO, SrcRegMap, TRI); continue; @@ -685,7 +690,6 @@ bool TwoAddressInstructionPass::convertInstTo3Addr( // If the old instruction is debug value tracked, an update is required. if (auto OldInstrNum = mi->peekDebugInstrNum()) { - // Sanity check. assert(mi->getNumExplicitDefs() == 1); assert(NewMI->getNumExplicitDefs() == 1); @@ -724,7 +728,7 @@ void TwoAddressInstructionPass::scanUses(Register DstReg) { Register NewReg; Register Reg = DstReg; while (MachineInstr *UseMI = findOnlyInterestingUse(Reg, MBB, MRI, TII,IsCopy, - NewReg, IsDstPhys)) { + NewReg, IsDstPhys, LIS)) { if (IsCopy && !Processed.insert(UseMI).second) break; @@ -1336,8 +1340,7 @@ tryInstructionTransform(MachineBasicBlock::iterator &mi, // Success, or at least we made an improvement. Keep the unfolded // instructions and discard the original. 
if (LV) { - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { if (MO.isReg() && MO.getReg().isVirtual()) { if (MO.isUse()) { if (MO.isKill()) { diff --git a/contrib/llvm-project/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/contrib/llvm-project/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp index fb0798f204e1..7673a721c4ea 100644 --- a/contrib/llvm-project/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp +++ b/contrib/llvm-project/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp @@ -15,6 +15,7 @@ #include "llvm/DebugInfo/DWARF/DWARFExpression.h" #include "llvm/DebugInfo/DWARF/DWARFFormValue.h" #include "llvm/DebugInfo/DWARF/DWARFSection.h" +#include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h" #include "llvm/Support/DJB.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/WithColor.h" @@ -317,12 +318,33 @@ bool DWARFVerifier::handleDebugAbbrev() { return NumErrors == 0; } -unsigned DWARFVerifier::verifyUnitSection(const DWARFSection &S, - DWARFSectionKind SectionKind) { +unsigned DWARFVerifier::verifyUnits(const DWARFUnitVector &Units) { + unsigned NumDebugInfoErrors = 0; + ReferenceMap CrossUnitReferences; + + for (const auto &Unit : Units) { + ReferenceMap UnitLocalReferences; + NumDebugInfoErrors += + verifyUnitContents(*Unit, UnitLocalReferences, CrossUnitReferences); + NumDebugInfoErrors += verifyDebugInfoReferences( + UnitLocalReferences, [&](uint64_t Offset) { return Unit.get(); }); + } + + NumDebugInfoErrors += verifyDebugInfoReferences( + CrossUnitReferences, [&](uint64_t Offset) -> DWARFUnit * { + if (DWARFUnit *U = Units.getUnitForOffset(Offset)) + return U; + return nullptr; + }); + + return NumDebugInfoErrors; +} + +unsigned DWARFVerifier::verifyUnitSection(const DWARFSection &S) { const DWARFObject &DObj = DCtx.getDWARFObj(); DWARFDataExtractor DebugInfoData(DObj, S, DCtx.isLittleEndian(), 0); unsigned NumDebugInfoErrors = 0; - uint64_t OffsetStart = 0, Offset = 0, UnitIdx = 0; + uint64_t Offset = 0, UnitIdx = 0; uint8_t UnitType = 0; bool isUnitDWARF64 = false; bool isHeaderChainValid = true; @@ -334,48 +356,11 @@ unsigned DWARFVerifier::verifyUnitSection(const DWARFSection &S, /// lies between to valid DIEs. ReferenceMap CrossUnitReferences; while (hasDIE) { - OffsetStart = Offset; if (!verifyUnitHeader(DebugInfoData, &Offset, UnitIdx, UnitType, isUnitDWARF64)) { isHeaderChainValid = false; if (isUnitDWARF64) break; - } else { - DWARFUnitHeader Header; - Header.extract(DCtx, DebugInfoData, &OffsetStart, SectionKind); - ReferenceMap UnitLocalReferences; - DWARFUnit *Unit; - switch (UnitType) { - case dwarf::DW_UT_type: - case dwarf::DW_UT_split_type: { - Unit = TypeUnitVector.addUnit(std::make_unique<DWARFTypeUnit>( - DCtx, S, Header, DCtx.getDebugAbbrev(), &DObj.getRangesSection(), - &DObj.getLocSection(), DObj.getStrSection(), - DObj.getStrOffsetsSection(), &DObj.getAddrSection(), - DObj.getLineSection(), DCtx.isLittleEndian(), false, - TypeUnitVector)); - break; - } - case dwarf::DW_UT_skeleton: - case dwarf::DW_UT_split_compile: - case dwarf::DW_UT_compile: - case dwarf::DW_UT_partial: - // UnitType = 0 means that we are verifying a compile unit in DWARF v4. 
- case 0: { - Unit = CompileUnitVector.addUnit(std::make_unique<DWARFCompileUnit>( - DCtx, S, Header, DCtx.getDebugAbbrev(), &DObj.getRangesSection(), - &DObj.getLocSection(), DObj.getStrSection(), - DObj.getStrOffsetsSection(), &DObj.getAddrSection(), - DObj.getLineSection(), DCtx.isLittleEndian(), false, - CompileUnitVector)); - break; - } - default: { llvm_unreachable("Invalid UnitType."); } - } - NumDebugInfoErrors += - verifyUnitContents(*Unit, UnitLocalReferences, CrossUnitReferences); - NumDebugInfoErrors += verifyDebugInfoReferences( - UnitLocalReferences, [&](uint64_t Offset) { return Unit; }); } hasDIE = DebugInfoData.isValidOffset(Offset); ++UnitIdx; @@ -386,14 +371,6 @@ unsigned DWARFVerifier::verifyUnitSection(const DWARFSection &S, } if (!isHeaderChainValid) ++NumDebugInfoErrors; - NumDebugInfoErrors += verifyDebugInfoReferences( - CrossUnitReferences, [&](uint64_t Offset) -> DWARFUnit * { - if (DWARFUnit *U = TypeUnitVector.getUnitForOffset(Offset)) - return U; - if (DWARFUnit *U = CompileUnitVector.getUnitForOffset(Offset)) - return U; - return nullptr; - }); return NumDebugInfoErrors; } @@ -403,13 +380,16 @@ bool DWARFVerifier::handleDebugInfo() { OS << "Verifying .debug_info Unit Header Chain...\n"; DObj.forEachInfoSections([&](const DWARFSection &S) { - NumErrors += verifyUnitSection(S, DW_SECT_INFO); + NumErrors += verifyUnitSection(S); }); OS << "Verifying .debug_types Unit Header Chain...\n"; DObj.forEachTypesSections([&](const DWARFSection &S) { - NumErrors += verifyUnitSection(S, DW_SECT_EXT_TYPES); + NumErrors += verifyUnitSection(S); }); + + OS << "Verifying non-dwo Units...\n"; + NumErrors += verifyUnits(DCtx.getNormalUnitsVector()); return NumErrors == 0; } diff --git a/contrib/llvm-project/llvm/lib/Demangle/DLangDemangle.cpp b/contrib/llvm-project/llvm/lib/Demangle/DLangDemangle.cpp index d2f1bf4323ee..f380aa90035e 100644 --- a/contrib/llvm-project/llvm/lib/Demangle/DLangDemangle.cpp +++ b/contrib/llvm-project/llvm/lib/Demangle/DLangDemangle.cpp @@ -14,12 +14,250 @@ //===----------------------------------------------------------------------===// #include "llvm/Demangle/Demangle.h" +#include "llvm/Demangle/StringView.h" #include "llvm/Demangle/Utility.h" +#include <cctype> #include <cstring> +#include <limits> using namespace llvm; using llvm::itanium_demangle::OutputBuffer; +using llvm::itanium_demangle::StringView; + +namespace { + +/// Demangle information structure. +struct Demangler { + /// Initialize the information structure we use to pass around information. + /// + /// \param Mangled String to demangle. + Demangler(const char *Mangled); + + /// Extract and demangle the mangled symbol and append it to the output + /// string. + /// + /// \param Demangled Output buffer to write the demangled name. + /// + /// \return The remaining string on success or nullptr on failure. + /// + /// \see https://dlang.org/spec/abi.html#name_mangling . + /// \see https://dlang.org/spec/abi.html#MangledName . + const char *parseMangle(OutputBuffer *Demangled); + +private: + /// Extract and demangle a given mangled symbol and append it to the output + /// string. + /// + /// \param Demangled output buffer to write the demangled name. + /// \param Mangled mangled symbol to be demangled. + /// + /// \return The remaining string on success or nullptr on failure. + /// + /// \see https://dlang.org/spec/abi.html#name_mangling . + /// \see https://dlang.org/spec/abi.html#MangledName . 
+ const char *parseMangle(OutputBuffer *Demangled, const char *Mangled); + + /// Extract the number from a given string. + /// + /// \param Mangled string to extract the number. + /// \param Ret assigned result value. + /// + /// \return The remaining string on success or nullptr on failure. + /// + /// \note A result larger than UINT_MAX is considered a failure. + /// + /// \see https://dlang.org/spec/abi.html#Number . + const char *decodeNumber(const char *Mangled, unsigned long *Ret); + + /// Check whether it is the beginning of a symbol name. + /// + /// \param Mangled string to extract the symbol name. + /// + /// \return true on success, false otherwise. + /// + /// \see https://dlang.org/spec/abi.html#SymbolName . + bool isSymbolName(const char *Mangled); + + /// Extract and demangle an identifier from a given mangled symbol append it + /// to the output string. + /// + /// \param Demangled Output buffer to write the demangled name. + /// \param Mangled Mangled symbol to be demangled. + /// + /// \return The remaining string on success or nullptr on failure. + /// + /// \see https://dlang.org/spec/abi.html#SymbolName . + const char *parseIdentifier(OutputBuffer *Demangled, const char *Mangled); + + /// Extract and demangle the plain identifier from a given mangled symbol and + /// prepend/append it to the output string, with a special treatment for some + /// magic compiler generated symbols. + /// + /// \param Demangled Output buffer to write the demangled name. + /// \param Mangled Mangled symbol to be demangled. + /// \param Len Length of the mangled symbol name. + /// + /// \return The remaining string on success or nullptr on failure. + /// + /// \see https://dlang.org/spec/abi.html#LName . + const char *parseLName(OutputBuffer *Demangled, const char *Mangled, + unsigned long Len); + + /// Extract and demangle the qualified symbol from a given mangled symbol + /// append it to the output string. + /// + /// \param Demangled Output buffer to write the demangled name. + /// \param Mangled Mangled symbol to be demangled. + /// + /// \return The remaining string on success or nullptr on failure. + /// + /// \see https://dlang.org/spec/abi.html#QualifiedName . + const char *parseQualified(OutputBuffer *Demangled, const char *Mangled); + + /// The string we are demangling. + const char *Str; +}; + +} // namespace + +const char *Demangler::decodeNumber(const char *Mangled, unsigned long *Ret) { + // Return nullptr if trying to extract something that isn't a digit. + if (Mangled == nullptr || !std::isdigit(*Mangled)) + return nullptr; + + unsigned long Val = 0; + + do { + unsigned long Digit = Mangled[0] - '0'; + + // Check for overflow. + if (Val > (std::numeric_limits<unsigned int>::max() - Digit) / 10) + return nullptr; + + Val = Val * 10 + Digit; + ++Mangled; + } while (std::isdigit(*Mangled)); + + if (*Mangled == '\0') + return nullptr; + + *Ret = Val; + return Mangled; +} + +bool Demangler::isSymbolName(const char *Mangled) { + if (std::isdigit(*Mangled)) + return true; + + // TODO: Handle symbol back references and template instances. + return false; +} + +const char *Demangler::parseMangle(OutputBuffer *Demangled, + const char *Mangled) { + // A D mangled symbol is comprised of both scope and type information. + // MangleName: + // _D QualifiedName Type + // _D QualifiedName Z + // ^ + // The caller should have guaranteed that the start pointer is at the + // above location. 
+ // Note that type is never a function type, but only the return type of + // a function or the type of a variable. + Mangled += 2; + + Mangled = parseQualified(Demangled, Mangled); + + if (Mangled != nullptr) { + // Artificial symbols end with 'Z' and have no type. + if (*Mangled == 'Z') + ++Mangled; + else { + // TODO: Implement symbols with types. + return nullptr; + } + } + + return Mangled; +} + +const char *Demangler::parseQualified(OutputBuffer *Demangled, + const char *Mangled) { + // Qualified names are identifiers separated by their encoded length. + // Nested functions also encode their argument types without specifying + // what they return. + // QualifiedName: + // SymbolFunctionName + // SymbolFunctionName QualifiedName + // ^ + // SymbolFunctionName: + // SymbolName + // SymbolName TypeFunctionNoReturn + // SymbolName M TypeFunctionNoReturn + // SymbolName M TypeModifiers TypeFunctionNoReturn + // The start pointer should be at the above location. + + // Whether it has more than one symbol + size_t NotFirst = false; + do { + // Skip over anonymous symbols. + if (*Mangled == '0') { + do + ++Mangled; + while (*Mangled == '0'); + + continue; + } + + if (NotFirst) + *Demangled << '.'; + NotFirst = true; + + Mangled = parseIdentifier(Demangled, Mangled); + + } while (Mangled && isSymbolName(Mangled)); + + return Mangled; +} + +const char *Demangler::parseIdentifier(OutputBuffer *Demangled, + const char *Mangled) { + unsigned long Len; + + if (Mangled == nullptr || *Mangled == '\0') + return nullptr; + + // TODO: Parse back references and lengthless template instances. + + const char *Endptr = decodeNumber(Mangled, &Len); + + if (Endptr == nullptr || Len == 0) + return nullptr; + + if (strlen(Endptr) < Len) + return nullptr; + + Mangled = Endptr; + + // TODO: Parse template instances with a length prefix. + + return parseLName(Demangled, Mangled, Len); +} + +const char *Demangler::parseLName(OutputBuffer *Demangled, const char *Mangled, + unsigned long Len) { + *Demangled << StringView(Mangled, Len); + Mangled += Len; + + return Mangled; +} + +Demangler::Demangler(const char *Mangled) : Str(Mangled) {} + +const char *Demangler::parseMangle(OutputBuffer *Demangled) { + return parseMangle(Demangled, this->Str); +} char *llvm::dlangDemangle(const char *MangledName) { if (MangledName == nullptr || strncmp(MangledName, "_D", 2) != 0) @@ -29,8 +267,19 @@ char *llvm::dlangDemangle(const char *MangledName) { if (!initializeOutputBuffer(nullptr, nullptr, Demangled, 1024)) return nullptr; - if (strcmp(MangledName, "_Dmain") == 0) + if (strcmp(MangledName, "_Dmain") == 0) { Demangled << "D main"; + } else { + + Demangler D = Demangler(MangledName); + MangledName = D.parseMangle(&Demangled); + + // Check that the entire symbol was successfully demangled. + if (MangledName == nullptr || *MangledName != '\0') { + std::free(Demangled.getBuffer()); + return nullptr; + } + } // OutputBuffer's internal buffer is not null terminated and therefore we need // to add it to comply with C null terminated strings. 
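The comments in the new DLangDemangle parser above describe the core of the D mangling grammar: a mangled name starts with _D, continues with a sequence of length-prefixed identifiers (the QualifiedName), and an artificial symbol simply ends in Z. As a worked example, _D3foo3barZ demangles to foo.bar. Below is a simplified, hypothetical helper, not the LLVM implementation and assuming well-formed input, that decodes just that length-prefixed part:

#include <cctype>
#include <iostream>
#include <string>

// Decodes the QualifiedName of a D mangled name such as "_D3foo3barZ" by
// repeatedly reading a decimal length prefix followed by that many
// identifier characters, joining the pieces with '.'.
static std::string demangleQualified(const std::string &Mangled) {
  std::string Out;
  std::size_t I = 2; // skip the leading "_D"
  while (I < Mangled.size() &&
         std::isdigit(static_cast<unsigned char>(Mangled[I]))) {
    std::size_t Len = 0;
    while (I < Mangled.size() &&
           std::isdigit(static_cast<unsigned char>(Mangled[I])))
      Len = Len * 10 + (Mangled[I++] - '0');
    if (!Out.empty())
      Out += '.';
    Out += Mangled.substr(I, Len);
    I += Len;
  }
  return Out;
}

int main() {
  std::cout << demangleQualified("_D3foo3barZ") << "\n"; // prints "foo.bar"
  return 0;
}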
@@ -40,6 +289,6 @@ char *llvm::dlangDemangle(const char *MangledName) { return Demangled.getBuffer(); } - free(Demangled.getBuffer()); + std::free(Demangled.getBuffer()); return nullptr; } diff --git a/contrib/llvm-project/llvm/lib/ExecutionEngine/ExecutionEngine.cpp b/contrib/llvm-project/llvm/lib/ExecutionEngine/ExecutionEngine.cpp index fe3c433bd2c5..a14bd4d2c3fd 100644 --- a/contrib/llvm-project/llvm/lib/ExecutionEngine/ExecutionEngine.cpp +++ b/contrib/llvm-project/llvm/lib/ExecutionEngine/ExecutionEngine.cpp @@ -1256,8 +1256,7 @@ void ExecutionEngine::emitGlobals() { // If there are multiple modules, map the non-canonical globals to their // canonical location. if (!NonCanonicalGlobals.empty()) { - for (unsigned i = 0, e = NonCanonicalGlobals.size(); i != e; ++i) { - const GlobalValue *GV = NonCanonicalGlobals[i]; + for (const GlobalValue *GV : NonCanonicalGlobals) { const GlobalValue *CGV = LinkedGlobalsMap[std::make_pair( std::string(GV->getName()), GV->getType())]; void *Ptr = getPointerToGlobalIfAvailable(CGV); diff --git a/contrib/llvm-project/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h b/contrib/llvm-project/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h index fdc987751286..f9101d71dfa8 100644 --- a/contrib/llvm-project/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h +++ b/contrib/llvm-project/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h @@ -143,6 +143,9 @@ protected: // Only SHF_ALLOC sections will have graph sections. DenseMap<ELFSectionIndex, Section *> GraphSections; DenseMap<ELFSymbolIndex, Symbol *> GraphSymbols; + DenseMap<const typename ELFFile::Elf_Shdr *, + ArrayRef<typename ELFFile::Elf_Word>> + ShndxTables; }; template <typename ELFT> @@ -241,7 +244,7 @@ template <typename ELFT> Error ELFLinkGraphBuilder<ELFT>::prepare() { return SectionStringTabOrErr.takeError(); // Get the SHT_SYMTAB section. - for (auto &Sec : Sections) + for (auto &Sec : Sections) { if (Sec.sh_type == ELF::SHT_SYMTAB) { if (!SymTabSec) SymTabSec = &Sec; @@ -250,6 +253,20 @@ template <typename ELFT> Error ELFLinkGraphBuilder<ELFT>::prepare() { G->getName()); } + // Extended table. + if (Sec.sh_type == ELF::SHT_SYMTAB_SHNDX) { + uint32_t SymtabNdx = Sec.sh_link; + if (SymtabNdx >= Sections.size()) + return make_error<JITLinkError>("sh_link is out of bound"); + + auto ShndxTable = Obj.getSHNDXTable(Sec); + if (!ShndxTable) + return ShndxTable.takeError(); + + ShndxTables.insert({&Sections[SymtabNdx], *ShndxTable}); + } + } + return Error::success(); } @@ -299,11 +316,6 @@ template <typename ELFT> Error ELFLinkGraphBuilder<ELFT>::graphifySections() { else Prot = MemProt::Read | MemProt::Write; - // For now we just use this to skip the "undefined" section, probably need - // to revist. - if (Sec.sh_size == 0) - continue; - auto &GraphSec = G->createSection(*Name, Prot); if (Sec.sh_type != ELF::SHT_NOBITS) { auto Data = Obj.template getSectionContentsAsArray<char>(Sec); @@ -401,9 +413,19 @@ template <typename ELFT> Error ELFLinkGraphBuilder<ELFT>::graphifySymbols() { (Sym.getType() == ELF::STT_NOTYPE || Sym.getType() == ELF::STT_FUNC || Sym.getType() == ELF::STT_OBJECT || Sym.getType() == ELF::STT_SECTION || Sym.getType() == ELF::STT_TLS)) { - - // FIXME: Handle extended tables. - if (auto *GraphSec = getGraphSection(Sym.st_shndx)) { + // Handle extended tables. 
+ unsigned Shndx = Sym.st_shndx; + if (Shndx == ELF::SHN_XINDEX) { + auto ShndxTable = ShndxTables.find(SymTabSec); + if (ShndxTable == ShndxTables.end()) + continue; + auto NdxOrErr = object::getExtendedSymbolTableIndex<ELFT>( + Sym, SymIndex, ShndxTable->second); + if (!NdxOrErr) + return NdxOrErr.takeError(); + Shndx = *NdxOrErr; + } + if (auto *GraphSec = getGraphSection(Shndx)) { Block *B = nullptr; { auto Blocks = GraphSec->blocks(); diff --git a/contrib/llvm-project/llvm/lib/ExecutionEngine/Orc/Core.cpp b/contrib/llvm-project/llvm/lib/ExecutionEngine/Orc/Core.cpp index 6b24d6461b63..56a97f83d915 100644 --- a/contrib/llvm-project/llvm/lib/ExecutionEngine/Orc/Core.cpp +++ b/contrib/llvm-project/llvm/lib/ExecutionEngine/Orc/Core.cpp @@ -612,9 +612,14 @@ void LookupState::continueLookup(Error Err) { DefinitionGenerator::~DefinitionGenerator() {} +JITDylib::~JITDylib() { + LLVM_DEBUG(dbgs() << "Destroying JITDylib " << getName() << "\n"); +} + Error JITDylib::clear() { std::vector<ResourceTrackerSP> TrackersToRemove; ES.runSessionLocked([&]() { + assert(State != Closed && "JD is defunct"); for (auto &KV : TrackerSymbols) TrackersToRemove.push_back(KV.first); TrackersToRemove.push_back(getDefaultResourceTracker()); @@ -628,6 +633,7 @@ Error JITDylib::clear() { ResourceTrackerSP JITDylib::getDefaultResourceTracker() { return ES.runSessionLocked([this] { + assert(State != Closed && "JD is defunct"); if (!DefaultTracker) DefaultTracker = new ResourceTracker(this); return DefaultTracker; @@ -636,19 +642,22 @@ ResourceTrackerSP JITDylib::getDefaultResourceTracker() { ResourceTrackerSP JITDylib::createResourceTracker() { return ES.runSessionLocked([this] { + assert(State == Open && "JD is defunct"); ResourceTrackerSP RT = new ResourceTracker(this); return RT; }); } void JITDylib::removeGenerator(DefinitionGenerator &G) { - std::lock_guard<std::mutex> Lock(GeneratorsMutex); - auto I = llvm::find_if(DefGenerators, - [&](const std::shared_ptr<DefinitionGenerator> &H) { - return H.get() == &G; - }); - assert(I != DefGenerators.end() && "Generator not found"); - DefGenerators.erase(I); + ES.runSessionLocked([&] { + assert(State == Open && "JD is defunct"); + auto I = llvm::find_if(DefGenerators, + [&](const std::shared_ptr<DefinitionGenerator> &H) { + return H.get() == &G; + }); + assert(I != DefGenerators.end() && "Generator not found"); + DefGenerators.erase(I); + }); } Expected<SymbolFlagsMap> @@ -708,10 +717,8 @@ Error JITDylib::replace(MaterializationResponsibility &FromMR, auto Err = ES.runSessionLocked([&, this]() -> Error { - auto RT = getTracker(FromMR); - - if (RT->isDefunct()) - return make_error<ResourceTrackerDefunct>(std::move(RT)); + if (FromMR.RT->isDefunct()) + return make_error<ResourceTrackerDefunct>(std::move(FromMR.RT)); #ifndef NDEBUG for (auto &KV : MU->getSymbols()) { @@ -735,7 +742,8 @@ Error JITDylib::replace(MaterializationResponsibility &FromMR, if (MII != MaterializingInfos.end()) { if (MII->second.hasQueriesPending()) { MustRunMR = ES.createMaterializationResponsibility( - *RT, std::move(MU->SymbolFlags), std::move(MU->InitSymbol)); + *FromMR.RT, std::move(MU->SymbolFlags), + std::move(MU->InitSymbol)); MustRunMU = std::move(MU); return Error::success(); } @@ -743,10 +751,8 @@ Error JITDylib::replace(MaterializationResponsibility &FromMR, } // Otherwise, make MU responsible for all the symbols. 
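The graphifySymbols change above stops skipping symbols whose st_shndx is SHN_XINDEX and instead consults the SHT_SYMTAB_SHNDX table collected in prepare(). The ELF rule it implements: st_shndx is only 16 bits, so when a symbol's section index does not fit, the field holds the sentinel SHN_XINDEX (0xffff) and the real 32-bit index lives in a parallel array whose section is linked to the symbol table. A conceptual sketch of that lookup (not the LLVM API):

#include <cstdint>
#include <vector>

constexpr uint16_t SHN_XINDEX = 0xffff;

// The SHT_SYMTAB_SHNDX section is an array of 32-bit words with one entry per
// symbol-table entry; entry N holds the real section index for symbol N when
// that symbol's st_shndx is SHN_XINDEX, and 0 otherwise.
static uint32_t resolveSectionIndex(uint16_t StShndx, uint32_t SymIndex,
                                    const std::vector<uint32_t> &ShndxTable) {
  if (StShndx != SHN_XINDEX)
    return StShndx;
  return ShndxTable[SymIndex];
}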
- auto RTI = MRTrackers.find(&FromMR); - assert(RTI != MRTrackers.end() && "No tracker for FromMR"); - auto UMI = - std::make_shared<UnmaterializedInfo>(std::move(MU), RTI->second); + auto UMI = std::make_shared<UnmaterializedInfo>(std::move(MU), + FromMR.RT.get()); for (auto &KV : UMI->MU->getSymbols()) { auto SymI = Symbols.find(KV.first); assert(SymI->second.getState() == SymbolState::Materializing && @@ -787,13 +793,11 @@ JITDylib::delegate(MaterializationResponsibility &FromMR, return ES.runSessionLocked( [&]() -> Expected<std::unique_ptr<MaterializationResponsibility>> { - auto RT = getTracker(FromMR); - - if (RT->isDefunct()) - return make_error<ResourceTrackerDefunct>(std::move(RT)); + if (FromMR.RT->isDefunct()) + return make_error<ResourceTrackerDefunct>(std::move(FromMR.RT)); return ES.createMaterializationResponsibility( - *RT, std::move(SymbolFlags), std::move(InitSymbol)); + *FromMR.RT, std::move(SymbolFlags), std::move(InitSymbol)); }); } @@ -903,10 +907,13 @@ Error JITDylib::resolve(MaterializationResponsibility &MR, AsynchronousSymbolQuerySet CompletedQueries; if (auto Err = ES.runSessionLocked([&, this]() -> Error { - auto RTI = MRTrackers.find(&MR); - assert(RTI != MRTrackers.end() && "No resource tracker for MR?"); - if (RTI->second->isDefunct()) - return make_error<ResourceTrackerDefunct>(RTI->second); + if (MR.RT->isDefunct()) + return make_error<ResourceTrackerDefunct>(MR.RT); + + if (State != Open) + return make_error<StringError>("JITDylib " + getName() + + " is defunct", + inconvertibleErrorCode()); struct WorklistEntry { SymbolTable::iterator SymI; @@ -1001,10 +1008,13 @@ Error JITDylib::emit(MaterializationResponsibility &MR, DenseMap<JITDylib *, SymbolNameVector> ReadySymbols; if (auto Err = ES.runSessionLocked([&, this]() -> Error { - auto RTI = MRTrackers.find(&MR); - assert(RTI != MRTrackers.end() && "No resource tracker for MR?"); - if (RTI->second->isDefunct()) - return make_error<ResourceTrackerDefunct>(RTI->second); + if (MR.RT->isDefunct()) + return make_error<ResourceTrackerDefunct>(MR.RT); + + if (State != Open) + return make_error<StringError>("JITDylib " + getName() + + " is defunct", + inconvertibleErrorCode()); SymbolNameSet SymbolsInErrorState; std::vector<SymbolTable::iterator> Worklist; @@ -1149,9 +1159,12 @@ Error JITDylib::emit(MaterializationResponsibility &MR, void JITDylib::unlinkMaterializationResponsibility( MaterializationResponsibility &MR) { ES.runSessionLocked([&]() { - auto I = MRTrackers.find(&MR); - assert(I != MRTrackers.end() && "MaterializationResponsibility not linked"); - MRTrackers.erase(I); + auto I = TrackerMRs.find(MR.RT.get()); + assert(I != TrackerMRs.end() && "No MRs in TrackerMRs list for RT"); + assert(I->second.count(&MR) && "MR not in TrackerMRs list for RT"); + I->second.erase(&MR); + if (I->second.empty()) + TrackerMRs.erase(MR.RT.get()); }); } @@ -1169,8 +1182,16 @@ JITDylib::failSymbols(FailedSymbolsWorklist Worklist) { (*FailedSymbolsMap)[&JD].insert(Name); - assert(JD.Symbols.count(Name) && "No symbol table entry for Name"); - auto &Sym = JD.Symbols[Name]; + // Look up the symbol to fail. + auto SymI = JD.Symbols.find(Name); + + // It's possible that this symbol has already been removed, e.g. if a + // materialization failure happens concurrently with a ResourceTracker or + // JITDylib removal. In that case we can safely skip this symbol and + // continue. + if (SymI == JD.Symbols.end()) + continue; + auto &Sym = SymI->second; // Move the symbol into the error state. 
// Note that this may be redundant: The symbol might already have been @@ -1267,6 +1288,7 @@ JITDylib::failSymbols(FailedSymbolsWorklist Worklist) { void JITDylib::setLinkOrder(JITDylibSearchOrder NewLinkOrder, bool LinkAgainstThisJITDylibFirst) { ES.runSessionLocked([&]() { + assert(State == Open && "JD is defunct"); if (LinkAgainstThisJITDylibFirst) { LinkOrder.clear(); if (NewLinkOrder.empty() || NewLinkOrder.front().first != this) @@ -1285,6 +1307,7 @@ void JITDylib::addToLinkOrder(JITDylib &JD, JITDylibLookupFlags JDLookupFlags) { void JITDylib::replaceInLinkOrder(JITDylib &OldJD, JITDylib &NewJD, JITDylibLookupFlags JDLookupFlags) { ES.runSessionLocked([&]() { + assert(State == Open && "JD is defunct"); for (auto &KV : LinkOrder) if (KV.first == &OldJD) { KV = {&NewJD, JDLookupFlags}; @@ -1295,6 +1318,7 @@ void JITDylib::replaceInLinkOrder(JITDylib &OldJD, JITDylib &NewJD, void JITDylib::removeFromLinkOrder(JITDylib &JD) { ES.runSessionLocked([&]() { + assert(State == Open && "JD is defunct"); auto I = llvm::find_if(LinkOrder, [&](const JITDylibSearchOrder::value_type &KV) { return KV.first == &JD; @@ -1306,6 +1330,7 @@ void JITDylib::removeFromLinkOrder(JITDylib &JD) { Error JITDylib::remove(const SymbolNameSet &Names) { return ES.runSessionLocked([&]() -> Error { + assert(State == Open && "JD is defunct"); using SymbolMaterializerItrPair = std::pair<SymbolTable::iterator, UnmaterializedInfosMap::iterator>; std::vector<SymbolMaterializerItrPair> SymbolsToRemove; @@ -1365,8 +1390,23 @@ Error JITDylib::remove(const SymbolNameSet &Names) { void JITDylib::dump(raw_ostream &OS) { ES.runSessionLocked([&, this]() { OS << "JITDylib \"" << getName() << "\" (ES: " - << format("0x%016" PRIx64, reinterpret_cast<uintptr_t>(&ES)) << "):\n" - << "Link order: " << LinkOrder << "\n" + << format("0x%016" PRIx64, reinterpret_cast<uintptr_t>(&ES)) + << ", State = "; + switch (State) { + case Open: + OS << "Open"; + break; + case Closing: + OS << "Closing"; + break; + case Closed: + OS << "Closed"; + break; + } + OS << ")\n"; + if (State == Closed) + return; + OS << "Link order: " << LinkOrder << "\n" << "Symbol table:\n"; for (auto &KV : Symbols) { @@ -1454,17 +1494,11 @@ JITDylib::JITDylib(ExecutionSession &ES, std::string Name) LinkOrder.push_back({this, JITDylibLookupFlags::MatchAllSymbols}); } -ResourceTrackerSP JITDylib::getTracker(MaterializationResponsibility &MR) { - auto I = MRTrackers.find(&MR); - assert(I != MRTrackers.end() && "MR is not linked"); - assert(I->second && "Linked tracker is null"); - return I->second; -} - std::pair<JITDylib::AsynchronousSymbolQuerySet, std::shared_ptr<SymbolDependenceMap>> JITDylib::removeTracker(ResourceTracker &RT) { // Note: Should be called under the session lock. + assert(State != Closed && "JD is defunct"); SymbolNameVector SymbolsToRemove; std::vector<std::pair<JITDylib *, SymbolStringPtr>> SymbolsToFail; @@ -1525,6 +1559,7 @@ JITDylib::removeTracker(ResourceTracker &RT) { } void JITDylib::transferTracker(ResourceTracker &DstRT, ResourceTracker &SrcRT) { + assert(State != Closed && "JD is defunct"); assert(&DstRT != &SrcRT && "No-op transfers shouldn't call transferTracker"); assert(&DstRT.getJITDylib() == this && "DstRT is not for this JITDylib"); assert(&SrcRT.getJITDylib() == this && "SrcRT is not for this JITDylib"); @@ -1536,9 +1571,22 @@ void JITDylib::transferTracker(ResourceTracker &DstRT, ResourceTracker &SrcRT) { } // Update trackers for any active materialization responsibilities. 
- for (auto &KV : MRTrackers) { - if (KV.second == &SrcRT) - KV.second = &DstRT; + { + auto I = TrackerMRs.find(&SrcRT); + if (I != TrackerMRs.end()) { + auto &SrcMRs = I->second; + auto &DstMRs = TrackerMRs[&DstRT]; + for (auto *MR : SrcMRs) + MR->RT = &DstRT; + if (DstMRs.empty()) + DstMRs = std::move(SrcMRs); + else + for (auto *MR : SrcMRs) + DstMRs.insert(MR); + // Erase SrcRT entry in TrackerMRs. Use &SrcRT key rather than iterator I + // for this, since I may have been invalidated by 'TrackerMRs[&DstRT]'. + TrackerMRs.erase(&SrcRT); + } } // If we're transfering to the default tracker we just need to delete the @@ -1872,6 +1920,40 @@ Expected<JITDylib &> ExecutionSession::createJITDylib(std::string Name) { return JD; } +Error ExecutionSession::removeJITDylib(JITDylib &JD) { + // Keep JD alive throughout this routine, even if all other references + // have been dropped. + JITDylibSP JDKeepAlive = &JD; + + // Set JD to 'Closing' state and remove JD from the ExecutionSession. + runSessionLocked([&] { + assert(JD.State == JITDylib::Open && "JD already closed"); + JD.State = JITDylib::Closing; + auto I = llvm::find(JDs, &JD); + assert(I != JDs.end() && "JD does not appear in session JDs"); + JDs.erase(I); + }); + + // Clear the JITDylib. + auto Err = JD.clear(); + + // Set JD to closed state. Clear remaining data structures. + runSessionLocked([&] { + assert(JD.State == JITDylib::Closing && "JD should be closing"); + JD.State = JITDylib::Closed; + assert(JD.Symbols.empty() && "JD.Symbols is not empty after clear"); + assert(JD.UnmaterializedInfos.empty() && + "JD.UnmaterializedInfos is not empty after clear"); + assert(JD.MaterializingInfos.empty() && + "JD.MaterializingInfos is not empty after clear"); + assert(JD.TrackerSymbols.empty() && + "TrackerSymbols is not empty after clear"); + JD.DefGenerators.clear(); + JD.LinkOrder.clear(); + }); + return Err; +} + std::vector<JITDylibSP> JITDylib::getDFSLinkOrder(ArrayRef<JITDylibSP> JDs) { if (JDs.empty()) return {}; @@ -1883,6 +1965,8 @@ std::vector<JITDylibSP> JITDylib::getDFSLinkOrder(ArrayRef<JITDylibSP> JDs) { for (auto &JD : JDs) { + assert(JD->State == Open && "JD is defunct"); + if (Visited.count(JD.get())) continue; @@ -2311,8 +2395,11 @@ void ExecutionSession::OL_applyQueryPhase1( }); // Build the definition generator stack for this JITDylib. - for (auto &DG : reverse(JD.DefGenerators)) - IPLS->CurDefGeneratorStack.push_back(DG); + runSessionLocked([&] { + IPLS->CurDefGeneratorStack.reserve(JD.DefGenerators.size()); + for (auto &DG : reverse(JD.DefGenerators)) + IPLS->CurDefGeneratorStack.push_back(DG); + }); // Flag that we've done our initialization. 
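The new ExecutionSession::removeJITDylib above formalizes JITDylib teardown: the dylib is switched to Closing, cleared, detached from the session, and then marked Closed, after which the asserts added throughout this file treat it as defunct. A hedged usage sketch, assuming an already-constructed ExecutionSession ES and JITDylib JD:

#include "llvm/ExecutionEngine/Orc/Core.h"

// Illustrative only: tears down one JITDylib; any error from clearing its
// resource trackers is propagated to the caller.
static llvm::Error closeDylib(llvm::orc::ExecutionSession &ES,
                              llvm::orc::JITDylib &JD) {
  return ES.removeJITDylib(JD);
}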
IPLS->NewJITDylib = false; @@ -2629,17 +2716,15 @@ void ExecutionSession::OL_completeLookup( LLVM_DEBUG(dbgs() << "Adding MUs to dispatch:\n"); for (auto &KV : CollectedUMIs) { - auto &JD = *KV.first; LLVM_DEBUG({ + auto &JD = *KV.first; dbgs() << " For " << JD.getName() << ": Adding " << KV.second.size() << " MUs.\n"; }); for (auto &UMI : KV.second) { - std::unique_ptr<MaterializationResponsibility> MR( - new MaterializationResponsibility( - &JD, std::move(UMI->MU->SymbolFlags), - std::move(UMI->MU->InitSymbol))); - JD.MRTrackers[MR.get()] = UMI->RT; + auto MR = createMaterializationResponsibility( + *UMI->RT, std::move(UMI->MU->SymbolFlags), + std::move(UMI->MU->InitSymbol)); OutstandingMUs.push_back( std::make_pair(std::move(UMI->MU), std::move(MR))); } @@ -2757,18 +2842,18 @@ void ExecutionSession::OL_destroyMaterializationResponsibility( assert(MR.SymbolFlags.empty() && "All symbols should have been explicitly materialized or failed"); - MR.JD->unlinkMaterializationResponsibility(MR); + MR.JD.unlinkMaterializationResponsibility(MR); } SymbolNameSet ExecutionSession::OL_getRequestedSymbols( const MaterializationResponsibility &MR) { - return MR.JD->getRequestedSymbols(MR.SymbolFlags); + return MR.JD.getRequestedSymbols(MR.SymbolFlags); } Error ExecutionSession::OL_notifyResolved(MaterializationResponsibility &MR, const SymbolMap &Symbols) { LLVM_DEBUG({ - dbgs() << "In " << MR.JD->getName() << " resolving " << Symbols << "\n"; + dbgs() << "In " << MR.JD.getName() << " resolving " << Symbols << "\n"; }); #ifndef NDEBUG for (auto &KV : Symbols) { @@ -2783,15 +2868,16 @@ Error ExecutionSession::OL_notifyResolved(MaterializationResponsibility &MR, } #endif - return MR.JD->resolve(MR, Symbols); + return MR.JD.resolve(MR, Symbols); } Error ExecutionSession::OL_notifyEmitted(MaterializationResponsibility &MR) { LLVM_DEBUG({ - dbgs() << "In " << MR.JD->getName() << " emitting " << MR.SymbolFlags << "\n"; + dbgs() << "In " << MR.JD.getName() << " emitting " << MR.SymbolFlags + << "\n"; }); - if (auto Err = MR.JD->emit(MR, MR.SymbolFlags)) + if (auto Err = MR.JD.emit(MR, MR.SymbolFlags)) return Err; MR.SymbolFlags.clear(); @@ -2802,10 +2888,11 @@ Error ExecutionSession::OL_defineMaterializing( MaterializationResponsibility &MR, SymbolFlagsMap NewSymbolFlags) { LLVM_DEBUG({ - dbgs() << "In " << MR.JD->getName() << " defining materializing symbols " + dbgs() << "In " << MR.JD.getName() << " defining materializing symbols " << NewSymbolFlags << "\n"; }); - if (auto AcceptedDefs = MR.JD->defineMaterializing(std::move(NewSymbolFlags))) { + if (auto AcceptedDefs = + MR.JD.defineMaterializing(std::move(NewSymbolFlags))) { // Add all newly accepted symbols to this responsibility object. 
for (auto &KV : *AcceptedDefs) MR.SymbolFlags.insert(KV); @@ -2817,14 +2904,14 @@ Error ExecutionSession::OL_defineMaterializing( void ExecutionSession::OL_notifyFailed(MaterializationResponsibility &MR) { LLVM_DEBUG({ - dbgs() << "In " << MR.JD->getName() << " failing materialization for " + dbgs() << "In " << MR.JD.getName() << " failing materialization for " << MR.SymbolFlags << "\n"; }); JITDylib::FailedSymbolsWorklist Worklist; for (auto &KV : MR.SymbolFlags) - Worklist.push_back(std::make_pair(MR.JD.get(), KV.first)); + Worklist.push_back(std::make_pair(&MR.JD, KV.first)); MR.SymbolFlags.clear(); if (Worklist.empty()) @@ -2834,9 +2921,8 @@ void ExecutionSession::OL_notifyFailed(MaterializationResponsibility &MR) { std::shared_ptr<SymbolDependenceMap> FailedSymbols; runSessionLocked([&]() { - auto RTI = MR.JD->MRTrackers.find(&MR); - assert(RTI != MR.JD->MRTrackers.end() && "No tracker for this"); - if (RTI->second->isDefunct()) + // If the tracker is defunct then there's nothing to do here. + if (MR.RT->isDefunct()) return; std::tie(FailedQueries, FailedSymbols) = @@ -2858,12 +2944,12 @@ Error ExecutionSession::OL_replace(MaterializationResponsibility &MR, if (MU->getInitializerSymbol() == MR.InitSymbol) MR.InitSymbol = nullptr; - LLVM_DEBUG(MR.JD->getExecutionSession().runSessionLocked([&]() { - dbgs() << "In " << MR.JD->getName() << " replacing symbols with " << *MU + LLVM_DEBUG(MR.JD.getExecutionSession().runSessionLocked([&]() { + dbgs() << "In " << MR.JD.getName() << " replacing symbols with " << *MU << "\n"; });); - return MR.JD->replace(MR, std::move(MU)); + return MR.JD.replace(MR, std::move(MU)); } Expected<std::unique_ptr<MaterializationResponsibility>> @@ -2886,8 +2972,8 @@ ExecutionSession::OL_delegate(MaterializationResponsibility &MR, MR.SymbolFlags.erase(I); } - return MR.JD->delegate(MR, std::move(DelegatedFlags), - std::move(DelegatedInitSymbol)); + return MR.JD.delegate(MR, std::move(DelegatedFlags), + std::move(DelegatedInitSymbol)); } void ExecutionSession::OL_addDependencies( @@ -2899,7 +2985,7 @@ void ExecutionSession::OL_addDependencies( }); assert(MR.SymbolFlags.count(Name) && "Symbol not covered by this MaterializationResponsibility instance"); - MR.JD->addDependencies(Name, Dependencies); + MR.JD.addDependencies(Name, Dependencies); } void ExecutionSession::OL_addDependenciesForAll( @@ -2910,7 +2996,7 @@ void ExecutionSession::OL_addDependenciesForAll( << Dependencies << "\n"; }); for (auto &KV : MR.SymbolFlags) - MR.JD->addDependencies(KV.first, Dependencies); + MR.JD.addDependencies(KV.first, Dependencies); } #ifndef NDEBUG diff --git a/contrib/llvm-project/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/contrib/llvm-project/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index 1b7fdb588275..0de76ab78e0f 100644 --- a/contrib/llvm-project/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/contrib/llvm-project/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -1301,7 +1301,7 @@ RuntimeDyldELF::processRelocationRef( MemMgr.allowStubAllocation()) { resolveAArch64Branch(SectionID, Value, RelI, Stubs); } else if (RelType == ELF::R_AARCH64_ADR_GOT_PAGE) { - // Craete new GOT entry or find existing one. If GOT entry is + // Create new GOT entry or find existing one. If GOT entry is // to be created, then we also emit ABS64 relocation for it. 
uint64_t GOTOffset = findOrAllocGOTEntry(Value, ELF::R_AARCH64_ABS64); resolveGOTOffsetRelocation(SectionID, Offset, GOTOffset + Addend, diff --git a/contrib/llvm-project/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/contrib/llvm-project/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index ce998df757ec..18f1a2314853 100644 --- a/contrib/llvm-project/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/contrib/llvm-project/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -993,6 +993,8 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSections( Value *ST = ConstantInt::get(I32Ty, 1); llvm::CanonicalLoopInfo *LoopInfo = createCanonicalLoop( Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop"); + Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator()); + AllocaIP = Builder.saveIP(); InsertPointTy AfterIP = applyStaticWorkshareLoop(Loc.DL, LoopInfo, AllocaIP, true); BasicBlock *LoopAfterBB = AfterIP.getBlock(); diff --git a/contrib/llvm-project/llvm/lib/IR/AsmWriter.cpp b/contrib/llvm-project/llvm/lib/IR/AsmWriter.cpp index 7734c0a8de58..c9748e1387eb 100644 --- a/contrib/llvm-project/llvm/lib/IR/AsmWriter.cpp +++ b/contrib/llvm-project/llvm/lib/IR/AsmWriter.cpp @@ -353,12 +353,11 @@ void llvm::printLLVMNameWithoutPrefix(raw_ostream &OS, StringRef Name) { // Scan the name to see if it needs quotes first. bool NeedsQuotes = isdigit(static_cast<unsigned char>(Name[0])); if (!NeedsQuotes) { - for (unsigned i = 0, e = Name.size(); i != e; ++i) { + for (unsigned char C : Name) { // By making this unsigned, the value passed in to isalnum will always be // in the range 0-255. This is important when building with MSVC because // its implementation will assert. This situation can arise when dealing // with UTF-8 multibyte characters. - unsigned char C = Name[i]; if (!isalnum(static_cast<unsigned char>(C)) && C != '-' && C != '.' && C != '_') { NeedsQuotes = true; @@ -1309,27 +1308,8 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Metadata *MD, bool FromValue = false); static void WriteOptimizationInfo(raw_ostream &Out, const User *U) { - if (const FPMathOperator *FPO = dyn_cast<const FPMathOperator>(U)) { - // 'Fast' is an abbreviation for all fast-math-flags. 
- if (FPO->isFast()) - Out << " fast"; - else { - if (FPO->hasAllowReassoc()) - Out << " reassoc"; - if (FPO->hasNoNaNs()) - Out << " nnan"; - if (FPO->hasNoInfs()) - Out << " ninf"; - if (FPO->hasNoSignedZeros()) - Out << " nsz"; - if (FPO->hasAllowReciprocal()) - Out << " arcp"; - if (FPO->hasAllowContract()) - Out << " contract"; - if (FPO->hasApproxFunc()) - Out << " afn"; - } - } + if (const FPMathOperator *FPO = dyn_cast<const FPMathOperator>(U)) + Out << FPO->getFastMathFlags(); if (const OverflowingBinaryOperator *OBO = dyn_cast<OverflowingBinaryOperator>(U)) { diff --git a/contrib/llvm-project/llvm/lib/IR/Core.cpp b/contrib/llvm-project/llvm/lib/IR/Core.cpp index 905372982dc2..2c396ae97499 100644 --- a/contrib/llvm-project/llvm/lib/IR/Core.cpp +++ b/contrib/llvm-project/llvm/lib/IR/Core.cpp @@ -2266,6 +2266,14 @@ LLVMValueRef LLVMAddAlias(LLVMModuleRef M, LLVMTypeRef Ty, LLVMValueRef Aliasee, unwrap<Constant>(Aliasee), unwrap(M))); } +LLVMValueRef LLVMAddAlias2(LLVMModuleRef M, LLVMTypeRef ValueTy, + unsigned AddrSpace, LLVMValueRef Aliasee, + const char *Name) { + return wrap(GlobalAlias::create(unwrap(ValueTy), AddrSpace, + GlobalValue::ExternalLinkage, Name, + unwrap<Constant>(Aliasee), unwrap(M))); +} + LLVMValueRef LLVMGetNamedGlobalAlias(LLVMModuleRef M, const char *Name, size_t NameLen) { return wrap(unwrap(M)->getNamedAlias(Name)); diff --git a/contrib/llvm-project/llvm/lib/IR/DIBuilder.cpp b/contrib/llvm-project/llvm/lib/IR/DIBuilder.cpp index ca7dafc814ce..548962bd6a98 100644 --- a/contrib/llvm-project/llvm/lib/IR/DIBuilder.cpp +++ b/contrib/llvm-project/llvm/lib/IR/DIBuilder.cpp @@ -34,7 +34,20 @@ static cl::opt<bool> DIBuilder::DIBuilder(Module &m, bool AllowUnresolvedNodes, DICompileUnit *CU) : M(m), VMContext(M.getContext()), CUNode(CU), DeclareFn(nullptr), ValueFn(nullptr), LabelFn(nullptr), - AllowUnresolvedNodes(AllowUnresolvedNodes) {} + AllowUnresolvedNodes(AllowUnresolvedNodes) { + if (CUNode) { + if (const auto &ETs = CUNode->getEnumTypes()) + AllEnumTypes.assign(ETs.begin(), ETs.end()); + if (const auto &RTs = CUNode->getRetainedTypes()) + AllRetainTypes.assign(RTs.begin(), RTs.end()); + if (const auto &GVs = CUNode->getGlobalVariables()) + AllGVs.assign(GVs.begin(), GVs.end()); + if (const auto &IMs = CUNode->getImportedEntities()) + AllImportedModules.assign(IMs.begin(), IMs.end()); + if (const auto &MNs = CUNode->getMacros()) + AllMacrosPerParent.insert({nullptr, {MNs.begin(), MNs.end()}}); + } +} void DIBuilder::trackIfUnresolved(MDNode *N) { if (!N) diff --git a/contrib/llvm-project/llvm/lib/IR/Instructions.cpp b/contrib/llvm-project/llvm/lib/IR/Instructions.cpp index c42df49d97ea..ad27a6d8c08e 100644 --- a/contrib/llvm-project/llvm/lib/IR/Instructions.cpp +++ b/contrib/llvm-project/llvm/lib/IR/Instructions.cpp @@ -2474,7 +2474,7 @@ bool ShuffleVectorInst::isReplicationMask(ArrayRef<int> Mask, // Additionally, mask size is a replication factor multiplied by vector size, // which further significantly reduces the search space. - // Before doing that, let's perform basic sanity check first. + // Before doing that, let's perform basic correctness checking first. 
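Among the hunks above, Core.cpp adds LLVMAddAlias2, a C-API entry point that takes the aliasee's value type and address space explicitly rather than deriving them from a pointer type. A hedged usage sketch follows; the function type, aliasee, and the alias name "bar" are illustrative:

#include "llvm-c/Core.h"

// Creates an externally visible alias "bar" for the function Fn, whose
// function type FnTy is supplied by the caller instead of being recovered
// from Fn's pointer type.
static LLVMValueRef addFunctionAlias(LLVMModuleRef M, LLVMTypeRef FnTy,
                                     LLVMValueRef Fn) {
  return LLVMAddAlias2(M, FnTy, /*AddrSpace=*/0, Fn, "bar");
}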
int Largest = -1; for (int MaskElt : Mask) { if (MaskElt == UndefMaskElem) diff --git a/contrib/llvm-project/llvm/lib/IR/IntrinsicInst.cpp b/contrib/llvm-project/llvm/lib/IR/IntrinsicInst.cpp index 7552906fd07a..9206cd37a6d1 100644 --- a/contrib/llvm-project/llvm/lib/IR/IntrinsicInst.cpp +++ b/contrib/llvm-project/llvm/lib/IR/IntrinsicInst.cpp @@ -358,13 +358,13 @@ Value *VPIntrinsic::getMemoryPointerParam() const { Optional<unsigned> VPIntrinsic::getMemoryPointerParamPos(Intrinsic::ID VPID) { switch (VPID) { default: - return None; - -#define HANDLE_VP_IS_MEMOP(VPID, POINTERPOS, DATAPOS) \ - case Intrinsic::VPID: \ - return POINTERPOS; + break; +#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID: +#define VP_PROPERTY_MEMOP(POINTERPOS, ...) return POINTERPOS; +#define END_REGISTER_VP_INTRINSIC(VPID) break; #include "llvm/IR/VPIntrinsics.def" } + return None; } /// \return The data (payload) operand of this store or scatter. @@ -378,52 +378,51 @@ Value *VPIntrinsic::getMemoryDataParam() const { Optional<unsigned> VPIntrinsic::getMemoryDataParamPos(Intrinsic::ID VPID) { switch (VPID) { default: - return None; - -#define HANDLE_VP_IS_MEMOP(VPID, POINTERPOS, DATAPOS) \ - case Intrinsic::VPID: \ - return DATAPOS; + break; +#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID: +#define VP_PROPERTY_MEMOP(POINTERPOS, DATAPOS) return DATAPOS; +#define END_REGISTER_VP_INTRINSIC(VPID) break; #include "llvm/IR/VPIntrinsics.def" } + return None; } bool VPIntrinsic::isVPIntrinsic(Intrinsic::ID ID) { switch (ID) { default: - return false; - + break; #define BEGIN_REGISTER_VP_INTRINSIC(VPID, MASKPOS, VLENPOS) \ case Intrinsic::VPID: \ - break; + return true; #include "llvm/IR/VPIntrinsics.def" } - return true; + return false; } // Equivalent non-predicated opcode Optional<unsigned> VPIntrinsic::getFunctionalOpcodeForVP(Intrinsic::ID ID) { - Optional<unsigned> FunctionalOC; switch (ID) { default: break; #define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID: -#define HANDLE_VP_TO_OPC(OPC) FunctionalOC = Instruction::OPC; -#define END_REGISTER_VP_INTRINSIC(...) break; +#define VP_PROPERTY_FUNCTIONAL_OPC(OPC) return Instruction::OPC; +#define END_REGISTER_VP_INTRINSIC(VPID) break; #include "llvm/IR/VPIntrinsics.def" } - - return FunctionalOC; + return None; } Intrinsic::ID VPIntrinsic::getForOpcode(unsigned IROPC) { switch (IROPC) { default: - return Intrinsic::not_intrinsic; + break; -#define HANDLE_VP_TO_OPC(OPC) case Instruction::OPC: +#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) break; +#define VP_PROPERTY_FUNCTIONAL_OPC(OPC) case Instruction::OPC: #define END_REGISTER_VP_INTRINSIC(VPID) return Intrinsic::VPID; #include "llvm/IR/VPIntrinsics.def" } + return Intrinsic::not_intrinsic; } bool VPIntrinsic::canIgnoreVectorLengthParam() const { @@ -516,13 +515,13 @@ Function *VPIntrinsic::getDeclarationForParams(Module *M, Intrinsic::ID VPID, bool VPReductionIntrinsic::isVPReduction(Intrinsic::ID ID) { switch (ID) { default: - return false; -#define HANDLE_VP_REDUCTION(VPID, STARTPOS, VECTORPOS) \ - case Intrinsic::VPID: \ break; +#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID: +#define VP_PROPERTY_REDUCTION(STARTPOS, ...) 
return true; +#define END_REGISTER_VP_INTRINSIC(VPID) break; #include "llvm/IR/VPIntrinsics.def" } - return true; + return false; } unsigned VPReductionIntrinsic::getVectorParamPos() const { @@ -535,24 +534,26 @@ unsigned VPReductionIntrinsic::getStartParamPos() const { Optional<unsigned> VPReductionIntrinsic::getVectorParamPos(Intrinsic::ID ID) { switch (ID) { -#define HANDLE_VP_REDUCTION(VPID, STARTPOS, VECTORPOS) \ - case Intrinsic::VPID: \ - return VECTORPOS; +#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID: +#define VP_PROPERTY_REDUCTION(STARTPOS, VECTORPOS) return VECTORPOS; +#define END_REGISTER_VP_INTRINSIC(VPID) break; #include "llvm/IR/VPIntrinsics.def" default: - return None; + break; } + return None; } Optional<unsigned> VPReductionIntrinsic::getStartParamPos(Intrinsic::ID ID) { switch (ID) { -#define HANDLE_VP_REDUCTION(VPID, STARTPOS, VECTORPOS) \ - case Intrinsic::VPID: \ - return STARTPOS; +#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID: +#define VP_PROPERTY_REDUCTION(STARTPOS, VECTORPOS) return STARTPOS; +#define END_REGISTER_VP_INTRINSIC(VPID) break; #include "llvm/IR/VPIntrinsics.def" default: - return None; + break; } + return None; } Instruction::BinaryOps BinaryOpIntrinsic::getBinaryOp() const { diff --git a/contrib/llvm-project/llvm/lib/IR/Operator.cpp b/contrib/llvm-project/llvm/lib/IR/Operator.cpp index cf309ffd6212..d15fcfbc5b9f 100644 --- a/contrib/llvm-project/llvm/lib/IR/Operator.cpp +++ b/contrib/llvm-project/llvm/lib/IR/Operator.cpp @@ -226,4 +226,25 @@ bool GEPOperator::collectOffset( } return true; } + +void FastMathFlags::print(raw_ostream &O) const { + if (all()) + O << " fast"; + else { + if (allowReassoc()) + O << " reassoc"; + if (noNaNs()) + O << " nnan"; + if (noInfs()) + O << " ninf"; + if (noSignedZeros()) + O << " nsz"; + if (allowReciprocal()) + O << " arcp"; + if (allowContract()) + O << " contract"; + if (approxFunc()) + O << " afn"; + } +} } // namespace llvm diff --git a/contrib/llvm-project/llvm/lib/IR/PassTimingInfo.cpp b/contrib/llvm-project/llvm/lib/IR/PassTimingInfo.cpp index d0c1517f480b..a03fafec9fac 100644 --- a/contrib/llvm-project/llvm/lib/IR/PassTimingInfo.cpp +++ b/contrib/llvm-project/llvm/lib/IR/PassTimingInfo.cpp @@ -187,7 +187,7 @@ Timer &TimePassesHandler::getPassTimer(StringRef PassID) { Timer *T = new Timer(PassID, FullDesc, TG); Timers.emplace_back(T); - assert(Count == Timers.size() && "sanity check"); + assert(Count == Timers.size() && "Timers vector not adjusted correctly."); return *T; } diff --git a/contrib/llvm-project/llvm/lib/IR/SafepointIRVerifier.cpp b/contrib/llvm-project/llvm/lib/IR/SafepointIRVerifier.cpp index 9be6de693ee3..2117527a64f0 100644 --- a/contrib/llvm-project/llvm/lib/IR/SafepointIRVerifier.cpp +++ b/contrib/llvm-project/llvm/lib/IR/SafepointIRVerifier.cpp @@ -6,9 +6,9 @@ // //===----------------------------------------------------------------------===// // -// Run a sanity check on the IR to ensure that Safepoints - if they've been -// inserted - were inserted correctly. In particular, look for use of -// non-relocated values after a safepoint. It's primary use is to check the +// Run a basic correctness check on the IR to ensure that Safepoints - if +// they've been inserted - were inserted correctly. In particular, look for use +// of non-relocated values after a safepoint. 
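The FastMathFlags::print helper defined in the Operator.cpp hunk above is what WriteOptimizationInfo now streams instead of spelling out each flag by hand. A short sketch of the output, assuming the usual LLVM headers are on the include path (each flag gets a leading space, and " fast" stands in for all of them):

    #include "llvm/IR/Operator.h"            // FastMathFlags is declared here in this tree
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    int main() {
      FastMathFlags FMF;
      FMF.setNoNaNs();
      FMF.setNoInfs();
      FMF.print(errs());                     // prints " nnan ninf" (print() is the helper added above)
      errs() << '\n';

      FMF.setFast();                         // turns on every flag
      FMF.print(errs());                     // prints " fast"
      errs() << '\n';
      return 0;
    }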
It's primary use is to check the // correctness of safepoint insertion immediately after insertion, but it can // also be used to verify that later transforms have not found a way to break // safepoint semenatics. diff --git a/contrib/llvm-project/llvm/lib/IR/Verifier.cpp b/contrib/llvm-project/llvm/lib/IR/Verifier.cpp index dc4370d4b6ed..154b59835b01 100644 --- a/contrib/llvm-project/llvm/lib/IR/Verifier.cpp +++ b/contrib/llvm-project/llvm/lib/IR/Verifier.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // // This file defines the function verifier interface, that can be used for some -// sanity checking of input to the system. +// basic correctness checking of input to the system. // // Note that this does not provide full `Java style' security and verifications, // instead it just tries to ensure that code is well-formed. @@ -1604,7 +1604,7 @@ Verifier::visitModuleFlag(const MDNode *Op, Assert(ID, "invalid ID operand in module flag (expected metadata string)", Op->getOperand(1)); - // Sanity check the values for behaviors with additional requirements. + // Check the values for behaviors with additional requirements. switch (MFB) { case Module::Error: case Module::Warning: @@ -5269,24 +5269,32 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { Op0ElemTy = cast<VectorType>(Call.getArgOperand(0)->getType())->getElementType(); break; - case Intrinsic::matrix_column_major_load: + case Intrinsic::matrix_column_major_load: { Stride = dyn_cast<ConstantInt>(Call.getArgOperand(1)); NumRows = cast<ConstantInt>(Call.getArgOperand(3)); NumColumns = cast<ConstantInt>(Call.getArgOperand(4)); ResultTy = cast<VectorType>(Call.getType()); - Op0ElemTy = - cast<PointerType>(Call.getArgOperand(0)->getType())->getElementType(); + + PointerType *Op0PtrTy = + cast<PointerType>(Call.getArgOperand(0)->getType()); + if (!Op0PtrTy->isOpaque()) + Op0ElemTy = Op0PtrTy->getElementType(); break; - case Intrinsic::matrix_column_major_store: + } + case Intrinsic::matrix_column_major_store: { Stride = dyn_cast<ConstantInt>(Call.getArgOperand(2)); NumRows = cast<ConstantInt>(Call.getArgOperand(4)); NumColumns = cast<ConstantInt>(Call.getArgOperand(5)); ResultTy = cast<VectorType>(Call.getArgOperand(0)->getType()); Op0ElemTy = cast<VectorType>(Call.getArgOperand(0)->getType())->getElementType(); - Op1ElemTy = - cast<PointerType>(Call.getArgOperand(1)->getType())->getElementType(); + + PointerType *Op1PtrTy = + cast<PointerType>(Call.getArgOperand(1)->getType()); + if (!Op1PtrTy->isOpaque()) + Op1ElemTy = Op1PtrTy->getElementType(); break; + } default: llvm_unreachable("unexpected intrinsic"); } @@ -5295,9 +5303,10 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { ResultTy->getElementType()->isFloatingPointTy(), "Result type must be an integer or floating-point type!", IF); - Assert(ResultTy->getElementType() == Op0ElemTy, - "Vector element type mismatch of the result and first operand " - "vector!", IF); + if (Op0ElemTy) + Assert(ResultTy->getElementType() == Op0ElemTy, + "Vector element type mismatch of the result and first operand " + "vector!", IF); if (Op1ElemTy) Assert(ResultTy->getElementType() == Op1ElemTy, diff --git a/contrib/llvm-project/llvm/lib/InterfaceStub/ELFObjHandler.cpp b/contrib/llvm-project/llvm/lib/InterfaceStub/ELFObjHandler.cpp index d41c7d3217d7..0d1a864f31ac 100644 --- a/contrib/llvm-project/llvm/lib/InterfaceStub/ELFObjHandler.cpp +++ b/contrib/llvm-project/llvm/lib/InterfaceStub/ELFObjHandler.cpp @@ 
-372,7 +372,7 @@ Error appendToError(Error Err, StringRef After) { /// This function populates a DynamicEntries struct using an ELFT::DynRange. /// After populating the struct, the members are validated with -/// some basic sanity checks. +/// some basic correctness checks. /// /// @param Dyn Target DynamicEntries struct to populate. /// @param DynTable Source dynamic table. diff --git a/contrib/llvm-project/llvm/lib/MC/MCAsmStreamer.cpp b/contrib/llvm-project/llvm/lib/MC/MCAsmStreamer.cpp index 154b2d051f34..2ca921017171 100644 --- a/contrib/llvm-project/llvm/lib/MC/MCAsmStreamer.cpp +++ b/contrib/llvm-project/llvm/lib/MC/MCAsmStreamer.cpp @@ -1069,16 +1069,14 @@ void MCAsmStreamer::PrintQuotedString(StringRef Data, raw_ostream &OS) const { OS << '"'; if (MAI->hasPairedDoubleQuoteStringConstants()) { - for (unsigned i = 0, e = Data.size(); i != e; ++i) { - unsigned char C = Data[i]; + for (unsigned char C : Data) { if (C == '"') OS << "\"\""; else OS << (char)C; } } else { - for (unsigned i = 0, e = Data.size(); i != e; ++i) { - unsigned char C = Data[i]; + for (unsigned char C : Data) { if (C == '"' || C == '\\') { OS << '\\' << (char)C; continue; diff --git a/contrib/llvm-project/llvm/lib/MC/MCELFStreamer.cpp b/contrib/llvm-project/llvm/lib/MC/MCELFStreamer.cpp index 1ba999a63113..fbf3c860368a 100644 --- a/contrib/llvm-project/llvm/lib/MC/MCELFStreamer.cpp +++ b/contrib/llvm-project/llvm/lib/MC/MCELFStreamer.cpp @@ -646,8 +646,6 @@ void MCELFStreamer::emitBundleAlignMode(unsigned AlignPow2) { void MCELFStreamer::emitBundleLock(bool AlignToEnd) { MCSection &Sec = *getCurrentSectionOnly(); - // Sanity checks - // if (!getAssembler().isBundlingEnabled()) report_fatal_error(".bundle_lock forbidden when bundling is disabled"); @@ -667,7 +665,6 @@ void MCELFStreamer::emitBundleLock(bool AlignToEnd) { void MCELFStreamer::emitBundleUnlock() { MCSection &Sec = *getCurrentSectionOnly(); - // Sanity checks if (!getAssembler().isBundlingEnabled()) report_fatal_error(".bundle_unlock forbidden when bundling is disabled"); else if (!isBundleLocked()) diff --git a/contrib/llvm-project/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/contrib/llvm-project/llvm/lib/MC/MCParser/ELFAsmParser.cpp index ddc41d0a08ab..e95019c12db7 100644 --- a/contrib/llvm-project/llvm/lib/MC/MCParser/ELFAsmParser.cpp +++ b/contrib/llvm-project/llvm/lib/MC/MCParser/ELFAsmParser.cpp @@ -676,14 +676,14 @@ EndStmt: getContext().getELFSection(SectionName, Type, Flags, Size, GroupName, IsComdat, UniqueID, LinkedToSym); getStreamer().SwitchSection(Section, Subsection); - if (Section->getType() != Type && + // Check that flags are used consistently. However, the GNU assembler permits + // to leave out in subsequent uses of the same sections; for compatibility, + // do likewise. + if (!TypeName.empty() && Section->getType() != Type && !allowSectionTypeMismatch(getContext().getTargetTriple(), SectionName, Type)) Error(loc, "changed section type for " + SectionName + ", expected: 0x" + utohexstr(Section->getType())); - // Check that flags are used consistently. However, the GNU assembler permits - // to leave out in subsequent uses of the same sections; for compatibility, - // do likewise. 
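The PrintQuotedString hunk above keeps two quoting dialects: assemblers with paired double-quote string constants escape a quote by doubling it, everyone else gets backslash escapes for quotes and backslashes (the real routine also escapes non-printable bytes, which is omitted here). A standalone sketch of just that difference; the function names are illustrative:

    #include <iostream>
    #include <string>

    // Paired-double-quote dialect: '"' becomes '""'.
    static std::string quotePaired(const std::string &Data) {
      std::string Out = "\"";
      for (unsigned char C : Data) {
        if (C == '"')
          Out += "\"\"";
        else
          Out += (char)C;
      }
      return Out + "\"";
    }

    // Default dialect: '"' and '\' get a backslash prefix.
    static std::string quoteEscaped(const std::string &Data) {
      std::string Out = "\"";
      for (unsigned char C : Data) {
        if (C == '"' || C == '\\')
          Out += '\\';
        Out += (char)C;
      }
      return Out + "\"";
    }

    int main() {
      std::cout << quotePaired("say \"hi\"") << '\n';   // "say ""hi"""
      std::cout << quoteEscaped("say \"hi\"") << '\n';  // "say \"hi\""
    }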
if ((extraFlags || Size || !TypeName.empty()) && Section->getFlags() != Flags) Error(loc, "changed section flags for " + SectionName + ", expected: 0x" + utohexstr(Section->getFlags())); diff --git a/contrib/llvm-project/llvm/lib/MC/WinCOFFObjectWriter.cpp b/contrib/llvm-project/llvm/lib/MC/WinCOFFObjectWriter.cpp index 646f416821ae..73c687331d30 100644 --- a/contrib/llvm-project/llvm/lib/MC/WinCOFFObjectWriter.cpp +++ b/contrib/llvm-project/llvm/lib/MC/WinCOFFObjectWriter.cpp @@ -56,6 +56,8 @@ using llvm::support::endian::write32le; namespace { +constexpr int OffsetLabelIntervalBits = 20; + using name = SmallString<COFF::NameSize>; enum AuxiliaryType { @@ -120,6 +122,8 @@ public: relocations Relocations; COFFSection(StringRef Name) : Name(std::string(Name)) {} + + SmallVector<COFFSymbol *, 1> OffsetSymbols; }; class WinCOFFObjectWriter : public MCObjectWriter { @@ -149,6 +153,7 @@ public: symbol_list WeakDefaults; bool UseBigObj; + bool UseOffsetLabels = false; bool EmitAddrsigSection = false; MCSectionCOFF *AddrsigSection; @@ -174,7 +179,7 @@ public: COFFSymbol *GetOrCreateCOFFSymbol(const MCSymbol *Symbol); COFFSection *createSection(StringRef Name); - void defineSection(MCSectionCOFF const &Sec); + void defineSection(MCSectionCOFF const &Sec, const MCAsmLayout &Layout); COFFSymbol *getLinkedSymbol(const MCSymbol &Symbol); void DefineSymbol(const MCSymbol &Symbol, MCAssembler &Assembler, @@ -244,6 +249,11 @@ WinCOFFObjectWriter::WinCOFFObjectWriter( std::unique_ptr<MCWinCOFFObjectTargetWriter> MOTW, raw_pwrite_stream &OS) : W(OS, support::little), TargetObjectWriter(std::move(MOTW)) { Header.Machine = TargetObjectWriter->getMachine(); + // Some relocations on ARM64 (the 21 bit ADRP relocations) have a slightly + // limited range for the immediate offset (+/- 1 MB); create extra offset + // label symbols with regular intervals to allow referencing a + // non-temporary symbol that is close enough. + UseOffsetLabels = Header.Machine == COFF::IMAGE_FILE_MACHINE_ARM64; } COFFSymbol *WinCOFFObjectWriter::createSymbol(StringRef Name) { @@ -299,7 +309,8 @@ static uint32_t getAlignment(const MCSectionCOFF &Sec) { /// This function takes a section data object from the assembler /// and creates the associated COFF section staging object. -void WinCOFFObjectWriter::defineSection(const MCSectionCOFF &MCSec) { +void WinCOFFObjectWriter::defineSection(const MCSectionCOFF &MCSec, + const MCAsmLayout &Layout) { COFFSection *Section = createSection(MCSec.getName()); COFFSymbol *Symbol = createSymbol(MCSec.getName()); Section->Symbol = Symbol; @@ -329,6 +340,20 @@ void WinCOFFObjectWriter::defineSection(const MCSectionCOFF &MCSec) { // Bind internal COFF section to MC section. Section->MCSection = &MCSec; SectionMap[&MCSec] = Section; + + if (UseOffsetLabels && !MCSec.getFragmentList().empty()) { + const uint32_t Interval = 1 << OffsetLabelIntervalBits; + uint32_t N = 1; + for (uint32_t Off = Interval, E = Layout.getSectionAddressSize(&MCSec); + Off < E; Off += Interval) { + auto Name = ("$L" + MCSec.getName() + "_" + Twine(N++)).str(); + COFFSymbol *Label = createSymbol(Name); + Label->Section = Section; + Label->Data.StorageClass = COFF::IMAGE_SYM_CLASS_LABEL; + Label->Data.Value = Off; + Section->OffsetSymbols.push_back(Label); + } + } } static uint64_t getSymbolValue(const MCSymbol &Symbol, @@ -688,7 +713,7 @@ void WinCOFFObjectWriter::executePostLayoutBinding(MCAssembler &Asm, // "Define" each section & symbol. This creates section & symbol // entries in the staging area. 
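To make the offset-label scheme above concrete: with OffsetLabelIntervalBits = 20 the writer plants a label every 1 MiB of section data, and the relocation code in the next hunk rebases a fixup onto the closest preceding label by shifting the offset. A standalone sketch of that arithmetic; the section size and offsets are made-up numbers:

    #include <cstdint>
    #include <iostream>
    #include <vector>

    constexpr int OffsetLabelIntervalBits = 20;                    // 1 MiB interval, as in the hunk

    int main() {
      const uint64_t SectionSize = 0x340000;                       // 3.25 MiB of section data
      const uint64_t Interval = uint64_t(1) << OffsetLabelIntervalBits;

      // Label creation, as in defineSection(): one label per interval boundary.
      std::vector<uint64_t> OffsetSymbols;                         // values of $L<sec>_1, _2, ...
      for (uint64_t Off = Interval; Off < SectionSize; Off += Interval)
        OffsetSymbols.push_back(Off);                              // 0x100000, 0x200000, 0x300000

      // Selection, as in recordRelocation(): pick the preceding label, rebase the addend.
      uint64_t FixedValue = 0x2A0000;                              // offset of the target in the section
      uint64_t LabelIndex = FixedValue >> OffsetLabelIntervalBits; // == 2
      if (LabelIndex > 0 && !OffsetSymbols.empty()) {
        uint64_t LabelValue = LabelIndex <= OffsetSymbols.size()
                                  ? OffsetSymbols[LabelIndex - 1]
                                  : OffsetSymbols.back();
        FixedValue -= LabelValue;                                  // 0x2A0000 - 0x200000 = 0xA0000
      }
      std::cout << std::hex << "remaining addend: 0x" << FixedValue << '\n';
      // An ADRP against that label only has to span 0xA0000 bytes, comfortably
      // inside the +/- 1 MiB reach of the 21-bit relocation.
    }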
for (const auto &Section : Asm) - defineSection(static_cast<const MCSectionCOFF &>(Section)); + defineSection(static_cast<const MCSectionCOFF &>(Section), Layout); for (const MCSymbol &Symbol : Asm.symbols()) if (!Symbol.isTemporary()) @@ -774,8 +799,23 @@ void WinCOFFObjectWriter::recordRelocation(MCAssembler &Asm, assert( SectionMap.find(TargetSection) != SectionMap.end() && "Section must already have been defined in executePostLayoutBinding!"); - Reloc.Symb = SectionMap[TargetSection]->Symbol; + COFFSection *Section = SectionMap[TargetSection]; + Reloc.Symb = Section->Symbol; FixedValue += Layout.getSymbolOffset(A); + // Technically, we should do the final adjustments of FixedValue (below) + // before picking an offset symbol, otherwise we might choose one which + // is slightly too far away. The relocations where it really matters + // (arm64 adrp relocations) don't get any offset though. + if (UseOffsetLabels && !Section->OffsetSymbols.empty()) { + uint64_t LabelIndex = FixedValue >> OffsetLabelIntervalBits; + if (LabelIndex > 0) { + if (LabelIndex <= Section->OffsetSymbols.size()) + Reloc.Symb = Section->OffsetSymbols[LabelIndex - 1]; + else + Reloc.Symb = Section->OffsetSymbols.back(); + FixedValue -= Reloc.Symb->Data.Value; + } + } } else { assert( SymbolMap.find(&A) != SymbolMap.end() && diff --git a/contrib/llvm-project/llvm/lib/MCA/InstrBuilder.cpp b/contrib/llvm-project/llvm/lib/MCA/InstrBuilder.cpp index 0ab845a4c28f..d8283f8d2682 100644 --- a/contrib/llvm-project/llvm/lib/MCA/InstrBuilder.cpp +++ b/contrib/llvm-project/llvm/lib/MCA/InstrBuilder.cpp @@ -612,7 +612,7 @@ InstrBuilder::createInstrDescImpl(const MCInst &MCI) { LLVM_DEBUG(dbgs() << "\t\tMaxLatency=" << ID->MaxLatency << '\n'); LLVM_DEBUG(dbgs() << "\t\tNumMicroOps=" << ID->NumMicroOps << '\n'); - // Sanity check on the instruction descriptor. + // Validation check on the instruction descriptor. if (Error Err = verifyInstrDesc(*ID, MCI)) return std::move(Err); diff --git a/contrib/llvm-project/llvm/lib/MCA/Stages/ExecuteStage.cpp b/contrib/llvm-project/llvm/lib/MCA/Stages/ExecuteStage.cpp index 6e021d3d9232..2b11f73b19df 100644 --- a/contrib/llvm-project/llvm/lib/MCA/Stages/ExecuteStage.cpp +++ b/contrib/llvm-project/llvm/lib/MCA/Stages/ExecuteStage.cpp @@ -188,7 +188,7 @@ Error ExecuteStage::execute(InstRef &IR) { #ifndef NDEBUG // Ensure that the HWS has not stored this instruction in its queues. 
- HWS.sanityCheck(IR); + HWS.instructionCheck(IR); #endif if (IR.getInstruction()->isEliminated()) diff --git a/contrib/llvm-project/llvm/lib/Object/ELFObjectFile.cpp b/contrib/llvm-project/llvm/lib/Object/ELFObjectFile.cpp index 50035d6c7523..cf1f12d9a9a7 100644 --- a/contrib/llvm-project/llvm/lib/Object/ELFObjectFile.cpp +++ b/contrib/llvm-project/llvm/lib/Object/ELFObjectFile.cpp @@ -682,7 +682,7 @@ readDynsymVersionsImpl(const ELFFile<ELFT> &EF, std::vector<VersionEntry> Ret; size_t I = 0; - for (auto It = Symbols.begin(), E = Symbols.end(); It != E; ++It) { + for (const ELFSymbolRef &Sym : Symbols) { ++I; Expected<const typename ELFT::Versym *> VerEntryOrErr = EF.template getEntry<typename ELFT::Versym>(*VerSec, I); @@ -691,7 +691,7 @@ readDynsymVersionsImpl(const ELFFile<ELFT> &EF, " from " + describe(EF, *VerSec) + ": " + toString(VerEntryOrErr.takeError())); - Expected<uint32_t> FlagsOrErr = It->getFlags(); + Expected<uint32_t> FlagsOrErr = Sym.getFlags(); if (!FlagsOrErr) return createError("unable to read flags for symbol with index " + Twine(I) + ": " + toString(FlagsOrErr.takeError())); diff --git a/contrib/llvm-project/llvm/lib/ObjectYAML/COFFEmitter.cpp b/contrib/llvm-project/llvm/lib/ObjectYAML/COFFEmitter.cpp index 5f38ca13cfc2..66ad16db1ba4 100644 --- a/contrib/llvm-project/llvm/lib/ObjectYAML/COFFEmitter.cpp +++ b/contrib/llvm-project/llvm/lib/ObjectYAML/COFFEmitter.cpp @@ -476,29 +476,25 @@ static bool writeCOFF(COFFParser &CP, raw_ostream &OS) { assert(OS.tell() == CP.SectionTableStart); // Output section table. - for (std::vector<COFFYAML::Section>::iterator i = CP.Obj.Sections.begin(), - e = CP.Obj.Sections.end(); - i != e; ++i) { - OS.write(i->Header.Name, COFF::NameSize); - OS << binary_le(i->Header.VirtualSize) - << binary_le(i->Header.VirtualAddress) - << binary_le(i->Header.SizeOfRawData) - << binary_le(i->Header.PointerToRawData) - << binary_le(i->Header.PointerToRelocations) - << binary_le(i->Header.PointerToLineNumbers) - << binary_le(i->Header.NumberOfRelocations) - << binary_le(i->Header.NumberOfLineNumbers) - << binary_le(i->Header.Characteristics); + for (const COFFYAML::Section &S : CP.Obj.Sections) { + OS.write(S.Header.Name, COFF::NameSize); + OS << binary_le(S.Header.VirtualSize) + << binary_le(S.Header.VirtualAddress) + << binary_le(S.Header.SizeOfRawData) + << binary_le(S.Header.PointerToRawData) + << binary_le(S.Header.PointerToRelocations) + << binary_le(S.Header.PointerToLineNumbers) + << binary_le(S.Header.NumberOfRelocations) + << binary_le(S.Header.NumberOfLineNumbers) + << binary_le(S.Header.Characteristics); } assert(OS.tell() == CP.SectionTableStart + CP.SectionTableSize); unsigned CurSymbol = 0; StringMap<unsigned> SymbolTableIndexMap; - for (std::vector<COFFYAML::Symbol>::iterator I = CP.Obj.Symbols.begin(), - E = CP.Obj.Symbols.end(); - I != E; ++I) { - SymbolTableIndexMap[I->Name] = CurSymbol; - CurSymbol += 1 + I->Header.NumberOfAuxSymbols; + for (const COFFYAML::Symbol &Sym : CP.Obj.Symbols) { + SymbolTableIndexMap[Sym.Name] = CurSymbol; + CurSymbol += 1 + Sym.Header.NumberOfAuxSymbols; } // Output section data. 
diff --git a/contrib/llvm-project/llvm/lib/ObjectYAML/ELFYAML.cpp b/contrib/llvm-project/llvm/lib/ObjectYAML/ELFYAML.cpp index fdf9aeae1622..e0dde4433d24 100644 --- a/contrib/llvm-project/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/contrib/llvm-project/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -155,6 +155,10 @@ void ScalarEnumerationTraits<ELFYAML::ELF_NT>::enumeration( ECase(NT_FREEBSD_PROCSTAT_OSREL); ECase(NT_FREEBSD_PROCSTAT_PSSTRINGS); ECase(NT_FREEBSD_PROCSTAT_AUXV); + // NetBSD core note types. + ECase(NT_NETBSDCORE_PROCINFO); + ECase(NT_NETBSDCORE_AUXV); + ECase(NT_NETBSDCORE_LWPSTATUS); // OpenBSD core note types. ECase(NT_OPENBSD_PROCINFO); ECase(NT_OPENBSD_AUXV); diff --git a/contrib/llvm-project/llvm/lib/ObjectYAML/MachOEmitter.cpp b/contrib/llvm-project/llvm/lib/ObjectYAML/MachOEmitter.cpp index c653c29ec9a7..e5ffb12df434 100644 --- a/contrib/llvm-project/llvm/lib/ObjectYAML/MachOEmitter.cpp +++ b/contrib/llvm-project/llvm/lib/ObjectYAML/MachOEmitter.cpp @@ -54,6 +54,7 @@ private: void writeNameList(raw_ostream &OS); void writeStringTable(raw_ostream &OS); void writeExportTrie(raw_ostream &OS); + void writeDynamicSymbolTable(raw_ostream &OS); void dumpExportEntry(raw_ostream &OS, MachOYAML::ExportEntry &Entry); void ZeroToOffset(raw_ostream &OS, size_t offset); @@ -482,6 +483,7 @@ void MachOWriter::writeLinkEditData(raw_ostream &OS) { MachO::dyld_info_command *DyldInfoOnlyCmd = 0; MachO::symtab_command *SymtabCmd = 0; + MachO::dysymtab_command *DSymtabCmd = 0; for (auto &LC : Obj.LoadCommands) { switch (LC.Data.load_command_data.cmd) { case MachO::LC_SYMTAB: @@ -504,6 +506,11 @@ void MachOWriter::writeLinkEditData(raw_ostream &OS) { WriteQueue.push_back(std::make_pair(DyldInfoOnlyCmd->export_off, &MachOWriter::writeExportTrie)); break; + case MachO::LC_DYSYMTAB: + DSymtabCmd = &LC.Data.dysymtab_command_data; + WriteQueue.push_back(std::make_pair( + DSymtabCmd->indirectsymoff, &MachOWriter::writeDynamicSymbolTable)); + break; } } @@ -556,6 +563,12 @@ void MachOWriter::writeStringTable(raw_ostream &OS) { } } +void MachOWriter::writeDynamicSymbolTable(raw_ostream &OS) { + for (auto Data : Obj.LinkEdit.IndirectSymbols) + OS.write(reinterpret_cast<const char *>(&Data), + sizeof(yaml::Hex32::BaseType)); +} + class UniversalWriter { public: UniversalWriter(yaml::YamlObjectFile &ObjectFile) diff --git a/contrib/llvm-project/llvm/lib/ObjectYAML/MachOYAML.cpp b/contrib/llvm-project/llvm/lib/ObjectYAML/MachOYAML.cpp index c9562bd72258..f32009458110 100644 --- a/contrib/llvm-project/llvm/lib/ObjectYAML/MachOYAML.cpp +++ b/contrib/llvm-project/llvm/lib/ObjectYAML/MachOYAML.cpp @@ -164,6 +164,7 @@ void MappingTraits<MachOYAML::LinkEditData>::mapping( IO.mapOptional("ExportTrie", LinkEditData.ExportTrie); IO.mapOptional("NameList", LinkEditData.NameList); IO.mapOptional("StringTable", LinkEditData.StringTable); + IO.mapOptional("IndirectSymbols", LinkEditData.IndirectSymbols); } void MappingTraits<MachOYAML::RebaseOpcode>::mapping( diff --git a/contrib/llvm-project/llvm/lib/Option/OptTable.cpp b/contrib/llvm-project/llvm/lib/Option/OptTable.cpp index 37c2fcbab181..19e05b9272bb 100644 --- a/contrib/llvm-project/llvm/lib/Option/OptTable.cpp +++ b/contrib/llvm-project/llvm/lib/Option/OptTable.cpp @@ -150,10 +150,9 @@ OptTable::OptTable(ArrayRef<Info> OptionInfos, bool IgnoreCase) for (StringSet<>::const_iterator I = PrefixesUnion.begin(), E = PrefixesUnion.end(); I != E; ++I) { StringRef Prefix = I->getKey(); - for (StringRef::const_iterator C = Prefix.begin(), CE = Prefix.end(); - C != CE; ++C) - if 
(!is_contained(PrefixChars, *C)) - PrefixChars.push_back(*C); + for (char C : Prefix) + if (!is_contained(PrefixChars, C)) + PrefixChars.push_back(C); } } diff --git a/contrib/llvm-project/llvm/lib/Passes/PassBuilderPipelines.cpp b/contrib/llvm-project/llvm/lib/Passes/PassBuilderPipelines.cpp index ac5dfdbdd540..de1b0ace7876 100644 --- a/contrib/llvm-project/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/contrib/llvm-project/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -1765,6 +1765,8 @@ ModulePassManager PassBuilder::buildO0DefaultPipeline(OptimizationLevel Level, if (LTOPreLink) addRequiredLTOPreLinkPasses(MPM); + MPM.addPass(createModuleToFunctionPassAdaptor(AnnotationRemarksPass())); + return MPM; } diff --git a/contrib/llvm-project/llvm/lib/Passes/StandardInstrumentations.cpp b/contrib/llvm-project/llvm/lib/Passes/StandardInstrumentations.cpp index 8e6be6730ea4..27a6c519ff82 100644 --- a/contrib/llvm-project/llvm/lib/Passes/StandardInstrumentations.cpp +++ b/contrib/llvm-project/llvm/lib/Passes/StandardInstrumentations.cpp @@ -225,8 +225,8 @@ std::string doSystemDiff(StringRef Before, StringRef After, return "Unable to read result."; // Clean up. - for (unsigned I = 0; I < NumFiles; ++I) { - std::error_code EC = sys::fs::remove(FileName[I]); + for (const std::string &I : FileName) { + std::error_code EC = sys::fs::remove(I); if (EC) return "Unable to remove temporary file."; } diff --git a/contrib/llvm-project/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp b/contrib/llvm-project/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp index 94bd4807041d..c6691e321b3c 100644 --- a/contrib/llvm-project/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp +++ b/contrib/llvm-project/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp @@ -83,7 +83,6 @@ Error RawCoverageReader::readIntMax(uint64_t &Result, uint64_t MaxPlus1) { Error RawCoverageReader::readSize(uint64_t &Result) { if (auto Err = readULEB128(Result)) return Err; - // Sanity check the number. 
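RawCoverageReader::readSize above decodes a ULEB128 length and, as the check on the next line shows, rejects anything larger than the remaining coverage data; only the comment was dropped. For reference, a small sketch of decoding such a length with LLVM's helper (the bytes are the classic DWARF example value):

    #include "llvm/Support/LEB128.h"
    #include <cstdint>
    #include <iostream>

    int main() {
      const uint8_t Buf[] = {0xE5, 0x8E, 0x26};       // ULEB128 encoding of 624485
      unsigned N = 0;
      uint64_t Size = llvm::decodeULEB128(Buf, &N);
      std::cout << Size << " decoded from " << N << " bytes\n";   // 624485 decoded from 3 bytes

      const uint64_t RemainingData = 128;             // bytes left in the mapping buffer
      if (Size > RemainingData)
        std::cout << "malformed coverage mapping\n";  // the rejection readSize performs
    }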
if (Result > Data.size()) return make_error<CoverageMapError>(coveragemap_error::malformed); return Error::success(); diff --git a/contrib/llvm-project/llvm/lib/ProfileData/InstrProf.cpp b/contrib/llvm-project/llvm/lib/ProfileData/InstrProf.cpp index 1168ad27fe52..ab3487ecffe8 100644 --- a/contrib/llvm-project/llvm/lib/ProfileData/InstrProf.cpp +++ b/contrib/llvm-project/llvm/lib/ProfileData/InstrProf.cpp @@ -657,19 +657,18 @@ void InstrProfValueSiteRecord::merge(InstrProfValueSiteRecord &Input, Input.sortByTargetValues(); auto I = ValueData.begin(); auto IE = ValueData.end(); - for (auto J = Input.ValueData.begin(), JE = Input.ValueData.end(); J != JE; - ++J) { - while (I != IE && I->Value < J->Value) + for (const InstrProfValueData &J : Input.ValueData) { + while (I != IE && I->Value < J.Value) ++I; - if (I != IE && I->Value == J->Value) { + if (I != IE && I->Value == J.Value) { bool Overflowed; - I->Count = SaturatingMultiplyAdd(J->Count, Weight, I->Count, &Overflowed); + I->Count = SaturatingMultiplyAdd(J.Count, Weight, I->Count, &Overflowed); if (Overflowed) Warn(instrprof_error::counter_overflow); ++I; continue; } - ValueData.insert(I, *J); + ValueData.insert(I, J); } } diff --git a/contrib/llvm-project/llvm/lib/ProfileData/InstrProfReader.cpp b/contrib/llvm-project/llvm/lib/ProfileData/InstrProfReader.cpp index b4e8025dbef9..885c1fe49240 100644 --- a/contrib/llvm-project/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/contrib/llvm-project/llvm/lib/ProfileData/InstrProfReader.cpp @@ -62,7 +62,6 @@ InstrProfReader::create(const Twine &Path) { Expected<std::unique_ptr<InstrProfReader>> InstrProfReader::create(std::unique_ptr<MemoryBuffer> Buffer) { - // Sanity check the buffer. if (uint64_t(Buffer->getBufferSize()) > std::numeric_limits<uint64_t>::max()) return make_error<InstrProfError>(instrprof_error::too_large); @@ -113,7 +112,6 @@ IndexedInstrProfReader::create(const Twine &Path, const Twine &RemappingPath) { Expected<std::unique_ptr<IndexedInstrProfReader>> IndexedInstrProfReader::create(std::unique_ptr<MemoryBuffer> Buffer, std::unique_ptr<MemoryBuffer> RemappingBuffer) { - // Sanity check the buffer. if (uint64_t(Buffer->getBufferSize()) > std::numeric_limits<uint64_t>::max()) return make_error<InstrProfError>(instrprof_error::too_large); diff --git a/contrib/llvm-project/llvm/lib/ProfileData/RawMemProfReader.cpp b/contrib/llvm-project/llvm/lib/ProfileData/RawMemProfReader.cpp new file mode 100644 index 000000000000..f8d13c74fac3 --- /dev/null +++ b/contrib/llvm-project/llvm/lib/ProfileData/RawMemProfReader.cpp @@ -0,0 +1,121 @@ +//===- RawMemProfReader.cpp - Instrumented memory profiling reader --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains support for reading MemProf profiling data. 
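A note on the InstrProfValueSiteRecord::merge hunk above (the MemProf reader listing continues below): both value-data lists are sorted by Value, matching entries are combined with a weighted, saturating multiply-add, and entries only present in the input are spliced in as-is. A standalone sketch of the same walk; the container and the saturating helper are simplified stand-ins for LLVM's own:

    #include <cstdint>
    #include <iostream>
    #include <limits>
    #include <vector>

    struct ValueData { uint64_t Value, Count; };      // illustrative stand-in

    // Count = A * W + B, clamped to UINT64_MAX (__uint128_t extension used for brevity).
    static uint64_t satMulAdd(uint64_t A, uint64_t W, uint64_t B) {
      __uint128_t R = (__uint128_t)A * W + B;
      uint64_t Max = std::numeric_limits<uint64_t>::max();
      return R > Max ? Max : (uint64_t)R;
    }

    // Merge Input into Dst; both are sorted by Value and stay that way.
    static void merge(std::vector<ValueData> &Dst,
                      const std::vector<ValueData> &Input, uint64_t Weight) {
      auto I = Dst.begin();
      for (const ValueData &J : Input) {
        while (I != Dst.end() && I->Value < J.Value)
          ++I;
        if (I != Dst.end() && I->Value == J.Value) {
          I->Count = satMulAdd(J.Count, Weight, I->Count);
          ++I;
          continue;
        }
        I = Dst.insert(I, J);   // re-acquire I: std::vector::insert invalidates iterators
        ++I;
      }
    }

    int main() {
      std::vector<ValueData> Dst = {{10, 5}, {30, 1}};
      merge(Dst, {{10, 2}, {20, 4}}, /*Weight=*/3);
      for (const ValueData &VD : Dst)
        std::cout << VD.Value << ": " << VD.Count << '\n';   // 10: 11, 20: 4, 30: 1
    }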
+// +//===----------------------------------------------------------------------===// + +#include <cstdint> +#include <type_traits> + +#include "llvm/ProfileData/InstrProf.h" +#include "llvm/ProfileData/MemProfData.inc" +#include "llvm/ProfileData/RawMemProfReader.h" + +namespace llvm { +namespace memprof { +namespace { + +struct Summary { + uint64_t Version; + uint64_t TotalSizeBytes; + uint64_t NumSegments; + uint64_t NumMIBInfo; + uint64_t NumStackOffsets; +}; + +template <class T = uint64_t> inline T alignedRead(const char *Ptr) { + static_assert(std::is_pod<T>::value, "Not a pod type."); + assert(reinterpret_cast<size_t>(Ptr) % sizeof(T) == 0 && "Unaligned Read"); + return *reinterpret_cast<const T *>(Ptr); +} + +Summary computeSummary(const char *Start) { + auto *H = reinterpret_cast<const Header *>(Start); + + // Check alignment while reading the number of items in each section. + return Summary{ + H->Version, + H->TotalSize, + alignedRead(Start + H->SegmentOffset), + alignedRead(Start + H->MIBOffset), + alignedRead(Start + H->StackOffset), + }; +} + +} // namespace + +Expected<std::unique_ptr<RawMemProfReader>> +RawMemProfReader::create(const Twine &Path) { + auto BufferOr = MemoryBuffer::getFileOrSTDIN(Path, /*IsText=*/true); + if (std::error_code EC = BufferOr.getError()) + return errorCodeToError(EC); + + std::unique_ptr<MemoryBuffer> Buffer(BufferOr.get().release()); + + if (Buffer->getBufferSize() == 0) + return make_error<InstrProfError>(instrprof_error::empty_raw_profile); + + if (!RawMemProfReader::hasFormat(*Buffer)) + return make_error<InstrProfError>(instrprof_error::bad_magic); + + if (Buffer->getBufferSize() < sizeof(Header)) { + return make_error<InstrProfError>(instrprof_error::truncated); + } + + // The size of the buffer can be > header total size since we allow repeated + // serialization of memprof profiles to the same file. + uint64_t TotalSize = 0; + const char *Next = Buffer->getBufferStart(); + while (Next < Buffer->getBufferEnd()) { + auto *H = reinterpret_cast<const Header *>(Next); + if (H->Version != MEMPROF_RAW_VERSION) { + return make_error<InstrProfError>(instrprof_error::unsupported_version); + } + + TotalSize += H->TotalSize; + Next += H->TotalSize; + } + + if (Buffer->getBufferSize() != TotalSize) { + return make_error<InstrProfError>(instrprof_error::malformed); + } + + return std::make_unique<RawMemProfReader>(std::move(Buffer)); +} + +bool RawMemProfReader::hasFormat(const MemoryBuffer &Buffer) { + if (Buffer.getBufferSize() < sizeof(uint64_t)) + return false; + // Aligned read to sanity check that the buffer was allocated with at least 8b + // alignment. + const uint64_t Magic = alignedRead(Buffer.getBufferStart()); + return Magic == MEMPROF_RAW_MAGIC_64; +} + +void RawMemProfReader::printSummaries(raw_ostream &OS) const { + int Count = 0; + const char *Next = DataBuffer->getBufferStart(); + while (Next < DataBuffer->getBufferEnd()) { + auto Summary = computeSummary(Next); + OS << "MemProf Profile " << ++Count << "\n"; + OS << " Version: " << Summary.Version << "\n"; + OS << " TotalSizeBytes: " << Summary.TotalSizeBytes << "\n"; + OS << " NumSegments: " << Summary.NumSegments << "\n"; + OS << " NumMIBInfo: " << Summary.NumMIBInfo << "\n"; + OS << " NumStackOffsets: " << Summary.NumStackOffsets << "\n"; + // TODO: Print the build ids once we can record them using the + // sanitizer_procmaps library for linux. 
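RawMemProfReader::create above accepts a buffer holding one or more raw profiles serialized back to back and requires the chained TotalSize fields to cover the buffer exactly. A standalone sketch of that walk; the header layout is an illustrative stand-in for the real one in MemProfData.inc:

    #include <cstdint>
    #include <cstring>
    #include <iostream>
    #include <vector>

    // Only the fields the walk needs are modelled here.
    struct FakeHeader {
      uint64_t Magic;
      uint64_t Version;
      uint64_t TotalSize;   // size of this profile, header included
    };

    static bool coversBufferExactly(const char *Start, uint64_t BufferSize) {
      uint64_t Total = 0;
      const char *Next = Start;
      while (Next < Start + BufferSize) {
        FakeHeader H;
        std::memcpy(&H, Next, sizeof(H));   // the real reader does aligned in-place reads
        if (H.TotalSize == 0)
          return false;                     // guard not in the original; avoids looping forever
        Total += H.TotalSize;
        Next += H.TotalSize;
      }
      return Total == BufferSize;           // anything else is a malformed profile
    }

    int main() {
      std::vector<char> Buf(96, 0);         // two fake profiles: 64 + 32 bytes
      FakeHeader H1{0xABCD, 1, 64}, H2{0xABCD, 1, 32};
      std::memcpy(Buf.data(), &H1, sizeof(H1));
      std::memcpy(Buf.data() + 64, &H2, sizeof(H2));
      std::cout << coversBufferExactly(Buf.data(), Buf.size()) << '\n';   // 1
    }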
+ + auto *H = reinterpret_cast<const Header *>(Next); + Next += H->TotalSize; + } +} + +} // namespace memprof +} // namespace llvm diff --git a/contrib/llvm-project/llvm/lib/ProfileData/SampleProfReader.cpp b/contrib/llvm-project/llvm/lib/ProfileData/SampleProfReader.cpp index c99a19020511..eefb7c2ba627 100644 --- a/contrib/llvm-project/llvm/lib/ProfileData/SampleProfReader.cpp +++ b/contrib/llvm-project/llvm/lib/ProfileData/SampleProfReader.cpp @@ -1709,7 +1709,7 @@ setupMemoryBuffer(const Twine &Filename) { return EC; auto Buffer = std::move(BufferOrErr.get()); - // Sanity check the file. + // Check the file. if (uint64_t(Buffer->getBufferSize()) > std::numeric_limits<uint32_t>::max()) return sampleprof_error::too_large; diff --git a/contrib/llvm-project/llvm/lib/Support/AArch64TargetParser.cpp b/contrib/llvm-project/llvm/lib/Support/AArch64TargetParser.cpp index b3136a91e7f5..a3e41ccd199c 100644 --- a/contrib/llvm-project/llvm/lib/Support/AArch64TargetParser.cpp +++ b/contrib/llvm-project/llvm/lib/Support/AArch64TargetParser.cpp @@ -240,52 +240,4 @@ AArch64::ArchKind AArch64::parseCPUArch(StringRef CPU) { return C.ArchID; } return ArchKind::INVALID; -} - -// Parse a branch protection specification, which has the form -// standard | none | [bti,pac-ret[+b-key,+leaf]*] -// Returns true on success, with individual elements of the specification -// returned in `PBP`. Returns false in error, with `Err` containing -// an erroneous part of the spec. -bool AArch64::parseBranchProtection(StringRef Spec, ParsedBranchProtection &PBP, - StringRef &Err) { - PBP = {"none", "a_key", false}; - if (Spec == "none") - return true; // defaults are ok - - if (Spec == "standard") { - PBP.Scope = "non-leaf"; - PBP.BranchTargetEnforcement = true; - return true; - } - - SmallVector<StringRef, 4> Opts; - Spec.split(Opts, "+"); - for (int I = 0, E = Opts.size(); I != E; ++I) { - StringRef Opt = Opts[I].trim(); - if (Opt == "bti") { - PBP.BranchTargetEnforcement = true; - continue; - } - if (Opt == "pac-ret") { - PBP.Scope = "non-leaf"; - for (; I + 1 != E; ++I) { - StringRef PACOpt = Opts[I + 1].trim(); - if (PACOpt == "leaf") - PBP.Scope = "all"; - else if (PACOpt == "b-key") - PBP.Key = "b_key"; - else - break; - } - continue; - } - if (Opt == "") - Err = "<empty>"; - else - Err = Opt; - return false; - } - - return true; -} +}
\ No newline at end of file diff --git a/contrib/llvm-project/llvm/lib/Support/ARMAttributeParser.cpp b/contrib/llvm-project/llvm/lib/Support/ARMAttributeParser.cpp index 459691923af8..241cfb1eedbe 100644 --- a/contrib/llvm-project/llvm/lib/Support/ARMAttributeParser.cpp +++ b/contrib/llvm-project/llvm/lib/Support/ARMAttributeParser.cpp @@ -59,6 +59,10 @@ const ARMAttributeParser::DisplayHandler ARMAttributeParser::displayRoutines[] = ATTRIBUTE_HANDLER(DSP_extension), ATTRIBUTE_HANDLER(T2EE_use), ATTRIBUTE_HANDLER(Virtualization_use), + ATTRIBUTE_HANDLER(PAC_extension), + ATTRIBUTE_HANDLER(BTI_extension), + ATTRIBUTE_HANDLER(PACRET_use), + ATTRIBUTE_HANDLER(BTI_use), ATTRIBUTE_HANDLER(nodefaults), }; @@ -350,6 +354,28 @@ Error ARMAttributeParser::Virtualization_use(AttrType tag) { return parseStringAttribute("Virtualization_use", tag, makeArrayRef(strings)); } +Error ARMAttributeParser::PAC_extension(ARMBuildAttrs::AttrType tag) { + static const char *strings[] = {"Not Permitted", "Permitted in NOP space", + "Permitted"}; + return parseStringAttribute("PAC_extension", tag, makeArrayRef(strings)); +} + +Error ARMAttributeParser::BTI_extension(ARMBuildAttrs::AttrType tag) { + static const char *strings[] = {"Not Permitted", "Permitted in NOP space", + "Permitted"}; + return parseStringAttribute("BTI_extension", tag, makeArrayRef(strings)); +} + +Error ARMAttributeParser::PACRET_use(ARMBuildAttrs::AttrType tag) { + static const char *strings[] = {"Not Used", "Used"}; + return parseStringAttribute("PACRET_use", tag, makeArrayRef(strings)); +} + +Error ARMAttributeParser::BTI_use(ARMBuildAttrs::AttrType tag) { + static const char *strings[] = {"Not Used", "Used"}; + return parseStringAttribute("BTI_use", tag, makeArrayRef(strings)); +} + Error ARMAttributeParser::nodefaults(AttrType tag) { uint64_t value = de.getULEB128(cursor); printAttribute(tag, value, "Unspecified Tags UNDEFINED"); diff --git a/contrib/llvm-project/llvm/lib/Support/ARMBuildAttrs.cpp b/contrib/llvm-project/llvm/lib/Support/ARMBuildAttrs.cpp index f20521f2a2d4..815cfc62a4b0 100644 --- a/contrib/llvm-project/llvm/lib/Support/ARMBuildAttrs.cpp +++ b/contrib/llvm-project/llvm/lib/Support/ARMBuildAttrs.cpp @@ -50,6 +50,10 @@ static const TagNameItem tagData[] = { {ARMBuildAttrs::MPextension_use, "Tag_MPextension_use"}, {ARMBuildAttrs::DIV_use, "Tag_DIV_use"}, {ARMBuildAttrs::DSP_extension, "Tag_DSP_extension"}, + {ARMBuildAttrs::PAC_extension, "Tag_PAC_extension"}, + {ARMBuildAttrs::BTI_extension, "Tag_BTI_extension"}, + {ARMBuildAttrs::BTI_use, "Tag_BTI_use"}, + {ARMBuildAttrs::PACRET_use, "Tag_PACRET_use"}, {ARMBuildAttrs::nodefaults, "Tag_nodefaults"}, {ARMBuildAttrs::also_compatible_with, "Tag_also_compatible_with"}, {ARMBuildAttrs::T2EE_use, "Tag_T2EE_use"}, diff --git a/contrib/llvm-project/llvm/lib/Support/CommandLine.cpp b/contrib/llvm-project/llvm/lib/Support/CommandLine.cpp index e64934aa90cc..5b7004c86f5a 100644 --- a/contrib/llvm-project/llvm/lib/Support/CommandLine.cpp +++ b/contrib/llvm-project/llvm/lib/Support/CommandLine.cpp @@ -2656,10 +2656,13 @@ cl::getRegisteredSubcommands() { void cl::HideUnrelatedOptions(cl::OptionCategory &Category, SubCommand &Sub) { initCommonOptions(); for (auto &I : Sub.OptionsMap) { + bool Unrelated = true; for (auto &Cat : I.second->Categories) { - if (Cat != &Category && Cat != &CommonOptions->GenericCategory) - I.second->setHiddenFlag(cl::ReallyHidden); + if (Cat == &Category || Cat == &CommonOptions->GenericCategory) + Unrelated = false; } + if (Unrelated) + 
I.second->setHiddenFlag(cl::ReallyHidden); } } @@ -2667,11 +2670,14 @@ void cl::HideUnrelatedOptions(ArrayRef<const cl::OptionCategory *> Categories, SubCommand &Sub) { initCommonOptions(); for (auto &I : Sub.OptionsMap) { + bool Unrelated = true; for (auto &Cat : I.second->Categories) { - if (!is_contained(Categories, Cat) && - Cat != &CommonOptions->GenericCategory) - I.second->setHiddenFlag(cl::ReallyHidden); + if (is_contained(Categories, Cat) || + Cat == &CommonOptions->GenericCategory) + Unrelated = false; } + if (Unrelated) + I.second->setHiddenFlag(cl::ReallyHidden); } } diff --git a/contrib/llvm-project/llvm/lib/Support/HTTPClient.cpp b/contrib/llvm-project/llvm/lib/Support/HTTPClient.cpp new file mode 100644 index 000000000000..68ba56d1fe50 --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Support/HTTPClient.cpp @@ -0,0 +1,97 @@ +//===-- llvm/Support/HTTPClient.cpp - HTTP client library -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// +/// This file defines the methods of the HTTPRequest, HTTPClient, and +/// BufferedHTTPResponseHandler classes. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/HTTPClient.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/MemoryBuffer.h" + +using namespace llvm; + +HTTPRequest::HTTPRequest(StringRef Url) { this->Url = Url.str(); } + +bool operator==(const HTTPRequest &A, const HTTPRequest &B) { + return A.Url == B.Url && A.Method == B.Method && + A.FollowRedirects == B.FollowRedirects; +} + +HTTPResponseHandler::~HTTPResponseHandler() = default; + +static inline bool parseContentLengthHeader(StringRef LineRef, + size_t &ContentLength) { + // Content-Length is a mandatory header, and the only one we handle. + return LineRef.consume_front("Content-Length: ") && + to_integer(LineRef.trim(), ContentLength, 10); +} + +Error BufferedHTTPResponseHandler::handleHeaderLine(StringRef HeaderLine) { + if (ResponseBuffer.Body) + return Error::success(); + + size_t ContentLength; + if (parseContentLengthHeader(HeaderLine, ContentLength)) + ResponseBuffer.Body = + WritableMemoryBuffer::getNewUninitMemBuffer(ContentLength); + + return Error::success(); +} + +Error BufferedHTTPResponseHandler::handleBodyChunk(StringRef BodyChunk) { + if (!ResponseBuffer.Body) + return createStringError(errc::io_error, + "Unallocated response buffer. 
HTTP Body data " + "received before Content-Length header."); + if (Offset + BodyChunk.size() > ResponseBuffer.Body->getBufferSize()) + return createStringError(errc::io_error, + "Content size exceeds buffer size."); + memcpy(ResponseBuffer.Body->getBufferStart() + Offset, BodyChunk.data(), + BodyChunk.size()); + Offset += BodyChunk.size(); + return Error::success(); +} + +Error BufferedHTTPResponseHandler::handleStatusCode(unsigned Code) { + ResponseBuffer.Code = Code; + return Error::success(); +} + +Expected<HTTPResponseBuffer> HTTPClient::perform(const HTTPRequest &Request) { + BufferedHTTPResponseHandler Handler; + if (Error Err = perform(Request, Handler)) + return std::move(Err); + return std::move(Handler.ResponseBuffer); +} + +Expected<HTTPResponseBuffer> HTTPClient::get(StringRef Url) { + HTTPRequest Request(Url); + return perform(Request); +} + +HTTPClient::HTTPClient() = default; + +HTTPClient::~HTTPClient() = default; + +bool HTTPClient::isAvailable() { return false; } + +void HTTPClient::cleanup() {} + +void HTTPClient::setTimeout(std::chrono::milliseconds Timeout) {} + +Error HTTPClient::perform(const HTTPRequest &Request, + HTTPResponseHandler &Handler) { + llvm_unreachable("No HTTP Client implementation available."); +} diff --git a/contrib/llvm-project/llvm/lib/Support/KnownBits.cpp b/contrib/llvm-project/llvm/lib/Support/KnownBits.cpp index 90483817c302..554e3248524c 100644 --- a/contrib/llvm-project/llvm/lib/Support/KnownBits.cpp +++ b/contrib/llvm-project/llvm/lib/Support/KnownBits.cpp @@ -421,11 +421,10 @@ KnownBits KnownBits::mul(const KnownBits &LHS, const KnownBits &RHS, "Self multiplication knownbits mismatch"); // Compute a conservative estimate for high known-0 bits. - unsigned LeadZ = - std::max(LHS.countMinLeadingZeros() + RHS.countMinLeadingZeros(), - BitWidth) - - BitWidth; - LeadZ = std::min(LeadZ, BitWidth); + unsigned LHSLeadZ = LHS.countMinLeadingZeros(); + unsigned RHSLeadZ = RHS.countMinLeadingZeros(); + unsigned LeadZ = std::max(LHSLeadZ + RHSLeadZ, BitWidth) - BitWidth; + assert(LeadZ <= BitWidth && "More zeros than bits?"); // The result of the bottom bits of an integer multiply can be // inferred by looking at the bottom bits of both operands and diff --git a/contrib/llvm-project/llvm/lib/Support/Regex.cpp b/contrib/llvm-project/llvm/lib/Support/Regex.cpp index 0d5cc1c00db1..7a804a1a2297 100644 --- a/contrib/llvm-project/llvm/lib/Support/Regex.cpp +++ b/contrib/llvm-project/llvm/lib/Support/Regex.cpp @@ -218,10 +218,10 @@ bool Regex::isLiteralERE(StringRef Str) { std::string Regex::escape(StringRef String) { std::string RegexStr; - for (unsigned i = 0, e = String.size(); i != e; ++i) { - if (strchr(RegexMetachars, String[i])) + for (char C : String) { + if (strchr(RegexMetachars, C)) RegexStr += '\\'; - RegexStr += String[i]; + RegexStr += C; } return RegexStr; diff --git a/contrib/llvm-project/llvm/lib/Support/StringExtras.cpp b/contrib/llvm-project/llvm/lib/Support/StringExtras.cpp index 8abf9f7ce0f1..5683d7005584 100644 --- a/contrib/llvm-project/llvm/lib/Support/StringExtras.cpp +++ b/contrib/llvm-project/llvm/lib/Support/StringExtras.cpp @@ -60,8 +60,7 @@ void llvm::SplitString(StringRef Source, } void llvm::printEscapedString(StringRef Name, raw_ostream &Out) { - for (unsigned i = 0, e = Name.size(); i != e; ++i) { - unsigned char C = Name[i]; + for (unsigned char C : Name) { if (C == '\\') Out << '\\' << C; else if (isPrint(C) && C != '"') diff --git a/contrib/llvm-project/llvm/lib/Support/StringRef.cpp 
b/contrib/llvm-project/llvm/lib/Support/StringRef.cpp index c532a1abe906..652303fdb6a0 100644 --- a/contrib/llvm-project/llvm/lib/Support/StringRef.cpp +++ b/contrib/llvm-project/llvm/lib/Support/StringRef.cpp @@ -227,8 +227,8 @@ size_t StringRef::rfind_insensitive(StringRef Str) const { StringRef::size_type StringRef::find_first_of(StringRef Chars, size_t From) const { std::bitset<1 << CHAR_BIT> CharBits; - for (size_type i = 0; i != Chars.size(); ++i) - CharBits.set((unsigned char)Chars[i]); + for (char C : Chars) + CharBits.set((unsigned char)C); for (size_type i = std::min(From, Length), e = Length; i != e; ++i) if (CharBits.test((unsigned char)Data[i])) @@ -252,8 +252,8 @@ StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const { StringRef::size_type StringRef::find_first_not_of(StringRef Chars, size_t From) const { std::bitset<1 << CHAR_BIT> CharBits; - for (size_type i = 0; i != Chars.size(); ++i) - CharBits.set((unsigned char)Chars[i]); + for (char C : Chars) + CharBits.set((unsigned char)C); for (size_type i = std::min(From, Length), e = Length; i != e; ++i) if (!CharBits.test((unsigned char)Data[i])) @@ -268,8 +268,8 @@ StringRef::size_type StringRef::find_first_not_of(StringRef Chars, StringRef::size_type StringRef::find_last_of(StringRef Chars, size_t From) const { std::bitset<1 << CHAR_BIT> CharBits; - for (size_type i = 0; i != Chars.size(); ++i) - CharBits.set((unsigned char)Chars[i]); + for (char C : Chars) + CharBits.set((unsigned char)C); for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) if (CharBits.test((unsigned char)Data[i])) @@ -293,8 +293,8 @@ StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const { StringRef::size_type StringRef::find_last_not_of(StringRef Chars, size_t From) const { std::bitset<1 << CHAR_BIT> CharBits; - for (size_type i = 0, e = Chars.size(); i != e; ++i) - CharBits.set((unsigned char)Chars[i]); + for (char C : Chars) + CharBits.set((unsigned char)C); for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) if (!CharBits.test((unsigned char)Data[i])) diff --git a/contrib/llvm-project/llvm/lib/Support/TargetParser.cpp b/contrib/llvm-project/llvm/lib/Support/TargetParser.cpp index 1dadce4b9040..4acc23dd455b 100644 --- a/contrib/llvm-project/llvm/lib/Support/TargetParser.cpp +++ b/contrib/llvm-project/llvm/lib/Support/TargetParser.cpp @@ -333,3 +333,51 @@ bool getCPUFeaturesExceptStdExt(CPUKind Kind, } // namespace RISCV } // namespace llvm + +// Parse a branch protection specification, which has the form +// standard | none | [bti,pac-ret[+b-key,+leaf]*] +// Returns true on success, with individual elements of the specification +// returned in `PBP`. Returns false in error, with `Err` containing +// an erroneous part of the spec. 
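The grammar documented above and implemented in the hunk that follows maps a branch-protection string onto a scope, a signing key, and a BTI flag. A usage sketch with a few representative specs; the signature and field names come straight from the hunk, while the header spelling and the exact namespace of ParsedBranchProtection are assumptions about this tree:

    #include "llvm/Support/TargetParser.h"   // assumed declaration point of ARM::parseBranchProtection
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    static void demo(StringRef Spec) {
      ARM::ParsedBranchProtection PBP;       // Scope / Key / BranchTargetEnforcement
      StringRef Err;
      if (!ARM::parseBranchProtection(Spec, PBP, Err)) {
        errs() << Spec << ": invalid part '" << Err << "'\n";
        return;
      }
      errs() << Spec << ": Scope=" << PBP.Scope << " Key=" << PBP.Key
             << " BTI=" << PBP.BranchTargetEnforcement << '\n';
    }

    int main() {
      demo("none");               // Scope=none     Key=a_key BTI=0 (the defaults)
      demo("standard");           // Scope=non-leaf Key=a_key BTI=1
      demo("bti+pac-ret");        // Scope=non-leaf Key=a_key BTI=1
      demo("pac-ret+leaf+b-key"); // Scope=all      Key=b_key BTI=0
      demo("pac-ret+retaa");      // rejected: "retaa" is not part of this grammar
      return 0;
    }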
+bool ARM::parseBranchProtection(StringRef Spec, ParsedBranchProtection &PBP, + StringRef &Err) { + PBP = {"none", "a_key", false}; + if (Spec == "none") + return true; // defaults are ok + + if (Spec == "standard") { + PBP.Scope = "non-leaf"; + PBP.BranchTargetEnforcement = true; + return true; + } + + SmallVector<StringRef, 4> Opts; + Spec.split(Opts, "+"); + for (int I = 0, E = Opts.size(); I != E; ++I) { + StringRef Opt = Opts[I].trim(); + if (Opt == "bti") { + PBP.BranchTargetEnforcement = true; + continue; + } + if (Opt == "pac-ret") { + PBP.Scope = "non-leaf"; + for (; I + 1 != E; ++I) { + StringRef PACOpt = Opts[I + 1].trim(); + if (PACOpt == "leaf") + PBP.Scope = "all"; + else if (PACOpt == "b-key") + PBP.Key = "b_key"; + else + break; + } + continue; + } + if (Opt == "") + Err = "<empty>"; + else + Err = Opt; + return false; + } + + return true; +} diff --git a/contrib/llvm-project/llvm/lib/Support/ThreadPool.cpp b/contrib/llvm-project/llvm/lib/Support/ThreadPool.cpp index 81926d8071b2..c11e16d3cf98 100644 --- a/contrib/llvm-project/llvm/lib/Support/ThreadPool.cpp +++ b/contrib/llvm-project/llvm/lib/Support/ThreadPool.cpp @@ -29,7 +29,7 @@ ThreadPool::ThreadPool(ThreadPoolStrategy S) Threads.emplace_back([S, ThreadID, this] { S.apply_thread_strategy(ThreadID); while (true) { - PackagedTaskTy Task; + std::function<void()> Task; { std::unique_lock<std::mutex> LockGuard(QueueLock); // Wait for tasks to be pushed in the queue @@ -80,23 +80,6 @@ bool ThreadPool::isWorkerThread() const { return false; } -std::shared_future<void> ThreadPool::asyncImpl(TaskTy Task) { - /// Wrap the Task in a packaged_task to return a future object. - PackagedTaskTy PackagedTask(std::move(Task)); - auto Future = PackagedTask.get_future(); - { - // Lock the queue and push the new task - std::unique_lock<std::mutex> LockGuard(QueueLock); - - // Don't allow enqueueing after disabling the pool - assert(EnableFlag && "Queuing a thread during ThreadPool destruction"); - - Tasks.push(std::move(PackagedTask)); - } - QueueCondition.notify_one(); - return Future.share(); -} - // The destructor joins all threads, waiting for completion. ThreadPool::~ThreadPool() { { @@ -128,16 +111,6 @@ void ThreadPool::wait() { } } -std::shared_future<void> ThreadPool::asyncImpl(TaskTy Task) { - // Get a Future with launch::deferred execution using std::async - auto Future = std::async(std::launch::deferred, std::move(Task)).share(); - // Wrap the future so that both ThreadPool::wait() can operate and the - // returned future can be sync'ed on. 
- PackagedTaskTy PackagedTask([Future]() { Future.get(); }); - Tasks.push(std::move(PackagedTask)); - return Future; -} - ThreadPool::~ThreadPool() { wait(); } #endif diff --git a/contrib/llvm-project/llvm/lib/TableGen/TGLexer.cpp b/contrib/llvm-project/llvm/lib/TableGen/TGLexer.cpp index 2acac63ce843..25079fe33edb 100644 --- a/contrib/llvm-project/llvm/lib/TableGen/TGLexer.cpp +++ b/contrib/llvm-project/llvm/lib/TableGen/TGLexer.cpp @@ -1017,12 +1017,10 @@ void TGLexer::prepSkipToLineEnd() { } bool TGLexer::prepIsProcessingEnabled() { - for (auto I = PrepIncludeStack.back()->rbegin(), - E = PrepIncludeStack.back()->rend(); - I != E; ++I) { - if (!I->IsDefined) + for (const PreprocessorControlDesc &I : + llvm::reverse(*PrepIncludeStack.back())) + if (!I.IsDefined) return false; - } return true; } diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index 9f527a17d390..aeebb49675b2 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -818,18 +818,9 @@ void AArch64AsmPrinter::emitJumpTableInfo() { const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); if (JT.empty()) return; - const Function &F = MF->getFunction(); const TargetLoweringObjectFile &TLOF = getObjFileLowering(); - bool JTInDiffSection = - !STI->isTargetCOFF() || - !TLOF.shouldPutJumpTableInFunctionSection( - MJTI->getEntryKind() == MachineJumpTableInfo::EK_LabelDifference32, - F); - if (JTInDiffSection) { - // Drop it in the readonly section. - MCSection *ReadOnlySec = TLOF.getSectionForJumpTable(F, TM); - OutStreamer->SwitchSection(ReadOnlySec); - } + MCSection *ReadOnlySec = TLOF.getSectionForJumpTable(MF->getFunction(), TM); + OutStreamer->SwitchSection(ReadOnlySec); auto AFI = MF->getInfo<AArch64FunctionInfo>(); for (unsigned JTI = 0, e = JT.size(); JTI != e; ++JTI) { diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp index 533ab3b05de9..ff4a4dfc1b95 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp @@ -88,12 +88,9 @@ MachineInstr *AArch64CondBrTuning::convertToFlagSetting(MachineInstr &MI, // If this is already the flag setting version of the instruction (e.g., SUBS) // just make sure the implicit-def of NZCV isn't marked dead. 
if (IsFlagSetting) { - for (unsigned I = MI.getNumExplicitOperands(), E = MI.getNumOperands(); - I != E; ++I) { - MachineOperand &MO = MI.getOperand(I); + for (MachineOperand &MO : MI.implicit_operands()) if (MO.isReg() && MO.isDead() && MO.getReg() == AArch64::NZCV) MO.setIsDead(false); - } return &MI; } bool Is64Bit; @@ -104,8 +101,8 @@ MachineInstr *AArch64CondBrTuning::convertToFlagSetting(MachineInstr &MI, MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(NewOpc), NewDestReg); - for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) - MIB.add(MI.getOperand(I)); + for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) + MIB.add(MO); return MIB; } diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 4c04e04a7d3c..ee6e670fe3cd 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -102,9 +102,8 @@ INITIALIZE_PASS(AArch64ExpandPseudo, "aarch64-expand-pseudo", static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI, MachineInstrBuilder &DefMI) { const MCInstrDesc &Desc = OldMI.getDesc(); - for (unsigned i = Desc.getNumOperands(), e = OldMI.getNumOperands(); i != e; - ++i) { - const MachineOperand &MO = OldMI.getOperand(i); + for (const MachineOperand &MO : + llvm::drop_begin(OldMI.operands(), Desc.getNumOperands())) { assert(MO.isReg() && MO.getReg()); if (MO.isUse()) UseMI.add(MO); @@ -733,8 +732,9 @@ bool AArch64ExpandPseudo::expandCALL_RVMARKER( MOP.getReg(), /*Def=*/false, /*Implicit=*/true)); RegMaskStartIdx++; } - for (; RegMaskStartIdx < MI.getNumOperands(); ++RegMaskStartIdx) - OriginalCall->addOperand(MI.getOperand(RegMaskStartIdx)); + for (const MachineOperand &MO : + llvm::drop_begin(MI.operands(), RegMaskStartIdx)) + OriginalCall->addOperand(MO); auto *Marker = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXrs)) .addReg(AArch64::FP, RegState::Define) diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 6e9e61c8e7ac..72461aa1f772 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -890,7 +890,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::ABS); setTargetDAGCombine(ISD::SUB); - setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::XOR); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); @@ -930,6 +929,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::VECREDUCE_ADD); setTargetDAGCombine(ISD::STEP_VECTOR); + setTargetDAGCombine(ISD::FP_EXTEND); + setTargetDAGCombine(ISD::GlobalAddress); // In case of strict alignment, avoid an excessive number of byte wide stores. 
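Several AArch64 hunks above (the convertToFlagSetting and transferImpOps loops, and expandCALL_RVMARKER) swap manual index loops for llvm::drop_begin, which adapts any range so iteration skips its first N elements. A small standalone illustration:

    #include "llvm/ADT/STLExtras.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    int main() {
      SmallVector<int, 8> Operands = {100, 101, 102, 103, 104};

      // Skip just the first element (drop_begin's default)...
      for (int Op : drop_begin(Operands))
        errs() << Op << ' ';                 // 101 102 103 104
      errs() << '\n';

      // ...or skip a computed prefix, the way transferImpOps skips the
      // Desc.getNumOperands() explicit operands.
      unsigned NumExplicit = 3;
      for (int Op : drop_begin(Operands, NumExplicit))
        errs() << Op << ' ';                 // 103 104
      errs() << '\n';
      return 0;
    }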
@@ -1323,6 +1324,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::MGATHER, VT, Custom); setOperationAction(ISD::MSCATTER, VT, Custom); setOperationAction(ISD::MLOAD, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); } setOperationAction(ISD::SPLAT_VECTOR, MVT::nxv8bf16, Custom); @@ -1504,6 +1506,24 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT) { } } +bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT, + EVT OpVT) const { + // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo). + if (!Subtarget->hasSVE()) + return true; + + // We can only support legal predicate result types. + if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 && + ResVT != MVT::nxv16i1) + return true; + + // The whilelo instruction only works with i32 or i64 scalar inputs. + if (OpVT != MVT::i32 && OpVT != MVT::i64) + return true; + + return false; +} + void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); @@ -1528,7 +1548,7 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { setCondCodeAction(ISD::SETUNE, VT, Expand); } - // Mark integer truncating stores as having custom lowering + // Mark integer truncating stores/extending loads as having custom lowering if (VT.isInteger()) { MVT InnerVT = VT.changeVectorElementType(MVT::i8); while (InnerVT != VT) { @@ -1540,6 +1560,18 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { } } + // Mark floating-point truncating stores/extending loads as having custom + // lowering + if (VT.isFloatingPoint()) { + MVT InnerVT = VT.changeVectorElementType(MVT::f16); + while (InnerVT != VT) { + setTruncStoreAction(VT, InnerVT, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Custom); + InnerVT = InnerVT.changeVectorElementType( + MVT::getFloatingPointVT(2 * InnerVT.getScalarSizeInBits())); + } + } + // Lower fixed length vector operations to scalable equivalents. setOperationAction(ISD::ABS, VT, Custom); setOperationAction(ISD::ADD, VT, Custom); @@ -1950,6 +1982,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::UDIV_PRED) MAKE_CASE(AArch64ISD::UMAX_PRED) MAKE_CASE(AArch64ISD::UMIN_PRED) + MAKE_CASE(AArch64ISD::SRAD_MERGE_OP1) MAKE_CASE(AArch64ISD::FNEG_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU) @@ -2316,6 +2349,8 @@ static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V); static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V); static SDValue convertFixedMaskToScalableVector(SDValue Mask, SelectionDAG &DAG); +static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL, + EVT VT); /// isZerosVector - Check whether SDNode N is a zero-filled vector. 
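For context on the shouldExpandGetActiveLaneMask hook above and the direct whilelo selection that follows, here is a scalar model of what llvm.get.active.lane.mask(base, n) computes. The helper name is invented for illustration and the code is not part of the patch; SVE's WHILELO produces this predicate in a single instruction, which is why the hook only asks for expansion when the result is not a legal SVE predicate type or the scalar inputs are not i32/i64.

#include <cassert>
#include <cstdint>
#include <vector>

// Lane i of get.active.lane.mask(Base, N) is true iff Base + i < N, with the
// comparison treated as unsigned and the addition assumed not to wrap.
static std::vector<bool> activeLaneMask(uint64_t Base, uint64_t N,
                                        unsigned NumLanes) {
  std::vector<bool> Mask(NumLanes);
  for (unsigned I = 0; I != NumLanes; ++I)
    Mask[I] = Base + I < N;
  return Mask;
}

int main() {
  // With a trip count of 10 and four lanes, the final iteration starting at
  // Base = 8 enables only the first two lanes.
  std::vector<bool> Tail = activeLaneMask(8, 10, 4);
  assert(Tail[0] && Tail[1] && !Tail[2] && !Tail[3]);
  return 0;
}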
static bool isZerosVector(const SDNode *N) { @@ -4288,6 +4323,12 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); } + case Intrinsic::get_active_lane_mask: { + SDValue ID = + DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), ID, + Op.getOperand(1), Op.getOperand(2)); + } } } @@ -4506,7 +4547,7 @@ SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op, } InputVT = DAG.getValueType(MemVT.changeTypeToInteger()); Mask = DAG.getNode( - ISD::ZERO_EXTEND, DL, + ISD::SIGN_EXTEND, DL, VT.changeVectorElementType(IndexVT.getVectorElementType()), Mask); } @@ -4618,7 +4659,7 @@ SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op, VT.changeVectorElementType(IndexVT.getVectorElementType()), StoreVal); StoreVal = convertToScalableVector(DAG, IndexVT, StoreVal); Mask = DAG.getNode( - ISD::ZERO_EXTEND, DL, + ISD::SIGN_EXTEND, DL, VT.changeVectorElementType(IndexVT.getVectorElementType()), Mask); } else if (VT.isFloatingPoint()) { // Handle FP data by casting the data so an integer scatter can be used. @@ -10963,8 +11004,40 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, return SDValue(); } +static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) { + if (Op.getOpcode() != AArch64ISD::DUP && + Op.getOpcode() != ISD::SPLAT_VECTOR && + Op.getOpcode() != ISD::BUILD_VECTOR) + return false; + + if (Op.getOpcode() == ISD::BUILD_VECTOR && + !isAllConstantBuildVector(Op, SplatVal)) + return false; + + if (Op.getOpcode() != ISD::BUILD_VECTOR && + !isa<ConstantSDNode>(Op->getOperand(0))) + return false; + + SplatVal = Op->getConstantOperandVal(0); + if (Op.getValueType().getVectorElementType() != MVT::i64) + SplatVal = (int32_t)SplatVal; + + Negated = false; + if (isPowerOf2_64(SplatVal)) + return true; + + Negated = true; + if (isPowerOf2_64(-SplatVal)) { + SplatVal = -SplatVal; + return true; + } + + return false; +} + SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); + SDLoc dl(Op); if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) return LowerFixedLengthVectorIntDivideToSVE(Op, DAG); @@ -10974,6 +11047,19 @@ SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const { bool Signed = Op.getOpcode() == ISD::SDIV; unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED; + bool Negated; + uint64_t SplatVal; + if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) { + SDValue Pg = getPredicateForScalableVector(DAG, dl, VT); + SDValue Res = + DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, VT, Pg, Op->getOperand(0), + DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32)); + if (Negated) + Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res); + + return Res; + } + if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64) return LowerToPredicatedOp(Op, DAG, PredOpcode); @@ -10987,7 +11073,6 @@ SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const { else llvm_unreachable("Unexpected Custom DIV operation"); - SDLoc dl(Op); unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO; unsigned UnpkHi = Signed ? 
AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI; SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0)); @@ -11924,6 +12009,12 @@ static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) { return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2); } +static bool isSplatShuffle(Value *V) { + if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V)) + return is_splat(Shuf->getShuffleMask()); + return false; +} + /// Check if sinking \p I's operands to I's basic block is profitable, because /// the operands can be folded into a target instruction, e.g. /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2). @@ -11934,12 +12025,24 @@ bool AArch64TargetLowering::shouldSinkOperands( if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { switch (II->getIntrinsicID()) { + case Intrinsic::aarch64_neon_smull: case Intrinsic::aarch64_neon_umull: - if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1))) - return false; - Ops.push_back(&II->getOperandUse(0)); - Ops.push_back(&II->getOperandUse(1)); - return true; + if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1))) { + Ops.push_back(&II->getOperandUse(0)); + Ops.push_back(&II->getOperandUse(1)); + return true; + } + LLVM_FALLTHROUGH; + + case Intrinsic::aarch64_neon_sqdmull: + case Intrinsic::aarch64_neon_sqdmulh: + case Intrinsic::aarch64_neon_sqrdmulh: + // Sink splats for index lane variants + if (isSplatShuffle(II->getOperand(0))) + Ops.push_back(&II->getOperandUse(0)); + if (isSplatShuffle(II->getOperand(1))) + Ops.push_back(&II->getOperandUse(1)); + return !Ops.empty(); case Intrinsic::aarch64_neon_pmull64: if (!areOperandsOfVmullHighP64(II->getArgOperand(0), @@ -12961,8 +13064,14 @@ AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, if (isIntDivCheap(N->getValueType(0), Attr)) return SDValue(N,0); // Lower SDIV as SDIV - // fold (sdiv X, pow2) EVT VT = N->getValueType(0); + + // For scalable and fixed types, mark them as cheap so we can handle it much + // later. This allows us to handle larger than legal types. + if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors()) + return SDValue(N, 0); + + // fold (sdiv X, pow2) if ((VT != MVT::i32 && VT != MVT::i64) || !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2())) return SDValue(); @@ -13858,34 +13967,6 @@ static SDValue performANDCombine(SDNode *N, return SDValue(); } -static SDValue performSRLCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - SelectionDAG &DAG = DCI.DAG; - EVT VT = N->getValueType(0); - if (VT != MVT::i32 && VT != MVT::i64) - return SDValue(); - - // Canonicalize (srl (bswap i32 x), 16) to (rotr (bswap i32 x), 16), if the - // high 16-bits of x are zero. Similarly, canonicalize (srl (bswap i64 x), 32) - // to (rotr (bswap i64 x), 32), if the high 32-bits of x are zero. 
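The srl(bswap) combine being removed here, and the equivalent top16Zero/top32Zero isel patterns added to AArch64InstrInfo.td further down, rest on a simple identity: once the high half of x is known zero, shifting bswap(x) right by half the width, rotating it by the same amount, and byte-reversing each half of x all give the same value, so the srl can be matched straight to REV16/REV32. A small self-checking sketch of the 32-bit case follows; the helper names are made up for illustration and __builtin_bswap32 assumes a GCC/Clang-style compiler.

#include <cassert>
#include <cstdint>

// REV16 (32-bit form): byte-swap each 16-bit half of the register.
static uint32_t rev16(uint32_t X) {
  return ((X & 0x00FF00FFu) << 8) | ((X & 0xFF00FF00u) >> 8);
}

static uint32_t rotr32(uint32_t X, unsigned R) {
  return (X >> R) | (X << (32 - R));
}

int main() {
  // For any X whose top 16 bits are zero, the three forms agree, which is
  // what lets (srl (bswap X), 16) be selected directly as REV16.
  for (uint32_t X = 0; X <= 0xFFFFu; X += 0x101u) {
    uint32_t Swapped = __builtin_bswap32(X);
    assert((Swapped >> 16) == rotr32(Swapped, 16));
    assert((Swapped >> 16) == rev16(X));
  }
  return 0;
}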
- SDValue N0 = N->getOperand(0); - if (N0.getOpcode() == ISD::BSWAP) { - SDLoc DL(N); - SDValue N1 = N->getOperand(1); - SDValue N00 = N0.getOperand(0); - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) { - uint64_t ShiftAmt = C->getZExtValue(); - if (VT == MVT::i32 && ShiftAmt == 16 && - DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(32, 16))) - return DAG.getNode(ISD::ROTR, DL, VT, N0, N1); - if (VT == MVT::i64 && ShiftAmt == 32 && - DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(64, 32))) - return DAG.getNode(ISD::ROTR, DL, VT, N0, N1); - } - } - return SDValue(); -} - // Attempt to form urhadd(OpA, OpB) from // truncate(vlshr(sub(zext(OpB), xor(zext(OpA), Ones(ElemSizeInBits))), 1)) // or uhadd(OpA, OpB) from truncate(vlshr(add(zext(OpA), zext(OpB)), 1)). @@ -14031,6 +14112,9 @@ static SDValue performConcatVectorsCombine(SDNode *N, SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode(); + if (VT.isScalableVector()) + return SDValue(); + // Optimize concat_vectors of truncated vectors, where the intermediate // type is illegal, to avoid said illegality, e.g., // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))), @@ -15089,6 +15173,9 @@ static SDValue performIntrinsicCombine(SDNode *N, case Intrinsic::aarch64_sve_uqsub_x: return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); + case Intrinsic::aarch64_sve_asrd: + return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_cmphs: if (!N->getOperand(2).getValueType().isFloatingPoint()) return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), @@ -15883,6 +15970,22 @@ static SDValue performSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { + StoreSDNode *ST = cast<StoreSDNode>(N); + SDValue Chain = ST->getChain(); + SDValue Value = ST->getValue(); + SDValue Ptr = ST->getBasePtr(); + + // If this is an FP_ROUND followed by a store, fold this into a truncating + // store. We can do this even if this is already a truncstore. + // We purposefully don't care about legality of the nodes here as we know + // they can be split down into something legal. + if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND && + Value.getNode()->hasOneUse() && ST->isUnindexed() && + Subtarget->useSVEForFixedLengthVectors() && + Value.getValueType().isFixedLengthVector()) + return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr, + ST->getMemoryVT(), ST->getMemOperand()); + if (SDValue Split = splitStores(N, DCI, DAG, Subtarget)) return Split; @@ -17225,6 +17328,37 @@ SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG) { return DAG.getBitcast(Ty, Trunc); } +SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *Subtarget) { + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + + // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded. + if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND) + return SDValue(); + + // fold (fpext (load x)) -> (fpext (fptrunc (extload x))) + // We purposefully don't care about legality of the nodes here as we know + // they can be split down into something legal. 
+ if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) && + N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() && + VT.isFixedLengthVector()) { + LoadSDNode *LN0 = cast<LoadSDNode>(N0); + SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT, + LN0->getChain(), LN0->getBasePtr(), + N0.getValueType(), LN0->getMemOperand()); + DCI.CombineTo(N, ExtLoad); + DCI.CombineTo(N0.getNode(), + DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), + ExtLoad, DAG.getIntPtrConstant(1, SDLoc(N0))), + ExtLoad.getValue(1)); + return SDValue(N, 0); // Return N so it doesn't get rechecked! + } + + return SDValue(); +} + SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -17253,8 +17387,6 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performORCombine(N, DCI, Subtarget); case ISD::AND: return performANDCombine(N, DCI); - case ISD::SRL: - return performSRLCombine(N, DCI); case ISD::INTRINSIC_WO_CHAIN: return performIntrinsicCombine(N, DCI, Subtarget); case ISD::ANY_EXTEND: @@ -17283,6 +17415,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performSTORECombine(N, DCI, DAG, Subtarget); case ISD::VECTOR_SPLICE: return performSVESpliceCombine(N, DAG); + case ISD::FP_EXTEND: + return performFPExtendCombine(N, DAG, DCI, Subtarget); case AArch64ISD::BRCOND: return performBRCONDCombine(N, DCI, DAG); case AArch64ISD::TBNZ: @@ -18414,6 +18548,15 @@ bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const { return VT.isScalarInteger(); } +bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT, + EVT VT) const { + // v8f16 without fp16 need to be extended to v8f32, which is more difficult to + // legalize. + if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16()) + return false; + return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT); +} + bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const { return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint(); } @@ -18591,12 +18734,29 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE( SDLoc DL(Op); EVT VT = Op.getValueType(); EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); + EVT LoadVT = ContainerVT; + EVT MemVT = Load->getMemoryVT(); + + auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT); + + if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) { + LoadVT = ContainerVT.changeTypeToInteger(); + MemVT = MemVT.changeTypeToInteger(); + } auto NewLoad = DAG.getMaskedLoad( - ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), - getPredicateForFixedLengthVector(DAG, DL, VT), DAG.getUNDEF(ContainerVT), - Load->getMemoryVT(), Load->getMemOperand(), Load->getAddressingMode(), - Load->getExtensionType()); + LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg, + DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(), + Load->getAddressingMode(), Load->getExtensionType()); + + if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) { + EVT ExtendVT = ContainerVT.changeVectorElementType( + Load->getMemoryVT().getVectorElementType()); + + NewLoad = getSVESafeBitCast(ExtendVT, NewLoad, DAG); + NewLoad = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT, + Pg, NewLoad, DAG.getUNDEF(ContainerVT)); + } auto Result = convertFromScalableVector(DAG, VT, NewLoad); SDValue MergedValues[2] = {Result, Load->getChain()}; @@ -18609,12 +18769,15 @@ static SDValue convertFixedMaskToScalableVector(SDValue 
Mask, EVT InVT = Mask.getValueType(); EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT); + auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT); + + if (ISD::isBuildVectorAllOnes(Mask.getNode())) + return Pg; + auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask); auto Op2 = DAG.getConstant(0, DL, ContainerVT); - auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT); - EVT CmpVT = Pg.getValueType(); - return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT, + return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, Pg.getValueType(), {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)}); } @@ -18668,13 +18831,26 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE( SDLoc DL(Op); EVT VT = Store->getValue().getValueType(); EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); + EVT MemVT = Store->getMemoryVT(); + auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT); auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue()); - return DAG.getMaskedStore( - Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(), - getPredicateForFixedLengthVector(DAG, DL, VT), Store->getMemoryVT(), - Store->getMemOperand(), Store->getAddressingMode(), - Store->isTruncatingStore()); + + if (VT.isFloatingPoint() && Store->isTruncatingStore()) { + EVT TruncVT = ContainerVT.changeVectorElementType( + Store->getMemoryVT().getVectorElementType()); + MemVT = MemVT.changeTypeToInteger(); + NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg, + NewValue, DAG.getTargetConstant(0, DL, MVT::i64), + DAG.getUNDEF(TruncVT)); + NewValue = + getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG); + } + + return DAG.getMaskedStore(Store->getChain(), DL, NewValue, + Store->getBasePtr(), Store->getOffset(), Pg, MemVT, + Store->getMemOperand(), Store->getAddressingMode(), + Store->isTruncatingStore()); } SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE( @@ -18706,6 +18882,21 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE( bool Signed = Op.getOpcode() == ISD::SDIV; unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED; + bool Negated; + uint64_t SplatVal; + if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) { + EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); + SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0)); + SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32); + + SDValue Pg = getPredicateForFixedLengthVector(DAG, dl, VT); + SDValue Res = DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, ContainerVT, Pg, Op1, Op2); + if (Negated) + Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res); + + return convertFromScalableVector(DAG, VT, Res); + } + // Scalable vector i32/i64 DIV is supported. if (EltVT == MVT::i32 || EltVT == MVT::i64) return LowerToPredicatedOp(Op, DAG, PredOpcode, /*OverrideNEON=*/true); diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 392e22b68366..ea884cdccd28 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -104,6 +104,8 @@ enum NodeType : unsigned { // Unpredicated vector instructions BIC, + SRAD_MERGE_OP1, + // Predicated instructions with the result of inactive lanes provided by the // last operand. 
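The new isPow2Splat path in LowerDIV and LowerFixedLengthVectorIntDivideToSVE above maps a signed divide by plus or minus 2^k onto AArch64ISD::SRAD_MERGE_OP1, i.e. SVE's ASRD, an arithmetic shift right that rounds toward zero like C signed division; a negative power-of-two divisor just negates the result afterwards. A scalar model of that rounding behavior follows (the helper name is invented for illustration, and the sketch assumes the usual arithmetic right shift of negative values).

#include <cassert>
#include <cstdint>

// Divide X by 2^Shift, rounding toward zero: add 2^Shift - 1 before shifting
// when X is negative. This is the behavior the SRAD lowering relies on.
static int64_t asrd(int64_t X, unsigned Shift) {
  int64_t Bias = (X >> 63) & ((int64_t(1) << Shift) - 1);
  return (X + Bias) >> Shift;
}

int main() {
  assert(asrd(7, 2) == 7 / 4);      //  1
  assert(asrd(-7, 2) == -7 / 4);    // -1, not -2: rounds toward zero
  assert(-asrd(-7, 2) == -7 / -4);  //  1: divisor -4 is divide by 4, then negate
  return 0;
}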
FABS_MERGE_PASSTHRU, @@ -774,6 +776,8 @@ public: bool preferIncOfAddToSubOfNot(EVT VT) const override; + bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override; + bool hasBitPreservingFPLogic(EVT VT) const override { // FIXME: Is this always true? It should be true for vectors at least. return VT == MVT::f32 || VT == MVT::f64; @@ -842,6 +846,8 @@ public: EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown = false) const override; + bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override; + private: /// Keep a pointer to the AArch64Subtarget around so that we can /// make the right decision when generating code for different targets. diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.td index db8e0c5dac4a..decee117d2d5 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -437,6 +437,18 @@ def non_temporal_store : cast<MaskedStoreSDNode>(N)->isNonTemporal(); }]>; +// top16Zero - answer true if the upper 16 bits of $src are 0, false otherwise +def top16Zero: PatLeaf<(i32 GPR32:$src), [{ + return SDValue(N,0)->getValueType(0) == MVT::i32 && + CurDAG->MaskedValueIsZero(SDValue(N,0), APInt::getHighBitsSet(32, 16)); + }]>; + +// top32Zero - answer true if the upper 32 bits of $src are 0, false otherwise +def top32Zero: PatLeaf<(i64 GPR64:$src), [{ + return SDValue(N,0)->getValueType(0) == MVT::i64 && + CurDAG->MaskedValueIsZero(SDValue(N,0), APInt::getHighBitsSet(64, 32)); + }]>; + // Node definitions. def AArch64adrp : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>; def AArch64adr : SDNode<"AArch64ISD::ADR", SDTIntUnaryOp, []>; @@ -2046,6 +2058,10 @@ def : InstAlias<"rev64 $Rd, $Rn", (REVXr GPR64:$Rd, GPR64:$Rn), 0>; def : Pat<(bswap (rotr GPR32:$Rn, (i64 16))), (REV16Wr GPR32:$Rn)>; def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>; +// Match (srl (bswap x), C) -> revC if the upper bswap bits are known zero. +def : Pat<(srl (bswap top16Zero:$Rn), (i64 16)), (REV16Wr GPR32:$Rn)>; +def : Pat<(srl (bswap top32Zero:$Rn), (i64 32)), (REV32Xr GPR64:$Rn)>; + //===----------------------------------------------------------------------===// // Bitfield immediate extraction instruction. 
//===----------------------------------------------------------------------===// diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 67d8fbb45cf5..25d53f4ab065 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -199,6 +199,13 @@ def AArch64umax_p : SDNode<"AArch64ISD::UMAX_PRED", SDT_AArch64Arith>; def AArch64umin_p : SDNode<"AArch64ISD::UMIN_PRED", SDT_AArch64Arith>; def AArch64umulh_p : SDNode<"AArch64ISD::MULHU_PRED", SDT_AArch64Arith>; +def SDT_AArch64Arith_Imm : SDTypeProfile<1, 3, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVT<3,i32>, + SDTCVecEltisVT<1,i1>, SDTCisSameAs<0,2> +]>; + +def AArch64asrd_m1 : SDNode<"AArch64ISD::SRAD_MERGE_OP1", SDT_AArch64Arith_Imm>; + def SDT_AArch64IntExtend : SDTypeProfile<1, 4, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVT<3, OtherVT>, SDTCisVec<4>, SDTCVecEltisVT<1,i1>, SDTCisSameAs<0,2>, SDTCisVTSmallerThanOp<3, 2>, SDTCisSameAs<0,4> @@ -1575,7 +1582,7 @@ let Predicates = [HasSVEorStreamingSVE] in { defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right_dup<0b0000, "asr", "ASR_ZPZI", int_aarch64_sve_asr>; defm LSR_ZPmI : sve_int_bin_pred_shift_imm_right_dup<0b0001, "lsr", "LSR_ZPZI", int_aarch64_sve_lsr>; defm LSL_ZPmI : sve_int_bin_pred_shift_imm_left_dup< 0b0011, "lsl", "LSL_ZPZI", int_aarch64_sve_lsl>; - defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right< 0b0100, "asrd", "ASRD_ZPZI", int_aarch64_sve_asrd>; + defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right< 0b0100, "asrd", "ASRD_ZPZI", AArch64asrd_m1>; defm ASR_ZPZI : sve_int_shift_pred_bhsd<AArch64asr_p, SVEShiftImmR8, SVEShiftImmR16, SVEShiftImmR32, SVEShiftImmR64>; defm LSR_ZPZI : sve_int_shift_pred_bhsd<AArch64lsr_p, SVEShiftImmR8, SVEShiftImmR16, SVEShiftImmR32, SVEShiftImmR64>; @@ -1586,7 +1593,7 @@ let Predicates = [HasSVEorStreamingSVE, UseExperimentalZeroingPseudos] in { defm ASR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_asr>; defm LSR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_lsr>; defm LSL_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_lsl>; - defm ASRD_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<int_aarch64_sve_asrd>; + defm ASRD_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<AArch64asrd_m1>; } // End HasSVEorStreamingSVE, UseExperimentalZeroingPseudos let Predicates = [HasSVEorStreamingSVE] in { diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 63d6fa5bbb26..34015d2dbd49 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -833,17 +833,12 @@ static Optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC, return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); }; - // The OpMultiplier variable should always point to the dup (if any), so - // swap if necessary. 
- if (IsUnitDup(OpMultiplicand) || IsUnitSplat(OpMultiplicand)) - std::swap(OpMultiplier, OpMultiplicand); - if (IsUnitSplat(OpMultiplier)) { - // [f]mul pg (dupx 1) %n => %n + // [f]mul pg %n, (dupx 1) => %n OpMultiplicand->takeName(&II); return IC.replaceInstUsesWith(II, OpMultiplicand); } else if (IsUnitDup(OpMultiplier)) { - // [f]mul pg (dup pg 1) %n => %n + // [f]mul pg %n, (dup pg 1) => %n auto *DupInst = cast<IntrinsicInst>(OpMultiplier); auto *DupPg = DupInst->getOperand(1); // TODO: this is naive. The optimization is still valid if DupPg @@ -2142,6 +2137,7 @@ bool AArch64TTIImpl::isLegalToVectorizeReduction( case RecurKind::FMax: case RecurKind::SelectICmp: case RecurKind::SelectFCmp: + case RecurKind::FMulAdd: return true; default: return false; diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index e090d87d59a2..3d9a626d3ac3 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -1920,35 +1920,6 @@ bool AArch64InstructionSelector::preISelLower(MachineInstr &I) { MachineRegisterInfo &MRI = MF.getRegInfo(); switch (I.getOpcode()) { - case TargetOpcode::G_SHL: - case TargetOpcode::G_ASHR: - case TargetOpcode::G_LSHR: { - // These shifts are legalized to have 64 bit shift amounts because we want - // to take advantage of the existing imported selection patterns that assume - // the immediates are s64s. However, if the shifted type is 32 bits and for - // some reason we receive input GMIR that has an s64 shift amount that's not - // a G_CONSTANT, insert a truncate so that we can still select the s32 - // register-register variant. - Register SrcReg = I.getOperand(1).getReg(); - Register ShiftReg = I.getOperand(2).getReg(); - const LLT ShiftTy = MRI.getType(ShiftReg); - const LLT SrcTy = MRI.getType(SrcReg); - if (SrcTy.isVector()) - return false; - assert(!ShiftTy.isVector() && "unexpected vector shift ty"); - if (SrcTy.getSizeInBits() != 32 || ShiftTy.getSizeInBits() != 64) - return false; - auto *AmtMI = MRI.getVRegDef(ShiftReg); - assert(AmtMI && "could not find a vreg definition for shift amount"); - if (AmtMI->getOpcode() != TargetOpcode::G_CONSTANT) { - // Insert a subregister copy to implement a 64->32 trunc - auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {}) - .addReg(ShiftReg, 0, AArch64::sub_32); - MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID)); - I.getOperand(2).setReg(Trunc.getReg(0)); - } - return true; - } case TargetOpcode::G_STORE: { bool Changed = contractCrossBankCopyIntoStore(I, MRI); MachineOperand &SrcOp = I.getOperand(0); @@ -2950,6 +2921,28 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { if (Opcode == TargetOpcode::G_SHL && MRI.getType(I.getOperand(0).getReg()).isVector()) return selectVectorSHL(I, MRI); + + // These shifts were legalized to have 64 bit shift amounts because we + // want to take advantage of the selection patterns that assume the + // immediates are s64s, however, selectBinaryOp will assume both operands + // will have the same bit size. 
+ { + Register SrcReg = I.getOperand(1).getReg(); + Register ShiftReg = I.getOperand(2).getReg(); + const LLT ShiftTy = MRI.getType(ShiftReg); + const LLT SrcTy = MRI.getType(SrcReg); + if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 32 && + ShiftTy.getSizeInBits() == 64) { + assert(!ShiftTy.isVector() && "unexpected vector shift ty"); + assert(MRI.getVRegDef(ShiftReg) && + "could not find a vreg definition for shift amount"); + // Insert a subregister copy to implement a 64->32 trunc + auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {}) + .addReg(ShiftReg, 0, AArch64::sub_32); + MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID)); + I.getOperand(2).setReg(Trunc.getReg(0)); + } + } LLVM_FALLTHROUGH; case TargetOpcode::G_FADD: case TargetOpcode::G_FSUB: @@ -6452,8 +6445,7 @@ static void fixupPHIOpBanks(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder MIB(MI); // Go through each operand and ensure it has the same regbank. - for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) { - MachineOperand &MO = MI.getOperand(OpIdx); + for (MachineOperand &MO : llvm::drop_begin(MI.operands())) { if (!MO.isReg()) continue; Register OpReg = MO.getReg(); @@ -6511,8 +6503,7 @@ void AArch64InstructionSelector::processPHIs(MachineFunction &MF) { // %endbb: // %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2_copy:gpr(s16), %bb2 bool HasGPROp = false, HasFPROp = false; - for (unsigned OpIdx = 1; OpIdx < MI->getNumOperands(); ++OpIdx) { - const auto &MO = MI->getOperand(OpIdx); + for (const MachineOperand &MO : llvm::drop_begin(MI->operands())) { if (!MO.isReg()) continue; const LLT &Ty = MRI.getType(MO.getReg()); diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index f2a470857d21..78c0e90b1384 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -177,8 +177,8 @@ public: // We can't just use EmitIntValue here, as that will emit a data mapping // symbol, and swap the endianness on big-endian systems (instructions are // always little-endian). - for (unsigned I = 0; I < 4; ++I) { - Buffer[I] = uint8_t(Inst); + for (char &C : Buffer) { + C = uint8_t(Inst); Inst >>= 8; } diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp index cf1a60643efd..92552c3d41d5 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp @@ -101,8 +101,8 @@ void AArch64TargetStreamer::emitInst(uint32_t Inst) { // We can't just use EmitIntValue here, as that will swap the // endianness on big-endian systems (instructions are always // little-endian). 
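The two emitInst loops rewritten here decompose the 32-bit instruction word byte by byte precisely so the encoding stays little-endian even on a big-endian host. A standalone sketch of that decomposition, with an invented helper name mirroring the loop shown in the hunks:

#include <cassert>
#include <cstdint>

// Write Inst into Buffer lowest byte first, regardless of host byte order.
static void encodeInstLE(uint32_t Inst, char (&Buffer)[4]) {
  for (char &C : Buffer) {
    C = static_cast<char>(static_cast<uint8_t>(Inst));
    Inst >>= 8;
  }
}

int main() {
  char Buf[4];
  encodeInstLE(0xD503201F, Buf); // AArch64 NOP
  const uint8_t Expected[4] = {0x1F, 0x20, 0x03, 0xD5};
  for (int I = 0; I != 4; ++I)
    assert(static_cast<uint8_t>(Buf[I]) == Expected[I]);
  return 0;
}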
- for (unsigned I = 0; I < 4; ++I) { - Buffer[I] = uint8_t(Inst); + for (char &C : Buffer) { + C = uint8_t(Inst); Inst >>= 8; } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index df2f9a0fa3a9..c7c5ff7bcbe7 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -26,6 +26,14 @@ def uchar_to_float : GICombineRule< [{ return PostLegalizerHelper.matchUCharToFloat(*${itofp}); }]), (apply [{ PostLegalizerHelper.applyUCharToFloat(*${itofp}); }])>; + +def rcp_sqrt_to_rsq : GICombineRule< + (defs root:$rcp, build_fn_matchinfo:$matchinfo), + (match (wip_match_opcode G_INTRINSIC, G_FSQRT):$rcp, + [{ return PostLegalizerHelper.matchRcpSqrtToRsq(*${rcp}, ${matchinfo}); }]), + (apply [{ Helper.applyBuildFn(*${rcp}, ${matchinfo}); }])>; + + def cvt_f32_ubyteN_matchdata : GIDefMatchData<"AMDGPUPostLegalizerCombinerHelper::CvtF32UByteMatchInfo">; def cvt_f32_ubyteN : GICombineRule< @@ -86,7 +94,8 @@ def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper< def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper< "AMDGPUGenPostLegalizerCombinerHelper", [all_combines, gfx6gfx7_combines, - uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg]> { + uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg, + rcp_sqrt_to_rsq]> { let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule"; let StateClass = "AMDGPUPostLegalizerCombinerHelperState"; let AdditionalArguments = []; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index cee56ee97294..8236e6672247 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -654,6 +654,9 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { SelectMAD_64_32(N); return; } + case ISD::SMUL_LOHI: + case ISD::UMUL_LOHI: + return SelectMUL_LOHI(N); case ISD::CopyToReg: { const SITargetLowering& Lowering = *static_cast<const SITargetLowering*>(getTargetLowering()); @@ -719,6 +722,18 @@ bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const { Term->getMetadata("structurizecfg.uniform"); } +bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N, + unsigned ShAmtBits) const { + assert(N->getOpcode() == ISD::AND); + + const APInt &RHS = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue(); + if (RHS.countTrailingOnes() >= ShAmtBits) + return true; + + const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero; + return (LHSKnownZeros | RHS).countTrailingOnes() >= ShAmtBits; +} + static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr, SDValue &N0, SDValue &N1) { if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST && @@ -1001,6 +1016,32 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) { CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); } +// We need to handle this here because tablegen doesn't support matching +// instructions with multiple outputs. +void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) { + SDLoc SL(N); + bool Signed = N->getOpcode() == ISD::SMUL_LOHI; + unsigned Opc = Signed ? 
AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64; + + SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64); + SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1); + SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp}; + SDNode *Mad = CurDAG->getMachineNode(Opc, SL, N->getVTList(), Ops); + if (!SDValue(N, 0).use_empty()) { + SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32); + SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL, + MVT::i32, SDValue(Mad, 0), Sub0); + ReplaceUses(SDValue(N, 0), SDValue(Lo, 0)); + } + if (!SDValue(N, 1).use_empty()) { + SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32); + SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL, + MVT::i32, SDValue(Mad, 0), Sub1); + ReplaceUses(SDValue(N, 1), SDValue(Hi, 0)); + } + CurDAG->RemoveDeadNode(N); +} + bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const { if (!isUInt<16>(Offset)) return false; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index c1d9673f067e..d638d9877a9b 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -136,6 +136,10 @@ private: bool isUniformLoad(const SDNode *N) const; bool isUniformBr(const SDNode *N) const; + // Returns true if ISD::AND SDNode `N`'s masking of the shift amount operand's + // `ShAmtBits` bits is unneeded. + bool isUnneededShiftMask(const SDNode *N, unsigned ShAmtBits) const; + bool isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS, SDValue &RHS) const; @@ -231,6 +235,7 @@ private: void SelectUADDO_USUBO(SDNode *N); void SelectDIV_SCALE(SDNode *N); void SelectMAD_64_32(SDNode *N); + void SelectMUL_LOHI(SDNode *N); void SelectFMA_W_CHAIN(SDNode *N); void SelectFMUL_W_CHAIN(SDNode *N); SDNode *getBFE32(bool IsSigned, const SDLoc &DL, SDValue Val, uint32_t Offset, diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 523fa2d3724b..54177564afbc 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -594,6 +594,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::MUL); + setTargetDAGCombine(ISD::SMUL_LOHI); + setTargetDAGCombine(ISD::UMUL_LOHI); setTargetDAGCombine(ISD::MULHU); setTargetDAGCombine(ISD::MULHS); setTargetDAGCombine(ISD::SELECT); @@ -3462,6 +3464,50 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, return DAG.getSExtOrTrunc(Mul, DL, VT); } +SDValue +AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + if (N->getValueType(0) != MVT::i32) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // SimplifyDemandedBits has the annoying habit of turning useful zero_extends + // in the source into any_extends if the result of the mul is truncated. Since + // we can assume the high bits are whatever we want, use the underlying value + // to avoid the unknown high bits from interfering. 
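The performMulLoHiCombine added here turns a 32-bit smul_lohi/umul_lohi into the MUL_U24/MULHI_U24 pair (or the signed equivalents) when both inputs are known to fit in 24 bits: the full product then fits in 48 bits, so, as I read the combine, the two 24-bit multiply results are exactly the low and high 32-bit halves of the wide product. A quick scalar check of that bound, written for this note:

#include <cassert>
#include <cstdint>

int main() {
  // Both operands fit in 24 bits, so the full product fits in 48 bits:
  // (2^24 - 1)^2 < 2^48.
  uint32_t A = 0x00FFFFFF, B = 0x00ABCDEF;
  uint64_t Full = uint64_t(A) * B;
  uint32_t Lo = uint32_t(Full);        // low half, as MUL_U24 would produce
  uint32_t Hi = uint32_t(Full >> 32);  // high half, as MULHI_U24 would produce
  assert(Full < (uint64_t(1) << 48));
  assert(((uint64_t(Hi) << 32) | Lo) == Full);
  return 0;
}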
+ if (N0.getOpcode() == ISD::ANY_EXTEND) + N0 = N0.getOperand(0); + if (N1.getOpcode() == ISD::ANY_EXTEND) + N1 = N1.getOperand(0); + + // Try to use two fast 24-bit multiplies (one for each half of the result) + // instead of one slow extending multiply. + unsigned LoOpcode, HiOpcode; + if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { + N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); + N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); + LoOpcode = AMDGPUISD::MUL_U24; + HiOpcode = AMDGPUISD::MULHI_U24; + } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) { + N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); + N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); + LoOpcode = AMDGPUISD::MUL_I24; + HiOpcode = AMDGPUISD::MULHI_I24; + } else { + return SDValue(); + } + + SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1); + SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1); + DCI.CombineTo(N, Lo, Hi); + return SDValue(N, 0); +} + SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const { EVT VT = N->getValueType(0); @@ -4103,6 +4149,9 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, return performTruncateCombine(N, DCI); case ISD::MUL: return performMulCombine(N, DCI); + case ISD::SMUL_LOHI: + case ISD::UMUL_LOHI: + return performMulLoHiCombine(N, DCI); case ISD::MULHS: return performMulhsCombine(N, DCI); case ISD::MULHU: diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 03632ac18598..daaca8737c5d 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -91,6 +91,7 @@ protected: SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performMulLoHiCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 28cb2fc57ac7..e16bead81b65 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3880,6 +3880,22 @@ bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0, return KnownBits->signBitIsZero(Base); } +bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI, + unsigned ShAmtBits) const { + assert(MI.getOpcode() == TargetOpcode::G_AND); + + Optional<APInt> RHS = getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI); + if (!RHS) + return false; + + if (RHS->countTrailingOnes() >= ShAmtBits) + return true; + + const APInt &LHSKnownZeros = + KnownBits->getKnownZeroes(MI.getOperand(1).getReg()); + return (LHSKnownZeros | *RHS).countTrailingOnes() >= ShAmtBits; +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectMUBUFScratchOffset( MachineOperand &Root) const { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 
b70e6883bae2..26996e42af53 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -300,6 +300,10 @@ private: bool isInlineImmediate64(int64_t Imm) const; bool isInlineImmediate(const APFloat &Imm) const; + // Returns true if TargetOpcode::G_AND MachineInstr `MI`'s masking of the + // shift amount operand's `ShAmtBits` bits is unneeded. + bool isUnneededShiftMask(const MachineInstr &MI, unsigned ShAmtBits) const; + const SIInstrInfo &TII; const SIRegisterInfo &TRI; const AMDGPURegisterBankInfo &RBI; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index bad9f6265b36..0528b552f475 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -242,25 +242,41 @@ def AMDGPUmul_i24_oneuse : HasOneUseBinOp<AMDGPUmul_i24>; //===----------------------------------------------------------------------===// // Constrained shift PatFrags. + +def csh_mask_16 : PatFrag<(ops node:$src0), (and node:$src0, imm), + [{ return isUnneededShiftMask(N, 4); }]> { + let GISelPredicateCode = [{ return isUnneededShiftMask(MI, 4); }]; + } + +def csh_mask_32 : PatFrag<(ops node:$src0), (and node:$src0, imm), + [{ return isUnneededShiftMask(N, 5); }]> { + let GISelPredicateCode = [{ return isUnneededShiftMask(MI, 5); }]; + } + +def csh_mask_64 : PatFrag<(ops node:$src0), (and node:$src0, imm), + [{ return isUnneededShiftMask(N, 6); }]> { + let GISelPredicateCode = [{ return isUnneededShiftMask(MI, 6); }]; + } + foreach width = [16, 32, 64] in { -defvar mask = !sub(width, 1); +defvar csh_mask = !cast<SDPatternOperator>("csh_mask_"#width); def cshl_#width : PatFrags<(ops node:$src0, node:$src1), - [(shl node:$src0, node:$src1), (shl node:$src0, (and node:$src1, mask))]>; + [(shl node:$src0, node:$src1), (shl node:$src0, (csh_mask node:$src1))]>; defvar cshl = !cast<SDPatternOperator>("cshl_"#width); def cshl_#width#_oneuse : HasOneUseBinOp<cshl>; def clshl_rev_#width : PatFrag <(ops node:$src0, node:$src1), (cshl $src1, $src0)>; def csrl_#width : PatFrags<(ops node:$src0, node:$src1), - [(srl node:$src0, node:$src1), (srl node:$src0, (and node:$src1, mask))]>; + [(srl node:$src0, node:$src1), (srl node:$src0, (csh_mask node:$src1))]>; defvar csrl = !cast<SDPatternOperator>("csrl_"#width); def csrl_#width#_oneuse : HasOneUseBinOp<csrl>; def clshr_rev_#width : PatFrag <(ops node:$src0, node:$src1), (csrl $src1, $src0)>; def csra_#width : PatFrags<(ops node:$src0, node:$src1), - [(sra node:$src0, node:$src1), (sra node:$src0, (and node:$src1, mask))]>; + [(sra node:$src0, node:$src1), (sra node:$src0, (csh_mask node:$src1))]>; defvar csra = !cast<SDPatternOperator>("csra_"#width); def csra_#width#_oneuse : HasOneUseBinOp<csra>; def cashr_rev_#width : PatFrag <(ops node:$src0, node:$src1), @@ -696,11 +712,6 @@ class RcpPat<Instruction RcpInst, ValueType vt> : AMDGPUPat < (RcpInst $src) >; -class RsqPat<Instruction RsqInst, ValueType vt> : AMDGPUPat < - (AMDGPUrcp (fsqrt vt:$src)), - (RsqInst $src) ->; - // Instructions which select to the same v_min_f* def fminnum_like : PatFrags<(ops node:$src0, node:$src1), [(fminnum_ieee node:$src0, node:$src1), diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp index 
fc984d2dda64..1479933a2850 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp @@ -23,6 +23,7 @@ #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/Target/TargetMachine.h" #define DEBUG_TYPE "amdgpu-postlegalizer-combiner" @@ -58,6 +59,9 @@ public: bool matchUCharToFloat(MachineInstr &MI); void applyUCharToFloat(MachineInstr &MI); + bool matchRcpSqrtToRsq(MachineInstr &MI, + std::function<void(MachineIRBuilder &)> &MatchInfo); + // FIXME: Should be able to have 2 separate matchdatas rather than custom // struct boilerplate. struct CvtF32UByteMatchInfo { @@ -203,6 +207,48 @@ void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) { MI.eraseFromParent(); } +bool AMDGPUPostLegalizerCombinerHelper::matchRcpSqrtToRsq( + MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) { + + auto getRcpSrc = [=](const MachineInstr &MI) { + MachineInstr *ResMI = nullptr; + if (MI.getOpcode() == TargetOpcode::G_INTRINSIC && + MI.getIntrinsicID() == Intrinsic::amdgcn_rcp) + ResMI = MRI.getVRegDef(MI.getOperand(2).getReg()); + + return ResMI; + }; + + auto getSqrtSrc = [=](const MachineInstr &MI) { + MachineInstr *SqrtSrcMI = nullptr; + mi_match(MI.getOperand(0).getReg(), MRI, m_GFSqrt(m_MInstr(SqrtSrcMI))); + return SqrtSrcMI; + }; + + MachineInstr *RcpSrcMI = nullptr, *SqrtSrcMI = nullptr; + // rcp(sqrt(x)) + if ((RcpSrcMI = getRcpSrc(MI)) && (SqrtSrcMI = getSqrtSrc(*RcpSrcMI))) { + MatchInfo = [SqrtSrcMI, &MI](MachineIRBuilder &B) { + B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false) + .addUse(SqrtSrcMI->getOperand(0).getReg()) + .setMIFlags(MI.getFlags()); + }; + return true; + } + + // sqrt(rcp(x)) + if ((SqrtSrcMI = getSqrtSrc(MI)) && (RcpSrcMI = getRcpSrc(*SqrtSrcMI))) { + MatchInfo = [RcpSrcMI, &MI](MachineIRBuilder &B) { + B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false) + .addUse(RcpSrcMI->getOperand(0).getReg()) + .setMIFlags(MI.getFlags()); + }; + return true; + } + + return false; +} + bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN( MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) { Register SrcReg = MI.getOperand(1).getReg(); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp index d560d2043f42..7c4eb71882c7 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp @@ -280,10 +280,10 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) { } LLVM_DEBUG(dbgs() << "Printf format string in source = " << Str.str() << '\n'); - for (size_t I = 0; I < Str.size(); ++I) { + for (char C : Str) { // Rest of the C escape sequences (e.g. 
\') are handled correctly // by the MDParser - switch (Str[I]) { + switch (C) { case '\a': Sizes << "\\a"; break; @@ -308,7 +308,7 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) { Sizes << "\\72"; break; default: - Sizes << Str[I]; + Sizes << C; break; } } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp index d7dc9ee4117b..12b5830ef930 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -45,6 +45,7 @@ public: TRI(*MF.getSubtarget().getRegisterInfo()), Helper(Helper){}; bool isVgprRegBank(Register Reg); + Register getAsVgpr(Register Reg); struct MinMaxMedOpc { unsigned Min, Max, Med; @@ -69,6 +70,23 @@ bool AMDGPURegBankCombinerHelper::isVgprRegBank(Register Reg) { return RBI.getRegBank(Reg, MRI, TRI)->getID() == AMDGPU::VGPRRegBankID; } +Register AMDGPURegBankCombinerHelper::getAsVgpr(Register Reg) { + if (isVgprRegBank(Reg)) + return Reg; + + // Search for existing copy of Reg to vgpr. + for (MachineInstr &Use : MRI.use_instructions(Reg)) { + Register Def = Use.getOperand(0).getReg(); + if (Use.getOpcode() == AMDGPU::COPY && isVgprRegBank(Def)) + return Def; + } + + // Copy Reg to vgpr. + Register VgprReg = B.buildCopy(MRI.getType(Reg), Reg).getReg(0); + MRI.setRegBank(VgprReg, RBI.getRegBank(AMDGPU::VGPRRegBankID)); + return VgprReg; +} + AMDGPURegBankCombinerHelper::MinMaxMedOpc AMDGPURegBankCombinerHelper::getMinMaxPair(unsigned Opc) { switch (Opc) { @@ -134,7 +152,9 @@ void AMDGPURegBankCombinerHelper::applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) { B.setInstrAndDebugLoc(MI); B.buildInstr(MatchInfo.Opc, {MI.getOperand(0)}, - {MatchInfo.Val0, MatchInfo.Val1, MatchInfo.Val2}, MI.getFlags()); + {getAsVgpr(MatchInfo.Val0), getAsVgpr(MatchInfo.Val1), + getAsVgpr(MatchInfo.Val2)}, + MI.getFlags()); MI.eraseFromParent(); } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index ab3ce980c3f6..5988403c0a29 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3189,10 +3189,10 @@ unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI, const MachineInstr &MI) const { unsigned RegBank = AMDGPU::InvalidRegBankID; - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - if (!MI.getOperand(i).isReg()) + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) continue; - Register Reg = MI.getOperand(i).getReg(); + Register Reg = MO.getReg(); if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { RegBank = regBankUnion(RegBank, Bank->getID()); if (RegBank == AMDGPU::VGPRRegBankID) @@ -3206,10 +3206,10 @@ unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI, bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { const MachineFunction &MF = *MI.getParent()->getParent(); const MachineRegisterInfo &MRI = MF.getRegInfo(); - for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) { - if (!MI.getOperand(i).isReg()) + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) continue; - Register Reg = MI.getOperand(i).getReg(); + Register Reg = MO.getReg(); if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { if (Bank->getID() != 
AMDGPU::SGPRRegBankID) return false; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp index 1a9255f3240f..712f6dece911 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -706,9 +706,7 @@ bool AMDGPUCFGStructurizer::prepare() { // Remove unconditional branch instr. // Add dummy exit block iff there are multiple returns. - for (SmallVectorImpl<MachineBasicBlock *>::const_iterator - It = OrderedBlks.begin(), E = OrderedBlks.end(); It != E; ++It) { - MachineBasicBlock *MBB = *It; + for (MachineBasicBlock *MBB : OrderedBlks) { removeUnconditionalBranch(MBB); removeRedundantConditionalBranch(MBB); if (isReturnBlock(MBB)) { @@ -851,14 +849,10 @@ bool AMDGPUCFGStructurizer::run() { void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) { int SccNum = 0; - MachineBasicBlock *MBB; for (scc_iterator<MachineFunction *> It = scc_begin(MF); !It.isAtEnd(); ++It, ++SccNum) { const std::vector<MachineBasicBlock *> &SccNext = *It; - for (std::vector<MachineBasicBlock *>::const_iterator - blockIter = SccNext.begin(), blockEnd = SccNext.end(); - blockIter != blockEnd; ++blockIter) { - MBB = *blockIter; + for (MachineBasicBlock *MBB : SccNext) { OrderedBlks.push_back(MBB); recordSccnum(MBB, SccNum); } @@ -1601,11 +1595,8 @@ void AMDGPUCFGStructurizer::addDummyExitBlock( FuncRep->push_back(DummyExitBlk); //insert to function insertInstrEnd(DummyExitBlk, R600::RETURN); - for (SmallVectorImpl<MachineBasicBlock *>::iterator It = RetMBB.begin(), - E = RetMBB.end(); It != E; ++It) { - MachineBasicBlock *MBB = *It; - MachineInstr *MI = getReturnInstr(MBB); - if (MI) + for (MachineBasicBlock *MBB : RetMBB) { + if (MachineInstr *MI = getReturnInstr(MBB)) MI->eraseFromParent(); MBB->addSuccessor(DummyExitBlk); LLVM_DEBUG(dbgs() << "Add dummyExitBlock to BB" << MBB->getNumber() diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/CaymanInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/CaymanInstructions.td index f4ddbf1131c3..d18dab0554bd 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/CaymanInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/CaymanInstructions.td @@ -48,8 +48,6 @@ def SIN_cm : SIN_Common<0x8D>; def COS_cm : COS_Common<0x8E>; } // End isVector = 1 -def : RsqPat<RECIPSQRT_IEEE_cm, f32>; - def : SqrtPat<RECIPSQRT_IEEE_cm, RECIP_IEEE_cm>; def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/EvergreenInstructions.td index 12224cb3f797..a9a3421e8192 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/EvergreenInstructions.td @@ -126,7 +126,6 @@ def EXP_IEEE_eg : EXP_IEEE_Common<0x81>; def LOG_IEEE_eg : LOG_IEEE_Common<0x83>; def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>; def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>; -def : RsqPat<RECIPSQRT_IEEE_eg, f32>; def : SqrtPat<RECIPSQRT_IEEE_eg, RECIP_IEEE_eg>; def SIN_eg : SIN_Common<0x8D>; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index ff5d0b0af6a4..0f8dd0b3bf58 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ 
b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -1442,12 +1442,10 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { bool FullReg; const MachineInstr *MI1; - auto IsOverlappedDGEMMorXDLFn = [Reg, &IsMFMAFn, &FullReg, &MI1, - this](const MachineInstr &MI) { + auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &FullReg, &MI1, + this](const MachineInstr &MI) { if (!IsMFMAFn(MI)) return false; - if (!isDGEMM(MI.getOpcode()) && !isXDL(ST, MI)) - return false; Register DstReg = MI.getOperand(0).getReg(); FullReg = (DstReg == Reg); MI1 = &MI; @@ -1458,8 +1456,8 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates); WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); - int NumWaitStates = getWaitStatesSinceDef(Reg, IsOverlappedDGEMMorXDLFn, - MaxWaitStates); + int NumWaitStates = + getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates); if (NumWaitStates == std::numeric_limits<int>::max()) continue; @@ -1619,12 +1617,9 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { const MachineInstr *MFMA = nullptr; unsigned Reg; - auto IsDGEMMorXDLWriteFn = [&Reg, &IsMFMAFn, &MFMA, - this](const MachineInstr &MI) { + auto IsMFMAWriteFn = [&Reg, &IsMFMAFn, &MFMA, this](const MachineInstr &MI) { if (!IsMFMAFn(MI) || !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg)) return false; - if (!isDGEMM(MI.getOpcode()) && !isXDL(ST, MI)) - return false; MFMA = &MI; return true; }; @@ -1675,8 +1670,8 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { } MFMA = nullptr; - WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDGEMMorXDLWriteFn, - MaxWaitStates); + WaitStatesSinceDef = + getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates); if (!MFMA) continue; @@ -1750,8 +1745,8 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { WaitStatesSinceDef); MFMA = nullptr; - WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDGEMMorXDLWriteFn, - MaxWaitStates); + WaitStatesSinceDef = + getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates); if (MFMA) { int NeedWaitStates = MaxWaitStates; switch (TSchedModel.computeInstrLatency(MFMA)) { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 3456f9a6156c..82c09378acac 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -74,11 +74,11 @@ unsigned GCNRegPressure::getRegKind(Register Reg, assert(Reg.isVirtual()); const auto RC = MRI.getRegClass(Reg); auto STI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); - return STI->isSGPRClass(RC) ? - (STI->getRegSizeInBits(*RC) == 32 ? SGPR32 : SGPR_TUPLE) : - STI->hasAGPRs(RC) ? - (STI->getRegSizeInBits(*RC) == 32 ? AGPR32 : AGPR_TUPLE) : - (STI->getRegSizeInBits(*RC) == 32 ? VGPR32 : VGPR_TUPLE); + return STI->isSGPRClass(RC) + ? (STI->getRegSizeInBits(*RC) == 32 ? SGPR32 : SGPR_TUPLE) + : STI->isAGPRClass(RC) + ? (STI->getRegSizeInBits(*RC) == 32 ? AGPR32 : AGPR_TUPLE) + : (STI->getRegSizeInBits(*RC) == 32 ? 
VGPR32 : VGPR_TUPLE); } void GCNRegPressure::inc(unsigned Reg, diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp index 1d93165f9eec..715fd69fc7ae 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp @@ -177,9 +177,7 @@ bool R600ClauseMergePass::runOnMachineFunction(MachineFunction &MF) { const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>(); TII = ST.getInstrInfo(); - for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); - BB != BB_E; ++BB) { - MachineBasicBlock &MBB = *BB; + for (MachineBasicBlock &MBB : MF) { MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); MachineBasicBlock::iterator LatestCFAlu = E; while (I != E) { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp index d5eaa33ef964..b9ca7f928d56 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp @@ -301,9 +301,7 @@ public: const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>(); TII = ST.getInstrInfo(); - for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); - BB != BB_E; ++BB) { - MachineBasicBlock &MBB = *BB; + for (MachineBasicBlock &MBB : MF) { MachineBasicBlock::iterator I = MBB.begin(); if (I != MBB.end() && I->getOpcode() == R600::CF_ALU) continue; // BB was already parsed diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp index 838a497b4df1..194879fef53c 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp @@ -73,9 +73,7 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { const R600RegisterInfo &TRI = TII->getRegisterInfo(); - for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); - BB != BB_E; ++BB) { - MachineBasicBlock &MBB = *BB; + for (MachineBasicBlock &MBB : MF) { MachineBasicBlock::iterator I = MBB.begin(); while (I != MBB.end()) { MachineInstr &MI = *I; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 0215eb9f9fea..bd757e9e3d70 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -285,9 +285,8 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::getLDSNoRetOp(MI.getOpcode()))); - for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) { - NewMI.add(MI.getOperand(i)); - } + for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) + NewMI.add(MO); } else { return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Instructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Instructions.td index 4487864888b6..b3da2fdefacc 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Instructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Instructions.td @@ -1265,7 +1265,6 @@ let Predicates = [isR600] in { defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>; 
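The C++ hunks above (and several that follow) replace hand-rolled iterator loops with range-based for loops, using llvm::make_early_inc_range where instructions are erased mid-walk and llvm::drop_begin to skip the first operand. A minimal, self-contained sketch of these idioms, assuming nothing beyond core LLVM CodeGen headers (the function names and the isKill() filter are illustrative only, not taken from the patch):

#include "llvm/ADT/STLExtras.h"            // make_early_inc_range, drop_begin
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"

static void eraseKills(llvm::MachineFunction &MF) {
  for (llvm::MachineBasicBlock &MBB : MF) {          // was: explicit begin()/end() iterators
    // make_early_inc_range advances the iterator before the body runs,
    // so erasing the current instruction is safe.
    for (llvm::MachineInstr &MI : llvm::make_early_inc_range(MBB))
      if (MI.isKill())
        MI.eraseFromParent();
  }
}

static void copyTailOperands(const llvm::MachineInstr &MI,
                             llvm::MachineInstrBuilder &NewMI) {
  // was: for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i)
  for (const llvm::MachineOperand &MO : llvm::drop_begin(MI.operands()))
    NewMI.add(MO);
}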
def : POW_Common <LOG_IEEE_r600, EXP_IEEE_r600, MUL>; - def : RsqPat<RECIPSQRT_IEEE_r600, f32>; def : SqrtPat<RECIPSQRT_IEEE_r600, RECIP_IEEE_r600>; def R600_ExportSwz : ExportSwzInst { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp index 36acfafa72aa..6aee2f591b56 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp @@ -124,11 +124,9 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) { DAG->dumpNode(*SU); } else { dbgs() << "NO NODE \n"; - for (unsigned i = 0; i < DAG->SUnits.size(); i++) { - const SUnit &S = DAG->SUnits[i]; + for (const SUnit &S : DAG->SUnits) if (!S.isScheduled) DAG->dumpNode(S); - } }); return SU; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp index 1a723279dc9f..72cf48c04e7f 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -323,14 +323,12 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { TII = ST.getInstrInfo(); MRI = &Fn.getRegInfo(); - for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); - MBB != MBBe; ++MBB) { - MachineBasicBlock *MB = &*MBB; + for (MachineBasicBlock &MB : Fn) { PreviousRegSeq.clear(); PreviousRegSeqByReg.clear(); PreviousRegSeqByUndefCount.clear(); - for (MachineBasicBlock::iterator MII = MB->begin(), MIIE = MB->end(); + for (MachineBasicBlock::iterator MII = MB.begin(), MIIE = MB.end(); MII != MIIE; ++MII) { MachineInstr &MI = *MII; if (MI.getOpcode() != R600::REG_SEQUENCE) { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Packetizer.cpp index e858bba2983c..beb0aad86e89 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Packetizer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Packetizer.cpp @@ -343,20 +343,11 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { // dependence between Insn 0 and Insn 2. This can lead to incorrect // packetization // - for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); - MBB != MBBe; ++MBB) { - MachineBasicBlock::iterator End = MBB->end(); - MachineBasicBlock::iterator MI = MBB->begin(); - while (MI != End) { - if (MI->isKill() || MI->getOpcode() == R600::IMPLICIT_DEF || - (MI->getOpcode() == R600::CF_ALU && !MI->getOperand(8).getImm())) { - MachineBasicBlock::iterator DeleteMI = MI; - ++MI; - MBB->erase(DeleteMI); - End = MBB->end(); - continue; - } - ++MI; + for (MachineBasicBlock &MBB : Fn) { + for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) { + if (MI.isKill() || MI.getOpcode() == R600::IMPLICIT_DEF || + (MI.getOpcode() == R600::CF_ALU && !MI.getOperand(8).getImm())) + MBB.erase(MI); } } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIDefines.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIDefines.h index 777744f08cde..580e4bc417a4 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIDefines.h @@ -18,7 +18,8 @@ namespace llvm { enum SIRCFlags : uint8_t { // For vector registers. 
HasVGPR = 1 << 0, - HasAGPR = 1 << 1 + HasAGPR = 1 << 1, + HasSGPR = 1 << 2 }; // enum SIRCFlags namespace SIInstrFlags { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index cf93a63f26a0..f54778535b7c 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -127,11 +127,11 @@ FunctionPass *llvm::createSIFixSGPRCopiesPass() { static bool hasVectorOperands(const MachineInstr &MI, const SIRegisterInfo *TRI) { const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual()) + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg() || !MO.getReg().isVirtual()) continue; - if (TRI->hasVectorRegisters(MRI.getRegClass(MI.getOperand(i).getReg()))) + if (TRI->hasVectorRegisters(MRI.getRegClass(MO.getReg()))) return true; } return false; @@ -259,7 +259,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, // VGPRz = REG_SEQUENCE VGPRx, sub0 MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg()); - bool IsAGPR = TRI->hasAGPRs(DstRC); + bool IsAGPR = TRI->isAGPRClass(DstRC); for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) { Register SrcReg = MI.getOperand(I).getReg(); @@ -853,7 +853,7 @@ MachineBasicBlock *SIFixSGPRCopies::processPHINode(MachineInstr &MI) { Register PHIRes = MI.getOperand(0).getReg(); const TargetRegisterClass *RC0 = MRI->getRegClass(PHIRes); - if (AllAGPRUses && numVGPRUses && !TRI->hasAGPRs(RC0)) { + if (AllAGPRUses && numVGPRUses && !TRI->isAGPRClass(RC0)) { LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI); MRI->setRegClass(PHIRes, TRI->getEquivalentAGPRClass(RC0)); for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index a3a0e9c9b9ac..200e00ee5521 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1586,17 +1586,9 @@ bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) { unsigned OpIdx = Op - &UseMI->getOperand(0); const MCInstrDesc &InstDesc = UseMI->getDesc(); - const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; - switch (OpInfo.RegClass) { - case AMDGPU::AV_32RegClassID: LLVM_FALLTHROUGH; - case AMDGPU::AV_64RegClassID: LLVM_FALLTHROUGH; - case AMDGPU::AV_96RegClassID: LLVM_FALLTHROUGH; - case AMDGPU::AV_128RegClassID: LLVM_FALLTHROUGH; - case AMDGPU::AV_160RegClassID: - break; - default: + if (!TRI->isVectorSuperClass( + TRI->getRegClass(InstDesc.OpInfo[OpIdx].RegClass))) return false; - } const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)); auto Dst = MRI->createVirtualRegister(NewDstRC); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 882b9a203755..4706c74be721 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -1364,6 +1364,34 @@ bool SIFrameLowering::assignCalleeSavedSpillSlots( return false; } +bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP( + const MachineFunction &MF) const { + + const GCNSubtarget &ST = 
MF.getSubtarget<GCNSubtarget>(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + uint64_t EstStackSize = MFI.estimateStackSize(MF); + uint64_t MaxOffset = EstStackSize - 1; + + // We need the emergency stack slots to be allocated in range of the + // MUBUF/flat scratch immediate offset from the base register, so assign these + // first at the incoming SP position. + // + // TODO: We could try sorting the objects to find a hole in the first bytes + // rather than allocating as close to possible. This could save a lot of space + // on frames with alignment requirements. + if (ST.enableFlatScratch()) { + const SIInstrInfo *TII = ST.getInstrInfo(); + if (TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS, + SIInstrFlags::FlatScratch)) + return false; + } else { + if (SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset)) + return false; + } + + return true; +} + MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr( MachineFunction &MF, MachineBasicBlock &MBB, diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.h index 951ea79b2809..56fbb875ffd9 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -43,6 +43,9 @@ public: const TargetRegisterInfo *TRI, std::vector<CalleeSavedInfo> &CSI) const override; + bool allocateScavengingFrameIndexesNearIncomingSP( + const MachineFunction &MF) const override; + bool isSupportedStackID(TargetStackID::Value ID) const override; void processFunctionBeforeFrameFinalized( diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 519c5b936536..35b72f5d201b 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -809,6 +809,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SMULO, MVT::i64, Custom); setOperationAction(ISD::UMULO, MVT::i64, Custom); + if (Subtarget->hasMad64_32()) { + setOperationAction(ISD::SMUL_LOHI, MVT::i32, Custom); + setOperationAction(ISD::UMUL_LOHI, MVT::i32, Custom); + } + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); @@ -919,6 +924,16 @@ bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, !hasFP32Denormals(DAG.getMachineFunction()); } +bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode, + LLT DestTy, LLT SrcTy) const { + return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) || + (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) && + DestTy.getScalarSizeInBits() == 32 && + SrcTy.getScalarSizeInBits() == 16 && + // TODO: This probably only requires no input flushing? + !hasFP32Denormals(*MI.getMF()); +} + bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const { // SI has some legal vector types, but no legal vector operations. Say no // shuffles are legal in order to prefer scalarizing some vector operations. 
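The hunk above makes ISD::SMUL_LOHI / ISD::UMUL_LOHI custom-lowered when the subtarget has v_mad_[iu]64_[iu]32. For orientation, the node simply yields both halves of the widening multiply; a plain C++ model of the unsigned case (a sketch for illustration, not part of the patch):

#include <cstdint>
#include <utility>

// ISD::UMUL_LOHI on i32: return {low 32 bits, high 32 bits} of the full
// 64-bit product.  With V_MAD_U64_U32 this maps to one instruction (zero
// addend); per the comments added to lowerXMUL_LOHI below, uniform multiplies
// on subtargets with S_MUL_HI_U32 are instead expanded to
// S_MUL_I32 + S_MUL_HI_U32.
static std::pair<uint32_t, uint32_t> umul_lohi(uint32_t A, uint32_t B) {
  uint64_t Prod = uint64_t(A) * uint64_t(B);
  return {uint32_t(Prod), uint32_t(Prod >> 32)};   // {low half, high half}
}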
@@ -4290,8 +4305,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( MachineInstrBuilder MIB; MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg); - for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) - MIB.add(MI.getOperand(I)); + for (const MachineOperand &MO : MI.operands()) + MIB.add(MO); MIB.cloneMemRefs(MI); MI.eraseFromParent(); @@ -4457,6 +4472,8 @@ bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const { return true; } +bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; } + EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, EVT VT) const { if (!VT.isVector()) { @@ -4522,6 +4539,34 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, return false; } +bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, + LLT Ty) const { + switch (Ty.getScalarSizeInBits()) { + case 16: + return isFMAFasterThanFMulAndFAdd(MF, MVT::f16); + case 32: + return isFMAFasterThanFMulAndFAdd(MF, MVT::f32); + case 64: + return isFMAFasterThanFMulAndFAdd(MF, MVT::f64); + default: + break; + } + + return false; +} + +bool SITargetLowering::isFMADLegal(const MachineInstr &MI, LLT Ty) const { + if (!Ty.isScalar()) + return false; + + if (Ty.getScalarSizeInBits() == 16) + return Subtarget->hasMadF16() && !hasFP64FP16Denormals(*MI.getMF()); + if (Ty.getScalarSizeInBits() == 32) + return Subtarget->hasMadMacF32Insts() && !hasFP32Denormals(*MI.getMF()); + + return false; +} + bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const { // TODO: Check future ftz flag @@ -4691,6 +4736,9 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SMULO: case ISD::UMULO: return lowerXMULO(Op, DAG); + case ISD::SMUL_LOHI: + case ISD::UMUL_LOHI: + return lowerXMUL_LOHI(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); } @@ -5304,6 +5352,21 @@ SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const { return DAG.getMergeValues({ Result, Overflow }, SL); } +SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const { + if (Op->isDivergent()) { + // Select to V_MAD_[IU]64_[IU]32. + return Op; + } + if (Subtarget->hasSMulHi()) { + // Expand to S_MUL_I32 + S_MUL_HI_[IU]32. + return SDValue(); + } + // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to + // calculate the high part, so we might as well do the whole thing with + // V_MAD_[IU]64_[IU]32. 
+ return Op; +} + SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { if (!Subtarget->isTrapHandlerEnabled() || Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) @@ -9790,10 +9853,9 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF, if (Subtarget->supportsMinMaxDenormModes() || denormalsEnabledForType(MRI.getType(Reg), MF)) return true; - for (unsigned I = 1, E = MI->getNumOperands(); I != E; ++I) { - if (!isCanonicalized(MI->getOperand(I).getReg(), MF, MaxDepth - 1)) + for (const MachineOperand &MO : llvm::drop_begin(MI->operands())) + if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1)) return false; - } return true; } default: @@ -11460,15 +11522,15 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, if (I == -1) break; MachineOperand &Op = MI.getOperand(I); - if ((OpInfo[I].RegClass != llvm::AMDGPU::AV_64RegClassID && - OpInfo[I].RegClass != llvm::AMDGPU::AV_32RegClassID) || - !Op.getReg().isVirtual() || !TRI->isAGPR(MRI, Op.getReg())) + if (!Op.isReg() || !Op.getReg().isVirtual()) + continue; + auto *RC = TRI->getRegClassForReg(MRI, Op.getReg()); + if (!TRI->hasAGPRs(RC)) continue; auto *Src = MRI.getUniqueVRegDef(Op.getReg()); if (!Src || !Src->isCopy() || !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg())) continue; - auto *RC = TRI->getRegClassForReg(MRI, Op.getReg()); auto *NewRC = TRI->getEquivalentVGPRClass(RC); // All uses of agpr64 and agpr32 can also accept vgpr except for // v_accvgpr_read, but we do not produce agpr reads during selection, diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h index 1e48c96ad3c8..1315cc15dd02 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -135,6 +135,7 @@ private: SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const; SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const; SDValue getSegmentAperture(unsigned AS, const SDLoc &DL, SelectionDAG &DAG) const; @@ -252,6 +253,9 @@ public: bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override; + bool isFPExtFoldable(const MachineInstr &MI, unsigned Opcode, LLT DestTy, + LLT SrcTy) const override; + bool isShuffleMaskLegal(ArrayRef<int> /*Mask*/, EVT /*VT*/) const override; bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, @@ -377,6 +381,7 @@ public: bool hasBitPreservingFPLogic(EVT VT) const override; bool enableAggressiveFMAFusion(EVT VT) const override; + bool enableAggressiveFMAFusion(LLT Ty) const override; EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override; @@ -384,7 +389,10 @@ public: bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override; + bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, + const LLT Ty) const override; bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override; + bool isFMADLegal(const MachineInstr &MI, const LLT Ty) const override; SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const; SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp 
b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index f4e5771d2a2a..c9d9dd1fb82c 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -150,6 +150,8 @@ enum VmemType { VMEM_NOSAMPLER, // MIMG instructions with a sampler. VMEM_SAMPLER, + // BVH instructions + VMEM_BVH }; VmemType getVmemType(const MachineInstr &Inst) { @@ -157,9 +159,10 @@ VmemType getVmemType(const MachineInstr &Inst) { if (!SIInstrInfo::isMIMG(Inst)) return VMEM_NOSAMPLER; const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode()); - return AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler - ? VMEM_SAMPLER - : VMEM_NOSAMPLER; + const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo = + AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); + return BaseInfo->BVH ? VMEM_BVH + : BaseInfo->Sampler ? VMEM_SAMPLER : VMEM_NOSAMPLER; } void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 4a928123b68f..92f5322b8ad2 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -898,10 +898,10 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, unsigned EltSize = 4; unsigned Opcode = AMDGPU::V_MOV_B32_e32; - if (RI.hasAGPRs(RC)) { + if (RI.isAGPRClass(RC)) { Opcode = (RI.hasVGPRs(SrcRC)) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::INSTRUCTION_LIST_END; - } else if (RI.hasVGPRs(RC) && RI.hasAGPRs(SrcRC)) { + } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) { Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64; } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) && (RI.isProperlyAlignedRC(*RC) && @@ -1205,7 +1205,7 @@ Register SIInstrInfo::insertNE(MachineBasicBlock *MBB, unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { - if (RI.hasAGPRs(DstRC)) + if (RI.isAGPRClass(DstRC)) return AMDGPU::COPY; if (RI.getRegSizeInBits(*DstRC) == 32) { return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; @@ -1435,6 +1435,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, FrameInfo.getObjectAlign(FrameIndex)); unsigned SpillSize = TRI->getSpillSize(*RC); + MachineRegisterInfo &MRI = MF->getRegInfo(); if (RI.isSGPRClass(RC)) { MFI->setHasSpilledSGPRs(); assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled"); @@ -1448,7 +1449,6 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, // The SGPR spill/restore instructions only work on number sgprs, so we need // to make sure we are using the correct register class. if (SrcReg.isVirtual() && SpillSize == 4) { - MachineRegisterInfo &MRI = MF->getRegInfo(); MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass); } @@ -1463,10 +1463,21 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, return; } - unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillSaveOpcode(SpillSize) - : getVGPRSpillSaveOpcode(SpillSize); + unsigned Opcode = RI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(SpillSize) + : getVGPRSpillSaveOpcode(SpillSize); MFI->setHasSpilledVGPRs(); + if (RI.isVectorSuperClass(RC)) { + // Convert an AV spill into a VGPR spill. Introduce a copy from AV to an + // equivalent VGPR register beforehand. Regalloc might want to introduce + // AV spills only to be relevant until rewriter at which they become + // either spills of VGPRs or AGPRs. 
+ Register TmpVReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(RC)); + BuildMI(MBB, MI, DL, get(TargetOpcode::COPY), TmpVReg) + .addReg(SrcReg, RegState::Kill); + SrcReg = TmpVReg; + } + BuildMI(MBB, MI, DL, get(Opcode)) .addReg(SrcReg, getKillRegState(isKill)) // data .addFrameIndex(FrameIndex) // addr @@ -1598,13 +1609,26 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, return; } - unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillRestoreOpcode(SpillSize) - : getVGPRSpillRestoreOpcode(SpillSize); + unsigned Opcode = RI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(SpillSize) + : getVGPRSpillRestoreOpcode(SpillSize); + + bool IsVectorSuperClass = RI.isVectorSuperClass(RC); + Register TmpReg = DestReg; + if (IsVectorSuperClass) { + // For AV classes, insert the spill restore to a VGPR followed by a copy + // into an equivalent AV register. + MachineRegisterInfo &MRI = MF->getRegInfo(); + DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(RC)); + } BuildMI(MBB, MI, DL, get(Opcode), DestReg) .addFrameIndex(FrameIndex) // vaddr .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset .addImm(0) // offset .addMemOperand(MMO); + + if (IsVectorSuperClass) + BuildMI(MBB, MI, DL, get(TargetOpcode::COPY), TmpReg) + .addReg(DestReg, RegState::Kill); } void SIInstrInfo::insertNoop(MachineBasicBlock &MBB, @@ -2802,12 +2826,11 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, } if (Is16Bit) { - if (isVGPRCopy) - return false; // Do not clobber vgpr_hi16 + if (isVGPRCopy) + return false; // Do not clobber vgpr_hi16 - if (DstReg.isVirtual() && - UseMI.getOperand(0).getSubReg() != AMDGPU::lo16) - return false; + if (DstReg.isVirtual() && UseMI.getOperand(0).getSubReg() != AMDGPU::lo16) + return false; UseMI.getOperand(0).setSubReg(0); if (DstReg.isPhysical()) { @@ -3896,9 +3919,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, // verification is broken anyway if (ST.needsAlignedVGPRs()) { const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg); - const bool IsVGPR = RI.hasVGPRs(RC); - const bool IsAGPR = !IsVGPR && RI.hasAGPRs(RC); - if ((IsVGPR || IsAGPR) && MO.getSubReg()) { + if (RI.hasVectorRegisters(RC) && MO.getSubReg()) { const TargetRegisterClass *SubRC = RI.getSubRegClass(RC, MO.getSubReg()); RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg()); @@ -5522,13 +5543,13 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) { VRC = &AMDGPU::VReg_1RegClass; } else - VRC = RI.hasAGPRs(getOpRegClass(MI, 0)) + VRC = RI.isAGPRClass(getOpRegClass(MI, 0)) ? RI.getEquivalentAGPRClass(SRC) : RI.getEquivalentVGPRClass(SRC); } else { - VRC = RI.hasAGPRs(getOpRegClass(MI, 0)) - ? RI.getEquivalentAGPRClass(VRC) - : RI.getEquivalentVGPRClass(VRC); + VRC = RI.isAGPRClass(getOpRegClass(MI, 0)) + ? 
RI.getEquivalentAGPRClass(VRC) + : RI.getEquivalentVGPRClass(VRC); } RC = VRC; } else { @@ -7065,8 +7086,8 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( case AMDGPU::STRICT_WWM: case AMDGPU::STRICT_WQM: { const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1); - if (RI.hasAGPRs(SrcRC)) { - if (RI.hasAGPRs(NewDstRC)) + if (RI.isAGPRClass(SrcRC)) { + if (RI.isAGPRClass(NewDstRC)) return nullptr; switch (Inst.getOpcode()) { @@ -7082,7 +7103,7 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( if (!NewDstRC) return nullptr; } else { - if (RI.hasVGPRs(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass) + if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass) return nullptr; NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 8c24268e379e..47ee83eb9351 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2246,7 +2246,7 @@ class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> { let HasExtSDWA9 = 0; } -class VOP_PAT_GEN <VOPProfile p, int mode=PatGenMode.Pattern> : VOPProfile <p.ArgVT> { +class VOP_PAT_GEN <VOPProfile p, int mode=PatGenMode.NoPattern> : VOPProfile <p.ArgVT> { let NeedPatGen = mode; } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td index d5f9cb8ba493..d55d8da8699a 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -827,10 +827,6 @@ def : Pat < let OtherPredicates = [UnsafeFPMath] in { -//defm : RsqPat<V_RSQ_F32_e32, f32>; - -def : RsqPat<V_RSQ_F32_e32, f32>; - // Convert (x - floor(x)) to fract(x) def : GCNPat < (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)), @@ -1372,61 +1368,48 @@ def : GCNPat < >; } + /********** ================================ **********/ /********** Floating point absolute/negative **********/ /********** ================================ **********/ -// Prevent expanding both fneg and fabs. -// TODO: Add IgnoredBySelectionDAG bit? 
-let AddedComplexity = 1 in { // Prefer SALU to VALU patterns for DAG - def : GCNPat < - (fneg (fabs (f32 SReg_32:$src))), + (UniformUnaryFrag<fneg> (fabs (f32 SReg_32:$src))), (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000))) // Set sign bit >; def : GCNPat < - (fabs (f32 SReg_32:$src)), + (UniformUnaryFrag<fabs> (f32 SReg_32:$src)), (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fffffff))) >; def : GCNPat < - (fneg (f32 SReg_32:$src)), + (UniformUnaryFrag<fneg> (f32 SReg_32:$src)), (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000))) >; def : GCNPat < - (fneg (f16 SReg_32:$src)), + (UniformUnaryFrag<fneg> (f16 SReg_32:$src)), (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000))) >; def : GCNPat < - (fneg (f16 VGPR_32:$src)), - (V_XOR_B32_e32 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) ->; - -def : GCNPat < - (fabs (f16 SReg_32:$src)), + (UniformUnaryFrag<fabs> (f16 SReg_32:$src)), (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00007fff))) >; def : GCNPat < - (fneg (fabs (f16 SReg_32:$src))), + (UniformUnaryFrag<fneg> (fabs (f16 SReg_32:$src))), (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit >; def : GCNPat < - (fneg (fabs (f16 VGPR_32:$src))), - (V_OR_B32_e32 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) // Set sign bit ->; - -def : GCNPat < - (fneg (v2f16 SReg_32:$src)), + (UniformUnaryFrag<fneg> (v2f16 SReg_32:$src)), (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) >; def : GCNPat < - (fabs (v2f16 SReg_32:$src)), + (UniformUnaryFrag<fabs> (v2f16 SReg_32:$src)), (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fff7fff))) >; @@ -1435,51 +1418,20 @@ def : GCNPat < // fabs is not reported as free because there is modifier for it in // VOP3P instructions, so it is turned into the bit op. def : GCNPat < - (fneg (v2f16 (bitconvert (and_oneuse (i32 SReg_32:$src), 0x7fff7fff)))), + (UniformUnaryFrag<fneg> (v2f16 (bitconvert (and_oneuse (i32 SReg_32:$src), 0x7fff7fff)))), (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit >; def : GCNPat < - (fneg (v2f16 (fabs SReg_32:$src))), + (UniformUnaryFrag<fneg> (v2f16 (fabs SReg_32:$src))), (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit >; -// FIXME: The implicit-def of scc from S_[X]OR/AND_B32 is mishandled - // def : GCNPat < -// (fneg (f64 SReg_64:$src)), -// (REG_SEQUENCE SReg_64, -// (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), -// sub0, -// (S_XOR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), -// (i32 (S_MOV_B32 (i32 0x80000000)))), -// sub1) -// >; - -// def : GCNPat < -// (fneg (fabs (f64 SReg_64:$src))), -// (REG_SEQUENCE SReg_64, -// (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), -// sub0, -// (S_OR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), -// (S_MOV_B32 (i32 0x80000000))), // Set sign bit. -// sub1) -// >; - -// FIXME: Use S_BITSET0_B32/B64? -// def : GCNPat < -// (fabs (f64 SReg_64:$src)), -// (REG_SEQUENCE SReg_64, -// (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), -// sub0, -// (S_AND_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), -// (i32 (S_MOV_B32 (i32 0x7fffffff)))), -// sub1) -// >; // COPY_TO_REGCLASS is needed to avoid using SCC from S_XOR_B32 instead // of the real value. 
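(The v2f32 GCNPat immediately below is the pattern the COPY_TO_REGCLASS note above refers to.) For orientation, the immediates these fneg/fabs patterns feed to S_XOR/S_AND/S_OR and their VALU counterparts are ordinary IEEE-754 sign-bit masks; a minimal C++ illustration (uses C++20 std::bit_cast purely for brevity, not part of the patch):

#include <bit>
#include <cstdint>

// f32:  fneg(x) = x ^ 0x80000000, fabs(x) = x & 0x7fffffff,
//       fneg(fabs(x)) = x | 0x80000000.
// f16 uses 0x8000 / 0x7fff; packed v2f16 uses 0x80008000 / 0x7fff7fff.
static float fnegViaBits(float X) {
  return std::bit_cast<float>(std::bit_cast<uint32_t>(X) ^ 0x80000000u);
}
static float fabsViaBits(float X) {
  return std::bit_cast<float>(std::bit_cast<uint32_t>(X) & 0x7fffffffu);
}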
def : GCNPat < - (fneg (v2f32 SReg_64:$src)), + (UniformUnaryFrag<fneg> (v2f32 SReg_64:$src)), (v2f32 (REG_SEQUENCE SReg_64, (f32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG $src, sub0)), (i32 (S_MOV_B32 (i32 0x80000000)))), @@ -1489,36 +1441,103 @@ def : GCNPat < SReg_32)), sub1)) >; -} // End let AddedComplexity = 1 +def : GCNPat < + (UniformUnaryFrag<fabs> (v2f32 SReg_64:$src)), + (v2f32 (REG_SEQUENCE SReg_64, + (f32 (COPY_TO_REGCLASS (S_AND_B32 (i32 (EXTRACT_SUBREG $src, sub0)), + (i32 (S_MOV_B32 (i32 0x7fffffff)))), + SReg_32)), sub0, + (f32 (COPY_TO_REGCLASS (S_AND_B32 (i32 (EXTRACT_SUBREG $src, sub1)), + (i32 (S_MOV_B32 (i32 0x7fffffff)))), + SReg_32)), sub1)) +>; + +def : GCNPat < + (UniformUnaryFrag<fneg> (fabs (v2f32 SReg_64:$src))), + (v2f32 (REG_SEQUENCE SReg_64, + (f32 (COPY_TO_REGCLASS (S_OR_B32 (i32 (EXTRACT_SUBREG $src, sub0)), + (i32 (S_MOV_B32 (i32 0x80000000)))), + SReg_32)), sub0, + (f32 (COPY_TO_REGCLASS (S_OR_B32 (i32 (EXTRACT_SUBREG $src, sub1)), + (i32 (S_MOV_B32 (i32 0x80000000)))), + SReg_32)), sub1)) +>; + +// FIXME: Use S_BITSET0_B32/B64? +def : GCNPat < + (UniformUnaryFrag<fabs> (f64 SReg_64:$src)), + (REG_SEQUENCE SReg_64, + (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), + sub0, + (i32 (COPY_TO_REGCLASS (S_AND_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), + (S_MOV_B32 (i32 0x7fffffff))), SReg_32)), // Set sign bit. + sub1) +>; + +def : GCNPat < + (UniformUnaryFrag<fneg> (f64 SReg_64:$src)), + (REG_SEQUENCE SReg_64, + (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), + sub0, + (i32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), + (i32 (S_MOV_B32 (i32 0x80000000)))), SReg_32)), + sub1) +>; + +def : GCNPat < + (UniformUnaryFrag<fneg> (fabs (f64 SReg_64:$src))), + (REG_SEQUENCE SReg_64, + (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), + sub0, + (i32 (COPY_TO_REGCLASS (S_OR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), + (S_MOV_B32 (i32 0x80000000))), SReg_32)),// Set sign bit. 
+ sub1) +>; + + +def : GCNPat < + (fneg (fabs (f32 VGPR_32:$src))), + (V_OR_B32_e64 (S_MOV_B32 (i32 0x80000000)), VGPR_32:$src) // Set sign bit +>; def : GCNPat < (fabs (f32 VGPR_32:$src)), - (V_AND_B32_e32 (S_MOV_B32 (i32 0x7fffffff)), VGPR_32:$src) + (V_AND_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), VGPR_32:$src) >; def : GCNPat < (fneg (f32 VGPR_32:$src)), - (V_XOR_B32_e32 (S_MOV_B32 (i32 0x80000000)), VGPR_32:$src) + (V_XOR_B32_e64 (S_MOV_B32 (i32 0x80000000)), VGPR_32:$src) >; def : GCNPat < (fabs (f16 VGPR_32:$src)), - (V_AND_B32_e32 (S_MOV_B32 (i32 0x00007fff)), VGPR_32:$src) + (V_AND_B32_e64 (S_MOV_B32 (i32 0x00007fff)), VGPR_32:$src) +>; + +def : GCNPat < + (fneg (f16 VGPR_32:$src)), + (V_XOR_B32_e64 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) +>; + +def : GCNPat < + (fneg (fabs (f16 VGPR_32:$src))), + (V_OR_B32_e64 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) // Set sign bit >; def : GCNPat < (fneg (v2f16 VGPR_32:$src)), - (V_XOR_B32_e32 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src) + (V_XOR_B32_e64 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src) >; def : GCNPat < (fabs (v2f16 VGPR_32:$src)), - (V_AND_B32_e32 (S_MOV_B32 (i32 0x7fff7fff)), VGPR_32:$src) + (V_AND_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), VGPR_32:$src) >; def : GCNPat < (fneg (v2f16 (fabs VGPR_32:$src))), - (V_OR_B32_e32 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src) // Set sign bit + (V_OR_B32_e64 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src) >; def : GCNPat < @@ -1526,30 +1545,28 @@ def : GCNPat < (REG_SEQUENCE VReg_64, (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)), sub0, - (V_AND_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1)), - (V_MOV_B32_e32 (i32 0x7fffffff))), // Set sign bit. + (V_AND_B32_e64 (i32 (S_MOV_B32 (i32 0x7fffffff))), + (i32 (EXTRACT_SUBREG VReg_64:$src, sub1))), sub1) >; -// TODO: Use SGPR for constant def : GCNPat < (fneg (f64 VReg_64:$src)), (REG_SEQUENCE VReg_64, (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)), sub0, - (V_XOR_B32_e32 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1)), - (i32 (V_MOV_B32_e32 (i32 0x80000000)))), + (V_XOR_B32_e64 (i32 (S_MOV_B32 (i32 0x80000000))), + (i32 (EXTRACT_SUBREG VReg_64:$src, sub1))), sub1) >; -// TODO: Use SGPR for constant def : GCNPat < (fneg (fabs (f64 VReg_64:$src))), (REG_SEQUENCE VReg_64, (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)), sub0, - (V_OR_B32_e32 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1)), - (V_MOV_B32_e32 (i32 0x80000000))), // Set sign bit. + (V_OR_B32_e64 (i32 (S_MOV_B32 (i32 0x80000000))), + (i32 (EXTRACT_SUBREG VReg_64:$src, sub1))), sub1) >; @@ -1681,14 +1698,9 @@ def : GCNPat < /********** Intrinsic Patterns **********/ /********** ================== **********/ -let OtherPredicates = [isNotGFX90APlus] in -// FIXME: Should use _e64 and select source modifiers. 
-def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>; - -let OtherPredicates = [isGFX90APlus] in def : GCNPat < - (fpow f32:$src0, f32:$src1), - (V_EXP_F32_e32 (V_MUL_LEGACY_F32_e64 0, f32:$src1, SRCMODS.NONE, (V_LOG_F32_e32 f32:$src0), 0, 0)) + (f32 (fpow (VOP3Mods f32:$src0, i32:$src0_mods), (VOP3Mods f32:$src1, i32:$src1_mods))), + (V_EXP_F32_e64 SRCMODS.NONE, (V_MUL_LEGACY_F32_e64 $src1_mods, $src1, SRCMODS.NONE, (V_LOG_F32_e64 $src0_mods, $src0), 0, 0)) >; def : GCNPat < diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 34cbb49dcd16..f4d9002e930e 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -1609,7 +1609,7 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, } unsigned BitWidth = 32 * (CI.Width + Paired.Width); - return TRI->hasAGPRs(getDataRegClass(*CI.I)) + return TRI->isAGPRClass(getDataRegClass(*CI.I)) ? TRI->getAGPRClassForBitWidth(BitWidth) : TRI->getVGPRClassForBitWidth(BitWidth); } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 29f072ca1e6c..fff4f6729c99 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -795,6 +795,8 @@ bool SIGfx6CacheControl::enableLoadCacheBypass( switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: + // Set L1 cache policy to MISS_EVICT. + // Note: there is no L2 cache bypass policy at the ISA level. Changed |= enableGLCBit(MI); break; case SIAtomicScope::WORKGROUP: @@ -837,8 +839,10 @@ bool SIGfx6CacheControl::enableRMWCacheBypass( assert(MI->mayLoad() && MI->mayStore()); bool Changed = false; - /// The L1 cache is write through so does not need to be bypassed. There is no - /// bypass control for the L2 cache at the isa level. + /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically + /// bypassed, and the GLC bit is instead used to indicate if they are + /// return or no-return. + /// Note: there is no L2 cache coherent bypass control at the ISA level. return Changed; } @@ -860,6 +864,9 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal( bool Changed = false; if (IsVolatile) { + // Set L1 cache policy to be MISS_EVICT for load instructions + // and MISS_LRU for store instructions. + // Note: there is no L2 cache bypass policy at the ISA level. if (Op == SIMemOp::LOAD) Changed |= enableGLCBit(MI); @@ -875,7 +882,8 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal( } if (IsNonTemporal) { - // Request L1 MISS_EVICT and L2 STREAM for load and store instructions. + // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT + // for both loads and stores, and the L2 cache policy to STREAM. Changed |= enableGLCBit(MI); Changed |= enableSLCBit(MI); return Changed; @@ -1097,6 +1105,8 @@ bool SIGfx90ACacheControl::enableLoadCacheBypass( switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: + // Set the L1 cache policy to MISS_LRU. + // Note: there is no L2 cache bypass policy at the ISA level. 
Changed |= enableGLCBit(MI); break; case SIAtomicScope::WORKGROUP: @@ -1206,6 +1216,9 @@ bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal( bool Changed = false; if (IsVolatile) { + // Set L1 cache policy to be MISS_EVICT for load instructions + // and MISS_LRU for store instructions. + // Note: there is no L2 cache bypass policy at the ISA level. if (Op == SIMemOp::LOAD) Changed |= enableGLCBit(MI); @@ -1221,7 +1234,8 @@ bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal( } if (IsNonTemporal) { - // Request L1 MISS_EVICT and L2 STREAM for load and store instructions. + // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT + // for both loads and stores, and the L2 cache policy to STREAM. Changed |= enableGLCBit(MI); Changed |= enableSLCBit(MI); return Changed; @@ -1380,12 +1394,11 @@ bool SIGfx10CacheControl::enableLoadCacheBypass( bool Changed = false; if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { - /// TODO Do not set glc for rmw atomic operations as they - /// implicitly bypass the L0/L1 caches. - switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: + // Set the L0 and L1 cache policies to MISS_EVICT. + // Note: there is no L2 cache coherent bypass control at the ISA level. Changed |= enableGLCBit(MI); Changed |= enableDLCBit(MI); break; @@ -1434,6 +1447,9 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal( bool Changed = false; if (IsVolatile) { + // Set L0 and L1 cache policy to be MISS_EVICT for load instructions + // and MISS_LRU for store instructions. + // Note: there is no L2 cache coherent bypass control at the ISA level. if (Op == SIMemOp::LOAD) { Changed |= enableGLCBit(MI); Changed |= enableDLCBit(MI); @@ -1450,8 +1466,14 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal( } if (IsNonTemporal) { - // Request L0/L1 HIT_EVICT and L2 STREAM for load and store instructions. + // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT + // and L2 cache policy to STREAM. + // For stores setting both GLC and SLC configures L0 and L1 cache policy + // to MISS_EVICT and the L2 cache policy to STREAM. 
+ if (Op == SIMemOp::STORE) + Changed |= enableGLCBit(MI); Changed |= enableSLCBit(MI); + return Changed; } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 6a698348d389..da41a5e2478a 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -1170,7 +1170,7 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, unsigned I = MI.getOperandNo(&Op); if (Desc.OpInfo[I].RegClass == -1 || - !TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass))) + !TRI->isVSSuperClass(TRI->getRegClass(Desc.OpInfo[I].RegClass))) continue; if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() && diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index d1b8e217471e..b0e45dd3e3e3 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -291,20 +291,19 @@ bool SIPreEmitPeephole::mustRetainExeczBranch( MBBI != End && MBBI != ToI; ++MBBI) { const MachineBasicBlock &MBB = *MBBI; - for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { + for (const MachineInstr &MI : MBB) { // When a uniform loop is inside non-uniform control flow, the branch // leaving the loop might never be taken when EXEC = 0. // Hence we should retain cbranch out of the loop lest it become infinite. - if (I->isConditionalBranch()) + if (MI.isConditionalBranch()) return true; - if (TII->hasUnwantedEffectsWhenEXECEmpty(*I)) + if (TII->hasUnwantedEffectsWhenEXECEmpty(MI)) return true; // These instructions are potentially expensive even if EXEC = 0. - if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) || - TII->isDS(*I) || I->getOpcode() == AMDGPU::S_WAITCNT) + if (TII->isSMRD(MI) || TII->isVMEM(MI) || TII->isFLAT(MI) || + TII->isDS(MI) || MI.getOpcode() == AMDGPU::S_WAITCNT) return true; ++NumInstr; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index bfbe84f696f8..a1d9a23a5084 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -402,6 +402,62 @@ const uint32_t *SIRegisterInfo::getNoPreservedMask() const { return CSR_AMDGPU_NoRegs_RegMask; } +const TargetRegisterClass * +SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, + const MachineFunction &MF) const { + // FIXME: Should have a helper function like getEquivalentVGPRClass to get the + // equivalent AV class. If used one, the verifier will crash after + // RegBankSelect in the GISel flow. The aligned regclasses are not fully given + // until Instruction selection. 
+ if (MF.getSubtarget<GCNSubtarget>().hasMAIInsts() && + (isVGPRClass(RC) || isAGPRClass(RC))) { + if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass) + return &AMDGPU::AV_32RegClass; + if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass) + return &AMDGPU::AV_64RegClass; + if (RC == &AMDGPU::VReg_64_Align2RegClass || + RC == &AMDGPU::AReg_64_Align2RegClass) + return &AMDGPU::AV_64_Align2RegClass; + if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass) + return &AMDGPU::AV_96RegClass; + if (RC == &AMDGPU::VReg_96_Align2RegClass || + RC == &AMDGPU::AReg_96_Align2RegClass) + return &AMDGPU::AV_96_Align2RegClass; + if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass) + return &AMDGPU::AV_128RegClass; + if (RC == &AMDGPU::VReg_128_Align2RegClass || + RC == &AMDGPU::AReg_128_Align2RegClass) + return &AMDGPU::AV_128_Align2RegClass; + if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass) + return &AMDGPU::AV_160RegClass; + if (RC == &AMDGPU::VReg_160_Align2RegClass || + RC == &AMDGPU::AReg_160_Align2RegClass) + return &AMDGPU::AV_160_Align2RegClass; + if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass) + return &AMDGPU::AV_192RegClass; + if (RC == &AMDGPU::VReg_192_Align2RegClass || + RC == &AMDGPU::AReg_192_Align2RegClass) + return &AMDGPU::AV_192_Align2RegClass; + if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass) + return &AMDGPU::AV_256RegClass; + if (RC == &AMDGPU::VReg_256_Align2RegClass || + RC == &AMDGPU::AReg_256_Align2RegClass) + return &AMDGPU::AV_256_Align2RegClass; + if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass) + return &AMDGPU::AV_512RegClass; + if (RC == &AMDGPU::VReg_512_Align2RegClass || + RC == &AMDGPU::AReg_512_Align2RegClass) + return &AMDGPU::AV_512_Align2RegClass; + if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass) + return &AMDGPU::AV_1024RegClass; + if (RC == &AMDGPU::VReg_1024_Align2RegClass || + RC == &AMDGPU::AReg_1024_Align2RegClass) + return &AMDGPU::AV_1024_Align2RegClass; + } + + return TargetRegisterInfo::getLargestLegalSuperClass(RC, MF); +} + Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const { const SIFrameLowering *TFI = MF.getSubtarget<GCNSubtarget>().getFrameLowering(); @@ -994,10 +1050,22 @@ static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST, unsigned Dst = IsStore ? Reg : ValueReg; unsigned Src = IsStore ? ValueReg : Reg; - unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 - : AMDGPU::V_ACCVGPR_READ_B32_e64; + bool IsVGPR = TRI->isVGPR(MRI, Reg); + DebugLoc DL = MI->getDebugLoc(); + if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) { + // Spiller during regalloc may restore a spilled register to its superclass. + // It could result in AGPR spills restored to VGPRs or the other way around, + // making the src and dst with identical regclasses at this point. It just + // needs a copy in such cases. + auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst) + .addReg(Src, getKillRegState(IsKill)); + CopyMIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); + return CopyMIB; + } + unsigned Opc = (IsStore ^ IsVGPR) ? 
AMDGPU::V_ACCVGPR_WRITE_B32_e64 + : AMDGPU::V_ACCVGPR_READ_B32_e64; - auto MIB = BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst) + auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst) .addReg(Src, getKillRegState(IsKill)); MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); return MIB; @@ -1099,7 +1167,7 @@ void SIRegisterInfo::buildSpillLoadStore( const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg); // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores. - const bool IsAGPR = !ST.hasGFX90AInsts() && hasAGPRs(RC); + const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC); const unsigned RegWidth = AMDGPU::getRegBitWidth(RC->getID()) / 8; // Always use 4 byte operations for AGPRs because we need to scavenge @@ -2163,6 +2231,65 @@ SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const { : getAnyAGPRClassForBitWidth(BitWidth); } +static const TargetRegisterClass * +getAnyVectorSuperClassForBitWidth(unsigned BitWidth) { + if (BitWidth <= 64) + return &AMDGPU::AV_64RegClass; + if (BitWidth <= 96) + return &AMDGPU::AV_96RegClass; + if (BitWidth <= 128) + return &AMDGPU::AV_128RegClass; + if (BitWidth <= 160) + return &AMDGPU::AV_160RegClass; + if (BitWidth <= 192) + return &AMDGPU::AV_192RegClass; + if (BitWidth <= 224) + return &AMDGPU::AV_224RegClass; + if (BitWidth <= 256) + return &AMDGPU::AV_256RegClass; + if (BitWidth <= 512) + return &AMDGPU::AV_512RegClass; + if (BitWidth <= 1024) + return &AMDGPU::AV_1024RegClass; + + return nullptr; +} + +static const TargetRegisterClass * +getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) { + if (BitWidth <= 64) + return &AMDGPU::AV_64_Align2RegClass; + if (BitWidth <= 96) + return &AMDGPU::AV_96_Align2RegClass; + if (BitWidth <= 128) + return &AMDGPU::AV_128_Align2RegClass; + if (BitWidth <= 160) + return &AMDGPU::AV_160_Align2RegClass; + if (BitWidth <= 192) + return &AMDGPU::AV_192_Align2RegClass; + if (BitWidth <= 224) + return &AMDGPU::AV_224_Align2RegClass; + if (BitWidth <= 256) + return &AMDGPU::AV_256_Align2RegClass; + if (BitWidth <= 512) + return &AMDGPU::AV_512_Align2RegClass; + if (BitWidth <= 1024) + return &AMDGPU::AV_1024_Align2RegClass; + + return nullptr; +} + +const TargetRegisterClass * +SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const { + if (BitWidth <= 16) + return &AMDGPU::VGPR_LO16RegClass; + if (BitWidth <= 32) + return &AMDGPU::AV_32RegClass; + return ST.needsAlignedVGPRs() + ? getAlignedVectorSuperClassForBitWidth(BitWidth) + : getAnyVectorSuperClassForBitWidth(BitWidth); +} + const TargetRegisterClass * SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) { if (BitWidth <= 16) @@ -2305,15 +2432,14 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass( // We can assume that each lane corresponds to one 32-bit register. 
unsigned Size = getNumChannelsFromSubReg(SubIdx) * 32; - if (isSGPRClass(RC)) { - if (Size == 32) - RC = &AMDGPU::SGPR_32RegClass; - else - RC = getSGPRClassForBitWidth(Size); - } else if (hasAGPRs(RC)) { + if (isAGPRClass(RC)) { RC = getAGPRClassForBitWidth(Size); - } else { + } else if (isVGPRClass(RC)) { RC = getVGPRClassForBitWidth(Size); + } else if (isVectorSuperClass(RC)) { + RC = getVectorSuperClassForBitWidth(Size); + } else { + RC = getSGPRClassForBitWidth(Size); } assert(RC && "Invalid sub-register class size"); return RC; @@ -2626,10 +2752,13 @@ bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const { if (!ST.needsAlignedVGPRs()) return true; - if (hasVGPRs(&RC)) + if (isVGPRClass(&RC)) return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC))); - if (hasAGPRs(&RC)) + if (isAGPRClass(&RC)) return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC))); + if (isVectorSuperClass(&RC)) + return RC.hasSuperClassEq( + getVectorSuperClassForBitWidth(getRegSizeInBits(RC))); return true; } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 8d90ddb1cf4c..f1fe0a1d9329 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -77,6 +77,10 @@ public: return 100; } + const TargetRegisterClass * + getLargestLegalSuperClass(const TargetRegisterClass *RC, + const MachineFunction &MF) const override; + Register getFrameRegister(const MachineFunction &MF) const override; bool hasBasePointer(const MachineFunction &MF) const; @@ -156,6 +160,10 @@ public: const TargetRegisterClass *getAGPRClassForBitWidth(unsigned BitWidth) const; LLVM_READONLY + const TargetRegisterClass * + getVectorSuperClassForBitWidth(unsigned BitWidth) const; + + LLVM_READONLY static const TargetRegisterClass *getSGPRClassForBitWidth(unsigned BitWidth); /// Return the 'base' register class for this register. @@ -164,7 +172,7 @@ public: /// \returns true if this class contains only SGPR registers static bool isSGPRClass(const TargetRegisterClass *RC) { - return !hasVGPRs(RC) && !hasAGPRs(RC); + return hasSGPRs(RC) && !hasVGPRs(RC) && !hasAGPRs(RC); } /// \returns true if this class ID contains only SGPR registers @@ -176,12 +184,22 @@ public: /// \returns true if this class contains only VGPR registers static bool isVGPRClass(const TargetRegisterClass *RC) { - return hasVGPRs(RC) && !hasAGPRs(RC); + return hasVGPRs(RC) && !hasAGPRs(RC) && !hasSGPRs(RC); } /// \returns true if this class contains only AGPR registers static bool isAGPRClass(const TargetRegisterClass *RC) { - return hasAGPRs(RC) && !hasVGPRs(RC); + return hasAGPRs(RC) && !hasVGPRs(RC) && !hasSGPRs(RC); + } + + /// \returns true only if this class contains both VGPR and AGPR registers + bool isVectorSuperClass(const TargetRegisterClass *RC) const { + return hasVGPRs(RC) && hasAGPRs(RC) && !hasSGPRs(RC); + } + + /// \returns true only if this class contains both VGPR and SGPR registers + bool isVSSuperClass(const TargetRegisterClass *RC) const { + return hasVGPRs(RC) && hasSGPRs(RC) && !hasAGPRs(RC); } /// \returns true if this class contains VGPR registers. @@ -194,6 +212,11 @@ public: return RC->TSFlags & SIRCFlags::HasAGPR; } + /// \returns true if this class contains SGPR registers. 
+ static bool hasSGPRs(const TargetRegisterClass *RC) { + return RC->TSFlags & SIRCFlags::HasSGPR; + } + /// \returns true if this class contains any vector registers. static bool hasVectorRegisters(const TargetRegisterClass *RC) { return hasVGPRs(RC) || hasAGPRs(RC); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index cf1d90484228..340e2b48e5cd 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -133,9 +133,13 @@ class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList> field bit HasVGPR = 0; field bit HasAGPR = 0; + // For scalar register classes. + field bit HasSGPR = 0; + // These need to be kept in sync with the enum SIRCFlags. let TSFlags{0} = HasVGPR; let TSFlags{1} = HasAGPR; + let TSFlags{2} = HasSGPR; } multiclass SIRegLoHi16 <string n, bits<16> regIdx, bit ArtificialHigh = 1, @@ -307,45 +311,51 @@ foreach Index = 0...255 in { // Groupings using register classes and tuples //===----------------------------------------------------------------------===// -def SCC_CLASS : RegisterClass<"AMDGPU", [i1], 1, (add SCC)> { +def SCC_CLASS : SIRegisterClass<"AMDGPU", [i1], 1, (add SCC)> { let CopyCost = -1; let isAllocatable = 0; + let HasSGPR = 1; } -def M0_CLASS : RegisterClass<"AMDGPU", [i32], 32, (add M0)> { +def M0_CLASS : SIRegisterClass<"AMDGPU", [i32], 32, (add M0)> { let CopyCost = 1; let isAllocatable = 0; + let HasSGPR = 1; } -def M0_CLASS_LO16 : RegisterClass<"AMDGPU", [i16, f16], 16, (add M0_LO16)> { +def M0_CLASS_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add M0_LO16)> { let CopyCost = 1; let Size = 16; let isAllocatable = 0; + let HasSGPR = 1; } // TODO: Do we need to set DwarfRegAlias on register tuples? -def SGPR_LO16 : RegisterClass<"AMDGPU", [i16, f16], 16, +def SGPR_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add (sequence "SGPR%u_LO16", 0, 105))> { let AllocationPriority = 9; let Size = 16; let GeneratePressureSet = 0; + let HasSGPR = 1; } -def SGPR_HI16 : RegisterClass<"AMDGPU", [i16, f16], 16, +def SGPR_HI16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add (sequence "SGPR%u_HI16", 0, 105))> { let isAllocatable = 0; let Size = 16; let GeneratePressureSet = 0; + let HasSGPR = 1; } // SGPR 32-bit registers -def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, +def SGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add (sequence "SGPR%u", 0, 105))> { // Give all SGPR classes higher priority than VGPR classes, because // we want to spill SGPRs to VGPRs. 
let AllocationPriority = 9; let GeneratePressureSet = 0; + let HasSGPR = 1; } // SGPR 64-bit registers @@ -376,16 +386,18 @@ def SGPR_512Regs : SIRegisterTuples<getSubRegs<16>.ret, SGPR_32, 105, 4, 16, "s" def SGPR_1024Regs : SIRegisterTuples<getSubRegs<32>.ret, SGPR_32, 105, 4, 32, "s">; // Trap handler TMP 32-bit registers -def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32, +def TTMP_32 : SIRegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32, (add (sequence "TTMP%u", 0, 15))> { let isAllocatable = 0; + let HasSGPR = 1; } // Trap handler TMP 16-bit registers -def TTMP_LO16 : RegisterClass<"AMDGPU", [i16, f16], 16, +def TTMP_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add (sequence "TTMP%u_LO16", 0, 15))> { let Size = 16; let isAllocatable = 0; + let HasSGPR = 1; } // Trap handler TMP 64-bit registers @@ -598,16 +610,18 @@ def AGPR_1024 : SIRegisterTuples<getSubRegs<32>.ret, AGPR_32, 255, 1, 32, "a">; // Register classes used as source and destination //===----------------------------------------------------------------------===// -def Pseudo_SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, +def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add FP_REG, SP_REG)> { let isAllocatable = 0; let CopyCost = -1; + let HasSGPR = 1; } -def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64, v2f64], 32, +def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64], 32, (add PRIVATE_RSRC_REG)> { let isAllocatable = 0; let CopyCost = -1; + let HasSGPR = 1; } def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32], 32, @@ -616,10 +630,10 @@ def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32], 32, let CopyCost = -1; } -let GeneratePressureSet = 0 in { +let GeneratePressureSet = 0, HasSGPR = 1 in { // Subset of SReg_32 without M0 for SMRD instructions and alike. // See comments in SIInstructions.td for more info. 
-def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, +def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, XNACK_MASK_LO, XNACK_MASK_HI, SGPR_NULL, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, SRC_POPS_EXITING_WAVE_ID, @@ -627,7 +641,7 @@ def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f1 let AllocationPriority = 10; } -def SReg_LO16_XM0_XEXEC : RegisterClass<"AMDGPU", [i16, f16], 16, +def SReg_LO16_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add SGPR_LO16, VCC_LO_LO16, VCC_HI_LO16, FLAT_SCR_LO_LO16, FLAT_SCR_HI_LO16, XNACK_MASK_LO_LO16, XNACK_MASK_HI_LO16, SGPR_NULL_LO16, TTMP_LO16, TMA_LO_LO16, TMA_HI_LO16, TBA_LO_LO16, TBA_HI_LO16, SRC_SHARED_BASE_LO16, @@ -637,29 +651,29 @@ def SReg_LO16_XM0_XEXEC : RegisterClass<"AMDGPU", [i16, f16], 16, let AllocationPriority = 10; } -def SReg_32_XEXEC_HI : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, +def SReg_32_XEXEC_HI : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, (add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS)> { let AllocationPriority = 10; } -def SReg_LO16_XEXEC_HI : RegisterClass<"AMDGPU", [i16, f16], 16, +def SReg_LO16_XEXEC_HI : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add SReg_LO16_XM0_XEXEC, EXEC_LO_LO16, M0_CLASS_LO16)> { let Size = 16; let AllocationPriority = 10; } -def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, +def SReg_32_XM0 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> { let AllocationPriority = 10; } -def SReg_LO16_XM0 : RegisterClass<"AMDGPU", [i16, f16], 16, +def SReg_LO16_XM0 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add SReg_LO16_XM0_XEXEC, EXEC_LO_LO16, EXEC_HI_LO16)> { let Size = 16; let AllocationPriority = 10; } -def SReg_LO16 : RegisterClass<"AMDGPU", [i16, f16], 16, +def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add SGPR_LO16, SReg_LO16_XM0, M0_CLASS_LO16, EXEC_LO_LO16, EXEC_HI_LO16, SReg_LO16_XEXEC_HI)> { let Size = 16; let AllocationPriority = 10; @@ -667,65 +681,75 @@ def SReg_LO16 : RegisterClass<"AMDGPU", [i16, f16], 16, } // End GeneratePressureSet = 0 // Register class for all scalar registers (SGPRs + Special Registers) -def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, +def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI)> { let AllocationPriority = 10; + let HasSGPR = 1; } let GeneratePressureSet = 0 in { -def SRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, +def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add SReg_32, LDS_DIRECT_CLASS)> { let isAllocatable = 0; + let HasSGPR = 1; } -def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32, +def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32, (add SGPR_64Regs)> { let CopyCost = 1; let AllocationPriority = 11; + let HasSGPR = 1; } // CCR (call clobbered registers) SGPR 64-bit registers -def CCR_SGPR_64 : RegisterClass<"AMDGPU", SGPR_64.RegTypes, 32, +def CCR_SGPR_64 : SIRegisterClass<"AMDGPU", SGPR_64.RegTypes, 32, (add (trunc SGPR_64, 16))> { let CopyCost = SGPR_64.CopyCost; let AllocationPriority = 
SGPR_64.AllocationPriority; + let HasSGPR = 1; } // Call clobbered 64-bit SGPRs for AMDGPU_Gfx CC -def Gfx_CCR_SGPR_64 : RegisterClass<"AMDGPU", SGPR_64.RegTypes, 32, +def Gfx_CCR_SGPR_64 : SIRegisterClass<"AMDGPU", SGPR_64.RegTypes, 32, (add (trunc (shl SGPR_64, 15), 1), // s[30:31] (trunc (shl SGPR_64, 18), 14))> { // s[36:37]-s[s62:63] let CopyCost = SGPR_64.CopyCost; let AllocationPriority = SGPR_64.AllocationPriority; + let HasSGPR = 1; } -def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, +def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add TTMP_64Regs)> { let isAllocatable = 0; + let HasSGPR = 1; } -def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32, +def SReg_64_XEXEC : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32, (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA)> { let CopyCost = 1; let AllocationPriority = 13; + let HasSGPR = 1; } -def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32, +def SReg_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32, (add SReg_64_XEXEC, EXEC)> { let CopyCost = 1; let AllocationPriority = 13; + let HasSGPR = 1; } -def SReg_1_XEXEC : RegisterClass<"AMDGPU", [i1], 32, +def SReg_1_XEXEC : SIRegisterClass<"AMDGPU", [i1], 32, (add SReg_64_XEXEC, SReg_32_XM0_XEXEC)> { let CopyCost = 1; let isAllocatable = 0; + let HasSGPR = 1; } -def SReg_1 : RegisterClass<"AMDGPU", [i1], 32, +def SReg_1 : SIRegisterClass<"AMDGPU", [i1], 32, (add SReg_1_XEXEC, EXEC, EXEC_LO)> { let CopyCost = 1; let isAllocatable = 0; + let HasSGPR = 1; } multiclass SRegClass<int numRegs, int priority, @@ -738,18 +762,18 @@ multiclass SRegClass<int numRegs, int priority, defvar sgprName = !strconcat("SGPR_", suffix); defvar ttmpName = !strconcat("TTMP_", suffix); - let AllocationPriority = priority, CopyCost = copyCost in { - def "" # sgprName : RegisterClass<"AMDGPU", regTypes, 32, (add regList)> { + let AllocationPriority = priority, CopyCost = copyCost, HasSGPR = 1 in { + def "" # sgprName : SIRegisterClass<"AMDGPU", regTypes, 32, (add regList)> { } if hasTTMP then { - def "" # ttmpName : RegisterClass<"AMDGPU", regTypes, 32, (add ttmpList)> { + def "" # ttmpName : SIRegisterClass<"AMDGPU", regTypes, 32, (add ttmpList)> { let isAllocatable = 0; } } def SReg_ # suffix : - RegisterClass<"AMDGPU", regTypes, 32, + SIRegisterClass<"AMDGPU", regTypes, 32, !con(!dag(add, [!cast<RegisterClass>(sgprName)], ["sgpr"]), !if(hasTTMP, !dag(add, [!cast<RegisterClass>(ttmpName)], ["ttmp"]), @@ -855,44 +879,45 @@ def VS_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add VGPR_32, SReg_32, LDS_DIRECT_CLASS)> { let isAllocatable = 0; let HasVGPR = 1; + let HasSGPR = 1; } def VS_64 : SIRegisterClass<"AMDGPU", [i64, f64, v2f32], 32, (add VReg_64, SReg_64)> { let isAllocatable = 0; let HasVGPR = 1; + let HasSGPR = 1; } -def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, - (add AGPR_32, VGPR_32)> { - let isAllocatable = 0; - let HasVGPR = 1; - let HasAGPR = 1; -} - -def AV_64 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32, - (add AReg_64, VReg_64)> { - let isAllocatable = 0; +def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_32)> { let HasVGPR = 1; let HasAGPR = 1; } } // End GeneratePressureSet = 0 -let HasVGPR = 1, HasAGPR = 1 in { -def AV_96 : SIRegisterClass<"AMDGPU", VReg_96.RegTypes, 32, - (add AReg_96, VReg_96)> { - let isAllocatable = 0; -} +// Define 
a register tuple class, along with one requiring an even +// aligned base register. +multiclass AVRegClass<int numRegs, list<ValueType> regTypes, + dag vregList, dag aregList> { + let HasVGPR = 1, HasAGPR = 1 in { + // Define the regular class. + def "" : VRegClassBase<numRegs, regTypes, (add vregList, aregList)>; -def AV_128 : SIRegisterClass<"AMDGPU", VReg_128.RegTypes, 32, - (add AReg_128, VReg_128)> { - let isAllocatable = 0; + // Define 2-aligned variant + def _Align2 : VRegClassBase<numRegs, regTypes, + (add (decimate vregList, 2), + (decimate aregList, 2))>; + } } -def AV_160 : SIRegisterClass<"AMDGPU", VReg_160.RegTypes, 32, - (add AReg_160, VReg_160)> { - let isAllocatable = 0; -} -} // End HasVGPR = 1, HasAGPR = 1 +defm AV_64 : AVRegClass<2, VReg_64.RegTypes, (add VGPR_64), (add AGPR_64)>; +defm AV_96 : AVRegClass<3, VReg_96.RegTypes, (add VGPR_96), (add AGPR_96)>; +defm AV_128 : AVRegClass<4, VReg_128.RegTypes, (add VGPR_128), (add AGPR_128)>; +defm AV_160 : AVRegClass<5, VReg_160.RegTypes, (add VGPR_160), (add AGPR_160)>; +defm AV_192 : AVRegClass<6, VReg_160.RegTypes, (add VGPR_192), (add AGPR_192)>; +defm AV_224 : AVRegClass<7, VReg_160.RegTypes, (add VGPR_224), (add AGPR_224)>; +defm AV_256 : AVRegClass<8, VReg_160.RegTypes, (add VGPR_256), (add AGPR_256)>; +defm AV_512 : AVRegClass<16, VReg_160.RegTypes, (add VGPR_512), (add AGPR_512)>; +defm AV_1024 : AVRegClass<32, VReg_160.RegTypes, (add VGPR_1024), (add AGPR_1024)>; //===----------------------------------------------------------------------===// // Register operands diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SISchedule.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SISchedule.td index 0792b303b830..18d424a3bc9f 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SISchedule.td @@ -93,16 +93,16 @@ def HWBranch : ProcResource<1> { let BufferSize = 1; } def HWExport : ProcResource<1> { - let BufferSize = 7; // Taken from S_WAITCNT + let BufferSize = 1; } def HWLGKM : ProcResource<1> { - let BufferSize = 31; // Taken from S_WAITCNT + let BufferSize = 1; } def HWSALU : ProcResource<1> { let BufferSize = 1; } def HWVMEM : ProcResource<1> { - let BufferSize = 15; // Taken from S_WAITCNT + let BufferSize = 1; } def HWVALU : ProcResource<1> { let BufferSize = 1; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 6f63f686635a..46012e5d7d97 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -487,6 +487,8 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs"); SmallVector<MachineInstr *, 4> SetInactiveInstrs; SmallVector<MachineInstr *, 4> SoftWQMInstrs; + bool HasImplicitDerivatives = + MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS; // We need to visit the basic blocks in reverse post-order so that we visit // defs before uses, in particular so that we don't accidentally mark an @@ -497,8 +499,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, MachineBasicBlock &MBB = **BI; BlockInfo &BBI = Blocks[&MBB]; - for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) { - MachineInstr &MI = *II; + for (MachineInstr &MI : MBB) { InstrInfo &III = Instructions[&MI]; unsigned Opcode = MI.getOpcode(); char Flags = 0; @@ -507,6 +508,11 @@ char 
SIWholeQuadMode::scanInstructions(MachineFunction &MF, // If LOD is not supported WQM is not needed. if (!ST->hasExtendedImageInsts()) continue; + // Only generate implicit WQM if implicit derivatives are required. + // This avoids inserting unintended WQM if a shader type without + // implicit derivatives uses an image sampling instruction. + if (!HasImplicitDerivatives) + continue; // Sampling instructions don't need to produce results for all pixels // in a quad, they just require all inputs of a quad to have been // computed for derivatives. diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 9da7b9f5145d..d20eaaaa65e8 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1626,13 +1626,14 @@ unsigned getRegBitWidth(unsigned RCID) { return 32; case AMDGPU::SGPR_64RegClassID: case AMDGPU::VS_64RegClassID: - case AMDGPU::AV_64RegClassID: case AMDGPU::SReg_64RegClassID: case AMDGPU::VReg_64RegClassID: case AMDGPU::AReg_64RegClassID: case AMDGPU::SReg_64_XEXECRegClassID: case AMDGPU::VReg_64_Align2RegClassID: case AMDGPU::AReg_64_Align2RegClassID: + case AMDGPU::AV_64RegClassID: + case AMDGPU::AV_64_Align2RegClassID: return 64; case AMDGPU::SGPR_96RegClassID: case AMDGPU::SReg_96RegClassID: @@ -1641,6 +1642,7 @@ unsigned getRegBitWidth(unsigned RCID) { case AMDGPU::VReg_96_Align2RegClassID: case AMDGPU::AReg_96_Align2RegClassID: case AMDGPU::AV_96RegClassID: + case AMDGPU::AV_96_Align2RegClassID: return 96; case AMDGPU::SGPR_128RegClassID: case AMDGPU::SReg_128RegClassID: @@ -1649,6 +1651,7 @@ unsigned getRegBitWidth(unsigned RCID) { case AMDGPU::VReg_128_Align2RegClassID: case AMDGPU::AReg_128_Align2RegClassID: case AMDGPU::AV_128RegClassID: + case AMDGPU::AV_128_Align2RegClassID: return 128; case AMDGPU::SGPR_160RegClassID: case AMDGPU::SReg_160RegClassID: @@ -1657,6 +1660,7 @@ unsigned getRegBitWidth(unsigned RCID) { case AMDGPU::VReg_160_Align2RegClassID: case AMDGPU::AReg_160_Align2RegClassID: case AMDGPU::AV_160RegClassID: + case AMDGPU::AV_160_Align2RegClassID: return 160; case AMDGPU::SGPR_192RegClassID: case AMDGPU::SReg_192RegClassID: @@ -1664,6 +1668,8 @@ unsigned getRegBitWidth(unsigned RCID) { case AMDGPU::AReg_192RegClassID: case AMDGPU::VReg_192_Align2RegClassID: case AMDGPU::AReg_192_Align2RegClassID: + case AMDGPU::AV_192RegClassID: + case AMDGPU::AV_192_Align2RegClassID: return 192; case AMDGPU::SGPR_224RegClassID: case AMDGPU::SReg_224RegClassID: @@ -1671,6 +1677,8 @@ unsigned getRegBitWidth(unsigned RCID) { case AMDGPU::AReg_224RegClassID: case AMDGPU::VReg_224_Align2RegClassID: case AMDGPU::AReg_224_Align2RegClassID: + case AMDGPU::AV_224RegClassID: + case AMDGPU::AV_224_Align2RegClassID: return 224; case AMDGPU::SGPR_256RegClassID: case AMDGPU::SReg_256RegClassID: @@ -1678,6 +1686,8 @@ unsigned getRegBitWidth(unsigned RCID) { case AMDGPU::AReg_256RegClassID: case AMDGPU::VReg_256_Align2RegClassID: case AMDGPU::AReg_256_Align2RegClassID: + case AMDGPU::AV_256RegClassID: + case AMDGPU::AV_256_Align2RegClassID: return 256; case AMDGPU::SGPR_512RegClassID: case AMDGPU::SReg_512RegClassID: @@ -1685,6 +1695,8 @@ unsigned getRegBitWidth(unsigned RCID) { case AMDGPU::AReg_512RegClassID: case AMDGPU::VReg_512_Align2RegClassID: case AMDGPU::AReg_512_Align2RegClassID: + case AMDGPU::AV_512RegClassID: + case AMDGPU::AV_512_Align2RegClassID: return 512; case 
AMDGPU::SGPR_1024RegClassID: case AMDGPU::SReg_1024RegClassID: @@ -1692,6 +1704,8 @@ unsigned getRegBitWidth(unsigned RCID) { case AMDGPU::AReg_1024RegClassID: case AMDGPU::VReg_1024_Align2RegClassID: case AMDGPU::AReg_1024_Align2RegClassID: + case AMDGPU::AV_1024RegClassID: + case AMDGPU::AV_1024_Align2RegClassID: return 1024; default: llvm_unreachable("Unexpected register class"); diff --git a/contrib/llvm-project/llvm/lib/Target/ARC/ARCMCInstLower.cpp b/contrib/llvm-project/llvm/lib/Target/ARC/ARCMCInstLower.cpp index 62462b77eccf..50ba9fe75232 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARC/ARCMCInstLower.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARC/ARCMCInstLower.cpp @@ -104,8 +104,7 @@ MCOperand ARCMCInstLower::LowerOperand(const MachineOperand &MO, void ARCMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.setOpcode(MI->getOpcode()); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { MCOperand MCOp = LowerOperand(MO); if (MCOp.isValid()) diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARM.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARM.h index 5500783f74db..1d5e45aec06c 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARM.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARM.h @@ -44,6 +44,7 @@ FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM, FunctionPass *createA15SDOptimizerPass(); FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false); FunctionPass *createARMExpandPseudoPass(); +FunctionPass *createARMBranchTargetsPass(); FunctionPass *createARMConstantIslandPass(); FunctionPass *createMLxExpansionPass(); FunctionPass *createThumb2ITBlockPass(); @@ -66,6 +67,7 @@ void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, void initializeARMParallelDSPPass(PassRegistry &); void initializeARMLoadStoreOptPass(PassRegistry &); void initializeARMPreAllocLoadStoreOptPass(PassRegistry &); +void initializeARMBranchTargetsPass(PassRegistry &); void initializeARMConstantIslandsPass(PassRegistry &); void initializeARMExpandPseudoPass(PassRegistry &); void initializeThumb2SizeReducePass(PassRegistry &); diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARM.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARM.td index 8cbd80f1bf65..e03dd597eb65 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARM.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARM.td @@ -442,6 +442,10 @@ def FeatureFixCMSE_CVE_2021_35465 : SubtargetFeature<"fix-cmse-cve-2021-35465", "Mitigate against the cve-2021-35465 " "security vulnurability">; +def FeaturePACBTI : SubtargetFeature<"pacbti", "HasPACBTI", "true", + "Enable Pointer Authentication and Branch " + "Target Identification">; + //===----------------------------------------------------------------------===// // ARM architecture class // diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.cpp index 9901b86b0e87..6a88ac485e69 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.cpp @@ -763,6 +763,32 @@ void ARMAsmPrinter::emitAttributes() { int EnumBuildAttr = EnumWidth == 1 ? 
1 : 2; ATS.emitAttribute(ARMBuildAttrs::ABI_enum_size, EnumBuildAttr); } + + auto *PACValue = mdconst::extract_or_null<ConstantInt>( + SourceModule->getModuleFlag("sign-return-address")); + if (PACValue && PACValue->getZExtValue() == 1) { + // If "+pacbti" is used as an architecture extension, + // Tag_PAC_extension is emitted in + // ARMTargetStreamer::emitTargetAttributes(). + if (!STI.hasPACBTI()) { + ATS.emitAttribute(ARMBuildAttrs::PAC_extension, + ARMBuildAttrs::AllowPACInNOPSpace); + } + ATS.emitAttribute(ARMBuildAttrs::PACRET_use, ARMBuildAttrs::PACRETUsed); + } + + auto *BTIValue = mdconst::extract_or_null<ConstantInt>( + SourceModule->getModuleFlag("branch-target-enforcement")); + if (BTIValue && BTIValue->getZExtValue() == 1) { + // If "+pacbti" is used as an architecture extension, + // Tag_BTI_extension is emitted in + // ARMTargetStreamer::emitTargetAttributes(). + if (!STI.hasPACBTI()) { + ATS.emitAttribute(ARMBuildAttrs::BTI_extension, + ARMBuildAttrs::AllowBTIInNOPSpace); + } + ATS.emitAttribute(ARMBuildAttrs::BTI_use, ARMBuildAttrs::BTIUsed); + } } } @@ -1535,17 +1561,17 @@ void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) { MCInst.addExpr(BranchTarget); } - if (Opc == ARM::t2BFic) { - const MCExpr *ElseLabel = MCSymbolRefExpr::create( - getBFLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(), - MI->getOperand(2).getIndex(), OutContext), - OutContext); - MCInst.addExpr(ElseLabel); - MCInst.addImm(MI->getOperand(3).getImm()); - } else { - MCInst.addImm(MI->getOperand(2).getImm()) - .addReg(MI->getOperand(3).getReg()); - } + if (Opc == ARM::t2BFic) { + const MCExpr *ElseLabel = MCSymbolRefExpr::create( + getBFLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(), + MI->getOperand(2).getIndex(), OutContext), + OutContext); + MCInst.addExpr(ElseLabel); + MCInst.addImm(MI->getOperand(3).getImm()); + } else { + MCInst.addImm(MI->getOperand(2).getImm()) + .addReg(MI->getOperand(3).getReg()); + } EmitToStreamer(*OutStreamer, MCInst); return; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 2d981be4cfc1..2a12947d24a8 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -310,8 +310,7 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, // Transfer LiveVariables states, kill / dead info. 
if (LV) { - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { if (MO.isReg() && Register::isVirtualRegister(MO.getReg())) { Register Reg = MO.getReg(); @@ -634,8 +633,7 @@ bool ARMBaseInstrInfo::ClobbersPredicate(MachineInstr &MI, std::vector<MachineOperand> &Pred, bool SkipDead) const { bool Found = false; - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { bool ClobbersCPSR = MO.isRegMask() && MO.clobbersPhysReg(ARM::CPSR); bool IsCPSR = MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR; if (ClobbersCPSR || IsCPSR) { @@ -732,8 +730,7 @@ bool ARMBaseInstrInfo::isPredicable(const MachineInstr &MI) const { namespace llvm { template <> bool IsCPSRDead<MachineInstr>(const MachineInstr *MI) { - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg() || MO.isUndef() || MO.isUse()) continue; if (MO.getReg() != ARM::CPSR) @@ -1860,15 +1857,11 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr &MI0, const MachineInstr &MI1, const MachineRegisterInfo *MRI) const { unsigned Opcode = MI0.getOpcode(); - if (Opcode == ARM::t2LDRpci || - Opcode == ARM::t2LDRpci_pic || - Opcode == ARM::tLDRpci || - Opcode == ARM::tLDRpci_pic || - Opcode == ARM::LDRLIT_ga_pcrel || - Opcode == ARM::LDRLIT_ga_pcrel_ldr || - Opcode == ARM::tLDRLIT_ga_pcrel || - Opcode == ARM::MOV_ga_pcrel || - Opcode == ARM::MOV_ga_pcrel_ldr || + if (Opcode == ARM::t2LDRpci || Opcode == ARM::t2LDRpci_pic || + Opcode == ARM::tLDRpci || Opcode == ARM::tLDRpci_pic || + Opcode == ARM::LDRLIT_ga_pcrel || Opcode == ARM::LDRLIT_ga_pcrel_ldr || + Opcode == ARM::tLDRLIT_ga_pcrel || Opcode == ARM::t2LDRLIT_ga_pcrel || + Opcode == ARM::MOV_ga_pcrel || Opcode == ARM::MOV_ga_pcrel_ldr || Opcode == ARM::t2MOV_ga_pcrel) { if (MI1.getOpcode() != Opcode) return false; @@ -1880,11 +1873,9 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr &MI0, if (MO0.getOffset() != MO1.getOffset()) return false; - if (Opcode == ARM::LDRLIT_ga_pcrel || - Opcode == ARM::LDRLIT_ga_pcrel_ldr || - Opcode == ARM::tLDRLIT_ga_pcrel || - Opcode == ARM::MOV_ga_pcrel || - Opcode == ARM::MOV_ga_pcrel_ldr || + if (Opcode == ARM::LDRLIT_ga_pcrel || Opcode == ARM::LDRLIT_ga_pcrel_ldr || + Opcode == ARM::tLDRLIT_ga_pcrel || Opcode == ARM::t2LDRLIT_ga_pcrel || + Opcode == ARM::MOV_ga_pcrel || Opcode == ARM::MOV_ga_pcrel_ldr || Opcode == ARM::t2MOV_ga_pcrel) // Ignore the PC labels. return MO0.getGlobal() == MO1.getGlobal(); @@ -2312,8 +2303,7 @@ ARMBaseInstrInfo::canFoldIntoMOVCC(Register Reg, const MachineRegisterInfo &MRI, return nullptr; // Check if MI has any non-dead defs or physreg uses. This also detects // predicated instructions which will be reading CPSR. - for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : llvm::drop_begin(MI->operands(), 1)) { // Reject frame index operands, PEI can't handle the predicated pseudos. 
if (MO.isFI() || MO.isCPI() || MO.isJTI()) return nullptr; @@ -4857,11 +4847,10 @@ bool ARMBaseInstrInfo::verifyInstruction(const MachineInstr &MI, if (MI.getOpcode() == ARM::tPUSH || MI.getOpcode() == ARM::tPOP || MI.getOpcode() == ARM::tPOP_RET) { - for (int i = 2, e = MI.getNumOperands(); i < e; ++i) { - if (MI.getOperand(i).isImplicit() || - !MI.getOperand(i).isReg()) + for (const MachineOperand &MO : llvm::drop_begin(MI.operands(), 2)) { + if (MO.isImplicit() || !MO.isReg()) continue; - Register Reg = MI.getOperand(i).getReg(); + Register Reg = MO.getReg(); if (Reg < ARM::R0 || Reg > ARM::R7) { if (!(MI.getOpcode() == ARM::tPUSH && Reg == ARM::LR) && !(MI.getOpcode() == ARM::tPOP_RET && Reg == ARM::PC)) { @@ -5748,17 +5737,17 @@ enum MachineOutlinerMBBFlags { }; struct OutlinerCosts { - const int CallTailCall; - const int FrameTailCall; - const int CallThunk; - const int FrameThunk; - const int CallNoLRSave; - const int FrameNoLRSave; - const int CallRegSave; - const int FrameRegSave; - const int CallDefault; - const int FrameDefault; - const int SaveRestoreLROnStack; + int CallTailCall; + int FrameTailCall; + int CallThunk; + int FrameThunk; + int CallNoLRSave; + int FrameNoLRSave; + int CallRegSave; + int FrameRegSave; + int CallDefault; + int FrameDefault; + int SaveRestoreLROnStack; OutlinerCosts(const ARMSubtarget &target) : CallTailCall(target.isThumb() ? 4 : 4), @@ -5879,6 +5868,24 @@ outliner::OutlinedFunction ARMBaseInstrInfo::getOutliningCandidateInfo( return outliner::OutlinedFunction(); } + // Partition the candidates in two sets: one with BTI enabled and one with BTI + // disabled. Remove the candidates from the smaller set. We expect the + // majority of the candidates to be in consensus with regard to branch target + // enforcement with just a few oddballs, but if they are the same number + // prefer the non-BTI ones for outlining, since they have less overhead. + auto NoBTI = + llvm::partition(RepeatedSequenceLocs, [](const outliner::Candidate &C) { + const ARMFunctionInfo &AFI = *C.getMF()->getInfo<ARMFunctionInfo>(); + return AFI.branchTargetEnforcement(); + }); + if (std::distance(RepeatedSequenceLocs.begin(), NoBTI) > + std::distance(NoBTI, RepeatedSequenceLocs.end())) + RepeatedSequenceLocs.erase(NoBTI, RepeatedSequenceLocs.end()); + else + RepeatedSequenceLocs.erase(RepeatedSequenceLocs.begin(), NoBTI); + if (RepeatedSequenceLocs.size() < 2) + return outliner::OutlinedFunction(); + // At this point, we have only "safe" candidates to outline. Figure out // frame + call instruction information. @@ -5892,6 +5899,16 @@ outliner::OutlinedFunction ARMBaseInstrInfo::getOutliningCandidateInfo( }; OutlinerCosts Costs(Subtarget); + const auto &SomeMFI = + *RepeatedSequenceLocs.front().getMF()->getInfo<ARMFunctionInfo>(); + // Adjust costs to account for the BTI instructions. + if (SomeMFI.branchTargetEnforcement()) { + Costs.FrameDefault += 4; + Costs.FrameNoLRSave += 4; + Costs.FrameRegSave += 4; + Costs.FrameTailCall += 4; + Costs.FrameThunk += 4; + } unsigned FrameID = MachineOutlinerDefault; unsigned NumBytesToCreateFrame = Costs.FrameDefault; @@ -6004,16 +6021,18 @@ bool ARMBaseInstrInfo::checkAndUpdateStackOffset(MachineInstr *MI, // Stack might be involved but addressing mode doesn't handle any offset. 
// Rq: AddrModeT1_[1|2|4] don't operate on SP - if (AddrMode == ARMII::AddrMode1 // Arithmetic instructions - || AddrMode == ARMII::AddrMode4 // Load/Store Multiple - || AddrMode == ARMII::AddrMode6 // Neon Load/Store Multiple - || AddrMode == ARMII::AddrModeT2_so // SP can't be used as based register - || AddrMode == ARMII::AddrModeT2_pc // PCrel access - || AddrMode == ARMII::AddrMode2 // Used by PRE and POST indexed LD/ST - || AddrMode == ARMII::AddrModeT2_i7 // v8.1-M MVE - || AddrMode == ARMII::AddrModeT2_i7s2 // v8.1-M MVE - || AddrMode == ARMII::AddrModeT2_i7s4 // v8.1-M sys regs VLDR/VSTR - || AddrMode == ARMII::AddrModeNone) + if (AddrMode == ARMII::AddrMode1 || // Arithmetic instructions + AddrMode == ARMII::AddrMode4 || // Load/Store Multiple + AddrMode == ARMII::AddrMode6 || // Neon Load/Store Multiple + AddrMode == ARMII::AddrModeT2_so || // SP can't be used as based register + AddrMode == ARMII::AddrModeT2_pc || // PCrel access + AddrMode == ARMII::AddrMode2 || // Used by PRE and POST indexed LD/ST + AddrMode == ARMII::AddrModeT2_i7 || // v8.1-M MVE + AddrMode == ARMII::AddrModeT2_i7s2 || // v8.1-M MVE + AddrMode == ARMII::AddrModeT2_i7s4 || // v8.1-M sys regs VLDR/VSTR + AddrMode == ARMII::AddrModeNone || + AddrMode == ARMII::AddrModeT2_i8 || // Pre/Post inc instructions + AddrMode == ARMII::AddrModeT2_i8neg) // Always negative imm return false; unsigned NumOps = MI->getDesc().getNumOperands(); @@ -6051,7 +6070,7 @@ bool ARMBaseInstrInfo::checkAndUpdateStackOffset(MachineInstr *MI, NumBits = 8; Scale = 2; break; - case ARMII::AddrModeT2_i8: + case ARMII::AddrModeT2_i8pos: NumBits = 8; break; case ARMII::AddrModeT2_i8s4: @@ -6089,7 +6108,18 @@ bool ARMBaseInstrInfo::checkAndUpdateStackOffset(MachineInstr *MI, } return false; +} + +void ARMBaseInstrInfo::mergeOutliningCandidateAttributes( + Function &F, std::vector<outliner::Candidate> &Candidates) const { + outliner::Candidate &C = Candidates.front(); + // branch-target-enforcement is guaranteed to be consistent between all + // candidates, so we only need to look at one. 
+ const Function &CFn = C.getMF()->getFunction(); + if (CFn.hasFnAttribute("branch-target-enforcement")) + F.addFnAttr(CFn.getFnAttribute("branch-target-enforcement")); + ARMGenInstrInfo::mergeOutliningCandidateAttributes(F, Candidates); } bool ARMBaseInstrInfo::isFunctionSafeToOutlineFrom( diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.h index db9320962e81..5fa912ae35d7 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -349,6 +349,8 @@ public: bool OutlineFromLinkOnceODRs) const override; outliner::OutlinedFunction getOutliningCandidateInfo( std::vector<outliner::Candidate> &RepeatedSequenceLocs) const override; + void mergeOutliningCandidateAttributes( + Function &F, std::vector<outliner::Candidate> &Candidates) const override; outliner::InstrType getOutliningType(MachineBasicBlock::iterator &MIT, unsigned Flags) const override; bool isMBBSafeToOutlineFrom(MachineBasicBlock &MBB, @@ -877,19 +879,23 @@ inline bool isLegalAddressImm(unsigned Opcode, int Imm, unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); switch (AddrMode) { case ARMII::AddrModeT2_i7: - return std::abs(Imm) < (((1 << 7) * 1) - 1); + return std::abs(Imm) < ((1 << 7) * 1); case ARMII::AddrModeT2_i7s2: - return std::abs(Imm) < (((1 << 7) * 2) - 1) && Imm % 2 == 0; + return std::abs(Imm) < ((1 << 7) * 2) && Imm % 2 == 0; case ARMII::AddrModeT2_i7s4: - return std::abs(Imm) < (((1 << 7) * 4) - 1) && Imm % 4 == 0; + return std::abs(Imm) < ((1 << 7) * 4) && Imm % 4 == 0; case ARMII::AddrModeT2_i8: - return std::abs(Imm) < (((1 << 8) * 1) - 1); - case ARMII::AddrMode2: - return std::abs(Imm) < (((1 << 12) * 1) - 1); - case ARMII::AddrModeT2_i12: - return Imm >= 0 && Imm < (((1 << 12) * 1) - 1); + return std::abs(Imm) < ((1 << 8) * 1); + case ARMII::AddrModeT2_i8pos: + return Imm >= 0 && Imm < ((1 << 8) * 1); + case ARMII::AddrModeT2_i8neg: + return Imm < 0 && -Imm < ((1 << 8) * 1); case ARMII::AddrModeT2_i8s4: - return std::abs(Imm) < (((1 << 8) * 4) - 1) && Imm % 4 == 0; + return std::abs(Imm) < ((1 << 8) * 4) && Imm % 4 == 0; + case ARMII::AddrModeT2_i12: + return Imm >= 0 && Imm < ((1 << 12) * 1); + case ARMII::AddrMode2: + return std::abs(Imm) < ((1 << 12) * 1); default: llvm_unreachable("Unhandled Addressing mode"); } diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBranchTargets.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBranchTargets.cpp new file mode 100644 index 000000000000..1091c1f970fa --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBranchTargets.cpp @@ -0,0 +1,135 @@ +//===-- ARMBranchTargets.cpp -- Harden code using v8.1-M BTI extension -----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass inserts BTI instructions at the start of every function and basic +// block which could be indirectly called. The hardware will (when enabled) +// trap when an indirect branch or call instruction targets an instruction +// which is not a valid BTI instruction. This is intended to guard against +// control-flow hijacking attacks. 
+// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMInstrInfo.h" +#include "ARMMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "arm-branch-targets" +#define ARM_BRANCH_TARGETS_NAME "ARM Branch Targets" + +namespace { +class ARMBranchTargets : public MachineFunctionPass { +public: + static char ID; + ARMBranchTargets() : MachineFunctionPass(ID) {} + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnMachineFunction(MachineFunction &MF) override; + StringRef getPassName() const override { return ARM_BRANCH_TARGETS_NAME; } + +private: + void addBTI(const ARMInstrInfo &TII, MachineBasicBlock &MBB, bool IsFirstBB); +}; +} // end anonymous namespace + +char ARMBranchTargets::ID = 0; + +INITIALIZE_PASS(ARMBranchTargets, "arm-branch-targets", ARM_BRANCH_TARGETS_NAME, + false, false) + +void ARMBranchTargets::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +FunctionPass *llvm::createARMBranchTargetsPass() { + return new ARMBranchTargets(); +} + +bool ARMBranchTargets::runOnMachineFunction(MachineFunction &MF) { + if (!MF.getInfo<ARMFunctionInfo>()->branchTargetEnforcement()) + return false; + + LLVM_DEBUG(dbgs() << "********** ARM Branch Targets **********\n" + << "********** Function: " << MF.getName() << '\n'); + const ARMInstrInfo &TII = + *static_cast<const ARMInstrInfo *>(MF.getSubtarget().getInstrInfo()); + + // LLVM does not consider basic blocks which are the targets of jump tables + // to be address-taken (the address can't escape anywhere else), but they are + // used for indirect branches, so need BTI instructions. + SmallPtrSet<const MachineBasicBlock *, 8> JumpTableTargets; + if (const MachineJumpTableInfo *JTI = MF.getJumpTableInfo()) + for (const MachineJumpTableEntry &JTE : JTI->getJumpTables()) + for (const MachineBasicBlock *MBB : JTE.MBBs) + JumpTableTargets.insert(MBB); + + bool MadeChange = false; + for (MachineBasicBlock &MBB : MF) { + bool NeedBTI = false; + bool IsFirstBB = &MBB == &MF.front(); + + // Every function can potentially be called indirectly (even if it has + // static linkage, due to linker-generated veneers). + if (IsFirstBB) + NeedBTI = true; + + // If the block itself is address-taken, or is an exception landing pad, it + // could be indirectly branched to. + if (MBB.hasAddressTaken() || MBB.isEHPad() || JumpTableTargets.count(&MBB)) + NeedBTI = true; + + if (NeedBTI) { + addBTI(TII, MBB, IsFirstBB); + MadeChange = true; + } + } + + return MadeChange; +} + +/// Insert a BTI/PACBTI instruction into a given basic block \c MBB. If +/// \c IsFirstBB is true (meaning that this is the first BB in a function) try +/// to find a PAC instruction and replace it with PACBTI. Otherwise just insert +/// a BTI instruction. +/// The point of insertion is in the beginning of the BB, immediately after meta +/// instructions (such labels in exception handling landing pads). 
+void ARMBranchTargets::addBTI(const ARMInstrInfo &TII, MachineBasicBlock &MBB, + bool IsFirstBB) { + // Which instruction to insert: BTI or PACBTI + unsigned OpCode = ARM::t2BTI; + + // Skip meta instructions, including EH labels + auto MBBI = llvm::find_if_not(MBB.instrs(), [](const MachineInstr &MI) { + return MI.isMetaInstruction(); + }); + + // If this is the first BB in a function, check if it starts with a PAC + // instruction and in that case remove the PAC instruction. + if (IsFirstBB) { + if (MBBI != MBB.instr_end() && MBBI->getOpcode() == ARM::t2PAC) { + LLVM_DEBUG(dbgs() << "Removing a 'PAC' instr from BB '" << MBB.getName() + << "' to replace with PACBTI\n"); + OpCode = ARM::t2PACBTI; + auto NextMBBI = std::next(MBBI); + MBBI->eraseFromParent(); + MBBI = NextMBBI; + } + } + + LLVM_DEBUG(dbgs() << "Inserting a '" + << (OpCode == ARM::t2BTI ? "BTI" : "PACBTI") + << "' instr into BB '" << MBB.getName() << "'\n"); + // Finally, insert a new instruction (either PAC or PACBTI) + BuildMI(MBB, MBBI, MBB.findDebugLoc(MBBI), TII.get(OpCode)); +} diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp index 121558276c3e..c2ca4708c208 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -184,6 +184,9 @@ namespace { /// base address. DenseMap<int, int> JumpTableUserIndices; + // Maps a MachineBasicBlock to the number of jump table entries. + DenseMap<const MachineBasicBlock *, int> BlockJumpTableRefCount; + /// ImmBranch - One per immediate branch, keeping the machine instruction /// pointer, conditional or unconditional, the max displacement, /// and (if isCond is true) the corresponding unconditional branch @@ -274,7 +277,10 @@ namespace { unsigned &DeadSize, bool &CanDeleteLEA, bool &BaseRegKill); bool optimizeThumb2JumpTables(); + void fixupBTI(unsigned JTI, MachineBasicBlock &OldBB, + MachineBasicBlock &NewBB); - MachineBasicBlock *adjustJTTargetBlockForward(MachineBasicBlock *BB, + MachineBasicBlock *adjustJTTargetBlockForward(unsigned JTI, + MachineBasicBlock *BB, MachineBasicBlock *JTBB); unsigned getUserOffset(CPUser&) const; @@ -518,6 +524,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { CPEntries.clear(); JumpTableEntryIndices.clear(); JumpTableUserIndices.clear(); + BlockJumpTableRefCount.clear(); ImmBranches.clear(); PushPopMIs.clear(); T2JumpTables.clear(); @@ -720,6 +727,14 @@ Align ARMConstantIslands::getCPEAlign(const MachineInstr *CPEMI) { return MCP->getConstants()[CPI].getAlign(); } +// Exception landing pads, blocks that have their address taken, and function +// entry blocks will always be (potential) indirect jump targets, regardless of +// whether or not they are referenced by jump tables. +static bool isAlwaysIndirectTarget(const MachineBasicBlock &MBB) { + return MBB.isEHPad() || MBB.hasAddressTaken() || + &MBB == &MBB.getParent()->front(); +} + /// scanFunctionJumpTables - Do a scan of the function, building up /// information about the sizes of each block and the locations of all /// the jump tables.
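The BlockJumpTableRefCount bookkeeping introduced above acts as a per-block reference count: blocks that are always valid indirect-branch targets get a saturated count so their BTI is never removed, while ordinary jump-table targets are counted per entry and may lose their BTI once the count drops to zero (see the hunks that follow). A minimal standalone sketch of that counting scheme, assuming simplified stand-ins (Block, JumpTable and countJumpTableRefs are illustrative names, not the MachineBasicBlock/MachineJumpTableInfo types the patch actually uses):

#include <climits>
#include <map>
#include <vector>

struct Block {
  bool IsEHPad = false;       // exception landing pad
  bool AddressTaken = false;  // block address escapes
  bool IsEntry = false;       // function entry block
};
using JumpTable = std::vector<const Block *>;

// Count how many jump-table entries reference each block. Blocks that are
// always indirect targets get a saturated count, so their BTI can never be
// dropped; other targets lose their BTI once the count reaches zero.
std::map<const Block *, int>
countJumpTableRefs(const std::vector<JumpTable> &Tables) {
  std::map<const Block *, int> RefCount;
  for (const JumpTable &JT : Tables)
    for (const Block *B : JT) {
      if (B->IsEHPad || B->AddressTaken || B->IsEntry)
        RefCount[B] = INT_MAX; // effectively infinite, never removable
      else
        ++RefCount[B];
    }
  return RefCount;
}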
@@ -730,6 +745,20 @@ void ARMConstantIslands::scanFunctionJumpTables() { (I.getOpcode() == ARM::t2BR_JT || I.getOpcode() == ARM::tBR_JTr)) T2JumpTables.push_back(&I); } + + if (!MF->getInfo<ARMFunctionInfo>()->branchTargetEnforcement()) + return; + + if (const MachineJumpTableInfo *JTI = MF->getJumpTableInfo()) + for (const MachineJumpTableEntry &JTE : JTI->getJumpTables()) + for (const MachineBasicBlock *MBB : JTE.MBBs) { + if (isAlwaysIndirectTarget(*MBB)) + // Set the reference count essentially to infinity, it will never + // reach zero and the BTI Instruction will never be removed. + BlockJumpTableRefCount[MBB] = std::numeric_limits<int>::max(); + else + ++BlockJumpTableRefCount[MBB]; + } } /// initializeFunctionInfo - Do the initial scan of the function, building up @@ -1219,9 +1248,9 @@ int ARMConstantIslands::findInRangeCPEntry(CPUser& U, unsigned UserOffset) { // Point the CPUser node to the replacement U.CPEMI = CPEs[i].CPEMI; // Change the CPI in the instruction operand to refer to the clone. - for (unsigned j = 0, e = UserMI->getNumOperands(); j != e; ++j) - if (UserMI->getOperand(j).isCPI()) { - UserMI->getOperand(j).setIndex(CPEs[i].CPI); + for (MachineOperand &MO : UserMI->operands()) + if (MO.isCPI()) { + MO.setIndex(CPEs[i].CPI); break; } // Adjust the refcount of the clone... @@ -1601,9 +1630,9 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex, BBUtils->adjustBBOffsetsAfter(&*--NewIsland->getIterator()); // Finally, change the CPI in the instruction operand to be ID. - for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i) - if (UserMI->getOperand(i).isCPI()) { - UserMI->getOperand(i).setIndex(ID); + for (MachineOperand &MO : UserMI->operands()) + if (MO.isCPI()) { + MO.setIndex(ID); break; } @@ -2211,8 +2240,7 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() { unsigned JTOffset = BBUtils->getOffsetOf(MI) + 4; const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs; BBInfoVector &BBInfo = BBUtils->getBBInfo(); - for (unsigned j = 0, ee = JTBBs.size(); j != ee; ++j) { - MachineBasicBlock *MBB = JTBBs[j]; + for (MachineBasicBlock *MBB : JTBBs) { unsigned DstOffset = BBInfo[MBB->getNumber()].Offset; // Negative offset is not ok. FIXME: We should change BB layout to make // sure all the branches are forward. @@ -2405,17 +2433,16 @@ bool ARMConstantIslands::reorderThumb2JumpTables() { // and try to adjust them such that that's true. int JTNumber = MI->getParent()->getNumber(); const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs; - for (unsigned j = 0, ee = JTBBs.size(); j != ee; ++j) { - MachineBasicBlock *MBB = JTBBs[j]; + for (MachineBasicBlock *MBB : JTBBs) { int DTNumber = MBB->getNumber(); if (DTNumber < JTNumber) { // The destination precedes the switch. Try to move the block forward // so we have a positive offset. 
MachineBasicBlock *NewBB = - adjustJTTargetBlockForward(MBB, MI->getParent()); + adjustJTTargetBlockForward(JTI, MBB, MI->getParent()); if (NewBB) - MJTI->ReplaceMBBInJumpTable(JTI, JTBBs[j], NewBB); + MJTI->ReplaceMBBInJumpTable(JTI, MBB, NewBB); MadeChange = true; } } @@ -2424,8 +2451,40 @@ bool ARMConstantIslands::reorderThumb2JumpTables() { return MadeChange; } -MachineBasicBlock *ARMConstantIslands:: -adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) { +void ARMConstantIslands::fixupBTI(unsigned JTI, MachineBasicBlock &OldBB, + MachineBasicBlock &NewBB) { + assert(isThumb2 && "BTI in Thumb1?"); + + // Insert a BTI instruction into NewBB + BuildMI(NewBB, NewBB.begin(), DebugLoc(), TII->get(ARM::t2BTI)); + + // Update jump table reference counts. + const MachineJumpTableInfo &MJTI = *MF->getJumpTableInfo(); + const MachineJumpTableEntry &JTE = MJTI.getJumpTables()[JTI]; + for (const MachineBasicBlock *MBB : JTE.MBBs) { + if (MBB != &OldBB) + continue; + --BlockJumpTableRefCount[MBB]; + ++BlockJumpTableRefCount[&NewBB]; + } + + // If the old basic block reference count dropped to zero, remove + // the BTI instruction at its beginning. + if (BlockJumpTableRefCount[&OldBB] > 0) + return; + + // Skip meta instructions + auto BTIPos = llvm::find_if_not(OldBB.instrs(), [](const MachineInstr &MI) { + return MI.isMetaInstruction(); + }); + assert(BTIPos->getOpcode() == ARM::t2BTI && + "BasicBlock is mentioned in a jump table but does not start with BTI"); + if (BTIPos->getOpcode() == ARM::t2BTI) + BTIPos->eraseFromParent(); +} + +MachineBasicBlock *ARMConstantIslands::adjustJTTargetBlockForward( + unsigned JTI, MachineBasicBlock *BB, MachineBasicBlock *JTBB) { // If the destination block is terminated by an unconditional branch, // try to move it; otherwise, create a new block following the jump // table that branches back to the actual target.
This is a very simple @@ -2483,6 +2542,9 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) { NewBB->addSuccessor(BB); JTBB->replaceSuccessor(BB, NewBB); + if (MF->getInfo<ARMFunctionInfo>()->branchTargetEnforcement()) + fixupBTI(JTI, *BB, *NewBB); + ++NumJTInserted; return NewBB; } diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index a8f09969e948..7a35f252b22a 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -125,9 +125,8 @@ void ARMExpandPseudo::TransferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI, MachineInstrBuilder &DefMI) { const MCInstrDesc &Desc = OldMI.getDesc(); - for (unsigned i = Desc.getNumOperands(), e = OldMI.getNumOperands(); - i != e; ++i) { - const MachineOperand &MO = OldMI.getOperand(i); + for (const MachineOperand &MO : + llvm::drop_begin(OldMI.operands(), Desc.getNumOperands())) { assert(MO.isReg() && MO.getReg()); if (MO.isUse()) UseMI.add(MO); @@ -2252,8 +2251,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, .add(predOps(ARMCC::AL)) .addReg(JumpReg, RegState::Kill); - for (int I = 1, E = MI.getNumOperands(); I != E; ++I) - NewCall->addOperand(MI.getOperand(I)); + for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) + NewCall->addOperand(MO); if (MI.isCandidateForCallSiteEntry()) MI.getMF()->moveCallSiteInfo(&MI, NewCall.getInstr()); @@ -2524,17 +2523,21 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case ARM::LDRLIT_ga_pcrel: case ARM::LDRLIT_ga_pcrel_ldr: case ARM::tLDRLIT_ga_abs: + case ARM::t2LDRLIT_ga_pcrel: case ARM::tLDRLIT_ga_pcrel: { Register DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); const MachineOperand &MO1 = MI.getOperand(1); auto Flags = MO1.getTargetFlags(); const GlobalValue *GV = MO1.getGlobal(); - bool IsARM = - Opcode != ARM::tLDRLIT_ga_pcrel && Opcode != ARM::tLDRLIT_ga_abs; + bool IsARM = Opcode != ARM::tLDRLIT_ga_pcrel && + Opcode != ARM::tLDRLIT_ga_abs && + Opcode != ARM::t2LDRLIT_ga_pcrel; bool IsPIC = Opcode != ARM::LDRLIT_ga_abs && Opcode != ARM::tLDRLIT_ga_abs; unsigned LDRLITOpc = IsARM ? ARM::LDRi12 : ARM::tLDRpci; + if (Opcode == ARM::t2LDRLIT_ga_pcrel) + LDRLITOpc = ARM::t2LDRpci; unsigned PICAddOpc = IsARM ? (Opcode == ARM::LDRLIT_ga_pcrel_ldr ? ARM::PICLDR : ARM::PICADD) @@ -3065,7 +3068,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::BL)); } MIB.cloneMemRefs(MI); - for (unsigned i = 1; i < MI.getNumOperands(); ++i) MIB.add(MI.getOperand(i)); + for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) + MIB.add(MO); MI.eraseFromParent(); return true; } @@ -3080,8 +3084,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, Opcode == ARM::LOADDUAL ? RegState::Define : 0) .addReg(TRI->getSubReg(PairReg, ARM::gsub_1), Opcode == ARM::LOADDUAL ? 
RegState::Define : 0); - for (unsigned i = 1; i < MI.getNumOperands(); i++) - MIB.add(MI.getOperand(i)); + for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) + MIB.add(MO); MIB.add(predOps(ARMCC::AL)); MIB.cloneMemRefs(MI); MI.eraseFromParent(); diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp index 025e43444f9c..b866cf952ff1 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -523,9 +523,9 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, } // Determine spill area sizes. - for (unsigned i = 0, e = CSI.size(); i != e; ++i) { - unsigned Reg = CSI[i].getReg(); - int FI = CSI[i].getFrameIdx(); + for (const CalleeSavedInfo &I : CSI) { + unsigned Reg = I.getReg(); + int FI = I.getFrameIdx(); switch (Reg) { case ARM::R8: case ARM::R9: @@ -1317,11 +1317,11 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB, // Mark the D-register spill slots as properly aligned. Since MFI computes // stack slot layout backwards, this can actually mean that the d-reg stack // slot offsets can be wrong. The offset for d8 will always be correct. - for (unsigned i = 0, e = CSI.size(); i != e; ++i) { - unsigned DNum = CSI[i].getReg() - ARM::D8; + for (const CalleeSavedInfo &I : CSI) { + unsigned DNum = I.getReg() - ARM::D8; if (DNum > NumAlignedDPRCS2Regs - 1) continue; - int FI = CSI[i].getFrameIdx(); + int FI = I.getFrameIdx(); // The even-numbered registers will be 16-byte aligned, the odd-numbered // registers will be 8-byte aligned. MFI.setObjectAlignment(FI, DNum % 2 ? Align(8) : Align(16)); @@ -1488,9 +1488,9 @@ static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB, // Find the frame index assigned to d8. int D8SpillFI = 0; - for (unsigned i = 0, e = CSI.size(); i != e; ++i) - if (CSI[i].getReg() == ARM::D8) { - D8SpillFI = CSI[i].getFrameIdx(); + for (const CalleeSavedInfo &I : CSI) + if (I.getReg() == ARM::D8) { + D8SpillFI = I.getFrameIdx(); break; } @@ -1693,7 +1693,7 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF, // Default 12 bit limit. 
break; case ARMII::AddrMode3: - case ARMII::AddrModeT2_i8: + case ARMII::AddrModeT2_i8neg: Limit = std::min(Limit, (1U << 8) - 1); break; case ARMII::AddrMode5FP16: diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index 2b83a292db76..bb2859c766c2 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -3274,7 +3274,8 @@ bool ARMDAGToDAGISel::tryFP_TO_INT(SDNode *N, SDLoc dl) { return false; unsigned int ScalarBits = Type.getScalarSizeInBits(); - bool IsUnsigned = N->getOpcode() == ISD::FP_TO_UINT; + bool IsUnsigned = N->getOpcode() == ISD::FP_TO_UINT || + N->getOpcode() == ISD::FP_TO_UINT_SAT; SDNode *Node = N->getOperand(0).getNode(); // floating-point to fixed-point with one fractional bit gets turned into an @@ -3764,6 +3765,8 @@ void ARMDAGToDAGISel::Select(SDNode *N) { break; case ISD::FP_TO_UINT: case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT_SAT: + case ISD::FP_TO_SINT_SAT: if (tryFP_TO_INT(N, dl)) return; break; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp index e7e10ce07a44..33d115945614 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1016,6 +1016,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::FP_EXTEND); setTargetDAGCombine(ISD::SELECT); setTargetDAGCombine(ISD::SELECT_CC); + setTargetDAGCombine(ISD::SETCC); + } + if (Subtarget->hasMVEFloatOps()) { + setTargetDAGCombine(ISD::FADD); } if (!Subtarget->hasFP64()) { @@ -10587,10 +10591,9 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, LPadList.reserve(CallSiteNumToLPad.size()); for (unsigned I = 1; I <= MaxCSNum; ++I) { SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I]; - for (SmallVectorImpl<MachineBasicBlock*>::iterator - II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) { - LPadList.push_back(*II); - InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end()); + for (MachineBasicBlock *MBB : MBBList) { + LPadList.push_back(MBB); + InvokeBBs.insert(MBB->pred_begin(), MBB->pred_end()); } } @@ -10879,9 +10882,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, // Add the jump table entries as successors to the MBB. SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs; - for (std::vector<MachineBasicBlock*>::iterator - I = LPadList.begin(), E = LPadList.end(); I != E; ++I) { - MachineBasicBlock *CurMBB = *I; + for (MachineBasicBlock *CurMBB : LPadList) { if (SeenMBBs.insert(CurMBB).second) DispContBB->addSuccessor(CurMBB); } @@ -10943,9 +10944,8 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, // Mark all former landing pads as non-landing pads. The dispatch is the only // landing pad now. - for (SmallVectorImpl<MachineBasicBlock*>::iterator - I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I) - (*I)->setIsEHPad(false); + for (MachineBasicBlock *MBBLPad : MBBLPads) + MBBLPad->setIsEHPad(false); // The instruction is gone now. 
MI.eraseFromParent(); @@ -11771,8 +11771,8 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break; } MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc)); - for (unsigned i = 0; i < MI.getNumOperands(); ++i) - MIB.add(MI.getOperand(i)); + for (const MachineOperand &MO : MI.operands()) + MIB.add(MO); MI.eraseFromParent(); return BB; } @@ -13083,6 +13083,65 @@ static SDValue PerformVSELECTCombine(SDNode *N, return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS); } +// Convert vsetcc([0,1,2,..], splat(n), ult) -> vctp n +static SDValue PerformVSetCCToVCTPCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); + EVT VT = N->getValueType(0); + + if (!Subtarget->hasMVEIntegerOps() || + !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + + if (CC == ISD::SETUGE) { + std::swap(Op0, Op1); + CC = ISD::SETULT; + } + + if (CC != ISD::SETULT || VT.getScalarSizeInBits() != 1 || + Op0.getOpcode() != ISD::BUILD_VECTOR) + return SDValue(); + + // Check first operand is BuildVector of 0,1,2,... + for (unsigned I = 0; I < VT.getVectorNumElements(); I++) { + if (!Op0.getOperand(I).isUndef() && + !(isa<ConstantSDNode>(Op0.getOperand(I)) && + Op0.getConstantOperandVal(I) == I)) + return SDValue(); + } + + // The second is a Splat of Op1S + SDValue Op1S = DCI.DAG.getSplatValue(Op1); + if (!Op1S) + return SDValue(); + + unsigned Opc; + switch (VT.getVectorNumElements()) { + case 2: + Opc = Intrinsic::arm_mve_vctp64; + break; + case 4: + Opc = Intrinsic::arm_mve_vctp32; + break; + case 8: + Opc = Intrinsic::arm_mve_vctp16; + break; + case 16: + Opc = Intrinsic::arm_mve_vctp8; + break; + default: + return SDValue(); + } + + SDLoc DL(N); + return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DCI.DAG.getConstant(Opc, DL, MVT::i32), + DCI.DAG.getZExtOrTrunc(Op1S, DL, MVT::i32)); +} + static SDValue PerformABSCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { @@ -13427,6 +13486,26 @@ bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const { return VT.isScalarInteger(); } +bool ARMTargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT, + EVT VT) const { + if (!isOperationLegalOrCustom(Op, VT) || !FPVT.isSimple()) + return false; + + switch (FPVT.getSimpleVT().SimpleTy) { + case MVT::f16: + return Subtarget->hasVFP2Base(); + case MVT::f32: + return Subtarget->hasVFP2Base(); + case MVT::f64: + return Subtarget->hasFP64(); + case MVT::v4f32: + case MVT::v8f16: + return Subtarget->hasMVEFloatOps(); + default: + return false; + } +} + static SDValue PerformSHLSimplify(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST) { @@ -14485,6 +14564,52 @@ static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +// Check that N is CMPZ(CSINC(0, 0, CC, X)), return X if valid. 
+static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC) { + if (Cmp->getOpcode() != ARMISD::CMPZ || !isNullConstant(Cmp->getOperand(1))) + return SDValue(); + SDValue CSInc = Cmp->getOperand(0); + if (CSInc.getOpcode() != ARMISD::CSINC || + !isNullConstant(CSInc.getOperand(0)) || + !isNullConstant(CSInc.getOperand(1)) || !CSInc->hasOneUse()) + return SDValue(); + CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2); + return CSInc.getOperand(3); +} + +static SDValue PerformCMPZCombine(SDNode *N, SelectionDAG &DAG) { + // Given CMPZ(CSINC(C, 0, 0, EQ), 0), we can just use C directly. As in + // t92: glue = ARMISD::CMPZ t74, 0 + // t93: i32 = ARMISD::CSINC 0, 0, 1, t92 + // t96: glue = ARMISD::CMPZ t93, 0 + // t114: i32 = ARMISD::CSINV 0, 0, 0, t96 + ARMCC::CondCodes Cond; + if (SDValue C = IsCMPZCSINC(N, Cond)) + if (Cond == ARMCC::EQ) + return C; + return SDValue(); +} + +static SDValue PerformCSETCombine(SDNode *N, SelectionDAG &DAG) { + // Fold away an unneccessary CMPZ/CSINC + // CSXYZ A, B, C1 (CMPZ (CSINC 0, 0, C2, D), 0) -> + // if C1==EQ -> CSXYZ A, B, C2, D + // if C1==NE -> CSXYZ A, B, NOT(C2), D + ARMCC::CondCodes Cond; + if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) { + if (N->getConstantOperandVal(2) == ARMCC::EQ) + return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0), + N->getOperand(1), + DAG.getConstant(Cond, SDLoc(N), MVT::i32), C); + if (N->getConstantOperandVal(2) == ARMCC::NE) + return DAG.getNode( + N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0), + N->getOperand(1), + DAG.getConstant(ARMCC::getOppositeCondition(Cond), SDLoc(N), MVT::i32), C); + } + return SDValue(); +} + /// PerformVMOVRRDCombine - Target-specific dag combine xforms for /// ARMISD::VMOVRRD. static SDValue PerformVMOVRRDCombine(SDNode *N, @@ -16411,6 +16536,42 @@ static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG, return FixConv; } +static SDValue PerformFAddVSelectCombine(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *Subtarget) { + if (!Subtarget->hasMVEFloatOps()) + return SDValue(); + + // Turn (fadd x, (vselect c, y, -0.0)) into (vselect c, (fadd x, y), x) + // The second form can be more easily turned into a predicated vadd, and + // possibly combined into a fma to become a predicated vfma. + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + EVT VT = N->getValueType(0); + SDLoc DL(N); + + // The identity element for a fadd is -0.0, which these VMOV's represent. + auto isNegativeZeroSplat = [&](SDValue Op) { + if (Op.getOpcode() != ISD::BITCAST || + Op.getOperand(0).getOpcode() != ARMISD::VMOVIMM) + return false; + if (VT == MVT::v4f32 && Op.getOperand(0).getConstantOperandVal(0) == 1664) + return true; + if (VT == MVT::v8f16 && Op.getOperand(0).getConstantOperandVal(0) == 2688) + return true; + return false; + }; + + if (Op0.getOpcode() == ISD::VSELECT && Op1.getOpcode() != ISD::VSELECT) + std::swap(Op0, Op1); + + if (Op1.getOpcode() != ISD::VSELECT || + !isNegativeZeroSplat(Op1.getOperand(2))) + return SDValue(); + SDValue FAdd = + DAG.getNode(ISD::FADD, DL, VT, Op0, Op1.getOperand(1), N->getFlags()); + return DAG.getNode(ISD::VSELECT, DL, VT, Op1.getOperand(0), FAdd, Op0); +} + /// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD) /// can replace combinations of VCVT (integer to floating-point) and VDIV /// when the VDIV has a constant operand that is a power of 2. 
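The PerformFAddVSelectCombine hunk above rests on -0.0 being the identity element of fadd, so selecting -0.0 for the inactive lanes leaves those lanes equal to x after the add. A small standalone check of the scalar analogue of that rewrite, as a sketch only (plain C++; the helper names before/after are illustrative and not taken from the patch):

#include <cassert>

// Scalar analogue of the lane-wise rewrite:
//   x + (c ? y : -0.0f)  ==  c ? (x + y) : x
// because x + (-0.0f) == x for every x, including x == +0.0f.
static float before(bool c, float x, float y) { return x + (c ? y : -0.0f); }
static float after(bool c, float x, float y) { return c ? (x + y) : x; }

int main() {
  const float Xs[] = {0.0f, -0.0f, 1.5f, -3.25f};
  const float Ys[] = {0.0f, 2.0f, -7.5f};
  for (float x : Xs)
    for (float y : Ys)
      for (bool c : {false, true})
        assert(before(c, x, y) == after(c, x, y));
  return 0;
}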
@@ -17049,18 +17210,6 @@ static SDValue PerformShiftCombine(SDNode *N, const ARMSubtarget *ST) { SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); - if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) { - // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high - // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16. - SDValue N1 = N->getOperand(1); - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) { - SDValue N0 = N->getOperand(0); - if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP && - DAG.MaskedValueIsZero(N0.getOperand(0), - APInt::getHighBitsSet(32, 16))) - return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1); - } - } if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 && N->getOperand(0)->getOpcode() == ISD::AND && @@ -18173,6 +18322,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ISD::SELECT_CC: case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget); case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget); + case ISD::SETCC: return PerformVSetCCToVCTPCombine(N, DCI, Subtarget); case ISD::ABS: return PerformABSCombine(N, DCI, Subtarget); case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget); case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget); @@ -18205,6 +18355,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return PerformVCVTCombine(N, DCI.DAG, Subtarget); + case ISD::FADD: + return PerformFAddVSelectCombine(N, DCI.DAG, Subtarget); case ISD::FDIV: return PerformVDIVCombine(N, DCI.DAG, Subtarget); case ISD::INTRINSIC_WO_CHAIN: @@ -18228,6 +18380,12 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, return PerformCMOVCombine(N, DCI.DAG); case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG); + case ARMISD::CMPZ: + return PerformCMPZCombine(N, DCI.DAG); + case ARMISD::CSINC: + case ARMISD::CSINV: + case ARMISD::CSNEG: + return PerformCSETCombine(N, DCI.DAG); case ISD::LOAD: return PerformLOADCombine(N, DCI, Subtarget); case ARMISD::VLD1DUP: diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.h index 0fddd58e178e..e3b422358cae 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.h @@ -736,6 +736,8 @@ class VectorType; bool preferIncOfAddToSubOfNot(EVT VT) const override; + bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override; + protected: std::pair<const TargetRegisterClass *, uint8_t> findRepresentativeClass(const TargetRegisterInfo *TRI, diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrFormats.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrFormats.td index de351372abf2..ff5afd787c82 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrFormats.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrFormats.td @@ -103,15 +103,17 @@ def AddrModeT1_4 : AddrMode<9>; def AddrModeT1_s : AddrMode<10>; def AddrModeT2_i12 : AddrMode<11>; def AddrModeT2_i8 : AddrMode<12>; -def AddrModeT2_so : AddrMode<13>; -def AddrModeT2_pc : AddrMode<14>; -def AddrModeT2_i8s4 : AddrMode<15>; -def AddrMode_i12 : AddrMode<16>; -def AddrMode5FP16 : AddrMode<17>; -def AddrModeT2_ldrex : AddrMode<18>; -def AddrModeT2_i7s4 : AddrMode<19>; -def AddrModeT2_i7s2 : AddrMode<20>; -def AddrModeT2_i7 : AddrMode<21>; +def AddrModeT2_i8pos : AddrMode<13>; +def AddrModeT2_i8neg : AddrMode<14>; +def 
AddrModeT2_so : AddrMode<15>; +def AddrModeT2_pc : AddrMode<16>; +def AddrModeT2_i8s4 : AddrMode<17>; +def AddrMode_i12 : AddrMode<18>; +def AddrMode5FP16 : AddrMode<19>; +def AddrModeT2_ldrex : AddrMode<20>; +def AddrModeT2_i7s4 : AddrMode<21>; +def AddrModeT2_i7s2 : AddrMode<22>; +def AddrModeT2_i7 : AddrMode<23>; // Load / store index mode. class IndexMode<bits<2> val> { @@ -1392,9 +1394,12 @@ class T2I<dag oops, dag iops, InstrItinClass itin, class T2Ii12<dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : Thumb2I<oops, iops, AddrModeT2_i12, 4, itin, opc, asm, "",pattern>; -class T2Ii8<dag oops, dag iops, InstrItinClass itin, - string opc, string asm, list<dag> pattern> - : Thumb2I<oops, iops, AddrModeT2_i8, 4, itin, opc, asm, "", pattern>; +class T2Ii8p<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb2I<oops, iops, AddrModeT2_i8pos, 4, itin, opc, asm, "", pattern>; +class T2Ii8n<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb2I<oops, iops, AddrModeT2_i8neg, 4, itin, opc, asm, "", pattern>; class T2Iso<dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : Thumb2I<oops, iops, AddrModeT2_so, 4, itin, opc, asm, "", pattern>; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.td index 7d0bc756e882..1c1db473f866 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -420,6 +420,12 @@ def lo16AllZero : PatLeaf<(i32 imm), [{ return (((uint32_t)N->getZExtValue()) & 0xFFFFUL) == 0; }], hi16>; +// top16Zero - answer true if the upper 16 bits of $src are 0, false otherwise +def top16Zero: PatLeaf<(i32 GPR:$src), [{ + return !SDValue(N,0)->getValueType(0).isVector() && + CurDAG->MaskedValueIsZero(SDValue(N,0), APInt::getHighBitsSet(32, 16)); + }]>; + class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>; class UnOpFrag <dag res> : PatFrag<(ops node:$Src), res>; @@ -4748,6 +4754,8 @@ def : ARMV6Pat<(srl (bswap (extloadi16 addrmode3:$addr)), (i32 16)), (REV16 (LDRH addrmode3:$addr))>; def : ARMV6Pat<(truncstorei16 (srl (bswap GPR:$Rn), (i32 16)), addrmode3:$addr), (STRH (REV16 GPR:$Rn), addrmode3:$addr)>; +def : ARMV6Pat<(srl (bswap top16Zero:$Rn), (i32 16)), + (REV16 GPR:$Rn)>; let AddedComplexity = 5 in def REVSH : AMiscA1I<0b01101111, 0b1011, (outs GPR:$Rd), (ins GPR:$Rm), diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td index 697730037277..f53814a80e01 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -3621,21 +3621,24 @@ class MVE_VMUL_fp<string iname, string suffix, bits<2> size, list<dag> pattern=[ let validForTailPredication = 1; } -multiclass MVE_VMULT_fp_m<string iname, MVEVectorVTInfo VTI, - SDNode Op, Intrinsic PredInt> { +multiclass MVE_VMULT_fp_m<string iname, MVEVectorVTInfo VTI, SDNode Op, + Intrinsic PredInt, SDPatternOperator IdentityVec> { def "" : MVE_VMUL_fp<iname, VTI.Suffix, VTI.Size>; defvar Inst = !cast<Instruction>(NAME); let Predicates = [HasMVEFloat] in { - defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? ), !cast<Instruction>(NAME)>; + defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? 
), !cast<Instruction>(NAME), IdentityVec>; } } -multiclass MVE_VMUL_fp_m<MVEVectorVTInfo VTI> - : MVE_VMULT_fp_m<"vmul", VTI, fmul, int_arm_mve_mul_predicated>; +multiclass MVE_VMUL_fp_m<MVEVectorVTInfo VTI, SDPatternOperator IdentityVec> + : MVE_VMULT_fp_m<"vmul", VTI, fmul, int_arm_mve_mul_predicated, IdentityVec>; + +def ARMimmOneF: PatLeaf<(bitconvert (v4f32 (ARMvmovFPImm (i32 112))))>; // 1.0 float +def ARMimmOneH: PatLeaf<(bitconvert (v8i16 (ARMvmovImm (i32 2620))))>; // 1.0 half -defm MVE_VMULf32 : MVE_VMUL_fp_m<MVE_v4f32>; -defm MVE_VMULf16 : MVE_VMUL_fp_m<MVE_v8f16>; +defm MVE_VMULf32 : MVE_VMUL_fp_m<MVE_v4f32, ARMimmOneF>; +defm MVE_VMULf16 : MVE_VMUL_fp_m<MVE_v8f16, ARMimmOneH>; class MVE_VCMLA<string suffix, bits<2> size> : MVEFloatArithNeon<"vcmla", suffix, size{1}, (outs MQPR:$Qd), @@ -3747,27 +3750,30 @@ defm MVE_VFMSf32 : MVE_VFMA_fp_multi<"vfms", 1, MVE_v4f32>; defm MVE_VFMSf16 : MVE_VFMA_fp_multi<"vfms", 1, MVE_v8f16>; multiclass MVE_VADDSUB_fp_m<string iname, bit bit_21, MVEVectorVTInfo VTI, - SDNode Op, Intrinsic PredInt> { + SDNode Op, Intrinsic PredInt, SDPatternOperator IdentityVec> { def "" : MVE_VADDSUBFMA_fp<iname, VTI.Suffix, VTI.Size, 0, 1, bit_21> { let validForTailPredication = 1; } defvar Inst = !cast<Instruction>(NAME); let Predicates = [HasMVEFloat] in { - defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? ), !cast<Instruction>(NAME)>; + defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? ), !cast<Instruction>(NAME), IdentityVec>; } } -multiclass MVE_VADD_fp_m<MVEVectorVTInfo VTI> - : MVE_VADDSUB_fp_m<"vadd", 0, VTI, fadd, int_arm_mve_add_predicated>; -multiclass MVE_VSUB_fp_m<MVEVectorVTInfo VTI> - : MVE_VADDSUB_fp_m<"vsub", 1, VTI, fsub, int_arm_mve_sub_predicated>; +multiclass MVE_VADD_fp_m<MVEVectorVTInfo VTI, SDPatternOperator IdentityVec> + : MVE_VADDSUB_fp_m<"vadd", 0, VTI, fadd, int_arm_mve_add_predicated, IdentityVec>; +multiclass MVE_VSUB_fp_m<MVEVectorVTInfo VTI, SDPatternOperator IdentityVec> + : MVE_VADDSUB_fp_m<"vsub", 1, VTI, fsub, int_arm_mve_sub_predicated, IdentityVec>; -defm MVE_VADDf32 : MVE_VADD_fp_m<MVE_v4f32>; -defm MVE_VADDf16 : MVE_VADD_fp_m<MVE_v8f16>; +def ARMimmMinusZeroF: PatLeaf<(bitconvert (v4i32 (ARMvmovImm (i32 1664))))>; // -0.0 float +def ARMimmMinusZeroH: PatLeaf<(bitconvert (v8i16 (ARMvmovImm (i32 2688))))>; // -0.0 half -defm MVE_VSUBf32 : MVE_VSUB_fp_m<MVE_v4f32>; -defm MVE_VSUBf16 : MVE_VSUB_fp_m<MVE_v8f16>; +defm MVE_VADDf32 : MVE_VADD_fp_m<MVE_v4f32, ARMimmMinusZeroF>; +defm MVE_VADDf16 : MVE_VADD_fp_m<MVE_v8f16, ARMimmMinusZeroH>; + +defm MVE_VSUBf32 : MVE_VSUB_fp_m<MVE_v4f32, ARMimmAllZerosV>; +defm MVE_VSUBf16 : MVE_VSUB_fp_m<MVE_v8f16, ARMimmAllZerosV>; class MVE_VCADD<string suffix, bits<2> size, string cstr=""> : MVEFloatArithNeon<"vcadd", suffix, size{1}, (outs MQPR:$Qd), @@ -5373,22 +5379,22 @@ defm MVE_VHSUB_qr_u16 : MVE_VHSUB_qr_m<MVE_v8u16>; defm MVE_VHSUB_qr_u32 : MVE_VHSUB_qr_m<MVE_v4u32>; multiclass MVE_VADDSUB_qr_f<string iname, MVEVectorVTInfo VTI, bit subtract, - SDNode Op, Intrinsic PredInt> { + SDNode Op, Intrinsic PredInt, SDPatternOperator IdentityVec> { def "" : MVE_VxADDSUB_qr<iname, VTI.Suffix, VTI.Size{0}, 0b11, subtract, VTI.Size>; defm : MVE_TwoOpPatternDup<VTI, Op, PredInt, (? 
), - !cast<Instruction>(NAME)>; + !cast<Instruction>(NAME), IdentityVec>; } let Predicates = [HasMVEFloat] in { defm MVE_VADD_qr_f32 : MVE_VADDSUB_qr_f<"vadd", MVE_v4f32, 0b0, fadd, - int_arm_mve_add_predicated>; + int_arm_mve_add_predicated, ARMimmMinusZeroF>; defm MVE_VADD_qr_f16 : MVE_VADDSUB_qr_f<"vadd", MVE_v8f16, 0b0, fadd, - int_arm_mve_add_predicated>; + int_arm_mve_add_predicated, ARMimmMinusZeroH>; defm MVE_VSUB_qr_f32 : MVE_VADDSUB_qr_f<"vsub", MVE_v4f32, 0b1, fsub, - int_arm_mve_sub_predicated>; + int_arm_mve_sub_predicated, ARMimmAllZerosV>; defm MVE_VSUB_qr_f16 : MVE_VADDSUB_qr_f<"vsub", MVE_v8f16, 0b1, fsub, - int_arm_mve_sub_predicated>; + int_arm_mve_sub_predicated, ARMimmAllZerosV>; } class MVE_VxSHL_qr<string iname, string suffix, bit U, bits<2> size, @@ -5567,16 +5573,16 @@ defm MVE_VQRDMULH_qr_s8 : MVE_VQRDMULH_qr_m<MVE_v16s8>; defm MVE_VQRDMULH_qr_s16 : MVE_VQRDMULH_qr_m<MVE_v8s16>; defm MVE_VQRDMULH_qr_s32 : MVE_VQRDMULH_qr_m<MVE_v4s32>; -multiclass MVE_VxxMUL_qr_f_m<MVEVectorVTInfo VTI> { +multiclass MVE_VxxMUL_qr_f_m<MVEVectorVTInfo VTI, SDPatternOperator IdentityVec> { let validForTailPredication = 1 in def "" : MVE_VxxMUL_qr<"vmul", VTI.Suffix, VTI.Size{0}, 0b11, VTI.Size>; defm : MVE_TwoOpPatternDup<VTI, fmul, int_arm_mve_mul_predicated, (? ), - !cast<Instruction>(NAME)>; + !cast<Instruction>(NAME), IdentityVec>; } let Predicates = [HasMVEFloat] in { - defm MVE_VMUL_qr_f16 : MVE_VxxMUL_qr_f_m<MVE_v8f16>; - defm MVE_VMUL_qr_f32 : MVE_VxxMUL_qr_f_m<MVE_v4f32>; + defm MVE_VMUL_qr_f16 : MVE_VxxMUL_qr_f_m<MVE_v8f16, ARMimmOneH>; + defm MVE_VMUL_qr_f32 : MVE_VxxMUL_qr_f_m<MVE_v4f32, ARMimmOneF>; } class MVE_VFMAMLA_qr<string iname, string suffix, diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb.td index bf717a4056e9..f09ad8167600 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb.td @@ -1576,6 +1576,8 @@ def : T1Pat<(srl (bswap (extloadi16 t_addrmode_is2:$addr)), (i32 16)), (tREV16 (tLDRHi t_addrmode_is2:$addr))>; def : T1Pat<(srl (bswap (extloadi16 t_addrmode_rr:$addr)), (i32 16)), (tREV16 (tLDRHr t_addrmode_rr:$addr))>; +def : T1Pat<(srl (bswap top16Zero:$Rn), (i32 16)), + (tREV16 tGPR:$Rn)>; def : T1Pat<(truncstorei16 (srl (bswap tGPR:$Rn), (i32 16)), t_addrmode_is2:$addr), (tSTRHi(tREV16 tGPR:$Rn), t_addrmode_is2:$addr)>; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb2.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb2.td index 783db9dde17f..4471317f4ea4 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -1191,9 +1191,9 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc, let DecoderMethod = "DecodeT2LoadImm12"; } - def i8 : T2Ii8 <(outs target:$Rt), (ins t2addrmode_negimm8:$addr), iii, - opc, "\t$Rt, $addr", - [(set target:$Rt, (opnode t2addrmode_negimm8:$addr))]>, + def i8 : T2Ii8n <(outs target:$Rt), (ins t2addrmode_negimm8:$addr), iii, + opc, "\t$Rt, $addr", + [(set target:$Rt, (opnode t2addrmode_negimm8:$addr))]>, Sched<[WriteLd]> { bits<4> Rt; bits<13> addr; @@ -1284,9 +1284,9 @@ multiclass T2I_st<bits<2> opcod, string opc, let Inst{23} = addr{12}; // U let Inst{11-0} = addr{11-0}; // imm } - def i8 : T2Ii8 <(outs), (ins target:$Rt, t2addrmode_negimm8:$addr), iii, - opc, "\t$Rt, $addr", - [(opnode target:$Rt, t2addrmode_negimm8:$addr)]>, + def i8 : T2Ii8n <(outs), (ins 
target:$Rt, t2addrmode_negimm8:$addr), iii, + opc, "\t$Rt, $addr", + [(opnode target:$Rt, t2addrmode_negimm8:$addr)]>, Sched<[WriteST]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0000; @@ -1580,8 +1580,8 @@ def t2LDR_POST_imm : t2AsmPseudo<"ldr${p}.w $Rt, $Rn, $imm", // LDRT, LDRBT, LDRHT, LDRSBT, LDRSHT all have offset mode (PUW=0b110). // Ref: A8.6.57 LDR (immediate, Thumb) Encoding T4 class T2IldT<bit signed, bits<2> type, string opc, InstrItinClass ii> - : T2Ii8<(outs rGPR:$Rt), (ins t2addrmode_posimm8:$addr), ii, opc, - "\t$Rt, $addr", []>, Sched<[WriteLd]> { + : T2Ii8p<(outs rGPR:$Rt), (ins t2addrmode_posimm8:$addr), ii, opc, + "\t$Rt, $addr", []>, Sched<[WriteLd]> { bits<4> Rt; bits<13> addr; let Inst{31-27} = 0b11111; @@ -1747,8 +1747,8 @@ def t2STR_POST_imm : t2AsmPseudo<"str${p}.w $Rt, $Rn, $imm", // only. // Ref: A8.6.193 STR (immediate, Thumb) Encoding T4 class T2IstT<bits<2> type, string opc, InstrItinClass ii> - : T2Ii8<(outs), (ins rGPR:$Rt, t2addrmode_imm8:$addr), ii, opc, - "\t$Rt, $addr", []>, Sched<[WriteST]> { + : T2Ii8p<(outs), (ins rGPR:$Rt, t2addrmode_posimm8:$addr), ii, opc, + "\t$Rt, $addr", []>, Sched<[WriteST]> { let Inst{31-27} = 0b11111; let Inst{26-25} = 0b00; let Inst{24} = 0; // not signed @@ -1851,8 +1851,8 @@ multiclass T2Ipl<bits<1> write, bits<1> instr, string opc> { let DecoderMethod = "DecodeT2LoadImm12"; } - def i8 : T2Ii8<(outs), (ins t2addrmode_negimm8:$addr), IIC_Preload, opc, - "\t$addr", + def i8 : T2Ii8n<(outs), (ins t2addrmode_negimm8:$addr), IIC_Preload, opc, + "\t$addr", [(ARMPreload t2addrmode_negimm8:$addr, (i32 write), (i32 instr))]>, Sched<[WritePreLd]> { let Inst{31-25} = 0b1111100; @@ -2926,18 +2926,11 @@ let AddedComplexity = 1 in def : T2Pat<(and rGPR:$src, t2_so_imm_not:$imm), (t2BICri rGPR:$src, t2_so_imm_not:$imm)>; -// top16Zero - answer true if the upper 16 bits of $src are 0, false otherwise -def top16Zero: PatLeaf<(i32 rGPR:$src), [{ - return !SDValue(N,0)->getValueType(0).isVector() && - CurDAG->MaskedValueIsZero(SDValue(N,0), APInt::getHighBitsSet(32, 16)); - }]>; - // so_imm_notSext is needed instead of so_imm_not, as the value of imm // will match the extended, not the original bitWidth for $src. def : T2Pat<(and top16Zero:$src, t2_so_imm_notSext:$imm), (t2BICri rGPR:$src, t2_so_imm_notSext:$imm)>; - // FIXME: Disable this pattern on Darwin to workaround an assembler bug. 
def : T2Pat<(or rGPR:$src, t2_so_imm_not:$imm), (t2ORNri rGPR:$src, t2_so_imm_not:$imm)>, @@ -3283,6 +3276,9 @@ def t2REV16 : T2I_misc<0b01, 0b01, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr, [(set rGPR:$Rd, (rotr (bswap rGPR:$Rm), (i32 16)))]>, Sched<[WriteALU]>; +def : T2Pat<(srl (bswap top16Zero:$Rn), (i32 16)), + (t2REV16 rGPR:$Rn)>; + def t2REVSH : T2I_misc<0b01, 0b11, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr, "revsh", ".w\t$Rd, $Rm", [(set rGPR:$Rd, (sra (bswap rGPR:$Rm), (i32 16)))]>, @@ -4059,6 +4055,8 @@ def t2HINT : T2I<(outs), (ins imm0_239:$imm), NoItinerary, "hint", ".w\t$imm", bits<8> imm; let Inst{31-3} = 0b11110011101011111000000000000; let Inst{7-0} = imm; + + let DecoderMethod = "DecodeT2HintSpaceInstruction"; } def : t2InstAlias<"hint$p $imm", (t2HINT imm0_239:$imm, pred:$p), 0>; @@ -4079,6 +4077,11 @@ def : t2InstAlias<"esb$p", (t2HINT 16, pred:$p), 0> { def : t2InstAlias<"csdb$p.w", (t2HINT 20, pred:$p), 0>; def : t2InstAlias<"csdb$p", (t2HINT 20, pred:$p), 1>; +def : t2InstAlias<"pacbti$p r12,lr,sp", (t2HINT 13, pred:$p), 1>; +def : t2InstAlias<"bti$p", (t2HINT 15, pred:$p), 1>; +def : t2InstAlias<"pac$p r12,lr,sp", (t2HINT 29, pred:$p), 1>; +def : t2InstAlias<"aut$p r12,lr,sp", (t2HINT 45, pred:$p), 1>; + def t2DBG : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "dbg", "\t$opt", [(int_arm_dbg imm0_15:$opt)]> { bits<4> opt; @@ -4254,6 +4257,19 @@ def : T2Pat<(ARMWrapper tglobaladdr :$dst), (t2MOVi32imm tglobaladdr :$dst)>, def : T2Pat<(ARMWrapperJT tjumptable:$dst), (t2LEApcrelJT tjumptable:$dst)>; +let hasNoSchedulingInfo = 1 in { +def t2LDRLIT_ga_pcrel : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr), + IIC_iLoadiALU, + [(set rGPR:$dst, + (ARMWrapperPIC tglobaladdr:$addr))]>, + Requires<[IsThumb, HasV8MBaseline, DontUseMovtInPic]>; +} + +// TLS globals +def : Pat<(ARMWrapperPIC tglobaltlsaddr:$addr), + (t2LDRLIT_ga_pcrel tglobaltlsaddr:$addr)>, + Requires<[IsThumb, HasV8MBaseline, DontUseMovtInPic]>; + // Pseudo instruction that combines ldr from constpool and add pc. This should // be expanded into two instructions late to allow if-conversion and // scheduling. 
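The (srl (bswap top16Zero:$Rn), (i32 16)) -> REV16 patterns added above for ARM, Thumb1 and Thumb2 rest on a simple identity: when the upper 16 bits of the source are known zero, a full byte swap followed by lsr #16 produces the same value as reversing the bytes within each halfword. A standalone check, with bswap and rev16 modelled by hand (illustrative helpers, not LLVM code):

#include <cassert>
#include <cstdint>

static uint32_t bswap32(uint32_t X) {
  return (X >> 24) | ((X >> 8) & 0x0000FF00u) | ((X << 8) & 0x00FF0000u) | (X << 24);
}

// Model of REV16: reverse the bytes within each 16-bit half.
static uint32_t rev16(uint32_t X) {
  return ((X << 8) & 0xFF00FF00u) | ((X >> 8) & 0x00FF00FFu);
}

int main() {
  // With the top 16 bits zero, "bswap then lsr #16" equals rev16.
  const uint32_t TopZero[] = {0x0000ABCDu, 0x00000001u, 0x0000FF00u, 0u};
  for (uint32_t X : TopZero)
    assert((bswap32(X) >> 16) == rev16(X));

  // Without the top16Zero guard the equivalence does not hold.
  assert((bswap32(0x1234ABCDu) >> 16) != rev16(0x1234ABCDu));
  return 0;
}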
@@ -5607,6 +5623,15 @@ let Predicates = [HasV8_1MMainline] in { defm : CSPats<ARMcsinv, t2CSINV>; defm : CSPats<ARMcsneg, t2CSNEG>; + def : T2Pat<(ARMcmov (i32 1), (i32 0), cmovpred:$imm), + (t2CSINC ZR, ZR, imm0_31:$imm)>; + def : T2Pat<(ARMcmov (i32 -1), (i32 0), cmovpred:$imm), + (t2CSINV ZR, ZR, imm0_31:$imm)>; + def : T2Pat<(ARMcmov (i32 0), (i32 1), cmovpred:$imm), + (t2CSINC ZR, ZR, (inv_cond_XFORM imm:$imm))>; + def : T2Pat<(ARMcmov (i32 0), (i32 -1), cmovpred:$imm), + (t2CSINV ZR, ZR, (inv_cond_XFORM imm:$imm))>; + multiclass ModifiedV8_1CSEL<Instruction Insn, dag modvalue> { def : T2Pat<(ARMcmov modvalue, GPRwithZR:$tval, cmovpred:$imm), (Insn GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm)>; @@ -5636,3 +5661,78 @@ let Predicates = [HasV8_1MMainline] in { def : InstAlias<"cneg\t$Rd, $Rn, $fcond", (t2CSNEG rGPR:$Rd, GPRwithZRnosp:$Rn, GPRwithZRnosp:$Rn, pred_noal_inv:$fcond)>; } + + +// PACBTI +let Predicates = [IsThumb2, HasV8_1MMainline, HasPACBTI] in { +def t2PACG : V8_1MI<(outs rGPR:$Rd), + (ins pred:$p, GPRnopc:$Rn, GPRnopc:$Rm), + AddrModeNone, NoItinerary, "pacg${p}", "$Rd, $Rn, $Rm", "", []> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + let Inst{31-20} = 0b111110110110; + let Inst{19-16} = Rn; + let Inst{15-12} = 0b1111; + let Inst{11-8} = Rd; + let Inst{7-4} = 0b0000; + let Inst{3-0} = Rm; +} + +let hasSideEffects = 1 in { +class PACBTIAut<dag iops, string asm, bit b> + : V8_1MI<(outs), iops, + AddrModeNone, NoItinerary, asm, "$Ra, $Rn, $Rm", "", []> { + bits<4> Ra; + bits<4> Rn; + bits<4> Rm; + let Inst{31-20} = 0b111110110101; + let Inst{19-16} = Rn; + let Inst{15-12} = Ra; + let Inst{11-5} = 0b1111000; + let Inst{4} = b; + let Inst{3-0} = Rm; +} +} + +def t2AUTG : PACBTIAut<(ins pred:$p, GPRnosp:$Ra, GPRnopc:$Rn, GPRnopc:$Rm), + "autg${p}", 0>; + +let isBranch = 1, isTerminator = 1, isIndirectBranch = 1 in { + def t2BXAUT : PACBTIAut<(ins pred:$p, GPRnosp:$Ra, rGPR:$Rn, GPRnopc:$Rm), + "bxaut${p}", 1>; +} +} + + +class PACBTIHintSpaceInst<string asm, string ops, bits<8> imm> + : V8_1MI<(outs), (ins), AddrModeNone, NoItinerary, asm, ops, "", []> { + let Inst{31-8} = 0b111100111010111110000000; + let Inst{7-0} = imm; + + let Unpredictable{19-16} = 0b1111; + let Unpredictable{13-11} = 0b101; + + let DecoderMethod = "DecodeT2HintSpaceInstruction"; +} + +class PACBTIHintSpaceNoOpsInst<string asm, bits<8> imm> + : PACBTIHintSpaceInst<asm, "", imm>; + +class PACBTIHintSpaceDefInst<string asm, bits<8> imm> + : PACBTIHintSpaceInst<asm, "r12, lr, sp", imm> { + let Defs = [R12]; + let Uses = [LR, SP]; +} + +class PACBTIHintSpaceUseInst<string asm, bits<8> imm> + : PACBTIHintSpaceInst<asm, "r12, lr, sp", imm> { + let Uses = [R12, LR, SP]; +} + +def t2PAC : PACBTIHintSpaceDefInst<"pac", 0b00011101>; +def t2PACBTI : PACBTIHintSpaceDefInst<"pacbti", 0b00001101>; +def t2BTI : PACBTIHintSpaceNoOpsInst<"bti", 0b00001111>; +def t2AUT : PACBTIHintSpaceUseInst<"aut", 0b00101101> { + let hasSideEffects = 1; +} diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 6e259b1baf97..3b10c60a0654 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -1298,8 +1298,8 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) { // Can't use an updating ld/st if the base register is also a dest // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined. 
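For reference, the PACBTI additions above and below agree on the hint-space immediates: the t2InstAlias entries map pacbti, bti, pac and aut to hints 13, 15, 29 and 45, the t2PACBTI/t2BTI/t2PAC/t2AUT encodings use the matching 0b00001101, 0b00001111, 0b00011101 and 0b00101101, and the disassembler hook added later tests the same values as 0x0D, 0x0F, 0x1D and 0x2D. A tiny illustrative mapping; the helper name is made up, the mnemonics and immediates come from the patch:

#include <cassert>
#include <cstdint>
#include <string>

// Hypothetical helper: which mnemonic a given hint-space immediate selects.
static std::string hintMnemonic(uint8_t Imm) {
  switch (Imm) {
  case 0x0D: return "pacbti"; // 0b00001101 == 13
  case 0x0F: return "bti";    // 0b00001111 == 15
  case 0x1D: return "pac";    // 0b00011101 == 29
  case 0x2D: return "aut";    // 0b00101101 == 45
  default:   return "hint";   // everything else stays a generic hint
  }
}

int main() {
  assert(hintMnemonic(13) == "pacbti");
  assert(hintMnemonic(15) == "bti");
  assert(hintMnemonic(29) == "pac");
  assert(hintMnemonic(45) == "aut");
  assert(hintMnemonic(20) == "hint"); // e.g. csdb keeps the plain hint form here
  return 0;
}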
- for (unsigned i = 2, e = MI->getNumOperands(); i != e; ++i) - if (MI->getOperand(i).getReg() == Base) + for (const MachineOperand &MO : llvm::drop_begin(MI->operands(), 2)) + if (MO.getReg() == Base) return false; int Bytes = getLSMultipleTransferSize(MI); @@ -1326,8 +1326,8 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) { return false; bool HighRegsUsed = false; - for (unsigned i = 2, e = MI->getNumOperands(); i != e; ++i) - if (MI->getOperand(i).getReg() >= ARM::R8) { + for (const MachineOperand &MO : llvm::drop_begin(MI->operands(), 2)) + if (MO.getReg() >= ARM::R8) { HighRegsUsed = true; break; } @@ -1350,8 +1350,8 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) { .addImm(Pred).addReg(PredReg); // Transfer the rest of operands. - for (unsigned OpNum = 3, e = MI->getNumOperands(); OpNum != e; ++OpNum) - MIB.add(MI->getOperand(OpNum)); + for (const MachineOperand &MO : llvm::drop_begin(MI->operands(), 3)) + MIB.add(MO); // Transfer memoperands. MIB.setMemRefs(MI->memoperands()); @@ -2119,9 +2119,7 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { isThumb1 = AFI->isThumbFunction() && !isThumb2; bool Modified = false; - for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; - ++MFI) { - MachineBasicBlock &MBB = *MFI; + for (MachineBasicBlock &MBB : Fn) { Modified |= LoadStoreMultipleOpti(MBB); if (STI->hasV5TOps()) Modified |= MergeReturnIntoLDM(MBB); @@ -2710,13 +2708,13 @@ static bool isLegalOrConvertableAddressImm(unsigned Opcode, int Imm, if (isLegalAddressImm(Opcode, Imm, TII)) return true; - // We can convert AddrModeT2_i12 to AddrModeT2_i8. + // We can convert AddrModeT2_i12 to AddrModeT2_i8neg. const MCInstrDesc &Desc = TII->get(Opcode); unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); switch (AddrMode) { case ARMII::AddrModeT2_i12: CodesizeEstimate += 1; - return std::abs(Imm) < (((1 << 8) * 1) - 1); + return Imm < 0 && -Imm < ((1 << 8) * 1); } return false; } diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp index 507c3e69b3a4..308d5e7889f2 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp @@ -13,8 +13,63 @@ using namespace llvm; void ARMFunctionInfo::anchor() {} +static bool GetBranchTargetEnforcement(MachineFunction &MF) { + const auto &Subtarget = MF.getSubtarget<ARMSubtarget>(); + if (!Subtarget.isMClass() || !Subtarget.hasV7Ops()) + return false; + + const Function &F = MF.getFunction(); + if (!F.hasFnAttribute("branch-target-enforcement")) { + if (const auto *BTE = mdconst::extract_or_null<ConstantInt>( + F.getParent()->getModuleFlag("branch-target-enforcement"))) + return BTE->getZExtValue(); + return false; + } + + const StringRef BTIEnable = + F.getFnAttribute("branch-target-enforcement").getValueAsString(); + assert(BTIEnable.equals_insensitive("true") || + BTIEnable.equals_insensitive("false")); + return BTIEnable.equals_insensitive("true"); +} + +// The pair returns values for the ARMFunctionInfo members +// SignReturnAddress and SignReturnAddressAll respectively. 
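A rough standalone sketch of that mapping, limited to the attribute values the assert in the helper below accepts ("none", "non-leaf", "all"); the function name is illustrative and the module-flag fallback is omitted:

#include <cassert>
#include <string>
#include <utility>

// first -> SignReturnAddress, second -> SignReturnAddressAll
static std::pair<bool, bool> signReturnAddressScope(const std::string &Scope) {
  if (Scope == "none")
    return {false, false}; // never sign the return address
  if (Scope == "all")
    return {true, true};   // sign in every function, whether or not LR is saved
  assert(Scope == "non-leaf");
  return {true, false};    // sign only where LR ends up spilled
}

int main() {
  assert(signReturnAddressScope("none") == std::make_pair(false, false));
  assert(signReturnAddressScope("non-leaf") == std::make_pair(true, false));
  assert(signReturnAddressScope("all") == std::make_pair(true, true));
  return 0;
}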
+static std::pair<bool, bool> GetSignReturnAddress(const Function &F) { + if (!F.hasFnAttribute("sign-return-address")) { + const Module &M = *F.getParent(); + if (const auto *Sign = mdconst::extract_or_null<ConstantInt>( + M.getModuleFlag("sign-return-address"))) { + if (Sign->getZExtValue()) { + if (const auto *All = mdconst::extract_or_null<ConstantInt>( + M.getModuleFlag("sign-return-address-all"))) + return {true, All->getZExtValue()}; + return {true, false}; + } + } + return {false, false}; + } + + StringRef Scope = F.getFnAttribute("sign-return-address").getValueAsString(); + if (Scope.equals("none")) + return {false, false}; + + if (Scope.equals("all")) + return {true, true}; + + assert(Scope.equals("non-leaf")); + return {true, false}; +} + ARMFunctionInfo::ARMFunctionInfo(MachineFunction &MF) : isThumb(MF.getSubtarget<ARMSubtarget>().isThumb()), hasThumb2(MF.getSubtarget<ARMSubtarget>().hasThumb2()), IsCmseNSEntry(MF.getFunction().hasFnAttribute("cmse_nonsecure_entry")), - IsCmseNSCall(MF.getFunction().hasFnAttribute("cmse_nonsecure_call")) {} + IsCmseNSCall(MF.getFunction().hasFnAttribute("cmse_nonsecure_call")), + BranchTargetEnforcement(GetBranchTargetEnforcement(MF)) { + + const auto &Subtarget = MF.getSubtarget<ARMSubtarget>(); + if (Subtarget.isMClass() && Subtarget.hasV7Ops()) + std::tie(SignReturnAddress, SignReturnAddressAll) = + GetSignReturnAddress(MF.getFunction()); +} diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h index 851655284060..4077fc058217 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -142,6 +142,17 @@ class ARMFunctionInfo : public MachineFunctionInfo { /// con/destructors). bool PreservesR0 = false; + /// True if the function should sign its return address. + bool SignReturnAddress = false; + + /// True if the function should sign its return address, even if LR is not + /// saved. + bool SignReturnAddressAll = false; + + /// True if BTI instructions should be placed at potential indirect jump + /// destinations.
+ bool BranchTargetEnforcement = false; + public: ARMFunctionInfo() = default; @@ -268,6 +279,20 @@ public: void setPreservesR0() { PreservesR0 = true; } bool getPreservesR0() const { return PreservesR0; } + + bool shouldSignReturnAddress() const { + return shouldSignReturnAddress(LRSpilled); + } + + bool shouldSignReturnAddress(bool SpillsLR) const { + if (!SignReturnAddress) + return false; + if (SignReturnAddressAll) + return true; + return SpillsLR; + } + + bool branchTargetEnforcement() const { return BranchTargetEnforcement; } }; } // end namespace llvm diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMPredicates.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMPredicates.td index 2dc097566d14..c0dc6a363471 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMPredicates.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMPredicates.td @@ -107,6 +107,8 @@ def HasRAS : Predicate<"Subtarget->hasRAS()">, AssemblerPredicate<(all_of FeatureRAS), "ras">; def HasLOB : Predicate<"Subtarget->hasLOB()">, AssemblerPredicate<(all_of FeatureLOB), "lob">; +def HasPACBTI : Predicate<"Subtarget->hasPACBTI()">, + AssemblerPredicate<(all_of FeaturePACBTI), "pacbti">; def HasFP16 : Predicate<"Subtarget->hasFP16()">, AssemblerPredicate<(all_of FeatureFP16),"half-float conversions">; def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterInfo.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterInfo.td index 9752b3166b45..760a5a5a20cf 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterInfo.td @@ -277,6 +277,16 @@ def GPRwithAPSR : RegisterClass<"ARM", [i32], 32, (add (sub GPR, PC), APSR_NZCV) let DiagnosticString = "operand must be a register in range [r0, r14] or apsr_nzcv"; } +// GPRs without the SP register. Used for BXAUT and AUTG +def GPRnosp : RegisterClass<"ARM", [i32], 32, (add (sequence "R%u", 0, 12), LR, PC)> { + let AltOrders = [(add LR, GPRnosp), (trunc GPRnosp, 8), + (add (trunc GPRnosp, 8), R12, LR, (shl GPRnosp, 8))]; + let AltOrderSelect = [{ + return MF.getSubtarget<ARMSubtarget>().getGPRAllocationOrder(MF); + }]; + let DiagnosticString = "operand must be a register in range [r0, r12] or LR or PC"; +} + // GPRs without the PC and SP registers but with APSR. Used by CLRM instruction. def GPRwithAPSRnosp : RegisterClass<"ARM", [i32], 32, (add (sequence "R%u", 0, 12), LR, APSR)> { let isAllocatable = 0; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.h index 5e1217b6a468..d51a888c951f 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.h @@ -373,6 +373,8 @@ protected: /// HasLOB - if true, the processor supports the Low Overhead Branch extension bool HasLOB = false; + bool HasPACBTI = false; + /// If true, the instructions "vmov.i32 d0, #0" and "vmov.i32 q0, #0" are /// particularly effective at zeroing a VFP register.
bool HasZeroCycleZeroing = false; @@ -671,6 +673,7 @@ public: bool hasCRC() const { return HasCRC; } bool hasRAS() const { return HasRAS; } bool hasLOB() const { return HasLOB; } + bool hasPACBTI() const { return HasPACBTI; } bool hasVirtualization() const { return HasVirtualization; } bool useNEONForSinglePrecisionFP() const { diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSystemRegister.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSystemRegister.td index f21c7f0246f9..c03db15d1041 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSystemRegister.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSystemRegister.td @@ -106,6 +106,24 @@ def : MClassSysReg<0, 0, 1, 0x894, "control_ns">; def : MClassSysReg<0, 0, 1, 0x898, "sp_ns">; } +let Requires = [{ {ARM::FeaturePACBTI} }] in { +def : MClassSysReg<0, 0, 1, 0x820, "pac_key_p_0">; +def : MClassSysReg<0, 0, 1, 0x821, "pac_key_p_1">; +def : MClassSysReg<0, 0, 1, 0x822, "pac_key_p_2">; +def : MClassSysReg<0, 0, 1, 0x823, "pac_key_p_3">; +def : MClassSysReg<0, 0, 1, 0x824, "pac_key_u_0">; +def : MClassSysReg<0, 0, 1, 0x825, "pac_key_u_1">; +def : MClassSysReg<0, 0, 1, 0x826, "pac_key_u_2">; +def : MClassSysReg<0, 0, 1, 0x827, "pac_key_u_3">; +def : MClassSysReg<0, 0, 1, 0x8a0, "pac_key_p_0_ns">; +def : MClassSysReg<0, 0, 1, 0x8a1, "pac_key_p_1_ns">; +def : MClassSysReg<0, 0, 1, 0x8a2, "pac_key_p_2_ns">; +def : MClassSysReg<0, 0, 1, 0x8a3, "pac_key_p_3_ns">; +def : MClassSysReg<0, 0, 1, 0x8a4, "pac_key_u_0_ns">; +def : MClassSysReg<0, 0, 1, 0x8a5, "pac_key_u_1_ns">; +def : MClassSysReg<0, 0, 1, 0x8a6, "pac_key_u_2_ns">; +def : MClassSysReg<0, 0, 1, 0x8a7, "pac_key_u_3_ns">; +} // Banked Registers // diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.cpp index 833c7effd31c..0b314ac2a41e 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -92,6 +92,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTarget() { initializeARMLoadStoreOptPass(Registry); initializeARMPreAllocLoadStoreOptPass(Registry); initializeARMParallelDSPPass(Registry); + initializeARMBranchTargetsPass(Registry); initializeARMConstantIslandsPass(Registry); initializeARMExecutionDomainFixPass(Registry); initializeARMExpandPseudoPass(Registry); @@ -571,6 +572,7 @@ void ARMPassConfig::addPreEmitPass() { } void ARMPassConfig::addPreEmitPass2() { + addPass(createARMBranchTargetsPass()); addPass(createARMConstantIslandPass()); addPass(createARMLowOverheadLoopsPass()); diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 88de84a4fd78..602c6745d310 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -334,8 +334,9 @@ InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, } // Checks whether Inst is part of a min(max()) or max(min()) pattern -// that will match to an SSAT instruction -static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) { +// that will match to an SSAT instruction. Returns the instruction being +// saturated, or null if no saturation pattern was found. 
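The FP saturation check introduced below keys on the immediate APInt::getHighBitsSet(64, 33), which is just the i32 signed minimum sign-extended to 64 bits, and on the min/max clamp shape that a saturating fp-to-int conversion replaces. A quick standalone sanity check of both, in plain C++ with no LLVM types:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <limits>

int main() {
  // High 33 bits of a 64-bit word set == INT32_MIN widened to 64 bits.
  const uint64_t High33 = ~uint64_t(0) << 31;
  assert(High33 == 0xFFFFFFFF80000000ULL);
  assert(static_cast<int64_t>(High33) ==
         static_cast<int64_t>(std::numeric_limits<int32_t>::min()));

  // The min(max()) clamp that a saturating fp-to-i32 conversion folds away.
  const int64_t Lo = std::numeric_limits<int32_t>::min();
  const int64_t Hi = std::numeric_limits<int32_t>::max();
  auto ClampToI32 = [&](int64_t V) { return std::min(std::max(V, Lo), Hi); };
  assert(ClampToI32(int64_t(1) << 40) == Hi);
  assert(ClampToI32(-(int64_t(1) << 40)) == Lo);
  assert(ClampToI32(42) == 42);
  return 0;
}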
+static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) { Value *LHS, *RHS; ConstantInt *C; SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor; @@ -358,12 +359,27 @@ static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) { return false; }; - if (isSSatMin(Inst->getOperand(1)) || - (Inst->hasNUses(2) && (isSSatMin(*Inst->user_begin()) || - isSSatMin(*(++Inst->user_begin()))))) - return true; + if (isSSatMin(Inst->getOperand(1))) + return cast<Instruction>(Inst->getOperand(1))->getOperand(1); + if (Inst->hasNUses(2) && + (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin())))) + return Inst->getOperand(1); } - return false; + return nullptr; +} + +// Look for a FP Saturation pattern, where the instruction can be simplified to +// a fptosi.sat. max(min(fptosi)). The constant in this case is always free. +static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) { + if (Imm.getBitWidth() != 64 || + Imm != APInt::getHighBitsSet(64, 33)) // -2147483648 + return false; + Value *FP = isSSATMinMaxPattern(Inst, Imm); + if (!FP && isa<ICmpInst>(Inst) && Inst->hasOneUse()) + FP = isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm); + if (!FP) + return false; + return isa<FPToSIInst>(FP); } InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, @@ -423,6 +439,9 @@ InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, return 0; } + if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm)) + return 0; + // We can convert <= -1 to < 0, which is generally quite cheap. if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnesValue()) { ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate(); diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 64d2e1bfa9b2..39f407ba7149 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -6429,15 +6429,17 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic, Mnemonic == "vrintp" || Mnemonic == "vrintm" || Mnemonic == "hvc" || Mnemonic.startswith("vsel") || Mnemonic == "vins" || Mnemonic == "vmovx" || Mnemonic == "bxns" || Mnemonic == "blxns" || - Mnemonic == "vdot" || Mnemonic == "vmmla" || + Mnemonic == "vdot" || Mnemonic == "vmmla" || Mnemonic == "vudot" || Mnemonic == "vsdot" || Mnemonic == "vcmla" || Mnemonic == "vcadd" || Mnemonic == "vfmal" || Mnemonic == "vfmsl" || - Mnemonic == "wls" || Mnemonic == "le" || Mnemonic == "dls" || - Mnemonic == "csel" || Mnemonic == "csinc" || + Mnemonic == "wls" || Mnemonic == "le" || Mnemonic == "dls" || + Mnemonic == "csel" || Mnemonic == "csinc" || Mnemonic == "csinv" || Mnemonic == "csneg" || Mnemonic == "cinc" || - Mnemonic == "cinv" || Mnemonic == "cneg" || Mnemonic == "cset" || - Mnemonic == "csetm") + Mnemonic == "cinv" || Mnemonic == "cneg" || Mnemonic == "cset" || + Mnemonic == "csetm" || + Mnemonic == "aut" || Mnemonic == "pac" || Mnemonic == "pacbti" || + Mnemonic == "bti") return Mnemonic; // First, split out any predication code. 
Ignore mnemonics we know aren't @@ -6581,9 +6583,11 @@ void ARMAsmParser::getMnemonicAcceptInfo(StringRef Mnemonic, Mnemonic == "csinc" || Mnemonic == "csinv" || Mnemonic == "csneg" || Mnemonic == "cinc" || Mnemonic == "cinv" || Mnemonic == "cneg" || Mnemonic == "cset" || Mnemonic == "csetm" || - Mnemonic.startswith("vpt") || Mnemonic.startswith("vpst") || (hasCDE() && MS.isCDEInstr(Mnemonic) && !MS.isITPredicableCDEInstr(Mnemonic)) || + Mnemonic.startswith("vpt") || Mnemonic.startswith("vpst") || + Mnemonic == "pac" || Mnemonic == "pacbti" || Mnemonic == "aut" || + Mnemonic == "bti" || (hasMVE() && (Mnemonic.startswith("vst2") || Mnemonic.startswith("vld2") || Mnemonic.startswith("vst4") || Mnemonic.startswith("vld4") || @@ -12272,6 +12276,7 @@ bool ARMAsmParser::enableArchExtFeature(StringRef Name, SMLoc &ExtLoc) { {ARM::FeatureFPARMv8, ARM::FeatureFullFP16}}, {ARM::AEK_RAS, {Feature_HasV8Bit}, {ARM::FeatureRAS}}, {ARM::AEK_LOB, {Feature_HasV8_1MMainlineBit}, {ARM::FeatureLOB}}, + {ARM::AEK_PACBTI, {Feature_HasV8_1MMainlineBit}, {ARM::FeaturePACBTI}}, // FIXME: Unsupported extensions. {ARM::AEK_OS, {}, {}}, {ARM::AEK_IWMMXT, {}, {}}, diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index 9caef9f09ea9..c3df7dc88d79 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -185,8 +185,11 @@ static DecodeStatus DecodetGPREvenRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeGPRwithAPSR_NZCVnospRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeGPRnopcRegisterClass(MCInst &Inst, - unsigned RegNo, uint64_t Address, +static DecodeStatus DecodeGPRnopcRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeGPRnospRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder); static DecodeStatus DecodeGPRwithAPSRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, @@ -287,6 +290,9 @@ static DecodeStatus DecodeSETPANInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); static DecodeStatus DecodeT2CPSInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2HintSpaceInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder); static DecodeStatus DecodeAddrModeImm12Operand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); static DecodeStatus DecodeAddrMode5Operand(MCInst &Inst, unsigned Val, @@ -1172,6 +1178,19 @@ DecodeGPRnopcRegisterClass(MCInst &Inst, unsigned RegNo, return S; } +static DecodeStatus DecodeGPRnospRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + if (RegNo == 13) + S = MCDisassembler::SoftFail; + + Check(S, DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder)); + + return S; +} + static DecodeStatus DecodeGPRwithAPSRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { @@ -2441,6 +2460,31 @@ static DecodeStatus DecodeT2CPSInstruction(MCInst &Inst, unsigned Insn, return S; } +static DecodeStatus DecodeT2HintSpaceInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder) { + unsigned imm = fieldFromInstruction(Insn, 0, 8); + + 
unsigned Opcode = ARM::t2HINT; + + if (imm == 0x0D) { + Opcode = ARM::t2PACBTI; + } else if (imm == 0x1D) { + Opcode = ARM::t2PAC; + } else if (imm == 0x2D) { + Opcode = ARM::t2AUT; + } else if (imm == 0x0F) { + Opcode = ARM::t2BTI; + } + + Inst.setOpcode(Opcode); + if (Opcode == ARM::t2HINT) { + Inst.addOperand(MCOperand::createImm(imm)); + } + + return MCDisassembler::Success; +} + static DecodeStatus DecodeT2MOVTWInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -4726,6 +4770,25 @@ static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Val, if (!(FeatureBits[ARM::Feature8MSecExt])) return MCDisassembler::Fail; break; + case 0x20: // pac_key_p_0 + case 0x21: // pac_key_p_1 + case 0x22: // pac_key_p_2 + case 0x23: // pac_key_p_3 + case 0x24: // pac_key_u_0 + case 0x25: // pac_key_u_1 + case 0x26: // pac_key_u_2 + case 0x27: // pac_key_u_3 + case 0xa0: // pac_key_p_0_ns + case 0xa1: // pac_key_p_1_ns + case 0xa2: // pac_key_p_2_ns + case 0xa3: // pac_key_p_3_ns + case 0xa4: // pac_key_u_0_ns + case 0xa5: // pac_key_u_1_ns + case 0xa6: // pac_key_u_2_ns + case 0xa7: // pac_key_u_3_ns + if (!(FeatureBits[ARM::FeaturePACBTI])) + return MCDisassembler::Fail; + break; default: // Architecturally defined as unpredictable S = MCDisassembler::SoftFail; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h index 43f7575df6db..f8de0320166a 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h @@ -195,16 +195,18 @@ namespace ARMII { AddrModeT1_4 = 9, AddrModeT1_s = 10, // i8 * 4 for pc and sp relative data AddrModeT2_i12 = 11, - AddrModeT2_i8 = 12, - AddrModeT2_so = 13, - AddrModeT2_pc = 14, // +/- i12 for pc relative data - AddrModeT2_i8s4 = 15, // i8 * 4 - AddrMode_i12 = 16, - AddrMode5FP16 = 17, // i8 * 2 - AddrModeT2_ldrex = 18, // i8 * 4, with unscaled offset in MCInst - AddrModeT2_i7s4 = 19, // i7 * 4 - AddrModeT2_i7s2 = 20, // i7 * 2 - AddrModeT2_i7 = 21, // i7 * 1 + AddrModeT2_i8 = 12, // +/- i8 + AddrModeT2_i8pos = 13, // + i8 + AddrModeT2_i8neg = 14, // - i8 + AddrModeT2_so = 15, + AddrModeT2_pc = 16, // +/- i12 for pc relative data + AddrModeT2_i8s4 = 17, // i8 * 4 + AddrMode_i12 = 18, + AddrMode5FP16 = 19, // i8 * 2 + AddrModeT2_ldrex = 20, // i8 * 4, with unscaled offset in MCInst + AddrModeT2_i7s4 = 21, // i7 * 4 + AddrModeT2_i7s2 = 22, // i7 * 2 + AddrModeT2_i7 = 23, // i7 * 1 }; inline static const char *AddrModeToString(AddrMode addrmode) { @@ -223,6 +225,8 @@ namespace ARMII { case AddrModeT1_s: return "AddrModeT1_s"; case AddrModeT2_i12: return "AddrModeT2_i12"; case AddrModeT2_i8: return "AddrModeT2_i8"; + case AddrModeT2_i8pos: return "AddrModeT2_i8pos"; + case AddrModeT2_i8neg: return "AddrModeT2_i8neg"; case AddrModeT2_so: return "AddrModeT2_so"; case AddrModeT2_pc: return "AddrModeT2_pc"; case AddrModeT2_i8s4: return "AddrModeT2_i8s4"; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp index 3e4c97630af6..02a2d01176fc 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp @@ -299,4 +299,9 @@ void ARMTargetStreamer::emitTargetAttributes(const MCSubtargetInfo &STI) { else if 
(STI.hasFeature(ARM::FeatureVirtualization)) emitAttribute(ARMBuildAttrs::Virtualization_use, ARMBuildAttrs::AllowVirtualization); + + if (STI.hasFeature(ARM::FeaturePACBTI)) { + emitAttribute(ARMBuildAttrs::PAC_extension, ARMBuildAttrs::AllowPAC); + emitAttribute(ARMBuildAttrs::BTI_extension, ARMBuildAttrs::AllowBTI); + } } diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp index e4e95f63f0a6..224c61b9f065 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -205,9 +205,9 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, return; } - for (unsigned i = 0, e = CSI.size(); i != e; ++i) { - unsigned Reg = CSI[i].getReg(); - int FI = CSI[i].getFrameIdx(); + for (const CalleeSavedInfo &I : CSI) { + unsigned Reg = I.getReg(); + int FI = I.getFrameIdx(); switch (Reg) { case ARM::R8: case ARM::R9: @@ -266,10 +266,9 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); } - for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(), - E = CSI.end(); I != E; ++I) { - unsigned Reg = I->getReg(); - int FI = I->getFrameIdx(); + for (const CalleeSavedInfo &I : CSI) { + unsigned Reg = I.getReg(); + int FI = I.getFrameIdx(); switch (Reg) { case ARM::R8: case ARM::R9: diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp index bdb167a08e61..ebd139af2219 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -261,7 +261,7 @@ void Thumb2InstrInfo::expandLoadStackGuard( cast<GlobalValue>((*MI->memoperands_begin())->getValue()); if (MF.getSubtarget<ARMSubtarget>().isGVInGOT(GV)) - expandLoadStackGuardBase(MI, ARM::tLDRLIT_ga_pcrel, ARM::t2LDRi12); + expandLoadStackGuardBase(MI, ARM::t2LDRLIT_ga_pcrel, ARM::t2LDRi12); else if (MF.getTarget().isPositionIndependent()) expandLoadStackGuardBase(MI, ARM::t2MOV_ga_pcrel, ARM::t2LDRi12); else @@ -634,7 +634,8 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, unsigned NumBits = 0; unsigned Scale = 1; - if (AddrMode == ARMII::AddrModeT2_i8 || AddrMode == ARMII::AddrModeT2_i12) { + if (AddrMode == ARMII::AddrModeT2_i8neg || + AddrMode == ARMII::AddrModeT2_i12) { // i8 supports only negative, and i12 supports only positive, so // based on Offset sign convert Opcode to the appropriate // instruction diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp index 132516694f4e..1164b6ebbac3 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -502,8 +502,8 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, // For the non-writeback version (this one), the base register must be // one of the registers being loaded. bool isOK = false; - for (unsigned i = 3; i < MI->getNumOperands(); ++i) { - if (MI->getOperand(i).getReg() == BaseReg) { + for (const MachineOperand &MO : llvm::drop_begin(MI->operands(), 3)) { + if (MO.getReg() == BaseReg) { isOK = true; break; } @@ -527,8 +527,8 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, // numbered register (i.e. 
it's in operand 4 onwards) then with writeback // the stored value is unknown, so we can't convert to tSTMIA_UPD. Register BaseReg = MI->getOperand(0).getReg(); - for (unsigned i = 4; i < MI->getNumOperands(); ++i) - if (MI->getOperand(i).getReg() == BaseReg) + for (const MachineOperand &MO : llvm::drop_begin(MI->operands(), 4)) + if (MO.getReg() == BaseReg) return false; break; @@ -611,8 +611,8 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, } // Transfer the rest of operands. - for (unsigned e = MI->getNumOperands(); OpNum != e; ++OpNum) - MIB.add(MI->getOperand(OpNum)); + for (const MachineOperand &MO : llvm::drop_begin(MI->operands(), OpNum)) + MIB.add(MO); // Transfer memoperands. MIB.setMemRefs(MI->memoperands()); diff --git a/contrib/llvm-project/llvm/lib/Target/BPF/BPFMCInstLower.cpp b/contrib/llvm-project/llvm/lib/Target/BPF/BPFMCInstLower.cpp index 846798a63cb7..2ce9c386f24c 100644 --- a/contrib/llvm-project/llvm/lib/Target/BPF/BPFMCInstLower.cpp +++ b/contrib/llvm-project/llvm/lib/Target/BPF/BPFMCInstLower.cpp @@ -47,9 +47,7 @@ MCOperand BPFMCInstLower::LowerSymbolOperand(const MachineOperand &MO, void BPFMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.setOpcode(MI->getOpcode()); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); - + for (const MachineOperand &MO : MI->operands()) { MCOperand MCOp; switch (MO.getType()) { default: diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/BitTracker.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/BitTracker.cpp index 8bced3cec082..685bafd785df 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/BitTracker.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/BitTracker.cpp @@ -214,9 +214,9 @@ bool BT::RegisterCell::meet(const RegisterCell &RC, Register SelfR) { BT::RegisterCell &BT::RegisterCell::insert(const BT::RegisterCell &RC, const BitMask &M) { uint16_t B = M.first(), E = M.last(), W = width(); - // Sanity: M must be a valid mask for *this. + // M must be a valid mask for *this. assert(B < W && E < W); - // Sanity: the masked part of *this must have the same number of bits + // The masked part of *this must have the same number of bits // as the source. assert(B > E || E-B+1 == RC.width()); // B <= E => E-B+1 = |RC|. assert(B <= E || E+(W-B)+1 == RC.width()); // E < B => E+(W-B)+1 = |RC|. @@ -850,8 +850,7 @@ void BT::visitNonBranch(const MachineInstr &MI) { bool Eval = ME.evaluate(MI, Map, ResMap); if (Trace && Eval) { - for (unsigned i = 0, n = MI.getNumOperands(); i < n; ++i) { - const MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg() || !MO.isUse()) continue; RegisterRef RU(MO); diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp index 0f6dedeb28c3..1938a5c259da 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp @@ -189,7 +189,7 @@ bool HexagonEvaluator::evaluate(const MachineInstr &MI, unsigned NumDefs = 0; - // Sanity verification: there should not be any defs with subregisters. + // Basic correctness check: there should not be any defs with subregisters. 
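Several hunks above (ARMLoadStoreOptimizer, Thumb2SizeReduction) replace index-based operand loops with range-based for over llvm::drop_begin(MI->operands(), N). A standalone illustration of what such a loop visits, using plain iterators over a std::vector as a stand-in for the LLVM helper:

#include <cassert>
#include <cstddef>
#include <iterator>
#include <vector>

int main() {
  std::vector<int> Ops = {10, 11, 12, 13, 14};

  // Index-based loop, as before the change: start at operand 2.
  int SumIndexed = 0;
  for (std::size_t I = 2; I < Ops.size(); ++I)
    SumIndexed += Ops[I];

  // What drop_begin(Ops, 2) hands to the range-based for: [begin()+2, end()).
  int SumRange = 0;
  for (auto It = std::next(Ops.begin(), 2); It != Ops.end(); ++It)
    SumRange += *It;

  assert(SumIndexed == SumRange && SumRange == 12 + 13 + 14);
  return 0;
}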
for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg() || !MO.isDef()) continue; diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp index 43f0758f6598..8c3b9572201e 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp @@ -476,10 +476,10 @@ namespace { } // end anonymous namespace static const NodeSet *node_class(GepNode *N, NodeSymRel &Rel) { - for (NodeSymRel::iterator I = Rel.begin(), E = Rel.end(); I != E; ++I) - if (I->count(N)) - return &*I; - return nullptr; + for (const NodeSet &S : Rel) + if (S.count(N)) + return &S; + return nullptr; } // Create an ordered pair of GepNode pointers. The pair will be used in @@ -589,9 +589,8 @@ void HexagonCommonGEP::common() { dbgs() << "{ " << I->first << ", " << I->second << " }\n"; dbgs() << "Gep equivalence classes:\n"; - for (NodeSymRel::iterator I = EqRel.begin(), E = EqRel.end(); I != E; ++I) { + for (const NodeSet &S : EqRel) { dbgs() << '{'; - const NodeSet &S = *I; for (NodeSet::const_iterator J = S.begin(), F = S.end(); J != F; ++J) { if (J != S.begin()) dbgs() << ','; @@ -604,8 +603,7 @@ void HexagonCommonGEP::common() { // Create a projection from a NodeSet to the minimal element in it. using ProjMap = std::map<const NodeSet *, GepNode *>; ProjMap PM; - for (NodeSymRel::iterator I = EqRel.begin(), E = EqRel.end(); I != E; ++I) { - const NodeSet &S = *I; + for (const NodeSet &S : EqRel) { GepNode *Min = *std::min_element(S.begin(), S.end(), NodeOrder); std::pair<ProjMap::iterator,bool> Ins = PM.insert(std::make_pair(&S, Min)); (void)Ins; @@ -1280,8 +1278,8 @@ bool HexagonCommonGEP::runOnFunction(Function &F) { return false; // For now bail out on C++ exception handling. - for (Function::iterator A = F.begin(), Z = F.end(); A != Z; ++A) - for (BasicBlock::iterator I = A->begin(), E = A->end(); I != E; ++I) + for (const BasicBlock &BB : F) + for (const Instruction &I : BB) if (isa<InvokeInst>(I) || isa<LandingPadInst>(I)) return false; diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp index a774baaa48e6..d3fcdb6ae9a8 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp @@ -1254,7 +1254,7 @@ void HCE::collect(MachineFunction &MF) { void HCE::assignInits(const ExtRoot &ER, unsigned Begin, unsigned End, AssignmentMap &IMap) { - // Sanity check: make sure that all extenders in the range [Begin..End) + // Basic correctness: make sure that all extenders in the range [Begin..End) // share the same root ER. 
for (unsigned I = Begin; I != End; ++I) assert(ER == ExtRoot(Extenders[I].getOp())); diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp index 23d0cc829e52..03b0f75b2dc1 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp @@ -237,12 +237,9 @@ static bool isEvenReg(unsigned Reg) { } static void removeKillInfo(MachineInstr &MI, unsigned RegNotKilled) { - for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { - MachineOperand &Op = MI.getOperand(I); - if (!Op.isReg() || Op.getReg() != RegNotKilled || !Op.isKill()) - continue; - Op.setIsKill(false); - } + for (MachineOperand &Op : MI.operands()) + if (Op.isReg() && Op.getReg() == RegNotKilled && Op.isKill()) + Op.setIsKill(false); } /// Returns true if it is unsafe to move a copy instruction from \p UseReg to @@ -403,10 +400,7 @@ HexagonCopyToCombine::findPotentialNewifiableTFRs(MachineBasicBlock &BB) { // Mark TFRs that feed a potential new value store as such. if (TII->mayBeNewStore(MI)) { // Look for uses of TFR instructions. - for (unsigned OpdIdx = 0, OpdE = MI.getNumOperands(); OpdIdx != OpdE; - ++OpdIdx) { - MachineOperand &Op = MI.getOperand(OpdIdx); - + for (const MachineOperand &Op : MI.operands()) { // Skip over anything except register uses. if (!Op.isReg() || !Op.isUse() || !Op.getReg()) continue; @@ -484,14 +478,13 @@ bool HexagonCopyToCombine::runOnMachineFunction(MachineFunction &MF) { IsConst64Disabled = true; // Traverse basic blocks. - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; - ++BI) { + for (MachineBasicBlock &MBB : MF) { PotentiallyNewifiableTFR.clear(); - findPotentialNewifiableTFRs(*BI); + findPotentialNewifiableTFRs(MBB); // Traverse instructions in basic block. - for(MachineBasicBlock::iterator MI = BI->begin(), End = BI->end(); - MI != End;) { + for (MachineBasicBlock::iterator MI = MBB.begin(), End = MBB.end(); + MI != End;) { MachineInstr &I1 = *MI++; if (I1.isDebugInstr()) diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp index bff596e69efd..12ceac545e9d 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -1404,18 +1404,18 @@ bool HexagonFrameLowering::insertCSRSpillsInBlock(MachineBasicBlock &MBB, // Add callee-saved registers as use. addCalleeSaveRegistersAsImpOperand(SaveRegsCall, CSI, false, true); // Add live in registers. - for (unsigned I = 0; I < CSI.size(); ++I) - MBB.addLiveIn(CSI[I].getReg()); + for (const CalleeSavedInfo &I : CSI) + MBB.addLiveIn(I.getReg()); return true; } - for (unsigned i = 0, n = CSI.size(); i < n; ++i) { - unsigned Reg = CSI[i].getReg(); + for (const CalleeSavedInfo &I : CSI) { + unsigned Reg = I.getReg(); // Add live in registers. We treat eh_return callee saved register r0 - r3 // specially. They are not really callee saved registers as they are not // supposed to be killed. 
bool IsKill = !HRI.isEHReturnCalleeSaveReg(Reg); - int FI = CSI[i].getFrameIdx(); + int FI = I.getFrameIdx(); const TargetRegisterClass *RC = HRI.getMinimalPhysRegClass(Reg); HII.storeRegToStackSlot(MBB, MI, Reg, IsKill, FI, RC, &HRI); if (IsKill) @@ -1478,10 +1478,10 @@ bool HexagonFrameLowering::insertCSRRestoresInBlock(MachineBasicBlock &MBB, return true; } - for (unsigned i = 0; i < CSI.size(); ++i) { - unsigned Reg = CSI[i].getReg(); + for (const CalleeSavedInfo &I : CSI) { + unsigned Reg = I.getReg(); const TargetRegisterClass *RC = HRI.getMinimalPhysRegClass(Reg); - int FI = CSI[i].getFrameIdx(); + int FI = I.getFrameIdx(); HII.loadRegFromStackSlot(MBB, MI, Reg, FI, RC, &HRI); } @@ -1619,8 +1619,8 @@ bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF, // (1) For each callee-saved register, add that register and all of its // sub-registers to SRegs. LLVM_DEBUG(dbgs() << "Initial CS registers: {"); - for (unsigned i = 0, n = CSI.size(); i < n; ++i) { - unsigned R = CSI[i].getReg(); + for (const CalleeSavedInfo &I : CSI) { + unsigned R = I.getReg(); LLVM_DEBUG(dbgs() << ' ' << printReg(R, TRI)); for (MCSubRegIterator SR(R, TRI, true); SR.isValid(); ++SR) SRegs[*SR] = true; @@ -1720,10 +1720,10 @@ bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF, LLVM_DEBUG({ dbgs() << "CS information: {"; - for (unsigned i = 0, n = CSI.size(); i < n; ++i) { - int FI = CSI[i].getFrameIdx(); + for (const CalleeSavedInfo &I : CSI) { + int FI = I.getFrameIdx(); int Off = MFI.getObjectOffset(FI); - dbgs() << ' ' << printReg(CSI[i].getReg(), TRI) << ":fi#" << FI << ":sp"; + dbgs() << ' ' << printReg(I.getReg(), TRI) << ":fi#" << FI << ":sp"; if (Off >= 0) dbgs() << '+'; dbgs() << Off; @@ -2634,8 +2634,8 @@ bool HexagonFrameLowering::shouldInlineCSR(const MachineFunction &MF, // Check if CSI only has double registers, and if the registers form // a contiguous block starting from D8. 
BitVector Regs(Hexagon::NUM_TARGET_REGS); - for (unsigned i = 0, n = CSI.size(); i < n; ++i) { - unsigned R = CSI[i].getReg(); + for (const CalleeSavedInfo &I : CSI) { + unsigned R = I.getReg(); if (!Hexagon::DoubleRegsRegClass.contains(R)) return true; Regs[R] = true; diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp index 02da2f29591a..46c1fbc6eeb2 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp @@ -597,19 +597,12 @@ void HexagonGenInsert::dump_map() const { void HexagonGenInsert::buildOrderingMF(RegisterOrdering &RO) const { unsigned Index = 0; - using mf_iterator = MachineFunction::const_iterator; - - for (mf_iterator A = MFN->begin(), Z = MFN->end(); A != Z; ++A) { - const MachineBasicBlock &B = *A; + for (const MachineBasicBlock &B : *MFN) { if (!CMS->BT.reached(&B)) continue; - using mb_iterator = MachineBasicBlock::const_iterator; - - for (mb_iterator I = B.begin(), E = B.end(); I != E; ++I) { - const MachineInstr *MI = &*I; - for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineInstr &MI : B) { + for (const MachineOperand &MO : MI.operands()) { if (MO.isReg() && MO.isDef()) { Register R = MO.getReg(); assert(MO.getSubReg() == 0 && "Unexpected subregister in definition"); @@ -725,8 +718,7 @@ bool HexagonGenInsert::findNonSelfReference(unsigned VR) const { void HexagonGenInsert::getInstrDefs(const MachineInstr *MI, RegisterSet &Defs) const { - for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg() || !MO.isDef()) continue; Register R = MO.getReg(); @@ -738,8 +730,7 @@ void HexagonGenInsert::getInstrDefs(const MachineInstr *MI, void HexagonGenInsert::getInstrUses(const MachineInstr *MI, RegisterSet &Uses) const { - for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg() || !MO.isUse()) continue; Register R = MO.getReg(); @@ -942,12 +933,11 @@ void HexagonGenInsert::collectInBlock(MachineBasicBlock *B, // can remove them from the list of available registers once all DT // successors have been processed. RegisterSet BlockDefs, InsDefs; - for (MachineBasicBlock::iterator I = B->begin(), E = B->end(); I != E; ++I) { - MachineInstr *MI = &*I; + for (MachineInstr &MI : *B) { InsDefs.clear(); - getInstrDefs(MI, InsDefs); + getInstrDefs(&MI, InsDefs); // Leave those alone. They are more transparent than "insert". - bool Skip = MI->isCopy() || MI->isRegSequence(); + bool Skip = MI.isCopy() || MI.isRegSequence(); if (!Skip) { // Visit all defined registers, and attempt to find the corresponding @@ -1458,8 +1448,7 @@ bool HexagonGenInsert::removeDeadCode(MachineDomTreeNode *N) { for (auto I = B->rbegin(), E = B->rend(); I != E; ++I) Instrs.push_back(&*I); - for (auto I = Instrs.begin(), E = Instrs.end(); I != E; ++I) { - MachineInstr *MI = *I; + for (MachineInstr *MI : Instrs) { unsigned Opc = MI->getOpcode(); // Do not touch lifetime markers. This is why the target-independent DCE // cannot be used. 
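A note on the pattern in the removeDeadCode hunk just above: the function first snapshots the block's instructions into a side vector and only afterwards walks that snapshot, so the traversal presumably stays independent of the list it is shrinking while dead entries are erased; the range-based-for conversion keeps that two-phase shape intact. The sketch below is a plain-C++ analogue of the idiom with a made-up container and predicate — illustrative only, not LLVM code.

```cpp
#include <iostream>
#include <list>
#include <vector>

// Erase all even values from a list without invalidating the loop that
// finds them: snapshot the matching iterators first, erase afterwards.
// Mirrors the collect-then-erase shape used by removeDeadCode above.
static void eraseEvens(std::list<int> &Values) {
  std::vector<std::list<int>::iterator> ToErase;
  for (auto It = Values.begin(); It != Values.end(); ++It)
    if (*It % 2 == 0)
      ToErase.push_back(It); // collect only; the list is untouched here
  for (auto It : ToErase)
    Values.erase(It);        // safe: each remaining iterator is still valid
}

int main() {
  std::list<int> Values{1, 2, 3, 4, 5, 6};
  eraseEvens(Values);
  for (int V : Values)
    std::cout << V << ' ';   // prints: 1 3 5
  std::cout << '\n';
}
```

The same snapshot-then-erase shape shows up again in HexagonGenPredicate::eliminatePredCopies further down, where a collected set of instructions is erased after the scan completes.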
@@ -1501,7 +1490,7 @@ bool HexagonGenInsert::runOnMachineFunction(MachineFunction &MF) { bool Timing = OptTiming, TimingDetail = Timing && OptTimingDetail; bool Changed = false; - // Sanity check: one, but not both. + // Verify: one, but not both. assert(!OptSelectAll0 || !OptSelectHas0); IFMap.clear(); diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonGenMux.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonGenMux.cpp index cf4f13fb8c0d..55de02816fb8 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonGenMux.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonGenMux.cpp @@ -328,7 +328,7 @@ bool HexagonGenMux::genMuxInBlock(MachineBasicBlock &B) { unsigned MxOpc = getMuxOpcode(*MX.SrcT, *MX.SrcF); if (!MxOpc) continue; - // Basic sanity check: since we are deleting instructions, validate the + // Basic correctness check: since we are deleting instructions, validate the // iterators. There is a possibility that one of Def1 or Def2 is translated // to "mux" and being considered for other "mux" instructions. if (!MX.At->getParent() || !MX.Def1->getParent() || !MX.Def2->getParent()) diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp index d8d2025c5d27..1a66394e9757 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp @@ -205,16 +205,14 @@ bool HexagonGenPredicate::isConvertibleToPredForm(const MachineInstr *MI) { } void HexagonGenPredicate::collectPredicateGPR(MachineFunction &MF) { - for (MachineFunction::iterator A = MF.begin(), Z = MF.end(); A != Z; ++A) { - MachineBasicBlock &B = *A; - for (MachineBasicBlock::iterator I = B.begin(), E = B.end(); I != E; ++I) { - MachineInstr *MI = &*I; - unsigned Opc = MI->getOpcode(); + for (MachineBasicBlock &B : MF) { + for (MachineInstr &MI : B) { + unsigned Opc = MI.getOpcode(); switch (Opc) { case Hexagon::C2_tfrpr: case TargetOpcode::COPY: - if (isPredReg(MI->getOperand(1).getReg())) { - RegisterSubReg RD = MI->getOperand(0); + if (isPredReg(MI.getOperand(1).getReg())) { + RegisterSubReg RD = MI.getOperand(0); if (RD.R.isVirtual()) PredGPRs.insert(RD); } @@ -411,7 +409,7 @@ bool HexagonGenPredicate::convertToPredForm(MachineInstr *MI) { NumOps = 2; } - // Some sanity: check that def is in operand #0. + // Check that def is in operand #0. 
MachineOperand &Op0 = MI->getOperand(0); assert(Op0.isDef()); RegisterSubReg OutR(Op0); @@ -488,8 +486,8 @@ bool HexagonGenPredicate::eliminatePredCopies(MachineFunction &MF) { } } - for (VectOfInst::iterator I = Erase.begin(), E = Erase.end(); I != E; ++I) - (*I)->eraseFromParent(); + for (MachineInstr *MI : Erase) + MI->eraseFromParent(); return Changed; } @@ -515,11 +513,8 @@ bool HexagonGenPredicate::runOnMachineFunction(MachineFunction &MF) { Again = false; VectOfInst Processed, Copy; - using iterator = VectOfInst::iterator; - Copy = PUsers; - for (iterator I = Copy.begin(), E = Copy.end(); I != E; ++I) { - MachineInstr *MI = *I; + for (MachineInstr *MI : Copy) { bool Done = convertToPredForm(MI); if (Done) { Processed.insert(MI); diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp index a4971ad712eb..5d2e1b259449 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp @@ -1014,12 +1014,10 @@ bool HexagonHardwareLoops::containsInvalidInstruction(MachineLoop *L, LLVM_DEBUG(dbgs() << "\nhw_loop head, " << printMBBReference(**L->block_begin())); for (MachineBasicBlock *MBB : L->getBlocks()) { - for (MachineBasicBlock::iterator - MII = MBB->begin(), E = MBB->end(); MII != E; ++MII) { - const MachineInstr *MI = &*MII; - if (isInvalidLoopOperation(MI, IsInnerHWLoop)) { + for (const MachineInstr &MI : *MBB) { + if (isInvalidLoopOperation(&MI, IsInnerHWLoop)) { LLVM_DEBUG(dbgs() << "\nCannot convert to hw_loop due to:"; - MI->dump();); + MI.dump();); return true; } } @@ -1034,8 +1032,7 @@ bool HexagonHardwareLoops::containsInvalidInstruction(MachineLoop *L, bool HexagonHardwareLoops::isDead(const MachineInstr *MI, SmallVectorImpl<MachineInstr *> &DeadPhis) const { // Examine each operand. - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg() || !MO.isDef()) continue; @@ -1089,8 +1086,7 @@ void HexagonHardwareLoops::removeIfDead(MachineInstr *MI) { // It is possible that some DBG_VALUE instructions refer to this // instruction. Examine each def operand for such references; // if found, mark the DBG_VALUE as undef (but don't delete it). - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg() || !MO.isDef()) continue; Register Reg = MO.getReg(); @@ -1123,7 +1119,7 @@ void HexagonHardwareLoops::removeIfDead(MachineInstr *MI) { bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L, bool &RecL0used, bool &RecL1used) { - // This is just for sanity. + // This is just to confirm basic correctness. 
assert(L->getHeader() && "Loop without a header?"); bool Changed = false; @@ -1877,8 +1873,7 @@ MachineBasicBlock *HexagonHardwareLoops::createPreheaderForLoop( if (TII->analyzeBranch(*ExitingBlock, TB, FB, Tmp1, false)) return nullptr; - for (MBBVector::iterator I = Preds.begin(), E = Preds.end(); I != E; ++I) { - MachineBasicBlock *PB = *I; + for (MachineBasicBlock *PB : Preds) { bool NotAnalyzed = TII->analyzeBranch(*PB, TB, FB, Tmp1, false); if (NotAnalyzed) return nullptr; @@ -1960,8 +1955,7 @@ MachineBasicBlock *HexagonHardwareLoops::createPreheaderForLoop( TB = FB = nullptr; - for (MBBVector::iterator I = Preds.begin(), E = Preds.end(); I != E; ++I) { - MachineBasicBlock *PB = *I; + for (MachineBasicBlock *PB : Preds) { if (PB != Latch) { Tmp2.clear(); bool NotAnalyzed = TII->analyzeBranch(*PB, TB, FB, Tmp2, false); diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp index b50a0e29ecae..ed4874baf7c8 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp @@ -1006,7 +1006,7 @@ static void packSegmentMask(ArrayRef<int> Mask, ArrayRef<unsigned> OutSegMap, static bool isPermutation(ArrayRef<int> Mask) { // Check by adding all numbers only works if there is no overflow. - assert(Mask.size() < 0x00007FFF && "Sanity failure"); + assert(Mask.size() < 0x00007FFF && "Overflow failure"); int Sum = 0; for (int Idx : Mask) { if (Idx == -1) @@ -1217,7 +1217,7 @@ OpRef HvxSelector::packs(ShuffleMask SM, OpRef Va, OpRef Vb, } else if (Seg0 == ~1u) { Seg0 = SegList[0] != Seg1 ? SegList[0] : SegList[1]; } else { - assert(Seg1 == ~1u); // Sanity + assert(Seg1 == ~1u); Seg1 = SegList[0] != Seg0 ? SegList[0] : SegList[1]; } } @@ -1265,7 +1265,7 @@ OpRef HvxSelector::packs(ShuffleMask SM, OpRef Va, OpRef Vb, } else { // BC or DA: this could be done via valign by SegLen. // Do nothing here, because valign (if possible) will be generated - // later on (make sure the Seg0 values are as expected, for sanity). + // later on (make sure the Seg0 values are as expected). assert(Seg0 == 1 || Seg0 == 3); } } @@ -1414,7 +1414,7 @@ OpRef HvxSelector::shuffs1(ShuffleMask SM, OpRef Va, ResultStack &Results) { return OpRef::undef(getSingleVT(MVT::i8)); unsigned HalfLen = HwLen / 2; - assert(isPowerOf2_32(HalfLen)); // Sanity. + assert(isPowerOf2_32(HalfLen)); // Handle special case where the output is the same half of the input // repeated twice, i.e. if Va = AB, then handle the output of AA or BB. 
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index 29572e3106d1..88effed9f076 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -442,8 +442,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, CLI.IsTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, IsVarArg, IsStructRet, StructAttrFlag, Outs, OutVals, Ins, DAG); - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { - CCValAssign &VA = ArgLocs[i]; + for (const CCValAssign &VA : ArgLocs) { if (VA.isMemLoc()) { CLI.IsTailCall = false; break; @@ -2549,7 +2548,8 @@ HexagonTargetLowering::extractVector(SDValue VecV, SDValue IdxV, // Special case for v{8,4,2}i1 (the only boolean vectors legal in Hexagon // without any coprocessors). if (ElemWidth == 1) { - assert(VecWidth == VecTy.getVectorNumElements() && "Sanity failure"); + assert(VecWidth == VecTy.getVectorNumElements() && + "Vector elements should equal vector width size"); assert(VecWidth == 8 || VecWidth == 4 || VecWidth == 2); // Check if this is an extract of the lowest bit. if (IdxN) { @@ -2863,8 +2863,7 @@ HexagonTargetLowering::LowerCONCAT_VECTORS(SDValue Op, Scale /= 2; } - // Another sanity check. At this point there should only be two words - // left, and Scale should be 2. + // At this point there should only be two words left, and Scale should be 2. assert(Scale == 2 && Words[IdxW].size() == 2); SDValue WW = DAG.getNode(HexagonISD::COMBINE, dl, MVT::i64, diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index 8900fca8bb78..f7237f496aee 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -9,6 +9,7 @@ #include "HexagonISelLowering.h" #include "HexagonRegisterInfo.h" #include "HexagonSubtarget.h" +#include "llvm/Analysis/MemoryLocation.h" #include "llvm/IR/IntrinsicsHexagon.h" #include "llvm/Support/CommandLine.h" @@ -1846,16 +1847,18 @@ HexagonTargetLowering::SplitHvxMemOp(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = MemN->getChain(); SDValue Base0 = MemN->getBasePtr(); SDValue Base1 = DAG.getMemBasePlusOffset(Base0, TypeSize::Fixed(HwLen), dl); + unsigned MemOpc = MemN->getOpcode(); MachineMemOperand *MOp0 = nullptr, *MOp1 = nullptr; if (MachineMemOperand *MMO = MemN->getMemOperand()) { MachineFunction &MF = DAG.getMachineFunction(); - MOp0 = MF.getMachineMemOperand(MMO, 0, HwLen); - MOp1 = MF.getMachineMemOperand(MMO, HwLen, HwLen); + uint64_t MemSize = (MemOpc == ISD::MLOAD || MemOpc == ISD::MSTORE) + ? 
(uint64_t)MemoryLocation::UnknownSize + : HwLen; + MOp0 = MF.getMachineMemOperand(MMO, 0, MemSize); + MOp1 = MF.getMachineMemOperand(MMO, HwLen, MemSize); } - unsigned MemOpc = MemN->getOpcode(); - if (MemOpc == ISD::LOAD) { assert(cast<LoadSDNode>(Op)->isUnindexed()); SDValue Load0 = DAG.getLoad(SingleTy, dl, Chain, Base0, MOp0); diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp index 76220eff4d51..b6984d40f78e 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -193,9 +193,7 @@ static inline void parseOperands(const MachineInstr &MI, Defs.clear(); Uses.clear(); - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI.getOperand(i); - + for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; @@ -1644,8 +1642,7 @@ bool HexagonInstrInfo::ClobbersPredicate(MachineInstr &MI, bool SkipDead) const { const HexagonRegisterInfo &HRI = *Subtarget.getRegisterInfo(); - for (unsigned oper = 0; oper < MI.getNumOperands(); ++oper) { - MachineOperand MO = MI.getOperand(oper); + for (const MachineOperand &MO : MI.operands()) { if (MO.isReg()) { if (!MO.isDef()) continue; diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp index 9507de95231f..987c4a5fa6c4 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp @@ -109,8 +109,7 @@ void llvm::HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI, assert(MCI->getOpcode() == static_cast<unsigned>(MI->getOpcode()) && "MCI opcode should have been set on construction"); - for (unsigned i = 0, e = MI->getNumOperands(); i < e; i++) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { MCOperand MCO; bool MustExtend = MO.getTargetFlags() & HexagonII::HMOTF_ConstExtended; diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonPeephole.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonPeephole.cpp index fc31139e13ce..1ff248200572 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonPeephole.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonPeephole.cpp @@ -120,16 +120,12 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { if (DisableHexagonPeephole) return false; // Loop over all of the basic blocks. - for (MachineFunction::iterator MBBb = MF.begin(), MBBe = MF.end(); - MBBb != MBBe; ++MBBb) { - MachineBasicBlock *MBB = &*MBBb; + for (MachineBasicBlock &MBB : MF) { PeepholeMap.clear(); PeepholeDoubleRegsMap.clear(); // Traverse the basic block. 
- for (auto I = MBB->begin(), E = MBB->end(), NextI = I; I != E; I = NextI) { - NextI = std::next(I); - MachineInstr &MI = *I; + for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) { // Look for sign extends: // %170 = SXTW %166 if (!DisableOptSZExt && MI.getOpcode() == Hexagon::A2_sxtw) { @@ -274,11 +270,11 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { if (NewOp) { Register PSrc = MI.getOperand(PR).getReg(); if (unsigned POrig = PeepholeMap.lookup(PSrc)) { - BuildMI(*MBB, MI.getIterator(), MI.getDebugLoc(), - QII->get(NewOp), MI.getOperand(0).getReg()) - .addReg(POrig) - .add(MI.getOperand(S2)) - .add(MI.getOperand(S1)); + BuildMI(MBB, MI.getIterator(), MI.getDebugLoc(), QII->get(NewOp), + MI.getOperand(0).getReg()) + .addReg(POrig) + .add(MI.getOperand(S2)) + .add(MI.getOperand(S1)); MRI->clearKillFlags(POrig); MI.eraseFromParent(); } diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp index 93ba277b0c9d..2c5c64cfcfc6 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp @@ -400,8 +400,7 @@ bool HexagonStoreWidening::createWideStores(InstrGroup &OG, InstrGroup &NG, unsigned Acc = 0; // Value accumulator. unsigned Shift = 0; - for (InstrGroup::iterator I = OG.begin(), E = OG.end(); I != E; ++I) { - MachineInstr *MI = *I; + for (MachineInstr *MI : OG) { const MachineMemOperand &MMO = getStoreTarget(MI); MachineOperand &SO = MI->getOperand(2); // Source. assert(SO.isImm() && "Expecting an immediate operand"); diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp index 87b1c43961d7..ecb2f88d8096 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -305,8 +305,7 @@ void HexagonSubtarget::CallMutation::apply(ScheduleDAGInstrs *DAGInstrs) { VRegHoldingReg[MI->getOperand(0).getReg()] = MI->getOperand(1).getReg(); LastVRegUse.erase(MI->getOperand(1).getReg()); } else { - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg()) continue; if (MO.isUse() && !MI->isCopy() && diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp index 897fb209a8bf..ea2798a3b44e 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp @@ -749,7 +749,6 @@ auto AlignVectors::realignGroup(const MoveGroup &Move) const -> bool { WithMaxAlign.ValTy, Adjust); int Diff = Start - (OffAtMax + Adjust); AlignVal = HVC.getConstInt(Diff); - // Sanity. 
assert(Diff >= 0); assert(static_cast<decltype(MinNeeded.value())>(Diff) < MinNeeded.value()); } else { diff --git a/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiDelaySlotFiller.cpp b/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiDelaySlotFiller.cpp index b9e577d201f9..cafe93bf8f4b 100644 --- a/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiDelaySlotFiller.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiDelaySlotFiller.cpp @@ -51,9 +51,8 @@ struct Filler : public MachineFunctionPass { TRI = Subtarget.getRegisterInfo(); bool Changed = false; - for (MachineFunction::iterator FI = MF.begin(), FE = MF.end(); FI != FE; - ++FI) - Changed |= runOnMachineBasicBlock(*FI); + for (MachineBasicBlock &MBB : MF) + Changed |= runOnMachineBasicBlock(MBB); return Changed; } @@ -200,8 +199,7 @@ bool Filler::delayHasHazard(MachineBasicBlock::instr_iterator MI, bool &SawLoad, assert((!MI->isCall() && !MI->isReturn()) && "Cannot put calls or returns in delay slot."); - for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) { - const MachineOperand &MO = MI->getOperand(I); + for (const MachineOperand &MO : MI->operands()) { unsigned Reg; if (!MO.isReg() || !(Reg = MO.getReg())) diff --git a/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiFrameLowering.cpp index 3a2d5030775e..3644eafe4353 100644 --- a/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiFrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiFrameLowering.cpp @@ -65,17 +65,14 @@ void LanaiFrameLowering::replaceAdjDynAllocPseudo(MachineFunction &MF) const { *static_cast<const LanaiInstrInfo *>(STI.getInstrInfo()); unsigned MaxCallFrameSize = MF.getFrameInfo().getMaxCallFrameSize(); - for (MachineFunction::iterator MBB = MF.begin(), E = MF.end(); MBB != E; - ++MBB) { - MachineBasicBlock::iterator MBBI = MBB->begin(); - while (MBBI != MBB->end()) { - MachineInstr &MI = *MBBI++; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) { if (MI.getOpcode() == Lanai::ADJDYNALLOC) { DebugLoc DL = MI.getDebugLoc(); Register Dst = MI.getOperand(0).getReg(); Register Src = MI.getOperand(1).getReg(); - BuildMI(*MBB, MI, DL, LII.get(Lanai::ADD_I_LO), Dst) + BuildMI(MBB, MI, DL, LII.get(Lanai::ADD_I_LO), Dst) .addReg(Src) .addImm(MaxCallFrameSize); MI.eraseFromParent(); diff --git a/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp index 21d035c7ee9c..4217b8509676 100644 --- a/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp @@ -467,8 +467,7 @@ static MachineInstr *canFoldIntoSelect(Register Reg, return nullptr; // Check if MI has any non-dead defs or physreg uses. This also detects // predicated instructions which will be reading SR. - for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : llvm::drop_begin(MI->operands(), 1)) { // Reject frame index operands. 
if (MO.isFI() || MO.isCPI() || MO.isJTI()) return nullptr; diff --git a/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiMCInstLower.cpp b/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiMCInstLower.cpp index 743f4f7c6e2f..479c0b1f0358 100644 --- a/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiMCInstLower.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiMCInstLower.cpp @@ -93,9 +93,7 @@ MCOperand LanaiMCInstLower::LowerSymbolOperand(const MachineOperand &MO, void LanaiMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.setOpcode(MI->getOpcode()); - for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) { - const MachineOperand &MO = MI->getOperand(I); - + for (const MachineOperand &MO : MI->operands()) { MCOperand MCOp; switch (MO.getType()) { case MachineOperand::MO_Register: diff --git a/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp index a83a5d2dfcc9..2a77a150f9aa 100644 --- a/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp @@ -211,8 +211,8 @@ bool MSP430FrameLowering::restoreCalleeSavedRegisters( MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - for (unsigned i = 0, e = CSI.size(); i != e; ++i) - BuildMI(MBB, MI, DL, TII.get(MSP430::POP16r), CSI[i].getReg()); + for (const CalleeSavedInfo &I : CSI) + BuildMI(MBB, MI, DL, TII.get(MSP430::POP16r), I.getReg()); return true; } diff --git a/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp b/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp index 1e57f33386e6..52c037de7660 100644 --- a/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp +++ b/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp @@ -115,9 +115,7 @@ LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const { void MSP430MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.setOpcode(MI->getOpcode()); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); - + for (const MachineOperand &MO : MI->operands()) { MCOperand MCOp; switch (MO.getType()) { default: diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/Mips16FrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/Mips16FrameLowering.cpp index fefa1134b021..622f2039f9e4 100644 --- a/contrib/llvm-project/llvm/lib/Target/Mips/Mips16FrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Mips/Mips16FrameLowering.cpp @@ -72,10 +72,9 @@ void Mips16FrameLowering::emitPrologue(MachineFunction &MF, if (!CSI.empty()) { const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); - for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(), - E = CSI.end(); I != E; ++I) { - int64_t Offset = MFI.getObjectOffset(I->getFrameIdx()); - unsigned Reg = I->getReg(); + for (const CalleeSavedInfo &I : CSI) { + int64_t Offset = MFI.getObjectOffset(I.getFrameIdx()); + unsigned Reg = I.getReg(); unsigned DReg = MRI->getDwarfRegNum(Reg, true); unsigned CFIIndex = MF.addFrameInst( MCCFIInstruction::createOffset(nullptr, DReg, Offset)); @@ -119,13 +118,13 @@ bool Mips16FrameLowering::spillCalleeSavedRegisters( // will be saved with the "save" instruction // during emitPrologue // - for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + for (const CalleeSavedInfo &I : CSI) { // Add the callee-saved register as 
live-in. Do not add if the register is // RA and return address is taken, because it has already been added in // method MipsTargetLowering::lowerRETURNADDR. // It's killed at the spill, unless the register is RA and return address // is taken. - unsigned Reg = CSI[i].getReg(); + unsigned Reg = I.getReg(); bool IsRAAndRetAddrIsTaken = (Reg == Mips::RA) && MF->getFrameInfo().isReturnAddressTaken(); if (!IsRAAndRetAddrIsTaken) diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/MipsAsmPrinter.cpp index 6d3f3adb2b7a..5d026785b921 100644 --- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsAsmPrinter.cpp @@ -163,9 +163,8 @@ static void emitDirectiveRelocJalr(const MachineInstr &MI, TargetMachine &TM, MCStreamer &OutStreamer, const MipsSubtarget &Subtarget) { - for (unsigned int I = MI.getDesc().getNumOperands(), E = MI.getNumOperands(); - I < E; ++I) { - MachineOperand MO = MI.getOperand(I); + for (const MachineOperand &MO : + llvm::drop_begin(MI.operands(), MI.getDesc().getNumOperands())) { if (MO.isMCSymbol() && (MO.getTargetFlags() & MipsII::MO_JALR)) { MCSymbol *Callee = MO.getMCSymbol(); if (Callee && !Callee->getName().empty()) { diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp index 8e619549f01c..491d379bfe0b 100644 --- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp @@ -637,8 +637,8 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) { // has any inline assembly in it. If so, we have to be conservative about // alignment assumptions, as we don't know for sure the size of any // instructions in the inline assembly. - for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) - computeBlockSize(&*I); + for (MachineBasicBlock &MBB : *MF) + computeBlockSize(&MBB); // Compute block offsets. adjustBBOffsetsAfter(&MF->front()); @@ -730,8 +730,8 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) { continue; // Scan the instructions for constant pool operands. - for (unsigned op = 0, e = MI.getNumOperands(); op != e; ++op) - if (MI.getOperand(op).isCPI()) { + for (const MachineOperand &MO : MI.operands()) + if (MO.isCPI()) { // We found one. The addressing mode tells us the max displacement // from the PC that this instruction permits. @@ -759,7 +759,7 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) { break; } // Remember that this is a user of a CP entry. - unsigned CPI = MI.getOperand(op).getIndex(); + unsigned CPI = MO.getIndex(); MachineInstr *CPEMI = CPEMIs[CPI]; unsigned MaxOffs = ((1 << Bits)-1) * Scale; unsigned LongFormMaxOffs = ((1 << LongFormBits)-1) * LongFormScale; @@ -1066,9 +1066,9 @@ int MipsConstantIslands::findInRangeCPEntry(CPUser& U, unsigned UserOffset) // Point the CPUser node to the replacement U.CPEMI = CPEs[i].CPEMI; // Change the CPI in the instruction operand to refer to the clone. - for (unsigned j = 0, e = UserMI->getNumOperands(); j != e; ++j) - if (UserMI->getOperand(j).isCPI()) { - UserMI->getOperand(j).setIndex(CPEs[i].CPI); + for (MachineOperand &MO : UserMI->operands()) + if (MO.isCPI()) { + MO.setIndex(CPEs[i].CPI); break; } // Adjust the refcount of the clone... 
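Several of the conversions above replace an index loop that starts past operand 0 with llvm::drop_begin: LanaiInstrInfo skips the def in operand 0, MipsAsmPrinter skips the first getNumOperands() fixed operands, and MipsSEISelLowering does the same further down. drop_begin simply yields the original range minus its first N elements. A minimal stand-alone analogue, with invented names and data, looks like the sketch below; the real helper lives in llvm/ADT/STLExtras.h and works on arbitrary ranges.

```cpp
#include <iostream>
#include <iterator>
#include <string>
#include <vector>

// Minimal stand-in for llvm::drop_begin: view a container without its
// first N elements. Assumes N <= C.size(); illustrative only.
template <typename Container> struct DropBegin {
  Container &C;
  std::size_t N;
  auto begin() { return std::next(C.begin(), N); }
  auto end() { return C.end(); }
};

template <typename Container>
DropBegin<Container> dropBegin(Container &C, std::size_t N = 1) {
  return {C, N};
}

int main() {
  // Think of Ops as an instruction's operand list: skip the leading def
  // (or the first NumOperands fixed operands) and walk only the rest.
  std::vector<std::string> Ops{"def", "use0", "use1", "use2"};
  for (const std::string &Op : dropBegin(Ops, 1))
    std::cout << Op << '\n'; // prints use0, use1, use2
}
```

Using a range adaptor keeps the loop body identical to a plain range-for and removes the off-by-one-prone manual index arithmetic the old loops carried.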
@@ -1122,9 +1122,9 @@ int MipsConstantIslands::findLongFormInRangeCPEntry // Point the CPUser node to the replacement U.CPEMI = CPEs[i].CPEMI; // Change the CPI in the instruction operand to refer to the clone. - for (unsigned j = 0, e = UserMI->getNumOperands(); j != e; ++j) - if (UserMI->getOperand(j).isCPI()) { - UserMI->getOperand(j).setIndex(CPEs[i].CPI); + for (MachineOperand &MO : UserMI->operands()) + if (MO.isCPI()) { + MO.setIndex(CPEs[i].CPI); break; } // Adjust the refcount of the clone... @@ -1392,9 +1392,9 @@ bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) { adjustBBOffsetsAfter(&*--NewIsland->getIterator()); // Finally, change the CPI in the instruction operand to be ID. - for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i) - if (UserMI->getOperand(i).isCPI()) { - UserMI->getOperand(i).setIndex(ID); + for (MachineOperand &MO : UserMI->operands()) + if (MO.isCPI()) { + MO.setIndex(ID); break; } @@ -1633,10 +1633,10 @@ MipsConstantIslands::fixupConditionalBr(ImmBranch &Br) { void MipsConstantIslands::prescanForConstants() { unsigned J = 0; (void)J; - for (MachineFunction::iterator B = - MF->begin(), E = MF->end(); B != E; ++B) { - for (MachineBasicBlock::instr_iterator I = - B->instr_begin(), EB = B->instr_end(); I != EB; ++I) { + for (MachineBasicBlock &B : *MF) { + for (MachineBasicBlock::instr_iterator I = B.instr_begin(), + EB = B.instr_end(); + I != EB; ++I) { switch(I->getDesc().getOpcode()) { case Mips::LwConstant32: { PrescannedForConstants = true; diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp index c2e3d7393a6d..2d27d7553de6 100644 --- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp @@ -218,9 +218,8 @@ namespace { bool runOnMachineFunction(MachineFunction &F) override { TM = &F.getTarget(); bool Changed = false; - for (MachineFunction::iterator FI = F.begin(), FE = F.end(); - FI != FE; ++FI) - Changed |= runOnMachineBasicBlock(*FI); + for (MachineBasicBlock &MBB : F) + Changed |= runOnMachineBasicBlock(MBB); // This pass invalidates liveness information when it reorders // instructions to fill delay slot. 
Without this, -verify-machineinstrs diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsExpandPseudo.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/MipsExpandPseudo.cpp index f72dc1da4131..31180d5a23ef 100644 --- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsExpandPseudo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsExpandPseudo.cpp @@ -896,9 +896,8 @@ bool MipsExpandPseudo::runOnMachineFunction(MachineFunction &MF) { TII = STI->getInstrInfo(); bool Modified = false; - for (MachineFunction::iterator MFI = MF.begin(), E = MF.end(); MFI != E; - ++MFI) - Modified |= expandMBB(*MFI); + for (MachineBasicBlock &MBB : MF) + Modified |= expandMBB(MBB); if (Modified) MF.RenumberBlocks(); diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsFrameLowering.h b/contrib/llvm-project/llvm/lib/Target/Mips/MipsFrameLowering.h index 612b2b712fa8..710a3d40c38e 100644 --- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsFrameLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsFrameLowering.h @@ -34,7 +34,10 @@ public: bool hasBP(const MachineFunction &MF) const; - bool isFPCloseToIncomingSP() const override { return false; } + bool allocateScavengingFrameIndexesNearIncomingSP( + const MachineFunction &MF) const override { + return false; + } bool enableShrinkWrapping(const MachineFunction &MF) const override { return true; diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsMCInstLower.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/MipsMCInstLower.cpp index 66e04bda2af3..7b58cb90ab87 100644 --- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsMCInstLower.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsMCInstLower.cpp @@ -318,8 +318,7 @@ void MipsMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.setOpcode(MI->getOpcode()); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { MCOperand MCOp = LowerOperand(MO); if (MCOp.isValid()) diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp index bb4b9c6fa6a7..193d071447ff 100644 --- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp @@ -452,10 +452,9 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF, // Iterate over list of callee-saved registers and emit .cfi_offset // directives. - for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(), - E = CSI.end(); I != E; ++I) { - int64_t Offset = MFI.getObjectOffset(I->getFrameIdx()); - unsigned Reg = I->getReg(); + for (const CalleeSavedInfo &I : CSI) { + int64_t Offset = MFI.getObjectOffset(I.getFrameIdx()); + unsigned Reg = I.getReg(); // If Reg is a double precision register, emit two cfa_offsets, // one for each of the paired single precision registers. @@ -796,13 +795,13 @@ bool MipsSEFrameLowering::spillCalleeSavedRegisters( MachineFunction *MF = MBB.getParent(); const TargetInstrInfo &TII = *STI.getInstrInfo(); - for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + for (const CalleeSavedInfo &I : CSI) { // Add the callee-saved register as live-in. Do not add if the register is // RA and return address is taken, because it has already been added in // method MipsTargetLowering::lowerRETURNADDR. // It's killed at the spill, unless the register is RA and return address // is taken. 
- unsigned Reg = CSI[i].getReg(); + unsigned Reg = I.getReg(); bool IsRAAndRetAddrIsTaken = (Reg == Mips::RA || Reg == Mips::RA_64) && MF->getFrameInfo().isReturnAddressTaken(); if (!IsRAAndRetAddrIsTaken) @@ -831,8 +830,7 @@ bool MipsSEFrameLowering::spillCalleeSavedRegisters( // Insert the spill to the stack frame. bool IsKill = !IsRAAndRetAddrIsTaken; const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.storeRegToStackSlot(MBB, MI, Reg, IsKill, - CSI[i].getFrameIdx(), RC, TRI); + TII.storeRegToStackSlot(MBB, MI, Reg, IsKill, I.getFrameIdx(), RC, TRI); } return true; diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/MipsSEISelLowering.cpp index 1fe6ab09804b..40b215a8204c 100644 --- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsSEISelLowering.cpp @@ -3581,8 +3581,8 @@ MipsSETargetLowering::emitLD_F16_PSEUDO(MachineInstr &MI, MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(UsingMips32 ? Mips::LH : Mips::LH64), Rt); - for (unsigned i = 1; i < MI.getNumOperands(); i++) - MIB.add(MI.getOperand(i)); + for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) + MIB.add(MO); if(!UsingMips32) { Register Tmp = RegInfo.createVirtualRegister(&Mips::GPR32RegClass); diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index aab6d2034f11..c35e67d6726f 100644 --- a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -130,10 +130,8 @@ VisitGlobalVariableForEmission(const GlobalVariable *GV, for (unsigned i = 0, e = GV->getNumOperands(); i != e; ++i) DiscoverDependentGlobals(GV->getOperand(i), Others); - for (DenseSet<const GlobalVariable *>::iterator I = Others.begin(), - E = Others.end(); - I != E; ++I) - VisitGlobalVariableForEmission(*I, Order, Visited, Visiting); + for (const GlobalVariable *GV : Others) + VisitGlobalVariableForEmission(GV, Order, Visited, Visiting); // Now we can visit ourself Order.push_back(GV); @@ -699,35 +697,33 @@ static bool useFuncSeen(const Constant *C, void NVPTXAsmPrinter::emitDeclarations(const Module &M, raw_ostream &O) { DenseMap<const Function *, bool> seenMap; - for (Module::const_iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI) { - const Function *F = &*FI; - - if (F->getAttributes().hasFnAttr("nvptx-libcall-callee")) { - emitDeclaration(F, O); + for (const Function &F : M) { + if (F.getAttributes().hasFnAttr("nvptx-libcall-callee")) { + emitDeclaration(&F, O); continue; } - if (F->isDeclaration()) { - if (F->use_empty()) + if (F.isDeclaration()) { + if (F.use_empty()) continue; - if (F->getIntrinsicID()) + if (F.getIntrinsicID()) continue; - emitDeclaration(F, O); + emitDeclaration(&F, O); continue; } - for (const User *U : F->users()) { + for (const User *U : F.users()) { if (const Constant *C = dyn_cast<Constant>(U)) { if (usedInGlobalVarDef(C)) { // The use is in the initialization of a global variable // that is a function pointer, so print a declaration // for the original function - emitDeclaration(F, O); + emitDeclaration(&F, O); break; } // Emit a declaration of this function if the function that // uses this constant expr has already been seen. 
if (useFuncSeen(C, seenMap)) { - emitDeclaration(F, O); + emitDeclaration(&F, O); break; } } @@ -746,11 +742,11 @@ void NVPTXAsmPrinter::emitDeclarations(const Module &M, raw_ostream &O) { // appearing in the module before the callee. so print out // a declaration for the callee. if (seenMap.find(caller) != seenMap.end()) { - emitDeclaration(F, O); + emitDeclaration(&F, O); break; } } - seenMap[F] = true; + seenMap[&F] = true; } } @@ -887,33 +883,11 @@ bool NVPTXAsmPrinter::doFinalization(Module &M) { GlobalsEmitted = true; } - // XXX Temproarily remove global variables so that doFinalization() will not - // emit them again (global variables are emitted at beginning). - - Module::GlobalListType &global_list = M.getGlobalList(); - int i, n = global_list.size(); - GlobalVariable **gv_array = new GlobalVariable *[n]; - - // first, back-up GlobalVariable in gv_array - i = 0; - for (Module::global_iterator I = global_list.begin(), E = global_list.end(); - I != E; ++I) - gv_array[i++] = &*I; - - // second, empty global_list - while (!global_list.empty()) - global_list.remove(global_list.begin()); - // call doFinalization bool ret = AsmPrinter::doFinalization(M); - // now we restore global variables - for (i = 0; i < n; i++) - global_list.insert(global_list.end(), gv_array[i]); - clearAnnotationCache(&M); - delete[] gv_array; // Close the last emitted section if (HasDebugInfo) { static_cast<NVPTXTargetStreamer *>(OutStreamer->getTargetStreamer()) diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h index 5d680e731e4a..2a3a38d7b2f1 100644 --- a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h +++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h @@ -306,6 +306,11 @@ public: std::string getVirtualRegisterName(unsigned) const; const MCSymbol *getFunctionFrameSymbol() const override; + + // Make emitGlobalVariable() no-op for NVPTX. + // Global variables have been already emitted by the time the base AsmPrinter + // attempts to do so in doFinalization() (see NVPTXAsmPrinter::emitGlobals()). + void emitGlobalVariable(const GlobalVariable *GV) override {} }; } // end namespace llvm diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp index a8a43cee9ab7..34b9dfe87cc2 100644 --- a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp +++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp @@ -72,8 +72,7 @@ bool NVPTXAssignValidGlobalNames::runOnModule(Module &M) { std::string NVPTXAssignValidGlobalNames::cleanUpName(StringRef Name) { std::string ValidName; raw_string_ostream ValidNameStream(ValidName); - for (unsigned I = 0, E = Name.size(); I != E; ++I) { - char C = Name[I]; + for (char C : Name) { if (C == '.' 
|| C == '@') { ValidNameStream << "_$_"; } else { diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp index e404cead344b..f4934f0bc20b 100644 --- a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp +++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp @@ -56,23 +56,16 @@ bool NVPTXReplaceImageHandles::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; InstrsToRemove.clear(); - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; - ++BI) { - for (MachineBasicBlock::iterator I = (*BI).begin(), E = (*BI).end(); - I != E; ++I) { - MachineInstr &MI = *I; + for (MachineBasicBlock &MBB : MF) + for (MachineInstr &MI : MBB) Changed |= processInstr(MI); - } - } // Now clean up any handle-access instructions // This is needed in debug mode when code cleanup passes are not executed, // but we need the handle access to be eliminated because they are not // valid instructions when image handles are disabled. - for (DenseSet<MachineInstr *>::iterator I = InstrsToRemove.begin(), - E = InstrsToRemove.end(); I != E; ++I) { - (*I)->eraseFromParent(); - } + for (MachineInstr *MI : InstrsToRemove) + MI->eraseFromParent(); return Changed; } diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/P10InstrResources.td b/contrib/llvm-project/llvm/lib/Target/PowerPC/P10InstrResources.td index f43ba00ec373..f3ae0010ad8e 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/P10InstrResources.td +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/P10InstrResources.td @@ -626,7 +626,9 @@ def : InstRW<[P10W_DX_5C, P10W_DISP_ANY, P10DX_Read, P10DX_Read], // 5 Cycles Fixed-Point and BCD operations, 3 input operands def : InstRW<[P10W_DX_5C, P10W_DISP_ANY, P10DX_Read, P10DX_Read, P10DX_Read], (instrs + BCDADD_rec, BCDS_rec, + BCDSUB_rec, BCDTRUNC_rec, VADDECUQ, VADDEUQM, @@ -1974,7 +1976,7 @@ def : InstRW<[P10W_SX, P10W_DISP_ANY], ICBLQ, ICBTLS, ICCCI, - LA, + LA, LA8, LDMX, MFDCR, MFPMR, @@ -2073,3 +2075,4 @@ def : InstRW<[P10W_vMU_7C, P10W_DISP_ANY, P10vMU_Read, P10vMU_Read, P10vMU_Read] VMSUMUHM, VMSUMUHS )>; + diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/P9InstrResources.td b/contrib/llvm-project/llvm/lib/Target/PowerPC/P9InstrResources.td index c4f4a2b3d796..f7c049951c54 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/P9InstrResources.td +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/P9InstrResources.td @@ -151,6 +151,7 @@ def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C], (instregex "ADD(4|8)(TLS)?(_)?$"), (instregex "NEG(8)?(O)?$"), (instregex "ADDI(S)?toc(HA|L)(8)?$"), + (instregex "LA(8)?$"), COPY, MCRF, MCRXRX, @@ -165,7 +166,6 @@ def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C], SRADI_32, RLDIC, RFEBB, - LA, TBEGIN, TRECHKPT, NOP, @@ -624,7 +624,9 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C], BCDS_rec, BCDTRUNC_rec, BCDUS_rec, - BCDUTRUNC_rec + BCDUTRUNC_rec, + BCDADD_rec, + BCDSUB_rec )>; // 12 Cycle DFU operation. 
Only one DFU unit per CPU so we use a whole diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPC.td b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPC.td index a1ff20bb3612..422bd11dca52 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPC.td +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPC.td @@ -203,6 +203,22 @@ def FeatureLogicalFusion : SubtargetFeature<"fuse-logical", "HasLogicalFusion", "true", "Target supports Logical Operations fusion", [FeatureFusion]>; +def FeatureSha3Fusion : + SubtargetFeature<"fuse-sha3", "HasSha3Fusion", "true", + "Target supports SHA3 assist fusion", + [FeatureFusion]>; +def FeatureCompareFusion: + SubtargetFeature<"fuse-cmp", "HasCompareFusion", "true", + "Target supports Comparison Operations fusion", + [FeatureFusion]>; +def FeatureWideImmFusion: + SubtargetFeature<"fuse-wideimm", "HasWideImmFusion", "true", + "Target supports Wide-Immediate fusion", + [FeatureFusion]>; +def FeatureZeroMoveFusion: + SubtargetFeature<"fuse-zeromove", "HasZeroMoveFusion", "true", + "Target supports move to SPR with branch fusion", + [FeatureFusion]>; def FeatureUnalignedFloats : SubtargetFeature<"allow-unaligned-fp-access", "AllowsUnalignedFPAccess", "true", "CPU does not trap on unaligned FP access">; @@ -393,7 +409,7 @@ def ProcessorFeatures { // still exist with the exception of those we know are Power9 specific. list<SubtargetFeature> FusionFeatures = [ FeatureStoreFusion, FeatureAddLogicalFusion, FeatureLogicalAddFusion, - FeatureLogicalFusion, FeatureArithAddFusion + FeatureLogicalFusion, FeatureArithAddFusion, FeatureSha3Fusion, ]; list<SubtargetFeature> P10AdditionalFeatures = !listconcat(FusionFeatures, [ diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index a76963abb8e4..16e3b2b85c2e 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -875,18 +875,19 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { EmitToStreamer(*OutStreamer, TmpInst); return; } - case PPC::ADDItoc: { + case PPC::ADDItoc: + case PPC::ADDItoc8: { assert(IsAIX && TM.getCodeModel() == CodeModel::Small && - "Operand only valid in AIX 32 bit mode"); + "PseudoOp only valid for small code model AIX"); - // Transform %rN = ADDItoc @op1, %r2. + // Transform %rN = ADDItoc/8 @op1, %r2. LowerPPCMachineInstrToMCInst(MI, TmpInst, *this); // Change the opcode to load address. - TmpInst.setOpcode(PPC::LA); + TmpInst.setOpcode((!IsPPC64) ? (PPC::LA) : (PPC::LA8)); const MachineOperand &MO = MI->getOperand(1); - assert(MO.isGlobal() && "Invalid operand for ADDItoc."); + assert(MO.isGlobal() && "Invalid operand for ADDItoc[8]."); // Map the operand to its corresponding MCSymbol. 
const MCSymbol *const MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this); diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp index fa6713dcca80..4cac0e3551f6 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp @@ -120,16 +120,13 @@ unsigned PPCBSel::ComputeBlockSizes(MachineFunction &Fn) { static_cast<const PPCInstrInfo *>(Fn.getSubtarget().getInstrInfo()); unsigned FuncSize = GetInitialOffset(Fn); - for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; - ++MFI) { - MachineBasicBlock *MBB = &*MFI; - + for (MachineBasicBlock &MBB : Fn) { // The end of the previous block may have extra nops if this block has an // alignment requirement. - if (MBB->getNumber() > 0) { - unsigned AlignExtra = GetAlignmentAdjustment(*MBB, FuncSize); + if (MBB.getNumber() > 0) { + unsigned AlignExtra = GetAlignmentAdjustment(MBB, FuncSize); - auto &BS = BlockSizes[MBB->getNumber()-1]; + auto &BS = BlockSizes[MBB.getNumber()-1]; BS.first += AlignExtra; BS.second = AlignExtra; @@ -138,10 +135,10 @@ unsigned PPCBSel::ComputeBlockSizes(MachineFunction &Fn) { unsigned BlockSize = 0; unsigned UnalignedBytesRemaining = 0; - for (MachineInstr &MI : *MBB) { + for (MachineInstr &MI : MBB) { unsigned MINumBytes = TII->getInstSizeInBytes(MI); if (MI.isInlineAsm() && (FirstImpreciseBlock < 0)) - FirstImpreciseBlock = MBB->getNumber(); + FirstImpreciseBlock = MBB.getNumber(); if (TII->isPrefixed(MI.getOpcode())) { NumPrefixed++; @@ -171,7 +168,7 @@ unsigned PPCBSel::ComputeBlockSizes(MachineFunction &Fn) { BlockSize += MINumBytes; } - BlockSizes[MBB->getNumber()].first = BlockSize; + BlockSizes[MBB.getNumber()].first = BlockSize; FuncSize += BlockSize; } @@ -181,16 +178,13 @@ unsigned PPCBSel::ComputeBlockSizes(MachineFunction &Fn) { /// Modify the basic block align adjustment. 
void PPCBSel::modifyAdjustment(MachineFunction &Fn) { unsigned Offset = GetInitialOffset(Fn); - for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; - ++MFI) { - MachineBasicBlock *MBB = &*MFI; - - if (MBB->getNumber() > 0) { - auto &BS = BlockSizes[MBB->getNumber()-1]; + for (MachineBasicBlock &MBB : Fn) { + if (MBB.getNumber() > 0) { + auto &BS = BlockSizes[MBB.getNumber()-1]; BS.first -= BS.second; Offset -= BS.second; - unsigned AlignExtra = GetAlignmentAdjustment(*MBB, Offset); + unsigned AlignExtra = GetAlignmentAdjustment(MBB, Offset); BS.first += AlignExtra; BS.second = AlignExtra; @@ -198,7 +192,7 @@ void PPCBSel::modifyAdjustment(MachineFunction &Fn) { Offset += AlignExtra; } - Offset += BlockSizes[MBB->getNumber()].first; + Offset += BlockSizes[MBB.getNumber()].first; } } diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp index b9518d6d7064..b1f5bdd885cd 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp @@ -81,8 +81,7 @@ FunctionPass *llvm::createPPCCTRLoopsVerify() { } static bool clobbersCTR(const MachineInstr &MI) { - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { if (MO.isReg()) { if (MO.isDef() && (MO.getReg() == PPC::CTR || MO.getReg() == PPC::CTR8)) return true; @@ -167,18 +166,16 @@ bool PPCCTRLoopsVerify::runOnMachineFunction(MachineFunction &MF) { // Verify that all bdnz/bdz instructions are dominated by a loop mtctr before // any other instructions that might clobber the ctr register. - for (MachineFunction::iterator I = MF.begin(), IE = MF.end(); - I != IE; ++I) { - MachineBasicBlock *MBB = &*I; - if (!MDT->isReachableFromEntry(MBB)) + for (MachineBasicBlock &MBB : MF) { + if (!MDT->isReachableFromEntry(&MBB)) continue; - for (MachineBasicBlock::iterator MII = MBB->getFirstTerminator(), - MIIE = MBB->end(); MII != MIIE; ++MII) { + for (MachineBasicBlock::iterator MII = MBB.getFirstTerminator(), + MIIE = MBB.end(); MII != MIIE; ++MII) { unsigned Opc = MII->getOpcode(); if (Opc == PPC::BDNZ8 || Opc == PPC::BDNZ || Opc == PPC::BDZ8 || Opc == PPC::BDZ) - if (!verifyCTRBranch(MBB, MII)) + if (!verifyCTRBranch(&MBB, MII)) llvm_unreachable("Invalid PPC CTR loop!"); } } diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp index be4c9dd60b00..a9794ddd0566 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp @@ -74,8 +74,7 @@ bool PPCExpandAtomicPseudo::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; TII = static_cast<const PPCInstrInfo *>(MF.getSubtarget().getInstrInfo()); TRI = &TII->getRegisterInfo(); - for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) { - MachineBasicBlock &MBB = *I; + for (MachineBasicBlock &MBB : MF) { for (MachineBasicBlock::iterator MBBI = MBB.begin(), MBBE = MBB.end(); MBBI != MBBE;) { MachineInstr &MI = *MBBI; diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index fc3c7ec35b8d..3ca563fee970 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ 
b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -391,9 +391,8 @@ void PPCFrameLowering::replaceFPWithRealFP(MachineFunction &MF) const { unsigned BPReg = HasBP ? (unsigned) RegInfo->getBaseRegister(MF) : FPReg; unsigned BP8Reg = HasBP ? (unsigned) PPC::X30 : FP8Reg; - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) - for (MachineBasicBlock::iterator MBBI = BI->end(); MBBI != BI->begin(); ) { + for (MachineBasicBlock &MBB : MF) + for (MachineBasicBlock::iterator MBBI = MBB.end(); MBBI != MBB.begin();) { --MBBI; for (unsigned I = 0, E = MBBI->getNumOperands(); I != E; ++I) { MachineOperand &MO = MBBI->getOperand(I); @@ -1172,8 +1171,8 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, // Describe where callee saved registers were saved, at fixed offsets from // CFA. const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); - for (unsigned I = 0, E = CSI.size(); I != E; ++I) { - unsigned Reg = CSI[I].getReg(); + for (const CalleeSavedInfo &I : CSI) { + unsigned Reg = I.getReg(); if (Reg == PPC::LR || Reg == PPC::LR8 || Reg == PPC::RM) continue; // This is a bit of a hack: CR2LT, CR2GT, CR2EQ and CR2UN are just @@ -1204,15 +1203,15 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, continue; } - if (CSI[I].isSpilledToReg()) { - unsigned SpilledReg = CSI[I].getDstReg(); + if (I.isSpilledToReg()) { + unsigned SpilledReg = I.getDstReg(); unsigned CFIRegister = MF.addFrameInst(MCCFIInstruction::createRegister( nullptr, MRI->getDwarfRegNum(Reg, true), MRI->getDwarfRegNum(SpilledReg, true))); BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIRegister); } else { - int64_t Offset = MFI.getObjectOffset(CSI[I].getFrameIdx()); + int64_t Offset = MFI.getObjectOffset(I.getFrameIdx()); // We have changed the object offset above but we do not want to change // the actual offsets in the CFI instruction so we have to undo the // offset change here. @@ -2085,15 +2084,15 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF, SmallVector<CalleeSavedInfo, 18> FPRegs; SmallVector<CalleeSavedInfo, 18> VRegs; - for (unsigned i = 0, e = CSI.size(); i != e; ++i) { - unsigned Reg = CSI[i].getReg(); + for (const CalleeSavedInfo &I : CSI) { + unsigned Reg = I.getReg(); assert((!MF.getInfo<PPCFunctionInfo>()->mustSaveTOC() || (Reg != PPC::X2 && Reg != PPC::R2)) && "Not expecting to try to spill R2 in a function that must save TOC"); if (PPC::GPRCRegClass.contains(Reg)) { HasGPSaveArea = true; - GPRegs.push_back(CSI[i]); + GPRegs.push_back(I); if (Reg < MinGPR) { MinGPR = Reg; @@ -2101,7 +2100,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF, } else if (PPC::G8RCRegClass.contains(Reg)) { HasG8SaveArea = true; - G8Regs.push_back(CSI[i]); + G8Regs.push_back(I); if (Reg < MinG8R) { MinG8R = Reg; @@ -2109,7 +2108,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF, } else if (PPC::F8RCRegClass.contains(Reg)) { HasFPSaveArea = true; - FPRegs.push_back(CSI[i]); + FPRegs.push_back(I); if (Reg < MinFPR) { MinFPR = Reg; @@ -2123,7 +2122,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF, // alignment requirements, so overload the save area for both cases. 
HasVRSaveArea = true; - VRegs.push_back(CSI[i]); + VRegs.push_back(I); if (Reg < MinVR) { MinVR = Reg; @@ -2395,8 +2394,8 @@ bool PPCFrameLowering::spillCalleeSavedRegisters( } }); - for (unsigned i = 0, e = CSI.size(); i != e; ++i) { - unsigned Reg = CSI[i].getReg(); + for (const CalleeSavedInfo &I : CSI) { + unsigned Reg = I.getReg(); // CR2 through CR4 are the nonvolatile CR fields. bool IsCRField = PPC::CR2 <= Reg && Reg <= PPC::CR4; @@ -2439,11 +2438,11 @@ bool PPCFrameLowering::spillCalleeSavedRegisters( MBB.insert(MI, addFrameReference(BuildMI(*MF, DL, TII.get(PPC::STW)) .addReg(PPC::R12, getKillRegState(true)), - CSI[i].getFrameIdx())); + I.getFrameIdx())); } } else { - if (CSI[i].isSpilledToReg()) { - unsigned Dst = CSI[i].getDstReg(); + if (I.isSpilledToReg()) { + unsigned Dst = I.getDstReg(); if (Spilled[Dst]) continue; @@ -2478,9 +2477,9 @@ bool PPCFrameLowering::spillCalleeSavedRegisters( if (Subtarget.needsSwapsForVSXMemOps() && !MF->getFunction().hasFnAttribute(Attribute::NoUnwind)) TII.storeRegToStackSlotNoUpd(MBB, MI, Reg, !IsLiveIn, - CSI[i].getFrameIdx(), RC, TRI); + I.getFrameIdx(), RC, TRI); else - TII.storeRegToStackSlot(MBB, MI, Reg, !IsLiveIn, CSI[i].getFrameIdx(), + TII.storeRegToStackSlot(MBB, MI, Reg, !IsLiveIn, I.getFrameIdx(), RC, TRI); } } diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 0abdf81d0908..a2664bcff4ab 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -510,14 +510,12 @@ static bool hasTocDataAttr(SDValue Val, unsigned PointerSize) { return false; // TODO: These asserts should be updated as more support for the toc data - // transformation is added (64 bit, struct support, etc.). + // transformation is added (struct support, etc.). - assert(PointerSize == 4 && "Only 32 Bit Codegen is currently supported by " - "the toc data transformation."); - - assert(PointerSize >= GV->getAlign().valueOrOne().value() && - "GlobalVariables with an alignment requirement stricter then 4-bytes " - "not supported by the toc data transformation."); + assert( + PointerSize >= GV->getAlign().valueOrOne().value() && + "GlobalVariables with an alignment requirement stricter than TOC entry " + "size not supported by the toc data transformation."); Type *GVType = GV->getValueType(); @@ -537,7 +535,7 @@ static bool hasTocDataAttr(SDValue Val, unsigned PointerSize) { "supported by the toc data transformation."); assert(GVType->getPrimitiveSizeInBits() <= PointerSize * 8 && - "A GlobalVariable with size larger than 32 bits is not currently " + "A GlobalVariable with size larger than a TOC entry is not currently " "supported by the toc data transformation."); if (GV->hasLocalLinkage() || GV->hasPrivateLinkage()) @@ -5049,16 +5047,94 @@ void PPCDAGToDAGISel::Select(SDNode *N) { // value for the comparison. When selecting through a .td file, a type // error is raised. Must check this first so we never break on the // !Subtarget->isISA3_1() check. 
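The hasTocDataAttr changes above drop the 32-bit-only restriction and keep only the requirements that the global's alignment and size fit within a single TOC entry, i.e. the target pointer size. Below is a rough standalone model of those two checks; fitsInTocEntry is an illustrative helper, not the LLVM routine, and the byte/bit arithmetic simply restates the asserts in the hunk.

#include <cassert>
#include <cstdint>

// Sketch: can a global with the given alignment (bytes) and size (bits) be
// accessed directly as TOC data, given the target pointer size?
bool fitsInTocEntry(uint64_t AlignInBytes, uint64_t SizeInBits,
                    unsigned PointerSizeInBytes) {
  // Alignment must not be stricter than the TOC entry size.
  if (AlignInBytes > PointerSizeInBytes)
    return false;
  // The value itself must fit in one TOC entry (PointerSize * 8 bits).
  return SizeInBits <= PointerSizeInBytes * 8ull;
}

int main() {
  assert(fitsInTocEntry(4, 32, 4));    // 32-bit global, 32-bit TOC entry: ok
  assert(fitsInTocEntry(8, 64, 8));    // 64-bit global, 64-bit TOC entry: ok
  assert(!fitsInTocEntry(16, 64, 8));  // over-aligned global: rejected
  assert(!fitsInTocEntry(4, 64, 4));   // too wide for a 32-bit entry: rejected
  return 0;
}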
- if (N->getConstantOperandVal(0) == Intrinsic::ppc_fsels) { + auto IntID = N->getConstantOperandVal(0); + if (IntID == Intrinsic::ppc_fsels) { SDValue Ops[] = {N->getOperand(1), N->getOperand(2), N->getOperand(3)}; CurDAG->SelectNodeTo(N, PPC::FSELS, MVT::f32, Ops); return; } + if (IntID == Intrinsic::ppc_bcdadd_p || IntID == Intrinsic::ppc_bcdsub_p) { + auto Pred = N->getConstantOperandVal(1); + unsigned Opcode = + IntID == Intrinsic::ppc_bcdadd_p ? PPC::BCDADD_rec : PPC::BCDSUB_rec; + unsigned SubReg = 0; + unsigned ShiftVal = 0; + bool Reverse = false; + switch (Pred) { + case 0: + SubReg = PPC::sub_eq; + ShiftVal = 1; + break; + case 1: + SubReg = PPC::sub_eq; + ShiftVal = 1; + Reverse = true; + break; + case 2: + SubReg = PPC::sub_lt; + ShiftVal = 3; + break; + case 3: + SubReg = PPC::sub_lt; + ShiftVal = 3; + Reverse = true; + break; + case 4: + SubReg = PPC::sub_gt; + ShiftVal = 2; + break; + case 5: + SubReg = PPC::sub_gt; + ShiftVal = 2; + Reverse = true; + break; + case 6: + SubReg = PPC::sub_un; + break; + case 7: + SubReg = PPC::sub_un; + Reverse = true; + break; + } + + EVT VTs[] = {MVT::v16i8, MVT::Glue}; + SDValue Ops[] = {N->getOperand(2), N->getOperand(3), + CurDAG->getTargetConstant(0, dl, MVT::i32)}; + SDValue BCDOp = SDValue(CurDAG->getMachineNode(Opcode, dl, VTs, Ops), 0); + SDValue CR6Reg = CurDAG->getRegister(PPC::CR6, MVT::i32); + // On Power10, we can use SETBC[R]. On prior architectures, we have to use + // MFOCRF and shift/negate the value. + if (Subtarget->isISA3_1()) { + SDValue SubRegIdx = CurDAG->getTargetConstant(SubReg, dl, MVT::i32); + SDValue CRBit = SDValue( + CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, MVT::i1, + CR6Reg, SubRegIdx, BCDOp.getValue(1)), + 0); + CurDAG->SelectNodeTo(N, Reverse ? PPC::SETBCR : PPC::SETBC, MVT::i32, + CRBit); + } else { + SDValue Move = + SDValue(CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32, CR6Reg, + BCDOp.getValue(1)), + 0); + SDValue Ops[] = {Move, getI32Imm((32 - (4 + ShiftVal)) & 31, dl), + getI32Imm(31, dl), getI32Imm(31, dl)}; + if (!Reverse) + CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); + else { + SDValue Shift = SDValue( + CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0); + CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Shift, getI32Imm(1, dl)); + } + } + return; + } + if (!Subtarget->isISA3_1()) break; unsigned Opcode = 0; - switch (N->getConstantOperandVal(0)) { + switch (IntID) { default: break; case Intrinsic::ppc_altivec_vstribr_p: @@ -5713,41 +5789,57 @@ void PPCDAGToDAGISel::Select(SDNode *N) { if (isAIXABI && CModel == CodeModel::Medium) report_fatal_error("Medium code model is not supported on AIX."); - // For 64-bit small code model, we allow SelectCodeCommon to handle this, - // selecting one of LDtoc, LDtocJTI, LDtocCPT, and LDtocBA. - if (isPPC64 && CModel == CodeModel::Small) + // For 64-bit ELF small code model, we allow SelectCodeCommon to handle + // this, selecting one of LDtoc, LDtocJTI, LDtocCPT, and LDtocBA. For AIX + // small code model, we need to check for a toc-data attribute. + if (isPPC64 && !isAIXABI && CModel == CodeModel::Small) break; + auto replaceWith = [this, &dl](unsigned OpCode, SDNode *TocEntry, + EVT OperandTy) { + SDValue GA = TocEntry->getOperand(0); + SDValue TocBase = TocEntry->getOperand(1); + SDNode *MN = CurDAG->getMachineNode(OpCode, dl, OperandTy, GA, TocBase); + transferMemOperands(TocEntry, MN); + ReplaceNode(TocEntry, MN); + }; + // Handle 32-bit small code model. 
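The bcdadd_p/bcdsub_p switch above maps each predicate value 0..7 to a CR6 field bit, a shift amount used on the MFOCRF path for pre-Power10 cores, and a flag saying whether the bit must be inverted. A table-driven restatement of just that mapping, outside of LLVM, is sketched below; the CR6Bit enum is an illustrative stand-in for the PPC sub-register indices (sub_lt/sub_gt/sub_eq/sub_un), and no claim is made here about the BCD predicate semantics beyond what the switch encodes.

#include <cstdio>

enum CR6Bit { LT, GT, EQ, UN };  // stand-ins for sub_lt / sub_gt / sub_eq / sub_un

struct PredInfo {
  CR6Bit Bit;        // which CR6 bit carries the answer
  unsigned Shift;    // extra shift used on the MFOCRF path (pre-Power10)
  bool Reverse;      // invert the result (the odd-numbered predicates)
};

// Index = predicate operand of bcdadd_p / bcdsub_p (0..7), as in the hunk above.
static const PredInfo Table[8] = {
    {EQ, 1, false}, {EQ, 1, true},  // predicates 0/1 test the EQ bit
    {LT, 3, false}, {LT, 3, true},  // predicates 2/3 test the LT bit
    {GT, 2, false}, {GT, 2, true},  // predicates 4/5 test the GT bit
    {UN, 0, false}, {UN, 0, true},  // predicates 6/7 test the UN bit
};

int main() {
  for (unsigned Pred = 0; Pred < 8; ++Pred)
    std::printf("pred %u -> bit %d, shift %u, reverse %d\n", Pred,
                Table[Pred].Bit, Table[Pred].Shift, Table[Pred].Reverse);
}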
- if (!isPPC64) { + if (!isPPC64 && CModel == CodeModel::Small) { // Transforms the ISD::TOC_ENTRY node to passed in Opcode, either // PPC::ADDItoc, or PPC::LWZtoc - auto replaceWith = [this, &dl](unsigned OpCode, SDNode *TocEntry) { - SDValue GA = TocEntry->getOperand(0); - SDValue TocBase = TocEntry->getOperand(1); - SDNode *MN = CurDAG->getMachineNode(OpCode, dl, MVT::i32, GA, TocBase); - transferMemOperands(TocEntry, MN); - ReplaceNode(TocEntry, MN); - }; - if (isELFABI) { assert(TM.isPositionIndependent() && "32-bit ELF can only have TOC entries in position independent" " code."); // 32-bit ELF always uses a small code model toc access. - replaceWith(PPC::LWZtoc, N); + replaceWith(PPC::LWZtoc, N, MVT::i32); return; } - if (isAIXABI && CModel == CodeModel::Small) { - if (hasTocDataAttr(N->getOperand(0), - CurDAG->getDataLayout().getPointerSize())) - replaceWith(PPC::ADDItoc, N); - else - replaceWith(PPC::LWZtoc, N); + assert(isAIXABI && "ELF ABI already handled"); + if (hasTocDataAttr(N->getOperand(0), + CurDAG->getDataLayout().getPointerSize())) { + replaceWith(PPC::ADDItoc, N, MVT::i32); return; } + + replaceWith(PPC::LWZtoc, N, MVT::i32); + return; + } + + if (isPPC64 && CModel == CodeModel::Small) { + assert(isAIXABI && "ELF ABI handled in common SelectCode"); + + if (hasTocDataAttr(N->getOperand(0), + CurDAG->getDataLayout().getPointerSize())) { + replaceWith(PPC::ADDItoc8, N, MVT::i64); + return; + } + // Break if it doesn't have toc data attribute. Proceed with common + // SelectCode. + break; } assert(CModel != CodeModel::Small && "All small code models handled."); diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index ac952b240a48..ec7e30d7e362 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -12116,6 +12116,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineFunction::iterator It = ++BB->getIterator(); MachineFunction *F = BB->getParent(); + MachineRegisterInfo &MRI = F->getRegInfo(); if (MI.getOpcode() == PPC::SELECT_CC_I4 || MI.getOpcode() == PPC::SELECT_CC_I8 || MI.getOpcode() == PPC::SELECT_I4 || @@ -12721,7 +12722,10 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, Register OldFPSCRReg = MI.getOperand(0).getReg(); // Save FPSCR value. - BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), OldFPSCRReg); + if (MRI.use_empty(OldFPSCRReg)) + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::IMPLICIT_DEF), OldFPSCRReg); + else + BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), OldFPSCRReg); // The floating point rounding mode is in the bits 62:63 of FPCSR, and has // the following settings: @@ -12854,7 +12858,10 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, // Result of setflm is previous FPSCR content, so we need to save it first. Register OldFPSCRReg = MI.getOperand(0).getReg(); - BuildMI(*BB, MI, Dl, TII->get(PPC::MFFS), OldFPSCRReg); + if (MRI.use_empty(OldFPSCRReg)) + BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::IMPLICIT_DEF), OldFPSCRReg); + else + BuildMI(*BB, MI, Dl, TII->get(PPC::MFFS), OldFPSCRReg); // Put bits in 32:63 to FPSCR. 
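The FPSCR hunks above only issue MFFS when the old value is actually used. As a side note on the comment that the rounding mode lives in bits 62:63 of the FPSCR (the two least-significant bits in conventional ordering), here is a tiny standalone helper that extracts that field from a 64-bit FPSCR image; the enum labels follow the usual PowerPC RN encoding and are included as background, not taken from this patch.

#include <cstdint>
#include <cstdio>

// The rounding-mode (RN) field occupies the two least-significant bits of the
// 64-bit FPSCR image, i.e. IBM bit positions 62:63.
enum RoundingMode : uint64_t {
  RoundToNearest    = 0, // 0b00
  RoundTowardZero   = 1, // 0b01
  RoundTowardPosInf = 2, // 0b10
  RoundTowardNegInf = 3  // 0b11
};

static RoundingMode getRoundingMode(uint64_t FPSCR) {
  return static_cast<RoundingMode>(FPSCR & 0x3);
}

int main() {
  // Arbitrary status bits plus RN = 0b01.
  uint64_t FPSCR = 0x82004000ull | RoundTowardZero;
  std::printf("RN field = %llu\n",
              (unsigned long long)getRoundingMode(FPSCR));
}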
Register NewFPSCRReg = MI.getOperand(1).getReg(); @@ -15966,8 +15973,11 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, } break; case 'v': - if (Subtarget.hasAltivec()) + if (Subtarget.hasAltivec() && VT.isVector()) return std::make_pair(0U, &PPC::VRRCRegClass); + else if (Subtarget.hasVSX()) + // Scalars in Altivec registers only make sense with VSX. + return std::make_pair(0U, &PPC::VFRCRegClass); break; case 'y': // crrc return std::make_pair(0U, &PPC::CRRCRegClass); @@ -17664,6 +17674,24 @@ PPC::AddrMode PPCTargetLowering::SelectForceXFormMode(SDValue N, SDValue &Disp, return Mode; } +bool PPCTargetLowering::splitValueIntoRegisterParts( + SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts, + unsigned NumParts, MVT PartVT, Optional<CallingConv::ID> CC) const { + EVT ValVT = Val.getValueType(); + // If we are splitting a scalar integer into f64 parts (i.e. so they + // can be placed into VFRC registers), we need to zero extend and + // bitcast the values. This will ensure the value is placed into a + // VSR using direct moves or stack operations as needed. + if (PartVT == MVT::f64 && + (ValVT == MVT::i32 || ValVT == MVT::i16 || ValVT == MVT::i8)) { + Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val); + Val = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Val); + Parts[0] = Val; + return true; + } + return false; +} + // If we happen to match to an aligned D-Form, check if the Frame Index is // adequately aligned. If it is not, reset the mode to match to X-Form. static void setXFormForUnalignedFI(SDValue N, unsigned Flags, diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.h b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.h index 34dce2c3172d..87b7f96112ec 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -1139,6 +1139,10 @@ namespace llvm { PPC::AddrMode SelectForceXFormMode(SDValue N, SDValue &Disp, SDValue &Base, SelectionDAG &DAG) const; + bool + splitValueIntoRegisterParts(SelectionDAG &DAG, const SDLoc &DL, SDValue Val, + SDValue *Parts, unsigned NumParts, MVT PartVT, + Optional<CallingConv::ID> CC) const override; /// Structure that collects some common arguments that get passed around /// between the functions for call lowering. 
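splitValueIntoRegisterParts above widens a small integer to i64 and bitcasts it to f64 so the value can be placed in a VSR. The standalone snippet below mimics that at the value level, with memcpy standing in for the bitcast; it only demonstrates that the integer bits land unchanged in the low half of the double's representation, and is not the SelectionDAG code.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Model of "zero-extend to i64, then bitcast to f64".
static double toF64Part(uint32_t V) {
  uint64_t Wide = V;                  // ISD::ZERO_EXTEND i32 -> i64
  double D;
  static_assert(sizeof(D) == sizeof(Wide), "f64 and i64 must match in size");
  std::memcpy(&D, &Wide, sizeof(D));  // ISD::BITCAST i64 -> f64
  return D;
}

int main() {
  double Part = toF64Part(0x12345678u);
  uint64_t Bits;
  std::memcpy(&Bits, &Part, sizeof(Bits));
  std::printf("bits = 0x%016llx\n", (unsigned long long)Bits); // 0x0000000012345678
}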
struct CallFlags { diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstr64Bit.td index 417a6ce7e522..58af8037f59c 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstr64Bit.td @@ -773,6 +773,11 @@ def ADDIS8 : DForm_2<15, (outs g8rc:$rD), (ins g8rc_nox0:$rA, s17imm64:$imm), "addis $rD, $rA, $imm", IIC_IntSimple, [(set i64:$rD, (add i64:$rA, imm16ShiftedSExt:$imm))]>; +def LA8 : DForm_2<14, (outs g8rc:$rD), (ins g8rc_nox0:$rA, s16imm64:$sym), + "la $rD, $sym($rA)", IIC_IntGeneral, + [(set i64:$rD, (add i64:$rA, + (PPClo tglobaladdr:$sym, 0)))]>; + let Defs = [CARRY] in { def SUBFIC8: DForm_2< 8, (outs g8rc:$rD), (ins g8rc:$rA, s16imm64:$imm), "subfic $rD, $rA, $imm", IIC_IntGeneral, @@ -1435,6 +1440,13 @@ def ADDIStocHA8: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentr def ADDItocL: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp), "#ADDItocL", []>, isPPC64; } + +// Local Data Transform +def ADDItoc8 : PPCEmitTimePseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc_nox0:$reg), + "#ADDItoc8", + [(set i64:$rD, + (PPCtoc_entry tglobaladdr:$disp, i64:$reg))]>, isPPC64; + let mayLoad = 1 in def LDtocL: PPCEmitTimePseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc_nox0:$reg), "#LDtocL", []>, isPPC64; diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrAltivec.td b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrAltivec.td index 1e0e2d88e54b..fe21a164dfab 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrAltivec.td @@ -1161,6 +1161,22 @@ def : Pat<(v16i8 (srl (sub v16i8:$vA, (v16i8 (bitconvert(vnot v4i32:$vB)))), } // end HasAltivec +// [PO VRT VRA VRB 1 PS XO], "_o" means CR6 is set. +class VX_VT5_VA5_VB5_PS1_XO9_o<bits<9> xo, string opc, list<dag> pattern> + : VX_RD5_RSp5_PS1_XO9<xo, + (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, u1imm:$PS), + !strconcat(opc, " $vD, $vA, $vB, $PS"), IIC_VecFP, pattern> { + let Defs = [CR6]; +} + +// [PO VRT VRA VRB 1 / XO] +class VX_VT5_VA5_VB5_XO9_o<bits<9> xo, string opc, list<dag> pattern> + : VX_RD5_RSp5_PS1_XO9<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + !strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP, pattern> { + let Defs = [CR6]; + let PS = 0; +} + def HasP8Altivec : Predicate<"Subtarget->hasP8Altivec()">; def HasP8Crypto : Predicate<"Subtarget->hasP8Crypto()">; let Predicates = [HasP8Altivec] in { @@ -1351,6 +1367,13 @@ def VUPKHSW : VX2_Int_Ty2<1614, "vupkhsw", int_ppc_altivec_vupkhsw, v2i64, v4i32>; def VUPKLSW : VX2_Int_Ty2<1742, "vupklsw", int_ppc_altivec_vupklsw, v2i64, v4i32>; +def BCDADD_rec : VX_VT5_VA5_VB5_PS1_XO9_o<1, "bcdadd." , []>; +def BCDSUB_rec : VX_VT5_VA5_VB5_PS1_XO9_o<65, "bcdsub." , []>; + +def : Pat<(v16i8 (int_ppc_bcdadd v16i8:$vA, v16i8:$vB, timm:$PS)), + (BCDADD_rec $vA, $vB, $PS)>; +def : Pat<(v16i8 (int_ppc_bcdsub v16i8:$vA, v16i8:$vB, timm:$PS)), + (BCDSUB_rec $vA, $vB, $PS)>; // Shuffle patterns for unary and swapped (LE) vector pack modulo. def:Pat<(vpkudum_unary_shuffle v16i8:$vA, undef), @@ -1598,22 +1621,6 @@ def BCDCPSGN_rec : VX1_VT5_VA5_VB5<833, "bcdcpsgn.", []>; def BCDSETSGN_rec : VX_VT5_EO5_VB5_PS1_XO9_o<31, 385, "bcdsetsgn.", []>; -// [PO VRT VRA VRB 1 PS XO], "_o" means CR6 is set. 
-class VX_VT5_VA5_VB5_PS1_XO9_o<bits<9> xo, string opc, list<dag> pattern> - : VX_RD5_RSp5_PS1_XO9<xo, - (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, u1imm:$PS), - !strconcat(opc, " $vD, $vA, $vB, $PS"), IIC_VecFP, pattern> { - let Defs = [CR6]; -} - -// [PO VRT VRA VRB 1 / XO] -class VX_VT5_VA5_VB5_XO9_o<bits<9> xo, string opc, list<dag> pattern> - : VX_RD5_RSp5_PS1_XO9<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - !strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP, pattern> { - let Defs = [CR6]; - let PS = 0; -} - // Decimal Shift/Unsigned-Shift/Shift-and-Round def BCDS_rec : VX_VT5_VA5_VB5_PS1_XO9_o<193, "bcds." , []>; def BCDUS_rec : VX_VT5_VA5_VB5_XO9_o <129, "bcdus.", []>; diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 649a150866b4..a0fd2111de11 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -2138,9 +2138,8 @@ bool PPCInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, } static bool MBBDefinesCTR(MachineBasicBlock &MBB) { - for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end(); - I != IE; ++I) - if (I->definesRegister(PPC::CTR) || I->definesRegister(PPC::CTR8)) + for (MachineInstr &MI : MBB) + if (MI.definesRegister(PPC::CTR) || MI.definesRegister(PPC::CTR8)) return true; return false; } @@ -2331,8 +2330,7 @@ bool PPCInstrInfo::ClobbersPredicate(MachineInstr &MI, &PPC::CTRRCRegClass, &PPC::CTRRC8RegClass }; bool Found = false; - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { for (unsigned c = 0; c < array_lengthof(RCs) && !Found; ++c) { const TargetRegisterClass *RC = RCs[c]; if (MO.isReg()) { diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrVSX.td index d2d5ca92ca1c..d92a10c5b208 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -2471,6 +2471,7 @@ def DblwdCmp { // [HasVSX, HasP8Vector, IsLittleEndian] // [HasVSX, HasP8Vector, NoP9Vector, IsBigEndian, IsPPC64] // [HasVSX, HasP8Vector, NoP9Vector, IsLittleEndian] +// [HasVSX, HasP8Altivec] // [HasVSX, HasDirectMove] // [HasVSX, HasDirectMove, IsBigEndian] // [HasVSX, HasDirectMove, IsLittleEndian] @@ -2500,6 +2501,10 @@ let Predicates = [HasVSX, IsBigEndian, HasP8Altivec] in def : Pat<(v16i8 (int_ppc_altivec_crypto_vpermxor v16i8:$a, v16i8:$b, v16i8:$c)), (v16i8 (VPERMXOR $a, $b, $c))>; +let Predicates = [HasVSX, HasP8Altivec] in + def : Pat<(v16i8 (int_ppc_altivec_crypto_vpermxor_be v16i8:$a, + v16i8:$b, v16i8:$c)), + (v16i8 (VPERMXOR $a, $b, $c))>; let AddedComplexity = 400 in { // Valid for any VSX subtarget, regardless of endianness. 
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp index 5cc180d770b2..22c5b6c11289 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp @@ -152,9 +152,9 @@ void llvm::LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, AsmPrinter &AP) { OutMI.setOpcode(MI->getOpcode()); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + for (const MachineOperand &MO : MI->operands()) { MCOperand MCOp; - if (LowerPPCMachineOperandToMCOperand(MI->getOperand(i), MCOp, AP)) + if (LowerPPCMachineOperandToMCOperand(MO, MCOp, AP)) OutMI.addOperand(MCOp); } } diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp index bdff5109c1e1..9d5206f8fd43 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp @@ -149,6 +149,79 @@ static bool checkOpConstraints(FusionFeature::FusionKind Kd, case FusionFeature::FK_SldiAdd: return (matchingImmOps(FirstMI, 2, 3) && matchingImmOps(FirstMI, 3, 60)) || (matchingImmOps(FirstMI, 2, 6) && matchingImmOps(FirstMI, 3, 57)); + + // rldicl rx, ra, 1, 0 - xor + case FusionFeature::FK_RotateLeftXor: + return matchingImmOps(FirstMI, 2, 1) && matchingImmOps(FirstMI, 3, 0); + + // rldicr rx, ra, 1, 63 - xor + case FusionFeature::FK_RotateRightXor: + return matchingImmOps(FirstMI, 2, 1) && matchingImmOps(FirstMI, 3, 63); + + // We actually use CMPW* and CMPD*, 'l' doesn't exist as an operand in instr. + + // { lbz,lbzx,lhz,lhzx,lwz,lwzx } - cmpi 0,1,rx,{ 0,1,-1 } + // { lbz,lbzx,lhz,lhzx,lwz,lwzx } - cmpli 0,L,rx,{ 0,1 } + case FusionFeature::FK_LoadCmp1: + // { ld,ldx } - cmpi 0,1,rx,{ 0,1,-1 } + // { ld,ldx } - cmpli 0,1,rx,{ 0,1 } + case FusionFeature::FK_LoadCmp2: { + const MachineOperand &BT = SecondMI.getOperand(0); + if (!BT.isReg() || + (!Register::isVirtualRegister(BT.getReg()) && BT.getReg() != PPC::CR0)) + return false; + if (SecondMI.getOpcode() == PPC::CMPDI && + matchingImmOps(SecondMI, 2, -1, 16)) + return true; + return matchingImmOps(SecondMI, 2, 0) || matchingImmOps(SecondMI, 2, 1); + } + + // { lha,lhax,lwa,lwax } - cmpi 0,L,rx,{ 0,1,-1 } + case FusionFeature::FK_LoadCmp3: { + const MachineOperand &BT = SecondMI.getOperand(0); + if (!BT.isReg() || + (!Register::isVirtualRegister(BT.getReg()) && BT.getReg() != PPC::CR0)) + return false; + return matchingImmOps(SecondMI, 2, 0) || matchingImmOps(SecondMI, 2, 1) || + matchingImmOps(SecondMI, 2, -1, 16); + } + + // mtctr - { bcctr,bcctrl } + case FusionFeature::FK_ZeroMoveCTR: + // ( mtctr rx ) is alias of ( mtspr 9, rx ) + return (FirstMI.getOpcode() != PPC::MTSPR && + FirstMI.getOpcode() != PPC::MTSPR8) || + matchingImmOps(FirstMI, 0, 9); + + // mtlr - { bclr,bclrl } + case FusionFeature::FK_ZeroMoveLR: + // ( mtlr rx ) is alias of ( mtspr 8, rx ) + return (FirstMI.getOpcode() != PPC::MTSPR && + FirstMI.getOpcode() != PPC::MTSPR8) || + matchingImmOps(FirstMI, 0, 8); + + // addis rx,ra,si - addi rt,rx,SI, SI >= 0 + case FusionFeature::FK_AddisAddi: { + const MachineOperand &RA = FirstMI.getOperand(1); + const MachineOperand &SI = SecondMI.getOperand(2); + if (!SI.isImm() || !RA.isReg()) + return false; + if (RA.getReg() == PPC::ZERO || RA.getReg() == PPC::ZERO8) + return false; + return SignExtend64(SI.getImm(), 16) >= 0; + } + + // addi 
rx,ra,si - addis rt,rx,SI, ra > 0, SI >= 2 + case FusionFeature::FK_AddiAddis: { + const MachineOperand &RA = FirstMI.getOperand(1); + const MachineOperand &SI = FirstMI.getOperand(2); + if (!SI.isImm() || !RA.isReg()) + return false; + if (RA.getReg() == PPC::ZERO || RA.getReg() == PPC::ZERO8) + return false; + int64_t ExtendedSI = SignExtend64(SI.getImm(), 16); + return ExtendedSI >= 2; + } } llvm_unreachable("All the cases should have been handled"); diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMacroFusion.def b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMacroFusion.def index 469a24800423..e4954b722fd0 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMacroFusion.def +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMacroFusion.def @@ -78,5 +78,80 @@ FUSION_FEATURE(VecLogical, hasLogicalFusion, -1, FUSION_FEATURE(SldiAdd, hasArithAddFusion, -1, FUSION_OP_SET(RLDICR, RLDICR_32), FUSION_OP_SET(ADD4, ADD8, SUBF, SUBF8)) +// rldicl rx, ra, 1, 0 - xor +FUSION_FEATURE(RotateLeftXor, hasSha3Fusion, 1, + FUSION_OP_SET(RLDICL, RLDICL_32, RLDICL_32_64), + FUSION_OP_SET(XOR, XOR8)) + +// rldicr rx, ra, 1, 63 - xor +FUSION_FEATURE(RotateRightXor, hasSha3Fusion, 1, + FUSION_OP_SET(RLDICR, RLDICR_32), FUSION_OP_SET(XOR, XOR8)) + +// There're two special cases in 'load-compare' series, so we have to split +// them into several pattern groups to fit into current framework. This can +// be clearer once we switched to a more expressive approach. + +// { lbz,lbzx,lhz,lhzx,lwz,lwzx } - cmpi 0,1,rx,{ 0,1,-1 } +// { lbz,lbzx,lhz,lhzx,lwz,lwzx } - cmpli 0,L,rx,{ 0,1 } +FUSION_FEATURE(LoadCmp1, hasCompareFusion, 1, + FUSION_OP_SET(LBZ, LBZ8, LBZX, LBZX8, LBZXTLS, LBZXTLS_, + LBZXTLS_32, LHZ, LHZ8, LHZX, LHZX8, LHZXTLS, + LHZXTLS_, LHZXTLS_32, LWZ, LWZ8, LWZX, LWZX8, + LWZXTLS, LWZXTLS_, LWZXTLS_32), + FUSION_OP_SET(CMPDI, CMPLDI, CMPLWI)) + +// { ld,ldx } - cmpi 0,1,rx,{ 0,1,-1 } +// { ld,ldx } - cmpli 0,1,rx,{ 0,1 } +FUSION_FEATURE(LoadCmp2, hasCompareFusion, 1, + FUSION_OP_SET(LD, LDX, LDXTLS, LDXTLS_), + FUSION_OP_SET(CMPDI, CMPLDI)) + +// { lha,lhax,lwa,lwax } - cmpi 0,L,rx,{ 0,1,-1 } +FUSION_FEATURE(LoadCmp3, hasCompareFusion, 1, + FUSION_OP_SET(LHA, LHA8, LHAX, LHAX8, LWA, LWA_32, LWAX, + LWAX_32), + FUSION_OP_SET(CMPLDI, CMPLWI)) + +// ori - oris +FUSION_FEATURE(OriOris, hasWideImmFusion, 1, FUSION_OP_SET(ORI, ORI8), + FUSION_OP_SET(ORIS, ORIS8)) + +// lis - ori +FUSION_FEATURE(LisOri, hasWideImmFusion, 1, FUSION_OP_SET(LIS, LIS8), + FUSION_OP_SET(ORI, ORI8)) + +// oris - ori +FUSION_FEATURE(OrisOri, hasWideImmFusion, 1, FUSION_OP_SET(ORIS, ORIS8), + FUSION_OP_SET(ORI, ORI8)) + +// xori - xoris +FUSION_FEATURE(XoriXoris, hasWideImmFusion, 1, FUSION_OP_SET(XORI, XORI8), + FUSION_OP_SET(XORIS, XORIS8)) + +// xoris - xori +FUSION_FEATURE(XorisXori, hasWideImmFusion, 1, FUSION_OP_SET(XORIS, XORIS8), + FUSION_OP_SET(XORI, XORI8)) + +// addis rx,ra,si - addi rt,rx,SI, SI >= 0 +FUSION_FEATURE(AddisAddi, hasWideImmFusion, 1, + FUSION_OP_SET(ADDIS, ADDIS8, ADDIStocHA8), + FUSION_OP_SET(ADDI, ADDI8, ADDItocL)) + +// addi rx,ra,si - addis rt,rx,SI, ra > 0, SI >= 2 +FUSION_FEATURE(AddiAddis, hasWideImmFusion, 1, + FUSION_OP_SET(ADDI, ADDI8, ADDItocL), + FUSION_OP_SET(ADDIS, ADDIS8, ADDIStocHA8)) + +// mtctr - { bcctr,bcctrl } +FUSION_FEATURE(ZeroMoveCTR, hasZeroMoveFusion, -1, + FUSION_OP_SET(MTCTR, MTCTRloop, MTSPR8, MTSPR), + FUSION_OP_SET(BCCTR, BCCTRn, BCCTR8, BCCTR8n, BCCTRL, BCCTRLn, + BCCTRL8, BCCTRL8n, gBCCTR, gBCCTRL)) + +// mtlr - { bclr,bclrl } 
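The addis/addi fusion checks above compare the 16-bit displacement only after sign-extending it, so for example 0xFFFF is treated as -1 and does not fuse; they also require that the base register is not the zero register, which is omitted here. The standalone restatement below uses a local signExtend16 helper in place of LLVM's SignExtend64.

#include <cassert>
#include <cstdint>

// Sign-extend the low 16 bits of an immediate, like SignExtend64(Imm, 16).
static int64_t signExtend16(uint64_t Imm) {
  return static_cast<int64_t>(static_cast<int16_t>(Imm & 0xFFFF));
}

// addis rx,ra,si - addi rt,rx,SI fuses only when the addi's SI >= 0.
static bool addisAddiFuses(uint64_t SecondSI) {
  return signExtend16(SecondSI) >= 0;
}

// addi rx,ra,si - addis rt,rx,SI fuses only when the addi's si >= 2.
static bool addiAddisFuses(uint64_t FirstSI) {
  return signExtend16(FirstSI) >= 2;
}

int main() {
  assert(addisAddiFuses(0x0010));   // small positive displacement: fuses
  assert(!addisAddiFuses(0xFFFF));  // sign-extends to -1: does not fuse
  assert(addiAddisFuses(2));
  assert(!addiAddisFuses(1));
  return 0;
}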
+FUSION_FEATURE(ZeroMoveLR, hasZeroMoveFusion, -1, + FUSION_OP_SET(MTLR8, MTLR, MTSPR8, MTSPR), + FUSION_OP_SET(BCLR, BCLRn, gBCLR, BCLRL, BCLRLn, gBCLRL)) + #undef FUSION_FEATURE #undef FUSION_OP_SET diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.cpp index dfc29dbb10f1..1258a1281597 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.cpp @@ -131,6 +131,10 @@ void PPCSubtarget::initializeEnvironment() { HasAddLogicalFusion = false; HasLogicalAddFusion = false; HasLogicalFusion = false; + HasSha3Fusion = false; + HasCompareFusion = false; + HasWideImmFusion = false; + HasZeroMoveFusion = false; IsISA2_06 = false; IsISA2_07 = false; IsISA3_0 = false; diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.h b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.h index 783ea121ccb8..d52833cb1465 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.h +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -151,6 +151,10 @@ protected: bool HasAddLogicalFusion; bool HasLogicalAddFusion; bool HasLogicalFusion; + bool HasSha3Fusion; + bool HasCompareFusion; + bool HasWideImmFusion; + bool HasZeroMoveFusion; bool IsISA2_06; bool IsISA2_07; bool IsISA3_0; @@ -340,6 +344,10 @@ public: bool hasAddLogicalFusion() const { return HasAddLogicalFusion; } bool hasLogicalAddFusion() const { return HasLogicalAddFusion; } bool hasLogicalFusion() const { return HasLogicalFusion; } + bool hasCompareFusion() const { return HasCompareFusion; } + bool hasWideImmFusion() const { return HasWideImmFusion; } + bool hasSha3Fusion() const { return HasSha3Fusion; } + bool hasZeroMoveFusion() const { return HasZeroMoveFusion; } bool needsSwapsForVSXMemOps() const { return hasVSX() && isLittleEndian() && !hasP9Vector(); } diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index 77d5a2668b60..5d6f58a77a39 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -318,9 +318,20 @@ InstructionCost PPCTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, return PPCTTIImpl::getIntImmCost(Imm, Ty, CostKind); } +// Check if the current Type is an MMA vector type. Valid MMA types are +// v256i1 and v512i1 respectively. +static bool isMMAType(Type *Ty) { + return Ty->isVectorTy() && (Ty->getScalarSizeInBits() == 1) && + (Ty->getPrimitiveSizeInBits() > 128); +} + InstructionCost PPCTTIImpl::getUserCost(const User *U, ArrayRef<const Value *> Operands, TTI::TargetCostKind CostKind) { + // Set the max cost if an MMA type is present (v256i1, v512i1). + if (isMMAType(U->getType())) + return InstructionCost::getMax(); + // We already implement getCastInstrCost and getMemoryOpCost where we perform // the vector adjustment there. if (isa<CastInst>(U) || isa<LoadInst>(U) || isa<StoreInst>(U)) @@ -942,32 +953,39 @@ unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) { return 2; } -// Adjust the cost of vector instructions on targets which there is overlap -// between the vector and scalar units, thereby reducing the overall throughput -// of vector code wrt. scalar code. 
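isMMAType above recognizes the MMA-related types named in the hunk (v256i1 and v512i1) as vectors of i1 wider than 128 bits, and getUserCost then returns the maximum cost so the optimizer is discouraged from forming such values in IR. A rough standalone model of that predicate follows; VecTy is an invented struct used purely to make the check concrete.

#include <cassert>

// Sketch of the MMA-type test: a vector of 1-bit elements whose total width
// exceeds 128 bits, i.e. v256i1 or v512i1.
struct VecTy {
  unsigned NumElements;
  unsigned ScalarSizeInBits;
  unsigned totalBits() const { return NumElements * ScalarSizeInBits; }
};

static bool isMMALikeType(const VecTy &Ty) {
  return Ty.ScalarSizeInBits == 1 && Ty.totalBits() > 128;
}

int main() {
  assert(isMMALikeType({512, 1}));   // v512i1: one of the MMA types above
  assert(isMMALikeType({256, 1}));   // v256i1: the other MMA type above
  assert(!isMMALikeType({128, 1}));  // v128i1: not wider than 128 bits
  assert(!isMMALikeType({4, 32}));   // v4i32: ordinary vector
  return 0;
}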
-InstructionCost PPCTTIImpl::vectorCostAdjustment(InstructionCost Cost, - unsigned Opcode, Type *Ty1, - Type *Ty2) { +// Returns a cost adjustment factor to adjust the cost of vector instructions +// on targets which there is overlap between the vector and scalar units, +// thereby reducing the overall throughput of vector code wrt. scalar code. +// An invalid instruction cost is returned if the type is an MMA vector type. +InstructionCost PPCTTIImpl::vectorCostAdjustmentFactor(unsigned Opcode, + Type *Ty1, Type *Ty2) { + // If the vector type is of an MMA type (v256i1, v512i1), an invalid + // instruction cost is returned. This is to signify to other cost computing + // functions to return the maximum instruction cost in order to prevent any + // opportunities for the optimizer to produce MMA types within the IR. + if (isMMAType(Ty1)) + return InstructionCost::getInvalid(); + if (!ST->vectorsUseTwoUnits() || !Ty1->isVectorTy()) - return Cost; + return InstructionCost(1); std::pair<InstructionCost, MVT> LT1 = TLI->getTypeLegalizationCost(DL, Ty1); // If type legalization involves splitting the vector, we don't want to // double the cost at every step - only the last step. if (LT1.first != 1 || !LT1.second.isVector()) - return Cost; + return InstructionCost(1); int ISD = TLI->InstructionOpcodeToISD(Opcode); if (TLI->isOperationExpand(ISD, LT1.second)) - return Cost; + return InstructionCost(1); if (Ty2) { std::pair<InstructionCost, MVT> LT2 = TLI->getTypeLegalizationCost(DL, Ty2); if (LT2.first != 1 || !LT2.second.isVector()) - return Cost; + return InstructionCost(1); } - return Cost * 2; + return InstructionCost(2); } InstructionCost PPCTTIImpl::getArithmeticInstrCost( @@ -977,6 +995,11 @@ InstructionCost PPCTTIImpl::getArithmeticInstrCost( TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args, const Instruction *CxtI) { assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode"); + + InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Ty, nullptr); + if (!CostFactor.isValid()) + return InstructionCost::getMax(); + // TODO: Handle more cost kinds. if (CostKind != TTI::TCK_RecipThroughput) return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, @@ -986,12 +1009,18 @@ InstructionCost PPCTTIImpl::getArithmeticInstrCost( // Fallback to the default implementation. InstructionCost Cost = BaseT::getArithmeticInstrCost( Opcode, Ty, CostKind, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo); - return vectorCostAdjustment(Cost, Opcode, Ty, nullptr); + return Cost * CostFactor; } InstructionCost PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, ArrayRef<int> Mask, int Index, Type *SubTp) { + + InstructionCost CostFactor = + vectorCostAdjustmentFactor(Instruction::ShuffleVector, Tp, nullptr); + if (!CostFactor.isValid()) + return InstructionCost::getMax(); + // Legalize the type. std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); @@ -1000,8 +1029,7 @@ InstructionCost PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, // instruction). We need one such shuffle instruction for each actual // register (this is not true for arbitrary shuffles, but is true for the // structured types of shuffles covered by TTI::ShuffleKind). 
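With the refactoring above, vectorCostAdjustmentFactor returns a multiplicative factor rather than an adjusted cost: 1 in the common case, 2 when the target's vector and scalar units overlap, and an invalid cost for MMA types, which the callers turn into the maximum cost. The compact standalone model below shows how the callers combine such a factor with a base cost, using std::optional as a stand-in for InstructionCost's valid/invalid states.

#include <cassert>
#include <limits>
#include <optional>

// nullopt models InstructionCost::getInvalid(); a value models a valid factor.
using CostFactor = std::optional<unsigned>;

static unsigned applyFactor(unsigned BaseCost, CostFactor Factor) {
  if (!Factor)                                   // invalid: MMA type involved
    return std::numeric_limits<unsigned>::max(); // like InstructionCost::getMax()
  return BaseCost * *Factor;                     // like "Cost * CostFactor"
}

int main() {
  assert(applyFactor(4, 1u) == 4);   // normal target: cost unchanged
  assert(applyFactor(4, 2u) == 8);   // overlapping vector/scalar units: doubled
  assert(applyFactor(4, std::nullopt) ==
         std::numeric_limits<unsigned>::max());  // MMA type: maximal cost
  return 0;
}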
- return vectorCostAdjustment(LT.first, Instruction::ShuffleVector, Tp, - nullptr); + return LT.first * CostFactor; } InstructionCost PPCTTIImpl::getCFInstrCost(unsigned Opcode, @@ -1020,9 +1048,13 @@ InstructionCost PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, const Instruction *I) { assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode"); + InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Dst, Src); + if (!CostFactor.isValid()) + return InstructionCost::getMax(); + InstructionCost Cost = BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); - Cost = vectorCostAdjustment(Cost, Opcode, Dst, Src); + Cost *= CostFactor; // TODO: Allow non-throughput costs that aren't binary. if (CostKind != TTI::TCK_RecipThroughput) return Cost == 0 ? 0 : 1; @@ -1034,12 +1066,17 @@ InstructionCost PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I) { + InstructionCost CostFactor = + vectorCostAdjustmentFactor(Opcode, ValTy, nullptr); + if (!CostFactor.isValid()) + return InstructionCost::getMax(); + InstructionCost Cost = BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); // TODO: Handle other cost kinds. if (CostKind != TTI::TCK_RecipThroughput) return Cost; - return vectorCostAdjustment(Cost, Opcode, ValTy, nullptr); + return Cost * CostFactor; } InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, @@ -1049,8 +1086,12 @@ InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); + InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Val, nullptr); + if (!CostFactor.isValid()) + return InstructionCost::getMax(); + InstructionCost Cost = BaseT::getVectorInstrCost(Opcode, Val, Index); - Cost = vectorCostAdjustment(Cost, Opcode, Val, nullptr); + Cost *= CostFactor; if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) { // Double-precision scalars are already located in index #0 (or #1 if LE). @@ -1065,7 +1106,7 @@ InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, if (ISD == ISD::INSERT_VECTOR_ELT) // A move-to VSR and a permute/insert. Assume vector operation cost // for both (cost will be 2x on P9). - return vectorCostAdjustment(2, Opcode, Val, nullptr); + return 2 * CostFactor; // It's an extract. Maybe we can do a cheap move-from VSR. unsigned EltSize = Val->getScalarSizeInBits(); @@ -1082,7 +1123,7 @@ InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, // We need a vector extract (or mfvsrld). Assume vector operation cost. // The cost of the load constant for a vector extract is disregarded // (invariant, easily schedulable). - return vectorCostAdjustment(1, Opcode, Val, nullptr); + return CostFactor; } else if (ST->hasDirectMove()) // Assume permute has standard cost. 
@@ -1114,6 +1155,11 @@ InstructionCost PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned AddressSpace, TTI::TargetCostKind CostKind, const Instruction *I) { + + InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Src, nullptr); + if (!CostFactor.isValid()) + return InstructionCost::getMax(); + if (TLI->getValueType(DL, Src, true) == MVT::Other) return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind); @@ -1128,7 +1174,7 @@ InstructionCost PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, if (CostKind != TTI::TCK_RecipThroughput) return Cost; - Cost = vectorCostAdjustment(Cost, Opcode, Src, nullptr); + Cost *= CostFactor; bool IsAltivecType = ST->hasAltivec() && (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 || @@ -1194,6 +1240,11 @@ InstructionCost PPCTTIImpl::getInterleavedMemoryOpCost( unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) { + InstructionCost CostFactor = + vectorCostAdjustmentFactor(Opcode, VecTy, nullptr); + if (!CostFactor.isValid()) + return InstructionCost::getMax(); + if (UseMaskForCond || UseMaskForGaps) return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, Alignment, AddressSpace, CostKind, diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h index aa84013803af..7aeb0c59d503 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -100,8 +100,8 @@ public: unsigned getCacheLineSize() const override; unsigned getPrefetchDistance() const override; unsigned getMaxInterleaveFactor(unsigned VF); - InstructionCost vectorCostAdjustment(InstructionCost Cost, unsigned Opcode, - Type *Ty1, Type *Ty2); + InstructionCost vectorCostAdjustmentFactor(unsigned Opcode, Type *Ty1, + Type *Ty2); InstructionCost getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp index d1979b5456ce..f1c3810f4ee5 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp @@ -170,6 +170,14 @@ void RISCVInstPrinter::printAtomicMemOp(const MCInst *MI, unsigned OpNo, void RISCVInstPrinter::printVTypeI(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { unsigned Imm = MI->getOperand(OpNo).getImm(); + // Print the raw immediate for reserved values: vlmul[2:0]=4, vsew[2:0]=0b1xx, + // or non-zero bits 8/9/10. + if (RISCVVType::getVLMUL(Imm) == RISCVII::VLMUL::LMUL_RESERVED || + RISCVVType::getSEW(Imm) > 64 || (Imm & 0x700) != 0) { + O << Imm; + return; + } + // Print the text form. 
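The printVTypeI change above falls back to printing the raw immediate whenever the vtype encoding is reserved. Here is a standalone decoder for the fields it inspects, following the bit layout implied by the hunk (vlmul in bits 2:0, vsew in bits 5:3, bits 10:8 reserved); the SEW computation 8 << vsew is the standard RVV rule and is stated here as background rather than taken from the diff.

#include <cassert>
#include <cstdint>

struct VType {
  unsigned VLMul;  // bits 2:0
  unsigned VSEW;   // bits 5:3
  unsigned SEW;    // 8 << VSEW
  bool Reserved;   // true -> print the raw immediate instead of text
};

static VType decodeVType(uint64_t Imm) {
  VType V;
  V.VLMul = Imm & 0x7;
  V.VSEW = (Imm >> 3) & 0x7;
  V.SEW = 8u << V.VSEW;
  // Reserved when vlmul == 4, vsew has its top bit set (SEW > 64), or any of
  // bits 8/9/10 is nonzero -- the three conditions checked in the hunk above.
  V.Reserved = (V.VLMul == 4) || (V.VSEW & 0x4) || ((Imm & 0x700) != 0);
  return V;
}

int main() {
  assert(!decodeVType(0x10).Reserved && decodeVType(0x10).SEW == 32); // e32, m1
  assert(decodeVType(0x04).Reserved);   // vlmul = 4: reserved encoding
  assert(decodeVType(0x20).Reserved);   // vsew = 0b100: SEW > 64
  assert(decodeVType(0x100).Reserved);  // reserved bit 8 set
  return 0;
}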
RISCVVType::printVType(Imm, O); } diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index 595c3cdfbb1d..f5d491938050 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -237,7 +237,13 @@ bool RISCVFrameLowering::hasBP(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); const TargetRegisterInfo *TRI = STI.getRegisterInfo(); - return MFI.hasVarSizedObjects() && TRI->hasStackRealignment(MF); + // If we do not reserve stack space for outgoing arguments in prologue, + // we will adjust the stack pointer before call instruction. After the + // adjustment, we can not use SP to access the stack objects for the + // arguments. Instead, use BP to access these stack objects. + return (MFI.hasVarSizedObjects() || + (!hasReservedCallFrame(MF) && MFI.getMaxCallFrameSize() != 0)) && + TRI->hasStackRealignment(MF); } // Determines the size of the frame and maximum call frame size. @@ -1065,10 +1071,14 @@ bool RISCVFrameLowering::restoreCalleeSavedRegisters( if (MI != MBB.end() && !MI->isDebugInstr()) DL = MI->getDebugLoc(); - // Manually restore values not restored by libcall. Insert in reverse order. + // Manually restore values not restored by libcall. + // Keep the same order as in the prologue. There is no need to reverse the + // order in the epilogue. In addition, the return address will be restored + // first in the epilogue. It increases the opportunity to avoid the + // load-to-use data hazard between loading RA and return by RA. // loadRegFromStackSlot can insert multiple instructions. const auto &NonLibcallCSI = getNonLibcallCSI(*MF, CSI); - for (auto &CS : reverse(NonLibcallCSI)) { + for (auto &CS : NonLibcallCSI) { Register Reg = CS.getReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); TII.loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, TRI); diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 0f1a6e5f9154..f3331571fc55 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -335,17 +335,29 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT_CC, MVT::f16, Expand); setOperationAction(ISD::SELECT, MVT::f16, Custom); setOperationAction(ISD::BR_CC, MVT::f16, Expand); - for (auto Op : FPOpToExpand) - setOperationAction(Op, MVT::f16, Expand); setOperationAction(ISD::FREM, MVT::f16, Promote); - setOperationAction(ISD::FCEIL, MVT::f16, Promote); - setOperationAction(ISD::FFLOOR, MVT::f16, Promote); - setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote); - setOperationAction(ISD::FRINT, MVT::f16, Promote); - setOperationAction(ISD::FROUND, MVT::f16, Promote); - setOperationAction(ISD::FROUNDEVEN, MVT::f16, Promote); - setOperationAction(ISD::FTRUNC, MVT::f16, Promote); + setOperationAction(ISD::FCEIL, MVT::f16, Promote); + setOperationAction(ISD::FFLOOR, MVT::f16, Promote); + setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote); + setOperationAction(ISD::FRINT, MVT::f16, Promote); + setOperationAction(ISD::FROUND, MVT::f16, Promote); + setOperationAction(ISD::FROUNDEVEN, MVT::f16, Promote); + setOperationAction(ISD::FTRUNC, MVT::f16, Promote); + setOperationAction(ISD::FPOW, MVT::f16, Promote); + 
setOperationAction(ISD::FPOWI, MVT::f16, Promote); + setOperationAction(ISD::FCOS, MVT::f16, Promote); + setOperationAction(ISD::FSIN, MVT::f16, Promote); + setOperationAction(ISD::FSINCOS, MVT::f16, Promote); + setOperationAction(ISD::FEXP, MVT::f16, Promote); + setOperationAction(ISD::FEXP2, MVT::f16, Promote); + setOperationAction(ISD::FLOG, MVT::f16, Promote); + setOperationAction(ISD::FLOG2, MVT::f16, Promote); + setOperationAction(ISD::FLOG10, MVT::f16, Promote); + + // We need to custom promote this. + if (Subtarget.is64Bit()) + setOperationAction(ISD::FPOWI, MVT::i32, Custom); } if (Subtarget.hasStdExtF()) { @@ -676,6 +688,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FMINNUM, VT, Legal); setOperationAction(ISD::FMAXNUM, VT, Legal); + setOperationAction(ISD::FTRUNC, VT, Custom); + setOperationAction(ISD::FCEIL, VT, Custom); + setOperationAction(ISD::FFLOOR, VT, Custom); + setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); @@ -924,6 +940,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_ROUND, VT, Custom); setOperationAction(ISD::FP_EXTEND, VT, Custom); + setOperationAction(ISD::FTRUNC, VT, Custom); + setOperationAction(ISD::FCEIL, VT, Custom); + setOperationAction(ISD::FFLOOR, VT, Custom); + for (auto CC : VFPCCToExpand) setCondCodeAction(CC, VT, Expand); @@ -1165,6 +1185,10 @@ bool RISCVTargetLowering::shouldSinkOperands( case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: return Operand == 1; case Instruction::Call: if (auto *II = dyn_cast<IntrinsicInst>(I)) { @@ -1631,6 +1655,66 @@ static SDValue lowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) { return DAG.getSelectCC(DL, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO); } +// Expand vector FTRUNC, FCEIL, and FFLOOR by converting to the integer domain +// and back. Taking care to avoid converting values that are nan or already +// correct. +// TODO: Floor and ceil could be shorter by changing rounding mode, but we don't +// have FRM dependencies modeled yet. +static SDValue lowerFTRUNC_FCEIL_FFLOOR(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + assert(VT.isVector() && "Unexpected type"); + + SDLoc DL(Op); + + // Freeze the source since we are increasing the number of uses. + SDValue Src = DAG.getNode(ISD::FREEZE, DL, VT, Op.getOperand(0)); + + // Truncate to integer and convert back to FP. + MVT IntVT = VT.changeVectorElementTypeToInteger(); + SDValue Truncated = DAG.getNode(ISD::FP_TO_SINT, DL, IntVT, Src); + Truncated = DAG.getNode(ISD::SINT_TO_FP, DL, VT, Truncated); + + MVT SetccVT = MVT::getVectorVT(MVT::i1, VT.getVectorElementCount()); + + if (Op.getOpcode() == ISD::FCEIL) { + // If the truncated value is the greater than or equal to the original + // value, we've computed the ceil. Otherwise, we went the wrong way and + // need to increase by 1. + // FIXME: This should use a masked operation. Handle here or in isel? 
+ SDValue Adjust = DAG.getNode(ISD::FADD, DL, VT, Truncated, + DAG.getConstantFP(1.0, DL, VT)); + SDValue NeedAdjust = DAG.getSetCC(DL, SetccVT, Truncated, Src, ISD::SETOLT); + Truncated = DAG.getSelect(DL, VT, NeedAdjust, Adjust, Truncated); + } else if (Op.getOpcode() == ISD::FFLOOR) { + // If the truncated value is the less than or equal to the original value, + // we've computed the floor. Otherwise, we went the wrong way and need to + // decrease by 1. + // FIXME: This should use a masked operation. Handle here or in isel? + SDValue Adjust = DAG.getNode(ISD::FSUB, DL, VT, Truncated, + DAG.getConstantFP(1.0, DL, VT)); + SDValue NeedAdjust = DAG.getSetCC(DL, SetccVT, Truncated, Src, ISD::SETOGT); + Truncated = DAG.getSelect(DL, VT, NeedAdjust, Adjust, Truncated); + } + + // Restore the original sign so that -0.0 is preserved. + Truncated = DAG.getNode(ISD::FCOPYSIGN, DL, VT, Truncated, Src); + + // Determine the largest integer that can be represented exactly. This and + // values larger than it don't have any fractional bits so don't need to + // be converted. + const fltSemantics &FltSem = DAG.EVTToAPFloatSemantics(VT); + unsigned Precision = APFloat::semanticsPrecision(FltSem); + APFloat MaxVal = APFloat(FltSem); + MaxVal.convertFromAPInt(APInt::getOneBitSet(Precision, Precision - 1), + /*IsSigned*/ false, APFloat::rmNearestTiesToEven); + SDValue MaxValNode = DAG.getConstantFP(MaxVal, DL, VT); + + // If abs(Src) was larger than MaxVal or nan, keep it. + SDValue Abs = DAG.getNode(ISD::FABS, DL, VT, Src); + SDValue Setcc = DAG.getSetCC(DL, SetccVT, Abs, MaxValNode, ISD::SETOLT); + return DAG.getSelect(DL, VT, Setcc, Truncated, Src); +} + static SDValue lowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); @@ -2670,6 +2754,20 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, DAG.getConstant(3, DL, VT)); return DAG.getNode(ISD::MUL, DL, VT, VScale, Op.getOperand(0)); } + case ISD::FPOWI: { + // Custom promote f16 powi with illegal i32 integer type on RV64. Once + // promoted this will be legalized into a libcall by LegalizeIntegerTypes. + if (Op.getValueType() == MVT::f16 && Subtarget.is64Bit() && + Op.getOperand(1).getValueType() == MVT::i32) { + SDLoc DL(Op); + SDValue Op0 = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0)); + SDValue Powi = + DAG.getNode(ISD::FPOWI, DL, MVT::f32, Op0, Op.getOperand(1)); + return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Powi, + DAG.getIntPtrConstant(0, DL)); + } + return SDValue(); + } case ISD::FP_EXTEND: { // RVV can only do fp_extend to types double the size as the source. 
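The lowerFTRUNC_FCEIL_FFLOOR routine above rounds by converting to the integer domain and back, adjusts by 1.0 when truncation went the wrong way for ceil or floor, restores the original sign, and keeps the source value when it is NaN or too large to have any fractional bits. A scalar C++ model of the same idea for the ceil case is sketched below; the real code operates on whole vectors with setcc/select, and 2^23 is the f32 analogue of the "largest exactly representable integer" bound it computes.

#include <cassert>
#include <cmath>
#include <cstdint>

// Scalar sketch of the convert-truncate-adjust lowering for ceil.
static float ceilViaTrunc(float Src) {
  // Values with |x| >= 2^23 (and NaNs) have no fractional bits; the lowering
  // keeps them unchanged via the final select.
  const float MaxExact = 8388608.0f; // 2^23 for f32
  if (!(std::fabs(Src) < MaxExact))
    return Src;
  float Truncated = (float)(int64_t)Src;  // FP_TO_SINT then SINT_TO_FP
  if (Truncated < Src)                    // landed below the input: bump by 1.0
    Truncated += 1.0f;
  return std::copysign(Truncated, Src);   // preserve -0.0 and the sign
}

int main() {
  assert(ceilViaTrunc(1.25f) == 2.0f);
  assert(ceilViaTrunc(-1.25f) == -1.0f);
  assert(ceilViaTrunc(3.0f) == 3.0f);
  assert(std::signbit(ceilViaTrunc(-0.25f))); // result is -0.0
  return 0;
}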
We // custom-lower f16->f64 extensions to two hops of ISD::FP_EXTEND, going @@ -2858,6 +2956,10 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, case ISD::FP_TO_SINT_SAT: case ISD::FP_TO_UINT_SAT: return lowerFP_TO_INT_SAT(Op, DAG); + case ISD::FTRUNC: + case ISD::FCEIL: + case ISD::FFLOOR: + return lowerFTRUNC_FCEIL_FFLOOR(Op, DAG); case ISD::VECREDUCE_ADD: case ISD::VECREDUCE_UMAX: case ISD::VECREDUCE_SMAX: @@ -9834,6 +9936,23 @@ bool RISCVTargetLowering::shouldRemoveExtendFromGSIndex(EVT VT) const { return false; } +bool RISCVTargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT, + EVT VT) const { + if (!isOperationLegalOrCustom(Op, VT) || !FPVT.isSimple()) + return false; + + switch (FPVT.getSimpleVT().SimpleTy) { + case MVT::f16: + return Subtarget.hasStdExtZfh(); + case MVT::f32: + return Subtarget.hasStdExtF(); + case MVT::f64: + return Subtarget.hasStdExtD(); + default: + return false; + } +} + bool RISCVTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const { VT = VT.getScalarType(); diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.h b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.h index 8e3d716ae919..849928eb46ae 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -514,6 +514,8 @@ public: bool isLegalElementTypeForRVV(Type *ScalarTy) const; + bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override; + private: /// RISCVCCAssignFn - This target-specific function extends the default /// CCValAssign with additional information used to lower RISC-V calling diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.td index b653928ccea9..6f9cde966132 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -705,6 +705,7 @@ def PseudoLD : PseudoLoad<"ld">; def PseudoSD : PseudoStore<"sd">; } // Predicates = [IsRV64] +def : InstAlias<"li $rd, $imm", (ADDI GPR:$rd, X0, simm12:$imm)>; def : InstAlias<"mv $rd, $rs", (ADDI GPR:$rd, GPR:$rs, 0)>; def : InstAlias<"not $rd, $rs", (XORI GPR:$rd, GPR:$rs, -1)>; def : InstAlias<"neg $rd, $rs", (SUB GPR:$rd, X0, GPR:$rs)>; diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index 388cce00bdf3..798532d5bc44 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -20,6 +20,7 @@ #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/Support/ErrorHandling.h" #define GET_REGINFO_TARGET_DESC @@ -320,3 +321,30 @@ RISCVRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, return &RISCV::VRRegClass; return RC; } + +void RISCVRegisterInfo::getOffsetOpcodes(const StackOffset &Offset, + SmallVectorImpl<uint64_t> &Ops) const { + // VLENB is the length of a vector register in bytes. We use <vscale x 8 x i8> + // to represent one vector register. The dwarf offset is + // VLENB * scalable_offset / 8. + assert(Offset.getScalable() % 8 == 0 && "Invalid frame offset"); + + // Add fixed-sized offset using existing DIExpression interface. 
+ DIExpression::appendOffset(Ops, Offset.getFixed()); + + unsigned VLENB = getDwarfRegNum(RISCV::VLENB, true); + int64_t VLENBSized = Offset.getScalable() / 8; + if (VLENBSized > 0) { + Ops.push_back(dwarf::DW_OP_constu); + Ops.push_back(VLENBSized); + Ops.append({dwarf::DW_OP_bregx, VLENB, 0ULL}); + Ops.push_back(dwarf::DW_OP_mul); + Ops.push_back(dwarf::DW_OP_plus); + } else if (VLENBSized < 0) { + Ops.push_back(dwarf::DW_OP_constu); + Ops.push_back(-VLENBSized); + Ops.append({dwarf::DW_OP_bregx, VLENB, 0ULL}); + Ops.push_back(dwarf::DW_OP_mul); + Ops.push_back(dwarf::DW_OP_minus); + } +} diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.h index 74a5b83ff6f3..2b2bbdfbdf32 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.h @@ -63,6 +63,9 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo { const TargetRegisterClass * getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &) const override; + + void getOffsetOpcodes(const StackOffset &Offset, + SmallVectorImpl<uint64_t> &Ops) const override; }; } diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.td index a915a572f3b7..a56f992d320e 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.td @@ -480,6 +480,8 @@ let RegAltNameIndices = [ABIRegAltName] in { def VL : RISCVReg<0, "vl", ["vl"]>; def VXSAT : RISCVReg<0, "vxsat", ["vxsat"]>; def VXRM : RISCVReg<0, "vxrm", ["vxrm"]>; + def VLENB : RISCVReg<0, "vlenb", ["vlenb"]>, + DwarfRegNum<[!add(4096, SysRegVLENB.Encoding)]>; } foreach m = [1, 2, 4] in { diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSystemOperands.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSystemOperands.td index 41599dd8bb3f..5a4c579dd708 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSystemOperands.td +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSystemOperands.td @@ -388,4 +388,4 @@ def : SysReg<"vxrm", 0x00A>; def : SysReg<"vcsr", 0x00F>; def : SysReg<"vl", 0xC20>; def : SysReg<"vtype", 0xC21>; -def : SysReg<"vlenb", 0xC22>; +def SysRegVLENB: SysReg<"vlenb", 0xC22>; diff --git a/contrib/llvm-project/llvm/lib/Target/Sparc/DelaySlotFiller.cpp b/contrib/llvm-project/llvm/lib/Target/Sparc/DelaySlotFiller.cpp index 7319924a24ba..259b37954183 100644 --- a/contrib/llvm-project/llvm/lib/Target/Sparc/DelaySlotFiller.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Sparc/DelaySlotFiller.cpp @@ -53,9 +53,8 @@ namespace { // instructions to fill delay slot. 
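getOffsetOpcodes above encodes a stack offset of the form fixed + (scalable / 8) * VLENB as a DWARF expression. The standalone sketch below builds the same opcode sequence into a plain vector, using small made-up integers for the DWARF opcode values and the VLENB register number (the real values come from the DWARF opcode definitions and the RISC-V register table); the fixed-offset handling is also simplified relative to DIExpression::appendOffset.

#include <cassert>
#include <cstdint>
#include <vector>

// Stand-in opcode values; only the structure of the expression matters here.
enum : uint64_t { OpConstU = 1, OpBRegX = 2, OpMul = 3, OpPlus = 4, OpMinus = 5 };

static std::vector<uint64_t> scalableOffsetExpr(int64_t FixedOffset,
                                                int64_t ScalableOffset,
                                                uint64_t VLENBRegNum) {
  assert(ScalableOffset % 8 == 0 && "scalable part is in <vscale x 8 x i8> units");
  std::vector<uint64_t> Ops;
  // Fixed part first (simplified stand-in for DIExpression::appendOffset).
  if (FixedOffset != 0) {
    Ops.push_back(OpConstU);
    Ops.push_back(
        static_cast<uint64_t>(FixedOffset > 0 ? FixedOffset : -FixedOffset));
    Ops.push_back(FixedOffset > 0 ? OpPlus : OpMinus);
  }
  int64_t NumVLENB = ScalableOffset / 8;
  if (NumVLENB == 0)
    return Ops;
  // |NumVLENB| * VLENB, then add or subtract depending on the sign.
  Ops.push_back(OpConstU);
  Ops.push_back(static_cast<uint64_t>(NumVLENB > 0 ? NumVLENB : -NumVLENB));
  Ops.push_back(OpBRegX);
  Ops.push_back(VLENBRegNum);
  Ops.push_back(0); // bregx takes a register number and an addend of 0
  Ops.push_back(OpMul);
  Ops.push_back(NumVLENB > 0 ? OpPlus : OpMinus);
  return Ops;
}

int main() {
  // A made-up VLENB DWARF register number, purely for illustration.
  auto Ops = scalableOffsetExpr(-16, 24, /*VLENBRegNum=*/96);
  assert(Ops.size() == 10); // 3 ops for the fixed part, 7 for the scalable part
  return 0;
}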
F.getRegInfo().invalidateLiveness(); - for (MachineFunction::iterator FI = F.begin(), FE = F.end(); - FI != FE; ++FI) - Changed |= runOnMachineBasicBlock(*FI); + for (MachineBasicBlock &MBB : F) + Changed |= runOnMachineBasicBlock(MBB); return Changed; } @@ -319,8 +318,7 @@ void Filler::insertDefsUses(MachineBasicBlock::iterator MI, SmallSet<unsigned, 32>& RegDefs, SmallSet<unsigned, 32>& RegUses) { - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg()) continue; diff --git a/contrib/llvm-project/llvm/lib/Target/Sparc/LeonPasses.cpp b/contrib/llvm-project/llvm/lib/Target/Sparc/LeonPasses.cpp index fa05a41f3127..bd26710fcbab 100644 --- a/contrib/llvm-project/llvm/lib/Target/Sparc/LeonPasses.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Sparc/LeonPasses.cpp @@ -42,8 +42,7 @@ bool InsertNOPLoad::runOnMachineFunction(MachineFunction &MF) { DebugLoc DL = DebugLoc(); bool Modified = false; - for (auto MFI = MF.begin(), E = MF.end(); MFI != E; ++MFI) { - MachineBasicBlock &MBB = *MFI; + for (MachineBasicBlock &MBB : MF) { for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E; ++MBBI) { MachineInstr &MI = *MBBI; unsigned Opcode = MI.getOpcode(); @@ -77,10 +76,8 @@ bool DetectRoundChange::runOnMachineFunction(MachineFunction &MF) { Subtarget = &MF.getSubtarget<SparcSubtarget>(); bool Modified = false; - for (auto MFI = MF.begin(), E = MF.end(); MFI != E; ++MFI) { - MachineBasicBlock &MBB = *MFI; - for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E; ++MBBI) { - MachineInstr &MI = *MBBI; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { unsigned Opcode = MI.getOpcode(); if (Opcode == SP::CALL && MI.getNumOperands() > 0) { MachineOperand &MO = MI.getOperand(0); @@ -129,8 +126,7 @@ bool FixAllFDIVSQRT::runOnMachineFunction(MachineFunction &MF) { DebugLoc DL = DebugLoc(); bool Modified = false; - for (auto MFI = MF.begin(), E = MF.end(); MFI != E; ++MFI) { - MachineBasicBlock &MBB = *MFI; + for (MachineBasicBlock &MBB : MF) { for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E; ++MBBI) { MachineInstr &MI = *MBBI; unsigned Opcode = MI.getOpcode(); diff --git a/contrib/llvm-project/llvm/lib/Target/Sparc/SparcFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/Sparc/SparcFrameLowering.cpp index d165052ca512..a740de9123c9 100644 --- a/contrib/llvm-project/llvm/lib/Target/Sparc/SparcFrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Sparc/SparcFrameLowering.cpp @@ -343,19 +343,18 @@ void SparcFrameLowering::remapRegsForLeafProc(MachineFunction &MF) const { } // Rewrite MBB's Live-ins. 
- for (MachineFunction::iterator MBB = MF.begin(), E = MF.end(); - MBB != E; ++MBB) { + for (MachineBasicBlock &MBB : MF) { for (unsigned reg = SP::I0_I1; reg <= SP::I6_I7; ++reg) { - if (!MBB->isLiveIn(reg)) + if (!MBB.isLiveIn(reg)) continue; - MBB->removeLiveIn(reg); - MBB->addLiveIn(reg - SP::I0_I1 + SP::O0_O1); + MBB.removeLiveIn(reg); + MBB.addLiveIn(reg - SP::I0_I1 + SP::O0_O1); } for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) { - if (!MBB->isLiveIn(reg)) + if (!MBB.isLiveIn(reg)) continue; - MBB->removeLiveIn(reg); - MBB->addLiveIn(reg - SP::I0 + SP::O0); + MBB.removeLiveIn(reg); + MBB.addLiveIn(reg - SP::I0 + SP::O0); } } diff --git a/contrib/llvm-project/llvm/lib/Target/Sparc/SparcMCInstLower.cpp b/contrib/llvm-project/llvm/lib/Target/Sparc/SparcMCInstLower.cpp index 8ea317fdd453..4e7e7bb5c81b 100644 --- a/contrib/llvm-project/llvm/lib/Target/Sparc/SparcMCInstLower.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Sparc/SparcMCInstLower.cpp @@ -97,8 +97,7 @@ void llvm::LowerSparcMachineInstrToMCInst(const MachineInstr *MI, OutMI.setOpcode(MI->getOpcode()); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { MCOperand MCOp = LowerOperand(MI, MO, AP); if (MCOp.isValid()) diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp index ac94570e568f..631cbff303e8 100644 --- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp +++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp @@ -144,8 +144,7 @@ Reference SystemZElimCompare::getRegReferences(MachineInstr &MI, unsigned Reg) { if (MI.isDebugInstr()) return Ref; - for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { - const MachineOperand &MO = MI.getOperand(I); + for (const MachineOperand &MO : MI.operands()) { if (MO.isReg()) { if (Register MOReg = MO.getReg()) { if (TRI->regsOverlap(MOReg, Reg)) { diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp index d11d118fb8ee..2f7cdfcf7bde 100644 --- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -270,8 +270,8 @@ bool SystemZELFFrameLowering::spillCalleeSavedRegisters( // Make sure all call-saved GPRs are included as operands and are // marked as live on entry. - for (unsigned I = 0, E = CSI.size(); I != E; ++I) { - unsigned Reg = CSI[I].getReg(); + for (const CalleeSavedInfo &I : CSI) { + unsigned Reg = I.getReg(); if (SystemZ::GR64BitRegClass.contains(Reg)) addSavedGPR(MBB, MIB, Reg, true); } @@ -283,16 +283,16 @@ bool SystemZELFFrameLowering::spillCalleeSavedRegisters( } // Save FPRs/VRs in the normal TargetInstrInfo way. 
- for (unsigned I = 0, E = CSI.size(); I != E; ++I) { - unsigned Reg = CSI[I].getReg(); + for (const CalleeSavedInfo &I : CSI) { + unsigned Reg = I.getReg(); if (SystemZ::FP64BitRegClass.contains(Reg)) { MBB.addLiveIn(Reg); - TII->storeRegToStackSlot(MBB, MBBI, Reg, true, CSI[I].getFrameIdx(), + TII->storeRegToStackSlot(MBB, MBBI, Reg, true, I.getFrameIdx(), &SystemZ::FP64BitRegClass, TRI); } if (SystemZ::VR128BitRegClass.contains(Reg)) { MBB.addLiveIn(Reg); - TII->storeRegToStackSlot(MBB, MBBI, Reg, true, CSI[I].getFrameIdx(), + TII->storeRegToStackSlot(MBB, MBBI, Reg, true, I.getFrameIdx(), &SystemZ::VR128BitRegClass, TRI); } } @@ -313,13 +313,13 @@ bool SystemZELFFrameLowering::restoreCalleeSavedRegisters( DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); // Restore FPRs/VRs in the normal TargetInstrInfo way. - for (unsigned I = 0, E = CSI.size(); I != E; ++I) { - unsigned Reg = CSI[I].getReg(); + for (const CalleeSavedInfo &I : CSI) { + unsigned Reg = I.getReg(); if (SystemZ::FP64BitRegClass.contains(Reg)) - TII->loadRegFromStackSlot(MBB, MBBI, Reg, CSI[I].getFrameIdx(), + TII->loadRegFromStackSlot(MBB, MBBI, Reg, I.getFrameIdx(), &SystemZ::FP64BitRegClass, TRI); if (SystemZ::VR128BitRegClass.contains(Reg)) - TII->loadRegFromStackSlot(MBB, MBBI, Reg, CSI[I].getFrameIdx(), + TII->loadRegFromStackSlot(MBB, MBBI, Reg, I.getFrameIdx(), &SystemZ::VR128BitRegClass, TRI); } @@ -345,8 +345,8 @@ bool SystemZELFFrameLowering::restoreCalleeSavedRegisters( MIB.addImm(RestoreGPRs.GPROffset); // Do a second scan adding regs as being defined by instruction - for (unsigned I = 0, E = CSI.size(); I != E; ++I) { - unsigned Reg = CSI[I].getReg(); + for (const CalleeSavedInfo &I : CSI) { + unsigned Reg = I.getReg(); if (Reg != RestoreGPRs.LowGPR && Reg != RestoreGPRs.HighGPR && SystemZ::GR64BitRegClass.contains(Reg)) MIB.addReg(Reg, RegState::ImplicitDefine); @@ -965,24 +965,24 @@ bool SystemZXPLINKFrameLowering::spillCalleeSavedRegisters( // Make sure all call-saved GPRs are included as operands and are // marked as live on entry. auto &GRRegClass = SystemZ::GR64BitRegClass; - for (unsigned I = 0, E = CSI.size(); I != E; ++I) { - unsigned Reg = CSI[I].getReg(); + for (const CalleeSavedInfo &I : CSI) { + unsigned Reg = I.getReg(); if (GRRegClass.contains(Reg)) addSavedGPR(MBB, MIB, Reg, true); } } // Spill FPRs to the stack in the normal TargetInstrInfo way - for (unsigned I = 0, E = CSI.size(); I != E; ++I) { - unsigned Reg = CSI[I].getReg(); + for (const CalleeSavedInfo &I : CSI) { + unsigned Reg = I.getReg(); if (SystemZ::FP64BitRegClass.contains(Reg)) { MBB.addLiveIn(Reg); - TII->storeRegToStackSlot(MBB, MBBI, Reg, true, CSI[I].getFrameIdx(), + TII->storeRegToStackSlot(MBB, MBBI, Reg, true, I.getFrameIdx(), &SystemZ::FP64BitRegClass, TRI); } if (SystemZ::VR128BitRegClass.contains(Reg)) { MBB.addLiveIn(Reg); - TII->storeRegToStackSlot(MBB, MBBI, Reg, true, CSI[I].getFrameIdx(), + TII->storeRegToStackSlot(MBB, MBBI, Reg, true, I.getFrameIdx(), &SystemZ::VR128BitRegClass, TRI); } } diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZFrameLowering.h b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZFrameLowering.h index 6fddb4f81c41..af219da79c32 100644 --- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZFrameLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZFrameLowering.h @@ -29,7 +29,18 @@ public: create(const SystemZSubtarget &STI); // Override TargetFrameLowering. 
- bool isFPCloseToIncomingSP() const override { return false; } + bool allocateScavengingFrameIndexesNearIncomingSP( + const MachineFunction &MF) const override { + // SystemZ wants normal register scavenging slots, as close to the stack or + // frame pointer as possible. + // The default implementation assumes an x86-like layout, where the frame + // pointer is at the opposite end of the frame from the stack pointer. + // This meant that when frame pointer elimination was disabled, + // the slots ended up being as close as possible to the incoming + // stack pointer, which is the opposite of what we want on SystemZ. + return false; + } + bool hasReservedCallFrame(const MachineFunction &MF) const override; MachineBasicBlock::iterator eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, @@ -43,7 +54,6 @@ public: SystemZELFFrameLowering(); // Override TargetFrameLowering. - bool isFPCloseToIncomingSP() const override { return false; } bool assignCalleeSavedSpillSlots(MachineFunction &MF, const TargetRegisterInfo *TRI, diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp index 2bf80882fa61..e80496e37781 100644 --- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -203,8 +203,8 @@ void SystemZInstrInfo::expandZExtPseudo(MachineInstr &MI, unsigned LowOpcode, Size, MI.getOperand(1).isKill(), MI.getOperand(1).isUndef()); // Keep the remaining operands as-is. - for (unsigned I = 2; I < MI.getNumOperands(); ++I) - MIB.add(MI.getOperand(I)); + for (const MachineOperand &MO : llvm::drop_begin(MI.operands(), 2)) + MIB.add(MO); MI.eraseFromParent(); } diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZMCInstLower.cpp b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZMCInstLower.cpp index ef39f80a94ef..d2932de5a6ea 100644 --- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZMCInstLower.cpp +++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZMCInstLower.cpp @@ -93,10 +93,8 @@ MCOperand SystemZMCInstLower::lowerOperand(const MachineOperand &MO) const { void SystemZMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.setOpcode(MI->getOpcode()); - for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) { - const MachineOperand &MO = MI->getOperand(I); + for (const MachineOperand &MO : MI->operands()) // Ignore all implicit register operands. if (!MO.isReg() || !MO.isImplicit()) OutMI.addOperand(lowerOperand(MO)); - } } diff --git a/contrib/llvm-project/llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.cpp index 1fe9423e01b8..1d8c3d514bfb 100644 --- a/contrib/llvm-project/llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.cpp @@ -23,14 +23,6 @@ using namespace llvm; #define DEBUG_TYPE "ve-asmprinter" -// The generated AsmMatcher VEGenAsmWriter uses "VE" as the target -// namespace. 
-namespace llvm { -namespace VE { -using namespace VE; -} -} // namespace llvm - #define GET_INSTRUCTION_NAME #define PRINT_ALIAS_INSTR #include "VEGenAsmWriter.inc" @@ -62,13 +54,10 @@ void VEInstPrinter::printOperand(const MCInst *MI, int OpNum, } if (MO.isImm()) { - switch (MI->getOpcode()) { - default: - // Expects signed 32bit literals - int32_t TruncatedImm = static_cast<int32_t>(MO.getImm()); - O << TruncatedImm; - return; - } + // Expects signed 32bit literals. + int32_t TruncatedImm = static_cast<int32_t>(MO.getImm()); + O << TruncatedImm; + return; } assert(MO.isExpr() && "Unknown operand kind in printOperand"); diff --git a/contrib/llvm-project/llvm/lib/Target/VE/VEInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/VE/VEInstrInfo.cpp index ddcfb9da8249..46846edfeafb 100644 --- a/contrib/llvm-project/llvm/lib/Target/VE/VEInstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/VE/VEInstrInfo.cpp @@ -942,11 +942,11 @@ bool VEInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, get(VE::SVMmi), Dest).addReg(VMZ).addImm(Imm); MachineInstr *Inst = MIB.getInstr(); - MI.eraseFromParent(); if (KillSrc) { const TargetRegisterInfo *TRI = &getRegisterInfo(); Inst->addRegisterKilled(MI.getOperand(1).getReg(), TRI, true); } + MI.eraseFromParent(); return true; } case VE::VFMKyal: @@ -956,6 +956,7 @@ bool VEInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case VE::VFMKSyvl: case VE::VFMKSyvyl: expandPseudoVFMK(*this, MI); + return true; } return false; } diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index 80abccd74782..7b70d99b5f52 100644 --- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -368,8 +368,8 @@ FunctionPass *WebAssemblyPassConfig::createTargetRegisterAllocator(bool) { return nullptr; // No reg alloc } -static void checkSanityForEHAndSjLj(const TargetMachine *TM) { - // Sanity checking related to -exception-model +static void basicCheckForEHAndSjLj(const TargetMachine *TM) { + // Basic Correctness checking related to -exception-model if (TM->Options.ExceptionModel != ExceptionHandling::None && TM->Options.ExceptionModel != ExceptionHandling::Wasm) report_fatal_error("-exception-model should be either 'none' or 'wasm'"); @@ -431,7 +431,7 @@ void WebAssemblyPassConfig::addIRPasses() { if (getOptLevel() != CodeGenOpt::None) addPass(createWebAssemblyOptimizeReturned()); - checkSanityForEHAndSjLj(TM); + basicCheckForEHAndSjLj(TM); // If exception handling is not enabled and setjmp/longjmp handling is // enabled, we lower invokes into calls and delete unreachable landingpad diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index d4f39b571394..3df48b466d07 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -148,7 +148,7 @@ public: AlignBranchType.addKind(X86::AlignBranchJcc); AlignBranchType.addKind(X86::AlignBranchJmp); } - // Allow overriding defaults set by master flag + // Allow overriding defaults set by main flag if (X86AlignBranchBoundary.getNumOccurrences()) AlignBoundary = assumeAligned(X86AlignBranchBoundary); if (X86AlignBranch.getNumOccurrences()) @@ -1452,9 
+1452,7 @@ public: unsigned NumDefCFAOffsets = 0; int MinAbsOffset = std::numeric_limits<int>::max(); - for (unsigned i = 0, e = Instrs.size(); i != e; ++i) { - const MCCFIInstruction &Inst = Instrs[i]; - + for (const MCCFIInstruction &Inst : Instrs) { switch (Inst.getOperation()) { default: // Any other CFI directives indicate a frame that we aren't prepared diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86DomainReassignment.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86DomainReassignment.cpp index a2ae6345c006..9826bf4bf861 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86DomainReassignment.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86DomainReassignment.cpp @@ -186,8 +186,8 @@ public: TII->getRegClass(TII->get(DstOpcode), 0, MRI->getTargetRegisterInfo(), *MBB->getParent())); MachineInstrBuilder Bld = BuildMI(*MBB, MI, DL, TII->get(DstOpcode), Reg); - for (unsigned Idx = 1, End = MI->getNumOperands(); Idx < End; ++Idx) - Bld.add(MI->getOperand(Idx)); + for (const MachineOperand &MO : llvm::drop_begin(MI->operands())) + Bld.add(MO); BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY)) .add(MI->getOperand(0)) diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86DynAllocaExpander.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86DynAllocaExpander.cpp index df8df1e3a65d..c8ceebb8b8e6 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86DynAllocaExpander.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86DynAllocaExpander.cpp @@ -212,6 +212,12 @@ void X86DynAllocaExpander::lower(MachineInstr *MI, Lowering L) { bool Is64BitAlloca = MI->getOpcode() == X86::DYN_ALLOCA_64; assert(SlotSize == 4 || SlotSize == 8); + Optional<MachineFunction::DebugInstrOperandPair> InstrNum = None; + if (unsigned Num = MI->peekDebugInstrNum()) { + // Operand 2 of DYN_ALLOCAs contains the stack def. + InstrNum = {Num, 2}; + } + switch (L) { case TouchAndSub: { assert(Amount >= SlotSize); @@ -251,7 +257,7 @@ void X86DynAllocaExpander::lower(MachineInstr *MI, Lowering L) { // Do the probe. STI->getFrameLowering()->emitStackProbe(*MBB->getParent(), *MBB, MI, DL, - /*InProlog=*/false); + /*InProlog=*/false, InstrNum); } else { // Sub BuildMI(*MBB, I, DL, diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86ExpandPseudo.cpp index 01dc509df795..93bc23006dc4 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -209,10 +209,8 @@ void X86ExpandPseudo::expandCALL_RVMARKER(MachineBasicBlock &MBB, llvm_unreachable("unexpected opcode"); OriginalCall = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)).getInstr(); - unsigned OpStart = 1; bool RAXImplicitDead = false; - for (; OpStart < MI.getNumOperands(); ++OpStart) { - MachineOperand &Op = MI.getOperand(OpStart); + for (MachineOperand &Op : llvm::drop_begin(MI.operands())) { // RAX may be 'implicit dead', if there are no other users of the return // value. We introduce a new use, so change it to 'implicit def'. 
if (Op.isReg() && Op.isImplicit() && Op.isDead() && diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FixupLEAs.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86FixupLEAs.cpp index 9a63cffe0a09..4730b936ec1f 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86FixupLEAs.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FixupLEAs.cpp @@ -278,10 +278,9 @@ FixupLEAPass::usesRegister(MachineOperand &p, MachineBasicBlock::iterator I) { RegUsageState RegUsage = RU_NotUsed; MachineInstr &MI = *I; - for (unsigned i = 0; i < MI.getNumOperands(); ++i) { - MachineOperand &opnd = MI.getOperand(i); - if (opnd.isReg() && opnd.getReg() == p.getReg()) { - if (opnd.isDef()) + for (const MachineOperand &MO : MI.operands()) { + if (MO.isReg() && MO.getReg() == p.getReg()) { + if (MO.isDef()) return RU_Write; RegUsage = RU_Read; } diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FloatingPoint.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86FloatingPoint.cpp index 60e1b37ed61c..4d9160f35226 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86FloatingPoint.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FloatingPoint.cpp @@ -446,11 +446,9 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { // Get dead variables list now because the MI pointer may be deleted as part // of processing! SmallVector<unsigned, 8> DeadRegs; - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) if (MO.isReg() && MO.isDead()) DeadRegs.push_back(MO.getReg()); - } switch (FPInstClass) { case X86II::ZeroArgFP: handleZeroArgFP(I); break; @@ -1672,8 +1670,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) { // Collect all FP registers (register operands with constraints "t", "u", // and "f") to kill afer the instruction. unsigned FPKills = ((1u << NumFPRegs) - 1) & ~0xff; - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &Op = MI.getOperand(i); + for (const MachineOperand &Op : MI.operands()) { if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6) continue; unsigned FPReg = getFPReg(Op); diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.cpp index bd780273509f..c29ae9f6af4c 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -465,13 +465,11 @@ void X86FrameLowering::emitCalleeSavedFrameMoves( // Add callee saved registers to move list. const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); - if (CSI.empty()) return; // Calculate offsets. 
- for (std::vector<CalleeSavedInfo>::const_iterator - I = CSI.begin(), E = CSI.end(); I != E; ++I) { - int64_t Offset = MFI.getObjectOffset(I->getFrameIdx()); - unsigned Reg = I->getReg(); + for (const CalleeSavedInfo &I : CSI) { + int64_t Offset = MFI.getObjectOffset(I.getFrameIdx()); + unsigned Reg = I.getReg(); unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); if (IsPrologue) { @@ -484,10 +482,10 @@ void X86FrameLowering::emitCalleeSavedFrameMoves( } } -void X86FrameLowering::emitStackProbe(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, bool InProlog) const { +void X86FrameLowering::emitStackProbe( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog, + Optional<MachineFunction::DebugInstrOperandPair> InstrNum) const { const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); if (STI.isTargetWindowsCoreCLR()) { if (InProlog) { @@ -497,10 +495,14 @@ void X86FrameLowering::emitStackProbe(MachineFunction &MF, emitStackProbeInline(MF, MBB, MBBI, DL, false); } } else { - emitStackProbeCall(MF, MBB, MBBI, DL, InProlog); + emitStackProbeCall(MF, MBB, MBBI, DL, InProlog, InstrNum); } } +bool X86FrameLowering::stackProbeFunctionModifiesSP() const { + return STI.isOSWindows() && !STI.isTargetWin64(); +} + void X86FrameLowering::inlineStackProbe(MachineFunction &MF, MachineBasicBlock &PrologMBB) const { auto Where = llvm::find_if(PrologMBB, [](MachineInstr &MI) { @@ -971,11 +973,10 @@ void X86FrameLowering::emitStackProbeInlineWindowsCoreCLR64( } } -void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, - bool InProlog) const { +void X86FrameLowering::emitStackProbeCall( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog, + Optional<MachineFunction::DebugInstrOperandPair> InstrNum) const { bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large; // FIXME: Add indirect thunk support and remove this. @@ -1015,6 +1016,7 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, .addReg(SP, RegState::Define | RegState::Implicit) .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); + MachineInstr *ModInst = CI; if (STI.isTargetWin64() || !STI.isOSWindows()) { // MSVC x32's _chkstk and cygwin/mingw's _alloca adjust %esp themselves. // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp @@ -1022,9 +1024,27 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, // adjusting %rsp. // All other platforms do not specify a particular ABI for the stack probe // function, so we arbitrarily define it to not adjust %esp/%rsp itself. - BuildMI(MBB, MBBI, DL, TII.get(getSUBrrOpcode(Uses64BitFramePtr)), SP) - .addReg(SP) - .addReg(AX); + ModInst = + BuildMI(MBB, MBBI, DL, TII.get(getSUBrrOpcode(Uses64BitFramePtr)), SP) + .addReg(SP) + .addReg(AX); + } + + // DebugInfo variable locations -- if there's an instruction number for the + // allocation (i.e., DYN_ALLOC_*), substitute it for the instruction that + // modifies SP. + if (InstrNum) { + if (STI.isTargetWin64() || !STI.isOSWindows()) { + // Label destination operand of the subtract. + MF.makeDebugValueSubstitution(*InstrNum, + {ModInst->getDebugInstrNum(), 0}); + } else { + // Label the call. The operand number is the penultimate operand, zero + // based. 
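// Illustrative sketch (plain C++, not LLVM code) of the debug-value
// substitution bookkeeping used above: a table maps an (instruction number,
// operand index) reference on the erased DYN_ALLOCA pseudo to the operand of
// the instruction that really modifies SP. A std::map stands in for
// MachineFunction's substitution table, and the instruction numbers are made up.
#include <cstdio>
#include <map>
#include <utility>

using InstrOperandPair = std::pair<unsigned, unsigned>; // (instr number, operand index)

int main() {
  std::map<InstrOperandPair, InstrOperandPair> DebugValueSubstitutions;

  // Say the DYN_ALLOCA pseudo carried instruction number 7 and its stack def
  // was operand 2 (as the comment in the patch states). After expansion, the
  // SUB that defines SP got instruction number 12, with the def in operand 0.
  DebugValueSubstitutions[{7, 2}] = {12, 0};

  // A later instruction-referencing debug value pointing at (7, 2) is resolved
  // through the table instead of going stale.
  auto It = DebugValueSubstitutions.find({7, 2});
  if (It != DebugValueSubstitutions.end())
    std::printf("debug ref now tracks instr %u, operand %u\n",
                It->second.first, It->second.second);
  return 0;
}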
+ unsigned SPDefOperand = ModInst->getNumOperands() - 2; + MF.makeDebugValueSubstitution( + *InstrNum, {ModInst->getDebugInstrNum(), SPDefOperand}); + } } if (InProlog) { @@ -2652,8 +2672,8 @@ bool X86FrameLowering::restoreCalleeSavedRegisters( DebugLoc DL = MBB.findDebugLoc(MI); // Reload XMMs from stack frame. - for (unsigned i = 0, e = CSI.size(); i != e; ++i) { - unsigned Reg = CSI[i].getReg(); + for (const CalleeSavedInfo &I : CSI) { + unsigned Reg = I.getReg(); if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) continue; @@ -2664,13 +2684,13 @@ bool X86FrameLowering::restoreCalleeSavedRegisters( VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1; const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); - TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RC, TRI); + TII.loadRegFromStackSlot(MBB, MI, Reg, I.getFrameIdx(), RC, TRI); } // POP GPRs. unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r; - for (unsigned i = 0, e = CSI.size(); i != e; ++i) { - unsigned Reg = CSI[i].getReg(); + for (const CalleeSavedInfo &I : CSI) { + unsigned Reg = I.getReg(); if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) continue; diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.h b/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.h index 6309b8a066c4..e18be0d26321 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.h @@ -13,6 +13,7 @@ #ifndef LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H #define LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Support/TypeSize.h" @@ -51,9 +52,14 @@ public: /// Emit target stack probe code. This is required for all /// large stack allocations on Windows. The caller is required to materialize /// the number of bytes to probe in RAX/EAX. - void emitStackProbe(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, const DebugLoc &DL, - bool InProlog) const; + /// \p InstrNum optionally contains a debug-info instruction number for the + /// new stack pointer. + void emitStackProbe( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog, + Optional<MachineFunction::DebugInstrOperandPair> InstrNum = None) const; + + bool stackProbeFunctionModifiesSP() const override; /// Replace a StackProbe inline-stub with the actual probe code inline. void inlineStackProbe(MachineFunction &MF, @@ -198,9 +204,10 @@ private: uint64_t calculateMaxStackAlign(const MachineFunction &MF) const; /// Emit target stack probe as a call to a helper function - void emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, const DebugLoc &DL, - bool InProlog) const; + void emitStackProbeCall( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog, + Optional<MachineFunction::DebugInstrOperandPair> InstrNum) const; /// Emit target stack probe as an inline sequence. 
void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB, diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp index 17d14053d804..62b2387396be 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -23190,6 +23190,10 @@ static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); + // We don't need to replace SQRT with RSQRT for half type. + if (VT.getScalarType() == MVT::f16) + return true; + // We never want to use both SQRT and RSQRT instructions for the same input. if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op)) return false; @@ -23228,11 +23232,15 @@ SDValue X86TargetLowering::getSqrtEstimate(SDValue Op, UseOneConstNR = false; // There is no FSQRT for 512-bits, but there is RSQRT14. unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT; - return DAG.getNode(Opcode, DL, VT, Op); + SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op); + if (RefinementSteps == 0 && !Reciprocal) + Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate); + return Estimate; } if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) && Subtarget.hasFP16()) { + assert(Reciprocal && "Don't replace SQRT with RSQRT for half type"); if (RefinementSteps == ReciprocalEstimate::Unspecified) RefinementSteps = 0; @@ -45680,7 +45688,7 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, if (is64BitFP && !Subtarget.is64Bit()) { // On a 32-bit target, we cannot bitcast the 64-bit float to a // 64-bit integer, since that's not a legal type. Since - // OnesOrZeroesF is all ones of all zeroes, we don't need all the + // OnesOrZeroesF is all ones or all zeroes, we don't need all the // bits, but can do this little dance to extract the lowest 32 bits // and work with those going forward. SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, @@ -46577,6 +46585,59 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, return Ret; } +static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R, + SDValue And1_L, SDValue And1_R, SDLoc DL, + SelectionDAG &DAG) { + if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse()) + return SDValue(); + SDValue NotOp = And0_L->getOperand(0); + if (NotOp == And1_R) + std::swap(And1_R, And1_L); + if (NotOp != And1_L) + return SDValue(); + + // (~(NotOp) & And0_R) | (NotOp & And1_R) + // --> ((And0_R ^ And1_R) & NotOp) ^ And1_R + EVT VT = And1_L->getValueType(0); + SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R); + SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R); + SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp); + SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R); + return Xor1; +} + +/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the +/// equivalent `((x ^ y) & m) ^ y)` pattern. +/// This is typically a better representation for targets without a fused +/// "and-not" operation. This function is intended to be called from a +/// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes. +static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) { + // Note that masked-merge variants using XOR or ADD expressions are + // normalized to OR by InstCombine so we only check for OR. 
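// Illustrative sketch (plain C++) of the scalar identity behind the fold above:
// (m & x) | (~m & y) selects x where a mask bit is set and y elsewhere, and
// ((x ^ y) & m) ^ y computes the same value without the ~m, which is what makes
// it attractive when no fused and-not instruction is available.
#include <cassert>
#include <cstdint>

static uint32_t maskedMergeOr(uint32_t m, uint32_t x, uint32_t y) {
  return (m & x) | (~m & y);
}

static uint32_t maskedMergeXor(uint32_t m, uint32_t x, uint32_t y) {
  return ((x ^ y) & m) ^ y;
}

int main() {
  // Spot-check a few values; the two forms agree bit-for-bit.
  for (uint32_t m : {0x0u, 0xFFFFFFFFu, 0x0F0F0F0Fu, 0x12345678u})
    for (uint32_t x : {0x0u, 0xDEADBEEFu, 0xAAAAAAAAu})
      for (uint32_t y : {0xFFFFFFFFu, 0x55555555u, 0xCAFEBABEu})
        assert(maskedMergeOr(m, x, y) == maskedMergeXor(m, x, y));
  return 0;
}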
+ assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node"); + SDValue N0 = Node->getOperand(0); + if (N0->getOpcode() != ISD::AND || !N0->hasOneUse()) + return SDValue(); + SDValue N1 = Node->getOperand(1); + if (N1->getOpcode() != ISD::AND || !N1->hasOneUse()) + return SDValue(); + + SDLoc DL(Node); + SDValue N00 = N0->getOperand(0); + SDValue N01 = N0->getOperand(1); + SDValue N10 = N1->getOperand(0); + SDValue N11 = N1->getOperand(1); + if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG)) + return Result; + if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG)) + return Result; + if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG)) + return Result; + if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG)) + return Result; + return SDValue(); +} + static SDValue combineOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -46670,6 +46731,11 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, return Res; } + // We should fold "masked merge" patterns when `andn` is not available. + if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1) + if (SDValue R = foldMaskedMerge(N, DAG)) + return R; + return SDValue(); } @@ -48504,20 +48570,50 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, SDValue LHS = Src.getOperand(0).getOperand(0); SDValue RHS = Src.getOperand(0).getOperand(1); - unsigned ExtOpc = LHS.getOpcode(); - if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) || - RHS.getOpcode() != ExtOpc) - return SDValue(); - - // Peek through the extends. - LHS = LHS.getOperand(0); - RHS = RHS.getOperand(0); - - // Ensure the input types match. - if (LHS.getValueType() != VT || RHS.getValueType() != VT) - return SDValue(); + // Count leading sign/zero bits on both inputs - if there are enough then + // truncation back to vXi16 will be cheap - either as a pack/shuffle + // sequence or using AVX512 truncations. If the inputs are sext/zext then the + // truncations may actually be free by peeking through to the ext source. + auto IsSext = [&DAG](SDValue V) { + return DAG.ComputeMinSignedBits(V) <= 16; + }; + auto IsZext = [&DAG](SDValue V) { + return DAG.computeKnownBits(V).countMaxActiveBits() <= 16; + }; - unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU; + bool IsSigned = IsSext(LHS) && IsSext(RHS); + bool IsUnsigned = IsZext(LHS) && IsZext(RHS); + if (!IsSigned && !IsUnsigned) + return SDValue(); + + // Check if both inputs are extensions, which will be removed by truncation. + bool IsTruncateFree = (LHS.getOpcode() == ISD::SIGN_EXTEND || + LHS.getOpcode() == ISD::ZERO_EXTEND) && + (RHS.getOpcode() == ISD::SIGN_EXTEND || + RHS.getOpcode() == ISD::ZERO_EXTEND) && + LHS.getOperand(0).getScalarValueSizeInBits() <= 16 && + RHS.getOperand(0).getScalarValueSizeInBits() <= 16; + + // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on + // the (bitcasted) inputs directly, and then cheaply pack/truncate the result + // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU + // will have to split anyway. 
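// Illustrative sketch (plain C++) of what the MULHS/MULHU nodes formed above
// compute: the high 16 bits of the widened 16x16 product, which is all the
// combine needs once both inputs are known to fit in 16 bits.
#include <cassert>
#include <cstdint>

static uint16_t mulhu16(uint16_t a, uint16_t b) {
  return static_cast<uint16_t>((uint32_t(a) * uint32_t(b)) >> 16);
}

static int16_t mulhs16(int16_t a, int16_t b) {
  // Arithmetic right shift of the signed product (as on mainstream targets).
  return static_cast<int16_t>((int32_t(a) * int32_t(b)) >> 16);
}

int main() {
  assert(mulhu16(0xFFFF, 0xFFFF) == 0xFFFE); // 65535 * 65535 = 0xFFFE0001
  assert(mulhs16(-32768, 2) == -1);          // -65536 >> 16
  assert(mulhs16(300, 100) == 0);            // small product, high half is 0
  return 0;
}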
+ unsigned InSizeInBits = InVT.getSizeInBits(); + if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() && + !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) && + (InSizeInBits % 16) == 0) { + EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, + InVT.getSizeInBits() / 16); + SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS), + DAG.getBitcast(BCVT, RHS)); + return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res)); + } + + // Truncate back to source type. + LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS); + RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS); + + unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU; return DAG.getNode(Opc, DL, VT, LHS, RHS); } diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAVX512.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAVX512.td index 8aee96e1c504..1db83033ba35 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAVX512.td @@ -12937,8 +12937,8 @@ def : Pat<(v16i32 (X86vzmovl (iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVW2SHrr GR32:$src), sub_xmm)>; -def : Pat<(v8i16 (X86vzmovl (v8i16 (scalar_to_vector (i16 (trunc GR32:$src)))))), - (VMOVW2SHrr GR32:$src)>; +def : Pat<(v8i16 (X86vzmovl (scalar_to_vector (i16 GR16:$src)))), + (VMOVW2SHrr (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit))>; // AVX 128-bit movw instruction write zeros in the high 128-bit part. def : Pat<(v8i16 (X86vzload16 addr:$src)), diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.cpp index 639aa5199ea5..bb5637a31947 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -1163,8 +1163,7 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, /// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead. bool X86InstrInfo::hasLiveCondCodeDef(MachineInstr &MI) const { - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS && !MO.isDead()) { return true; @@ -5676,10 +5675,8 @@ static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, MachineOperand &MO = MI.getOperand(i + 2); MIB.add(MO); } - for (unsigned i = NumOps + 2, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : llvm::drop_begin(MI.operands(), NumOps + 2)) MIB.add(MO); - } updateOperandRegConstraints(MF, *NewMI, TII); diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterBanks.td b/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterBanks.td index 74c515850ab1..91a497252595 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterBanks.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterBanks.td @@ -1,4 +1,4 @@ -//=- X86RegisterBank.td - Describe the AArch64 Banks -----*- tablegen -*-=// +//=- X86RegisterBank.td - Describe the X86 Banks -------------*- tablegen -*-=// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 06dacb638d16..869762b35196 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1584,54 +1584,98 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 }, // Mask sign extend has an instruction. - { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, 1 }, // Mask zero extend is a sext + shift. 
- { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, 2 }, + + { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, + { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 }, + { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, + { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 }, + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, + { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 }, + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, + { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, + { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, + { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, + { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 }, + { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 }, + { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, 2 }, { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 }, { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm - { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // widen to zmm - { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // widen to zmm { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb - { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // widen to zmm - { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // widen to zmm { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb - { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // widen to zmm - { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 }, // widen to zmm - { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // widen to zmm { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb - { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // widen to zmm - { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // widen to zmm - { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // widen to zmm - { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 }, - { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, 2 }, - { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 }, }; static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = { + // Mask sign extend has an instruction. 
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, + + // Mask zero extend is a sext + shift. + { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, + + { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, + { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, + { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, + { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, 2 }, + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, @@ -1786,40 +1830,94 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] { // Mask sign extend has an instruction. - { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, 1 }, // Mask zero extend is a sext + shift. 
- { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, 2 }, + + { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, + { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 }, + { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, + { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 }, + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, + { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 }, + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, + { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, + { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, + { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, + { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, 2 }, + { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, 2 }, + { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, 2 }, { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, - { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // vpsllw+vptestmb - { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // vpsllw+vptestmw - { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // vpsllw+vptestmb - { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // vpsllw+vptestmw - { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // vpsllw+vptestmb - { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // vpsllw+vptestmw - { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // vpsllw+vptestmb - { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // vpsllw+vptestmw - { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // vpsllw+vptestmb }; static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = { + // Mask sign extend has an instruction. + { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, + + // Mask zero extend is a sext + shift. 
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, + + { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, 2 }, + { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 }, + { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, + { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, + { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, 2 }, + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, + { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, @@ -3674,6 +3772,10 @@ X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, PromEltTyBits = 16; // promote to i16, AVX512BW. break; } + if (ST->hasDQI()) { + PromEltTyBits = 32; // promote to i32, AVX512F. + break; + } return bailout(); default: return bailout(); @@ -3969,7 +4071,9 @@ InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty, // Even in the case of (loop invariant) stride whose value is not known at // compile time, the address computation will not incur more than one extra // ADD instruction. - if (Ty->isVectorTy() && SE) { + if (Ty->isVectorTy() && SE && !ST->hasAVX2()) { + // TODO: AVX2 is the current cut-off because we don't have correct + // interleaving costs for prior ISA's. if (!BaseT::isStridedAccess(Ptr)) return NumVectorInstToHideOverhead; if (!BaseT::getConstantStrideStep(SE, Ptr)) @@ -5173,7 +5277,8 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512( auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(), LegalVT.getVectorNumElements()); InstructionCost MemOpCost; - if (UseMaskForCond || UseMaskForGaps) + bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps; + if (UseMaskedMemOp) MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace, CostKind); else @@ -5183,9 +5288,8 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512( unsigned VF = VecTy->getNumElements() / Factor; MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF); - // FIXME: this is the most conservative estimate for the mask cost. InstructionCost MaskCost; - if (UseMaskForCond || UseMaskForGaps) { + if (UseMaskedMemOp) { APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements()); for (unsigned Index : Indices) { assert(Index < Factor && "Invalid index for interleaved memory op"); @@ -5193,10 +5297,10 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512( DemandedLoadStoreElts.setBit(Index + Elm * Factor); } - Type *I8Type = Type::getInt8Ty(VecTy->getContext()); + Type *I1Type = Type::getInt1Ty(VecTy->getContext()); MaskCost = getReplicationShuffleCost( - I8Type, Factor, VF, + I1Type, Factor, VF, UseMaskForGaps ? DemandedLoadStoreElts : APInt::getAllOnes(VecTy->getNumElements()), CostKind); @@ -5207,7 +5311,7 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512( // memory access, we need to account for the cost of And-ing the two masks // inside the loop. 
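// Illustrative sketch (plain C++) of the two masks being combined for a masked
// interleaved access as described above: a gap mask fixed by which members of
// the interleave group are used, and a per-iteration condition mask; the final
// hardware mask is their AND. Factor, VF, and Indices are made-up values.
#include <cstdio>
#include <vector>

int main() {
  const unsigned Factor = 3;                    // struct-of-3 interleaving
  const unsigned VF = 4;                        // vectorization factor
  const std::vector<unsigned> Indices = {0, 2}; // members 0 and 2 are used

  std::vector<bool> GapMask(VF * Factor, false);
  for (unsigned Index : Indices)
    for (unsigned Elm = 0; Elm < VF; ++Elm)
      GapMask[Index + Elm * Factor] = true;

  // Pretend the loop body is predicated and lane 3 is inactive this iteration.
  std::vector<bool> CondMask(VF * Factor, true);
  for (unsigned I = 0; I < Factor; ++I)
    CondMask[3 * Factor + I] = false;

  for (unsigned I = 0; I < VF * Factor; ++I)
    std::printf("%d", GapMask[I] && CondMask[I] ? 1 : 0);
  std::printf("\n"); // prints 101101101000
  return 0;
}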
if (UseMaskForGaps) { - auto *MaskVT = FixedVectorType::get(I8Type, VecTy->getNumElements()); + auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements()); MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind); } } @@ -5248,9 +5352,10 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512( NumOfLoadsInInterleaveGrp; // About a half of the loads may be folded in shuffles when we have only - // one result. If we have more than one result, we do not fold loads at all. + // one result. If we have more than one result, or the loads are masked, + // we do not fold loads at all. unsigned NumOfUnfoldedLoads = - NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2; + UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2; // Get a number of shuffle operations per result. unsigned NumOfShufflesPerResult = diff --git a/contrib/llvm-project/llvm/lib/Target/XCore/XCoreFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/XCore/XCoreFrameLowering.cpp index 27ac6a4d1439..f2f89f4269ed 100644 --- a/contrib/llvm-project/llvm/lib/Target/XCore/XCoreFrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/XCore/XCoreFrameLowering.cpp @@ -427,19 +427,19 @@ bool XCoreFrameLowering::spillCalleeSavedRegisters( if (MI != MBB.end() && !MI->isDebugInstr()) DL = MI->getDebugLoc(); - for (auto it = CSI.begin(); it != CSI.end(); ++it) { - unsigned Reg = it->getReg(); + for (const CalleeSavedInfo &I : CSI) { + unsigned Reg = I.getReg(); assert(Reg != XCore::LR && !(Reg == XCore::R10 && hasFP(*MF)) && "LR & FP are always handled in emitPrologue"); // Add the callee-saved register as live-in. It's killed at the spill. MBB.addLiveIn(Reg); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.storeRegToStackSlot(MBB, MI, Reg, true, it->getFrameIdx(), RC, TRI); + TII.storeRegToStackSlot(MBB, MI, Reg, true, I.getFrameIdx(), RC, TRI); if (emitFrameMoves) { auto Store = MI; --Store; - XFI->getSpillLabels().push_back(std::make_pair(Store, *it)); + XFI->getSpillLabels().push_back(std::make_pair(Store, I)); } } return true; diff --git a/contrib/llvm-project/llvm/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp b/contrib/llvm-project/llvm/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp index b5dbdea98eea..71836133fae6 100644 --- a/contrib/llvm-project/llvm/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp +++ b/contrib/llvm-project/llvm/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp @@ -48,9 +48,7 @@ bool XCoreFTAOElim::runOnMachineFunction(MachineFunction &MF) { const XCoreInstrInfo &TII = *static_cast<const XCoreInstrInfo *>(MF.getSubtarget().getInstrInfo()); unsigned StackSize = MF.getFrameInfo().getStackSize(); - for (MachineFunction::iterator MFI = MF.begin(), E = MF.end(); MFI != E; - ++MFI) { - MachineBasicBlock &MBB = *MFI; + for (MachineBasicBlock &MBB : MF) { for (MachineBasicBlock::iterator MBBI = MBB.begin(), EE = MBB.end(); MBBI != EE; ++MBBI) { if (MBBI->getOpcode() == XCore::FRAME_TO_ARGS_OFFSET) { diff --git a/contrib/llvm-project/llvm/lib/Target/XCore/XCoreMCInstLower.cpp b/contrib/llvm-project/llvm/lib/Target/XCore/XCoreMCInstLower.cpp index cd28fa5cd144..6f5dcb291e6e 100644 --- a/contrib/llvm-project/llvm/lib/Target/XCore/XCoreMCInstLower.cpp +++ b/contrib/llvm-project/llvm/lib/Target/XCore/XCoreMCInstLower.cpp @@ -103,8 +103,7 @@ MCOperand XCoreMCInstLower::LowerOperand(const MachineOperand &MO, void XCoreMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.setOpcode(MI->getOpcode()); - for (unsigned i = 0, e = 
MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { MCOperand MCOp = LowerOperand(MO); if (MCOp.isValid()) diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp index b2c2efed7db8..ba7589c2bf60 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -25,6 +25,7 @@ #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" @@ -275,94 +276,64 @@ CleanupPointerRootUsers(GlobalVariable *GV, /// We just marked GV constant. Loop over all users of the global, cleaning up /// the obvious ones. This is largely just a quick scan over the use list to /// clean up the easy and obvious cruft. This returns true if it made a change. -static bool CleanupConstantGlobalUsers( - Value *V, Constant *Init, const DataLayout &DL, - function_ref<TargetLibraryInfo &(Function &)> GetTLI) { +static bool CleanupConstantGlobalUsers(GlobalVariable *GV, + const DataLayout &DL) { + Constant *Init = GV->getInitializer(); + SmallVector<User *, 8> WorkList(GV->users()); + SmallPtrSet<User *, 8> Visited; bool Changed = false; - // Note that we need to use a weak value handle for the worklist items. When - // we delete a constant array, we may also be holding pointer to one of its - // elements (or an element of one of its elements if we're dealing with an - // array of arrays) in the worklist. - SmallVector<WeakTrackingVH, 8> WorkList(V->users()); + + SmallVector<WeakTrackingVH> MaybeDeadInsts; + auto EraseFromParent = [&](Instruction *I) { + for (Value *Op : I->operands()) + if (auto *OpI = dyn_cast<Instruction>(Op)) + MaybeDeadInsts.push_back(OpI); + I->eraseFromParent(); + Changed = true; + }; while (!WorkList.empty()) { - Value *UV = WorkList.pop_back_val(); - if (!UV) + User *U = WorkList.pop_back_val(); + if (!Visited.insert(U).second) continue; - User *U = cast<User>(UV); + if (auto *BO = dyn_cast<BitCastOperator>(U)) + append_range(WorkList, BO->users()); + if (auto *ASC = dyn_cast<AddrSpaceCastOperator>(U)) + append_range(WorkList, ASC->users()); + else if (auto *GEP = dyn_cast<GEPOperator>(U)) + append_range(WorkList, GEP->users()); + else if (auto *LI = dyn_cast<LoadInst>(U)) { + // A load from zeroinitializer is always zeroinitializer, regardless of + // any applied offset. + if (Init->isNullValue()) { + LI->replaceAllUsesWith(Constant::getNullValue(LI->getType())); + EraseFromParent(LI); + continue; + } - if (LoadInst *LI = dyn_cast<LoadInst>(U)) { - if (Init) { - if (auto *Casted = - ConstantFoldLoadThroughBitcast(Init, LI->getType(), DL)) { - // Replace the load with the initializer. - LI->replaceAllUsesWith(Casted); - LI->eraseFromParent(); - Changed = true; + Value *PtrOp = LI->getPointerOperand(); + APInt Offset(DL.getIndexTypeSizeInBits(PtrOp->getType()), 0); + PtrOp = PtrOp->stripAndAccumulateConstantOffsets( + DL, Offset, /* AllowNonInbounds */ true); + if (PtrOp == GV) { + if (auto *Value = ConstantFoldLoadFromConst(Init, LI->getType(), + Offset, DL)) { + LI->replaceAllUsesWith(Value); + EraseFromParent(LI); } } } else if (StoreInst *SI = dyn_cast<StoreInst>(U)) { // Store must be unreachable or storing Init into the global. 
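// Illustrative sketch (plain C++) of the load folding above: once a global is
// known constant, a load at a compile-time byte offset can be answered from the
// initializer bytes, and any in-bounds load from a zero initializer is simply
// zero. std::memcpy stands in for ConstantFoldLoadFromConst; the example assumes
// a little-endian host.
#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t foldLoad32(const uint8_t *Init, size_t InitSize, size_t Offset) {
  assert(Offset + sizeof(uint32_t) <= InitSize && "out-of-bounds fold");
  uint32_t V;
  std::memcpy(&V, Init + Offset, sizeof V);
  return V;
}

int main() {
  // A constant global resembling [4 x i32] {1, 2, 3, 4}.
  const uint32_t Words[4] = {1, 2, 3, 4};
  const uint8_t *Bytes = reinterpret_cast<const uint8_t *>(Words);
  assert(foldLoad32(Bytes, sizeof Words, 8) == 3); // byte offset 8 -> element 2

  // A zeroinitializer global: every load folds to 0, whatever the offset.
  const uint8_t Zero[16] = {};
  assert(foldLoad32(Zero, sizeof Zero, 4) == 0);
  return 0;
}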
- SI->eraseFromParent(); - Changed = true; - } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) { - if (CE->getOpcode() == Instruction::GetElementPtr) { - Constant *SubInit = nullptr; - if (Init) - SubInit = ConstantFoldLoadThroughGEPConstantExpr( - Init, CE, V->getType()->getPointerElementType(), DL); - Changed |= CleanupConstantGlobalUsers(CE, SubInit, DL, GetTLI); - } else if ((CE->getOpcode() == Instruction::BitCast && - CE->getType()->isPointerTy()) || - CE->getOpcode() == Instruction::AddrSpaceCast) { - // Pointer cast, delete any stores and memsets to the global. - Changed |= CleanupConstantGlobalUsers(CE, nullptr, DL, GetTLI); - } - - if (CE->use_empty()) { - CE->destroyConstant(); - Changed = true; - } - } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) { - // Do not transform "gepinst (gep constexpr (GV))" here, because forming - // "gepconstexpr (gep constexpr (GV))" will cause the two gep's to fold - // and will invalidate our notion of what Init is. - Constant *SubInit = nullptr; - if (!isa<ConstantExpr>(GEP->getOperand(0))) { - ConstantExpr *CE = dyn_cast_or_null<ConstantExpr>( - ConstantFoldInstruction(GEP, DL, &GetTLI(*GEP->getFunction()))); - if (Init && CE && CE->getOpcode() == Instruction::GetElementPtr) - SubInit = ConstantFoldLoadThroughGEPConstantExpr( - Init, CE, V->getType()->getPointerElementType(), DL); - - // If the initializer is an all-null value and we have an inbounds GEP, - // we already know what the result of any load from that GEP is. - // TODO: Handle splats. - if (Init && isa<ConstantAggregateZero>(Init) && GEP->isInBounds()) - SubInit = Constant::getNullValue(GEP->getResultElementType()); - } - Changed |= CleanupConstantGlobalUsers(GEP, SubInit, DL, GetTLI); - - if (GEP->use_empty()) { - GEP->eraseFromParent(); - Changed = true; - } + EraseFromParent(SI); } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U)) { // memset/cpy/mv - if (MI->getRawDest() == V) { - MI->eraseFromParent(); - Changed = true; - } - - } else if (Constant *C = dyn_cast<Constant>(U)) { - // If we have a chain of dead constantexprs or other things dangling from - // us, and if they are all dead, nuke them without remorse. - if (isSafeToDestroyConstant(C)) { - C->destroyConstant(); - CleanupConstantGlobalUsers(V, Init, DL, GetTLI); - return true; - } + if (getUnderlyingObject(MI->getRawDest()) == GV) + EraseFromParent(MI); } } + + Changed |= + RecursivelyDeleteTriviallyDeadInstructionsPermissive(MaybeDeadInsts); + GV->removeDeadConstantUsers(); return Changed; } @@ -889,7 +860,7 @@ static bool OptimizeAwayTrappingUsesOfLoads( Changed |= CleanupPointerRootUsers(GV, GetTLI); } else { Changed = true; - CleanupConstantGlobalUsers(GV, nullptr, DL, GetTLI); + CleanupConstantGlobalUsers(GV, DL); } if (GV->use_empty()) { LLVM_DEBUG(dbgs() << " *** GLOBAL NOW DEAD!\n"); @@ -1557,8 +1528,7 @@ processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS, } else { // Delete any stores we can find to the global. We may not be able to // make it completely dead though. - Changed = - CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI); + Changed = CleanupConstantGlobalUsers(GV, DL); } // If the global is dead now, delete it. @@ -1583,7 +1553,7 @@ processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS, } // Clean up any obviously simplifiable users now. - Changed |= CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI); + Changed |= CleanupConstantGlobalUsers(GV, DL); // If the global is dead now, just nuke it. 
if (GV->use_empty()) { @@ -1628,7 +1598,7 @@ processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS, GV->setInitializer(SOVConstant); // Clean up any obviously simplifiable users now. - CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI); + CleanupConstantGlobalUsers(GV, DL); if (GV->use_empty()) { LLVM_DEBUG(dbgs() << " *** Substituting initializer allowed us to " diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index f342c35fa283..055ee6b50296 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -1885,6 +1885,7 @@ private: OMPRTL___kmpc_barrier_simple_generic); ExternalizationRAII ThreadId(OMPInfoCache, OMPRTL___kmpc_get_hardware_thread_id_in_block); + ExternalizationRAII WarpSize(OMPInfoCache, OMPRTL___kmpc_get_warp_size); registerAAs(IsModulePass); @@ -3727,12 +3728,37 @@ struct AAKernelInfoFunction : AAKernelInfo { CheckRWInst, *this, UsedAssumedInformationInCheckRWInst)) SPMDCompatibilityTracker.indicatePessimisticFixpoint(); + bool UsedAssumedInformationFromReachingKernels = false; if (!IsKernelEntry) { - updateReachingKernelEntries(A); updateParallelLevels(A); + bool AllReachingKernelsKnown = true; + updateReachingKernelEntries(A, AllReachingKernelsKnown); + UsedAssumedInformationFromReachingKernels = !AllReachingKernelsKnown; + if (!ParallelLevels.isValidState()) SPMDCompatibilityTracker.indicatePessimisticFixpoint(); + else if (!ReachingKernelEntries.isValidState()) + SPMDCompatibilityTracker.indicatePessimisticFixpoint(); + else if (!SPMDCompatibilityTracker.empty()) { + // Check if all reaching kernels agree on the mode as we can otherwise + // not guard instructions. We might not be sure about the mode so we + // we cannot fix the internal spmd-zation state either. + int SPMD = 0, Generic = 0; + for (auto *Kernel : ReachingKernelEntries) { + auto &CBAA = A.getAAFor<AAKernelInfo>( + *this, IRPosition::function(*Kernel), DepClassTy::OPTIONAL); + if (CBAA.SPMDCompatibilityTracker.isValidState() && + CBAA.SPMDCompatibilityTracker.isAssumed()) + ++SPMD; + else + ++Generic; + if (!CBAA.SPMDCompatibilityTracker.isAtFixpoint()) + UsedAssumedInformationFromReachingKernels = true; + } + if (SPMD != 0 && Generic != 0) + SPMDCompatibilityTracker.indicatePessimisticFixpoint(); + } } // Callback to check a call instruction. @@ -3779,7 +3805,8 @@ struct AAKernelInfoFunction : AAKernelInfo { // If we haven't used any assumed information for the SPMD state we can fix // it. if (!UsedAssumedInformationInCheckRWInst && - !UsedAssumedInformationInCheckCallInst && AllSPMDStatesWereFixed) + !UsedAssumedInformationInCheckCallInst && + !UsedAssumedInformationFromReachingKernels && AllSPMDStatesWereFixed) SPMDCompatibilityTracker.indicateOptimisticFixpoint(); return StateBefore == getState() ? ChangeStatus::UNCHANGED @@ -3788,7 +3815,8 @@ struct AAKernelInfoFunction : AAKernelInfo { private: /// Update info regarding reaching kernels. 
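The updated AAKernelInfoFunction logic above refuses to fix the SPMD state when the reaching kernels disagree on their execution mode, since instructions cannot be guarded consistently in that case. A toy model of the agreement check (Mode and allReachingKernelsAgree are illustrative names, not OpenMPOpt API):

#include <cassert>
#include <vector>

enum class Mode { SPMD, Generic };

// Mixed modes among reaching kernels force a pessimistic fixpoint in the pass;
// agreement (all SPMD or all Generic) leaves the optimistic state intact.
static bool allReachingKernelsAgree(const std::vector<Mode> &Kernels) {
  int SPMD = 0, Generic = 0;
  for (Mode M : Kernels)
    (M == Mode::SPMD ? SPMD : Generic)++;
  return SPMD == 0 || Generic == 0;
}

int main() {
  assert(allReachingKernelsAgree({Mode::SPMD, Mode::SPMD}));
  assert(!allReachingKernelsAgree({Mode::SPMD, Mode::Generic}));
}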
- void updateReachingKernelEntries(Attributor &A) { + void updateReachingKernelEntries(Attributor &A, + bool &AllReachingKernelsKnown) { auto PredCallSite = [&](AbstractCallSite ACS) { Function *Caller = ACS.getInstruction()->getFunction(); @@ -3808,10 +3836,9 @@ private: return true; }; - bool AllCallSitesKnown; if (!A.checkForAllCallSites(PredCallSite, *this, true /* RequireAllCallSites */, - AllCallSitesKnown)) + AllReachingKernelsKnown)) ReachingKernelEntries.indicatePessimisticFixpoint(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp index 7402e399a88a..2d717475ce7f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp @@ -641,8 +641,7 @@ PartialInlinerImpl::computeOutliningInfo(Function &F) const { if (!CandidateFound) return std::unique_ptr<FunctionOutliningInfo>(); - // Do sanity check of the entries: threre should not - // be any successors (not in the entry set) other than + // There should not be any successors (not in the entry set) other than // {ReturnBlock, NonReturnBlock} assert(OutliningInfo->Entries[0] == &F.front() && "Function Entry must be the first in Entries vector"); diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp index a961c47a7501..b8fac9d47763 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -84,6 +84,7 @@ #include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Utils/CallPromotionUtils.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/SampleProfileInference.h" #include "llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h" #include "llvm/Transforms/Utils/SampleProfileLoaderBaseUtil.h" #include <algorithm> @@ -173,6 +174,9 @@ static cl::opt<bool> cl::desc("Process functions in a top-down order " "defined by the profiled call graph when " "-sample-profile-top-down-load is on.")); +cl::opt<bool> + SortProfiledSCC("sort-profiled-scc-member", cl::init(true), cl::Hidden, + cl::desc("Sort profiled recursion by edge weights.")); static cl::opt<bool> ProfileSizeInline( "sample-profile-inline-size", cl::Hidden, cl::init(false), @@ -1648,6 +1652,19 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) { SmallVector<uint32_t, 4> Weights; uint32_t MaxWeight = 0; Instruction *MaxDestInst; + // Since profi treats multiple edges (multiway branches) as a single edge, + // we need to distribute the computed weight among the branches. We do + // this by evenly splitting the edge weight among destinations. 
+ DenseMap<const BasicBlock *, uint64_t> EdgeMultiplicity; + std::vector<uint64_t> EdgeIndex; + if (SampleProfileUseProfi) { + EdgeIndex.resize(TI->getNumSuccessors()); + for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) { + const BasicBlock *Succ = TI->getSuccessor(I); + EdgeIndex[I] = EdgeMultiplicity[Succ]; + EdgeMultiplicity[Succ]++; + } + } for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) { BasicBlock *Succ = TI->getSuccessor(I); Edge E = std::make_pair(BB, Succ); @@ -1660,9 +1677,19 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) { LLVM_DEBUG(dbgs() << " (saturated due to uint32_t overflow)"); Weight = std::numeric_limits<uint32_t>::max(); } - // Weight is added by one to avoid propagation errors introduced by - // 0 weights. - Weights.push_back(static_cast<uint32_t>(Weight + 1)); + if (!SampleProfileUseProfi) { + // Weight is added by one to avoid propagation errors introduced by + // 0 weights. + Weights.push_back(static_cast<uint32_t>(Weight + 1)); + } else { + // Profi creates proper weights that do not require "+1" adjustments but + // we evenly split the weight among branches with the same destination. + uint64_t W = Weight / EdgeMultiplicity[Succ]; + // Rounding up, if needed, so that first branches are hotter. + if (EdgeIndex[I] < Weight % EdgeMultiplicity[Succ]) + W++; + Weights.push_back(static_cast<uint32_t>(W)); + } if (Weight != 0) { if (Weight > MaxWeight) { MaxWeight = Weight; @@ -1853,7 +1880,13 @@ SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) { std::unique_ptr<ProfiledCallGraph> ProfiledCG = buildProfiledCallGraph(*CG); scc_iterator<ProfiledCallGraph *> CGI = scc_begin(ProfiledCG.get()); while (!CGI.isAtEnd()) { - for (ProfiledCallGraphNode *Node : *CGI) { + auto Range = *CGI; + if (SortProfiledSCC) { + // Sort nodes in one SCC based on callsite hotness. + scc_member_iterator<ProfiledCallGraph *> SI(*CGI); + Range = *SI; + } + for (auto *Node : Range) { Function *F = SymbolMap.lookup(Node->Name); if (F && !F->isDeclaration() && F->hasFnAttribute("use-sample-profile")) FunctionOrderList.push_back(F); diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 06c9bf650f37..dc55b5a31596 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1727,16 +1727,18 @@ static Instruction *foldComplexAndOrPatterns(BinaryOperator &I, (Opcode == Instruction::And) ? Instruction::Or : Instruction::And; Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); - Value *A, *B, *C; + Value *A, *B, *C, *X, *Y; // (~(A | B) & C) | ... --> ... // (~(A & B) | C) & ... --> ... // TODO: One use checks are conservative. We just need to check that a total // number of multiple used values does not exceed reduction // in operations. 
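The EdgeMultiplicity bookkeeping above feeds the even split applied a few lines further down: a weight W over M duplicate edges becomes W / M per edge, with the first W % M edges rounded up so the total is preserved and earlier branches stay hotter. A standalone check of that arithmetic (splitWeight is a hypothetical helper, not part of the patch):

#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<uint64_t> splitWeight(uint64_t Weight, uint64_t Multiplicity) {
  std::vector<uint64_t> Out;
  for (uint64_t Index = 0; Index < Multiplicity; ++Index) {
    uint64_t W = Weight / Multiplicity;
    if (Index < Weight % Multiplicity)
      ++W; // round up the first branches so they end up hotter
    Out.push_back(W);
  }
  return Out;
}

int main() {
  auto W = splitWeight(10, 3);                 // -> {4, 3, 3}
  assert(W[0] == 4 && W[1] == 3 && W[2] == 3);
  assert(W[0] + W[1] + W[2] == 10);            // the total weight is preserved
}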
- if (match(Op0, m_c_BinOp(FlippedOpcode, - m_Not(m_BinOp(Opcode, m_Value(A), m_Value(B))), - m_Value(C)))) { + if (match(Op0, + m_c_BinOp(FlippedOpcode, + m_CombineAnd(m_Value(X), m_Not(m_BinOp(Opcode, m_Value(A), + m_Value(B)))), + m_Value(C)))) { // (~(A | B) & C) | (~(A | C) & B) --> (B ^ C) & ~A // (~(A & B) | C) & (~(A & C) | B) --> ~((B ^ C) & A) if (match(Op1, @@ -1776,6 +1778,21 @@ static Instruction *foldComplexAndOrPatterns(BinaryOperator &I, m_c_BinOp(Opcode, m_Specific(B), m_Specific(C))))))) return BinaryOperator::CreateNot(Builder.CreateBinOp( Opcode, Builder.CreateBinOp(FlippedOpcode, A, C), B)); + + // (~(A | B) & C) | ~(C | (A ^ B)) --> ~((A | B) & (C | (A ^ B))) + // Note, the pattern with swapped and/or is not handled because the + // result is more undefined than a source: + // (~(A & B) | C) & ~(C & (A ^ B)) --> (A ^ B ^ C) | ~(A | C) is invalid. + if (Opcode == Instruction::Or && Op0->hasOneUse() && + match(Op1, m_OneUse(m_Not(m_CombineAnd( + m_Value(Y), + m_c_BinOp(Opcode, m_Specific(C), + m_c_Xor(m_Specific(A), m_Specific(B)))))))) { + // X = ~(A | B) + // Y = (C | (A ^ B) + Value *Or = cast<BinaryOperator>(X)->getOperand(0); + return BinaryOperator::CreateNot(Builder.CreateAnd(Or, Y)); + } } return nullptr; @@ -2061,7 +2078,14 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) { if (Instruction *CastedAnd = foldCastedBitwiseLogic(I)) return CastedAnd; + if (Instruction *Sel = foldBinopOfSextBoolToSelect(I)) + return Sel; + // and(sext(A), B) / and(B, sext(A)) --> A ? B : 0, where A is i1 or <N x i1>. + // TODO: Move this into foldBinopOfSextBoolToSelect as a more generalized fold + // with binop identity constant. But creating a select with non-constant + // arm may not be reversible due to poison semantics. Is that a good + // canonicalization? Value *A; if (match(Op0, m_OneUse(m_SExt(m_Value(A)))) && A->getType()->isIntOrIntVectorTy(1)) @@ -2322,11 +2346,20 @@ Value *InstCombinerImpl::getSelectCondition(Value *A, Value *B) { Value *Cond; Value *NotB; if (match(A, m_SExt(m_Value(Cond))) && - Cond->getType()->isIntOrIntVectorTy(1) && - match(B, m_OneUse(m_Not(m_Value(NotB))))) { - NotB = peekThroughBitcast(NotB, true); - if (match(NotB, m_SExt(m_Specific(Cond)))) + Cond->getType()->isIntOrIntVectorTy(1)) { + // A = sext i1 Cond; B = sext (not (i1 Cond)) + if (match(B, m_SExt(m_Not(m_Specific(Cond))))) return Cond; + + // A = sext i1 Cond; B = not ({bitcast} (sext (i1 Cond))) + // TODO: The one-use checks are unnecessary or misplaced. If the caller + // checked for uses on logic ops/casts, that should be enough to + // make this transform worthwhile. + if (match(B, m_OneUse(m_Not(m_Value(NotB))))) { + NotB = peekThroughBitcast(NotB, true); + if (match(NotB, m_SExt(m_Specific(Cond)))) + return Cond; + } } // All scalar (and most vector) possibilities should be handled now. @@ -2569,7 +2602,8 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { return replaceInstUsesWith(I, V); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); - if (I.getType()->isIntOrIntVectorTy(1)) { + Type *Ty = I.getType(); + if (Ty->isIntOrIntVectorTy(1)) { if (auto *SI0 = dyn_cast<SelectInst>(Op0)) { if (auto *I = foldAndOrOfSelectUsingImpliedCond(Op1, *SI0, /* IsAnd */ false)) @@ -2602,7 +2636,16 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { // (X ^ C) | Y -> (X | Y) ^ C iff Y & C == 0 // The check for a 'not' op is for efficiency (if Y is known zero --> ~X). 
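The fold added in this hunk, (~(A | B) & C) | ~(C | (A ^ B)) --> ~((A | B) & (C | (A ^ B))), is a pure bit identity, so it can be verified exhaustively on a narrow type; a throwaway brute-force check over all i8 triples:

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned Ai = 0; Ai < 256; ++Ai)
    for (unsigned Bi = 0; Bi < 256; ++Bi)
      for (unsigned Ci = 0; Ci < 256; ++Ci) {
        uint8_t A = Ai, B = Bi, C = Ci;
        uint8_t LHS = (uint8_t)((~(A | B) & C) | ~(C | (A ^ B)));
        uint8_t RHS = (uint8_t)~((A | B) & (C | (A ^ B)));
        assert(LHS == RHS); // both sides agree for every bit pattern
      }
}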
Value *Or = Builder.CreateOr(X, Y); - return BinaryOperator::CreateXor(Or, ConstantInt::get(I.getType(), *CV)); + return BinaryOperator::CreateXor(Or, ConstantInt::get(Ty, *CV)); + } + + // If the operands have no common bits set: + // or (mul X, Y), X --> add (mul X, Y), X --> mul X, (Y + 1) + if (match(&I, + m_c_Or(m_OneUse(m_Mul(m_Value(X), m_Value(Y))), m_Deferred(X))) && + haveNoCommonBitsSet(Op0, Op1, DL)) { + Value *IncrementY = Builder.CreateAdd(Y, ConstantInt::get(Ty, 1)); + return BinaryOperator::CreateMul(X, IncrementY); } // (A & C) | (B & D) @@ -2635,14 +2678,14 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { // iff (C0 & C1) == 0 and (X & ~C0) == 0 if (match(A, m_c_Or(m_Value(X), m_Specific(B))) && MaskedValueIsZero(X, ~*C0, 0, &I)) { - Constant *C01 = ConstantInt::get(I.getType(), *C0 | *C1); + Constant *C01 = ConstantInt::get(Ty, *C0 | *C1); return BinaryOperator::CreateAnd(A, C01); } // (A & C0) | ((X | A) & C1) --> (X | A) & (C0 | C1) // iff (C0 & C1) == 0 and (X & ~C1) == 0 if (match(B, m_c_Or(m_Value(X), m_Specific(A))) && MaskedValueIsZero(X, ~*C1, 0, &I)) { - Constant *C01 = ConstantInt::get(I.getType(), *C0 | *C1); + Constant *C01 = ConstantInt::get(Ty, *C0 | *C1); return BinaryOperator::CreateAnd(B, C01); } // ((X | C2) & C0) | ((X | C3) & C1) --> (X | C2 | C3) & (C0 | C1) @@ -2652,7 +2695,7 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { match(B, m_Or(m_Specific(X), m_APInt(C3))) && (*C2 & ~*C0).isZero() && (*C3 & ~*C1).isZero()) { Value *Or = Builder.CreateOr(X, *C2 | *C3, "bitfield"); - Constant *C01 = ConstantInt::get(I.getType(), *C0 | *C1); + Constant *C01 = ConstantInt::get(Ty, *C0 | *C1); return BinaryOperator::CreateAnd(Or, C01); } } @@ -2788,13 +2831,20 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { if (Instruction *CastedOr = foldCastedBitwiseLogic(I)) return CastedOr; + if (Instruction *Sel = foldBinopOfSextBoolToSelect(I)) + return Sel; + // or(sext(A), B) / or(B, sext(A)) --> A ? -1 : B, where A is i1 or <N x i1>. + // TODO: Move this into foldBinopOfSextBoolToSelect as a more generalized fold + // with binop identity constant. But creating a select with non-constant + // arm may not be reversible due to poison semantics. Is that a good + // canonicalization? if (match(Op0, m_OneUse(m_SExt(m_Value(A)))) && A->getType()->isIntOrIntVectorTy(1)) - return SelectInst::Create(A, ConstantInt::getSigned(I.getType(), -1), Op1); + return SelectInst::Create(A, ConstantInt::getAllOnesValue(Ty), Op1); if (match(Op1, m_OneUse(m_SExt(m_Value(A)))) && A->getType()->isIntOrIntVectorTy(1)) - return SelectInst::Create(A, ConstantInt::getSigned(I.getType(), -1), Op0); + return SelectInst::Create(A, ConstantInt::getAllOnesValue(Ty), Op0); // Note: If we've gotten to the point of visiting the outer OR, then the // inner one couldn't be simplified. If it was a constant, then it won't @@ -2826,7 +2876,6 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { // or(ashr(subNSW(Y, X), ScalarSizeInBits(Y) - 1), X) --> X s> Y ? -1 : X. 
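The haveNoCommonBitsSet guard above is what lets OR act as ADD, so (mul X, Y) | X collapses to mul X, (Y + 1); an exhaustive i8 check of the precondition and conclusion:

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned Xi = 0; Xi < 256; ++Xi)
    for (unsigned Yi = 0; Yi < 256; ++Yi) {
      uint8_t X = Xi, Y = Yi;
      uint8_t Mul = (uint8_t)(X * Y);
      if ((Mul & X) == 0)  // the no-common-bits precondition
        assert((uint8_t)(Mul | X) == (uint8_t)(X * (Y + 1)));
    }
}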
{ Value *X, *Y; - Type *Ty = I.getType(); if (match(&I, m_c_Or(m_OneUse(m_AShr( m_NSWSub(m_Value(Y), m_Value(X)), m_SpecificInt(Ty->getScalarSizeInBits() - 1))), @@ -2876,7 +2925,6 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { if (match(&I, m_c_Or(m_Add(m_Shl(m_One(), m_Value(X)), m_AllOnes()), m_Shl(m_One(), m_Deferred(X)))) && match(&I, m_c_Or(m_OneUse(m_Value()), m_Value()))) { - Type *Ty = X->getType(); Value *Sub = Builder.CreateSub( ConstantInt::get(Ty, Ty->getScalarSizeInBits() - 1), X); return BinaryOperator::CreateLShr(Constant::getAllOnesValue(Ty), Sub); @@ -3601,6 +3649,14 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) { if (match(&I, m_c_Xor(m_c_And(m_Not(m_Value(A)), m_Value(B)), m_Deferred(A)))) return BinaryOperator::CreateOr(A, B); + // (~A | B) ^ A --> ~(A & B) + if (match(Op0, m_OneUse(m_c_Or(m_Not(m_Specific(Op1)), m_Value(B))))) + return BinaryOperator::CreateNot(Builder.CreateAnd(Op1, B)); + + // A ^ (~A | B) --> ~(A & B) + if (match(Op1, m_OneUse(m_c_Or(m_Not(m_Specific(Op0)), m_Value(B))))) + return BinaryOperator::CreateNot(Builder.CreateAnd(Op0, B)); + // (A | B) ^ (A | C) --> (B ^ C) & ~A -- There are 4 commuted variants. // TODO: Loosen one-use restriction if common operand is a constant. Value *D; diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index bfa7bfa2290a..7da2669e1d13 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -2641,7 +2641,7 @@ Instruction *InstCombinerImpl::visitCallBase(CallBase &Call) { ArgNo++; } - assert(ArgNo == Call.arg_size() && "sanity check"); + assert(ArgNo == Call.arg_size() && "Call arguments not processed correctly."); if (!ArgNos.empty()) { AttributeList AS = Call.getAttributes(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index ca87477c5d81..33f217659c01 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -2771,7 +2771,7 @@ Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) { if (match(Src, m_OneUse(m_InsertElt(m_OneUse(m_BitCast(m_Value(X))), m_Value(Y), m_ConstantInt(IndexC)))) && DestTy->isIntegerTy() && X->getType() == DestTy && - isDesirableIntType(BitWidth)) { + Y->getType()->isIntegerTy() && isDesirableIntType(BitWidth)) { // Adjust for big endian - the LSBs are at the high index. 
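The two xor folds added to visitXor in this hunk reduce to one bit identity, (~A | B) ^ A == ~(A & B); the commuted A ^ (~A | B) form follows by symmetry. Brute-forced for i8:

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned Ai = 0; Ai < 256; ++Ai)
    for (unsigned Bi = 0; Bi < 256; ++Bi) {
      uint8_t A = Ai, B = Bi;
      assert((uint8_t)((~A | B) ^ A) == (uint8_t)~(A & B));
    }
}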
if (DL.isBigEndian()) IndexC = SrcVTy->getNumElements() - 1 - IndexC; diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 7a9e177f19da..ed53b88aed61 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/APSInt.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/CmpInstAnalysis.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -1894,23 +1895,6 @@ Instruction *InstCombinerImpl::foldICmpAndConstant(ICmpInst &Cmp, return new ICmpInst(NewPred, X, SubOne(cast<Constant>(Cmp.getOperand(1)))); } - // (X & C2) == 0 -> (trunc X) >= 0 - // (X & C2) != 0 -> (trunc X) < 0 - // iff C2 is a power of 2 and it masks the sign bit of a legal integer type. - const APInt *C2; - if (And->hasOneUse() && C.isZero() && match(Y, m_APInt(C2))) { - int32_t ExactLogBase2 = C2->exactLogBase2(); - if (ExactLogBase2 != -1 && DL.isLegalInteger(ExactLogBase2 + 1)) { - Type *NTy = IntegerType::get(Cmp.getContext(), ExactLogBase2 + 1); - if (auto *AndVTy = dyn_cast<VectorType>(And->getType())) - NTy = VectorType::get(NTy, AndVTy->getElementCount()); - Value *Trunc = Builder.CreateTrunc(X, NTy); - auto NewPred = - Pred == CmpInst::ICMP_EQ ? CmpInst::ICMP_SGE : CmpInst::ICMP_SLT; - return new ICmpInst(NewPred, Trunc, Constant::getNullValue(NTy)); - } - } - return nullptr; } @@ -2803,7 +2787,8 @@ bool InstCombinerImpl::matchThreeWayIntCompare(SelectInst *SI, Value *&LHS, PredB, cast<Constant>(RHS2)); if (!FlippedStrictness) return false; - assert(FlippedStrictness->first == ICmpInst::ICMP_SGE && "Sanity check"); + assert(FlippedStrictness->first == ICmpInst::ICMP_SGE && + "basic correctness failure"); RHS2 = FlippedStrictness->second; // And kind-of perform the result swap. std::swap(Less, Greater); @@ -4614,7 +4599,7 @@ Instruction *InstCombinerImpl::foldICmpEquality(ICmpInst &I) { static Instruction *foldICmpWithTrunc(ICmpInst &ICmp, InstCombiner::BuilderTy &Builder) { - const ICmpInst::Predicate Pred = ICmp.getPredicate(); + ICmpInst::Predicate Pred = ICmp.getPredicate(); Value *Op0 = ICmp.getOperand(0), *Op1 = ICmp.getOperand(1); // Try to canonicalize trunc + compare-to-constant into a mask + cmp. @@ -4624,41 +4609,31 @@ static Instruction *foldICmpWithTrunc(ICmpInst &ICmp, if (!match(Op0, m_OneUse(m_Trunc(m_Value(X)))) || !match(Op1, m_APInt(C))) return nullptr; + // This matches patterns corresponding to tests of the signbit as well as: + // (trunc X) u< C --> (X & -C) == 0 (are all masked-high-bits clear?) + // (trunc X) u> C --> (X & ~C) != 0 (are any masked-high-bits set?) + APInt Mask; + if (decomposeBitTestICmp(Op0, Op1, Pred, X, Mask, true /* WithTrunc */)) { + Value *And = Builder.CreateAnd(X, Mask); + Constant *Zero = ConstantInt::getNullValue(X->getType()); + return new ICmpInst(Pred, And, Zero); + } + unsigned SrcBits = X->getType()->getScalarSizeInBits(); - if (Pred == ICmpInst::ICMP_ULT) { - if (C->isPowerOf2()) { - // If C is a power-of-2 (one set bit): - // (trunc X) u< C --> (X & -C) == 0 (are all masked-high-bits clear?) 
- Constant *MaskC = ConstantInt::get(X->getType(), (-*C).zext(SrcBits)); - Value *And = Builder.CreateAnd(X, MaskC); - Constant *Zero = ConstantInt::getNullValue(X->getType()); - return new ICmpInst(ICmpInst::ICMP_EQ, And, Zero); - } + if (Pred == ICmpInst::ICMP_ULT && C->isNegatedPowerOf2()) { // If C is a negative power-of-2 (high-bit mask): // (trunc X) u< C --> (X & C) != C (are any masked-high-bits clear?) - if (C->isNegatedPowerOf2()) { - Constant *MaskC = ConstantInt::get(X->getType(), C->zext(SrcBits)); - Value *And = Builder.CreateAnd(X, MaskC); - return new ICmpInst(ICmpInst::ICMP_NE, And, MaskC); - } + Constant *MaskC = ConstantInt::get(X->getType(), C->zext(SrcBits)); + Value *And = Builder.CreateAnd(X, MaskC); + return new ICmpInst(ICmpInst::ICMP_NE, And, MaskC); } - if (Pred == ICmpInst::ICMP_UGT) { - // If C is a low-bit-mask (C+1 is a power-of-2): - // (trunc X) u> C --> (X & ~C) != 0 (are any masked-high-bits set?) - if (C->isMask()) { - Constant *MaskC = ConstantInt::get(X->getType(), (~*C).zext(SrcBits)); - Value *And = Builder.CreateAnd(X, MaskC); - Constant *Zero = ConstantInt::getNullValue(X->getType()); - return new ICmpInst(ICmpInst::ICMP_NE, And, Zero); - } + if (Pred == ICmpInst::ICMP_UGT && (~*C).isPowerOf2()) { // If C is not-of-power-of-2 (one clear bit): // (trunc X) u> C --> (X & (C+1)) == C+1 (are all masked-high-bits set?) - if ((~*C).isPowerOf2()) { - Constant *MaskC = ConstantInt::get(X->getType(), (*C + 1).zext(SrcBits)); - Value *And = Builder.CreateAnd(X, MaskC); - return new ICmpInst(ICmpInst::ICMP_EQ, And, MaskC); - } + Constant *MaskC = ConstantInt::get(X->getType(), (*C + 1).zext(SrcBits)); + Value *And = Builder.CreateAnd(X, MaskC); + return new ICmpInst(ICmpInst::ICMP_EQ, And, MaskC); } return nullptr; diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 72e1b21e8d49..20c75188ec9f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -319,6 +319,7 @@ private: Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN); Instruction *foldBitcastExtElt(ExtractElementInst &ExtElt); Instruction *foldCastedBitwiseLogic(BinaryOperator &I); + Instruction *foldBinopOfSextBoolToSelect(BinaryOperator &I); Instruction *narrowBinOp(TruncInst &Trunc); Instruction *narrowMaskedBinOp(BinaryOperator &And); Instruction *narrowMathIfNoOverflow(BinaryOperator &I); diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp index 7dc516c6fdc3..42ba4a34a5a9 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp @@ -403,7 +403,7 @@ LLVM_NODISCARD Value *Negator::visitImpl(Value *V, unsigned Depth) { NonNegatedOps.emplace_back(Op); // Just record which operand that was. } assert((NegatedOps.size() + NonNegatedOps.size()) == 2 && - "Internal consistency sanity check."); + "Internal consistency check failed."); // Did we manage to sink negation into both of the operands? if (NegatedOps.size() == 2) // Then we get to keep the `add`! 
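decomposeBitTestICmp now covers the power-of-two and low-bit-mask cases that were spelled out by hand before; the underlying equivalences are easy to brute-force for a trunc from i16 to i8 (a standalone check, not the LLVM helper itself):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X = 0; X <= 0xFFFF; ++X) {
    uint8_t T = (uint8_t)X;                    // trunc i16 X to i8
    for (unsigned Shift = 0; Shift < 8; ++Shift) {
      uint8_t C = (uint8_t)(1u << Shift);      // power-of-two constant
      uint16_t MaskLT = (uint16_t)(uint8_t)-C; // zext(-C) back to i16
      assert((T < C) == ((X & MaskLT) == 0));  // (trunc X) u< C <=> (X & -C) == 0

      uint8_t M = (uint8_t)(C - 1);            // low-bit mask
      uint16_t MaskGT = (uint16_t)(uint8_t)~M; // zext(~C) back to i16
      assert((T > M) == ((X & MaskGT) != 0));  // (trunc X) u> C <=> (X & ~C) != 0
    }
  }
}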
return Builder.CreateAdd(NegatedOps[0], NegatedOps[1], diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 4a1e82ae9c1d..518d3952dce5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -246,12 +246,16 @@ static Value *foldSelectICmpAnd(SelectInst &Sel, ICmpInst *Cmp, static unsigned getSelectFoldableOperands(BinaryOperator *I) { switch (I->getOpcode()) { case Instruction::Add: + case Instruction::FAdd: case Instruction::Mul: + case Instruction::FMul: case Instruction::And: case Instruction::Or: case Instruction::Xor: return 3; // Can fold through either operand. case Instruction::Sub: // Can only fold on the amount subtracted. + case Instruction::FSub: + case Instruction::FDiv: // Can only fold on the divisor amount. case Instruction::Shl: // Can only fold on the shift amount. case Instruction::LShr: case Instruction::AShr: diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 47b6dcb67a78..1f81624f79e7 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -967,6 +967,29 @@ Value *InstCombinerImpl::dyn_castNegVal(Value *V) const { return nullptr; } +/// A binop with a constant operand and a sign-extended boolean operand may be +/// converted into a select of constants by applying the binary operation to +/// the constant with the two possible values of the extended boolean (0 or -1). +Instruction *InstCombinerImpl::foldBinopOfSextBoolToSelect(BinaryOperator &BO) { + // TODO: Handle non-commutative binop (constant is operand 0). + // TODO: Handle zext. + // TODO: Peek through 'not' of cast. + Value *BO0 = BO.getOperand(0); + Value *BO1 = BO.getOperand(1); + Value *X; + Constant *C; + if (!match(BO0, m_SExt(m_Value(X))) || !match(BO1, m_ImmConstant(C)) || + !X->getType()->isIntOrIntVectorTy(1)) + return nullptr; + + // bo (sext i1 X), C --> select X, (bo -1, C), (bo 0, C) + Constant *Ones = ConstantInt::getAllOnesValue(BO.getType()); + Constant *Zero = ConstantInt::getNullValue(BO.getType()); + Constant *TVal = ConstantExpr::get(BO.getOpcode(), Ones, C); + Constant *FVal = ConstantExpr::get(BO.getOpcode(), Zero, C); + return SelectInst::Create(X, TVal, FVal); +} + static Value *foldOperationIntoSelectOperand(Instruction &I, Value *SO, InstCombiner::BuilderTy &Builder) { if (auto *Cast = dyn_cast<CastInst>(&I)) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index b56329ad76ae..bd2dc8d639fc 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -6,7 +6,8 @@ // //===----------------------------------------------------------------------===// // -// This file is a part of AddressSanitizer, an address sanity checker. +// This file is a part of AddressSanitizer, an address basic correctness +// checker. 
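foldBinopOfSextBoolToSelect above relies on a sign-extended i1 only ever being 0 or -1, so the binop can be pre-evaluated for both constants and selected on the boolean; a minimal i32 illustration:

#include <cassert>
#include <cstdint>

static int32_t sext_i1(bool X) { return X ? -1 : 0; }

int main() {
  const int32_t C = 42;
  for (bool X : {false, true}) {
    int32_t S = sext_i1(X);
    assert((S + C) == (X ? (-1 + C) : (0 + C))); // add (sext X), C
    assert((S & C) == (X ? (-1 & C) : (0 & C))); // and (sext X), C
    assert((S ^ C) == (X ? (-1 ^ C) : (0 ^ C))); // xor (sext X), C
  }
}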
// Details of the algorithm: // https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm // diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 62c265e40dab..8d3bc1383e96 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -7,8 +7,8 @@ //===----------------------------------------------------------------------===// // /// \file -/// This file is a part of HWAddressSanitizer, an address sanity checker -/// based on tagged addressing. +/// This file is a part of HWAddressSanitizer, an address basic correctness +/// checker based on tagged addressing. //===----------------------------------------------------------------------===// #include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index 36a66e096382..d1d3b8ffdf7a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -64,10 +64,10 @@ cl::opt<bool> DoHashBasedCounterSplit( cl::desc("Rename counter variable of a comdat function based on cfg hash"), cl::init(true)); -cl::opt<bool> RuntimeCounterRelocation( - "runtime-counter-relocation", - cl::desc("Enable relocating counters at runtime."), - cl::init(false)); +cl::opt<bool> + RuntimeCounterRelocation("runtime-counter-relocation", + cl::desc("Enable relocating counters at runtime."), + cl::init(false)); cl::opt<bool> ValueProfileStaticAlloc( "vp-static-alloc", @@ -331,8 +331,9 @@ private: // Check whether the loop satisfies the basic conditions needed to perform // Counter Promotions. - bool isPromotionPossible(Loop *LP, - const SmallVectorImpl<BasicBlock *> &LoopExitBlocks) { + bool + isPromotionPossible(Loop *LP, + const SmallVectorImpl<BasicBlock *> &LoopExitBlocks) { // We can't insert into a catchswitch. 
if (llvm::any_of(LoopExitBlocks, [](BasicBlock *Exit) { return isa<CatchSwitchInst>(Exit->getTerminator()); @@ -421,13 +422,13 @@ PreservedAnalyses InstrProfiling::run(Module &M, ModuleAnalysisManager &AM) { } char InstrProfilingLegacyPass::ID = 0; -INITIALIZE_PASS_BEGIN( - InstrProfilingLegacyPass, "instrprof", - "Frontend instrumentation-based coverage lowering.", false, false) +INITIALIZE_PASS_BEGIN(InstrProfilingLegacyPass, "instrprof", + "Frontend instrumentation-based coverage lowering.", + false, false) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_END( - InstrProfilingLegacyPass, "instrprof", - "Frontend instrumentation-based coverage lowering.", false, false) +INITIALIZE_PASS_END(InstrProfilingLegacyPass, "instrprof", + "Frontend instrumentation-based coverage lowering.", false, + false) ModulePass * llvm::createInstrProfilingLegacyPass(const InstrProfOptions &Options, @@ -634,13 +635,9 @@ void InstrProfiling::computeNumValueSiteCounts(InstrProfValueProfileInst *Ind) { GlobalVariable *Name = Ind->getName(); uint64_t ValueKind = Ind->getValueKind()->getZExtValue(); uint64_t Index = Ind->getIndex()->getZExtValue(); - auto It = ProfileDataMap.find(Name); - if (It == ProfileDataMap.end()) { - PerFunctionProfileData PD; - PD.NumValueSites[ValueKind] = Index + 1; - ProfileDataMap[Name] = PD; - } else if (It->second.NumValueSites[ValueKind] <= Index) - It->second.NumValueSites[ValueKind] = Index + 1; + auto &PD = ProfileDataMap[Name]; + PD.NumValueSites[ValueKind] = + std::max(PD.NumValueSites[ValueKind], (uint32_t)(Index + 1)); } void InstrProfiling::lowerValueProfileInst(InstrProfValueProfileInst *Ind) { @@ -703,14 +700,15 @@ void InstrProfiling::lowerIncrement(InstrProfIncrementInst *Inc) { LoadInst *LI = dyn_cast<LoadInst>(&I); if (!LI) { IRBuilder<> Builder(&I); - GlobalVariable *Bias = M->getGlobalVariable(getInstrProfCounterBiasVarName()); + GlobalVariable *Bias = + M->getGlobalVariable(getInstrProfCounterBiasVarName()); if (!Bias) { // Compiler must define this variable when runtime counter relocation // is being used. Runtime has a weak external reference that is used // to check whether that's the case or not. - Bias = new GlobalVariable(*M, Int64Ty, false, GlobalValue::LinkOnceODRLinkage, - Constant::getNullValue(Int64Ty), - getInstrProfCounterBiasVarName()); + Bias = new GlobalVariable( + *M, Int64Ty, false, GlobalValue::LinkOnceODRLinkage, + Constant::getNullValue(Int64Ty), getInstrProfCounterBiasVarName()); Bias->setVisibility(GlobalVariable::HiddenVisibility); // A definition that's weak (linkonce_odr) without being in a COMDAT // section wouldn't lead to link errors, but it would lead to a dead @@ -839,8 +837,7 @@ static bool needsRuntimeRegistrationOfSectionRange(const Triple &TT) { return false; // Use linker script magic to get data/cnts/name start/end. 
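The computeNumValueSiteCounts change above leans on DenseMap::operator[] value-initializing a missing entry, which collapses the find/insert branches into a single std::max update. The same shape with std::map and a stand-in struct (names here are hypothetical):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <map>
#include <string>

struct PerFunctionProfileData {
  uint32_t NumValueSites[2] = {0, 0}; // zero-initialized on first access
};

int main() {
  std::map<std::string, PerFunctionProfileData> ProfileDataMap;
  auto Record = [&](const std::string &Name, unsigned Kind, uint64_t Index) {
    auto &PD = ProfileDataMap[Name];  // creates a zeroed entry if missing
    PD.NumValueSites[Kind] = std::max(PD.NumValueSites[Kind], (uint32_t)(Index + 1));
  };
  Record("f", 0, 3);
  Record("f", 0, 1);                  // a lower index never shrinks the count
  assert(ProfileDataMap["f"].NumValueSites[0] == 4);
}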
if (TT.isOSLinux() || TT.isOSFreeBSD() || TT.isOSNetBSD() || - TT.isOSSolaris() || TT.isOSFuchsia() || TT.isPS4CPU() || - TT.isOSWindows()) + TT.isOSSolaris() || TT.isOSFuchsia() || TT.isPS4CPU() || TT.isOSWindows()) return false; return true; @@ -849,13 +846,9 @@ static bool needsRuntimeRegistrationOfSectionRange(const Triple &TT) { GlobalVariable * InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { GlobalVariable *NamePtr = Inc->getName(); - auto It = ProfileDataMap.find(NamePtr); - PerFunctionProfileData PD; - if (It != ProfileDataMap.end()) { - if (It->second.RegionCounters) - return It->second.RegionCounters; - PD = It->second; - } + auto &PD = ProfileDataMap[NamePtr]; + if (PD.RegionCounters) + return PD.RegionCounters; // Match the linkage and visibility of the name global. Function *Fn = Inc->getParent()->getParent(); @@ -922,6 +915,7 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { CounterPtr->setAlignment(Align(8)); MaybeSetComdat(CounterPtr); CounterPtr->setLinkage(Linkage); + PD.RegionCounters = CounterPtr; auto *Int8PtrTy = Type::getInt8PtrTy(Ctx); // Allocate statically the array of pointers to value profile nodes for @@ -1000,9 +994,7 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { MaybeSetComdat(Data); Data->setLinkage(Linkage); - PD.RegionCounters = CounterPtr; PD.DataVar = Data; - ProfileDataMap[NamePtr] = PD; // Mark the data variable as used so that it isn't stripped out. CompilerUsedVars.push_back(Data); @@ -1013,7 +1005,7 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { // Collect the referenced names to be used by emitNameData. ReferencedNames.push_back(NamePtr); - return CounterPtr; + return PD.RegionCounters; } void InstrProfiling::emitVNodes() { @@ -1078,8 +1070,8 @@ void InstrProfiling::emitNameData() { } auto &Ctx = M->getContext(); - auto *NamesVal = ConstantDataArray::getString( - Ctx, StringRef(CompressedNameStr), false); + auto *NamesVal = + ConstantDataArray::getString(Ctx, StringRef(CompressedNameStr), false); NamesVar = new GlobalVariable(*M, NamesVal->getType(), true, GlobalValue::PrivateLinkage, NamesVal, getInstrProfNamesVarName()); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index f98e39d751f4..180012198c42 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -110,7 +110,7 @@ namespace { /// the module. struct ThreadSanitizer { ThreadSanitizer() { - // Sanity check options and warn user. + // Check options and warn user. if (ClInstrumentReadBeforeWrite && ClCompoundReadBeforeWrite) { errs() << "warning: Option -tsan-compound-read-before-write has no effect " diff --git a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp index 74e4eb07b219..4921209f041b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp @@ -94,11 +94,9 @@ bool llvm::objcarc::CanUse(const Instruction *Inst, const Value *Ptr, return false; } else if (const auto *CS = dyn_cast<CallBase>(Inst)) { // For calls, just check the arguments (and not the callee operand). 
- for (auto OI = CS->arg_begin(), OE = CS->arg_end(); OI != OE; ++OI) { - const Value *Op = *OI; + for (const Value *Op : CS->args()) if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Ptr, Op)) return true; - } return false; } else if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) { // Special-case stores, because we don't care about the stored value, just diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index ca9567dc7ac8..a3fd97079b1d 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -52,6 +52,11 @@ using namespace llvm; #define DEBUG_TYPE "correlated-value-propagation" +static cl::opt<bool> CanonicalizeICmpPredicatesToUnsigned( + "canonicalize-icmp-predicates-to-unsigned", cl::init(true), cl::Hidden, + cl::desc("Enables canonicalization of signed relational predicates to " + "unsigned (e.g. sgt => ugt)")); + STATISTIC(NumPhis, "Number of phis propagated"); STATISTIC(NumPhiCommon, "Number of phis deleted via common incoming value"); STATISTIC(NumSelects, "Number of selects propagated"); @@ -64,7 +69,8 @@ STATISTIC(NumSDivSRemsNarrowed, STATISTIC(NumSDivs, "Number of sdiv converted to udiv"); STATISTIC(NumUDivURemsNarrowed, "Number of udivs/urems whose width was decreased"); -STATISTIC(NumAShrs, "Number of ashr converted to lshr"); +STATISTIC(NumAShrsConverted, "Number of ashr converted to lshr"); +STATISTIC(NumAShrsRemoved, "Number of ashr removed"); STATISTIC(NumSRems, "Number of srem converted to urem"); STATISTIC(NumSExt, "Number of sext converted to zext"); STATISTIC(NumSICmps, "Number of signed icmp preds simplified to unsigned"); @@ -297,6 +303,9 @@ static bool processMemAccess(Instruction *I, LazyValueInfo *LVI) { } static bool processICmp(ICmpInst *Cmp, LazyValueInfo *LVI) { + if (!CanonicalizeICmpPredicatesToUnsigned) + return false; + // Only for signed relational comparisons of scalar integers. if (Cmp->getType()->isVectorTy() || !Cmp->getOperand(0)->getType()->isIntegerTy()) @@ -376,13 +385,7 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI, // ConstantFoldTerminator() as the underlying SwitchInst can be changed. SwitchInstProfUpdateWrapper SI(*I); - APInt Low = - APInt::getSignedMaxValue(Cond->getType()->getScalarSizeInBits()); - APInt High = - APInt::getSignedMinValue(Cond->getType()->getScalarSizeInBits()); - - SwitchInst::CaseIt CI = SI->case_begin(); - for (auto CE = SI->case_end(); CI != CE;) { + for (auto CI = SI->case_begin(), CE = SI->case_end(); CI != CE;) { ConstantInt *Case = CI->getCaseValue(); LazyValueInfo::Tristate State = LVI->getPredicateAt(CmpInst::ICMP_EQ, Cond, Case, I, @@ -415,28 +418,9 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI, break; } - // Get Lower/Upper bound from switch cases. - Low = APIntOps::smin(Case->getValue(), Low); - High = APIntOps::smax(Case->getValue(), High); - // Increment the case iterator since we didn't delete it. ++CI; } - - // Try to simplify default case as unreachable - if (CI == SI->case_end() && SI->getNumCases() != 0 && - !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg())) { - const ConstantRange SIRange = - LVI->getConstantRange(SI->getCondition(), SI); - - // If the numbered switch cases cover the entire range of the condition, - // then the default case is not reachable. 
- if (SIRange.getSignedMin() == Low && SIRange.getSignedMax() == High && - SI->getNumCases() == High - Low + 1) { - createUnreachableSwitchDefault(SI, &DTU); - Changed = true; - } - } } if (Changed) @@ -688,7 +672,7 @@ static bool processCallSite(CallBase &CB, LazyValueInfo *LVI) { ArgNo++; } - assert(ArgNo == CB.arg_size() && "sanity check"); + assert(ArgNo == CB.arg_size() && "Call arguments not processed correctly."); if (ArgNos.empty()) return Changed; @@ -954,10 +938,22 @@ static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) { if (SDI->getType()->isVectorTy()) return false; + ConstantRange LRange = LVI->getConstantRange(SDI->getOperand(0), SDI); + unsigned OrigWidth = SDI->getType()->getIntegerBitWidth(); + ConstantRange NegOneOrZero = + ConstantRange(APInt(OrigWidth, (uint64_t)-1, true), APInt(OrigWidth, 1)); + if (NegOneOrZero.contains(LRange)) { + // ashr of -1 or 0 never changes the value, so drop the whole instruction + ++NumAShrsRemoved; + SDI->replaceAllUsesWith(SDI->getOperand(0)); + SDI->eraseFromParent(); + return true; + } + if (!isNonNegative(SDI->getOperand(0), LVI, SDI)) return false; - ++NumAShrs; + ++NumAShrsConverted; auto *BO = BinaryOperator::CreateLShr(SDI->getOperand(0), SDI->getOperand(1), SDI->getName(), SDI); BO->setDebugLoc(SDI->getDebugLoc()); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index a8ec8bb97970..e0d3a6accadd 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -159,52 +159,22 @@ static cl::opt<unsigned> MemorySSAPathCheckLimit( cl::desc("The maximum number of blocks to check when trying to prove that " "all paths to an exit go through a killing block (default = 50)")); +// This flags allows or disallows DSE to optimize MemorySSA during its +// traversal. Note that DSE optimizing MemorySSA may impact other passes +// downstream of the DSE invocation and can lead to issues not being +// reproducible in isolation (i.e. when MemorySSA is built from scratch). In +// those cases, the flag can be used to check if DSE's MemorySSA optimizations +// impact follow-up passes. +static cl::opt<bool> + OptimizeMemorySSA("dse-optimize-memoryssa", cl::init(true), cl::Hidden, + cl::desc("Allow DSE to optimize memory accesses.")); + //===----------------------------------------------------------------------===// // Helper functions //===----------------------------------------------------------------------===// using OverlapIntervalsTy = std::map<int64_t, int64_t>; using InstOverlapIntervalsTy = DenseMap<Instruction *, OverlapIntervalsTy>; -/// Does this instruction write some memory? This only returns true for things -/// that we can analyze with other helpers below. 
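The new early-out in processAShr rests on a one-line arithmetic fact: an arithmetic shift of -1 or 0 by any in-range amount returns the operand unchanged. Checked here for i32 (signed right shift is arithmetic on the usual targets and guaranteed to be since C++20):

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t V : {-1, 0})
    for (int Sh = 0; Sh < 32; ++Sh)
      assert((V >> Sh) == V); // ashr of -1 or 0 never changes the value
}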
-static bool hasAnalyzableMemoryWrite(Instruction *I, - const TargetLibraryInfo &TLI) { - if (isa<StoreInst>(I)) - return true; - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { - switch (II->getIntrinsicID()) { - default: - return false; - case Intrinsic::memset: - case Intrinsic::memmove: - case Intrinsic::memcpy: - case Intrinsic::memcpy_inline: - case Intrinsic::memcpy_element_unordered_atomic: - case Intrinsic::memmove_element_unordered_atomic: - case Intrinsic::memset_element_unordered_atomic: - case Intrinsic::init_trampoline: - case Intrinsic::lifetime_end: - case Intrinsic::masked_store: - return true; - } - } - if (auto *CB = dyn_cast<CallBase>(I)) { - LibFunc LF; - if (TLI.getLibFunc(*CB, LF) && TLI.has(LF)) { - switch (LF) { - case LibFunc_strcpy: - case LibFunc_strncpy: - case LibFunc_strcat: - case LibFunc_strncat: - return true; - default: - return false; - } - } - } - return false; -} - /// If the value of this instruction and the memory it writes to is unused, may /// we delete this instruction? static bool isRemovable(Instruction *I) { @@ -214,7 +184,7 @@ static bool isRemovable(Instruction *I) { if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { switch (II->getIntrinsicID()) { - default: llvm_unreachable("doesn't pass 'hasAnalyzableMemoryWrite' predicate"); + default: llvm_unreachable("Does not have LocForWrite"); case Intrinsic::lifetime_end: // Never remove dead lifetime_end's, e.g. because it is followed by a // free. @@ -296,6 +266,7 @@ enum OverwriteResult { OW_End, OW_PartialEarlierWithFullLater, OW_MaybePartial, + OW_None, OW_Unknown }; @@ -841,7 +812,7 @@ struct DSEState { /// Keep track of instructions (partly) overlapping with killing MemoryDefs per /// basic block. - DenseMap<BasicBlock *, InstOverlapIntervalsTy> IOLs; + MapVector<BasicBlock *, InstOverlapIntervalsTy> IOLs; // Class contains self-reference, make sure it's not copied/moved. DSEState(const DSEState &) = delete; @@ -889,6 +860,7 @@ struct DSEState { /// Return OW_MaybePartial if \p KillingI does not completely overwrite /// \p DeadI, but they both write to the same underlying object. In that /// case, use isPartialOverwrite to check if \p KillingI partially overwrites + /// \p DeadI. Returns 'OR_None' if \p KillingI is known to not overwrite the /// \p DeadI. Returns 'OW_Unknown' if nothing can be determined. OverwriteResult isOverwrite(const Instruction *KillingI, const Instruction *DeadI, @@ -951,8 +923,16 @@ struct DSEState { // If we can't resolve the same pointers to the same object, then we can't // analyze them at all. - if (DeadUndObj != KillingUndObj) + if (DeadUndObj != KillingUndObj) { + // Non aliasing stores to different objects don't overlap. Note that + // if the killing store is known to overwrite whole object (out of + // bounds access overwrites whole object as well) then it is assumed to + // completely overwrite any store to the same object even if they don't + // actually alias (see next check). + if (AAR == AliasResult::NoAlias) + return OW_None; return OW_Unknown; + } // If the KillingI store is to a recognizable object, get its size. uint64_t KillingUndObjSize = getPointerSize(KillingUndObj, DL, TLI, &F); @@ -1006,9 +986,8 @@ struct DSEState { return OW_MaybePartial; } - // Can reach here only if accesses are known not to overlap. There is no - // dedicated code to indicate no overlap so signal "unknown". - return OW_Unknown; + // Can reach here only if accesses are known not to overlap. 
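The OW_None result threaded through isOverwrite above separates "provably no overlap" from "unknown". A reduced interval model of the three-way classification (the real query also consults alias analysis and handles unknown sizes; classify is only an illustrative stand-in):

#include <cassert>
#include <cstdint>

enum OverwriteResult { OW_Complete, OW_MaybePartial, OW_None };

static OverwriteResult classify(int64_t KillOff, uint64_t KillSize,
                                int64_t DeadOff, uint64_t DeadSize) {
  int64_t KillEnd = KillOff + (int64_t)KillSize;
  int64_t DeadEnd = DeadOff + (int64_t)DeadSize;
  if (KillOff <= DeadOff && KillEnd >= DeadEnd)
    return OW_Complete;     // the killing write covers the dead write entirely
  if (KillEnd <= DeadOff || DeadEnd <= KillOff)
    return OW_None;         // disjoint ranges never overlap
  return OW_MaybePartial;   // overlapping but not covering
}

int main() {
  assert(classify(0, 8, 0, 4) == OW_Complete);
  assert(classify(8, 4, 0, 4) == OW_None);
  assert(classify(2, 4, 0, 4) == OW_MaybePartial);
}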
+ return OW_None; } bool isInvisibleToCallerAfterRet(const Value *V) { @@ -1304,6 +1283,15 @@ struct DSEState { Instruction *KillingI = KillingDef->getMemoryInst(); LLVM_DEBUG(dbgs() << " trying to get dominating access\n"); + // Only optimize defining access of KillingDef when directly starting at its + // defining access. The defining access also must only access KillingLoc. At + // the moment we only support instructions with a single write location, so + // it should be sufficient to disable optimizations for instructions that + // also read from memory. + bool CanOptimize = OptimizeMemorySSA && + KillingDef->getDefiningAccess() == StartAccess && + !KillingI->mayReadFromMemory(); + // Find the next clobbering Mod access for DefLoc, starting at StartAccess. Optional<MemoryLocation> CurrentLoc; for (;; Current = cast<MemoryDef>(Current)->getDefiningAccess()) { @@ -1345,8 +1333,10 @@ struct DSEState { Instruction *CurrentI = CurrentDef->getMemoryInst(); if (canSkipDef(CurrentDef, !isInvisibleToCallerBeforeRet(KillingUndObj), - TLI)) + TLI)) { + CanOptimize = false; continue; + } // Before we try to remove anything, check for any extra throwing // instructions that block us from DSEing @@ -1380,15 +1370,13 @@ struct DSEState { return None; } - // If Current cannot be analyzed or is not removable, check the next - // candidate. - if (!hasAnalyzableMemoryWrite(CurrentI, TLI) || !isRemovable(CurrentI)) - continue; - - // If Current does not have an analyzable write location, skip it + // If Current does not have an analyzable write location or is not + // removable, skip it. CurrentLoc = getLocForWriteEx(CurrentI); - if (!CurrentLoc) + if (!CurrentLoc || !isRemovable(CurrentI)) { + CanOptimize = false; continue; + } // AliasAnalysis does not account for loops. Limit elimination to // candidates for which we can guarantee they always store to the same @@ -1396,6 +1384,7 @@ struct DSEState { if (!isGuaranteedLoopIndependent(CurrentI, KillingI, *CurrentLoc)) { LLVM_DEBUG(dbgs() << " ... not guaranteed loop independent\n"); WalkerStepLimit -= 1; + CanOptimize = false; continue; } @@ -1403,16 +1392,32 @@ struct DSEState { // If the killing def is a memory terminator (e.g. lifetime.end), check // the next candidate if the current Current does not write the same // underlying object as the terminator. - if (!isMemTerminator(*CurrentLoc, CurrentI, KillingI)) + if (!isMemTerminator(*CurrentLoc, CurrentI, KillingI)) { + CanOptimize = false; continue; + } } else { int64_t KillingOffset = 0; int64_t DeadOffset = 0; auto OR = isOverwrite(KillingI, CurrentI, KillingLoc, *CurrentLoc, KillingOffset, DeadOffset); + if (CanOptimize) { + // CurrentDef is the earliest write clobber of KillingDef. Use it as + // optimized access. Do not optimize if CurrentDef is already the + // defining access of KillingDef. + if (CurrentDef != KillingDef->getDefiningAccess() && + (OR == OW_Complete || OR == OW_MaybePartial)) + KillingDef->setOptimized(CurrentDef); + + // Once a may-aliasing def is encountered do not set an optimized + // access. + if (OR != OW_None) + CanOptimize = false; + } + // If Current does not write to the same object as KillingDef, check // the next candidate. - if (OR == OW_Unknown) + if (OR == OW_Unknown || OR == OW_None) continue; else if (OR == OW_MaybePartial) { // If KillingDef only partially overwrites Current, check the next @@ -1421,6 +1426,7 @@ struct DSEState { // which are less likely to be removable in the end. if (PartialLimit <= 1) { WalkerStepLimit -= 1; + LLVM_DEBUG(dbgs() << " ... 
reached partial limit ... continue with next access\n"); continue; } PartialLimit -= 1; @@ -1922,7 +1928,14 @@ struct DSEState { if (SkipStores.contains(Def) || MSSA.isLiveOnEntryDef(Def) || !isRemovable(Def->getMemoryInst())) continue; - auto *UpperDef = dyn_cast<MemoryDef>(Def->getDefiningAccess()); + MemoryDef *UpperDef; + // To conserve compile-time, we avoid walking to the next clobbering def. + // Instead, we just try to get the optimized access, if it exists. DSE + // will try to optimize defs during the earlier traversal. + if (Def->isOptimized()) + UpperDef = dyn_cast<MemoryDef>(Def->getOptimized()); + else + UpperDef = dyn_cast<MemoryDef>(Def->getDefiningAccess()); if (!UpperDef || MSSA.isLiveOnEntryDef(UpperDef)) continue; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index ae2fe2767074..7001d330fce0 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -1951,7 +1951,6 @@ bool IndVarSimplify::run(Loop *L) { // using it. if (!DisableLFTR) { BasicBlock *PreHeader = L->getLoopPreheader(); - BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator()); SmallVector<BasicBlock*, 16> ExitingBlocks; L->getExitingBlocks(ExitingBlocks); @@ -1987,7 +1986,7 @@ bool IndVarSimplify::run(Loop *L) { // Avoid high cost expansions. Note: This heuristic is questionable in // that our definition of "high cost" is not exactly principled. if (Rewriter.isHighCostExpansion(ExitCount, L, SCEVCheapExpansionBudget, - TTI, PreHeaderBR)) + TTI, PreHeader->getTerminator())) continue; // Check preconditions for proper SCEVExpander operation. SCEV does not diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp index bf714d167670..6f97f3e93123 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp @@ -486,7 +486,7 @@ bool LoopInvariantCodeMotion::runOnLoop( // Check that neither this loop nor its parent have had LCSSA broken. LICM is // specifically moving instructions across the loop boundary and so it is - // especially in need of sanity checking here. + // especially in need of basic functional correctness checking here. assert(L->isLCSSAForm(*DT) && "Loop not left in LCSSA form after LICM!"); assert((L->isOutermost() || L->getParentLoop()->isLCSSAForm(*DT)) && "Parent loop not left in LCSSA form after LICM!"); @@ -1860,6 +1860,7 @@ class LoopPromoter : public LoadAndStorePromoter { bool UnorderedAtomic; AAMDNodes AATags; ICFLoopSafetyInfo &SafetyInfo; + bool CanInsertStoresInExitBlocks; // We're about to add a use of V in a loop exit block. 
Insert an LCSSA phi // (if legal) if doing so would add an out-of-loop use to an instruction @@ -1886,12 +1887,13 @@ public: SmallVectorImpl<MemoryAccess *> &MSSAIP, PredIteratorCache &PIC, MemorySSAUpdater *MSSAU, LoopInfo &li, DebugLoc dl, Align Alignment, bool UnorderedAtomic, const AAMDNodes &AATags, - ICFLoopSafetyInfo &SafetyInfo) + ICFLoopSafetyInfo &SafetyInfo, bool CanInsertStoresInExitBlocks) : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA), LoopExitBlocks(LEB), LoopInsertPts(LIP), MSSAInsertPts(MSSAIP), PredCache(PIC), MSSAU(MSSAU), LI(li), DL(std::move(dl)), Alignment(Alignment), UnorderedAtomic(UnorderedAtomic), AATags(AATags), - SafetyInfo(SafetyInfo) {} + SafetyInfo(SafetyInfo), + CanInsertStoresInExitBlocks(CanInsertStoresInExitBlocks) {} bool isInstInList(Instruction *I, const SmallVectorImpl<Instruction *> &) const override { @@ -1903,7 +1905,7 @@ public: return PointerMustAliases.count(Ptr); } - void doExtraRewritesBeforeFinalDeletion() override { + void insertStoresInLoopExitBlocks() { // Insert stores after in the loop exit blocks. Each exit block gets a // store of the live-out values that feed them. Since we've already told // the SSA updater about the defs in the loop and the preheader @@ -1937,10 +1939,21 @@ public: } } + void doExtraRewritesBeforeFinalDeletion() override { + if (CanInsertStoresInExitBlocks) + insertStoresInLoopExitBlocks(); + } + void instructionDeleted(Instruction *I) const override { SafetyInfo.removeInstruction(I); MSSAU->removeMemoryAccess(I); } + + bool shouldDelete(Instruction *I) const override { + if (isa<StoreInst>(I)) + return CanInsertStoresInExitBlocks; + return true; + } }; bool isNotCapturedBeforeOrInLoop(const Value *V, const Loop *L, @@ -2039,6 +2052,7 @@ bool llvm::promoteLoopAccessesToScalars( bool DereferenceableInPH = false; bool SafeToInsertStore = false; + bool FoundLoadToPromote = false; SmallVector<Instruction *, 64> LoopUses; @@ -2067,16 +2081,11 @@ bool llvm::promoteLoopAccessesToScalars( IsKnownThreadLocalObject = !isa<AllocaInst>(Object); } - // Check that all of the pointers in the alias set have the same type. We - // cannot (yet) promote a memory location that is loaded and stored in + // Check that all accesses to pointers in the aliass set use the same type. + // We cannot (yet) promote a memory location that is loaded and stored in // different sizes. While we are at it, collect alignment and AA info. + Type *AccessTy = nullptr; for (Value *ASIV : PointerMustAliases) { - // Check that all of the pointers in the alias set have the same type. We - // cannot (yet) promote a memory location that is loaded and stored in - // different sizes. - if (SomePtr->getType() != ASIV->getType()) - return false; - for (User *U : ASIV->users()) { // Ignore instructions that are outside the loop. Instruction *UI = dyn_cast<Instruction>(U); @@ -2091,6 +2100,7 @@ bool llvm::promoteLoopAccessesToScalars( SawUnorderedAtomic |= Load->isAtomic(); SawNotAtomic |= !Load->isAtomic(); + FoundLoadToPromote = true; Align InstAlignment = Load->getAlign(); @@ -2153,6 +2163,11 @@ bool llvm::promoteLoopAccessesToScalars( } else return false; // Not a load or store. + if (!AccessTy) + AccessTy = getLoadStoreType(UI); + else if (AccessTy != getLoadStoreType(UI)) + return false; + // Merge the AA tags. if (LoopUses.empty()) { // On the first load/store, just take its AA tags. @@ -2175,9 +2190,7 @@ bool llvm::promoteLoopAccessesToScalars( // If we're inserting an atomic load in the preheader, we must be able to // lower it. 
We're only guaranteed to be able to lower naturally aligned // atomics. - auto *SomePtrElemType = SomePtr->getType()->getPointerElementType(); - if (SawUnorderedAtomic && - Alignment < MDL.getTypeStoreSize(SomePtrElemType)) + if (SawUnorderedAtomic && Alignment < MDL.getTypeStoreSize(AccessTy)) return false; // If we couldn't prove we can hoist the load, bail. @@ -2199,13 +2212,20 @@ bool llvm::promoteLoopAccessesToScalars( } } - // If we've still failed to prove we can sink the store, give up. - if (!SafeToInsertStore) + // If we've still failed to prove we can sink the store, hoist the load + // only, if possible. + if (!SafeToInsertStore && !FoundLoadToPromote) + // If we cannot hoist the load either, give up. return false; - // Otherwise, this is safe to promote, lets do it! - LLVM_DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " << *SomePtr - << '\n'); + // Lets do the promotion! + if (SafeToInsertStore) + LLVM_DEBUG(dbgs() << "LICM: Promoting load/store of the value: " << *SomePtr + << '\n'); + else + LLVM_DEBUG(dbgs() << "LICM: Promoting load of the value: " << *SomePtr + << '\n'); + ORE->emit([&]() { return OptimizationRemark(DEBUG_TYPE, "PromoteLoopAccessesToScalar", LoopUses[0]) @@ -2224,13 +2244,14 @@ bool llvm::promoteLoopAccessesToScalars( SSAUpdater SSA(&NewPHIs); LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks, InsertPts, MSSAInsertPts, PIC, MSSAU, *LI, DL, - Alignment, SawUnorderedAtomic, AATags, *SafetyInfo); + Alignment, SawUnorderedAtomic, AATags, *SafetyInfo, + SafeToInsertStore); // Set up the preheader to have a definition of the value. It is the live-out // value from the preheader that uses in the loop will use. LoadInst *PreheaderLoad = new LoadInst( - SomePtr->getType()->getPointerElementType(), SomePtr, - SomePtr->getName() + ".promoted", Preheader->getTerminator()); + AccessTy, SomePtr, SomePtr->getName() + ".promoted", + Preheader->getTerminator()); if (SawUnorderedAtomic) PreheaderLoad->setOrdering(AtomicOrdering::Unordered); PreheaderLoad->setAlignment(Alignment); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPassManager.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPassManager.cpp index 3df4cfe8e4c1..6c783848432b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPassManager.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPassManager.cpp @@ -49,9 +49,17 @@ void PassManager<Loop, LoopAnalysisManager, LoopStandardAnalysisResults &, LPMUpdater &>::printPipeline(raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { - for (unsigned Idx = 0, Size = LoopPasses.size(); Idx != Size; ++Idx) { - auto *P = LoopPasses[Idx].get(); - P->printPipeline(OS, MapClassName2PassName); + assert(LoopPasses.size() + LoopNestPasses.size() == IsLoopNestPass.size()); + + unsigned IdxLP = 0, IdxLNP = 0; + for (unsigned Idx = 0, Size = IsLoopNestPass.size(); Idx != Size; ++Idx) { + if (IsLoopNestPass[Idx]) { + auto *P = LoopNestPasses[IdxLNP++].get(); + P->printPipeline(OS, MapClassName2PassName); + } else { + auto *P = LoopPasses[IdxLP++].get(); + P->printPipeline(OS, MapClassName2PassName); + } if (Idx + 1 < Size) OS << ","; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp index a87843d658a9..728d63fe2847 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp @@ 
-256,8 +256,8 @@ private: } } - // Sanity check: amount of dead and live loop blocks should match the total - // number of blocks in loop. + // Amount of dead and live loop blocks should match the total number of + // blocks in loop. assert(L.getNumBlocks() == LiveLoopBlocks.size() + DeadLoopBlocks.size() && "Malformed block sets?"); @@ -305,7 +305,6 @@ private: BlocksInLoopAfterFolding.insert(BB); } - // Sanity check: header must be in loop. assert(BlocksInLoopAfterFolding.count(L.getHeader()) && "Header not in loop?"); assert(BlocksInLoopAfterFolding.size() <= LiveLoopBlocks.size() && diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 67702520511b..39c8b65968aa 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -806,28 +806,27 @@ static Optional<unsigned> shouldFullUnroll( ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues, const unsigned FullUnrollTripCount, const UnrollCostEstimator UCE, const TargetTransformInfo::UnrollingPreferences &UP) { + assert(FullUnrollTripCount && "should be non-zero!"); - if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) { - // When computing the unrolled size, note that BEInsns are not replicated - // like the rest of the loop body. - if (UCE.getUnrolledLoopSize(UP) < UP.Threshold) { - return FullUnrollTripCount; + if (FullUnrollTripCount > UP.FullUnrollMaxCount) + return None; - } else { - // The loop isn't that small, but we still can fully unroll it if that - // helps to remove a significant number of instructions. - // To check that, run additional analysis on the loop. - if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost( - L, FullUnrollTripCount, DT, SE, EphValues, TTI, - UP.Threshold * UP.MaxPercentThresholdBoost / 100, - UP.MaxIterationsCountToAnalyze)) { - unsigned Boost = - getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost); - if (Cost->UnrolledCost < UP.Threshold * Boost / 100) { - return FullUnrollTripCount; - } - } - } + // When computing the unrolled size, note that BEInsns are not replicated + // like the rest of the loop body. + if (UCE.getUnrolledLoopSize(UP) < UP.Threshold) + return FullUnrollTripCount; + + // The loop isn't that small, but we still can fully unroll it if that + // helps to remove a significant number of instructions. + // To check that, run additional analysis on the loop. 
+ if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost( + L, FullUnrollTripCount, DT, SE, EphValues, TTI, + UP.Threshold * UP.MaxPercentThresholdBoost / 100, + UP.MaxIterationsCountToAnalyze)) { + unsigned Boost = + getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost); + if (Cost->UnrolledCost < UP.Threshold * Boost / 100) + return FullUnrollTripCount; } return None; } @@ -837,51 +836,48 @@ shouldPartialUnroll(const unsigned LoopSize, const unsigned TripCount, const UnrollCostEstimator UCE, const TargetTransformInfo::UnrollingPreferences &UP) { + if (!TripCount) + return None; + + if (!UP.Partial) { + LLVM_DEBUG(dbgs() << " will not try to unroll partially because " + << "-unroll-allow-partial not given\n"); + return 0; + } unsigned count = UP.Count; - if (TripCount) { - if (!UP.Partial) { - LLVM_DEBUG(dbgs() << " will not try to unroll partially because " - << "-unroll-allow-partial not given\n"); - count = 0; - return count; - } - if (count == 0) - count = TripCount; - if (UP.PartialThreshold != NoThreshold) { - // Reduce unroll count to be modulo of TripCount for partial unrolling. - if (UCE.getUnrolledLoopSize(UP, count) > UP.PartialThreshold) - count = (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) / - (LoopSize - UP.BEInsns); - if (count > UP.MaxCount) - count = UP.MaxCount; - while (count != 0 && TripCount % count != 0) - count--; - if (UP.AllowRemainder && count <= 1) { - // If there is no Count that is modulo of TripCount, set Count to - // largest power-of-two factor that satisfies the threshold limit. - // As we'll create fixup loop, do the type of unrolling only if - // remainder loop is allowed. - count = UP.DefaultUnrollRuntimeCount; - while (count != 0 && - UCE.getUnrolledLoopSize(UP, count) > UP.PartialThreshold) - count >>= 1; - } - if (count < 2) { - count = 0; - } - } else { - count = TripCount; - } + if (count == 0) + count = TripCount; + if (UP.PartialThreshold != NoThreshold) { + // Reduce unroll count to be modulo of TripCount for partial unrolling. + if (UCE.getUnrolledLoopSize(UP, count) > UP.PartialThreshold) + count = (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) / + (LoopSize - UP.BEInsns); if (count > UP.MaxCount) count = UP.MaxCount; - - LLVM_DEBUG(dbgs() << " partially unrolling with count: " << count << "\n"); - - return count; + while (count != 0 && TripCount % count != 0) + count--; + if (UP.AllowRemainder && count <= 1) { + // If there is no Count that is modulo of TripCount, set Count to + // largest power-of-two factor that satisfies the threshold limit. + // As we'll create fixup loop, do the type of unrolling only if + // remainder loop is allowed. + count = UP.DefaultUnrollRuntimeCount; + while (count != 0 && + UCE.getUnrolledLoopSize(UP, count) > UP.PartialThreshold) + count >>= 1; + } + if (count < 2) { + count = 0; + } + } else { + count = TripCount; } + if (count > UP.MaxCount) + count = UP.MaxCount; - // if didn't return until here, should continue to other priorties - return None; + LLVM_DEBUG(dbgs() << " partially unrolling with count: " << count << "\n"); + + return count; } // Returns true if unroll count was set explicitly. // Calculates unroll count and writes it to UP.Count. 
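A minimal standalone sketch of the count refinement that the reworked shouldPartialUnroll performs after its threshold check: prefer an unroll count that evenly divides the trip count, otherwise (when a remainder loop is allowed) fall back to the largest power-of-two count whose unrolled size still fits the partial threshold. The function and parameter names below are illustrative stand-ins, and the size callback replaces UnrollCostEstimator::getUnrolledLoopSize; this is not the patch's own code.

#include <cstdint>
#include <functional>

// Refine a tentative partial-unroll count. Returns 0 when partial unrolling
// is not worthwhile (count would be < 2).
unsigned refinePartialCount(unsigned Count, unsigned TripCount,
                            unsigned MaxCount, unsigned DefaultRuntimeCount,
                            uint64_t PartialThreshold, bool AllowRemainder,
                            const std::function<uint64_t(unsigned)> &UnrolledSize) {
  if (Count > MaxCount)
    Count = MaxCount;
  // Prefer a count that divides the trip count, so no remainder loop is
  // needed.
  while (Count != 0 && TripCount % Count != 0)
    --Count;
  if (AllowRemainder && Count <= 1) {
    // No divisor worked; take the largest power-of-two count that still
    // satisfies the threshold and accept the fixup (remainder) loop.
    Count = DefaultRuntimeCount;
    while (Count != 0 && UnrolledSize(Count) > PartialThreshold)
      Count >>= 1;
  }
  if (Count < 2)
    Count = 0;
  if (Count > MaxCount)
    Count = MaxCount;
  return Count;
}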
@@ -900,7 +896,6 @@ bool llvm::computeUnrollCount( TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound) { UnrollCostEstimator UCE(*L, LoopSize); - Optional<unsigned> UnrollFactor; const bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0; const bool PragmaFullUnroll = hasUnrollFullPragma(L); @@ -926,9 +921,8 @@ bool llvm::computeUnrollCount( // Check for explicit Count. // 1st priority is unroll count set by "unroll-count" option. // 2nd priority is unroll count set by pragma. - UnrollFactor = shouldPragmaUnroll(L, PInfo, TripMultiple, TripCount, UCE, UP); - - if (UnrollFactor) { + if (auto UnrollFactor = shouldPragmaUnroll(L, PInfo, TripMultiple, TripCount, + UCE, UP)) { UP.Count = *UnrollFactor; if (UserUnrollCount || (PragmaCount > 0)) { @@ -948,11 +942,20 @@ bool llvm::computeUnrollCount( } } - // 3rd priority is full unroll count. - // Full unroll makes sense only when TripCount or its upper bound could be - // statically calculated. - // Also we need to check if we exceed FullUnrollMaxCount. + // 3rd priority is exact full unrolling. This will eliminate all copies + // of some exit test. + UP.Count = 0; + if (TripCount) { + UP.Count = TripCount; + if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues, + TripCount, UCE, UP)) { + UP.Count = *UnrollFactor; + UseUpperBound = false; + return ExplicitUnroll; + } + } + // 4th priority is bounded unrolling. // We can unroll by the upper bound amount if it's generally allowed or if // we know that the loop is executed either the upper bound or zero times. // (MaxOrZero unrolling keeps only the first loop test, so the number of @@ -961,37 +964,21 @@ bool llvm::computeUnrollCount( // number of loop tests goes up which may end up being worse on targets with // constrained branch predictor resources so is controlled by an option.) // In addition we only unroll small upper bounds. - unsigned FullUnrollMaxTripCount = MaxTripCount; - if (!(UP.UpperBound || MaxOrZero) || - FullUnrollMaxTripCount > UnrollMaxUpperBound) - FullUnrollMaxTripCount = 0; - - // UnrollByMaxCount and ExactTripCount cannot both be non zero since we only - // compute the former when the latter is zero. - unsigned ExactTripCount = TripCount; - assert((ExactTripCount == 0 || FullUnrollMaxTripCount == 0) && - "ExtractTripCount and UnrollByMaxCount cannot both be non zero."); - - unsigned FullUnrollTripCount = - ExactTripCount ? ExactTripCount : FullUnrollMaxTripCount; - UP.Count = FullUnrollTripCount; - - UnrollFactor = - shouldFullUnroll(L, TTI, DT, SE, EphValues, FullUnrollTripCount, UCE, UP); - - // if shouldFullUnroll can do the unrolling, some side parameteres should be - // set - if (UnrollFactor) { - UP.Count = *UnrollFactor; - UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount); - TripCount = FullUnrollTripCount; - TripMultiple = UP.UpperBound ? 1 : TripMultiple; - return ExplicitUnroll; - } else { - UP.Count = FullUnrollTripCount; + // Note that the cost of bounded unrolling is always strictly greater than + // cost of exact full unrolling. As such, if we have an exact count and + // found it unprofitable, we'll never chose to bounded unroll. + if (!TripCount && MaxTripCount && (UP.UpperBound || MaxOrZero) && + MaxTripCount <= UnrollMaxUpperBound) { + UP.Count = MaxTripCount; + if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues, + MaxTripCount, UCE, UP)) { + UP.Count = *UnrollFactor; + UseUpperBound = true; + return ExplicitUnroll; + } } - // 4th priority is loop peeling. + // 5th priority is loop peeling. 
computePeelCount(L, LoopSize, PP, TripCount, DT, SE, UP.Threshold); if (PP.PeelCount) { UP.Runtime = false; @@ -1004,11 +991,9 @@ bool llvm::computeUnrollCount( if (TripCount) UP.Partial |= ExplicitUnroll; - // 5th priority is partial unrolling. + // 6th priority is partial unrolling. // Try partial unroll only when TripCount could be statically calculated. - UnrollFactor = shouldPartialUnroll(LoopSize, TripCount, UCE, UP); - - if (UnrollFactor) { + if (auto UnrollFactor = shouldPartialUnroll(LoopSize, TripCount, UCE, UP)) { UP.Count = *UnrollFactor; if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount && @@ -1049,7 +1034,7 @@ bool llvm::computeUnrollCount( "because loop has a runtime trip count."; }); - // 6th priority is runtime unrolling. + // 7th priority is runtime unrolling. // Don't unroll a runtime trip count loop when it is disabled. if (hasRuntimeUnrollDisablePragma(L)) { UP.Count = 0; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp index b0fb8daaba8f..c354fa177a60 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -494,7 +494,7 @@ static bool LinearizeExprTree(Instruction *I, SmallVector<Value *, 8> LeafOrder; // Ensure deterministic leaf output order. #ifndef NDEBUG - SmallPtrSet<Value *, 8> Visited; // For sanity checking the iteration scheme. + SmallPtrSet<Value *, 8> Visited; // For checking the iteration scheme. #endif while (!Worklist.empty()) { std::pair<Instruction*, APInt> P = Worklist.pop_back_val(); @@ -2313,11 +2313,8 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) { MadeChange |= LinearizeExprTree(I, Tree); SmallVector<ValueEntry, 8> Ops; Ops.reserve(Tree.size()); - for (unsigned i = 0, e = Tree.size(); i != e; ++i) { - RepeatedValue E = Tree[i]; - Ops.append(E.second.getZExtValue(), - ValueEntry(getRank(E.first), E.first)); - } + for (const RepeatedValue &E : Tree) + Ops.append(E.second.getZExtValue(), ValueEntry(getRank(E.first), E.first)); LLVM_DEBUG(dbgs() << "RAIn:\t"; PrintOps(I, Ops); dbgs() << '\n'); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 86d3620c312e..3799d2dd1cf2 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -227,8 +227,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, unsigned IterCnt = 0; (void)IterCnt; while (LocalChange) { - assert(IterCnt++ < 1000 && - "Sanity: iterative simplification didn't converge!"); + assert(IterCnt++ < 1000 && "Iterative simplification didn't converge!"); LocalChange = false; // Loop over all of the basic blocks and remove them if they are unneeded. diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp index 6469c899feea..d6d6b1a7fa09 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -235,22 +235,26 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU, // These dominator edges will be redirected from Pred. 
std::vector<DominatorTree::UpdateType> Updates; if (DTU) { - SmallPtrSet<BasicBlock *, 2> SuccsOfBB(succ_begin(BB), succ_end(BB)); + // To avoid processing the same predecessor more than once. + SmallPtrSet<BasicBlock *, 8> SeenSuccs; SmallPtrSet<BasicBlock *, 2> SuccsOfPredBB(succ_begin(PredBB), succ_end(PredBB)); - Updates.reserve(Updates.size() + 2 * SuccsOfBB.size() + 1); + Updates.reserve(Updates.size() + 2 * succ_size(BB) + 1); // Add insert edges first. Experimentally, for the particular case of two // blocks that can be merged, with a single successor and single predecessor // respectively, it is beneficial to have all insert updates first. Deleting // edges first may lead to unreachable blocks, followed by inserting edges // making the blocks reachable again. Such DT updates lead to high compile // times. We add inserts before deletes here to reduce compile time. - for (BasicBlock *SuccOfBB : SuccsOfBB) + for (BasicBlock *SuccOfBB : successors(BB)) // This successor of BB may already be a PredBB's successor. if (!SuccsOfPredBB.contains(SuccOfBB)) - Updates.push_back({DominatorTree::Insert, PredBB, SuccOfBB}); - for (BasicBlock *SuccOfBB : SuccsOfBB) - Updates.push_back({DominatorTree::Delete, BB, SuccOfBB}); + if (SeenSuccs.insert(SuccOfBB).second) + Updates.push_back({DominatorTree::Insert, PredBB, SuccOfBB}); + SeenSuccs.clear(); + for (BasicBlock *SuccOfBB : successors(BB)) + if (SeenSuccs.insert(SuccOfBB).second) + Updates.push_back({DominatorTree::Delete, BB, SuccOfBB}); Updates.push_back({DominatorTree::Delete, PredBB, BB}); } @@ -804,14 +808,14 @@ static BasicBlock *SplitBlockImpl(BasicBlock *Old, Instruction *SplitPt, if (DTU) { SmallVector<DominatorTree::UpdateType, 8> Updates; // Old dominates New. New node dominates all other nodes dominated by Old. - SmallPtrSet<BasicBlock *, 8> UniqueSuccessorsOfOld(succ_begin(New), - succ_end(New)); + SmallPtrSet<BasicBlock *, 8> UniqueSuccessorsOfOld; Updates.push_back({DominatorTree::Insert, Old, New}); - Updates.reserve(Updates.size() + 2 * UniqueSuccessorsOfOld.size()); - for (BasicBlock *UniqueSuccessorOfOld : UniqueSuccessorsOfOld) { - Updates.push_back({DominatorTree::Insert, New, UniqueSuccessorOfOld}); - Updates.push_back({DominatorTree::Delete, Old, UniqueSuccessorOfOld}); - } + Updates.reserve(Updates.size() + 2 * succ_size(New)); + for (BasicBlock *SuccessorOfOld : successors(New)) + if (UniqueSuccessorsOfOld.insert(SuccessorOfOld).second) { + Updates.push_back({DominatorTree::Insert, New, SuccessorOfOld}); + Updates.push_back({DominatorTree::Delete, Old, SuccessorOfOld}); + } DTU->applyUpdates(Updates); } else if (DT) @@ -870,14 +874,14 @@ BasicBlock *llvm::splitBlockBefore(BasicBlock *Old, Instruction *SplitPt, SmallVector<DominatorTree::UpdateType, 8> DTUpdates; // New dominates Old. The predecessor nodes of the Old node dominate // New node. 
- SmallPtrSet<BasicBlock *, 8> UniquePredecessorsOfOld(pred_begin(New), - pred_end(New)); + SmallPtrSet<BasicBlock *, 8> UniquePredecessorsOfOld; DTUpdates.push_back({DominatorTree::Insert, New, Old}); - DTUpdates.reserve(DTUpdates.size() + 2 * UniquePredecessorsOfOld.size()); - for (BasicBlock *UniquePredecessorOfOld : UniquePredecessorsOfOld) { - DTUpdates.push_back({DominatorTree::Insert, UniquePredecessorOfOld, New}); - DTUpdates.push_back({DominatorTree::Delete, UniquePredecessorOfOld, Old}); - } + DTUpdates.reserve(DTUpdates.size() + 2 * pred_size(New)); + for (BasicBlock *PredecessorOfOld : predecessors(New)) + if (UniquePredecessorsOfOld.insert(PredecessorOfOld).second) { + DTUpdates.push_back({DominatorTree::Insert, PredecessorOfOld, New}); + DTUpdates.push_back({DominatorTree::Delete, PredecessorOfOld, Old}); + } DTU->applyUpdates(DTUpdates); @@ -910,13 +914,14 @@ static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB, } else { // Split block expects NewBB to have a non-empty set of predecessors. SmallVector<DominatorTree::UpdateType, 8> Updates; - SmallPtrSet<BasicBlock *, 8> UniquePreds(Preds.begin(), Preds.end()); + SmallPtrSet<BasicBlock *, 8> UniquePreds; Updates.push_back({DominatorTree::Insert, NewBB, OldBB}); - Updates.reserve(Updates.size() + 2 * UniquePreds.size()); - for (auto *UniquePred : UniquePreds) { - Updates.push_back({DominatorTree::Insert, UniquePred, NewBB}); - Updates.push_back({DominatorTree::Delete, UniquePred, OldBB}); - } + Updates.reserve(Updates.size() + 2 * Preds.size()); + for (auto *Pred : Preds) + if (UniquePreds.insert(Pred).second) { + Updates.push_back({DominatorTree::Insert, Pred, NewBB}); + Updates.push_back({DominatorTree::Delete, Pred, OldBB}); + } DTU->applyUpdates(Updates); } } else if (DT) { @@ -1376,14 +1381,14 @@ SplitBlockAndInsertIfThenImpl(Value *Cond, Instruction *SplitBefore, BasicBlock *Head = SplitBefore->getParent(); BasicBlock *Tail = Head->splitBasicBlock(SplitBefore->getIterator()); if (DTU) { - SmallPtrSet<BasicBlock *, 8> UniqueSuccessorsOfHead(succ_begin(Tail), - succ_end(Tail)); + SmallPtrSet<BasicBlock *, 8> UniqueSuccessorsOfHead; Updates.push_back({DominatorTree::Insert, Head, Tail}); - Updates.reserve(Updates.size() + 2 * UniqueSuccessorsOfHead.size()); - for (BasicBlock *UniqueSuccessorOfHead : UniqueSuccessorsOfHead) { - Updates.push_back({DominatorTree::Insert, Tail, UniqueSuccessorOfHead}); - Updates.push_back({DominatorTree::Delete, Head, UniqueSuccessorOfHead}); - } + Updates.reserve(Updates.size() + 2 * succ_size(Tail)); + for (BasicBlock *SuccessorOfHead : successors(Tail)) + if (UniqueSuccessorsOfHead.insert(SuccessorOfHead).second) { + Updates.push_back({DominatorTree::Insert, Tail, SuccessorOfHead}); + Updates.push_back({DominatorTree::Delete, Head, SuccessorOfHead}); + } } Instruction *HeadOldTerm = Head->getTerminator(); LLVMContext &C = Head->getContext(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/BuildLibCalls.cpp index 957935398972..580cfd80141e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/BuildLibCalls.cpp @@ -452,18 +452,17 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { return Changed; case LibFunc_mempcpy: case LibFunc_memccpy: + Changed |= setWillReturn(F); + LLVM_FALLTHROUGH; + case LibFunc_memcpy_chk: Changed |= setDoesNotThrow(F); Changed |= setOnlyAccessesArgMemory(F); - 
Changed |= setWillReturn(F); Changed |= setDoesNotAlias(F, 0); Changed |= setOnlyWritesMemory(F, 0); Changed |= setDoesNotAlias(F, 1); Changed |= setDoesNotCapture(F, 1); Changed |= setOnlyReadsMemory(F, 1); return Changed; - case LibFunc_memcpy_chk: - Changed |= setDoesNotThrow(F); - return Changed; case LibFunc_memalign: Changed |= setOnlyAccessesInaccessibleMemory(F); Changed |= setRetNoUndef(F); @@ -1018,9 +1017,8 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setDoesNotCapture(F, 0); Changed |= setDoesNotCapture(F, 1); return Changed; - // TODO: add LibFunc entries for: - // case LibFunc_memset_pattern4: - // case LibFunc_memset_pattern8: + case LibFunc_memset_pattern4: + case LibFunc_memset_pattern8: case LibFunc_memset_pattern16: Changed |= setOnlyAccessesArgMemory(F); Changed |= setDoesNotCapture(F, 0); @@ -1029,10 +1027,12 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setOnlyReadsMemory(F, 1); return Changed; case LibFunc_memset: - Changed |= setOnlyAccessesArgMemory(F); Changed |= setWillReturn(F); - Changed |= setDoesNotThrow(F); + LLVM_FALLTHROUGH; + case LibFunc_memset_chk: + Changed |= setOnlyAccessesArgMemory(F); Changed |= setOnlyWritesMemory(F, 0); + Changed |= setDoesNotThrow(F); return Changed; // int __nvvm_reflect(const char *) case LibFunc_nvvm_reflect: diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneModule.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneModule.cpp index 200deca4b317..57c273a0e3c5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneModule.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneModule.cpp @@ -135,10 +135,18 @@ std::unique_ptr<Module> llvm::CloneModule( // Similarly, copy over function bodies now... // for (const Function &I : M) { - if (I.isDeclaration()) + Function *F = cast<Function>(VMap[&I]); + + if (I.isDeclaration()) { + // Copy over metadata for declarations since we're not doing it below in + // CloneFunctionInto(). + SmallVector<std::pair<unsigned, MDNode *>, 1> MDs; + I.getAllMetadata(MDs); + for (auto MD : MDs) + F->addMetadata(MD.first, *MapMetadata(MD.second, VMap)); continue; + } - Function *F = cast<Function>(VMap[&I]); if (!ShouldCloneDefinition(&I)) { // Skip after setting the correct linkage for an external reference. 
F->setLinkage(GlobalValue::ExternalLinkage); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/GuardUtils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/GuardUtils.cpp index 4dbcbf80d3da..7c310f16d46e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/GuardUtils.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/GuardUtils.cpp @@ -74,7 +74,7 @@ void llvm::makeGuardControlFlowExplicit(Function *DeoptIntrinsic, {}, {}, nullptr, "widenable_cond"); CheckBI->setCondition(B.CreateAnd(CheckBI->getCondition(), WC, "exiplicit_guard_cond")); - assert(isWidenableBranch(CheckBI) && "sanity check"); + assert(isWidenableBranch(CheckBI) && "Branch must be widenable."); } } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp index f4776589910f..997667810580 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -1218,10 +1218,9 @@ static void AddReturnAttributes(CallBase &CB, ValueToValueMapTy &VMap) { if (!RI || !isa<CallBase>(RI->getOperand(0))) continue; auto *RetVal = cast<CallBase>(RI->getOperand(0)); - // Sanity check that the cloned RetVal exists and is a call, otherwise we - // cannot add the attributes on the cloned RetVal. - // Simplification during inlining could have transformed the cloned - // instruction. + // Check that the cloned RetVal exists and is a call, otherwise we cannot + // add the attributes on the cloned RetVal. Simplification during inlining + // could have transformed the cloned instruction. auto *NewRetVal = dyn_cast_or_null<CallBase>(VMap.lookup(RetVal)); if (!NewRetVal) continue; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp index 74ab37fadf36..ec926b1f5a94 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp @@ -529,8 +529,8 @@ bool llvm::RecursivelyDeleteTriviallyDeadInstructionsPermissive( std::function<void(Value *)> AboutToDeleteCallback) { unsigned S = 0, E = DeadInsts.size(), Alive = 0; for (; S != E; ++S) { - auto *I = cast<Instruction>(DeadInsts[S]); - if (!isInstructionTriviallyDead(I)) { + auto *I = dyn_cast<Instruction>(DeadInsts[S]); + if (!I || !isInstructionTriviallyDead(I)) { DeadInsts[S] = nullptr; ++Alive; } @@ -760,15 +760,18 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, SmallVector<DominatorTree::UpdateType, 32> Updates; if (DTU) { - SmallPtrSet<BasicBlock *, 2> PredsOfPredBB(pred_begin(PredBB), - pred_end(PredBB)); - Updates.reserve(Updates.size() + 2 * PredsOfPredBB.size() + 1); - for (BasicBlock *PredOfPredBB : PredsOfPredBB) + // To avoid processing the same predecessor more than once. + SmallPtrSet<BasicBlock *, 2> SeenPreds; + Updates.reserve(Updates.size() + 2 * pred_size(PredBB) + 1); + for (BasicBlock *PredOfPredBB : predecessors(PredBB)) // This predecessor of PredBB may already have DestBB as a successor. 
if (PredOfPredBB != PredBB) - Updates.push_back({DominatorTree::Insert, PredOfPredBB, DestBB}); - for (BasicBlock *PredOfPredBB : PredsOfPredBB) - Updates.push_back({DominatorTree::Delete, PredOfPredBB, PredBB}); + if (SeenPreds.insert(PredOfPredBB).second) + Updates.push_back({DominatorTree::Insert, PredOfPredBB, DestBB}); + SeenPreds.clear(); + for (BasicBlock *PredOfPredBB : predecessors(PredBB)) + if (SeenPreds.insert(PredOfPredBB).second) + Updates.push_back({DominatorTree::Delete, PredOfPredBB, PredBB}); Updates.push_back({DominatorTree::Delete, PredBB, DestBB}); } @@ -1096,16 +1099,20 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB, SmallVector<DominatorTree::UpdateType, 32> Updates; if (DTU) { + // To avoid processing the same predecessor more than once. + SmallPtrSet<BasicBlock *, 8> SeenPreds; // All predecessors of BB will be moved to Succ. - SmallPtrSet<BasicBlock *, 8> PredsOfBB(pred_begin(BB), pred_end(BB)); SmallPtrSet<BasicBlock *, 8> PredsOfSucc(pred_begin(Succ), pred_end(Succ)); - Updates.reserve(Updates.size() + 2 * PredsOfBB.size() + 1); - for (auto *PredOfBB : PredsOfBB) + Updates.reserve(Updates.size() + 2 * pred_size(BB) + 1); + for (auto *PredOfBB : predecessors(BB)) // This predecessor of BB may already have Succ as a successor. if (!PredsOfSucc.contains(PredOfBB)) - Updates.push_back({DominatorTree::Insert, PredOfBB, Succ}); - for (auto *PredOfBB : PredsOfBB) - Updates.push_back({DominatorTree::Delete, PredOfBB, BB}); + if (SeenPreds.insert(PredOfBB).second) + Updates.push_back({DominatorTree::Insert, PredOfBB, Succ}); + SeenPreds.clear(); + for (auto *PredOfBB : predecessors(BB)) + if (SeenPreds.insert(PredOfBB).second) + Updates.push_back({DominatorTree::Delete, PredOfBB, BB}); Updates.push_back({DominatorTree::Delete, BB, Succ}); } @@ -2190,26 +2197,6 @@ void llvm::changeToCall(InvokeInst *II, DomTreeUpdater *DTU) { DTU->applyUpdates({{DominatorTree::Delete, BB, UnwindDestBB}}); } -void llvm::createUnreachableSwitchDefault(SwitchInst *Switch, - DomTreeUpdater *DTU) { - LLVM_DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n"); - auto *BB = Switch->getParent(); - auto *OrigDefaultBlock = Switch->getDefaultDest(); - OrigDefaultBlock->removePredecessor(BB); - BasicBlock *NewDefaultBlock = BasicBlock::Create( - BB->getContext(), BB->getName() + ".unreachabledefault", BB->getParent(), - OrigDefaultBlock); - new UnreachableInst(Switch->getContext(), NewDefaultBlock); - Switch->setDefaultDest(&*NewDefaultBlock); - if (DTU) { - SmallVector<DominatorTree::UpdateType, 2> Updates; - Updates.push_back({DominatorTree::Insert, BB, &*NewDefaultBlock}); - if (!is_contained(successors(BB), OrigDefaultBlock)) - Updates.push_back({DominatorTree::Delete, BB, &*OrigDefaultBlock}); - DTU->applyUpdates(Updates); - } -} - BasicBlock *llvm::changeToInvokeAndSplitBasicBlock(CallInst *CI, BasicBlock *UnwindEdge, DomTreeUpdater *DTU) { diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index a92cb6a313d3..bb719a499a4c 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -623,15 +623,13 @@ bool llvm::UnrollRuntimeLoopRemainder( if (!SE) return false; - // Only unroll loops with a computable trip count, and the trip count needs - // to be an int value (allowing a pointer type is a TODO item). + // Only unroll loops with a computable trip count. 
// We calculate the backedge count by using getExitCount on the Latch block, // which is proven to be the only exiting block in this loop. This is same as // calculating getBackedgeTakenCount on the loop (which computes SCEV for all // exiting blocks). const SCEV *BECountSC = SE->getExitCount(L, Latch); - if (isa<SCEVCouldNotCompute>(BECountSC) || - !BECountSC->getType()->isIntegerTy()) { + if (isa<SCEVCouldNotCompute>(BECountSC)) { LLVM_DEBUG(dbgs() << "Could not compute exit block SCEV\n"); return false; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUtils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUtils.cpp index 68572d479742..c8e42acdffb3 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1049,6 +1049,7 @@ Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder, return Builder.CreateOrReduce(Src); case RecurKind::Xor: return Builder.CreateXorReduce(Src); + case RecurKind::FMulAdd: case RecurKind::FAdd: return Builder.CreateFAddReduce(ConstantFP::getNegativeZero(SrcVecEltTy), Src); @@ -1091,7 +1092,8 @@ Value *llvm::createTargetReduction(IRBuilderBase &B, Value *llvm::createOrderedReduction(IRBuilderBase &B, const RecurrenceDescriptor &Desc, Value *Src, Value *Start) { - assert(Desc.getRecurrenceKind() == RecurKind::FAdd && + assert((Desc.getRecurrenceKind() == RecurKind::FAdd || + Desc.getRecurrenceKind() == RecurKind::FMulAdd) && "Unexpected reduction kind"); assert(Src->getType()->isVectorTy() && "Expected a vector type"); assert(!Start->getType()->isVectorTy() && "Expected a scalar type"); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SSAUpdater.cpp index 5893ce15b129..7d9992176658 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/SSAUpdater.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SSAUpdater.cpp @@ -446,6 +446,9 @@ void LoadAndStorePromoter::run(const SmallVectorImpl<Instruction *> &Insts) { // Now that everything is rewritten, delete the old instructions from the // function. They should all be dead now. for (Instruction *User : Insts) { + if (!shouldDelete(User)) + continue; + // If this is a load that still has uses, then the load must have been added // as a live value in the SSAUpdate data structure for a block (e.g. because // the loaded value was stored later). In this case, we need to recursively diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SampleProfileInference.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SampleProfileInference.cpp new file mode 100644 index 000000000000..9495e442e0bf --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SampleProfileInference.cpp @@ -0,0 +1,462 @@ +//===- SampleProfileInference.cpp - Adjust sample profiles in the IR ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a profile inference algorithm. Given an incomplete and +// possibly imprecise block counts, the algorithm reconstructs realistic block +// and edge counts that satisfy flow conservation rules, while minimally modify +// input block counts. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/SampleProfileInference.h" +#include "llvm/Support/Debug.h" +#include <queue> +#include <set> + +using namespace llvm; +#define DEBUG_TYPE "sample-profile-inference" + +namespace { + +/// A value indicating an infinite flow/capacity/weight of a block/edge. +/// Not using numeric_limits<int64_t>::max(), as the values can be summed up +/// during the execution. +static constexpr int64_t INF = ((int64_t)1) << 50; + +/// The minimum-cost maximum flow algorithm. +/// +/// The algorithm finds the maximum flow of minimum cost on a given (directed) +/// network using a modified version of the classical Moore-Bellman-Ford +/// approach. The algorithm applies a number of augmentation iterations in which +/// flow is sent along paths of positive capacity from the source to the sink. +/// The worst-case time complexity of the implementation is O(v(f)*m*n), where +/// where m is the number of edges, n is the number of vertices, and v(f) is the +/// value of the maximum flow. However, the observed running time on typical +/// instances is sub-quadratic, that is, o(n^2). +/// +/// The input is a set of edges with specified costs and capacities, and a pair +/// of nodes (source and sink). The output is the flow along each edge of the +/// minimum total cost respecting the given edge capacities. +class MinCostMaxFlow { +public: + // Initialize algorithm's data structures for a network of a given size. + void initialize(uint64_t NodeCount, uint64_t SourceNode, uint64_t SinkNode) { + Source = SourceNode; + Target = SinkNode; + + Nodes = std::vector<Node>(NodeCount); + Edges = std::vector<std::vector<Edge>>(NodeCount, std::vector<Edge>()); + } + + // Run the algorithm. + int64_t run() { + // Find an augmenting path and update the flow along the path + size_t AugmentationIters = 0; + while (findAugmentingPath()) { + augmentFlowAlongPath(); + AugmentationIters++; + } + + // Compute the total flow and its cost + int64_t TotalCost = 0; + int64_t TotalFlow = 0; + for (uint64_t Src = 0; Src < Nodes.size(); Src++) { + for (auto &Edge : Edges[Src]) { + if (Edge.Flow > 0) { + TotalCost += Edge.Cost * Edge.Flow; + if (Src == Source) + TotalFlow += Edge.Flow; + } + } + } + LLVM_DEBUG(dbgs() << "Completed profi after " << AugmentationIters + << " iterations with " << TotalFlow << " total flow" + << " of " << TotalCost << " cost\n"); + (void)TotalFlow; + return TotalCost; + } + + /// Adding an edge to the network with a specified capacity and a cost. + /// Multiple edges between a pair of nodes are allowed but self-edges + /// are not supported. + void addEdge(uint64_t Src, uint64_t Dst, int64_t Capacity, int64_t Cost) { + assert(Capacity > 0 && "adding an edge of zero capacity"); + assert(Src != Dst && "loop edge are not supported"); + + Edge SrcEdge; + SrcEdge.Dst = Dst; + SrcEdge.Cost = Cost; + SrcEdge.Capacity = Capacity; + SrcEdge.Flow = 0; + SrcEdge.RevEdgeIndex = Edges[Dst].size(); + + Edge DstEdge; + DstEdge.Dst = Src; + DstEdge.Cost = -Cost; + DstEdge.Capacity = 0; + DstEdge.Flow = 0; + DstEdge.RevEdgeIndex = Edges[Src].size(); + + Edges[Src].push_back(SrcEdge); + Edges[Dst].push_back(DstEdge); + } + + /// Adding an edge to the network of infinite capacity and a given cost. + void addEdge(uint64_t Src, uint64_t Dst, int64_t Cost) { + addEdge(Src, Dst, INF, Cost); + } + + /// Get the total flow from a given source node. + /// Returns a list of pairs (target node, amount of flow to the target). 
+ const std::vector<std::pair<uint64_t, int64_t>> getFlow(uint64_t Src) const { + std::vector<std::pair<uint64_t, int64_t>> Flow; + for (auto &Edge : Edges[Src]) { + if (Edge.Flow > 0) + Flow.push_back(std::make_pair(Edge.Dst, Edge.Flow)); + } + return Flow; + } + + /// Get the total flow between a pair of nodes. + int64_t getFlow(uint64_t Src, uint64_t Dst) const { + int64_t Flow = 0; + for (auto &Edge : Edges[Src]) { + if (Edge.Dst == Dst) { + Flow += Edge.Flow; + } + } + return Flow; + } + + /// A cost of increasing a block's count by one. + static constexpr int64_t AuxCostInc = 10; + /// A cost of decreasing a block's count by one. + static constexpr int64_t AuxCostDec = 20; + /// A cost of increasing a count of zero-weight block by one. + static constexpr int64_t AuxCostIncZero = 11; + /// A cost of increasing the entry block's count by one. + static constexpr int64_t AuxCostIncEntry = 40; + /// A cost of decreasing the entry block's count by one. + static constexpr int64_t AuxCostDecEntry = 10; + /// A cost of taking an unlikely jump. + static constexpr int64_t AuxCostUnlikely = ((int64_t)1) << 20; + +private: + /// Check for existence of an augmenting path with a positive capacity. + bool findAugmentingPath() { + // Initialize data structures + for (auto &Node : Nodes) { + Node.Distance = INF; + Node.ParentNode = uint64_t(-1); + Node.ParentEdgeIndex = uint64_t(-1); + Node.Taken = false; + } + + std::queue<uint64_t> Queue; + Queue.push(Source); + Nodes[Source].Distance = 0; + Nodes[Source].Taken = true; + while (!Queue.empty()) { + uint64_t Src = Queue.front(); + Queue.pop(); + Nodes[Src].Taken = false; + // Although the residual network contains edges with negative costs + // (in particular, backward edges), it can be shown that there are no + // negative-weight cycles and the following two invariants are maintained: + // (i) Dist[Source, V] >= 0 and (ii) Dist[V, Target] >= 0 for all nodes V, + // where Dist is the length of the shortest path between two nodes. This + // allows to prune the search-space of the path-finding algorithm using + // the following early-stop criteria: + // -- If we find a path with zero-distance from Source to Target, stop the + // search, as the path is the shortest since Dist[Source, Target] >= 0; + // -- If we have Dist[Source, V] > Dist[Source, Target], then do not + // process node V, as it is guaranteed _not_ to be on a shortest path + // from Source to Target; it follows from inequalities + // Dist[Source, Target] >= Dist[Source, V] + Dist[V, Target] + // >= Dist[Source, V] + if (Nodes[Target].Distance == 0) + break; + if (Nodes[Src].Distance > Nodes[Target].Distance) + continue; + + // Process adjacent edges + for (uint64_t EdgeIdx = 0; EdgeIdx < Edges[Src].size(); EdgeIdx++) { + auto &Edge = Edges[Src][EdgeIdx]; + if (Edge.Flow < Edge.Capacity) { + uint64_t Dst = Edge.Dst; + int64_t NewDistance = Nodes[Src].Distance + Edge.Cost; + if (Nodes[Dst].Distance > NewDistance) { + // Update the distance and the parent node/edge + Nodes[Dst].Distance = NewDistance; + Nodes[Dst].ParentNode = Src; + Nodes[Dst].ParentEdgeIndex = EdgeIdx; + // Add the node to the queue, if it is not there yet + if (!Nodes[Dst].Taken) { + Queue.push(Dst); + Nodes[Dst].Taken = true; + } + } + } + } + } + + return Nodes[Target].Distance != INF; + } + + /// Update the current flow along the augmenting path. 
+ void augmentFlowAlongPath() { + // Find path capacity + int64_t PathCapacity = INF; + uint64_t Now = Target; + while (Now != Source) { + uint64_t Pred = Nodes[Now].ParentNode; + auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex]; + PathCapacity = std::min(PathCapacity, Edge.Capacity - Edge.Flow); + Now = Pred; + } + + assert(PathCapacity > 0 && "found incorrect augmenting path"); + + // Update the flow along the path + Now = Target; + while (Now != Source) { + uint64_t Pred = Nodes[Now].ParentNode; + auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex]; + auto &RevEdge = Edges[Now][Edge.RevEdgeIndex]; + + Edge.Flow += PathCapacity; + RevEdge.Flow -= PathCapacity; + + Now = Pred; + } + } + + /// An node in a flow network. + struct Node { + /// The cost of the cheapest path from the source to the current node. + int64_t Distance; + /// The node preceding the current one in the path. + uint64_t ParentNode; + /// The index of the edge between ParentNode and the current node. + uint64_t ParentEdgeIndex; + /// An indicator of whether the current node is in a queue. + bool Taken; + }; + /// An edge in a flow network. + struct Edge { + /// The cost of the edge. + int64_t Cost; + /// The capacity of the edge. + int64_t Capacity; + /// The current flow on the edge. + int64_t Flow; + /// The destination node of the edge. + uint64_t Dst; + /// The index of the reverse edge between Dst and the current node. + uint64_t RevEdgeIndex; + }; + + /// The set of network nodes. + std::vector<Node> Nodes; + /// The set of network edges. + std::vector<std::vector<Edge>> Edges; + /// Source node of the flow. + uint64_t Source; + /// Target (sink) node of the flow. + uint64_t Target; +}; + +/// Initializing flow network for a given function. +/// +/// Every block is split into three nodes that are responsible for (i) an +/// incoming flow, (ii) an outgoing flow, and (iii) penalizing an increase or +/// reduction of the block weight. +void initializeNetwork(MinCostMaxFlow &Network, FlowFunction &Func) { + uint64_t NumBlocks = Func.Blocks.size(); + assert(NumBlocks > 1 && "Too few blocks in a function"); + LLVM_DEBUG(dbgs() << "Initializing profi for " << NumBlocks << " blocks\n"); + + // Pre-process data: make sure the entry weight is at least 1 + if (Func.Blocks[Func.Entry].Weight == 0) { + Func.Blocks[Func.Entry].Weight = 1; + } + // Introducing dummy source/sink pairs to allow flow circulation. + // The nodes corresponding to blocks of Func have indicies in the range + // [0..3 * NumBlocks); the dummy nodes are indexed by the next four values. 
+ uint64_t S = 3 * NumBlocks; + uint64_t T = S + 1; + uint64_t S1 = S + 2; + uint64_t T1 = S + 3; + + Network.initialize(3 * NumBlocks + 4, S1, T1); + + // Create three nodes for every block of the function + for (uint64_t B = 0; B < NumBlocks; B++) { + auto &Block = Func.Blocks[B]; + assert((!Block.UnknownWeight || Block.Weight == 0 || Block.isEntry()) && + "non-zero weight of a block w/o weight except for an entry"); + + // Split every block into two nodes + uint64_t Bin = 3 * B; + uint64_t Bout = 3 * B + 1; + uint64_t Baux = 3 * B + 2; + if (Block.Weight > 0) { + Network.addEdge(S1, Bout, Block.Weight, 0); + Network.addEdge(Bin, T1, Block.Weight, 0); + } + + // Edges from S and to T + assert((!Block.isEntry() || !Block.isExit()) && + "a block cannot be an entry and an exit"); + if (Block.isEntry()) { + Network.addEdge(S, Bin, 0); + } else if (Block.isExit()) { + Network.addEdge(Bout, T, 0); + } + + // An auxiliary node to allow increase/reduction of block counts: + // We assume that decreasing block counts is more expensive than increasing, + // and thus, setting separate costs here. In the future we may want to tune + // the relative costs so as to maximize the quality of generated profiles. + int64_t AuxCostInc = MinCostMaxFlow::AuxCostInc; + int64_t AuxCostDec = MinCostMaxFlow::AuxCostDec; + if (Block.UnknownWeight) { + // Do not penalize changing weights of blocks w/o known profile count + AuxCostInc = 0; + AuxCostDec = 0; + } else { + // Increasing the count for "cold" blocks with zero initial count is more + // expensive than for "hot" ones + if (Block.Weight == 0) { + AuxCostInc = MinCostMaxFlow::AuxCostIncZero; + } + // Modifying the count of the entry block is expensive + if (Block.isEntry()) { + AuxCostInc = MinCostMaxFlow::AuxCostIncEntry; + AuxCostDec = MinCostMaxFlow::AuxCostDecEntry; + } + } + // For blocks with self-edges, do not penalize a reduction of the count, + // as all of the increase can be attributed to the self-edge + if (Block.HasSelfEdge) { + AuxCostDec = 0; + } + + Network.addEdge(Bin, Baux, AuxCostInc); + Network.addEdge(Baux, Bout, AuxCostInc); + if (Block.Weight > 0) { + Network.addEdge(Bout, Baux, AuxCostDec); + Network.addEdge(Baux, Bin, AuxCostDec); + } + } + + // Creating edges for every jump + for (auto &Jump : Func.Jumps) { + uint64_t Src = Jump.Source; + uint64_t Dst = Jump.Target; + if (Src != Dst) { + uint64_t SrcOut = 3 * Src + 1; + uint64_t DstIn = 3 * Dst; + uint64_t Cost = Jump.IsUnlikely ? MinCostMaxFlow::AuxCostUnlikely : 0; + Network.addEdge(SrcOut, DstIn, Cost); + } + } + + // Make sure we have a valid flow circulation + Network.addEdge(T, S, 0); +} + +/// Extract resulting block and edge counts from the flow network. 
+void extractWeights(MinCostMaxFlow &Network, FlowFunction &Func) { + uint64_t NumBlocks = Func.Blocks.size(); + + // Extract resulting block counts + for (uint64_t Src = 0; Src < NumBlocks; Src++) { + auto &Block = Func.Blocks[Src]; + uint64_t SrcOut = 3 * Src + 1; + int64_t Flow = 0; + for (auto &Adj : Network.getFlow(SrcOut)) { + uint64_t DstIn = Adj.first; + int64_t DstFlow = Adj.second; + bool IsAuxNode = (DstIn < 3 * NumBlocks && DstIn % 3 == 2); + if (!IsAuxNode || Block.HasSelfEdge) { + Flow += DstFlow; + } + } + Block.Flow = Flow; + assert(Flow >= 0 && "negative block flow"); + } + + // Extract resulting jump counts + for (auto &Jump : Func.Jumps) { + uint64_t Src = Jump.Source; + uint64_t Dst = Jump.Target; + int64_t Flow = 0; + if (Src != Dst) { + uint64_t SrcOut = 3 * Src + 1; + uint64_t DstIn = 3 * Dst; + Flow = Network.getFlow(SrcOut, DstIn); + } else { + uint64_t SrcOut = 3 * Src + 1; + uint64_t SrcAux = 3 * Src + 2; + int64_t AuxFlow = Network.getFlow(SrcOut, SrcAux); + if (AuxFlow > 0) + Flow = AuxFlow; + } + Jump.Flow = Flow; + assert(Flow >= 0 && "negative jump flow"); + } +} + +#ifndef NDEBUG +/// Verify that the computed flow values satisfy flow conservation rules +void verifyWeights(const FlowFunction &Func) { + const uint64_t NumBlocks = Func.Blocks.size(); + auto InFlow = std::vector<uint64_t>(NumBlocks, 0); + auto OutFlow = std::vector<uint64_t>(NumBlocks, 0); + for (auto &Jump : Func.Jumps) { + InFlow[Jump.Target] += Jump.Flow; + OutFlow[Jump.Source] += Jump.Flow; + } + + uint64_t TotalInFlow = 0; + uint64_t TotalOutFlow = 0; + for (uint64_t I = 0; I < NumBlocks; I++) { + auto &Block = Func.Blocks[I]; + if (Block.isEntry()) { + TotalInFlow += Block.Flow; + assert(Block.Flow == OutFlow[I] && "incorrectly computed control flow"); + } else if (Block.isExit()) { + TotalOutFlow += Block.Flow; + assert(Block.Flow == InFlow[I] && "incorrectly computed control flow"); + } else { + assert(Block.Flow == OutFlow[I] && "incorrectly computed control flow"); + assert(Block.Flow == InFlow[I] && "incorrectly computed control flow"); + } + } + assert(TotalInFlow == TotalOutFlow && "incorrectly computed control flow"); +} +#endif + +} // end of anonymous namespace + +/// Apply the profile inference algorithm for a given flow function +void llvm::applyFlowInference(FlowFunction &Func) { + // Create and apply an inference network model + auto InferenceNetwork = MinCostMaxFlow(); + initializeNetwork(InferenceNetwork, Func); + InferenceNetwork.run(); + + // Extract flow values for every block and every edge + extractWeights(InferenceNetwork, Func); + +#ifndef NDEBUG + // Verify the result + verifyWeights(Func); +#endif +} diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SampleProfileLoaderBaseUtil.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SampleProfileLoaderBaseUtil.cpp index 6d995cf4c048..ea0e8343eb88 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/SampleProfileLoaderBaseUtil.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SampleProfileLoaderBaseUtil.cpp @@ -34,6 +34,10 @@ cl::opt<bool> NoWarnSampleUnused( cl::desc("Use this option to turn off/on warnings about function with " "samples but without debug information to use those samples. ")); +cl::opt<bool> SampleProfileUseProfi( + "sample-profile-use-profi", cl::init(false), cl::Hidden, cl::ZeroOrMore, + cl::desc("Use profi to infer block and edge counts.")); + namespace sampleprofutil { /// Return true if the given callsite is hot wrt to hot cutoff threshold. 
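The augmenting-path search in the new MinCostMaxFlow class is a queue-based Moore-Bellman-Ford relaxation over the residual network, with the two early-stop checks spelled out in its comments. A compact standalone sketch over a plain adjacency list, reusing the same Edge field names but none of the surrounding LLVM machinery, could look roughly like this:

#include <cstddef>
#include <cstdint>
#include <queue>
#include <vector>

static constexpr int64_t INF = int64_t(1) << 50; // same sentinel as above

struct Edge { int64_t Cost, Capacity, Flow; size_t Dst; };
struct NodeState { int64_t Distance; size_t ParentNode, ParentEdge; bool InQueue; };

// Returns true if a path of positive residual capacity from Source to Target
// exists; Nodes is left holding the parent links needed to augment along it.
bool findAugmentingPath(const std::vector<std::vector<Edge>> &Edges,
                        std::vector<NodeState> &Nodes, size_t Source,
                        size_t Target) {
  for (NodeState &N : Nodes)
    N = {INF, size_t(-1), size_t(-1), false};

  std::queue<size_t> Queue;
  Nodes[Source].Distance = 0;
  Nodes[Source].InQueue = true;
  Queue.push(Source);

  while (!Queue.empty()) {
    size_t Src = Queue.front();
    Queue.pop();
    Nodes[Src].InQueue = false;

    // Early-stop pruning: a zero-length path to Target is already shortest,
    // and nodes strictly farther than Target cannot lie on a shortest path.
    if (Nodes[Target].Distance == 0)
      break;
    if (Nodes[Src].Distance > Nodes[Target].Distance)
      continue;

    for (size_t I = 0; I < Edges[Src].size(); ++I) {
      const Edge &E = Edges[Src][I];
      if (E.Flow >= E.Capacity)
        continue; // no residual capacity on this edge
      int64_t NewDist = Nodes[Src].Distance + E.Cost;
      if (NewDist < Nodes[E.Dst].Distance) {
        Nodes[E.Dst].Distance = NewDist;
        Nodes[E.Dst].ParentNode = Src;
        Nodes[E.Dst].ParentEdge = I;
        if (!Nodes[E.Dst].InQueue) {
          Nodes[E.Dst].InQueue = true;
          Queue.push(E.Dst);
        }
      }
    }
  }
  return Nodes[Target].Distance != INF;
}

Each call costs O(n*m) in the worst case, which with one call per unit of augmented flow matches the O(v(f)*m*n) bound quoted in the patch, although typical profile-inference instances converge much faster.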
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index a042146d7ace..71c15d5c51fc 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -18,6 +18,7 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" @@ -1833,22 +1834,6 @@ Value *SCEVExpander::expandCodeForImpl(const SCEV *SH, Type *Ty, bool Root) { return V; } -/// Check whether value has nuw/nsw/exact set but SCEV does not. -/// TODO: In reality it is better to check the poison recursively -/// but this is better than nothing. -static bool SCEVLostPoisonFlags(const SCEV *S, const Instruction *I) { - if (isa<OverflowingBinaryOperator>(I)) { - if (auto *NS = dyn_cast<SCEVNAryExpr>(S)) { - if (I->hasNoSignedWrap() && !NS->hasNoSignedWrap()) - return true; - if (I->hasNoUnsignedWrap() && !NS->hasNoUnsignedWrap()) - return true; - } - } else if (isa<PossiblyExactOperator>(I) && I->isExact()) - return true; - return false; -} - ScalarEvolution::ValueOffsetPair SCEVExpander::FindValueInExprValueMap(const SCEV *S, const Instruction *InsertPt) { @@ -1872,8 +1857,7 @@ SCEVExpander::FindValueInExprValueMap(const SCEV *S, if (S->getType() == V->getType() && SE.DT.dominates(EntInst, InsertPt) && (SE.LI.getLoopFor(EntInst->getParent()) == nullptr || - SE.LI.getLoopFor(EntInst->getParent())->contains(InsertPt)) && - !SCEVLostPoisonFlags(S, EntInst)) + SE.LI.getLoopFor(EntInst->getParent())->contains(InsertPt))) return {V, Offset}; } } @@ -1952,26 +1936,36 @@ Value *SCEVExpander::expand(const SCEV *S) { if (!V) V = visit(S); - else if (VO.second) { - if (PointerType *Vty = dyn_cast<PointerType>(V->getType())) { - Type *Ety = Vty->getPointerElementType(); - int64_t Offset = VO.second->getSExtValue(); - int64_t ESize = SE.getTypeSizeInBits(Ety); - if ((Offset * 8) % ESize == 0) { - ConstantInt *Idx = + else { + // If we're reusing an existing instruction, we are effectively CSEing two + // copies of the instruction (with potentially different flags). As such, + // we need to drop any poison generating flags unless we can prove that + // said flags must be valid for all new users. 
+ if (auto *I = dyn_cast<Instruction>(V)) + if (I->hasPoisonGeneratingFlags() && !programUndefinedIfPoison(I)) + I->dropPoisonGeneratingFlags(); + + if (VO.second) { + if (PointerType *Vty = dyn_cast<PointerType>(V->getType())) { + Type *Ety = Vty->getPointerElementType(); + int64_t Offset = VO.second->getSExtValue(); + int64_t ESize = SE.getTypeSizeInBits(Ety); + if ((Offset * 8) % ESize == 0) { + ConstantInt *Idx = ConstantInt::getSigned(VO.second->getType(), -(Offset * 8) / ESize); - V = Builder.CreateGEP(Ety, V, Idx, "scevgep"); - } else { - ConstantInt *Idx = + V = Builder.CreateGEP(Ety, V, Idx, "scevgep"); + } else { + ConstantInt *Idx = ConstantInt::getSigned(VO.second->getType(), -Offset); - unsigned AS = Vty->getAddressSpace(); - V = Builder.CreateBitCast(V, Type::getInt8PtrTy(SE.getContext(), AS)); - V = Builder.CreateGEP(Type::getInt8Ty(SE.getContext()), V, Idx, - "uglygep"); - V = Builder.CreateBitCast(V, Vty); + unsigned AS = Vty->getAddressSpace(); + V = Builder.CreateBitCast(V, Type::getInt8PtrTy(SE.getContext(), AS)); + V = Builder.CreateGEP(Type::getInt8Ty(SE.getContext()), V, Idx, + "uglygep"); + V = Builder.CreateBitCast(V, Vty); + } + } else { + V = Builder.CreateSub(V, VO.second); } - } else { - V = Builder.CreateSub(V, VO.second); } } // Remember the expanded value for this SCEV at this location. @@ -2180,7 +2174,9 @@ SCEVExpander::getRelatedExistingExpansion(const SCEV *S, const Instruction *At, } // Use expand's logic which is used for reusing a previous Value in - // ExprValueMap. + // ExprValueMap. Note that we don't currently model the cost of + // needing to drop poison generating flags on the instruction if we + // want to reuse it. We effectively assume that has zero cost. ScalarEvolution::ValueOffsetPair VO = FindValueInExprValueMap(S, At); if (VO.first) return VO; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index f467de5f924e..afa3ecde77f9 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -3936,7 +3936,7 @@ bool SimplifyCFGOpt::SimplifyTerminatorOnSelect(Instruction *OldTerm, BasicBlock *KeepEdge1 = TrueBB; BasicBlock *KeepEdge2 = TrueBB != FalseBB ? FalseBB : nullptr; - SmallPtrSet<BasicBlock *, 2> RemovedSuccessors; + SmallSetVector<BasicBlock *, 2> RemovedSuccessors; // Then remove the rest. 
for (BasicBlock *Succ : successors(OldTerm)) { @@ -4782,6 +4782,26 @@ static bool CasesAreContiguous(SmallVectorImpl<ConstantInt *> &Cases) { return true; } +static void createUnreachableSwitchDefault(SwitchInst *Switch, + DomTreeUpdater *DTU) { + LLVM_DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n"); + auto *BB = Switch->getParent(); + auto *OrigDefaultBlock = Switch->getDefaultDest(); + OrigDefaultBlock->removePredecessor(BB); + BasicBlock *NewDefaultBlock = BasicBlock::Create( + BB->getContext(), BB->getName() + ".unreachabledefault", BB->getParent(), + OrigDefaultBlock); + new UnreachableInst(Switch->getContext(), NewDefaultBlock); + Switch->setDefaultDest(&*NewDefaultBlock); + if (DTU) { + SmallVector<DominatorTree::UpdateType, 2> Updates; + Updates.push_back({DominatorTree::Insert, BB, &*NewDefaultBlock}); + if (!is_contained(successors(BB), OrigDefaultBlock)) + Updates.push_back({DominatorTree::Delete, BB, &*OrigDefaultBlock}); + DTU->applyUpdates(Updates); + } +} + /// Turn a switch with two reachable destinations into an integer range /// comparison and branch. bool SimplifyCFGOpt::TurnSwitchRangeIntoICmp(SwitchInst *SI, @@ -4927,10 +4947,14 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, // Gather dead cases. SmallVector<ConstantInt *, 8> DeadCases; SmallDenseMap<BasicBlock *, int, 8> NumPerSuccessorCases; + SmallVector<BasicBlock *, 8> UniqueSuccessors; for (auto &Case : SI->cases()) { auto *Successor = Case.getCaseSuccessor(); - if (DTU) + if (DTU) { + if (!NumPerSuccessorCases.count(Successor)) + UniqueSuccessors.push_back(Successor); ++NumPerSuccessorCases[Successor]; + } const APInt &CaseVal = Case.getCaseValue()->getValue(); if (Known.Zero.intersects(CaseVal) || !Known.One.isSubsetOf(CaseVal) || (CaseVal.getMinSignedBits() > MaxSignificantBitsInCond)) { @@ -4973,9 +4997,9 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, if (DTU) { std::vector<DominatorTree::UpdateType> Updates; - for (const std::pair<BasicBlock *, int> &I : NumPerSuccessorCases) - if (I.second == 0) - Updates.push_back({DominatorTree::Delete, SI->getParent(), I.first}); + for (auto *Successor : UniqueSuccessors) + if (NumPerSuccessorCases[Successor] == 0) + Updates.push_back({DominatorTree::Delete, SI->getParent(), Successor}); DTU->applyUpdates(Updates); } @@ -6040,15 +6064,13 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, if (Succ == SI->getDefaultDest()) continue; Succ->removePredecessor(BB); - RemovedSuccessors.insert(Succ); + if (DTU && RemovedSuccessors.insert(Succ).second) + Updates.push_back({DominatorTree::Delete, BB, Succ}); } SI->eraseFromParent(); - if (DTU) { - for (BasicBlock *RemovedSuccessor : RemovedSuccessors) - Updates.push_back({DominatorTree::Delete, BB, RemovedSuccessor}); + if (DTU) DTU->applyUpdates(Updates); - } ++NumLookupTables; if (NeedMask) @@ -6215,7 +6237,7 @@ bool SimplifyCFGOpt::simplifyIndirectBr(IndirectBrInst *IBI) { // Eliminate redundant destinations. SmallPtrSet<Value *, 8> Succs; - SmallPtrSet<BasicBlock *, 8> RemovedSuccs; + SmallSetVector<BasicBlock *, 8> RemovedSuccs; for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) { BasicBlock *Dest = IBI->getDestination(i); if (!Dest->hasAddressTaken() || !Succs.insert(Dest).second) { @@ -6305,8 +6327,8 @@ static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI, // We've found an identical block. Update our predecessors to take that // path instead and make ourselves dead. 
- SmallPtrSet<BasicBlock *, 16> Preds(pred_begin(BB), pred_end(BB)); - for (BasicBlock *Pred : Preds) { + SmallSetVector<BasicBlock *, 16> UniquePreds(pred_begin(BB), pred_end(BB)); + for (BasicBlock *Pred : UniquePreds) { InvokeInst *II = cast<InvokeInst>(Pred->getTerminator()); assert(II->getNormalDest() != BB && II->getUnwindDest() == BB && "unexpected successor"); @@ -6323,8 +6345,8 @@ static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI, if (isa<DbgInfoIntrinsic>(Inst)) Inst.eraseFromParent(); - SmallPtrSet<BasicBlock *, 16> Succs(succ_begin(BB), succ_end(BB)); - for (BasicBlock *Succ : Succs) { + SmallSetVector<BasicBlock *, 16> UniqueSuccs(succ_begin(BB), succ_end(BB)); + for (BasicBlock *Succ : UniqueSuccs) { Succ->removePredecessor(BB); if (DTU) Updates.push_back({DominatorTree::Delete, BB, Succ}); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 23bb6f0860c9..5ca0adb4242c 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -473,18 +473,10 @@ public: /// handle the more complex control flow around the loops. virtual BasicBlock *createVectorizedLoopSkeleton(); - /// Widen a single instruction within the innermost loop. - void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands, - VPTransformState &State); - /// Widen a single call instruction within the innermost loop. void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands, VPTransformState &State); - /// Widen a single select instruction within the innermost loop. - void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands, - bool InvariantCond, VPTransformState &State); - /// Fix the vectorized code, taking care of header phi's, live-outs, and more. void fixVectorizedLoop(VPTransformState &State); @@ -496,12 +488,6 @@ public: /// new unrolled loop, where UF is the unroll factor. using VectorParts = SmallVector<Value *, 2>; - /// Vectorize a single GetElementPtrInst based on information gathered and - /// decisions taken during planning. - void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices, - unsigned UF, ElementCount VF, bool IsPtrLoopInvariant, - SmallBitVector &IsIndexLoopInvariant, VPTransformState &State); - /// Vectorize a single first-order recurrence or pointer induction PHINode in /// a block. This method handles the induction variable canonicalization. It /// supports both VF = 1 for unrolled loops and arbitrary length vectors. @@ -511,9 +497,9 @@ public: /// A helper function to scalarize a single Instruction in the innermost loop. /// Generates a sequence of scalar instances for each lane between \p MinLane /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, - /// inclusive. Uses the VPValue operands from \p Operands instead of \p + /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p /// Instr's operands. 
- void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands, + void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPIteration &Instance, bool IfPredicateInstr, VPTransformState &State); @@ -538,15 +524,6 @@ public: ArrayRef<VPValue *> StoredValues, VPValue *BlockInMask = nullptr); - /// Vectorize Load and Store instructions with the base address given in \p - /// Addr, optionally masking the vector operations if \p BlockInMask is - /// non-null. Use \p State to translate given VPValues to IR values in the - /// vectorized loop. - void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, - VPValue *Def, VPValue *Addr, - VPValue *StoredValue, VPValue *BlockInMask, - bool ConsecutiveStride, bool Reverse); - /// Set the debug location in the builder \p Ptr using the debug location in /// \p V. If \p Ptr is None then it uses the class member's Builder. void setDebugLocFromInst(const Value *V, @@ -566,6 +543,17 @@ public: /// element. virtual Value *getBroadcastInstrs(Value *V); + /// Add metadata from one instruction to another. + /// + /// This includes both the original MDs from \p From and additional ones (\see + /// addNewMetadata). Use this for *newly created* instructions in the vector + /// loop. + void addMetadata(Instruction *To, Instruction *From); + + /// Similar to the previous function but it adds the metadata to a + /// vector of instructions. + void addMetadata(ArrayRef<Value *> To, Instruction *From); + protected: friend class LoopVectorizationPlanner; @@ -741,16 +729,16 @@ protected: /// vector loop. void addNewMetadata(Instruction *To, const Instruction *Orig); - /// Add metadata from one instruction to another. - /// - /// This includes both the original MDs from \p From and additional ones (\see - /// addNewMetadata). Use this for *newly created* instructions in the vector - /// loop. - void addMetadata(Instruction *To, Instruction *From); - - /// Similar to the previous function but it adds the metadata to a - /// vector of instructions. - void addMetadata(ArrayRef<Value *> To, Instruction *From); + /// Collect poison-generating recipes that may generate a poison value that is + /// used after vectorization, even when their operands are not poison. Those + /// recipes meet the following conditions: + /// * Contribute to the address computation of a recipe generating a widen + /// memory load/store (VPWidenMemoryInstructionRecipe or + /// VPInterleaveRecipe). + /// * Such a widen memory load/store has at least one underlying Instruction + /// that is in a basic block that needs predication and after vectorization + /// the generated instruction won't be predicated. + void collectPoisonGeneratingRecipes(VPTransformState &State); /// Allow subclasses to override and print debug traces before/after vplan /// execution, when trace information is requested. @@ -1173,6 +1161,84 @@ void InnerLoopVectorizer::addNewMetadata(Instruction *To, LVer->annotateInstWithNoAlias(To, Orig); } +void InnerLoopVectorizer::collectPoisonGeneratingRecipes( + VPTransformState &State) { + + // Collect recipes in the backward slice of `Root` that may generate a poison + // value that is used after vectorization. + SmallPtrSet<VPRecipeBase *, 16> Visited; + auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) { + SmallVector<VPRecipeBase *, 16> Worklist; + Worklist.push_back(Root); + + // Traverse the backward slice of Root through its use-def chain. 
+ while (!Worklist.empty()) { + VPRecipeBase *CurRec = Worklist.back(); + Worklist.pop_back(); + + if (!Visited.insert(CurRec).second) + continue; + + // Prune search if we find another recipe generating a widen memory + // instruction. Widen memory instructions involved in address computation + // will lead to gather/scatter instructions, which don't need to be + // handled. + if (isa<VPWidenMemoryInstructionRecipe>(CurRec) || + isa<VPInterleaveRecipe>(CurRec)) + continue; + + // This recipe contributes to the address computation of a widen + // load/store. Collect recipe if its underlying instruction has + // poison-generating flags. + Instruction *Instr = CurRec->getUnderlyingInstr(); + if (Instr && Instr->hasPoisonGeneratingFlags()) + State.MayGeneratePoisonRecipes.insert(CurRec); + + // Add new definitions to the worklist. + for (VPValue *operand : CurRec->operands()) + if (VPDef *OpDef = operand->getDef()) + Worklist.push_back(cast<VPRecipeBase>(OpDef)); + } + }); + + // Traverse all the recipes in the VPlan and collect the poison-generating + // recipes in the backward slice starting at the address of a VPWidenRecipe or + // VPInterleaveRecipe. + auto Iter = depth_first( + VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry())); + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { + for (VPRecipeBase &Recipe : *VPBB) { + if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) { + Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr(); + VPDef *AddrDef = WidenRec->getAddr()->getDef(); + if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr && + Legal->blockNeedsPredication(UnderlyingInstr->getParent())) + collectPoisonGeneratingInstrsInBackwardSlice( + cast<VPRecipeBase>(AddrDef)); + } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) { + VPDef *AddrDef = InterleaveRec->getAddr()->getDef(); + if (AddrDef) { + // Check if any member of the interleave group needs predication. + const InterleaveGroup<Instruction> *InterGroup = + InterleaveRec->getInterleaveGroup(); + bool NeedPredication = false; + for (int I = 0, NumMembers = InterGroup->getNumMembers(); + I < NumMembers; ++I) { + Instruction *Member = InterGroup->getMember(I); + if (Member) + NeedPredication |= + Legal->blockNeedsPredication(Member->getParent()); + } + + if (NeedPredication) + collectPoisonGeneratingInstrsInBackwardSlice( + cast<VPRecipeBase>(AddrDef)); + } + } + } + } +} + void InnerLoopVectorizer::addMetadata(Instruction *To, Instruction *From) { propagateMetadata(To, From); @@ -1541,7 +1607,16 @@ public: // Returns true if \p I is an instruction that will be predicated either // through scalar predication or masked load/store or masked gather/scatter. // Superset of instructions that return true for isScalarWithPredication. - bool isPredicatedInst(Instruction *I) { + bool isPredicatedInst(Instruction *I, bool IsKnownUniform = false) { + // When we know the load is uniform and the original scalar loop was not + // predicated we don't need to mark it as a predicated instruction. Any + // vectorised blocks created when tail-folding are something artificial we + // have introduced and we know there is always at least one active lane. + // That's why we call Legal->blockNeedsPredication here because it doesn't + // query tail-folding. 
+ if (IsKnownUniform && isa<LoadInst>(I) && + !Legal->blockNeedsPredication(I->getParent())) + return false; if (!blockNeedsPredicationForAnyReason(I->getParent())) return false; // Loads and stores that need some form of masked operation are predicated @@ -1816,9 +1891,11 @@ private: /// Collect the instructions that are scalar after vectorization. An /// instruction is scalar if it is known to be uniform or will be scalarized - /// during vectorization. Non-uniform scalarized instructions will be - /// represented by VF values in the vectorized loop, each corresponding to an - /// iteration of the original scalar loop. + /// during vectorization. collectLoopScalars should only add non-uniform nodes + /// to the list if they are used by a load/store instruction that is marked as + /// CM_Scalarize. Non-uniform scalarized instructions will be represented by + /// VF values in the vectorized loop, each corresponding to an iteration of + /// the original scalar loop. void collectLoopScalars(ElementCount VF); /// Keeps cost model vectorization decision and cost for instructions. @@ -2918,132 +2995,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( } } -void InnerLoopVectorizer::vectorizeMemoryInstruction( - Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, - VPValue *StoredValue, VPValue *BlockInMask, bool ConsecutiveStride, - bool Reverse) { - // Attempt to issue a wide load. - LoadInst *LI = dyn_cast<LoadInst>(Instr); - StoreInst *SI = dyn_cast<StoreInst>(Instr); - - assert((LI || SI) && "Invalid Load/Store instruction"); - assert((!SI || StoredValue) && "No stored value provided for widened store"); - assert((!LI || !StoredValue) && "Stored value provided for widened load"); - - Type *ScalarDataTy = getLoadStoreType(Instr); - - auto *DataTy = VectorType::get(ScalarDataTy, VF); - const Align Alignment = getLoadStoreAlignment(Instr); - bool CreateGatherScatter = !ConsecutiveStride; - - VectorParts BlockInMaskParts(UF); - bool isMaskRequired = BlockInMask; - if (isMaskRequired) - for (unsigned Part = 0; Part < UF; ++Part) - BlockInMaskParts[Part] = State.get(BlockInMask, Part); - - const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { - // Calculate the pointer for the specific unroll-part. - GetElementPtrInst *PartPtr = nullptr; - - bool InBounds = false; - if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) - InBounds = gep->isInBounds(); - if (Reverse) { - // If the address is consecutive but reversed, then the - // wide store needs to start at the last vector element. - // RunTimeVF = VScale * VF.getKnownMinValue() - // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() - Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); - // NumElt = -Part * RunTimeVF - Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); - // LastLane = 1 - RunTimeVF - Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); - PartPtr = - cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); - PartPtr->setIsInBounds(InBounds); - PartPtr = cast<GetElementPtrInst>( - Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); - PartPtr->setIsInBounds(InBounds); - if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 
- BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); - } else { - Value *Increment = - createStepForVF(Builder, Builder.getInt32Ty(), VF, Part); - PartPtr = cast<GetElementPtrInst>( - Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); - PartPtr->setIsInBounds(InBounds); - } - - unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); - return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); - }; - - // Handle Stores: - if (SI) { - setDebugLocFromInst(SI); - - for (unsigned Part = 0; Part < UF; ++Part) { - Instruction *NewSI = nullptr; - Value *StoredVal = State.get(StoredValue, Part); - if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; - Value *VectorGep = State.get(Addr, Part); - NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, - MaskPart); - } else { - if (Reverse) { - // If we store to reverse consecutive memory locations, then we need - // to reverse the order of elements in the stored value. - StoredVal = reverseVector(StoredVal); - // We don't want to update the value in the map as it might be used in - // another expression. So don't call resetVectorValue(StoredVal). - } - auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); - if (isMaskRequired) - NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, - BlockInMaskParts[Part]); - else - NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); - } - addMetadata(NewSI, SI); - } - return; - } - - // Handle loads. - assert(LI && "Must have a load instruction"); - setDebugLocFromInst(LI); - for (unsigned Part = 0; Part < UF; ++Part) { - Value *NewLI; - if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; - Value *VectorGep = State.get(Addr, Part); - NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, - nullptr, "wide.masked.gather"); - addMetadata(NewLI, LI); - } else { - auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); - if (isMaskRequired) - NewLI = Builder.CreateMaskedLoad( - DataTy, VecPtr, Alignment, BlockInMaskParts[Part], - PoisonValue::get(DataTy), "wide.masked.load"); - else - NewLI = - Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); - - // Add metadata to the load, but setVectorValue to the reverse shuffle. - addMetadata(NewLI, LI); - if (Reverse) - NewLI = reverseVector(NewLI); - } - - State.set(Def, NewLI, Part); - } -} - -void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, - VPUser &User, +void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, + VPReplicateRecipe *RepRecipe, const VPIteration &Instance, bool IfPredicateInstr, VPTransformState &State) { @@ -3064,17 +3017,26 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, if (!IsVoidRetTy) Cloned->setName(Instr->getName() + ".cloned"); + // If the scalarized instruction contributes to the address computation of a + // widen masked load/store which was in a basic block that needed predication + // and is not predicated after vectorization, we can't propagate + // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized + // instruction could feed a poison value to the base address of the widen + // load/store. 
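The comment above, and the MayGeneratePoisonRecipes check that follows on the cloned instruction, are the codegen half of collectPoisonGeneratingRecipes. A small illustration (made-up data, plain C++) of why the flags cannot survive linearization: under the original control flow the guarded lane never computed its address, but after vectorization every lane's index is materialized, even for masked-out lanes.

#include <cstdio>

int main() {
  int a[4] = {10, 20, 30, 40};
  bool mask[4] = {true, true, true, false}; // lane 3 was guarded in the scalar loop
  long idx[4];
  for (int lane = 0; lane < 4; ++lane)
    idx[lane] = lane + 1;                   // lane 3 computes index 4: out of bounds
  for (int lane = 0; lane < 4; ++lane)
    if (mask[lane])                         // the masked load touches active lanes only
      std::printf("lane %d -> %d\n", lane, a[idx[lane]]);
  // The index for lane 3 now exists even though it is never dereferenced;
  // keeping an 'inbounds'/'nsw'-style flag on that computation would assert a
  // fact that no longer holds, which is exactly what the dropped flags avoid.
}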
+ if (State.MayGeneratePoisonRecipes.count(RepRecipe) > 0) + Cloned->dropPoisonGeneratingFlags(); + State.Builder.SetInsertPoint(Builder.GetInsertBlock(), Builder.GetInsertPoint()); // Replace the operands of the cloned instructions with their scalar // equivalents in the new loop. - for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { + for (unsigned op = 0, e = RepRecipe->getNumOperands(); op != e; ++op) { auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); auto InputInstance = Instance; if (!Operand || !OrigLoop->contains(Operand) || (Cost->isUniformAfterVectorization(Operand, State.VF))) InputInstance.Lane = VPLane::getFirstLane(); - auto *NewOp = State.get(User.getOperand(op), InputInstance); + auto *NewOp = State.get(RepRecipe->getOperand(op), InputInstance); Cloned->setOperand(op, NewOp); } addNewMetadata(Cloned, Instr); @@ -3082,7 +3044,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, // Place the cloned scalar in the new loop. Builder.Insert(Cloned); - State.set(Def, Cloned, Instance); + State.set(RepRecipe, Cloned, Instance); // If we just cloned a new assumption, add it the assumption cache. if (auto *II = dyn_cast<AssumeInst>(Cloned)) @@ -4615,77 +4577,6 @@ bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) { return Cost->useOrderedReductions(RdxDesc); } -void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, - VPUser &Operands, unsigned UF, - ElementCount VF, bool IsPtrLoopInvariant, - SmallBitVector &IsIndexLoopInvariant, - VPTransformState &State) { - // Construct a vector GEP by widening the operands of the scalar GEP as - // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP - // results in a vector of pointers when at least one operand of the GEP - // is vector-typed. Thus, to keep the representation compact, we only use - // vector-typed operands for loop-varying values. - - if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { - // If we are vectorizing, but the GEP has only loop-invariant operands, - // the GEP we build (by only using vector-typed operands for - // loop-varying values) would be a scalar pointer. Thus, to ensure we - // produce a vector of pointers, we need to either arbitrarily pick an - // operand to broadcast, or broadcast a clone of the original GEP. - // Here, we broadcast a clone of the original. - // - // TODO: If at some point we decide to scalarize instructions having - // loop-invariant operands, this special case will no longer be - // required. We would add the scalarization decision to - // collectLoopScalars() and teach getVectorValue() to broadcast - // the lane-zero scalar value. - auto *Clone = Builder.Insert(GEP->clone()); - for (unsigned Part = 0; Part < UF; ++Part) { - Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); - State.set(VPDef, EntryPart, Part); - addMetadata(EntryPart, GEP); - } - } else { - // If the GEP has at least one loop-varying operand, we are sure to - // produce a vector of pointers. But if we are only unrolling, we want - // to produce a scalar GEP for each unroll part. Thus, the GEP we - // produce with the code below will be scalar (if VF == 1) or vector - // (otherwise). Note that for the unroll-only case, we still maintain - // values in the vector mapping with initVector, as we do for other - // instructions. - for (unsigned Part = 0; Part < UF; ++Part) { - // The pointer operand of the new GEP. If it's loop-invariant, we - // won't broadcast it. 
- auto *Ptr = IsPtrLoopInvariant - ? State.get(Operands.getOperand(0), VPIteration(0, 0)) - : State.get(Operands.getOperand(0), Part); - - // Collect all the indices for the new GEP. If any index is - // loop-invariant, we won't broadcast it. - SmallVector<Value *, 4> Indices; - for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { - VPValue *Operand = Operands.getOperand(I); - if (IsIndexLoopInvariant[I - 1]) - Indices.push_back(State.get(Operand, VPIteration(0, 0))); - else - Indices.push_back(State.get(Operand, Part)); - } - - // Create the new GEP. Note that this GEP may be a scalar if VF == 1, - // but it should be a vector, otherwise. - auto *NewGEP = - GEP->isInBounds() - ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, - Indices) - : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); - assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && - "NewGEP is not a pointer vector"); - State.set(VPDef, NewGEP, Part); - addMetadata(NewGEP, GEP); - } - } -} - void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR, VPTransformState &State) { @@ -4745,38 +4636,14 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, // iteration. If the instruction is uniform, we only need to generate the // first lane. Otherwise, we generate all VF values. bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF); - unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue(); - - bool NeedsVectorIndex = !IsUniform && VF.isScalable(); - Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr; - if (NeedsVectorIndex) { - Type *VecIVTy = VectorType::get(PtrInd->getType(), VF); - UnitStepVec = Builder.CreateStepVector(VecIVTy); - PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd); - } + assert((IsUniform || !State.VF.isScalable()) && + "Cannot scalarize a scalable VF"); + unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); for (unsigned Part = 0; Part < UF; ++Part) { Value *PartStart = createStepForVF(Builder, PtrInd->getType(), VF, Part); - if (NeedsVectorIndex) { - // Here we cache the whole vector, which means we can support the - // extraction of any lane. However, in some cases the extractelement - // instruction that is generated for scalar uses of this vector (e.g. - // a load instruction) is not folded away. Therefore we still - // calculate values for the first n lanes to avoid redundant moves - // (when extracting the 0th element) and to produce scalar code (i.e. - // additional add/gep instructions instead of expensive extractelement - // instructions) when extracting higher-order elements. 
- Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart); - Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec); - Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices); - Value *SclrGep = - emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II); - SclrGep->setName("next.gep"); - State.set(PhiR, SclrGep, Part); - } - for (unsigned Lane = 0; Lane < Lanes; ++Lane) { Value *Idx = Builder.CreateAdd( PartStart, ConstantInt::get(PtrInd->getType(), Lane)); @@ -4858,114 +4725,6 @@ static bool mayDivideByZero(Instruction &I) { return !CInt || CInt->isZero(); } -void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, - VPUser &User, - VPTransformState &State) { - switch (I.getOpcode()) { - case Instruction::Call: - case Instruction::Br: - case Instruction::PHI: - case Instruction::GetElementPtr: - case Instruction::Select: - llvm_unreachable("This instruction is handled by a different recipe."); - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::SRem: - case Instruction::URem: - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::FNeg: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::FDiv: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - // Just widen unops and binops. - setDebugLocFromInst(&I); - - for (unsigned Part = 0; Part < UF; ++Part) { - SmallVector<Value *, 2> Ops; - for (VPValue *VPOp : User.operands()) - Ops.push_back(State.get(VPOp, Part)); - - Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); - - if (auto *VecOp = dyn_cast<Instruction>(V)) - VecOp->copyIRFlags(&I); - - // Use this vector value for all users of the original instruction. - State.set(Def, V, Part); - addMetadata(V, &I); - } - - break; - } - case Instruction::ICmp: - case Instruction::FCmp: { - // Widen compares. Generate vector compares. - bool FCmp = (I.getOpcode() == Instruction::FCmp); - auto *Cmp = cast<CmpInst>(&I); - setDebugLocFromInst(Cmp); - for (unsigned Part = 0; Part < UF; ++Part) { - Value *A = State.get(User.getOperand(0), Part); - Value *B = State.get(User.getOperand(1), Part); - Value *C = nullptr; - if (FCmp) { - // Propagate fast math flags. - IRBuilder<>::FastMathFlagGuard FMFG(Builder); - Builder.setFastMathFlags(Cmp->getFastMathFlags()); - C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); - } else { - C = Builder.CreateICmp(Cmp->getPredicate(), A, B); - } - State.set(Def, C, Part); - addMetadata(C, &I); - } - - break; - } - - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: { - auto *CI = cast<CastInst>(&I); - setDebugLocFromInst(CI); - - /// Vectorize casts. - Type *DestTy = - (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); - - for (unsigned Part = 0; Part < UF; ++Part) { - Value *A = State.get(User.getOperand(0), Part); - Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); - State.set(Def, Cast, Part); - addMetadata(Cast, &I); - } - break; - } - default: - // This instruction is not vectorized by simple widening. 
- LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); - llvm_unreachable("Unhandled instruction!"); - } // end of switch. -} - void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands, VPTransformState &State) { @@ -5039,31 +4798,6 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, } } -void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, - VPUser &Operands, - bool InvariantCond, - VPTransformState &State) { - setDebugLocFromInst(&I); - - // The condition can be loop invariant but still defined inside the - // loop. This means that we can't just use the original 'cond' value. - // We have to take the 'vectorized' value and pick the first lane. - // Instcombine will make this a no-op. - auto *InvarCond = InvariantCond - ? State.get(Operands.getOperand(0), VPIteration(0, 0)) - : nullptr; - - for (unsigned Part = 0; Part < UF; ++Part) { - Value *Cond = - InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); - Value *Op0 = State.get(Operands.getOperand(1), Part); - Value *Op1 = State.get(Operands.getOperand(2), Part); - Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); - State.set(VPDef, Sel, Part); - addMetadata(Sel, &I); - } -} - void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { // We should not collect Scalars more than once per VF. Right now, this // function is called from collectUniformsAndScalars(), which already does @@ -5103,38 +4837,11 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { !TheLoop->isLoopInvariant(V); }; - auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { - if (!isa<PHINode>(Ptr) || - !Legal->getInductionVars().count(cast<PHINode>(Ptr))) - return false; - auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; - if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) - return false; - return isScalarUse(MemAccess, Ptr); - }; - - // A helper that evaluates a memory access's use of a pointer. If the - // pointer is actually the pointer induction of a loop, it is being - // inserted into Worklist. If the use will be a scalar use, and the - // pointer is only used by memory accesses, we place the pointer in - // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. + // A helper that evaluates a memory access's use of a pointer. If the use will + // be a scalar use and the pointer is only used by memory accesses, we place + // the pointer in ScalarPtrs. Otherwise, the pointer is placed in + // PossibleNonScalarPtrs. auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { - if (isScalarPtrInduction(MemAccess, Ptr)) { - Worklist.insert(cast<Instruction>(Ptr)); - LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr - << "\n"); - - Instruction *Update = cast<Instruction>( - cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); - - // If there is more than one user of Update (Ptr), we shouldn't assume it - // will be scalar after vectorisation as other users of the instruction - // may require widening. Otherwise, add it to ScalarPtrs. - if (Update->hasOneUse() && cast<Value>(*Update->user_begin()) == Ptr) { - ScalarPtrs.insert(Update); - return; - } - } // We only care about bitcast and getelementptr instructions contained in // the loop. 
if (!isLoopVaryingBitCastOrGEP(Ptr)) @@ -5226,11 +4933,22 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) continue; + // Returns true if \p Indvar is a pointer induction that is used directly by + // load/store instruction \p I. + auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, + Instruction *I) { + return Induction.second.getKind() == + InductionDescriptor::IK_PtrInduction && + (isa<LoadInst>(I) || isa<StoreInst>(I)) && + Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); + }; + // Determine if all users of the induction variable are scalar after // vectorization. auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { auto *I = cast<Instruction>(U); - return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); + return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || + IsDirectLoadStoreFromPtrIndvar(Ind, I); }); if (!ScalarInd) continue; @@ -5240,7 +4958,8 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { auto ScalarIndUpdate = llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { auto *I = cast<Instruction>(U); - return I == Ind || !TheLoop->contains(I) || Worklist.count(I); + return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || + IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); }); if (!ScalarIndUpdate) continue; @@ -7079,6 +6798,8 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, unsigned AS = getLoadStoreAddressSpace(I); Value *Ptr = getLoadStorePointerOperand(I); Type *PtrTy = ToVectorTy(Ptr->getType(), VF); + // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` + // that it is being called from this specific place. // Figure out whether the access is strided and get the stride value // if it's known in compile time @@ -7286,6 +7007,12 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( InstructionCost BaseCost = TTI.getArithmeticReductionCost( RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); + // For a call to the llvm.fmuladd intrinsic we need to add the cost of a + // normal fmul instruction to the cost of the fadd reduction. + if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) + BaseCost += + TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); + // If we're using ordered reductions then we can just return the base cost // here, since getArithmeticReductionCost calculates the full ordered // reduction cost when FP reassociation is not allowed. 
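The cost-model change above, together with the adjustRecipesForReductions hunk further down that splices in an FMul VPInstruction, treats an in-loop fmuladd reduction as a widened fmul feeding an fadd reduction. A hedged scalar-level sketch of that decomposition in plain C++ (no LLVM types; VF and the data are made up, and reassociation of the adds is assumed to be allowed, as in the unordered-reduction case):

#include <cstdio>

int main() {
  const int N = 8, VF = 4;
  float a[N] = {1, 2, 3, 4, 5, 6, 7, 8};
  float b[N] = {8, 7, 6, 5, 4, 3, 2, 1};

  // Scalar loop: one fmuladd per iteration.
  float scalarAcc = 0.0f;
  for (int i = 0; i < N; ++i)
    scalarAcc = a[i] * b[i] + scalarAcc;

  // Vectorized form: widen the multiply, then reduce the products into the
  // running accumulator (the FMul recipe + VPReductionRecipe split).
  float vecAcc = 0.0f;
  for (int i = 0; i < N; i += VF) {
    float prod[VF];
    for (int l = 0; l < VF; ++l)
      prod[l] = a[i + l] * b[i + l];   // widened fmul
    for (int l = 0; l < VF; ++l)
      vecAcc += prod[l];               // fadd reduction of this part
  }

  std::printf("scalar = %f, vectorized = %f\n", scalarAcc, vecAcc);
}

This is also why the cost model above adds one vector FMul to the fadd-reduction base cost: the fused scalar operation is split into two vector operations.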
@@ -7962,6 +7689,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); } case Instruction::Call: { + if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) + if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) + return *RedCost; bool NeedToScalarize; CallInst *CI = cast<CallInst>(I); InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); @@ -8260,6 +7990,7 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); State.TripCount = ILV.getOrCreateTripCount(nullptr); State.CanonicalIV = ILV.Induction; + ILV.collectPoisonGeneratingRecipes(State); ILV.printDebugTracesAtStart(); @@ -8468,7 +8199,8 @@ void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { DEBUG_WITH_TYPE(VerboseDebug, { - dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; + dbgs() << "intermediate fn:\n" + << *OrigLoop->getHeader()->getParent() << "\n"; }); } @@ -8666,7 +8398,7 @@ void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { DEBUG_WITH_TYPE(VerboseDebug, { - dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; + dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; }); } @@ -9052,7 +8784,8 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( Range); bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range); + [&](ElementCount VF) { return CM.isPredicatedInst(I, IsUniform); }, + Range); // Even if the instruction is not marked as uniform, there are certain // intrinsic calls that can be effectively treated as such, so we check for @@ -9354,7 +9087,9 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( if (VPBB) VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); else { - Plan->setEntry(FirstVPBBForBB); + auto *TopRegion = new VPRegionBlock("vector loop"); + TopRegion->setEntry(FirstVPBBForBB); + Plan->setEntry(TopRegion); HeaderVPBB = FirstVPBBForBB; } VPBB = FirstVPBBForBB; @@ -9426,9 +9161,11 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( } } - assert(isa<VPBasicBlock>(Plan->getEntry()) && + assert(isa<VPRegionBlock>(Plan->getEntry()) && !Plan->getEntry()->getEntryBasicBlock()->empty() && - "entry block must be set to a non-empty VPBasicBlock"); + "entry block must be set to a VPRegionBlock having a non-empty entry " + "VPBasicBlock"); + cast<VPRegionBlock>(Plan->getEntry())->setExit(VPBB); RecipeBuilder.fixHeaderPhis(); // --------------------------------------------------------------------------- @@ -9653,12 +9390,17 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( unsigned FirstOpId; assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && "Only min/max recurrences allowed for inloop reductions"); + // Recognize a call to the llvm.fmuladd intrinsic. 
+ bool IsFMulAdd = (Kind == RecurKind::FMulAdd); + assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) && + "Expected instruction to be a call to the llvm.fmuladd intrinsic"); if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { assert(isa<VPWidenSelectRecipe>(WidenRecipe) && "Expected to replace a VPWidenSelectSC"); FirstOpId = 1; } else { - assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe)) && + assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) || + (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) && "Expected to replace a VPWidenSC"); FirstOpId = 0; } @@ -9669,8 +9411,20 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( auto *CondOp = CM.foldTailByMasking() ? RecipeBuilder.createBlockInMask(R->getParent(), Plan) : nullptr; - VPReductionRecipe *RedRecipe = new VPReductionRecipe( - &RdxDesc, R, ChainOp, VecOp, CondOp, TTI); + + if (IsFMulAdd) { + // If the instruction is a call to the llvm.fmuladd intrinsic then we + // need to create an fmul recipe to use as the vector operand for the + // fadd reduction. + VPInstruction *FMulRecipe = new VPInstruction( + Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); + FMulRecipe->setFastMathFlags(R->getFastMathFlags()); + WidenRecipe->getParent()->insert(FMulRecipe, + WidenRecipe->getIterator()); + VecOp = FMulRecipe; + } + VPReductionRecipe *RedRecipe = + new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI); WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); Plan->removeVPValueFor(R); Plan->addVPValue(R, RedRecipe); @@ -9744,18 +9498,218 @@ void VPWidenCallRecipe::execute(VPTransformState &State) { } void VPWidenSelectRecipe::execute(VPTransformState &State) { - State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), - this, *this, InvariantCond, State); + auto &I = *cast<SelectInst>(getUnderlyingInstr()); + State.ILV->setDebugLocFromInst(&I); + + // The condition can be loop invariant but still defined inside the + // loop. This means that we can't just use the original 'cond' value. + // We have to take the 'vectorized' value and pick the first lane. + // Instcombine will make this a no-op. + auto *InvarCond = + InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr; + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *Cond = InvarCond ? 
InvarCond : State.get(getOperand(0), Part); + Value *Op0 = State.get(getOperand(1), Part); + Value *Op1 = State.get(getOperand(2), Part); + Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); + State.set(this, Sel, Part); + State.ILV->addMetadata(Sel, &I); + } } void VPWidenRecipe::execute(VPTransformState &State) { - State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); + auto &I = *cast<Instruction>(getUnderlyingValue()); + auto &Builder = State.Builder; + switch (I.getOpcode()) { + case Instruction::Call: + case Instruction::Br: + case Instruction::PHI: + case Instruction::GetElementPtr: + case Instruction::Select: + llvm_unreachable("This instruction is handled by a different recipe."); + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::URem: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::FNeg: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + // Just widen unops and binops. + State.ILV->setDebugLocFromInst(&I); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + SmallVector<Value *, 2> Ops; + for (VPValue *VPOp : operands()) + Ops.push_back(State.get(VPOp, Part)); + + Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); + + if (auto *VecOp = dyn_cast<Instruction>(V)) { + VecOp->copyIRFlags(&I); + + // If the instruction is vectorized and was in a basic block that needed + // predication, we can't propagate poison-generating flags (nuw/nsw, + // exact, etc.). The control flow has been linearized and the + // instruction is no longer guarded by the predicate, which could make + // the flag properties to no longer hold. + if (State.MayGeneratePoisonRecipes.count(this) > 0) + VecOp->dropPoisonGeneratingFlags(); + } + + // Use this vector value for all users of the original instruction. + State.set(this, V, Part); + State.ILV->addMetadata(V, &I); + } + + break; + } + case Instruction::ICmp: + case Instruction::FCmp: { + // Widen compares. Generate vector compares. + bool FCmp = (I.getOpcode() == Instruction::FCmp); + auto *Cmp = cast<CmpInst>(&I); + State.ILV->setDebugLocFromInst(Cmp); + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *A = State.get(getOperand(0), Part); + Value *B = State.get(getOperand(1), Part); + Value *C = nullptr; + if (FCmp) { + // Propagate fast math flags. + IRBuilder<>::FastMathFlagGuard FMFG(Builder); + Builder.setFastMathFlags(Cmp->getFastMathFlags()); + C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); + } else { + C = Builder.CreateICmp(Cmp->getPredicate(), A, B); + } + State.set(this, C, Part); + State.ILV->addMetadata(C, &I); + } + + break; + } + + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + auto *CI = cast<CastInst>(&I); + State.ILV->setDebugLocFromInst(CI); + + /// Vectorize casts. + Type *DestTy = (State.VF.isScalar()) + ? 
CI->getType() + : VectorType::get(CI->getType(), State.VF); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *A = State.get(getOperand(0), Part); + Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); + State.set(this, Cast, Part); + State.ILV->addMetadata(Cast, &I); + } + break; + } + default: + // This instruction is not vectorized by simple widening. + LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); + llvm_unreachable("Unhandled instruction!"); + } // end of switch. } void VPWidenGEPRecipe::execute(VPTransformState &State) { - State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, - *this, State.UF, State.VF, IsPtrLoopInvariant, - IsIndexLoopInvariant, State); + auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); + // Construct a vector GEP by widening the operands of the scalar GEP as + // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP + // results in a vector of pointers when at least one operand of the GEP + // is vector-typed. Thus, to keep the representation compact, we only use + // vector-typed operands for loop-varying values. + + if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { + // If we are vectorizing, but the GEP has only loop-invariant operands, + // the GEP we build (by only using vector-typed operands for + // loop-varying values) would be a scalar pointer. Thus, to ensure we + // produce a vector of pointers, we need to either arbitrarily pick an + // operand to broadcast, or broadcast a clone of the original GEP. + // Here, we broadcast a clone of the original. + // + // TODO: If at some point we decide to scalarize instructions having + // loop-invariant operands, this special case will no longer be + // required. We would add the scalarization decision to + // collectLoopScalars() and teach getVectorValue() to broadcast + // the lane-zero scalar value. + auto *Clone = State.Builder.Insert(GEP->clone()); + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); + State.set(this, EntryPart, Part); + State.ILV->addMetadata(EntryPart, GEP); + } + } else { + // If the GEP has at least one loop-varying operand, we are sure to + // produce a vector of pointers. But if we are only unrolling, we want + // to produce a scalar GEP for each unroll part. Thus, the GEP we + // produce with the code below will be scalar (if VF == 1) or vector + // (otherwise). Note that for the unroll-only case, we still maintain + // values in the vector mapping with initVector, as we do for other + // instructions. + for (unsigned Part = 0; Part < State.UF; ++Part) { + // The pointer operand of the new GEP. If it's loop-invariant, we + // won't broadcast it. + auto *Ptr = IsPtrLoopInvariant + ? State.get(getOperand(0), VPIteration(0, 0)) + : State.get(getOperand(0), Part); + + // Collect all the indices for the new GEP. If any index is + // loop-invariant, we won't broadcast it. + SmallVector<Value *, 4> Indices; + for (unsigned I = 1, E = getNumOperands(); I < E; I++) { + VPValue *Operand = getOperand(I); + if (IsIndexLoopInvariant[I - 1]) + Indices.push_back(State.get(Operand, VPIteration(0, 0))); + else + Indices.push_back(State.get(Operand, Part)); + } + + // If the GEP instruction is vectorized and was in a basic block that + // needed predication, we can't propagate the poison-generating 'inbounds' + // flag. 
The control flow has been linearized and the GEP is no longer + // guarded by the predicate, which could make the 'inbounds' properties to + // no longer hold. + bool IsInBounds = + GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0; + + // Create the new GEP. Note that this GEP may be a scalar if VF == 1, + // but it should be a vector, otherwise. + auto *NewGEP = IsInBounds + ? State.Builder.CreateInBoundsGEP( + GEP->getSourceElementType(), Ptr, Indices) + : State.Builder.CreateGEP(GEP->getSourceElementType(), + Ptr, Indices); + assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && + "NewGEP is not a pointer vector"); + State.set(this, NewGEP, Part); + State.ILV->addMetadata(NewGEP, GEP); + } + } } void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { @@ -9867,8 +9821,8 @@ void VPReductionRecipe::execute(VPTransformState &State) { void VPReplicateRecipe::execute(VPTransformState &State) { if (State.Instance) { // Generate a single instance. assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); - State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, - *State.Instance, IsPredicated, State); + State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance, + IsPredicated, State); // Insert scalar instance packing it into a vector. if (AlsoPack && State.VF.isVector()) { // If we're constructing lane 0, initialize to start from poison. @@ -9891,7 +9845,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) { "Can't scalarize a scalable vector"); for (unsigned Part = 0; Part < State.UF; ++Part) for (unsigned Lane = 0; Lane < EndLane; ++Lane) - State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, + State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, VPIteration(Part, Lane), IsPredicated, State); } @@ -9970,9 +9924,129 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) { void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; - State.ILV->vectorizeMemoryInstruction( - &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(), - StoredValue, getMask(), Consecutive, Reverse); + + // Attempt to issue a wide load. + LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); + StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); + + assert((LI || SI) && "Invalid Load/Store instruction"); + assert((!SI || StoredValue) && "No stored value provided for widened store"); + assert((!LI || !StoredValue) && "Stored value provided for widened load"); + + Type *ScalarDataTy = getLoadStoreType(&Ingredient); + + auto *DataTy = VectorType::get(ScalarDataTy, State.VF); + const Align Alignment = getLoadStoreAlignment(&Ingredient); + bool CreateGatherScatter = !Consecutive; + + auto &Builder = State.Builder; + InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); + bool isMaskRequired = getMask(); + if (isMaskRequired) + for (unsigned Part = 0; Part < State.UF; ++Part) + BlockInMaskParts[Part] = State.get(getMask(), Part); + + const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { + // Calculate the pointer for the specific unroll-part. + GetElementPtrInst *PartPtr = nullptr; + + bool InBounds = false; + if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) + InBounds = gep->isInBounds(); + if (Reverse) { + // If the address is consecutive but reversed, then the + // wide store needs to start at the last vector element. 
+ // RunTimeVF = VScale * VF.getKnownMinValue() + // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() + Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF); + // NumElt = -Part * RunTimeVF + Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); + // LastLane = 1 - RunTimeVF + Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); + PartPtr = + cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); + PartPtr->setIsInBounds(InBounds); + PartPtr = cast<GetElementPtrInst>( + Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); + PartPtr->setIsInBounds(InBounds); + if (isMaskRequired) // Reverse of a null all-one mask is a null mask. + BlockInMaskParts[Part] = + Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); + } else { + Value *Increment = + createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part); + PartPtr = cast<GetElementPtrInst>( + Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); + PartPtr->setIsInBounds(InBounds); + } + + unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); + return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); + }; + + // Handle Stores: + if (SI) { + State.ILV->setDebugLocFromInst(SI); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Instruction *NewSI = nullptr; + Value *StoredVal = State.get(StoredValue, Part); + if (CreateGatherScatter) { + Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; + Value *VectorGep = State.get(getAddr(), Part); + NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, + MaskPart); + } else { + if (Reverse) { + // If we store to reverse consecutive memory locations, then we need + // to reverse the order of elements in the stored value. + StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); + // We don't want to update the value in the map as it might be used in + // another expression. So don't call resetVectorValue(StoredVal). + } + auto *VecPtr = + CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); + if (isMaskRequired) + NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, + BlockInMaskParts[Part]); + else + NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); + } + State.ILV->addMetadata(NewSI, SI); + } + return; + } + + // Handle loads. + assert(LI && "Must have a load instruction"); + State.ILV->setDebugLocFromInst(LI); + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *NewLI; + if (CreateGatherScatter) { + Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; + Value *VectorGep = State.get(getAddr(), Part); + NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, + nullptr, "wide.masked.gather"); + State.ILV->addMetadata(NewLI, LI); + } else { + auto *VecPtr = + CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); + if (isMaskRequired) + NewLI = Builder.CreateMaskedLoad( + DataTy, VecPtr, Alignment, BlockInMaskParts[Part], + PoisonValue::get(DataTy), "wide.masked.load"); + else + NewLI = + Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); + + // Add metadata to the load, but setVectorValue to the reverse shuffle. 
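The CreateVecPtr lambda above computes, for a consecutive-but-reversed access, the part pointer as Ptr + (-Part * RunTimeVF) + (1 - RunTimeVF), performs one contiguous wide access of VF elements from there, and then reverses the vector. A worked example with concrete numbers (fixed-width VF = 4, so RunTimeVF = 4, and UF = 2; the offsets are relative to the scalar pointer for the current iteration):

#include <cstdio>

int main() {
  const int VF = 4, UF = 2;
  for (int Part = 0; Part < UF; ++Part) {
    int NumElt = -Part * VF; // -Part * RunTimeVF
    int LastLane = 1 - VF;   // 1 - RunTimeVF
    int Start = NumElt + LastLane;
    std::printf("part %d: wide access covers Ptr[%d..%d], reversed to lanes "
                "Ptr[%d], Ptr[%d], Ptr[%d], Ptr[%d]\n",
                Part, Start, Start + VF - 1,
                Start + VF - 1, Start + VF - 2, Start + VF - 3, Start);
  }
}

For part 0 this covers Ptr[-3..0] and the reverse yields lanes Ptr[0], Ptr[-1], Ptr[-2], Ptr[-3], matching a scalar loop that walks the array backwards; part 1 continues at Ptr[-7..-4].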
+ State.ILV->addMetadata(NewLI, LI); + if (Reverse) + NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); + } + + State.set(getVPSingleValue(), NewLI, Part); + } } // Determine how to lower the scalar epilogue, which depends on 1) optimising diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index e3ef0b794f68..95061e9053fa 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -283,6 +283,26 @@ static bool isCommutative(Instruction *I) { return false; } +/// Checks if the given value is actually an undefined constant vector. +static bool isUndefVector(const Value *V) { + if (isa<UndefValue>(V)) + return true; + auto *C = dyn_cast<Constant>(V); + if (!C) + return false; + if (!C->containsUndefOrPoisonElement()) + return false; + auto *VecTy = dyn_cast<FixedVectorType>(C->getType()); + if (!VecTy) + return false; + for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) { + if (Constant *Elem = C->getAggregateElement(I)) + if (!isa<UndefValue>(Elem)) + return false; + } + return true; +} + /// Checks if the vector of instructions can be represented as a shuffle, like: /// %x0 = extractelement <4 x i8> %x, i32 0 /// %x3 = extractelement <4 x i8> %x, i32 3 @@ -327,7 +347,11 @@ static bool isCommutative(Instruction *I) { /// TargetTransformInfo::getInstructionThroughput? static Optional<TargetTransformInfo::ShuffleKind> isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) { - auto *EI0 = cast<ExtractElementInst>(VL[0]); + const auto *It = + find_if(VL, [](Value *V) { return isa<ExtractElementInst>(V); }); + if (It == VL.end()) + return None; + auto *EI0 = cast<ExtractElementInst>(*It); if (isa<ScalableVectorType>(EI0->getVectorOperandType())) return None; unsigned Size = @@ -336,33 +360,41 @@ isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) { Value *Vec2 = nullptr; enum ShuffleMode { Unknown, Select, Permute }; ShuffleMode CommonShuffleMode = Unknown; + Mask.assign(VL.size(), UndefMaskElem); for (unsigned I = 0, E = VL.size(); I < E; ++I) { + // Undef can be represented as an undef element in a vector. + if (isa<UndefValue>(VL[I])) + continue; auto *EI = cast<ExtractElementInst>(VL[I]); + if (isa<ScalableVectorType>(EI->getVectorOperandType())) + return None; auto *Vec = EI->getVectorOperand(); + // We can extractelement from undef or poison vector. + if (isUndefVector(Vec)) + continue; // All vector operands must have the same number of vector elements. if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size) return None; + if (isa<UndefValue>(EI->getIndexOperand())) + continue; auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand()); if (!Idx) return None; // Undefined behavior if Idx is negative or >= Size. - if (Idx->getValue().uge(Size)) { - Mask.push_back(UndefMaskElem); + if (Idx->getValue().uge(Size)) continue; - } unsigned IntIdx = Idx->getValue().getZExtValue(); - Mask.push_back(IntIdx); - // We can extractelement from undef or poison vector. - if (isa<UndefValue>(Vec)) - continue; + Mask[I] = IntIdx; // For correct shuffling we have to have at most 2 different vector operands // in all extractelement instructions. 
- if (!Vec1 || Vec1 == Vec) + if (!Vec1 || Vec1 == Vec) { Vec1 = Vec; - else if (!Vec2 || Vec2 == Vec) + } else if (!Vec2 || Vec2 == Vec) { Vec2 = Vec; - else + Mask[I] += Size; + } else { return None; + } if (CommonShuffleMode == Permute) continue; // If the extract index is not the same as the operation number, it is a @@ -1680,6 +1712,28 @@ private: return IsSame(Scalars, ReuseShuffleIndices); } + /// \returns true if current entry has same operands as \p TE. + bool hasEqualOperands(const TreeEntry &TE) const { + if (TE.getNumOperands() != getNumOperands()) + return false; + SmallBitVector Used(getNumOperands()); + for (unsigned I = 0, E = getNumOperands(); I < E; ++I) { + unsigned PrevCount = Used.count(); + for (unsigned K = 0; K < E; ++K) { + if (Used.test(K)) + continue; + if (getOperand(K) == TE.getOperand(I)) { + Used.set(K); + break; + } + } + // Check if we actually found the matching operand. + if (PrevCount == Used.count()) + return false; + } + return true; + } + /// \return Final vectorization factor for the node. Defined by the total /// number of vectorized scalars, including those, used several times in the /// entry and counted in the \a ReuseShuffleIndices, if any. @@ -1773,6 +1827,12 @@ private: return Operands[OpIdx]; } + /// \returns the \p OpIdx operand of this TreeEntry. + ArrayRef<Value *> getOperand(unsigned OpIdx) const { + assert(OpIdx < Operands.size() && "Off bounds"); + return Operands[OpIdx]; + } + /// \returns the number of operands. unsigned getNumOperands() const { return Operands.size(); } @@ -2078,7 +2138,7 @@ private: SmallPtrSet<const Value *, 32> EphValues; /// Holds all of the instructions that we gathered. - SetVector<Instruction *> GatherSeq; + SetVector<Instruction *> GatherShuffleSeq; /// A list of blocks that we are going to CSE. SetVector<BasicBlock *> CSEBlocks; @@ -4386,15 +4446,19 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, bool IsGather) { DenseMap<Value *, int> ExtractVectorsTys; for (auto *V : VL) { + if (isa<UndefValue>(V)) + continue; // If all users of instruction are going to be vectorized and this // instruction itself is not going to be vectorized, consider this // instruction as dead and remove its cost from the final cost of the // vectorized tree. - if (!areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) || - (IsGather && ScalarToTreeEntry.count(V))) + if (!areAllUsersVectorized(cast<Instruction>(V), VectorizedVals)) continue; auto *EE = cast<ExtractElementInst>(V); - unsigned Idx = *getExtractIndex(EE); + Optional<unsigned> EEIdx = getExtractIndex(EE); + if (!EEIdx) + continue; + unsigned Idx = *EEIdx; if (TTIRef.getNumberOfParts(VecTy) != TTIRef.getNumberOfParts(EE->getVectorOperandType())) { auto It = @@ -4426,6 +4490,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, for (const auto &Data : ExtractVectorsTys) { auto *EEVTy = cast<FixedVectorType>(Data.first->getType()); unsigned NumElts = VecTy->getNumElements(); + if (Data.second % NumElts == 0) + continue; if (TTIRef.getNumberOfParts(EEVTy) > TTIRef.getNumberOfParts(VecTy)) { unsigned Idx = (Data.second / NumElts) * NumElts; unsigned EENumElts = EEVTy->getNumElements(); @@ -4488,10 +4554,12 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // broadcast. 
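hasEqualOperands in the hunk above compares two tree entries' operand lists as multisets: every operand of the other entry must be paired with a distinct, not-yet-used operand of this entry, tracked with a bit vector. A hedged standalone sketch of that matching, with std::vector<std::string> standing in for the operand lists and a std::vector<bool> for the SmallBitVector:

#include <cstdio>
#include <string>
#include <vector>

static bool hasEqualOperands(const std::vector<std::string> &A,
                             const std::vector<std::string> &B) {
  if (A.size() != B.size())
    return false;
  std::vector<bool> Used(A.size(), false);
  for (const std::string &Op : B) {
    bool Matched = false;
    for (size_t K = 0; K < A.size(); ++K) {
      if (Used[K] || A[K] != Op)
        continue;
      Used[K] = true; // consume this operand exactly once
      Matched = true;
      break;
    }
    if (!Matched)
      return false;
  }
  return true;
}

int main() {
  std::printf("%d\n", hasEqualOperands({"x", "y", "x"}, {"y", "x", "x"})); // 1
  std::printf("%d\n", hasEqualOperands({"x", "y", "y"}, {"y", "x", "x"})); // 0
}

Order does not matter but multiplicity does, which is what lets the "diamond match for alternate node" check further down reuse the previously built main/alternate vectors and charge only the shuffle.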
return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy); } - if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) && - allSameBlock(VL) && - !isa<ScalableVectorType>( - cast<ExtractElementInst>(E->getMainOp())->getVectorOperandType())) { + if ((E->getOpcode() == Instruction::ExtractElement || + all_of(E->Scalars, + [](Value *V) { + return isa<ExtractElementInst, UndefValue>(V); + })) && + allSameType(VL)) { // Check that gather of extractelements can be represented as just a // shuffle of a single/two vectors the scalars are extracted from. SmallVector<int> Mask; @@ -4738,7 +4806,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0)); })); - if (isa<UndefValue>(FirstInsert->getOperand(0))) { + if (isUndefVector(FirstInsert->getOperand(0))) { Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, Mask); } else { SmallVector<int> InsertMask(NumElts); @@ -5016,7 +5084,30 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // VecCost is equal to sum of the cost of creating 2 vectors // and the cost of creating shuffle. InstructionCost VecCost = 0; - if (Instruction::isBinaryOp(E->getOpcode())) { + // Try to find the previous shuffle node with the same operands and same + // main/alternate ops. + auto &&TryFindNodeWithEqualOperands = [this, E]() { + for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) { + if (TE.get() == E) + break; + if (TE->isAltShuffle() && + ((TE->getOpcode() == E->getOpcode() && + TE->getAltOpcode() == E->getAltOpcode()) || + (TE->getOpcode() == E->getAltOpcode() && + TE->getAltOpcode() == E->getOpcode())) && + TE->hasEqualOperands(*E)) + return true; + } + return false; + }; + if (TryFindNodeWithEqualOperands()) { + LLVM_DEBUG({ + dbgs() << "SLP: diamond match for alternate node found.\n"; + E->dump(); + }); + // No need to add new vector costs here since we're going to reuse + // same main/alternate vector ops, just do different shuffling. + } else if (Instruction::isBinaryOp(E->getOpcode())) { VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind); VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind); @@ -5060,7 +5151,11 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const { [this](Value *V) { return EphValues.contains(V); }) && (allConstant(TE->Scalars) || isSplat(TE->Scalars) || TE->Scalars.size() < Limit || - (TE->getOpcode() == Instruction::ExtractElement && + ((TE->getOpcode() == Instruction::ExtractElement || + all_of(TE->Scalars, + [](Value *V) { + return isa<ExtractElementInst, UndefValue>(V); + })) && isFixedVectorShuffle(TE->Scalars, Mask)) || (TE->State == TreeEntry::NeedToGather && TE->getOpcode() == Instruction::Load && !TE->isAltShuffle())); @@ -5280,6 +5375,42 @@ InstructionCost BoUpSLP::getSpillCost() const { return Cost; } +/// Check if two insertelement instructions are from the same buildvector. +static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, + InsertElementInst *V) { + // Instructions must be from the same basic blocks. + if (VU->getParent() != V->getParent()) + return false; + // Checks if 2 insertelements are from the same buildvector. + if (VU->getType() != V->getType()) + return false; + // Multiple used inserts are separate nodes. 
+ if (!VU->hasOneUse() && !V->hasOneUse()) + return false; + auto *IE1 = VU; + auto *IE2 = V; + // Go through the vector operand of insertelement instructions trying to find + // either VU as the original vector for IE2 or V as the original vector for + // IE1. + do { + if (IE2 == VU || IE1 == V) + return true; + if (IE1) { + if (IE1 != VU && !IE1->hasOneUse()) + IE1 = nullptr; + else + IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0)); + } + if (IE2) { + if (IE2 != V && !IE2->hasOneUse()) + IE2 = nullptr; + else + IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0)); + } + } while (IE1 || IE2); + return false; +} + InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { InstructionCost Cost = 0; LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " @@ -5306,7 +5437,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { SmallVector<APInt> DemandedElts; for (ExternalUser &EU : ExternalUses) { // We only add extract cost once for the same scalar. - if (!ExtractCostCalculated.insert(EU.Scalar).second) + if (!isa_and_nonnull<InsertElementInst>(EU.User) && + !ExtractCostCalculated.insert(EU.Scalar).second) continue; // Uses by ephemeral values are free (because the ephemeral value will be @@ -5326,35 +5458,35 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { // If found user is an insertelement, do not calculate extract cost but try // to detect it as a final shuffled/identity match. - if (isa_and_nonnull<InsertElementInst>(EU.User)) { - if (auto *FTy = dyn_cast<FixedVectorType>(EU.User->getType())) { - Optional<int> InsertIdx = getInsertIndex(EU.User, 0); + if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) { + if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) { + Optional<int> InsertIdx = getInsertIndex(VU, 0); if (!InsertIdx || *InsertIdx == UndefMaskElem) continue; - Value *VU = EU.User; auto *It = find_if(FirstUsers, [VU](Value *V) { - // Checks if 2 insertelements are from the same buildvector. - if (VU->getType() != V->getType()) - return false; - auto *IE1 = cast<InsertElementInst>(VU); - auto *IE2 = cast<InsertElementInst>(V); - // Go through of insertelement instructions trying to find either VU - // as the original vector for IE2 or V as the original vector for IE1. - do { - if (IE1 == VU || IE2 == V) - return true; - if (IE1) - IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0)); - if (IE2) - IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0)); - } while (IE1 || IE2); - return false; + return areTwoInsertFromSameBuildVector(VU, + cast<InsertElementInst>(V)); }); int VecId = -1; if (It == FirstUsers.end()) { VF.push_back(FTy->getNumElements()); ShuffleMask.emplace_back(VF.back(), UndefMaskElem); - FirstUsers.push_back(EU.User); + // Find the insertvector, vectorized in tree, if any. + Value *Base = VU; + while (isa<InsertElementInst>(Base)) { + // Build the mask for the vectorized insertelement instructions. 
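// Illustrative aside, not taken from the patch: the ShuffleMask rows built
// here map each element of the insertelement destination vector to the lane
// of the vectorized node that provides it, with UndefMaskElem (-1) marking
// elements the tree does not touch. For instance, scalars vectorized in
// lanes 0 and 1 that feed inserts at indices 3 and 1 of a 4-wide vector give
// the mask {-1, 1, -1, 0}. A standalone sketch of that construction (the
// pair list stands in for the insert-index/lane data collected above):
#include <utility>
#include <vector>

std::vector<int> buildInsertShuffleMask(
    unsigned NumElts,
    const std::vector<std::pair<unsigned, int>> &InsertIdxAndLane) {
  std::vector<int> Mask(NumElts, /*UndefMaskElem*/ -1);
  for (const auto &IdxAndLane : InsertIdxAndLane)
    Mask[IdxAndLane.first] = IdxAndLane.second; // Element comes from this lane.
  return Mask;
}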
+ if (const TreeEntry *E = getTreeEntry(Base)) { + VU = cast<InsertElementInst>(Base); + do { + int Idx = E->findLaneForValue(Base); + ShuffleMask.back()[Idx] = Idx; + Base = cast<InsertElementInst>(Base)->getOperand(0); + } while (E == getTreeEntry(Base)); + break; + } + Base = cast<InsertElementInst>(Base)->getOperand(0); + } + FirstUsers.push_back(VU); DemandedElts.push_back(APInt::getZero(VF.back())); VecId = FirstUsers.size() - 1; } else { @@ -5363,6 +5495,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { int Idx = *InsertIdx; ShuffleMask[VecId][Idx] = EU.Lane; DemandedElts[VecId].setBit(Idx); + continue; } } @@ -5386,47 +5519,86 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { InstructionCost SpillCost = getSpillCost(); Cost += SpillCost + ExtractCost; - for (int I = 0, E = FirstUsers.size(); I < E; ++I) { - // For the very first element - simple shuffle of the source vector. - int Limit = ShuffleMask[I].size() * 2; - if (I == 0 && - all_of(ShuffleMask[I], [Limit](int Idx) { return Idx < Limit; }) && - !ShuffleVectorInst::isIdentityMask(ShuffleMask[I])) { + if (FirstUsers.size() == 1) { + int Limit = ShuffleMask.front().size() * 2; + if (all_of(ShuffleMask.front(), [Limit](int Idx) { return Idx < Limit; }) && + !ShuffleVectorInst::isIdentityMask(ShuffleMask.front())) { InstructionCost C = TTI->getShuffleCost( TTI::SK_PermuteSingleSrc, - cast<FixedVectorType>(FirstUsers[I]->getType()), ShuffleMask[I]); + cast<FixedVectorType>(FirstUsers.front()->getType()), + ShuffleMask.front()); LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for final shuffle of insertelement external users " << *VectorizableTree.front()->Scalars.front() << ".\n" << "SLP: Current total cost = " << Cost << "\n"); Cost += C; - continue; } - // Other elements - permutation of 2 vectors (the initial one and the next - // Ith incoming vector). - unsigned VF = ShuffleMask[I].size(); - for (unsigned Idx = 0; Idx < VF; ++Idx) { - int &Mask = ShuffleMask[I][Idx]; - Mask = Mask == UndefMaskElem ? Idx : VF + Mask; - } - InstructionCost C = TTI->getShuffleCost( - TTI::SK_PermuteTwoSrc, cast<FixedVectorType>(FirstUsers[I]->getType()), - ShuffleMask[I]); - LLVM_DEBUG( - dbgs() - << "SLP: Adding cost " << C - << " for final shuffle of vector node and external insertelement users " - << *VectorizableTree.front()->Scalars.front() << ".\n" - << "SLP: Current total cost = " << Cost << "\n"); - Cost += C; InstructionCost InsertCost = TTI->getScalarizationOverhead( - cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I], - /*Insert*/ true, - /*Extract*/ false); + cast<FixedVectorType>(FirstUsers.front()->getType()), + DemandedElts.front(), /*Insert*/ true, /*Extract*/ false); + LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost + << " for insertelements gather.\n" + << "SLP: Current total cost = " << Cost << "\n"); Cost -= InsertCost; + } else if (FirstUsers.size() >= 2) { + unsigned MaxVF = *std::max_element(VF.begin(), VF.end()); + // Combined masks of the first 2 vectors. 
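// Illustrative aside, not taken from the patch: a two-source permute mask
// addresses the first source with 0..VF-1 and the second source with
// VF..2*VF-1, so combining the per-user masks below amounts to keeping the
// first mask as-is and biasing every defined element of the second mask by
// VF. For example, with VF = 4, masks {0, -1, 2, -1} and {-1, 1, -1, 3}
// combine to {0, 5, 2, 7}. A standalone sketch of that combination:
#include <vector>

std::vector<int> combineTwoSourceMasks(const std::vector<int> &First,
                                       const std::vector<int> &Second,
                                       unsigned VF) {
  std::vector<int> Combined(First);
  Combined.resize(VF, /*UndefMaskElem*/ -1);
  for (unsigned I = 0, E = static_cast<unsigned>(Second.size());
       I < E && I < VF; ++I)
    if (Second[I] != -1)
      Combined[I] = Second[I] + static_cast<int>(VF); // Second-source lane.
  return Combined;
}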
+ SmallVector<int> CombinedMask(MaxVF, UndefMaskElem); + copy(ShuffleMask.front(), CombinedMask.begin()); + APInt CombinedDemandedElts = DemandedElts.front().zextOrSelf(MaxVF); + auto *VecTy = FixedVectorType::get( + cast<VectorType>(FirstUsers.front()->getType())->getElementType(), + MaxVF); + for (int I = 0, E = ShuffleMask[1].size(); I < E; ++I) { + if (ShuffleMask[1][I] != UndefMaskElem) { + CombinedMask[I] = ShuffleMask[1][I] + MaxVF; + CombinedDemandedElts.setBit(I); + } + } + InstructionCost C = + TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + << " for final shuffle of vector node and external " + "insertelement users " + << *VectorizableTree.front()->Scalars.front() << ".\n" + << "SLP: Current total cost = " << Cost << "\n"); + Cost += C; + InstructionCost InsertCost = TTI->getScalarizationOverhead( + VecTy, CombinedDemandedElts, /*Insert*/ true, /*Extract*/ false); LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost << " for insertelements gather.\n" << "SLP: Current total cost = " << Cost << "\n"); + Cost -= InsertCost; + for (int I = 2, E = FirstUsers.size(); I < E; ++I) { + // Other elements - permutation of 2 vectors (the initial one and the + // next Ith incoming vector). + unsigned VF = ShuffleMask[I].size(); + for (unsigned Idx = 0; Idx < VF; ++Idx) { + int Mask = ShuffleMask[I][Idx]; + if (Mask != UndefMaskElem) + CombinedMask[Idx] = MaxVF + Mask; + else if (CombinedMask[Idx] != UndefMaskElem) + CombinedMask[Idx] = Idx; + } + for (unsigned Idx = VF; Idx < MaxVF; ++Idx) + if (CombinedMask[Idx] != UndefMaskElem) + CombinedMask[Idx] = Idx; + InstructionCost C = + TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + << " for final shuffle of vector node and external " + "insertelement users " + << *VectorizableTree.front()->Scalars.front() << ".\n" + << "SLP: Current total cost = " << Cost << "\n"); + Cost += C; + InstructionCost InsertCost = TTI->getScalarizationOverhead( + cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I], + /*Insert*/ true, /*Extract*/ false); + LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost + << " for insertelements gather.\n" + << "SLP: Current total cost = " << Cost << "\n"); + Cost -= InsertCost; + } } #ifndef NDEBUG @@ -5728,7 +5900,7 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL) { auto *InsElt = dyn_cast<InsertElementInst>(Vec); if (!InsElt) return Vec; - GatherSeq.insert(InsElt); + GatherShuffleSeq.insert(InsElt); CSEBlocks.insert(InsElt->getParent()); // Add to our 'need-to-extract' list. if (TreeEntry *Entry = getTreeEntry(V)) { @@ -5771,10 +5943,17 @@ class ShuffleInstructionBuilder { const unsigned VF = 0; bool IsFinalized = false; SmallVector<int, 4> Mask; + /// Holds all of the instructions that we gathered. + SetVector<Instruction *> &GatherShuffleSeq; + /// A list of blocks that we are going to CSE. + SetVector<BasicBlock *> &CSEBlocks; public: - ShuffleInstructionBuilder(IRBuilderBase &Builder, unsigned VF) - : Builder(Builder), VF(VF) {} + ShuffleInstructionBuilder(IRBuilderBase &Builder, unsigned VF, + SetVector<Instruction *> &GatherShuffleSeq, + SetVector<BasicBlock *> &CSEBlocks) + : Builder(Builder), VF(VF), GatherShuffleSeq(GatherShuffleSeq), + CSEBlocks(CSEBlocks) {} /// Adds a mask, inverting it before applying. 
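// Illustrative aside, not taken from the patch: "inverting" a mask here means
// turning the mapping "output lane I is taken from input lane SubMask[I]"
// into the permutation that sends each input lane back to its original
// position, i.e. NewMask[SubMask[I]] = I. For SubMask = {2, 0, 1} the
// inverted mask is {1, 2, 0}. A standalone sketch of that transposition
// (assuming SubMask is a full permutation with no undef entries):
#include <vector>

std::vector<int> invertPermutationMask(const std::vector<unsigned> &SubMask) {
  std::vector<int> NewMask(SubMask.size(), /*UndefMaskElem*/ -1);
  for (unsigned I = 0, E = SubMask.size(); I < E; ++I)
    NewMask[SubMask[I]] = static_cast<int>(I);
  return NewMask;
}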
void addInversedMask(ArrayRef<unsigned> SubMask) { @@ -5804,7 +5983,12 @@ public: if (VF == ValueVF && ShuffleVectorInst::isIdentityMask(Mask)) return V; - return Builder.CreateShuffleVector(V, Mask, "shuffle"); + Value *Vec = Builder.CreateShuffleVector(V, Mask, "shuffle"); + if (auto *I = dyn_cast<Instruction>(Vec)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + return Vec; } ~ShuffleInstructionBuilder() { @@ -5862,6 +6046,10 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { std::iota(UniformMask.begin(), UniformMask.end(), 0); V = Builder.CreateShuffleVector(V, UniformMask, "shrink.shuffle"); } + if (auto *I = dyn_cast<Instruction>(V)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } } return V; } @@ -5909,15 +6097,12 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { VL = UniqueValues; } - ShuffleInstructionBuilder ShuffleBuilder(Builder, VF); + ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleSeq, + CSEBlocks); Value *Vec = gather(VL); if (!ReuseShuffleIndicies.empty()) { ShuffleBuilder.addMask(ReuseShuffleIndicies); Vec = ShuffleBuilder.finalize(Vec); - if (auto *I = dyn_cast<Instruction>(Vec)) { - GatherSeq.insert(I); - CSEBlocks.insert(I->getParent()); - } } return Vec; } @@ -5932,7 +6117,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); unsigned VF = E->getVectorFactor(); - ShuffleInstructionBuilder ShuffleBuilder(Builder, VF); + ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleSeq, + CSEBlocks); if (E->State == TreeEntry::NeedToGather) { if (E->getMainOp()) setInsertPointAfterBundle(E); @@ -5946,16 +6132,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { "Expected shuffle of 1 or 2 entries."); Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue, Entries.back()->VectorizedValue, Mask); + if (auto *I = dyn_cast<Instruction>(Vec)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } } else { Vec = gather(E->Scalars); } if (NeedToShuffleReuses) { ShuffleBuilder.addMask(E->ReuseShuffleIndices); Vec = ShuffleBuilder.finalize(Vec); - if (auto *I = dyn_cast<Instruction>(Vec)) { - GatherSeq.insert(I); - CSEBlocks.insert(I->getParent()); - } } E->VectorizedValue = Vec; return Vec; @@ -6072,11 +6258,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { IsIdentity &= *InsertIdx - Offset == I; Mask[*InsertIdx - Offset] = I; } - if (!IsIdentity || NumElts != NumScalars) + if (!IsIdentity || NumElts != NumScalars) { V = Builder.CreateShuffleVector(V, Mask); + if (auto *I = dyn_cast<Instruction>(V)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + } if ((!IsIdentity || Offset != 0 || - !isa<UndefValue>(FirstInsert->getOperand(0))) && + !isUndefVector(FirstInsert->getOperand(0))) && NumElts != NumScalars) { SmallVector<int> InsertMask(NumElts); std::iota(InsertMask.begin(), InsertMask.end(), 0); @@ -6088,6 +6279,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { V = Builder.CreateShuffleVector( FirstInsert->getOperand(0), V, InsertMask, cast<Instruction>(E->Scalars.back())->getName()); + if (auto *I = dyn_cast<Instruction>(V)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } } ++NumVectorInstructions; @@ -6444,6 +6639,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { V1 = Builder.CreateCast( static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy); } + // Add V0 and V1 to later analysis to try to find and remove matching + // instruction, if any. 
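// Illustrative aside, not taken from the patch: for an alternate-opcode node
// the vectorizer emits both whole-vector operations (V0 with the main
// opcode, V1 with the alternate one) and then blends them with a shuffle
// whose mask takes lane I from V0 when scalar I used the main opcode and
// from V1 (indices offset by VF) otherwise. A reduced sketch of that
// blend-mask construction:
#include <vector>

std::vector<int> buildAltOpcodeBlendMask(const std::vector<bool> &UsesAltOp) {
  const int VF = static_cast<int>(UsesAltOp.size());
  std::vector<int> Mask(UsesAltOp.size());
  for (int I = 0; I < VF; ++I)
    Mask[I] = UsesAltOp[I] ? VF + I : I; // Pick the lane from V1 or V0.
  return Mask;
}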
+ for (Value *V : {V0, V1}) { + if (auto *I = dyn_cast<Instruction>(V)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + } // Create shuffle to take alternate operations from the vector. // Also, gather up main and alt scalar ops to propagate IR flags to @@ -6462,8 +6665,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { propagateIRFlags(V1, AltScalars); Value *V = Builder.CreateShuffleVector(V0, V1, Mask); - if (Instruction *I = dyn_cast<Instruction>(V)) + if (auto *I = dyn_cast<Instruction>(V)) { V = propagateMetadata(I, E->Scalars); + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } V = ShuffleBuilder.finalize(V); E->VectorizedValue = V; @@ -6657,10 +6863,10 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { } void BoUpSLP::optimizeGatherSequence() { - LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size() + LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleSeq.size() << " gather sequences instructions.\n"); // LICM InsertElementInst sequences. - for (Instruction *I : GatherSeq) { + for (Instruction *I : GatherShuffleSeq) { if (isDeleted(I)) continue; @@ -6677,11 +6883,10 @@ void BoUpSLP::optimizeGatherSequence() { // If the vector or the element that we insert into it are // instructions that are defined in this basic block then we can't // hoist this instruction. - auto *Op0 = dyn_cast<Instruction>(I->getOperand(0)); - auto *Op1 = dyn_cast<Instruction>(I->getOperand(1)); - if (Op0 && L->contains(Op0)) - continue; - if (Op1 && L->contains(Op1)) + if (any_of(I->operands(), [L](Value *V) { + auto *OpI = dyn_cast<Instruction>(V); + return OpI && L->contains(OpI); + })) continue; // We can hoist this instruction. Move it to the pre-header. @@ -6705,7 +6910,50 @@ void BoUpSLP::optimizeGatherSequence() { return A->getDFSNumIn() < B->getDFSNumIn(); }); - // Perform O(N^2) search over the gather sequences and merge identical + // Less defined shuffles can be replaced by the more defined copies. + // Between two shuffles one is less defined if it has the same vector operands + // and its mask indeces are the same as in the first one or undefs. E.g. + // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0, + // poison, <0, 0, 0, 0>. + auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2, + SmallVectorImpl<int> &NewMask) { + if (I1->getType() != I2->getType()) + return false; + auto *SI1 = dyn_cast<ShuffleVectorInst>(I1); + auto *SI2 = dyn_cast<ShuffleVectorInst>(I2); + if (!SI1 || !SI2) + return I1->isIdenticalTo(I2); + if (SI1->isIdenticalTo(SI2)) + return true; + for (int I = 0, E = SI1->getNumOperands(); I < E; ++I) + if (SI1->getOperand(I) != SI2->getOperand(I)) + return false; + // Check if the second instruction is more defined than the first one. + NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end()); + ArrayRef<int> SM1 = SI1->getShuffleMask(); + // Count trailing undefs in the mask to check the final number of used + // registers. + unsigned LastUndefsCnt = 0; + for (int I = 0, E = NewMask.size(); I < E; ++I) { + if (SM1[I] == UndefMaskElem) + ++LastUndefsCnt; + else + LastUndefsCnt = 0; + if (NewMask[I] != UndefMaskElem && SM1[I] != UndefMaskElem && + NewMask[I] != SM1[I]) + return false; + if (NewMask[I] == UndefMaskElem) + NewMask[I] = SM1[I]; + } + // Check if the last undefs actually change the final number of used vector + // registers. 
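// Illustrative aside, not taken from the patch: the lambda above treats one
// shuffle as replaceable by another over the same operands when every lane on
// which both masks are defined agrees; the surviving shuffle then inherits the
// defined lanes of the one that gets removed. For example, {0, -1, 2, -1} and
// {0, 1, 2, -1} are compatible and merge to {0, 1, 2, -1}. A standalone
// sketch of that compare-and-merge step on plain masks:
#include <cstddef>
#include <vector>

bool mergeCompatibleMasks(const std::vector<int> &From, std::vector<int> &Into) {
  if (From.size() != Into.size())
    return false;
  for (std::size_t I = 0; I < From.size(); ++I) {
    if (From[I] != -1 && Into[I] != -1 && From[I] != Into[I])
      return false; // Defined lanes disagree: the shuffles are not mergeable.
    if (Into[I] == -1)
      Into[I] = From[I]; // Fill an undef lane from the other mask.
  }
  return true;
}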
+ return SM1.size() - LastUndefsCnt > 1 && + TTI->getNumberOfParts(SI1->getType()) == + TTI->getNumberOfParts( + FixedVectorType::get(SI1->getType()->getElementType(), + SM1.size() - LastUndefsCnt)); + }; + // Perform O(N^2) search over the gather/shuffle sequences and merge identical // instructions. TODO: We can further optimize this scan if we split the // instructions into different buckets based on the insert lane. SmallVector<Instruction *, 16> Visited; @@ -6719,17 +6967,35 @@ void BoUpSLP::optimizeGatherSequence() { if (isDeleted(&In)) continue; if (!isa<InsertElementInst>(&In) && !isa<ExtractElementInst>(&In) && - !isa<ShuffleVectorInst>(&In)) + !isa<ShuffleVectorInst>(&In) && !GatherShuffleSeq.contains(&In)) continue; // Check if we can replace this instruction with any of the // visited instructions. bool Replaced = false; - for (Instruction *v : Visited) { - if (In.isIdenticalTo(v) && - DT->dominates(v->getParent(), In.getParent())) { - In.replaceAllUsesWith(v); + for (Instruction *&V : Visited) { + SmallVector<int> NewMask; + if (IsIdenticalOrLessDefined(&In, V, NewMask) && + DT->dominates(V->getParent(), In.getParent())) { + In.replaceAllUsesWith(V); eraseInstruction(&In); + if (auto *SI = dyn_cast<ShuffleVectorInst>(V)) + if (!NewMask.empty()) + SI->setShuffleMask(NewMask); + Replaced = true; + break; + } + if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) && + GatherShuffleSeq.contains(V) && + IsIdenticalOrLessDefined(V, &In, NewMask) && + DT->dominates(In.getParent(), V->getParent())) { + In.moveAfter(V); + V->replaceAllUsesWith(&In); + eraseInstruction(V); + if (auto *SI = dyn_cast<ShuffleVectorInst>(&In)) + if (!NewMask.empty()) + SI->setShuffleMask(NewMask); + V = &In; Replaced = true; break; } @@ -6741,7 +7007,7 @@ void BoUpSLP::optimizeGatherSequence() { } } CSEBlocks.clear(); - GatherSeq.clear(); + GatherShuffleSeq.clear(); } // Groups the instructions to a bundle (which is then a single scheduling entity) @@ -8791,6 +9057,8 @@ private: assert(VectorizedValue && "Need to have a vectorized tree node"); assert(isPowerOf2_32(ReduxWidth) && "We only handle power-of-two reductions for now"); + assert(RdxKind != RecurKind::FMulAdd && + "A call to the llvm.fmuladd intrinsic is not handled yet"); ++NumVectorInstructions; return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind, @@ -9123,8 +9391,9 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, SmallVector<Value *, 16> BuildVectorOpds; SmallVector<int> Mask; if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) || - (llvm::all_of(BuildVectorOpds, - [](Value *V) { return isa<ExtractElementInst>(V); }) && + (llvm::all_of( + BuildVectorOpds, + [](Value *V) { return isa<ExtractElementInst, UndefValue>(V); }) && isFixedVectorShuffle(BuildVectorOpds, Mask))) return false; @@ -9132,44 +9401,6 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, return tryToVectorizeList(BuildVectorInsts, R); } -bool SLPVectorizerPass::vectorizeSimpleInstructions( - SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R, - bool AtTerminator) { - bool OpsChanged = false; - SmallVector<Instruction *, 4> PostponedCmps; - for (auto *I : reverse(Instructions)) { - if (R.isDeleted(I)) - continue; - if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) - OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R); - else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) - OpsChanged |= vectorizeInsertElementInst(LastInsertElem, 
BB, R); - else if (isa<CmpInst>(I)) - PostponedCmps.push_back(I); - } - if (AtTerminator) { - // Try to find reductions first. - for (Instruction *I : PostponedCmps) { - if (R.isDeleted(I)) - continue; - for (Value *Op : I->operands()) - OpsChanged |= vectorizeRootInstruction(nullptr, Op, BB, R, TTI); - } - // Try to vectorize operands as vector bundles. - for (Instruction *I : PostponedCmps) { - if (R.isDeleted(I)) - continue; - OpsChanged |= tryToVectorize(I, R); - } - Instructions.clear(); - } else { - // Insert in reverse order since the PostponedCmps vector was filled in - // reverse order. - Instructions.assign(PostponedCmps.rbegin(), PostponedCmps.rend()); - } - return OpsChanged; -} - template <typename T> static bool tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, @@ -9242,6 +9473,101 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, return Changed; } +bool SLPVectorizerPass::vectorizeSimpleInstructions( + SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R, + bool AtTerminator) { + bool OpsChanged = false; + SmallVector<Instruction *, 4> PostponedCmps; + for (auto *I : reverse(Instructions)) { + if (R.isDeleted(I)) + continue; + if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) + OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R); + else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) + OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R); + else if (isa<CmpInst>(I)) + PostponedCmps.push_back(I); + } + if (AtTerminator) { + // Try to find reductions first. + for (Instruction *I : PostponedCmps) { + if (R.isDeleted(I)) + continue; + for (Value *Op : I->operands()) + OpsChanged |= vectorizeRootInstruction(nullptr, Op, BB, R, TTI); + } + // Try to vectorize operands as vector bundles. + for (Instruction *I : PostponedCmps) { + if (R.isDeleted(I)) + continue; + OpsChanged |= tryToVectorize(I, R); + } + // Try to vectorize list of compares. + // Sort by type, compare predicate, etc. + // TODO: Add analysis on the operand opcodes (profitable to vectorize + // instructions with same/alternate opcodes/const values). 
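// Illustrative aside, not taken from the patch: the two lambdas defined below
// are handed to tryToVectorizeSequence, which sorts the postponed compares so
// that instructions over the same operand type with the same (or swapped)
// predicate become neighbours and then passes maximal runs of compatible
// neighbours to the list vectorizer. The generic sort-then-group pattern,
// with the SLP-specific retry and register-size handling omitted:
#include <algorithm>
#include <functional>
#include <vector>

template <typename T>
void forEachCompatibleRun(
    std::vector<T> &Items,
    const std::function<bool(const T &, const T &)> &Less,
    const std::function<bool(const T &, const T &)> &Compatible,
    const std::function<void(const std::vector<T> &)> &Visit) {
  std::stable_sort(Items.begin(), Items.end(), Less);
  std::vector<T> Run;
  for (const T &Item : Items) {
    if (!Run.empty() && !Compatible(Run.back(), Item)) {
      Visit(Run); // A maximal run of mutually compatible neighbours.
      Run.clear();
    }
    Run.push_back(Item);
  }
  if (!Run.empty())
    Visit(Run);
}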
+ auto &&CompareSorter = [&R](Value *V, Value *V2) { + auto *CI1 = cast<CmpInst>(V); + auto *CI2 = cast<CmpInst>(V2); + if (R.isDeleted(CI2) || !isValidElementType(CI2->getType())) + return false; + if (CI1->getOperand(0)->getType()->getTypeID() < + CI2->getOperand(0)->getType()->getTypeID()) + return true; + if (CI1->getOperand(0)->getType()->getTypeID() > + CI2->getOperand(0)->getType()->getTypeID()) + return false; + return CI1->getPredicate() < CI2->getPredicate() || + (CI1->getPredicate() > CI2->getPredicate() && + CI1->getPredicate() < + CmpInst::getSwappedPredicate(CI2->getPredicate())); + }; + + auto &&AreCompatibleCompares = [&R](Value *V1, Value *V2) { + if (V1 == V2) + return true; + auto *CI1 = cast<CmpInst>(V1); + auto *CI2 = cast<CmpInst>(V2); + if (R.isDeleted(CI2) || !isValidElementType(CI2->getType())) + return false; + if (CI1->getOperand(0)->getType() != CI2->getOperand(0)->getType()) + return false; + return CI1->getPredicate() == CI2->getPredicate() || + CI1->getPredicate() == + CmpInst::getSwappedPredicate(CI2->getPredicate()); + }; + auto Limit = [&R](Value *V) { + unsigned EltSize = R.getVectorElementSize(V); + return std::max(2U, R.getMaxVecRegSize() / EltSize); + }; + + SmallVector<Value *> Vals(PostponedCmps.begin(), PostponedCmps.end()); + OpsChanged |= tryToVectorizeSequence<Value>( + Vals, Limit, CompareSorter, AreCompatibleCompares, + [this, &R](ArrayRef<Value *> Candidates, bool LimitForRegisterSize) { + // Exclude possible reductions from other blocks. + bool ArePossiblyReducedInOtherBlock = + any_of(Candidates, [](Value *V) { + return any_of(V->users(), [V](User *U) { + return isa<SelectInst>(U) && + cast<SelectInst>(U)->getParent() != + cast<Instruction>(V)->getParent(); + }); + }); + if (ArePossiblyReducedInOtherBlock) + return false; + return tryToVectorizeList(Candidates, R, LimitForRegisterSize); + }, + /*LimitForRegisterSize=*/true); + Instructions.clear(); + } else { + // Insert in reverse order since the PostponedCmps vector was filled in + // reverse order. + Instructions.assign(PostponedCmps.rbegin(), PostponedCmps.rend()); + } + return OpsChanged; +} + bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { bool Changed = false; SmallVector<Value *, 4> Incoming; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp index 638467f94e1c..44b5e1df0839 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -718,6 +718,8 @@ void VPInstruction::generateInstruction(VPTransformState &State, void VPInstruction::execute(VPTransformState &State) { assert(!State.Instance && "VPInstruction executing an Instance"); + IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); + State.Builder.setFastMathFlags(FMF); for (unsigned Part = 0; Part < State.UF; ++Part) generateInstruction(State, Part); } @@ -760,6 +762,8 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, O << Instruction::getOpcodeName(getOpcode()); } + O << FMF; + for (const VPValue *Operand : operands()) { O << " "; Operand->printAsOperand(O, SlotTracker); @@ -767,6 +771,16 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, } #endif +void VPInstruction::setFastMathFlags(FastMathFlags FMFNew) { + // Make sure the VPInstruction is a floating-point operation. 
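// Illustrative aside, not taken from the patch: VPInstruction::execute above
// scopes the flags with IRBuilderBase::FastMathFlagGuard so that the
// builder's fast-math flags are set only while this recipe generates code and
// are restored afterwards. The same save-and-restore pattern, reduced to a
// standalone RAII guard over a plain flags value:
struct FastMathState {
  bool AllowReassoc = false;
  bool NoNaNs = false;
  bool NoInfs = false;
};

class FastMathStateGuard {
  FastMathState &State;
  FastMathState Saved;

public:
  explicit FastMathStateGuard(FastMathState &S) : State(S), Saved(S) {}
  ~FastMathStateGuard() { State = Saved; } // Restore flags on scope exit.
};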
+ assert((Opcode == Instruction::FAdd || Opcode == Instruction::FMul || + Opcode == Instruction::FNeg || Opcode == Instruction::FSub || + Opcode == Instruction::FDiv || Opcode == Instruction::FRem || + Opcode == Instruction::FCmp) && + "this op can't take fast-math flags"); + FMF = FMFNew; +} + /// Generate the code inside the body of the vectorized loop. Assumes a single /// LoopVectorBody basic-block was created for this. Introduce additional /// basic-blocks as needed, and fill them all. @@ -1196,8 +1210,10 @@ void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, printAsOperand(O, SlotTracker); O << " = "; getChainOp()->printAsOperand(O, SlotTracker); - O << " + reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) - << " ("; + O << " +"; + if (isa<FPMathOperator>(getUnderlyingInstr())) + O << getUnderlyingInstr()->getFastMathFlags(); + O << " reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) << " ("; getVecOp()->printAsOperand(O, SlotTracker); if (getCondOp()) { O << ", "; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h index 00ee31007cb7..810dd5030f95 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h @@ -59,6 +59,7 @@ class Value; class VPBasicBlock; class VPRegionBlock; class VPlan; +class VPReplicateRecipe; class VPlanSlp; /// Returns a calculation for the total number of elements for a given \p VF. @@ -346,6 +347,10 @@ struct VPTransformState { /// Pointer to the VPlan code is generated for. VPlan *Plan; + + /// Holds recipes that may generate a poison value that is used after + /// vectorization, even when their operands are not poison. + SmallPtrSet<VPRecipeBase *, 16> MayGeneratePoisonRecipes; }; /// VPUsers instance used by VPBlockBase to manage CondBit and the block @@ -789,6 +794,7 @@ public: private: typedef unsigned char OpcodeTy; OpcodeTy Opcode; + FastMathFlags FMF; /// Utility method serving execute(): generates a single instance of the /// modeled instruction. @@ -802,13 +808,6 @@ public: : VPRecipeBase(VPRecipeBase::VPInstructionSC, Operands), VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) {} - VPInstruction(unsigned Opcode, ArrayRef<VPInstruction *> Operands) - : VPRecipeBase(VPRecipeBase::VPInstructionSC, {}), - VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) { - for (auto *I : Operands) - addOperand(I->getVPSingleValue()); - } - VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands) : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands)) {} @@ -870,6 +869,9 @@ public: return true; } } + + /// Set the fast-math flags. + void setFastMathFlags(FastMathFlags FMFNew); }; /// VPWidenRecipe is a recipe for producing a copy of vector type its @@ -1511,7 +1513,7 @@ public: /// - For store: Address, stored value, optional mask /// TODO: We currently execute only per-part unless a specific instance is /// provided. -class VPWidenMemoryInstructionRecipe : public VPRecipeBase { +class VPWidenMemoryInstructionRecipe : public VPRecipeBase, public VPValue { Instruction &Ingredient; // Whether the loaded-from / stored-to addresses are consecutive. 
@@ -1533,10 +1535,10 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase { public: VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask, bool Consecutive, bool Reverse) - : VPRecipeBase(VPWidenMemoryInstructionSC, {Addr}), Ingredient(Load), + : VPRecipeBase(VPWidenMemoryInstructionSC, {Addr}), + VPValue(VPValue::VPVMemoryInstructionSC, &Load, this), Ingredient(Load), Consecutive(Consecutive), Reverse(Reverse) { assert((Consecutive || !Reverse) && "Reverse implies consecutive"); - new VPValue(VPValue::VPVMemoryInstructionSC, &Load, this); setMask(Mask); } @@ -1544,6 +1546,7 @@ public: VPValue *StoredValue, VPValue *Mask, bool Consecutive, bool Reverse) : VPRecipeBase(VPWidenMemoryInstructionSC, {Addr, StoredValue}), + VPValue(VPValue::VPVMemoryInstructionSC, &Store, this), Ingredient(Store), Consecutive(Consecutive), Reverse(Reverse) { assert((Consecutive || !Reverse) && "Reverse implies consecutive"); setMask(Mask); |