Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms')
12 files changed, 198 insertions, 522 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/contrib/llvm-project/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index 34c8a380448e..503ce019dc84 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -19,7 +19,6 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -29,7 +28,6 @@ #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/PatternMatch.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/Local.h" @@ -398,6 +396,54 @@ static bool tryToFPToSat(Instruction &I, TargetTransformInfo &TTI) { return true; } +/// Try to replace a mathlib call to sqrt with the LLVM intrinsic. This avoids +/// pessimistic codegen that has to account for setting errno and can enable +/// vectorization. +static bool foldSqrt(Instruction &I, TargetTransformInfo &TTI, + TargetLibraryInfo &TLI, AssumptionCache &AC, + DominatorTree &DT) { + // Match a call to sqrt mathlib function. + auto *Call = dyn_cast<CallInst>(&I); + if (!Call) + return false; + + Module *M = Call->getModule(); + LibFunc Func; + if (!TLI.getLibFunc(*Call, Func) || !isLibFuncEmittable(M, &TLI, Func)) + return false; + + if (Func != LibFunc_sqrt && Func != LibFunc_sqrtf && Func != LibFunc_sqrtl) + return false; + + // If (1) this is a sqrt libcall, (2) we can assume that NAN is not created + // (because NNAN or the operand arg must not be less than -0.0) and (2) we + // would not end up lowering to a libcall anyway (which could change the value + // of errno), then: + // (1) errno won't be set. + // (2) it is safe to convert this to an intrinsic call. + Type *Ty = Call->getType(); + Value *Arg = Call->getArgOperand(0); + if (TTI.haveFastSqrt(Ty) && + (Call->hasNoNaNs() || + cannotBeOrderedLessThanZero(Arg, M->getDataLayout(), &TLI, 0, &AC, &I, + &DT))) { + IRBuilder<> Builder(&I); + IRBuilderBase::FastMathFlagGuard Guard(Builder); + Builder.setFastMathFlags(Call->getFastMathFlags()); + + Function *Sqrt = Intrinsic::getDeclaration(M, Intrinsic::sqrt, Ty); + Value *NewSqrt = Builder.CreateCall(Sqrt, Arg, "sqrt"); + I.replaceAllUsesWith(NewSqrt); + + // Explicitly erase the old call because a call with side effects is not + // trivially dead. + I.eraseFromParent(); + return true; + } + + return false; +} + // Check if this array of constants represents a cttz table. // Iterate over the elements from \p Table by trying to find/match all // the numbers from 0 to \p InputBits that should represent cttz results. @@ -869,159 +915,13 @@ static bool foldPatternedLoads(Instruction &I, const DataLayout &DL) { return true; } -/// Try to replace a mathlib call to sqrt with the LLVM intrinsic. This avoids -/// pessimistic codegen that has to account for setting errno and can enable -/// vectorization. 
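For context on the foldSqrt transform restored above: a libm sqrt() call may set errno, so the optimizer has to treat every call as having a side effect, which in turn blocks vectorization; rewriting the call to the llvm.sqrt intrinsic lifts that constraint once NaN creation can be ruled out. A minimal illustration of a loop this fold can unblock (not part of the patch; the function name is hypothetical):

    #include <cmath>

    // In[I] * In[I] + 1.0f is either NaN or at least 1.0f, so it can never
    // compare ordered-less-than -0.0: sqrt cannot create a new NaN or set
    // errno here. That makes the call eligible for the libcall-to-intrinsic
    // rewrite, leaving the loop body side-effect free and vectorizable.
    void normalize(float *Out, const float *In, int N) {
      for (int I = 0; I < N; ++I)
        Out[I] = std::sqrt(In[I] * In[I] + 1.0f);
    }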
-static bool foldSqrt(CallInst *Call, TargetTransformInfo &TTI, - TargetLibraryInfo &TLI, AssumptionCache &AC, - DominatorTree &DT) { - Module *M = Call->getModule(); - - // If (1) this is a sqrt libcall, (2) we can assume that NAN is not created - // (because NNAN or the operand arg must not be less than -0.0) and (2) we - // would not end up lowering to a libcall anyway (which could change the value - // of errno), then: - // (1) errno won't be set. - // (2) it is safe to convert this to an intrinsic call. - Type *Ty = Call->getType(); - Value *Arg = Call->getArgOperand(0); - if (TTI.haveFastSqrt(Ty) && - (Call->hasNoNaNs() || - cannotBeOrderedLessThanZero(Arg, M->getDataLayout(), &TLI, 0, &AC, Call, - &DT))) { - IRBuilder<> Builder(Call); - IRBuilderBase::FastMathFlagGuard Guard(Builder); - Builder.setFastMathFlags(Call->getFastMathFlags()); - - Function *Sqrt = Intrinsic::getDeclaration(M, Intrinsic::sqrt, Ty); - Value *NewSqrt = Builder.CreateCall(Sqrt, Arg, "sqrt"); - Call->replaceAllUsesWith(NewSqrt); - - // Explicitly erase the old call because a call with side effects is not - // trivially dead. - Call->eraseFromParent(); - return true; - } - - return false; -} - -/// Try to expand strcmp(P, "x") calls. -static bool expandStrcmp(CallInst *CI, DominatorTree &DT, bool &MadeCFGChange) { - Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1); - - // Trivial cases are optimized during inst combine - if (Str1P == Str2P) - return false; - - StringRef Str1, Str2; - bool HasStr1 = getConstantStringInfo(Str1P, Str1); - bool HasStr2 = getConstantStringInfo(Str2P, Str2); - - Value *NonConstantP = nullptr; - StringRef ConstantStr; - - if (!HasStr1 && HasStr2 && Str2.size() == 1) { - NonConstantP = Str1P; - ConstantStr = Str2; - } else if (!HasStr2 && HasStr1 && Str1.size() == 1) { - NonConstantP = Str2P; - ConstantStr = Str1; - } else { - return false; - } - - // Check if strcmp result is only used in a comparison with zero - if (!isOnlyUsedInZeroComparison(CI)) - return false; - - // For strcmp(P, "x") do the following transformation: - // - // (before) - // dst = strcmp(P, "x") - // - // (after) - // v0 = P[0] - 'x' - // [if v0 == 0] - // v1 = P[1] - // dst = phi(v0, v1) - // - - IRBuilder<> B(CI->getParent()); - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); - - Type *RetType = CI->getType(); - - B.SetInsertPoint(CI); - BasicBlock *InitialBB = B.GetInsertBlock(); - Value *Str1FirstCharacterValue = - B.CreateZExt(B.CreateLoad(B.getInt8Ty(), NonConstantP), RetType); - Value *Str2FirstCharacterValue = - ConstantInt::get(RetType, static_cast<unsigned char>(ConstantStr[0])); - Value *FirstCharacterSub = - B.CreateNSWSub(Str1FirstCharacterValue, Str2FirstCharacterValue); - Value *IsFirstCharacterSubZero = - B.CreateICmpEQ(FirstCharacterSub, ConstantInt::get(RetType, 0)); - Instruction *IsFirstCharacterSubZeroBBTerminator = SplitBlockAndInsertIfThen( - IsFirstCharacterSubZero, CI, /*Unreachable*/ false, - /*BranchWeights*/ nullptr, &DTU); - - B.SetInsertPoint(IsFirstCharacterSubZeroBBTerminator); - B.GetInsertBlock()->setName("strcmp_expand_sub_is_zero"); - BasicBlock *IsFirstCharacterSubZeroBB = B.GetInsertBlock(); - Value *Str1SecondCharacterValue = B.CreateZExt( - B.CreateLoad(B.getInt8Ty(), B.CreateConstInBoundsGEP1_64( - B.getInt8Ty(), NonConstantP, 1)), - RetType); - - B.SetInsertPoint(CI); - B.GetInsertBlock()->setName("strcmp_expand_sub_join"); - - PHINode *Result = B.CreatePHI(RetType, 2); - Result->addIncoming(FirstCharacterSub, InitialBB); - 
Result->addIncoming(Str1SecondCharacterValue, IsFirstCharacterSubZeroBB); - - CI->replaceAllUsesWith(Result); - CI->eraseFromParent(); - - MadeCFGChange = true; - - return true; -} - -static bool foldLibraryCalls(Instruction &I, TargetTransformInfo &TTI, - TargetLibraryInfo &TLI, DominatorTree &DT, - AssumptionCache &AC, bool &MadeCFGChange) { - CallInst *CI = dyn_cast<CallInst>(&I); - if (!CI) - return false; - - LibFunc Func; - Module *M = I.getModule(); - if (!TLI.getLibFunc(*CI, Func) || !isLibFuncEmittable(M, &TLI, Func)) - return false; - - switch (Func) { - case LibFunc_sqrt: - case LibFunc_sqrtf: - case LibFunc_sqrtl: - return foldSqrt(CI, TTI, TLI, AC, DT); - case LibFunc_strcmp: - return expandStrcmp(CI, DT, MadeCFGChange); - default: - break; - } - - return false; -} - /// This is the entry point for folds that could be implemented in regular /// InstCombine, but they are separated because they are not expected to /// occur frequently and/or have more than a constant-length pattern match. static bool foldUnusualPatterns(Function &F, DominatorTree &DT, TargetTransformInfo &TTI, TargetLibraryInfo &TLI, AliasAnalysis &AA, - AssumptionCache &AC, bool &MadeCFGChange) { + AssumptionCache &AC) { bool MadeChange = false; for (BasicBlock &BB : F) { // Ignore unreachable basic blocks. @@ -1046,7 +946,7 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT, // NOTE: This function introduces erasing of the instruction `I`, so it // needs to be called at the end of this sequence, otherwise we may make // bugs. - MadeChange |= foldLibraryCalls(I, TTI, TLI, DT, AC, MadeCFGChange); + MadeChange |= foldSqrt(I, TTI, TLI, AC, DT); } } @@ -1062,12 +962,12 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT, /// handled in the callers of this function. static bool runImpl(Function &F, AssumptionCache &AC, TargetTransformInfo &TTI, TargetLibraryInfo &TLI, DominatorTree &DT, - AliasAnalysis &AA, bool &ChangedCFG) { + AliasAnalysis &AA) { bool MadeChange = false; const DataLayout &DL = F.getParent()->getDataLayout(); TruncInstCombine TIC(AC, TLI, DL, DT); MadeChange |= TIC.run(F); - MadeChange |= foldUnusualPatterns(F, DT, TTI, TLI, AA, AC, ChangedCFG); + MadeChange |= foldUnusualPatterns(F, DT, TTI, TLI, AA, AC); return MadeChange; } @@ -1078,21 +978,12 @@ PreservedAnalyses AggressiveInstCombinePass::run(Function &F, auto &DT = AM.getResult<DominatorTreeAnalysis>(F); auto &TTI = AM.getResult<TargetIRAnalysis>(F); auto &AA = AM.getResult<AAManager>(F); - - bool MadeCFGChange = false; - - if (!runImpl(F, AC, TTI, TLI, DT, AA, MadeCFGChange)) { + if (!runImpl(F, AC, TTI, TLI, DT, AA)) { // No changes, all analyses are preserved. return PreservedAnalyses::all(); } - // Mark all the analyses that instcombine updates as preserved. 
PreservedAnalyses PA; - - if (MadeCFGChange) - PA.preserve<DominatorTreeAnalysis>(); - else - PA.preserveSet<CFGAnalyses>(); - + PA.preserveSet<CFGAnalyses>(); return PA; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroElide.cpp b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroElide.cpp index d78ab1c1ea28..d0606c15f3d5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroElide.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroElide.cpp @@ -194,12 +194,49 @@ bool Lowerer::hasEscapePath(const CoroBeginInst *CB, for (auto *DA : It->second) Visited.insert(DA->getParent()); + SmallPtrSet<const BasicBlock *, 32> EscapingBBs; + for (auto *U : CB->users()) { + // Uses from coroutine intrinsics are not a problem. + if (isa<CoroFreeInst, CoroSubFnInst, CoroSaveInst>(U)) + continue; + + // Conservatively treat every other use as a potential escape. + // + // Note that the major user of switch-ABI coroutines (C++) will store + // resume.fn, destroy.fn and the index to the coroutine frame immediately. + // So the parent of the coro.begin in C++ will always be escaping, and + // we can't get any performance benefit for C++ by improving the + // precision of the method. + // + // We still perform the check because we want switch-ABI LLVM coroutines + // to be as self-contained as possible, rather than a by-product of + // C++20 coroutines. + EscapingBBs.insert(cast<Instruction>(U)->getParent()); + } + + bool PotentiallyEscaped = false; + do { const auto *BB = Worklist.pop_back_val(); if (!Visited.insert(BB).second) continue; - if (TIs.count(BB)) - return true; + + // A path-insensitive marker to test whether the coro.begin escapes. + // It is intentionally path-insensitive, at some cost in precision, + // because we don't want the analysis to be too slow. + PotentiallyEscaped |= EscapingBBs.count(BB); + + if (TIs.count(BB)) { + if (!BB->getTerminator()->isExceptionalTerminator() || PotentiallyEscaped) + return true; + + // If the function exits via an exceptional terminator, the memory used + // by the coroutine frame is released automatically by stack unwinding. + // So we can consider that the coro.begin doesn't escape on such paths. + + continue; + } // Conservatively say that there is potentially a path. if (!--Limit) @@ -236,36 +273,36 @@ bool Lowerer::shouldElide(Function *F, DominatorTree &DT) const { // memory location storing that value and not the virtual register. SmallPtrSet<BasicBlock *, 8> Terminators; - // First gather all of the non-exceptional terminators for the function. + // First gather all of the terminators for the function. // Consider the final coro.suspend as the real terminator when the current // function is a coroutine. - for (BasicBlock &B : *F) { - auto *TI = B.getTerminator(); - if (TI->getNumSuccessors() == 0 && !TI->isExceptionalTerminator() && - !isa<UnreachableInst>(TI)) - Terminators.insert(&B); - } + for (BasicBlock &B : *F) { + auto *TI = B.getTerminator(); + + if (TI->getNumSuccessors() != 0 || isa<UnreachableInst>(TI)) + continue; + + Terminators.insert(&B); + } // Filter out the coro.destroys that lie along exceptional paths. SmallPtrSet<CoroBeginInst *, 8> ReferencedCoroBegins; for (const auto &It : DestroyAddr) { - // If there is any coro.destroy dominates all of the terminators for the - // coro.begin, we could know the corresponding coro.begin wouldn't escape.
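The escape walk added to hasEscapePath above is easier to read in isolation. A simplified, self-contained sketch of the same idea (stand-in types, not the LLVM classes; it omits the visit limit and the coro.destroy filtering that the real code layers on top):

    #include <set>
    #include <vector>

    struct Block {
      std::vector<Block *> Succs;
      bool HasEscapingUse = false;    // holds a non-intrinsic use of coro.begin
      bool IsExit = false;            // function-ending block
      bool IsExceptionalExit = false; // ends in an exceptional terminator
    };

    bool mayEscape(Block *Entry) {
      std::vector<Block *> Worklist{Entry};
      std::set<Block *> Visited;
      bool PotentiallyEscaped = false; // path-insensitive: once set, stays set
      while (!Worklist.empty()) {
        Block *B = Worklist.back();
        Worklist.pop_back();
        if (!Visited.insert(B).second)
          continue;
        PotentiallyEscaped |= B->HasEscapingUse;
        if (B->IsExit) {
          // Normal exits count as escapes outright; exceptional exits count
          // only if an escaping use was already seen, because stack
          // unwinding releases the coroutine frame on its own.
          if (!B->IsExceptionalExit || PotentiallyEscaped)
            return true;
          continue;
        }
        Worklist.insert(Worklist.end(), B->Succs.begin(), B->Succs.end());
      }
      return false;
    }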
- for (Instruction *DA : It.second) { - if (llvm::all_of(Terminators, [&](auto *TI) { - return DT.dominates(DA, TI->getTerminator()); - })) { - ReferencedCoroBegins.insert(It.first); - break; - } - } - - // Whether there is any paths from coro.begin to Terminators which not pass - // through any of the coro.destroys. + // If every terminator is dominated by some coro.destroy, we know the + // corresponding coro.begin doesn't escape. + // + // Otherwise, hasEscapePath decides whether there is any path from the + // coro.begin to the Terminators that does not pass through any of the + // coro.destroys. // // hasEscapePath is relatively slow, so we avoid running it as much as // possible. - if (!ReferencedCoroBegins.count(It.first) && + if (llvm::all_of(Terminators, + [&](auto *TI) { + return llvm::any_of(It.second, [&](auto *DA) { + return DT.dominates(DA, TI->getTerminator()); + }); + }) || !hasEscapePath(It.first, Terminators)) ReferencedCoroBegins.insert(It.first); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp index 3d6c501e4596..ac5dbc7cfb2a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp @@ -78,11 +78,6 @@ static cl::opt<unsigned> MaxClones( "The maximum number of clones allowed for a single function " "specialization")); -static cl::opt<unsigned> MaxIncomingPhiValues( - "funcspec-max-incoming-phi-values", cl::init(4), cl::Hidden, cl::desc( - "The maximum number of incoming values a PHI node can have to be " - "considered during the specialization bonus estimation")); - static cl::opt<unsigned> MinFunctionSize( "funcspec-min-function-size", cl::init(100), cl::Hidden, cl::desc( "Don't specialize functions that have less than this number of " @@ -109,7 +104,6 @@ static cl::opt<bool> SpecializeLiteralConstant( // the combination of size and latency savings in comparison to the non // specialized version of the function. static Cost estimateBasicBlocks(SmallVectorImpl<BasicBlock *> &WorkList, - DenseSet<BasicBlock *> &DeadBlocks, ConstMap &KnownConstants, SCCPSolver &Solver, BlockFrequencyInfo &BFI, TargetTransformInfo &TTI) { @@ -124,12 +118,6 @@ static Cost estimateBasicBlocks(SmallVectorImpl<BasicBlock *> &WorkList, if (!Weight) continue; - // These blocks are considered dead as far as the InstCostVisitor is - // concerned. They haven't been proven dead yet by the Solver, but - // may become if we propagate the constant specialization arguments. - if (!DeadBlocks.insert(BB).second) - continue; - for (Instruction &I : *BB) { // Disregard SSA copies. if (auto *II = dyn_cast<IntrinsicInst>(&I)) @@ -164,19 +152,9 @@ static Constant *findConstantFor(Value *V, ConstMap &KnownConstants) { return nullptr; } -Cost InstCostVisitor::getBonusFromPendingPHIs() { - Cost Bonus = 0; - while (!PendingPHIs.empty()) { - Instruction *Phi = PendingPHIs.pop_back_val(); - Bonus += getUserBonus(Phi); - } - return Bonus; -} - Cost InstCostVisitor::getUserBonus(Instruction *User, Value *Use, Constant *C) { // Cache the iterator before visiting. - LastVisited = Use ?
KnownConstants.insert({Use, C}).first - : KnownConstants.end(); + LastVisited = KnownConstants.insert({Use, C}).first; if (auto *I = dyn_cast<SwitchInst>(User)) return estimateSwitchInst(*I); @@ -203,15 +181,13 @@ Cost InstCostVisitor::getUserBonus(Instruction *User, Value *Use, Constant *C) { for (auto *U : User->users()) if (auto *UI = dyn_cast<Instruction>(U)) - if (UI != User && Solver.isBlockExecutable(UI->getParent())) + if (Solver.isBlockExecutable(UI->getParent())) Bonus += getUserBonus(UI, User, C); return Bonus; } Cost InstCostVisitor::estimateSwitchInst(SwitchInst &I) { - assert(LastVisited != KnownConstants.end() && "Invalid iterator!"); - if (I.getCondition() != LastVisited->first) return 0; @@ -232,13 +208,10 @@ Cost InstCostVisitor::estimateSwitchInst(SwitchInst &I) { WorkList.push_back(BB); } - return estimateBasicBlocks(WorkList, DeadBlocks, KnownConstants, Solver, BFI, - TTI); + return estimateBasicBlocks(WorkList, KnownConstants, Solver, BFI, TTI); } Cost InstCostVisitor::estimateBranchInst(BranchInst &I) { - assert(LastVisited != KnownConstants.end() && "Invalid iterator!"); - if (I.getCondition() != LastVisited->first) return 0; @@ -250,39 +223,10 @@ Cost InstCostVisitor::estimateBranchInst(BranchInst &I) { Succ->getUniquePredecessor() == I.getParent()) WorkList.push_back(Succ); - return estimateBasicBlocks(WorkList, DeadBlocks, KnownConstants, Solver, BFI, - TTI); -} - -Constant *InstCostVisitor::visitPHINode(PHINode &I) { - if (I.getNumIncomingValues() > MaxIncomingPhiValues) - return nullptr; - - bool Inserted = VisitedPHIs.insert(&I).second; - Constant *Const = nullptr; - - for (unsigned Idx = 0, E = I.getNumIncomingValues(); Idx != E; ++Idx) { - Value *V = I.getIncomingValue(Idx); - if (auto *Inst = dyn_cast<Instruction>(V)) - if (Inst == &I || DeadBlocks.contains(I.getIncomingBlock(Idx))) - continue; - Constant *C = findConstantFor(V, KnownConstants); - if (!C) { - if (Inserted) - PendingPHIs.push_back(&I); - return nullptr; - } - if (!Const) - Const = C; - else if (C != Const) - return nullptr; - } - return Const; + return estimateBasicBlocks(WorkList, KnownConstants, Solver, BFI, TTI); } Constant *InstCostVisitor::visitFreezeInst(FreezeInst &I) { - assert(LastVisited != KnownConstants.end() && "Invalid iterator!"); - if (isGuaranteedNotToBeUndefOrPoison(LastVisited->second)) return LastVisited->second; return nullptr; @@ -309,8 +253,6 @@ Constant *InstCostVisitor::visitCallBase(CallBase &I) { } Constant *InstCostVisitor::visitLoadInst(LoadInst &I) { - assert(LastVisited != KnownConstants.end() && "Invalid iterator!"); - if (isa<ConstantPointerNull>(LastVisited->second)) return nullptr; return ConstantFoldLoadFromConstPtr(LastVisited->second, I.getType(), DL); @@ -333,8 +275,6 @@ Constant *InstCostVisitor::visitGetElementPtrInst(GetElementPtrInst &I) { } Constant *InstCostVisitor::visitSelectInst(SelectInst &I) { - assert(LastVisited != KnownConstants.end() && "Invalid iterator!"); - if (I.getCondition() != LastVisited->first) return nullptr; @@ -350,8 +290,6 @@ Constant *InstCostVisitor::visitCastInst(CastInst &I) { } Constant *InstCostVisitor::visitCmpInst(CmpInst &I) { - assert(LastVisited != KnownConstants.end() && "Invalid iterator!"); - bool Swap = I.getOperand(1) == LastVisited->first; Value *V = Swap ? 
I.getOperand(0) : I.getOperand(1); Constant *Other = findConstantFor(V, KnownConstants); @@ -365,14 +303,10 @@ Constant *InstCostVisitor::visitCmpInst(CmpInst &I) { } Constant *InstCostVisitor::visitUnaryOperator(UnaryOperator &I) { - assert(LastVisited != KnownConstants.end() && "Invalid iterator!"); - return ConstantFoldUnaryOpOperand(I.getOpcode(), LastVisited->second, DL); } Constant *InstCostVisitor::visitBinaryOperator(BinaryOperator &I) { - assert(LastVisited != KnownConstants.end() && "Invalid iterator!"); - bool Swap = I.getOperand(1) == LastVisited->first; Value *V = Swap ? I.getOperand(0) : I.getOperand(1); Constant *Other = findConstantFor(V, KnownConstants); @@ -779,17 +713,13 @@ bool FunctionSpecializer::findSpecializations(Function *F, Cost SpecCost, AllSpecs[Index].CallSites.push_back(&CS); } else { // Calculate the specialisation gain. - Cost Score = 0; + Cost Score = 0 - SpecCost; InstCostVisitor Visitor = getInstCostVisitorFor(F); for (ArgInfo &A : S.Args) Score += getSpecializationBonus(A.Formal, A.Actual, Visitor); - Score += Visitor.getBonusFromPendingPHIs(); - - LLVM_DEBUG(dbgs() << "FnSpecialization: Specialization score = " - << Score << "\n"); // Discard unprofitable specialisations. - if (!ForceSpecialization && Score <= SpecCost) + if (!ForceSpecialization && Score <= 0) continue; // Create a new specialisation entry. diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index afd6e034f46d..767b7c7defbb 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -906,7 +906,7 @@ InstCombinerImpl::foldBinOpOfSelectAndCastOfSelectCondition(BinaryOperator &I) { auto NewFoldedConst = [&](bool IsTrueArm, Value *V) { bool IsCastOpRHS = (CastOp == RHS); - bool IsZExt = isa<ZExtInst>(CastOp); + bool IsZExt = isa<ZExtOperator>(CastOp); Constant *C; if (IsTrueArm) { diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp index 3e3be536defc..597cec8e61c9 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp @@ -1777,6 +1777,20 @@ void CHR::cloneScopeBlocks(CHRScope *Scope, BasicBlock *NewBB = CloneBasicBlock(BB, VMap, ".nonchr", &F); NewBlocks.push_back(NewBB); VMap[BB] = NewBB; + + // Unreachable predecessors will not be cloned and will not have an edge + // to the cloned block. As such, also remove them from any phi nodes. + // To avoid iterator invalidation, first collect the dead predecessors + // from the first phi node, and then perform the actual removal. 
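The ControlHeightReduction hunk above follows a common two-phase shape: collect what must be removed while the data structure is still being read, then mutate. Restated as a standalone helper, assuming the usual LLVM headers and using namespace llvm (a sketch, not the patch itself):

    // Every PHI in a block lists the same incoming blocks, so the dead
    // predecessors can be collected once, from the first PHI, before any
    // removal mutates the incoming lists. The removal loop then uses
    // make_early_inc_range because removeIncomingValue may erase a PHI once
    // its last incoming value is dropped.
    static void removeUnreachableIncomingValues(BasicBlock &BB,
                                                const DominatorTree &DT) {
      auto *FirstPN = dyn_cast<PHINode>(BB.begin());
      if (!FirstPN)
        return; // No PHIs, nothing to clean up.
      SmallVector<BasicBlock *> DeadPreds;
      for (BasicBlock *Pred : FirstPN->blocks())
        if (!DT.isReachableFromEntry(Pred))
          DeadPreds.push_back(Pred);
      for (PHINode &PN : make_early_inc_range(BB.phis()))
        for (BasicBlock *Pred : DeadPreds)
          PN.removeIncomingValue(Pred);
    }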
+ if (auto *FirstPN = dyn_cast<PHINode>(NewBB->begin())) { + SmallVector<BasicBlock *> DeadPreds; + for (BasicBlock *Pred : FirstPN->blocks()) + if (!DT.isReachableFromEntry(Pred)) + DeadPreds.push_back(Pred); + for (PHINode &PN : make_early_inc_range(NewBB->phis())) + for (BasicBlock *Pred : DeadPreds) + PN.removeIncomingValue(Pred); + } } // Place the cloned blocks right after the original blocks (right before the diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp index 21f0b1a92293..75adcabc0d34 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -898,7 +898,9 @@ bool GCOVProfiler::emitProfileNotes( if (Line == Loc.getLine()) continue; Line = Loc.getLine(); - if (SP != getDISubprogram(Loc.getScope())) + MDNode *Scope = Loc.getScope(); + // TODO: Handle blocks from another file due to #line, #include, etc. + if (isa<DILexicalBlockFile>(Scope) || SP != getDISubprogram(Scope)) continue; GCOVLines &Lines = Block.getFile(Filename); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index 15628d32280d..2b88dd08d88b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -424,7 +424,7 @@ static Decomposition decompose(Value *V, return MergeResults(Op0, Op1, IsSigned); ConstantInt *CI; - if (match(V, m_NSWMul(m_Value(Op0), m_ConstantInt(CI)))) { + if (match(V, m_NSWMul(m_Value(Op0), m_ConstantInt(CI))) && canUseSExt(CI)) { auto Result = decompose(Op0, Preconditions, IsSigned, DL); Result.mul(CI->getSExtValue()); return Result; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 24390f1b54f6..5b8f1b00dc03 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -1269,6 +1269,7 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) { if (IsLoadCSE) { LoadInst *NLoadI = cast<LoadInst>(AvailableVal); combineMetadataForCSE(NLoadI, LoadI, false); + LVI->forgetValue(NLoadI); }; // If the returned value is the load itself, replace with poison. 
This can @@ -1461,6 +1462,7 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) { for (LoadInst *PredLoadI : CSELoads) { combineMetadataForCSE(PredLoadI, LoadI, true); + LVI->forgetValue(PredLoadI); } LoadI->replaceAllUsesWith(PN); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 68642a01b37c..00937e0d734a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -69,7 +69,6 @@ STATISTIC(NumMemSetInfer, "Number of memsets inferred"); STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy"); STATISTIC(NumCpyToSet, "Number of memcpys converted to memset"); STATISTIC(NumCallSlot, "Number of call slot optimizations performed"); -STATISTIC(NumStackMove, "Number of stack-move optimizations performed"); namespace { @@ -731,23 +730,6 @@ bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI, return true; } - // If this is a load-store pair from a stack slot to a stack slot, we - // might be able to perform the stack-move optimization just as we do for - // memcpys from an alloca to an alloca. - if (auto *DestAlloca = dyn_cast<AllocaInst>(SI->getPointerOperand())) { - if (auto *SrcAlloca = dyn_cast<AllocaInst>(LI->getPointerOperand())) { - if (performStackMoveOptzn(LI, SI, DestAlloca, SrcAlloca, - DL.getTypeStoreSize(T), BAA)) { - // Avoid invalidating the iterator. - BBI = SI->getNextNonDebugInstruction()->getIterator(); - eraseInstruction(SI); - eraseInstruction(LI); - ++NumMemCpyInstr; - return true; - } - } - } - return false; } @@ -1426,217 +1408,6 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, return true; } -// Attempts to optimize the pattern whereby memory is copied from an alloca to -// another alloca, where the two allocas don't have conflicting mod/ref. If -// successful, the two allocas can be merged into one and the transfer can be -// deleted. This pattern is generated frequently in Rust, due to the ubiquity of -// move operations in that language. -// -// Once we determine that the optimization is safe to perform, we replace all -// uses of the destination alloca with the source alloca. We also "shrink wrap" -// the lifetime markers of the single merged alloca to before the first use -// and after the last use. Note that the "shrink wrapping" procedure is a safe -// transformation only because we restrict the scope of this optimization to -// allocas that aren't captured. -bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store, - AllocaInst *DestAlloca, - AllocaInst *SrcAlloca, uint64_t Size, - BatchAAResults &BAA) { - LLVM_DEBUG(dbgs() << "Stack Move: Attempting to optimize:\n" - << *Store << "\n"); - - // Make sure the two allocas are in the same address space. - if (SrcAlloca->getAddressSpace() != DestAlloca->getAddressSpace()) { - LLVM_DEBUG(dbgs() << "Stack Move: Address space mismatch\n"); - return false; - } - - // 1. Check that copy is full. Calculate the static size of the allocas to be - // merged, bail out if we can't. 
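For orientation while reading this removal: performStackMoveOptzn targeted full-object copies between two non-escaping stack slots of the same static size, merging the slots and deleting the copy. In source terms the shape is roughly the following (illustrative C++; names are hypothetical):

    #include <cstring>

    struct Big { char Bytes[256]; };

    int consume(const Big &B);

    int movePattern(const Big &Init) {
      Big Src = Init;                        // one alloca
      Big Dst;                               // another alloca, same size
      std::memcpy(&Dst, &Src, sizeof(Big));  // full-size slot-to-slot copy
      return consume(Dst);                   // Src is never used again
    }

Rust front ends emit this pattern frequently for moves, which is why the optimization was introduced; the hunks here remove it.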
- const DataLayout &DL = DestAlloca->getModule()->getDataLayout(); - std::optional<TypeSize> SrcSize = SrcAlloca->getAllocationSize(DL); - if (!SrcSize || SrcSize->isScalable() || Size != SrcSize->getFixedValue()) { - LLVM_DEBUG(dbgs() << "Stack Move: Source alloca size mismatch\n"); - return false; - } - std::optional<TypeSize> DestSize = DestAlloca->getAllocationSize(DL); - if (!DestSize || DestSize->isScalable() || - Size != DestSize->getFixedValue()) { - LLVM_DEBUG(dbgs() << "Stack Move: Destination alloca size mismatch\n"); - return false; - } - - // 2-1. Check that src and dest are static allocas, which are not affected by - // stacksave/stackrestore. - if (!SrcAlloca->isStaticAlloca() || !DestAlloca->isStaticAlloca() || - SrcAlloca->getParent() != Load->getParent() || - SrcAlloca->getParent() != Store->getParent()) - return false; - - // 2-2. Check that src and dest are never captured, unescaped allocas. Also - // collect lifetime markers first/last users in order to shrink wrap the - // lifetimes, and instructions with noalias metadata to remove them. - - SmallVector<Instruction *, 4> LifetimeMarkers; - Instruction *FirstUser = nullptr, *LastUser = nullptr; - SmallSet<Instruction *, 4> NoAliasInstrs; - - // Recursively track the user and check whether modified alias exist. - auto IsDereferenceableOrNull = [](Value *V, const DataLayout &DL) -> bool { - bool CanBeNull, CanBeFreed; - return V->getPointerDereferenceableBytes(DL, CanBeNull, CanBeFreed); - }; - - auto CaptureTrackingWithModRef = - [&](Instruction *AI, - function_ref<bool(Instruction *)> ModRefCallback) -> bool { - SmallVector<Instruction *, 8> Worklist; - Worklist.push_back(AI); - unsigned MaxUsesToExplore = getDefaultMaxUsesToExploreForCaptureTracking(); - Worklist.reserve(MaxUsesToExplore); - SmallSet<const Use *, 20> Visited; - while (!Worklist.empty()) { - Instruction *I = Worklist.back(); - Worklist.pop_back(); - for (const Use &U : I->uses()) { - if (Visited.size() >= MaxUsesToExplore) { - LLVM_DEBUG( - dbgs() - << "Stack Move: Exceeded max uses to see ModRef, bailing\n"); - return false; - } - if (!Visited.insert(&U).second) - continue; - switch (DetermineUseCaptureKind(U, IsDereferenceableOrNull)) { - case UseCaptureKind::MAY_CAPTURE: - return false; - case UseCaptureKind::PASSTHROUGH: - // Instructions cannot have non-instruction users. - Worklist.push_back(cast<Instruction>(U.getUser())); - continue; - case UseCaptureKind::NO_CAPTURE: { - auto *UI = cast<Instruction>(U.getUser()); - if (DestAlloca->getParent() != UI->getParent()) - return false; - if (!FirstUser || UI->comesBefore(FirstUser)) - FirstUser = UI; - if (!LastUser || LastUser->comesBefore(UI)) - LastUser = UI; - if (UI->isLifetimeStartOrEnd()) { - // We note the locations of these intrinsic calls so that we can - // delete them later if the optimization succeeds, this is safe - // since both llvm.lifetime.start and llvm.lifetime.end intrinsics - // conceptually fill all the bytes of the alloca with an undefined - // value. - int64_t Size = cast<ConstantInt>(UI->getOperand(0))->getSExtValue(); - if (Size < 0 || Size == DestSize) { - LifetimeMarkers.push_back(UI); - continue; - } - } - if (UI->hasMetadata(LLVMContext::MD_noalias)) - NoAliasInstrs.insert(UI); - if (!ModRefCallback(UI)) - return false; - } - } - } - } - return true; - }; - - // 3. Check that dest has no Mod/Ref, except full size lifetime intrinsics, - // from the alloca to the Store. 
- ModRefInfo DestModRef = ModRefInfo::NoModRef; - MemoryLocation DestLoc(DestAlloca, LocationSize::precise(Size)); - auto DestModRefCallback = [&](Instruction *UI) -> bool { - // We don't care about the store itself. - if (UI == Store) - return true; - ModRefInfo Res = BAA.getModRefInfo(UI, DestLoc); - // FIXME: For multi-BB cases, we need to see reachability from it to - // store. - // Bailout if Dest may have any ModRef before Store. - if (UI->comesBefore(Store) && isModOrRefSet(Res)) - return false; - DestModRef |= BAA.getModRefInfo(UI, DestLoc); - - return true; - }; - - if (!CaptureTrackingWithModRef(DestAlloca, DestModRefCallback)) - return false; - - // 3. Check that, from after the Load to the end of the BB, - // 3-1. if the dest has any Mod, src has no Ref, and - // 3-2. if the dest has any Ref, src has no Mod except full-sized lifetimes. - MemoryLocation SrcLoc(SrcAlloca, LocationSize::precise(Size)); - - auto SrcModRefCallback = [&](Instruction *UI) -> bool { - // Any ModRef before Load doesn't matter, also Load and Store can be - // ignored. - if (UI->comesBefore(Load) || UI == Load || UI == Store) - return true; - ModRefInfo Res = BAA.getModRefInfo(UI, SrcLoc); - if ((isModSet(DestModRef) && isRefSet(Res)) || - (isRefSet(DestModRef) && isModSet(Res))) - return false; - - return true; - }; - - if (!CaptureTrackingWithModRef(SrcAlloca, SrcModRefCallback)) - return false; - - // We can do the transformation. First, align the allocas appropriately. - SrcAlloca->setAlignment( - std::max(SrcAlloca->getAlign(), DestAlloca->getAlign())); - - // Merge the two allocas. - DestAlloca->replaceAllUsesWith(SrcAlloca); - eraseInstruction(DestAlloca); - - // Drop metadata on the source alloca. - SrcAlloca->dropUnknownNonDebugMetadata(); - - // Do "shrink wrap" the lifetimes, if the original lifetime intrinsics exists. - if (!LifetimeMarkers.empty()) { - LLVMContext &C = SrcAlloca->getContext(); - IRBuilder<> Builder(C); - - ConstantInt *AllocaSize = ConstantInt::get(Type::getInt64Ty(C), Size); - // Create a new lifetime start marker before the first user of src or alloca - // users. - Builder.SetInsertPoint(FirstUser->getParent(), FirstUser->getIterator()); - Builder.CreateLifetimeStart(SrcAlloca, AllocaSize); - - // Create a new lifetime end marker after the last user of src or alloca - // users. - // FIXME: If the last user is the terminator for the bb, we can insert - // lifetime.end marker to the immidiate post-dominator, but currently do - // nothing. - if (!LastUser->isTerminator()) { - Builder.SetInsertPoint(LastUser->getParent(), ++LastUser->getIterator()); - Builder.CreateLifetimeEnd(SrcAlloca, AllocaSize); - } - - // Remove all other lifetime markers. - for (Instruction *I : LifetimeMarkers) - eraseInstruction(I); - } - - // As this transformation can cause memory accesses that didn't previously - // alias to begin to alias one another, we remove !noalias metadata from any - // uses of either alloca. This is conservative, but more precision doesn't - // seem worthwhile right now. - for (Instruction *I : NoAliasInstrs) - I->setMetadata(LLVMContext::MD_noalias, nullptr); - - LLVM_DEBUG(dbgs() << "Stack Move: Performed staack-move optimization\n"); - NumStackMove++; - return true; -} - /// Perform simplification of memcpy's. 
If we have memcpy A /// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite /// B to be a memcpy from X to Z (or potentially a memmove, depending on @@ -1693,14 +1464,13 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) { MemoryAccess *SrcClobber = MSSA->getWalker()->getClobberingMemoryAccess( AnyClobber, MemoryLocation::getForSource(M), BAA); - // There are five possible optimizations we can do for memcpy: + // There are four possible optimizations we can do for memcpy: // a) memcpy-memcpy xform which exposes redundance for DSE. // b) call-memcpy xform for return slot optimization. // c) memcpy from freshly alloca'd space or space that has just started // its lifetime copies undefined data, and we can therefore eliminate // the memcpy in favor of the data that was already at the destination. // d) memcpy from a just-memset'd source can be turned into memset. - // e) elimination of memcpy via stack-move optimization. if (auto *MD = dyn_cast<MemoryDef>(SrcClobber)) { if (Instruction *MI = MD->getMemoryInst()) { if (auto *CopySize = dyn_cast<ConstantInt>(M->getLength())) { @@ -1719,8 +1489,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) { } } if (auto *MDep = dyn_cast<MemCpyInst>(MI)) - if (processMemCpyMemCpyDependence(M, MDep, BAA)) - return true; + return processMemCpyMemCpyDependence(M, MDep, BAA); if (auto *MDep = dyn_cast<MemSetInst>(MI)) { if (performMemCpyToMemSetOptzn(M, MDep, BAA)) { LLVM_DEBUG(dbgs() << "Converted memcpy to memset\n"); @@ -1739,27 +1508,6 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) { } } - // If the transfer is from a stack slot to a stack slot, then we may be able - // to perform the stack-move optimization. See the comments in - // performStackMoveOptzn() for more details. - auto *DestAlloca = dyn_cast<AllocaInst>(M->getDest()); - if (!DestAlloca) - return false; - auto *SrcAlloca = dyn_cast<AllocaInst>(M->getSource()); - if (!SrcAlloca) - return false; - ConstantInt *Len = dyn_cast<ConstantInt>(M->getLength()); - if (Len == nullptr) - return false; - if (performStackMoveOptzn(M, M, DestAlloca, SrcAlloca, Len->getZExtValue(), - BAA)) { - // Avoid invalidating the iterator. - BBI = M->getNextNonDebugInstruction()->getIterator(); - eraseInstruction(M); - ++NumMemCpyInstr; - return true; - } - return false; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp index 4f1350e4ebb9..2031e70bee1d 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -675,6 +675,12 @@ bool TailRecursionEliminator::eliminateCall(CallInst *CI) { for (unsigned I = 0, E = CI->arg_size(); I != E; ++I) { if (CI->isByValArgument(I)) { copyLocalTempOfByValueOperandIntoArguments(CI, I); + // When eliminating a tail call, we modify the values of the arguments. + // Therefore, if the byval parameter has a readonly attribute, we have to + // remove it. It is safe because, from the perspective of a caller, the + // byval parameter is always treated as "readonly," even if the readonly + // attribute is removed. 
+ F.removeParamAttr(I, Attribute::ReadOnly); ArgumentPHIs[I]->addIncoming(F.getArg(I), BB); } else ArgumentPHIs[I]->addIncoming(CI->getArgOperand(I), BB); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index 5b0951252c07..3ad97613fe7a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -227,9 +227,21 @@ static Value *convertStrToInt(CallInst *CI, StringRef &Str, Value *EndPtr, return ConstantInt::get(RetTy, Result); } +static bool isOnlyUsedInComparisonWithZero(Value *V) { + for (User *U : V->users()) { + if (ICmpInst *IC = dyn_cast<ICmpInst>(U)) + if (Constant *C = dyn_cast<Constant>(IC->getOperand(1))) + if (C->isNullValue()) + continue; + // Unknown instruction. + return false; + } + return true; +} + static bool canTransformToMemCmp(CallInst *CI, Value *Str, uint64_t Len, const DataLayout &DL) { - if (!isOnlyUsedInZeroComparison(CI)) + if (!isOnlyUsedInComparisonWithZero(CI)) return false; if (!isDereferenceableAndAlignedPointer(Str, Align(1), APInt(64, Len), DL)) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index d7e40e8ef978..b603bbe55dc9 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3781,10 +3781,44 @@ void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { // the incoming edges. VPBasicBlock *Header = State.Plan->getVectorLoopRegion()->getEntryBasicBlock(); + + // Gather all VPReductionPHIRecipes and sort them so that intermediate + // stores sunk outside of the loop keep the same order as they had in the + // original loop. + SmallVector<VPReductionPHIRecipe *> ReductionPHIList; for (VPRecipeBase &R : Header->phis()) { if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) - fixReduction(ReductionPhi, State); - else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) + ReductionPHIList.emplace_back(ReductionPhi); + } + stable_sort(ReductionPHIList, [this](const VPReductionPHIRecipe *R1, + const VPReductionPHIRecipe *R2) { + auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore; + auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore; + + // If neither of the recipes has an intermediate store, keep the order the + // same. + if (!IS1 && !IS2) + return false; + + // If only one of the recipes has an intermediate store, then move it + // towards the beginning of the list. + if (IS1 && !IS2) + return true; + + if (!IS1 && IS2) + return false; + + // If both recipes have an intermediate store, then the recipe with the + // later store should be processed earlier. So it should go to the beginning + // of the list. + return DT->dominates(IS2, IS1); + }); + + for (VPReductionPHIRecipe *ReductionPhi : ReductionPHIList) + fixReduction(ReductionPhi, State); + + for (VPRecipeBase &R : Header->phis()) { + if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) fixFixedOrderRecurrence(FOR, State); } }
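The reordering added to fixCrossIterationPHIs above exists because a reduction may also store its running value to memory inside the loop (an "intermediate store"). When two such stores are sunk out of the loop and their destinations may alias, the sunk stores must recreate the original last-writer order, which is what the dominance-based stable_sort preserves. In source terms (illustrative C++; names are hypothetical):

    void twoSums(const int *A, int N, int *S1, int *S2) {
      int Sum1 = 0, Sum2 = 0;
      for (int I = 0; I < N; ++I) {
        Sum1 += A[I];
        *S1 = Sum1; // intermediate store of the first reduction
        Sum2 += 2 * A[I];
        *S2 = Sum2; // intermediate store of the second reduction
      }
      // If S1 == S2, memory must hold Sum2 after the loop, so the sunk
      // stores must be emitted with the *S2 store last.
    }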