Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms')
220 files changed, 25355 insertions, 7555 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/contrib/llvm-project/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index 06222d7e7e44..59b94567a9c2 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -25,6 +25,7 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -121,14 +122,13 @@ static bool foldGuardedRotateToFunnelShift(Instruction &I) { BasicBlock *GuardBB = Phi.getIncomingBlock(RotSrc == P1); BasicBlock *RotBB = Phi.getIncomingBlock(RotSrc != P1); Instruction *TermI = GuardBB->getTerminator(); - BasicBlock *TrueBB, *FalseBB; ICmpInst::Predicate Pred; - if (!match(TermI, m_Br(m_ICmp(Pred, m_Specific(RotAmt), m_ZeroInt()), TrueBB, - FalseBB))) + BasicBlock *PhiBB = Phi.getParent(); + if (!match(TermI, m_Br(m_ICmp(Pred, m_Specific(RotAmt), m_ZeroInt()), + m_SpecificBB(PhiBB), m_SpecificBB(RotBB)))) return false; - BasicBlock *PhiBB = Phi.getParent(); - if (Pred != CmpInst::ICMP_EQ || TrueBB != PhiBB || FalseBB != RotBB) + if (Pred != CmpInst::ICMP_EQ) return false; // We matched a variation of this IR pattern: @@ -251,6 +251,72 @@ static bool foldAnyOrAllBitsSet(Instruction &I) { return true; } +// Try to recognize below function as popcount intrinsic. +// This is the "best" algorithm from +// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel +// Also used in TargetLowering::expandCTPOP(). +// +// int popcount(unsigned int i) { +// i = i - ((i >> 1) & 0x55555555); +// i = (i & 0x33333333) + ((i >> 2) & 0x33333333); +// i = ((i + (i >> 4)) & 0x0F0F0F0F); +// return (i * 0x01010101) >> 24; +// } +static bool tryToRecognizePopCount(Instruction &I) { + if (I.getOpcode() != Instruction::LShr) + return false; + + Type *Ty = I.getType(); + if (!Ty->isIntOrIntVectorTy()) + return false; + + unsigned Len = Ty->getScalarSizeInBits(); + // FIXME: fix Len == 8 and other irregular type lengths. + if (!(Len <= 128 && Len > 8 && Len % 8 == 0)) + return false; + + APInt Mask55 = APInt::getSplat(Len, APInt(8, 0x55)); + APInt Mask33 = APInt::getSplat(Len, APInt(8, 0x33)); + APInt Mask0F = APInt::getSplat(Len, APInt(8, 0x0F)); + APInt Mask01 = APInt::getSplat(Len, APInt(8, 0x01)); + APInt MaskShift = APInt(Len, Len - 8); + + Value *Op0 = I.getOperand(0); + Value *Op1 = I.getOperand(1); + Value *MulOp0; + // Matching "(i * 0x01010101...) >> 24". + if ((match(Op0, m_Mul(m_Value(MulOp0), m_SpecificInt(Mask01)))) && + match(Op1, m_SpecificInt(MaskShift))) { + Value *ShiftOp0; + // Matching "((i + (i >> 4)) & 0x0F0F0F0F...)". + if (match(MulOp0, m_And(m_c_Add(m_LShr(m_Value(ShiftOp0), m_SpecificInt(4)), + m_Deferred(ShiftOp0)), + m_SpecificInt(Mask0F)))) { + Value *AndOp0; + // Matching "(i & 0x33333333...) + ((i >> 2) & 0x33333333...)". + if (match(ShiftOp0, + m_c_Add(m_And(m_Value(AndOp0), m_SpecificInt(Mask33)), + m_And(m_LShr(m_Deferred(AndOp0), m_SpecificInt(2)), + m_SpecificInt(Mask33))))) { + Value *Root, *SubOp1; + // Matching "i - ((i >> 1) & 0x55555555...)". 
+ if (match(AndOp0, m_Sub(m_Value(Root), m_Value(SubOp1))) && + match(SubOp1, m_And(m_LShr(m_Specific(Root), m_SpecificInt(1)), + m_SpecificInt(Mask55)))) { + LLVM_DEBUG(dbgs() << "Recognized popcount intrinsic\n"); + IRBuilder<> Builder(&I); + Function *Func = Intrinsic::getDeclaration( + I.getModule(), Intrinsic::ctpop, I.getType()); + I.replaceAllUsesWith(Builder.CreateCall(Func, {Root})); + return true; + } + } + } + } + + return false; +} + /// This is the entry point for folds that could be implemented in regular /// InstCombine, but they are separated because they are not expected to /// occur frequently and/or have more than a constant-length pattern match. @@ -269,6 +335,7 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT) { for (Instruction &I : make_range(BB.rbegin(), BB.rend())) { MadeChange |= foldAnyOrAllBitsSet(I); MadeChange |= foldGuardedRotateToFunnelShift(I); + MadeChange |= tryToRecognizePopCount(I); } } @@ -303,7 +370,7 @@ void AggressiveInstCombinerLegacyPass::getAnalysisUsage( } bool AggressiveInstCombinerLegacyPass::runOnFunction(Function &F) { - auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); return runImpl(F, TLI, DT); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/CFGuard/CFGuard.cpp b/contrib/llvm-project/llvm/lib/Transforms/CFGuard/CFGuard.cpp new file mode 100644 index 000000000000..7c5e90cb53cd --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Transforms/CFGuard/CFGuard.cpp @@ -0,0 +1,305 @@ +//===-- CFGuard.cpp - Control Flow Guard checks -----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the IR transform to add Microsoft's Control Flow Guard +/// checks on Windows targets. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/CFGuard.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/Triple.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" + +using namespace llvm; + +using OperandBundleDef = OperandBundleDefT<Value *>; + +#define DEBUG_TYPE "cfguard" + +STATISTIC(CFGuardCounter, "Number of Control Flow Guard checks added"); + +namespace { + +/// Adds Control Flow Guard (CFG) checks on indirect function calls/invokes. +/// These checks ensure that the target address corresponds to the start of an +/// address-taken function. X86_64 targets use the CF_Dispatch mechanism. X86, +/// ARM, and AArch64 targets use the CF_Check machanism. +class CFGuard : public FunctionPass { +public: + static char ID; + + enum Mechanism { CF_Check, CF_Dispatch }; + + // Default constructor required for the INITIALIZE_PASS macro. + CFGuard() : FunctionPass(ID) { + initializeCFGuardPass(*PassRegistry::getPassRegistry()); + // By default, use the guard check mechanism. + GuardMechanism = CF_Check; + } + + // Recommended constructor used to specify the type of guard mechanism. 
+ CFGuard(Mechanism Var) : FunctionPass(ID) { + initializeCFGuardPass(*PassRegistry::getPassRegistry()); + GuardMechanism = Var; + } + + /// Inserts a Control Flow Guard (CFG) check on an indirect call using the CFG + /// check mechanism. When the image is loaded, the loader puts the appropriate + /// guard check function pointer in the __guard_check_icall_fptr global + /// symbol. This checks that the target address is a valid address-taken + /// function. The address of the target function is passed to the guard check + /// function in an architecture-specific register (e.g. ECX on 32-bit X86, + /// X15 on Aarch64, and R0 on ARM). The guard check function has no return + /// value (if the target is invalid, the guard check funtion will raise an + /// error). + /// + /// For example, the following LLVM IR: + /// \code + /// %func_ptr = alloca i32 ()*, align 8 + /// store i32 ()* @target_func, i32 ()** %func_ptr, align 8 + /// %0 = load i32 ()*, i32 ()** %func_ptr, align 8 + /// %1 = call i32 %0() + /// \endcode + /// + /// is transformed to: + /// \code + /// %func_ptr = alloca i32 ()*, align 8 + /// store i32 ()* @target_func, i32 ()** %func_ptr, align 8 + /// %0 = load i32 ()*, i32 ()** %func_ptr, align 8 + /// %1 = load void (i8*)*, void (i8*)** @__guard_check_icall_fptr + /// %2 = bitcast i32 ()* %0 to i8* + /// call cfguard_checkcc void %1(i8* %2) + /// %3 = call i32 %0() + /// \endcode + /// + /// For example, the following X86 assembly code: + /// \code + /// movl $_target_func, %eax + /// calll *%eax + /// \endcode + /// + /// is transformed to: + /// \code + /// movl $_target_func, %ecx + /// calll *___guard_check_icall_fptr + /// calll *%ecx + /// \endcode + /// + /// \param CB indirect call to instrument. + void insertCFGuardCheck(CallBase *CB); + + /// Inserts a Control Flow Guard (CFG) check on an indirect call using the CFG + /// dispatch mechanism. When the image is loaded, the loader puts the + /// appropriate guard check function pointer in the + /// __guard_dispatch_icall_fptr global symbol. This checks that the target + /// address is a valid address-taken function and, if so, tail calls the + /// target. The target address is passed in an architecture-specific register + /// (e.g. RAX on X86_64), with all other arguments for the target function + /// passed as usual. + /// + /// For example, the following LLVM IR: + /// \code + /// %func_ptr = alloca i32 ()*, align 8 + /// store i32 ()* @target_func, i32 ()** %func_ptr, align 8 + /// %0 = load i32 ()*, i32 ()** %func_ptr, align 8 + /// %1 = call i32 %0() + /// \endcode + /// + /// is transformed to: + /// \code + /// %func_ptr = alloca i32 ()*, align 8 + /// store i32 ()* @target_func, i32 ()** %func_ptr, align 8 + /// %0 = load i32 ()*, i32 ()** %func_ptr, align 8 + /// %1 = load i32 ()*, i32 ()** @__guard_dispatch_icall_fptr + /// %2 = call i32 %1() [ "cfguardtarget"(i32 ()* %0) ] + /// \endcode + /// + /// For example, the following X86_64 assembly code: + /// \code + /// leaq target_func(%rip), %rax + /// callq *%rax + /// \endcode + /// + /// is transformed to: + /// \code + /// leaq target_func(%rip), %rax + /// callq *__guard_dispatch_icall_fptr(%rip) + /// \endcode + /// + /// \param CB indirect call to instrument. + void insertCFGuardDispatch(CallBase *CB); + + bool doInitialization(Module &M) override; + bool runOnFunction(Function &F) override; + +private: + // Only add checks if the module has the cfguard=2 flag. 
+ int cfguard_module_flag = 0; + Mechanism GuardMechanism = CF_Check; + FunctionType *GuardFnType = nullptr; + PointerType *GuardFnPtrType = nullptr; + Constant *GuardFnGlobal = nullptr; +}; + +} // end anonymous namespace + +void CFGuard::insertCFGuardCheck(CallBase *CB) { + + assert(Triple(CB->getModule()->getTargetTriple()).isOSWindows() && + "Only applicable for Windows targets"); + assert(CB->isIndirectCall() && + "Control Flow Guard checks can only be added to indirect calls"); + + IRBuilder<> B(CB); + Value *CalledOperand = CB->getCalledOperand(); + + // Load the global symbol as a pointer to the check function. + LoadInst *GuardCheckLoad = B.CreateLoad(GuardFnPtrType, GuardFnGlobal); + + // Create new call instruction. The CFGuard check should always be a call, + // even if the original CallBase is an Invoke or CallBr instruction. + CallInst *GuardCheck = + B.CreateCall(GuardFnType, GuardCheckLoad, + {B.CreateBitCast(CalledOperand, B.getInt8PtrTy())}); + + // Ensure that the first argument is passed in the correct register + // (e.g. ECX on 32-bit X86 targets). + GuardCheck->setCallingConv(CallingConv::CFGuard_Check); +} + +void CFGuard::insertCFGuardDispatch(CallBase *CB) { + + assert(Triple(CB->getModule()->getTargetTriple()).isOSWindows() && + "Only applicable for Windows targets"); + assert(CB->isIndirectCall() && + "Control Flow Guard checks can only be added to indirect calls"); + + IRBuilder<> B(CB); + Value *CalledOperand = CB->getCalledOperand(); + Type *CalledOperandType = CalledOperand->getType(); + + // Cast the guard dispatch global to the type of the called operand. + PointerType *PTy = PointerType::get(CalledOperandType, 0); + if (GuardFnGlobal->getType() != PTy) + GuardFnGlobal = ConstantExpr::getBitCast(GuardFnGlobal, PTy); + + // Load the global as a pointer to a function of the same type. + LoadInst *GuardDispatchLoad = B.CreateLoad(CalledOperandType, GuardFnGlobal); + + // Add the original call target as a cfguardtarget operand bundle. + SmallVector<llvm::OperandBundleDef, 1> Bundles; + CB->getOperandBundlesAsDefs(Bundles); + Bundles.emplace_back("cfguardtarget", CalledOperand); + + // Create a copy of the call/invoke instruction and add the new bundle. + CallBase *NewCB; + if (CallInst *CI = dyn_cast<CallInst>(CB)) { + NewCB = CallInst::Create(CI, Bundles, CB); + } else { + assert(isa<InvokeInst>(CB) && "Unknown indirect call type"); + InvokeInst *II = cast<InvokeInst>(CB); + NewCB = llvm::InvokeInst::Create(II, Bundles, CB); + } + + // Change the target of the call to be the guard dispatch function. + NewCB->setCalledOperand(GuardDispatchLoad); + + // Replace the original call/invoke with the new instruction. + CB->replaceAllUsesWith(NewCB); + + // Delete the original call/invoke. + CB->eraseFromParent(); +} + +bool CFGuard::doInitialization(Module &M) { + + // Check if this module has the cfguard flag and read its value. + if (auto *MD = + mdconst::extract_or_null<ConstantInt>(M.getModuleFlag("cfguard"))) + cfguard_module_flag = MD->getZExtValue(); + + // Skip modules for which CFGuard checks have been disabled. + if (cfguard_module_flag != 2) + return false; + + // Set up prototypes for the guard check and dispatch functions. + GuardFnType = FunctionType::get(Type::getVoidTy(M.getContext()), + {Type::getInt8PtrTy(M.getContext())}, false); + GuardFnPtrType = PointerType::get(GuardFnType, 0); + + // Get or insert the guard check or dispatch global symbols. 
+ if (GuardMechanism == CF_Check) { + GuardFnGlobal = + M.getOrInsertGlobal("__guard_check_icall_fptr", GuardFnPtrType); + } else { + assert(GuardMechanism == CF_Dispatch && "Invalid CFGuard mechanism"); + GuardFnGlobal = + M.getOrInsertGlobal("__guard_dispatch_icall_fptr", GuardFnPtrType); + } + + return true; +} + +bool CFGuard::runOnFunction(Function &F) { + + // Skip modules for which CFGuard checks have been disabled. + if (cfguard_module_flag != 2) + return false; + + SmallVector<CallBase *, 8> IndirectCalls; + + // Iterate over the instructions to find all indirect call/invoke/callbr + // instructions. Make a separate list of pointers to indirect + // call/invoke/callbr instructions because the original instructions will be + // deleted as the checks are added. + for (BasicBlock &BB : F.getBasicBlockList()) { + for (Instruction &I : BB.getInstList()) { + auto *CB = dyn_cast<CallBase>(&I); + if (CB && CB->isIndirectCall() && !CB->hasFnAttr("guard_nocf")) { + IndirectCalls.push_back(CB); + CFGuardCounter++; + } + } + } + + // If no checks are needed, return early. + if (IndirectCalls.empty()) { + return false; + } + + // For each indirect call/invoke, add the appropriate dispatch or check. + if (GuardMechanism == CF_Dispatch) { + for (CallBase *CB : IndirectCalls) { + insertCFGuardDispatch(CB); + } + } else { + for (CallBase *CB : IndirectCalls) { + insertCFGuardCheck(CB); + } + } + + return true; +} + +char CFGuard::ID = 0; +INITIALIZE_PASS(CFGuard, "CFGuard", "CFGuard", false, false) + +FunctionPass *llvm::createCFGuardCheckPass() { + return new CFGuard(CFGuard::CF_Check); +} + +FunctionPass *llvm::createCFGuardDispatchPass() { + return new CFGuard(CFGuard::CF_Dispatch); +}
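The two factory functions above (createCFGuardCheckPass and createCFGuardDispatchPass) are the entry points exposed through llvm/Transforms/CFGuard.h. A minimal usage sketch, assuming a legacy pass manager and a module M that already targets Windows and carries the cfguard=2 module flag (the driver function name is illustrative, not part of this change):

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Transforms/CFGuard.h"

// Hypothetical driver: instrument every indirect call in M with the
// CF_Dispatch mechanism; use createCFGuardCheckPass() for CF_Check instead.
void addControlFlowGuard(llvm::Module &M) {
  llvm::legacy::PassManager PM;
  PM.add(llvm::createCFGuardDispatchPass());
  PM.run(M);
}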
\ No newline at end of file diff --git a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp index 1fb0a114d0c7..c2dbd6f41642 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp @@ -73,6 +73,8 @@ bool Lowerer::lowerRemainingCoroIntrinsics(Function &F) { II->replaceAllUsesWith(ConstantInt::getTrue(Context)); break; case Intrinsic::coro_id: + case Intrinsic::coro_id_retcon: + case Intrinsic::coro_id_retcon_once: II->replaceAllUsesWith(ConstantTokenNone::get(Context)); break; case Intrinsic::coro_subfn_addr: @@ -97,11 +99,11 @@ bool Lowerer::lowerRemainingCoroIntrinsics(Function &F) { namespace { -struct CoroCleanup : FunctionPass { +struct CoroCleanupLegacy : FunctionPass { static char ID; // Pass identification, replacement for typeid - CoroCleanup() : FunctionPass(ID) { - initializeCoroCleanupPass(*PassRegistry::getPassRegistry()); + CoroCleanupLegacy() : FunctionPass(ID) { + initializeCoroCleanupLegacyPass(*PassRegistry::getPassRegistry()); } std::unique_ptr<Lowerer> L; @@ -111,8 +113,9 @@ struct CoroCleanup : FunctionPass { bool doInitialization(Module &M) override { if (coro::declaresIntrinsics(M, {"llvm.coro.alloc", "llvm.coro.begin", "llvm.coro.subfn.addr", "llvm.coro.free", - "llvm.coro.id"})) - L = llvm::make_unique<Lowerer>(M); + "llvm.coro.id", "llvm.coro.id.retcon", + "llvm.coro.id.retcon.once"})) + L = std::make_unique<Lowerer>(M); return false; } @@ -129,8 +132,8 @@ struct CoroCleanup : FunctionPass { }; } -char CoroCleanup::ID = 0; -INITIALIZE_PASS(CoroCleanup, "coro-cleanup", +char CoroCleanupLegacy::ID = 0; +INITIALIZE_PASS(CoroCleanupLegacy, "coro-cleanup", "Lower all coroutine related intrinsics", false, false) -Pass *llvm::createCoroCleanupPass() { return new CoroCleanup(); } +Pass *llvm::createCoroCleanupLegacyPass() { return new CoroCleanupLegacy(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroEarly.cpp b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroEarly.cpp index 692697d6f32e..e73fb9eeb1e9 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroEarly.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroEarly.cpp @@ -22,7 +22,7 @@ using namespace llvm; #define DEBUG_TYPE "coro-early" namespace { -// Created on demand if CoroEarly pass has work to do. +// Created on demand if the coro-early pass has work to do. class Lowerer : public coro::LowererBase { IRBuilder<> Builder; PointerType *const AnyResumeFnPtrTy; @@ -91,13 +91,14 @@ void Lowerer::lowerCoroDone(IntrinsicInst *II) { Value *Operand = II->getArgOperand(0); // ResumeFnAddr is the first pointer sized element of the coroutine frame. 
+ static_assert(coro::Shape::SwitchFieldIndex::Resume == 0, + "resume function not at offset zero"); auto *FrameTy = Int8Ptr; PointerType *FramePtrTy = FrameTy->getPointerTo(); Builder.SetInsertPoint(II); auto *BCI = Builder.CreateBitCast(Operand, FramePtrTy); - auto *Gep = Builder.CreateConstInBoundsGEP1_32(FrameTy, BCI, 0); - auto *Load = Builder.CreateLoad(FrameTy, Gep); + auto *Load = Builder.CreateLoad(BCI); auto *Cond = Builder.CreateICmpEQ(Load, NullPtr); II->replaceAllUsesWith(Cond); @@ -189,6 +190,10 @@ bool Lowerer::lowerEarlyIntrinsics(Function &F) { } } break; + case Intrinsic::coro_id_retcon: + case Intrinsic::coro_id_retcon_once: + F.addFnAttr(CORO_PRESPLIT_ATTR, PREPARED_FOR_SPLIT); + break; case Intrinsic::coro_resume: lowerResumeOrDestroy(CS, CoroSubFnInst::ResumeIndex); break; @@ -220,10 +225,10 @@ bool Lowerer::lowerEarlyIntrinsics(Function &F) { namespace { -struct CoroEarly : public FunctionPass { +struct CoroEarlyLegacy : public FunctionPass { static char ID; // Pass identification, replacement for typeid. - CoroEarly() : FunctionPass(ID) { - initializeCoroEarlyPass(*PassRegistry::getPassRegistry()); + CoroEarlyLegacy() : FunctionPass(ID) { + initializeCoroEarlyLegacyPass(*PassRegistry::getPassRegistry()); } std::unique_ptr<Lowerer> L; @@ -231,11 +236,18 @@ struct CoroEarly : public FunctionPass { // This pass has work to do only if we find intrinsics we are going to lower // in the module. bool doInitialization(Module &M) override { - if (coro::declaresIntrinsics( - M, {"llvm.coro.id", "llvm.coro.destroy", "llvm.coro.done", - "llvm.coro.end", "llvm.coro.noop", "llvm.coro.free", - "llvm.coro.promise", "llvm.coro.resume", "llvm.coro.suspend"})) - L = llvm::make_unique<Lowerer>(M); + if (coro::declaresIntrinsics(M, {"llvm.coro.id", + "llvm.coro.id.retcon", + "llvm.coro.id.retcon.once", + "llvm.coro.destroy", + "llvm.coro.done", + "llvm.coro.end", + "llvm.coro.noop", + "llvm.coro.free", + "llvm.coro.promise", + "llvm.coro.resume", + "llvm.coro.suspend"})) + L = std::make_unique<Lowerer>(M); return false; } @@ -255,8 +267,8 @@ struct CoroEarly : public FunctionPass { }; } -char CoroEarly::ID = 0; -INITIALIZE_PASS(CoroEarly, "coro-early", "Lower early coroutine intrinsics", - false, false) +char CoroEarlyLegacy::ID = 0; +INITIALIZE_PASS(CoroEarlyLegacy, "coro-early", + "Lower early coroutine intrinsics", false, false) -Pass *llvm::createCoroEarlyPass() { return new CoroEarly(); } +Pass *llvm::createCoroEarlyLegacyPass() { return new CoroEarlyLegacy(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroElide.cpp b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroElide.cpp index 6707aa1c827d..23d22e23861a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroElide.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroElide.cpp @@ -15,6 +15,7 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/InstIterator.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/ErrorHandling.h" @@ -23,7 +24,7 @@ using namespace llvm; #define DEBUG_TYPE "coro-elide" namespace { -// Created on demand if CoroElide pass has work to do. +// Created on demand if the coro-elide pass has work to do. 
struct Lowerer : coro::LowererBase { SmallVector<CoroIdInst *, 4> CoroIds; SmallVector<CoroBeginInst *, 1> CoroBegins; @@ -276,17 +277,17 @@ static bool replaceDevirtTrigger(Function &F) { //===----------------------------------------------------------------------===// namespace { -struct CoroElide : FunctionPass { +struct CoroElideLegacy : FunctionPass { static char ID; - CoroElide() : FunctionPass(ID) { - initializeCoroElidePass(*PassRegistry::getPassRegistry()); + CoroElideLegacy() : FunctionPass(ID) { + initializeCoroElideLegacyPass(*PassRegistry::getPassRegistry()); } std::unique_ptr<Lowerer> L; bool doInitialization(Module &M) override { if (coro::declaresIntrinsics(M, {"llvm.coro.id"})) - L = llvm::make_unique<Lowerer>(M); + L = std::make_unique<Lowerer>(M); return false; } @@ -329,15 +330,15 @@ struct CoroElide : FunctionPass { }; } -char CoroElide::ID = 0; +char CoroElideLegacy::ID = 0; INITIALIZE_PASS_BEGIN( - CoroElide, "coro-elide", + CoroElideLegacy, "coro-elide", "Coroutine frame allocation elision and indirect calls replacement", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END( - CoroElide, "coro-elide", + CoroElideLegacy, "coro-elide", "Coroutine frame allocation elision and indirect calls replacement", false, false) -Pass *llvm::createCoroElidePass() { return new CoroElide(); } +Pass *llvm::createCoroElideLegacyPass() { return new CoroElideLegacy(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index 58bf22bee29b..2c42cf8a6d25 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -18,6 +18,7 @@ #include "CoroInternal.h" #include "llvm/ADT/BitVector.h" +#include "llvm/Analysis/PtrUseVisitor.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/CFG.h" @@ -28,6 +29,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/circular_raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/PromoteMemToReg.h" using namespace llvm; @@ -120,6 +122,15 @@ struct SuspendCrossingInfo { return false; BasicBlock *UseBB = I->getParent(); + + // As a special case, treat uses by an llvm.coro.suspend.retcon + // as if they were uses in the suspend's single predecessor: the + // uses conceptually occur before the suspend. + if (isa<CoroSuspendRetconInst>(I)) { + UseBB = UseBB->getSinglePredecessor(); + assert(UseBB && "should have split coro.suspend into its own block"); + } + return hasPathCrossingSuspendPoint(DefBB, UseBB); } @@ -128,7 +139,17 @@ struct SuspendCrossingInfo { } bool isDefinitionAcrossSuspend(Instruction &I, User *U) const { - return isDefinitionAcrossSuspend(I.getParent(), U); + auto *DefBB = I.getParent(); + + // As a special case, treat values produced by an llvm.coro.suspend.* + // as if they were defined in the single successor: the uses + // conceptually occur after the suspend. 
+ if (isa<AnyCoroSuspendInst>(I)) { + DefBB = DefBB->getSingleSuccessor(); + assert(DefBB && "should have split coro.suspend into its own block"); + } + + return isDefinitionAcrossSuspend(DefBB, U); } }; } // end anonymous namespace @@ -183,9 +204,10 @@ SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape) B.Suspend = true; B.Kills |= B.Consumes; }; - for (CoroSuspendInst *CSI : Shape.CoroSuspends) { + for (auto *CSI : Shape.CoroSuspends) { markSuspendBlock(CSI); - markSuspendBlock(CSI->getCoroSave()); + if (auto *Save = CSI->getCoroSave()) + markSuspendBlock(Save); } // Iterate propagating consumes and kills until they stop changing. @@ -261,11 +283,13 @@ SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape) // We build up the list of spills for every case where a use is separated // from the definition by a suspend point. +static const unsigned InvalidFieldIndex = ~0U; + namespace { class Spill { Value *Def = nullptr; Instruction *User = nullptr; - unsigned FieldNo = 0; + unsigned FieldNo = InvalidFieldIndex; public: Spill(Value *Def, llvm::User *U) : Def(Def), User(cast<Instruction>(U)) {} @@ -280,11 +304,11 @@ public: // the definition the first time they encounter it. Consider refactoring // SpillInfo into two arrays to normalize the spill representation. unsigned fieldIndex() const { - assert(FieldNo && "Accessing unassigned field"); + assert(FieldNo != InvalidFieldIndex && "Accessing unassigned field"); return FieldNo; } void setFieldIndex(unsigned FieldNumber) { - assert(!FieldNo && "Reassigning field number"); + assert(FieldNo == InvalidFieldIndex && "Reassigning field number"); FieldNo = FieldNumber; } }; @@ -376,18 +400,30 @@ static StructType *buildFrameType(Function &F, coro::Shape &Shape, SmallString<32> Name(F.getName()); Name.append(".Frame"); StructType *FrameTy = StructType::create(C, Name); - auto *FramePtrTy = FrameTy->getPointerTo(); - auto *FnTy = FunctionType::get(Type::getVoidTy(C), FramePtrTy, - /*isVarArg=*/false); - auto *FnPtrTy = FnTy->getPointerTo(); - - // Figure out how wide should be an integer type storing the suspend index. - unsigned IndexBits = std::max(1U, Log2_64_Ceil(Shape.CoroSuspends.size())); - Type *PromiseType = Shape.PromiseAlloca - ? Shape.PromiseAlloca->getType()->getElementType() - : Type::getInt1Ty(C); - SmallVector<Type *, 8> Types{FnPtrTy, FnPtrTy, PromiseType, - Type::getIntNTy(C, IndexBits)}; + SmallVector<Type *, 8> Types; + + AllocaInst *PromiseAlloca = Shape.getPromiseAlloca(); + + if (Shape.ABI == coro::ABI::Switch) { + auto *FramePtrTy = FrameTy->getPointerTo(); + auto *FnTy = FunctionType::get(Type::getVoidTy(C), FramePtrTy, + /*IsVarArg=*/false); + auto *FnPtrTy = FnTy->getPointerTo(); + + // Figure out how wide should be an integer type storing the suspend index. + unsigned IndexBits = std::max(1U, Log2_64_Ceil(Shape.CoroSuspends.size())); + Type *PromiseType = PromiseAlloca + ? PromiseAlloca->getType()->getElementType() + : Type::getInt1Ty(C); + Type *IndexType = Type::getIntNTy(C, IndexBits); + Types.push_back(FnPtrTy); + Types.push_back(FnPtrTy); + Types.push_back(PromiseType); + Types.push_back(IndexType); + } else { + assert(PromiseAlloca == nullptr && "lowering doesn't support promises"); + } + Value *CurrentDef = nullptr; Padder.addTypes(Types); @@ -399,7 +435,7 @@ static StructType *buildFrameType(Function &F, coro::Shape &Shape, CurrentDef = S.def(); // PromiseAlloca was already added to Types array earlier. 
- if (CurrentDef == Shape.PromiseAlloca) + if (CurrentDef == PromiseAlloca) continue; uint64_t Count = 1; @@ -430,9 +466,80 @@ static StructType *buildFrameType(Function &F, coro::Shape &Shape, } FrameTy->setBody(Types); + switch (Shape.ABI) { + case coro::ABI::Switch: + break; + + // Remember whether the frame is inline in the storage. + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: { + auto &Layout = F.getParent()->getDataLayout(); + auto Id = Shape.getRetconCoroId(); + Shape.RetconLowering.IsFrameInlineInStorage + = (Layout.getTypeAllocSize(FrameTy) <= Id->getStorageSize() && + Layout.getABITypeAlignment(FrameTy) <= Id->getStorageAlignment()); + break; + } + } + return FrameTy; } +// We use a pointer use visitor to discover if there are any writes into an +// alloca that dominates CoroBegin. If that is the case, insertSpills will copy +// the value from the alloca into the coroutine frame spill slot corresponding +// to that alloca. +namespace { +struct AllocaUseVisitor : PtrUseVisitor<AllocaUseVisitor> { + using Base = PtrUseVisitor<AllocaUseVisitor>; + AllocaUseVisitor(const DataLayout &DL, const DominatorTree &DT, + const CoroBeginInst &CB) + : PtrUseVisitor(DL), DT(DT), CoroBegin(CB) {} + + // We are only interested in uses that dominate coro.begin. + void visit(Instruction &I) { + if (DT.dominates(&I, &CoroBegin)) + Base::visit(I); + } + // We need to provide this overload as PtrUseVisitor uses a pointer based + // visiting function. + void visit(Instruction *I) { return visit(*I); } + + void visitLoadInst(LoadInst &) {} // Good. Nothing to do. + + // If the use is an operand, the pointer escaped and anything can write into + // that memory. If the use is the pointer, we are definitely writing into the + // alloca and therefore we need to copy. + void visitStoreInst(StoreInst &SI) { PI.setAborted(&SI); } + + // Any other instruction that is not filtered out by PtrUseVisitor, will + // result in the copy. + void visitInstruction(Instruction &I) { PI.setAborted(&I); } + +private: + const DominatorTree &DT; + const CoroBeginInst &CoroBegin; +}; +} // namespace +static bool mightWriteIntoAllocaPtr(AllocaInst &A, const DominatorTree &DT, + const CoroBeginInst &CB) { + const DataLayout &DL = A.getModule()->getDataLayout(); + AllocaUseVisitor Visitor(DL, DT, CB); + auto PtrI = Visitor.visitPtr(A); + if (PtrI.isEscaped() || PtrI.isAborted()) { + auto *PointerEscapingInstr = PtrI.getEscapingInst() + ? PtrI.getEscapingInst() + : PtrI.getAbortingInst(); + if (PointerEscapingInstr) { + LLVM_DEBUG( + dbgs() << "AllocaInst copy was triggered by instruction: " + << *PointerEscapingInstr << "\n"); + } + return true; + } + return false; +} + // We need to make room to insert a spill after initial PHIs, but before // catchswitch instruction. Placing it before violates the requirement that // catchswitch, like all other EHPads must be the first nonPHI in a block. 
@@ -476,7 +583,7 @@ static Instruction *splitBeforeCatchSwitch(CatchSwitchInst *CatchSwitch) { // whatever // // -static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) { +static Instruction *insertSpills(const SpillInfo &Spills, coro::Shape &Shape) { auto *CB = Shape.CoroBegin; LLVMContext &C = CB->getContext(); IRBuilder<> Builder(CB->getNextNode()); @@ -484,11 +591,14 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) { PointerType *FramePtrTy = FrameTy->getPointerTo(); auto *FramePtr = cast<Instruction>(Builder.CreateBitCast(CB, FramePtrTy, "FramePtr")); + DominatorTree DT(*CB->getFunction()); Value *CurrentValue = nullptr; BasicBlock *CurrentBlock = nullptr; Value *CurrentReload = nullptr; - unsigned Index = 0; // Proper field number will be read from field definition. + + // Proper field number will be read from field definition. + unsigned Index = InvalidFieldIndex; // We need to keep track of any allocas that need "spilling" // since they will live in the coroutine frame now, all access to them @@ -496,9 +606,11 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) { // we remember allocas and their indices to be handled once we processed // all the spills. SmallVector<std::pair<AllocaInst *, unsigned>, 4> Allocas; - // Promise alloca (if present) has a fixed field number (Shape::PromiseField) - if (Shape.PromiseAlloca) - Allocas.emplace_back(Shape.PromiseAlloca, coro::Shape::PromiseField); + // Promise alloca (if present) has a fixed field number. + if (auto *PromiseAlloca = Shape.getPromiseAlloca()) { + assert(Shape.ABI == coro::ABI::Switch); + Allocas.emplace_back(PromiseAlloca, coro::Shape::SwitchFieldIndex::Promise); + } // Create a GEP with the given index into the coroutine frame for the original // value Orig. Appends an extra 0 index for array-allocas, preserving the @@ -526,7 +638,7 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) { // Create a load instruction to reload the spilled value from the coroutine // frame. auto CreateReload = [&](Instruction *InsertBefore) { - assert(Index && "accessing unassigned field number"); + assert(Index != InvalidFieldIndex && "accessing unassigned field number"); Builder.SetInsertPoint(InsertBefore); auto *G = GetFramePointer(Index, CurrentValue); @@ -558,29 +670,45 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) { // coroutine frame. Instruction *InsertPt = nullptr; - if (isa<Argument>(CurrentValue)) { + if (auto Arg = dyn_cast<Argument>(CurrentValue)) { // For arguments, we will place the store instruction right after // the coroutine frame pointer instruction, i.e. bitcast of // coro.begin from i8* to %f.frame*. InsertPt = FramePtr->getNextNode(); + + // If we're spilling an Argument, make sure we clear 'nocapture' + // from the coroutine function. + Arg->getParent()->removeParamAttr(Arg->getArgNo(), + Attribute::NoCapture); + } else if (auto *II = dyn_cast<InvokeInst>(CurrentValue)) { // If we are spilling the result of the invoke instruction, split the // normal edge and insert the spill in the new block. auto NewBB = SplitEdge(II->getParent(), II->getNormalDest()); InsertPt = NewBB->getTerminator(); - } else if (dyn_cast<PHINode>(CurrentValue)) { + } else if (isa<PHINode>(CurrentValue)) { // Skip the PHINodes and EH pads instructions. 
BasicBlock *DefBlock = cast<Instruction>(E.def())->getParent(); if (auto *CSI = dyn_cast<CatchSwitchInst>(DefBlock->getTerminator())) InsertPt = splitBeforeCatchSwitch(CSI); else InsertPt = &*DefBlock->getFirstInsertionPt(); + } else if (auto CSI = dyn_cast<AnyCoroSuspendInst>(CurrentValue)) { + // Don't spill immediately after a suspend; splitting assumes + // that the suspend will be followed by a branch. + InsertPt = CSI->getParent()->getSingleSuccessor()->getFirstNonPHI(); } else { + auto *I = cast<Instruction>(E.def()); + assert(!I->isTerminator() && "unexpected terminator"); // For all other values, the spill is placed immediately after // the definition. - assert(!cast<Instruction>(E.def())->isTerminator() && - "unexpected terminator"); - InsertPt = cast<Instruction>(E.def())->getNextNode(); + if (DT.dominates(CB, I)) { + InsertPt = I->getNextNode(); + } else { + // Unless, it is not dominated by CoroBegin, then it will be + // inserted immediately after CoroFrame is computed. + InsertPt = FramePtr->getNextNode(); + } } Builder.SetInsertPoint(InsertPt); @@ -613,21 +741,53 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) { } BasicBlock *FramePtrBB = FramePtr->getParent(); - Shape.AllocaSpillBlock = - FramePtrBB->splitBasicBlock(FramePtr->getNextNode(), "AllocaSpillBB"); - Shape.AllocaSpillBlock->splitBasicBlock(&Shape.AllocaSpillBlock->front(), - "PostSpill"); - Builder.SetInsertPoint(&Shape.AllocaSpillBlock->front()); + auto SpillBlock = + FramePtrBB->splitBasicBlock(FramePtr->getNextNode(), "AllocaSpillBB"); + SpillBlock->splitBasicBlock(&SpillBlock->front(), "PostSpill"); + Shape.AllocaSpillBlock = SpillBlock; // If we found any allocas, replace all of their remaining uses with Geps. + // Note: we cannot do it indiscriminately as some of the uses may not be + // dominated by CoroBegin. + bool MightNeedToCopy = false; + Builder.SetInsertPoint(&Shape.AllocaSpillBlock->front()); + SmallVector<Instruction *, 4> UsersToUpdate; for (auto &P : Allocas) { - auto *G = GetFramePointer(P.second, P.first); + AllocaInst *const A = P.first; + UsersToUpdate.clear(); + for (User *U : A->users()) { + auto *I = cast<Instruction>(U); + if (DT.dominates(CB, I)) + UsersToUpdate.push_back(I); + else + MightNeedToCopy = true; + } + if (!UsersToUpdate.empty()) { + auto *G = GetFramePointer(P.second, A); + G->takeName(A); + for (Instruction *I : UsersToUpdate) + I->replaceUsesOfWith(A, G); + } + } + // If we discovered such uses not dominated by CoroBegin, see if any of them + // preceed coro begin and have instructions that can modify the + // value of the alloca and therefore would require a copying the value into + // the spill slot in the coroutine frame. + if (MightNeedToCopy) { + Builder.SetInsertPoint(FramePtr->getNextNode()); + + for (auto &P : Allocas) { + AllocaInst *const A = P.first; + if (mightWriteIntoAllocaPtr(*A, DT, *CB)) { + if (A->isArrayAllocation()) + report_fatal_error( + "Coroutines cannot handle copying of array allocas yet"); - // We are not using ReplaceInstWithInst(P.first, cast<Instruction>(G)) here, - // as we are changing location of the instruction. - G->takeName(P.first); - P.first->replaceAllUsesWith(G); - P.first->eraseFromParent(); + auto *G = GetFramePointer(P.second, A); + auto *Value = Builder.CreateLoad(A); + Builder.CreateStore(Value, G); + } + } } return FramePtr; } @@ -829,52 +989,6 @@ static void rewriteMaterializableInstructions(IRBuilder<> &IRB, } } -// Move early uses of spilled variable after CoroBegin. 
-// For example, if a parameter had address taken, we may end up with the code -// like: -// define @f(i32 %n) { -// %n.addr = alloca i32 -// store %n, %n.addr -// ... -// call @coro.begin -// we need to move the store after coro.begin -static void moveSpillUsesAfterCoroBegin(Function &F, SpillInfo const &Spills, - CoroBeginInst *CoroBegin) { - DominatorTree DT(F); - SmallVector<Instruction *, 8> NeedsMoving; - - Value *CurrentValue = nullptr; - - for (auto const &E : Spills) { - if (CurrentValue == E.def()) - continue; - - CurrentValue = E.def(); - - for (User *U : CurrentValue->users()) { - Instruction *I = cast<Instruction>(U); - if (!DT.dominates(CoroBegin, I)) { - LLVM_DEBUG(dbgs() << "will move: " << *I << "\n"); - - // TODO: Make this more robust. Currently if we run into a situation - // where simple instruction move won't work we panic and - // report_fatal_error. - for (User *UI : I->users()) { - if (!DT.dominates(CoroBegin, cast<Instruction>(UI))) - report_fatal_error("cannot move instruction since its users are not" - " dominated by CoroBegin"); - } - - NeedsMoving.push_back(I); - } - } - } - - Instruction *InsertPt = CoroBegin->getNextNode(); - for (Instruction *I : NeedsMoving) - I->moveBefore(InsertPt); -} - // Splits the block at a particular instruction unless it is the first // instruction in the block with a single predecessor. static BasicBlock *splitBlockIfNotFirst(Instruction *I, const Twine &Name) { @@ -895,21 +1009,337 @@ static void splitAround(Instruction *I, const Twine &Name) { splitBlockIfNotFirst(I->getNextNode(), "After" + Name); } +static bool isSuspendBlock(BasicBlock *BB) { + return isa<AnyCoroSuspendInst>(BB->front()); +} + +typedef SmallPtrSet<BasicBlock*, 8> VisitedBlocksSet; + +/// Does control flow starting at the given block ever reach a suspend +/// instruction before reaching a block in VisitedOrFreeBBs? +static bool isSuspendReachableFrom(BasicBlock *From, + VisitedBlocksSet &VisitedOrFreeBBs) { + // Eagerly try to add this block to the visited set. If it's already + // there, stop recursing; this path doesn't reach a suspend before + // either looping or reaching a freeing block. + if (!VisitedOrFreeBBs.insert(From).second) + return false; + + // We assume that we'll already have split suspends into their own blocks. + if (isSuspendBlock(From)) + return true; + + // Recurse on the successors. + for (auto Succ : successors(From)) { + if (isSuspendReachableFrom(Succ, VisitedOrFreeBBs)) + return true; + } + + return false; +} + +/// Is the given alloca "local", i.e. bounded in lifetime to not cross a +/// suspend point? +static bool isLocalAlloca(CoroAllocaAllocInst *AI) { + // Seed the visited set with all the basic blocks containing a free + // so that we won't pass them up. + VisitedBlocksSet VisitedOrFreeBBs; + for (auto User : AI->users()) { + if (auto FI = dyn_cast<CoroAllocaFreeInst>(User)) + VisitedOrFreeBBs.insert(FI->getParent()); + } + + return !isSuspendReachableFrom(AI->getParent(), VisitedOrFreeBBs); +} + +/// After we split the coroutine, will the given basic block be along +/// an obvious exit path for the resumption function? +static bool willLeaveFunctionImmediatelyAfter(BasicBlock *BB, + unsigned depth = 3) { + // If we've bottomed out our depth count, stop searching and assume + // that the path might loop back. + if (depth == 0) return false; + + // If this is a suspend block, we're about to exit the resumption function. + if (isSuspendBlock(BB)) return true; + + // Recurse into the successors. 
+ for (auto Succ : successors(BB)) { + if (!willLeaveFunctionImmediatelyAfter(Succ, depth - 1)) + return false; + } + + // If none of the successors leads back in a loop, we're on an exit/abort. + return true; +} + +static bool localAllocaNeedsStackSave(CoroAllocaAllocInst *AI) { + // Look for a free that isn't sufficiently obviously followed by + // either a suspend or a termination, i.e. something that will leave + // the coro resumption frame. + for (auto U : AI->users()) { + auto FI = dyn_cast<CoroAllocaFreeInst>(U); + if (!FI) continue; + + if (!willLeaveFunctionImmediatelyAfter(FI->getParent())) + return true; + } + + // If we never found one, we don't need a stack save. + return false; +} + +/// Turn each of the given local allocas into a normal (dynamic) alloca +/// instruction. +static void lowerLocalAllocas(ArrayRef<CoroAllocaAllocInst*> LocalAllocas, + SmallVectorImpl<Instruction*> &DeadInsts) { + for (auto AI : LocalAllocas) { + auto M = AI->getModule(); + IRBuilder<> Builder(AI); + + // Save the stack depth. Try to avoid doing this if the stackrestore + // is going to immediately precede a return or something. + Value *StackSave = nullptr; + if (localAllocaNeedsStackSave(AI)) + StackSave = Builder.CreateCall( + Intrinsic::getDeclaration(M, Intrinsic::stacksave)); + + // Allocate memory. + auto Alloca = Builder.CreateAlloca(Builder.getInt8Ty(), AI->getSize()); + Alloca->setAlignment(MaybeAlign(AI->getAlignment())); + + for (auto U : AI->users()) { + // Replace gets with the allocation. + if (isa<CoroAllocaGetInst>(U)) { + U->replaceAllUsesWith(Alloca); + + // Replace frees with stackrestores. This is safe because + // alloca.alloc is required to obey a stack discipline, although we + // don't enforce that structurally. + } else { + auto FI = cast<CoroAllocaFreeInst>(U); + if (StackSave) { + Builder.SetInsertPoint(FI); + Builder.CreateCall( + Intrinsic::getDeclaration(M, Intrinsic::stackrestore), + StackSave); + } + } + DeadInsts.push_back(cast<Instruction>(U)); + } + + DeadInsts.push_back(AI); + } +} + +/// Turn the given coro.alloca.alloc call into a dynamic allocation. +/// This happens during the all-instructions iteration, so it must not +/// delete the call. +static Instruction *lowerNonLocalAlloca(CoroAllocaAllocInst *AI, + coro::Shape &Shape, + SmallVectorImpl<Instruction*> &DeadInsts) { + IRBuilder<> Builder(AI); + auto Alloc = Shape.emitAlloc(Builder, AI->getSize(), nullptr); + + for (User *U : AI->users()) { + if (isa<CoroAllocaGetInst>(U)) { + U->replaceAllUsesWith(Alloc); + } else { + auto FI = cast<CoroAllocaFreeInst>(U); + Builder.SetInsertPoint(FI); + Shape.emitDealloc(Builder, Alloc, nullptr); + } + DeadInsts.push_back(cast<Instruction>(U)); + } + + // Push this on last so that it gets deleted after all the others. + DeadInsts.push_back(AI); + + // Return the new allocation value so that we can check for needed spills. + return cast<Instruction>(Alloc); +} + +/// Get the current swifterror value. +static Value *emitGetSwiftErrorValue(IRBuilder<> &Builder, Type *ValueTy, + coro::Shape &Shape) { + // Make a fake function pointer as a sort of intrinsic. + auto FnTy = FunctionType::get(ValueTy, {}, false); + auto Fn = ConstantPointerNull::get(FnTy->getPointerTo()); + + auto Call = Builder.CreateCall(Fn, {}); + Shape.SwiftErrorOps.push_back(Call); + + return Call; +} + +/// Set the given value as the current swifterror value. +/// +/// Returns a slot that can be used as a swifterror slot. 
+static Value *emitSetSwiftErrorValue(IRBuilder<> &Builder, Value *V, + coro::Shape &Shape) { + // Make a fake function pointer as a sort of intrinsic. + auto FnTy = FunctionType::get(V->getType()->getPointerTo(), + {V->getType()}, false); + auto Fn = ConstantPointerNull::get(FnTy->getPointerTo()); + + auto Call = Builder.CreateCall(Fn, { V }); + Shape.SwiftErrorOps.push_back(Call); + + return Call; +} + +/// Set the swifterror value from the given alloca before a call, +/// then put in back in the alloca afterwards. +/// +/// Returns an address that will stand in for the swifterror slot +/// until splitting. +static Value *emitSetAndGetSwiftErrorValueAround(Instruction *Call, + AllocaInst *Alloca, + coro::Shape &Shape) { + auto ValueTy = Alloca->getAllocatedType(); + IRBuilder<> Builder(Call); + + // Load the current value from the alloca and set it as the + // swifterror value. + auto ValueBeforeCall = Builder.CreateLoad(ValueTy, Alloca); + auto Addr = emitSetSwiftErrorValue(Builder, ValueBeforeCall, Shape); + + // Move to after the call. Since swifterror only has a guaranteed + // value on normal exits, we can ignore implicit and explicit unwind + // edges. + if (isa<CallInst>(Call)) { + Builder.SetInsertPoint(Call->getNextNode()); + } else { + auto Invoke = cast<InvokeInst>(Call); + Builder.SetInsertPoint(Invoke->getNormalDest()->getFirstNonPHIOrDbg()); + } + + // Get the current swifterror value and store it to the alloca. + auto ValueAfterCall = emitGetSwiftErrorValue(Builder, ValueTy, Shape); + Builder.CreateStore(ValueAfterCall, Alloca); + + return Addr; +} + +/// Eliminate a formerly-swifterror alloca by inserting the get/set +/// intrinsics and attempting to MemToReg the alloca away. +static void eliminateSwiftErrorAlloca(Function &F, AllocaInst *Alloca, + coro::Shape &Shape) { + for (auto UI = Alloca->use_begin(), UE = Alloca->use_end(); UI != UE; ) { + // We're likely changing the use list, so use a mutation-safe + // iteration pattern. + auto &Use = *UI; + ++UI; + + // swifterror values can only be used in very specific ways. + // We take advantage of that here. + auto User = Use.getUser(); + if (isa<LoadInst>(User) || isa<StoreInst>(User)) + continue; + + assert(isa<CallInst>(User) || isa<InvokeInst>(User)); + auto Call = cast<Instruction>(User); + + auto Addr = emitSetAndGetSwiftErrorValueAround(Call, Alloca, Shape); + + // Use the returned slot address as the call argument. + Use.set(Addr); + } + + // All the uses should be loads and stores now. + assert(isAllocaPromotable(Alloca)); +} + +/// "Eliminate" a swifterror argument by reducing it to the alloca case +/// and then loading and storing in the prologue and epilog. +/// +/// The argument keeps the swifterror flag. +static void eliminateSwiftErrorArgument(Function &F, Argument &Arg, + coro::Shape &Shape, + SmallVectorImpl<AllocaInst*> &AllocasToPromote) { + IRBuilder<> Builder(F.getEntryBlock().getFirstNonPHIOrDbg()); + + auto ArgTy = cast<PointerType>(Arg.getType()); + auto ValueTy = ArgTy->getElementType(); + + // Reduce to the alloca case: + + // Create an alloca and replace all uses of the arg with it. + auto Alloca = Builder.CreateAlloca(ValueTy, ArgTy->getAddressSpace()); + Arg.replaceAllUsesWith(Alloca); + + // Set an initial value in the alloca. swifterror is always null on entry. + auto InitialValue = Constant::getNullValue(ValueTy); + Builder.CreateStore(InitialValue, Alloca); + + // Find all the suspends in the function and save and restore around them. 
+ for (auto Suspend : Shape.CoroSuspends) { + (void) emitSetAndGetSwiftErrorValueAround(Suspend, Alloca, Shape); + } + + // Find all the coro.ends in the function and restore the error value. + for (auto End : Shape.CoroEnds) { + Builder.SetInsertPoint(End); + auto FinalValue = Builder.CreateLoad(ValueTy, Alloca); + (void) emitSetSwiftErrorValue(Builder, FinalValue, Shape); + } + + // Now we can use the alloca logic. + AllocasToPromote.push_back(Alloca); + eliminateSwiftErrorAlloca(F, Alloca, Shape); +} + +/// Eliminate all problematic uses of swifterror arguments and allocas +/// from the function. We'll fix them up later when splitting the function. +static void eliminateSwiftError(Function &F, coro::Shape &Shape) { + SmallVector<AllocaInst*, 4> AllocasToPromote; + + // Look for a swifterror argument. + for (auto &Arg : F.args()) { + if (!Arg.hasSwiftErrorAttr()) continue; + + eliminateSwiftErrorArgument(F, Arg, Shape, AllocasToPromote); + break; + } + + // Look for swifterror allocas. + for (auto &Inst : F.getEntryBlock()) { + auto Alloca = dyn_cast<AllocaInst>(&Inst); + if (!Alloca || !Alloca->isSwiftError()) continue; + + // Clear the swifterror flag. + Alloca->setSwiftError(false); + + AllocasToPromote.push_back(Alloca); + eliminateSwiftErrorAlloca(F, Alloca, Shape); + } + + // If we have any allocas to promote, compute a dominator tree and + // promote them en masse. + if (!AllocasToPromote.empty()) { + DominatorTree DT(F); + PromoteMemToReg(AllocasToPromote, DT); + } +} + void coro::buildCoroutineFrame(Function &F, Shape &Shape) { // Lower coro.dbg.declare to coro.dbg.value, since we are going to rewrite // access to local variables. LowerDbgDeclare(F); - Shape.PromiseAlloca = Shape.CoroBegin->getId()->getPromise(); - if (Shape.PromiseAlloca) { - Shape.CoroBegin->getId()->clearPromise(); + eliminateSwiftError(F, Shape); + + if (Shape.ABI == coro::ABI::Switch && + Shape.SwitchLowering.PromiseAlloca) { + Shape.getSwitchCoroId()->clearPromise(); } // Make sure that all coro.save, coro.suspend and the fallthrough coro.end // intrinsics are in their own blocks to simplify the logic of building up // SuspendCrossing data. - for (CoroSuspendInst *CSI : Shape.CoroSuspends) { - splitAround(CSI->getCoroSave(), "CoroSave"); + for (auto *CSI : Shape.CoroSuspends) { + if (auto *Save = CSI->getCoroSave()) + splitAround(Save, "CoroSave"); splitAround(CSI, "CoroSuspend"); } @@ -926,6 +1356,8 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) { IRBuilder<> Builder(F.getContext()); SpillInfo Spills; + SmallVector<CoroAllocaAllocInst*, 4> LocalAllocas; + SmallVector<Instruction*, 4> DeadInstructions; for (int Repeat = 0; Repeat < 4; ++Repeat) { // See if there are materializable instructions across suspend points. @@ -955,11 +1387,40 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) { // of the Coroutine Frame. if (isCoroutineStructureIntrinsic(I) || &I == Shape.CoroBegin) continue; + // The Coroutine Promise always included into coroutine frame, no need to // check for suspend crossing. - if (Shape.PromiseAlloca == &I) + if (Shape.ABI == coro::ABI::Switch && + Shape.SwitchLowering.PromiseAlloca == &I) continue; + // Handle alloca.alloc specially here. + if (auto AI = dyn_cast<CoroAllocaAllocInst>(&I)) { + // Check whether the alloca's lifetime is bounded by suspend points. + if (isLocalAlloca(AI)) { + LocalAllocas.push_back(AI); + continue; + } + + // If not, do a quick rewrite of the alloca and then add spills of + // the rewritten value. 
The rewrite doesn't invalidate anything in + // Spills because the other alloca intrinsics have no other operands + // besides AI, and it doesn't invalidate the iteration because we delay + // erasing AI. + auto Alloc = lowerNonLocalAlloca(AI, Shape, DeadInstructions); + + for (User *U : Alloc->users()) { + if (Checker.isDefinitionAcrossSuspend(*Alloc, U)) + Spills.emplace_back(Alloc, U); + } + continue; + } + + // Ignore alloca.get; we process this as part of coro.alloca.alloc. + if (isa<CoroAllocaGetInst>(I)) { + continue; + } + for (User *U : I.users()) if (Checker.isDefinitionAcrossSuspend(I, U)) { // We cannot spill a token. @@ -970,7 +1431,10 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) { } } LLVM_DEBUG(dump("Spills", Spills)); - moveSpillUsesAfterCoroBegin(F, Spills, Shape.CoroBegin); Shape.FrameTy = buildFrameType(F, Shape, Spills); Shape.FramePtr = insertSpills(Spills, Shape); + lowerLocalAllocas(LocalAllocas, DeadInstructions); + + for (auto I : DeadInstructions) + I->eraseFromParent(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroInstr.h b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroInstr.h index 5e19d7642e38..de2d2920cb15 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroInstr.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroInstr.h @@ -27,6 +27,7 @@ #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/Support/raw_ostream.h" namespace llvm { @@ -77,10 +78,8 @@ public: } }; -/// This represents the llvm.coro.alloc instruction. -class LLVM_LIBRARY_VISIBILITY CoroIdInst : public IntrinsicInst { - enum { AlignArg, PromiseArg, CoroutineArg, InfoArg }; - +/// This represents a common base class for llvm.coro.id instructions. +class LLVM_LIBRARY_VISIBILITY AnyCoroIdInst : public IntrinsicInst { public: CoroAllocInst *getCoroAlloc() { for (User *U : users()) @@ -97,6 +96,24 @@ public: llvm_unreachable("no coro.begin associated with coro.id"); } + // Methods to support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + auto ID = I->getIntrinsicID(); + return ID == Intrinsic::coro_id || + ID == Intrinsic::coro_id_retcon || + ID == Intrinsic::coro_id_retcon_once; + } + + static bool classof(const Value *V) { + return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); + } +}; + +/// This represents the llvm.coro.id instruction. +class LLVM_LIBRARY_VISIBILITY CoroIdInst : public AnyCoroIdInst { + enum { AlignArg, PromiseArg, CoroutineArg, InfoArg }; + +public: AllocaInst *getPromise() const { Value *Arg = getArgOperand(PromiseArg); return isa<ConstantPointerNull>(Arg) @@ -182,6 +199,80 @@ public: } }; +/// This represents either the llvm.coro.id.retcon or +/// llvm.coro.id.retcon.once instruction. +class LLVM_LIBRARY_VISIBILITY AnyCoroIdRetconInst : public AnyCoroIdInst { + enum { SizeArg, AlignArg, StorageArg, PrototypeArg, AllocArg, DeallocArg }; + +public: + void checkWellFormed() const; + + uint64_t getStorageSize() const { + return cast<ConstantInt>(getArgOperand(SizeArg))->getZExtValue(); + } + + uint64_t getStorageAlignment() const { + return cast<ConstantInt>(getArgOperand(AlignArg))->getZExtValue(); + } + + Value *getStorage() const { + return getArgOperand(StorageArg); + } + + /// Return the prototype for the continuation function. The type, + /// attributes, and calling convention of the continuation function(s) + /// are taken from this declaration. 
+ Function *getPrototype() const { + return cast<Function>(getArgOperand(PrototypeArg)->stripPointerCasts()); + } + + /// Return the function to use for allocating memory. + Function *getAllocFunction() const { + return cast<Function>(getArgOperand(AllocArg)->stripPointerCasts()); + } + + /// Return the function to use for deallocating memory. + Function *getDeallocFunction() const { + return cast<Function>(getArgOperand(DeallocArg)->stripPointerCasts()); + } + + // Methods to support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + auto ID = I->getIntrinsicID(); + return ID == Intrinsic::coro_id_retcon + || ID == Intrinsic::coro_id_retcon_once; + } + static bool classof(const Value *V) { + return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); + } +}; + +/// This represents the llvm.coro.id.retcon instruction. +class LLVM_LIBRARY_VISIBILITY CoroIdRetconInst + : public AnyCoroIdRetconInst { +public: + // Methods to support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + return I->getIntrinsicID() == Intrinsic::coro_id_retcon; + } + static bool classof(const Value *V) { + return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); + } +}; + +/// This represents the llvm.coro.id.retcon.once instruction. +class LLVM_LIBRARY_VISIBILITY CoroIdRetconOnceInst + : public AnyCoroIdRetconInst { +public: + // Methods to support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + return I->getIntrinsicID() == Intrinsic::coro_id_retcon_once; + } + static bool classof(const Value *V) { + return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); + } +}; + /// This represents the llvm.coro.frame instruction. class LLVM_LIBRARY_VISIBILITY CoroFrameInst : public IntrinsicInst { public: @@ -215,7 +306,9 @@ class LLVM_LIBRARY_VISIBILITY CoroBeginInst : public IntrinsicInst { enum { IdArg, MemArg }; public: - CoroIdInst *getId() const { return cast<CoroIdInst>(getArgOperand(IdArg)); } + AnyCoroIdInst *getId() const { + return cast<AnyCoroIdInst>(getArgOperand(IdArg)); + } Value *getMem() const { return getArgOperand(MemArg); } @@ -261,8 +354,22 @@ public: } }; +class LLVM_LIBRARY_VISIBILITY AnyCoroSuspendInst : public IntrinsicInst { +public: + CoroSaveInst *getCoroSave() const; + + // Methods to support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + return I->getIntrinsicID() == Intrinsic::coro_suspend || + I->getIntrinsicID() == Intrinsic::coro_suspend_retcon; + } + static bool classof(const Value *V) { + return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); + } +}; + /// This represents the llvm.coro.suspend instruction. -class LLVM_LIBRARY_VISIBILITY CoroSuspendInst : public IntrinsicInst { +class LLVM_LIBRARY_VISIBILITY CoroSuspendInst : public AnyCoroSuspendInst { enum { SaveArg, FinalArg }; public: @@ -273,6 +380,7 @@ public: assert(isa<ConstantTokenNone>(Arg)); return nullptr; } + bool isFinal() const { return cast<Constant>(getArgOperand(FinalArg))->isOneValue(); } @@ -286,6 +394,37 @@ public: } }; +inline CoroSaveInst *AnyCoroSuspendInst::getCoroSave() const { + if (auto Suspend = dyn_cast<CoroSuspendInst>(this)) + return Suspend->getCoroSave(); + return nullptr; +} + +/// This represents the llvm.coro.suspend.retcon instruction. 
+class LLVM_LIBRARY_VISIBILITY CoroSuspendRetconInst : public AnyCoroSuspendInst { +public: + op_iterator value_begin() { return arg_begin(); } + const_op_iterator value_begin() const { return arg_begin(); } + + op_iterator value_end() { return arg_end(); } + const_op_iterator value_end() const { return arg_end(); } + + iterator_range<op_iterator> value_operands() { + return make_range(value_begin(), value_end()); + } + iterator_range<const_op_iterator> value_operands() const { + return make_range(value_begin(), value_end()); + } + + // Methods to support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + return I->getIntrinsicID() == Intrinsic::coro_suspend_retcon; + } + static bool classof(const Value *V) { + return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); + } +}; + /// This represents the llvm.coro.size instruction. class LLVM_LIBRARY_VISIBILITY CoroSizeInst : public IntrinsicInst { public: @@ -317,6 +456,60 @@ public: } }; +/// This represents the llvm.coro.alloca.alloc instruction. +class LLVM_LIBRARY_VISIBILITY CoroAllocaAllocInst : public IntrinsicInst { + enum { SizeArg, AlignArg }; +public: + Value *getSize() const { + return getArgOperand(SizeArg); + } + unsigned getAlignment() const { + return cast<ConstantInt>(getArgOperand(AlignArg))->getZExtValue(); + } + + // Methods to support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + return I->getIntrinsicID() == Intrinsic::coro_alloca_alloc; + } + static bool classof(const Value *V) { + return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); + } +}; + +/// This represents the llvm.coro.alloca.get instruction. +class LLVM_LIBRARY_VISIBILITY CoroAllocaGetInst : public IntrinsicInst { + enum { AllocArg }; +public: + CoroAllocaAllocInst *getAlloc() const { + return cast<CoroAllocaAllocInst>(getArgOperand(AllocArg)); + } + + // Methods to support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + return I->getIntrinsicID() == Intrinsic::coro_alloca_get; + } + static bool classof(const Value *V) { + return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); + } +}; + +/// This represents the llvm.coro.alloca.free instruction. +class LLVM_LIBRARY_VISIBILITY CoroAllocaFreeInst : public IntrinsicInst { + enum { AllocArg }; +public: + CoroAllocaAllocInst *getAlloc() const { + return cast<CoroAllocaAllocInst>(getArgOperand(AllocArg)); + } + + // Methods to support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + return I->getIntrinsicID() == Intrinsic::coro_alloca_free; + } + static bool classof(const Value *V) { + return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); + } +}; + } // End namespace llvm. 
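The wrapper classes above all follow the same shape: a private enum naming the argument positions, typed accessors over those arguments, and a pair of classof overloads so that LLVM's isa/dyn_cast machinery can recognize the underlying intrinsic call. A minimal sketch of how code inside the Coroutines library might use one of them; the helper name is hypothetical and not part of this patch:

    #include "CoroInstr.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Function.h"

    using namespace llvm;

    // Hypothetical helper: collect every llvm.coro.alloca.alloc in a function.
    // dyn_cast on a plain Instruction works because of the classof overloads above.
    static void collectCoroAllocas(Function &F,
                                   SmallVectorImpl<CoroAllocaAllocInst *> &Out) {
      for (BasicBlock &BB : F)
        for (Instruction &I : BB)
          if (auto *Alloc = dyn_cast<CoroAllocaAllocInst>(&I))
            Out.push_back(Alloc); // getSize()/getAlignment() are now available
    }
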
#endif diff --git a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroInternal.h b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroInternal.h index 441c8a20f1f3..7eb35400c0d5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroInternal.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroInternal.h @@ -12,6 +12,7 @@ #define LLVM_LIB_TRANSFORMS_COROUTINES_COROINTERNAL_H #include "CoroInstr.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/Transforms/Coroutines.h" namespace llvm { @@ -20,10 +21,10 @@ class CallGraph; class CallGraphSCC; class PassRegistry; -void initializeCoroEarlyPass(PassRegistry &); -void initializeCoroSplitPass(PassRegistry &); -void initializeCoroElidePass(PassRegistry &); -void initializeCoroCleanupPass(PassRegistry &); +void initializeCoroEarlyLegacyPass(PassRegistry &); +void initializeCoroSplitLegacyPass(PassRegistry &); +void initializeCoroElideLegacyPass(PassRegistry &); +void initializeCoroCleanupLegacyPass(PassRegistry &); // CoroEarly pass marks every function that has coro.begin with a string // attribute "coroutine.presplit"="0". CoroSplit pass processes the coroutine @@ -42,7 +43,8 @@ void initializeCoroCleanupPass(PassRegistry &); namespace coro { -bool declaresIntrinsics(Module &M, std::initializer_list<StringRef>); +bool declaresIntrinsics(const Module &M, + const std::initializer_list<StringRef>); void replaceAllCoroAllocs(CoroBeginInst *CB, bool Replacement); void replaceAllCoroFrees(CoroBeginInst *CB, Value *Replacement); void replaceCoroFree(CoroIdInst *CoroId, bool Elide); @@ -61,37 +63,174 @@ struct LowererBase { Value *makeSubFnCall(Value *Arg, int Index, Instruction *InsertPt); }; +enum class ABI { + /// The "resume-switch" lowering, where there are separate resume and + /// destroy functions that are shared between all suspend points. The + /// coroutine frame implicitly stores the resume and destroy functions, + /// the current index, and any promise value. + Switch, + + /// The "returned-continuation" lowering, where each suspend point creates a + /// single continuation function that is used for both resuming and + /// destroying. Does not support promises. + Retcon, + + /// The "unique returned-continuation" lowering, where each suspend point + /// creates a single continuation function that is used for both resuming + /// and destroying. Does not support promises. The function is known to + /// suspend at most once during its execution, and the return value of + /// the continuation is void. + RetconOnce, +}; + // Holds structural Coroutine Intrinsics for a particular function and other // values used during CoroSplit pass. struct LLVM_LIBRARY_VISIBILITY Shape { CoroBeginInst *CoroBegin; SmallVector<CoroEndInst *, 4> CoroEnds; SmallVector<CoroSizeInst *, 2> CoroSizes; - SmallVector<CoroSuspendInst *, 4> CoroSuspends; - - // Field Indexes for known coroutine frame fields. - enum { - ResumeField, - DestroyField, - PromiseField, - IndexField, + SmallVector<AnyCoroSuspendInst *, 4> CoroSuspends; + SmallVector<CallInst*, 2> SwiftErrorOps; + + // Field indexes for special fields in the switch lowering. + struct SwitchFieldIndex { + enum { + Resume, + Destroy, + Promise, + Index, + /// The index of the first spill field. 
+ FirstSpill + }; }; + coro::ABI ABI; + StructType *FrameTy; Instruction *FramePtr; BasicBlock *AllocaSpillBlock; - SwitchInst *ResumeSwitch; - AllocaInst *PromiseAlloca; - bool HasFinalSuspend; + + struct SwitchLoweringStorage { + SwitchInst *ResumeSwitch; + AllocaInst *PromiseAlloca; + BasicBlock *ResumeEntryBlock; + bool HasFinalSuspend; + }; + + struct RetconLoweringStorage { + Function *ResumePrototype; + Function *Alloc; + Function *Dealloc; + BasicBlock *ReturnBlock; + bool IsFrameInlineInStorage; + }; + + union { + SwitchLoweringStorage SwitchLowering; + RetconLoweringStorage RetconLowering; + }; + + CoroIdInst *getSwitchCoroId() const { + assert(ABI == coro::ABI::Switch); + return cast<CoroIdInst>(CoroBegin->getId()); + } + + AnyCoroIdRetconInst *getRetconCoroId() const { + assert(ABI == coro::ABI::Retcon || + ABI == coro::ABI::RetconOnce); + return cast<AnyCoroIdRetconInst>(CoroBegin->getId()); + } IntegerType *getIndexType() const { + assert(ABI == coro::ABI::Switch); assert(FrameTy && "frame type not assigned"); - return cast<IntegerType>(FrameTy->getElementType(IndexField)); + return cast<IntegerType>(FrameTy->getElementType(SwitchFieldIndex::Index)); } ConstantInt *getIndex(uint64_t Value) const { return ConstantInt::get(getIndexType(), Value); } + PointerType *getSwitchResumePointerType() const { + assert(ABI == coro::ABI::Switch); + assert(FrameTy && "frame type not assigned"); + return cast<PointerType>(FrameTy->getElementType(SwitchFieldIndex::Resume)); + } + + FunctionType *getResumeFunctionType() const { + switch (ABI) { + case coro::ABI::Switch: { + auto *FnPtrTy = getSwitchResumePointerType(); + return cast<FunctionType>(FnPtrTy->getPointerElementType()); + } + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: + return RetconLowering.ResumePrototype->getFunctionType(); + } + llvm_unreachable("Unknown coro::ABI enum"); + } + + ArrayRef<Type*> getRetconResultTypes() const { + assert(ABI == coro::ABI::Retcon || + ABI == coro::ABI::RetconOnce); + auto FTy = CoroBegin->getFunction()->getFunctionType(); + + // The safety of all this is checked by checkWFRetconPrototype. + if (auto STy = dyn_cast<StructType>(FTy->getReturnType())) { + return STy->elements().slice(1); + } else { + return ArrayRef<Type*>(); + } + } + + ArrayRef<Type*> getRetconResumeTypes() const { + assert(ABI == coro::ABI::Retcon || + ABI == coro::ABI::RetconOnce); + + // The safety of all this is checked by checkWFRetconPrototype. + auto FTy = RetconLowering.ResumePrototype->getFunctionType(); + return FTy->params().slice(1); + } + + CallingConv::ID getResumeFunctionCC() const { + switch (ABI) { + case coro::ABI::Switch: + return CallingConv::Fast; + + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: + return RetconLowering.ResumePrototype->getCallingConv(); + } + llvm_unreachable("Unknown coro::ABI enum"); + } + + unsigned getFirstSpillFieldIndex() const { + switch (ABI) { + case coro::ABI::Switch: + return SwitchFieldIndex::FirstSpill; + + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: + return 0; + } + llvm_unreachable("Unknown coro::ABI enum"); + } + + AllocaInst *getPromiseAlloca() const { + if (ABI == coro::ABI::Switch) + return SwitchLowering.PromiseAlloca; + return nullptr; + } + + /// Allocate memory according to the rules of the active lowering. + /// + /// \param CG - if non-null, will be updated for the new call + Value *emitAlloc(IRBuilder<> &Builder, Value *Size, CallGraph *CG) const; + + /// Deallocate memory according to the rules of the active lowering. 
+ /// + /// \param CG - if non-null, will be updated for the new call + void emitDealloc(IRBuilder<> &Builder, Value *Ptr, CallGraph *CG) const; + Shape() = default; explicit Shape(Function &F) { buildFrom(F); } void buildFrom(Function &F); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index 5458e70ff16a..66cb3e74e53e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -27,7 +27,6 @@ #include "llvm/ADT/Twine.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" @@ -52,13 +51,16 @@ #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/PrettyStackTrace.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include <cassert> #include <cstddef> @@ -70,9 +72,198 @@ using namespace llvm; #define DEBUG_TYPE "coro-split" +namespace { + +/// A little helper class for building +class CoroCloner { +public: + enum class Kind { + /// The shared resume function for a switch lowering. + SwitchResume, + + /// The shared unwind function for a switch lowering. + SwitchUnwind, + + /// The shared cleanup function for a switch lowering. + SwitchCleanup, + + /// An individual continuation function. + Continuation, + }; +private: + Function &OrigF; + Function *NewF; + const Twine &Suffix; + coro::Shape &Shape; + Kind FKind; + ValueToValueMapTy VMap; + IRBuilder<> Builder; + Value *NewFramePtr = nullptr; + Value *SwiftErrorSlot = nullptr; + + /// The active suspend instruction; meaningful only for continuation ABIs. + AnyCoroSuspendInst *ActiveSuspend = nullptr; + +public: + /// Create a cloner for a switch lowering. + CoroCloner(Function &OrigF, const Twine &Suffix, coro::Shape &Shape, + Kind FKind) + : OrigF(OrigF), NewF(nullptr), Suffix(Suffix), Shape(Shape), + FKind(FKind), Builder(OrigF.getContext()) { + assert(Shape.ABI == coro::ABI::Switch); + } + + /// Create a cloner for a continuation lowering. 
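The new coro::ABI tag and the per-lowering accessors added to coro::Shape (getSwitchCoroId, getRetconCoroId, getPromiseAlloca, and so on) are what keep the SwitchLowering/RetconLowering union safe to use: consumers switch exhaustively over the ABI with no default case, so a future lowering surfaces as a compiler warning rather than a silent fall-through. A minimal sketch of that idiom, using a hypothetical query that is not part of the patch:

    #include "CoroInternal.h"
    #include "llvm/Support/ErrorHandling.h"

    using namespace llvm;

    // Hypothetical query: does this lowering keep a promise object in the frame?
    // The default-less switch mirrors the accessors defined on coro::Shape above.
    static bool hasPromiseField(const coro::Shape &Shape) {
      switch (Shape.ABI) {
      case coro::ABI::Switch:
        return Shape.getPromiseAlloca() != nullptr;
      case coro::ABI::Retcon:
      case coro::ABI::RetconOnce:
        return false; // the continuation lowerings do not support promises
      }
      llvm_unreachable("Unknown coro::ABI enum");
    }
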
+ CoroCloner(Function &OrigF, const Twine &Suffix, coro::Shape &Shape, + Function *NewF, AnyCoroSuspendInst *ActiveSuspend) + : OrigF(OrigF), NewF(NewF), Suffix(Suffix), Shape(Shape), + FKind(Kind::Continuation), Builder(OrigF.getContext()), + ActiveSuspend(ActiveSuspend) { + assert(Shape.ABI == coro::ABI::Retcon || + Shape.ABI == coro::ABI::RetconOnce); + assert(NewF && "need existing function for continuation"); + assert(ActiveSuspend && "need active suspend point for continuation"); + } + + Function *getFunction() const { + assert(NewF != nullptr && "declaration not yet set"); + return NewF; + } + + void create(); + +private: + bool isSwitchDestroyFunction() { + switch (FKind) { + case Kind::Continuation: + case Kind::SwitchResume: + return false; + case Kind::SwitchUnwind: + case Kind::SwitchCleanup: + return true; + } + llvm_unreachable("Unknown CoroCloner::Kind enum"); + } + + void createDeclaration(); + void replaceEntryBlock(); + Value *deriveNewFramePointer(); + void replaceRetconSuspendUses(); + void replaceCoroSuspends(); + void replaceCoroEnds(); + void replaceSwiftErrorOps(); + void handleFinalSuspend(); + void maybeFreeContinuationStorage(); +}; + +} // end anonymous namespace + +static void maybeFreeRetconStorage(IRBuilder<> &Builder, + const coro::Shape &Shape, Value *FramePtr, + CallGraph *CG) { + assert(Shape.ABI == coro::ABI::Retcon || + Shape.ABI == coro::ABI::RetconOnce); + if (Shape.RetconLowering.IsFrameInlineInStorage) + return; + + Shape.emitDealloc(Builder, FramePtr, CG); +} + +/// Replace a non-unwind call to llvm.coro.end. +static void replaceFallthroughCoroEnd(CoroEndInst *End, + const coro::Shape &Shape, Value *FramePtr, + bool InResume, CallGraph *CG) { + // Start inserting right before the coro.end. + IRBuilder<> Builder(End); + + // Create the return instruction. + switch (Shape.ABI) { + // The cloned functions in switch-lowering always return void. + case coro::ABI::Switch: + // coro.end doesn't immediately end the coroutine in the main function + // in this lowering, because we need to deallocate the coroutine. + if (!InResume) + return; + Builder.CreateRetVoid(); + break; + + // In unique continuation lowering, the continuations always return void. + // But we may have implicitly allocated storage. + case coro::ABI::RetconOnce: + maybeFreeRetconStorage(Builder, Shape, FramePtr, CG); + Builder.CreateRetVoid(); + break; + + // In non-unique continuation lowering, we signal completion by returning + // a null continuation. + case coro::ABI::Retcon: { + maybeFreeRetconStorage(Builder, Shape, FramePtr, CG); + auto RetTy = Shape.getResumeFunctionType()->getReturnType(); + auto RetStructTy = dyn_cast<StructType>(RetTy); + PointerType *ContinuationTy = + cast<PointerType>(RetStructTy ? RetStructTy->getElementType(0) : RetTy); + + Value *ReturnValue = ConstantPointerNull::get(ContinuationTy); + if (RetStructTy) { + ReturnValue = Builder.CreateInsertValue(UndefValue::get(RetStructTy), + ReturnValue, 0); + } + Builder.CreateRet(ReturnValue); + break; + } + } + + // Remove the rest of the block, by splitting it into an unreachable block. + auto *BB = End->getParent(); + BB->splitBasicBlock(End); + BB->getTerminator()->eraseFromParent(); +} + +/// Replace an unwind call to llvm.coro.end. +static void replaceUnwindCoroEnd(CoroEndInst *End, const coro::Shape &Shape, + Value *FramePtr, bool InResume, CallGraph *CG){ + IRBuilder<> Builder(End); + + switch (Shape.ABI) { + // In switch-lowering, this does nothing in the main function. 
+ case coro::ABI::Switch: + if (!InResume) + return; + break; + + // In continuation-lowering, this frees the continuation storage. + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: + maybeFreeRetconStorage(Builder, Shape, FramePtr, CG); + break; + } + + // If coro.end has an associated bundle, add cleanupret instruction. + if (auto Bundle = End->getOperandBundle(LLVMContext::OB_funclet)) { + auto *FromPad = cast<CleanupPadInst>(Bundle->Inputs[0]); + auto *CleanupRet = Builder.CreateCleanupRet(FromPad, nullptr); + End->getParent()->splitBasicBlock(End); + CleanupRet->getParent()->getTerminator()->eraseFromParent(); + } +} + +static void replaceCoroEnd(CoroEndInst *End, const coro::Shape &Shape, + Value *FramePtr, bool InResume, CallGraph *CG) { + if (End->isUnwind()) + replaceUnwindCoroEnd(End, Shape, FramePtr, InResume, CG); + else + replaceFallthroughCoroEnd(End, Shape, FramePtr, InResume, CG); + + auto &Context = End->getContext(); + End->replaceAllUsesWith(InResume ? ConstantInt::getTrue(Context) + : ConstantInt::getFalse(Context)); + End->eraseFromParent(); +} + // Create an entry block for a resume function with a switch that will jump to // suspend points. -static BasicBlock *createResumeEntryBlock(Function &F, coro::Shape &Shape) { +static void createResumeEntryBlock(Function &F, coro::Shape &Shape) { + assert(Shape.ABI == coro::ABI::Switch); LLVMContext &C = F.getContext(); // resume.entry: @@ -91,15 +282,16 @@ static BasicBlock *createResumeEntryBlock(Function &F, coro::Shape &Shape) { IRBuilder<> Builder(NewEntry); auto *FramePtr = Shape.FramePtr; auto *FrameTy = Shape.FrameTy; - auto *GepIndex = Builder.CreateConstInBoundsGEP2_32( - FrameTy, FramePtr, 0, coro::Shape::IndexField, "index.addr"); + auto *GepIndex = Builder.CreateStructGEP( + FrameTy, FramePtr, coro::Shape::SwitchFieldIndex::Index, "index.addr"); auto *Index = Builder.CreateLoad(Shape.getIndexType(), GepIndex, "index"); auto *Switch = Builder.CreateSwitch(Index, UnreachBB, Shape.CoroSuspends.size()); - Shape.ResumeSwitch = Switch; + Shape.SwitchLowering.ResumeSwitch = Switch; size_t SuspendIndex = 0; - for (CoroSuspendInst *S : Shape.CoroSuspends) { + for (auto *AnyS : Shape.CoroSuspends) { + auto *S = cast<CoroSuspendInst>(AnyS); ConstantInt *IndexVal = Shape.getIndex(SuspendIndex); // Replace CoroSave with a store to Index: @@ -109,14 +301,15 @@ static BasicBlock *createResumeEntryBlock(Function &F, coro::Shape &Shape) { Builder.SetInsertPoint(Save); if (S->isFinal()) { // Final suspend point is represented by storing zero in ResumeFnAddr. 
- auto *GepIndex = Builder.CreateConstInBoundsGEP2_32(FrameTy, FramePtr, 0, - 0, "ResumeFn.addr"); + auto *GepIndex = Builder.CreateStructGEP(FrameTy, FramePtr, + coro::Shape::SwitchFieldIndex::Resume, + "ResumeFn.addr"); auto *NullPtr = ConstantPointerNull::get(cast<PointerType>( cast<PointerType>(GepIndex->getType())->getElementType())); Builder.CreateStore(NullPtr, GepIndex); } else { - auto *GepIndex = Builder.CreateConstInBoundsGEP2_32( - FrameTy, FramePtr, 0, coro::Shape::IndexField, "index.addr"); + auto *GepIndex = Builder.CreateStructGEP( + FrameTy, FramePtr, coro::Shape::SwitchFieldIndex::Index, "index.addr"); Builder.CreateStore(IndexVal, GepIndex); } Save->replaceAllUsesWith(ConstantTokenNone::get(C)); @@ -164,48 +357,9 @@ static BasicBlock *createResumeEntryBlock(Function &F, coro::Shape &Shape) { Builder.SetInsertPoint(UnreachBB); Builder.CreateUnreachable(); - return NewEntry; -} - -// In Resumers, we replace fallthrough coro.end with ret void and delete the -// rest of the block. -static void replaceFallthroughCoroEnd(IntrinsicInst *End, - ValueToValueMapTy &VMap) { - auto *NewE = cast<IntrinsicInst>(VMap[End]); - ReturnInst::Create(NewE->getContext(), nullptr, NewE); - - // Remove the rest of the block, by splitting it into an unreachable block. - auto *BB = NewE->getParent(); - BB->splitBasicBlock(NewE); - BB->getTerminator()->eraseFromParent(); + Shape.SwitchLowering.ResumeEntryBlock = NewEntry; } -// In Resumers, we replace unwind coro.end with True to force the immediate -// unwind to caller. -static void replaceUnwindCoroEnds(coro::Shape &Shape, ValueToValueMapTy &VMap) { - if (Shape.CoroEnds.empty()) - return; - - LLVMContext &Context = Shape.CoroEnds.front()->getContext(); - auto *True = ConstantInt::getTrue(Context); - for (CoroEndInst *CE : Shape.CoroEnds) { - if (!CE->isUnwind()) - continue; - - auto *NewCE = cast<IntrinsicInst>(VMap[CE]); - - // If coro.end has an associated bundle, add cleanupret instruction. - if (auto Bundle = NewCE->getOperandBundle(LLVMContext::OB_funclet)) { - Value *FromPad = Bundle->Inputs[0]; - auto *CleanupRet = CleanupReturnInst::Create(FromPad, nullptr, NewCE); - NewCE->getParent()->splitBasicBlock(NewCE); - CleanupRet->getParent()->getTerminator()->eraseFromParent(); - } - - NewCE->replaceAllUsesWith(True); - NewCE->eraseFromParent(); - } -} // Rewrite final suspend point handling. We do not use suspend index to // represent the final suspend point. Instead we zero-out ResumeFnAddr in the @@ -216,83 +370,364 @@ static void replaceUnwindCoroEnds(coro::Shape &Shape, ValueToValueMapTy &VMap) { // In the destroy function, we add a code sequence to check if ResumeFnAddress // is Null, and if so, jump to the appropriate label to handle cleanup from the // final suspend point. 
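The effect of this rewrite is easiest to see from the frame's point of view. A conceptual sketch, with field and function names that are illustrative only and not taken from the patch, of what the destroy clone ends up doing at run time:

    // Illustrative subset of the switch-lowering frame fields.
    struct FrameSketch {
      void (*ResumeFn)(FrameSketch *); // zeroed once the final suspend is reached
      void (*DestroyFn)(FrameSketch *);
      int Index;                       // selects among the non-final suspend points
    };

    // What the cloned destroy function effectively checks after this rewrite.
    static void destroySketch(FrameSketch *Frame) {
      if (Frame->ResumeFn == nullptr) {
        // Suspended at the final point: branch straight to its cleanup code.
      } else {
        // Otherwise dispatch on Frame->Index, as the resume entry block does.
      }
    }
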
-static void handleFinalSuspend(IRBuilder<> &Builder, Value *FramePtr, - coro::Shape &Shape, SwitchInst *Switch, - bool IsDestroy) { - assert(Shape.HasFinalSuspend); +void CoroCloner::handleFinalSuspend() { + assert(Shape.ABI == coro::ABI::Switch && + Shape.SwitchLowering.HasFinalSuspend); + auto *Switch = cast<SwitchInst>(VMap[Shape.SwitchLowering.ResumeSwitch]); auto FinalCaseIt = std::prev(Switch->case_end()); BasicBlock *ResumeBB = FinalCaseIt->getCaseSuccessor(); Switch->removeCase(FinalCaseIt); - if (IsDestroy) { + if (isSwitchDestroyFunction()) { BasicBlock *OldSwitchBB = Switch->getParent(); auto *NewSwitchBB = OldSwitchBB->splitBasicBlock(Switch, "Switch"); Builder.SetInsertPoint(OldSwitchBB->getTerminator()); - auto *GepIndex = Builder.CreateConstInBoundsGEP2_32(Shape.FrameTy, FramePtr, - 0, 0, "ResumeFn.addr"); - auto *Load = Builder.CreateLoad( - Shape.FrameTy->getElementType(coro::Shape::ResumeField), GepIndex); - auto *NullPtr = - ConstantPointerNull::get(cast<PointerType>(Load->getType())); - auto *Cond = Builder.CreateICmpEQ(Load, NullPtr); + auto *GepIndex = Builder.CreateStructGEP(Shape.FrameTy, NewFramePtr, + coro::Shape::SwitchFieldIndex::Resume, + "ResumeFn.addr"); + auto *Load = Builder.CreateLoad(Shape.getSwitchResumePointerType(), + GepIndex); + auto *Cond = Builder.CreateIsNull(Load); Builder.CreateCondBr(Cond, ResumeBB, NewSwitchBB); OldSwitchBB->getTerminator()->eraseFromParent(); } } -// Create a resume clone by cloning the body of the original function, setting -// new entry block and replacing coro.suspend an appropriate value to force -// resume or cleanup pass for every suspend point. -static Function *createClone(Function &F, Twine Suffix, coro::Shape &Shape, - BasicBlock *ResumeEntry, int8_t FnIndex) { - Module *M = F.getParent(); - auto *FrameTy = Shape.FrameTy; - auto *FnPtrTy = cast<PointerType>(FrameTy->getElementType(0)); - auto *FnTy = cast<FunctionType>(FnPtrTy->getElementType()); +static Function *createCloneDeclaration(Function &OrigF, coro::Shape &Shape, + const Twine &Suffix, + Module::iterator InsertBefore) { + Module *M = OrigF.getParent(); + auto *FnTy = Shape.getResumeFunctionType(); Function *NewF = - Function::Create(FnTy, GlobalValue::LinkageTypes::ExternalLinkage, - F.getName() + Suffix, M); + Function::Create(FnTy, GlobalValue::LinkageTypes::InternalLinkage, + OrigF.getName() + Suffix); NewF->addParamAttr(0, Attribute::NonNull); NewF->addParamAttr(0, Attribute::NoAlias); - ValueToValueMapTy VMap; + M->getFunctionList().insert(InsertBefore, NewF); + + return NewF; +} + +/// Replace uses of the active llvm.coro.suspend.retcon call with the +/// arguments to the continuation function. +/// +/// This assumes that the builder has a meaningful insertion point. +void CoroCloner::replaceRetconSuspendUses() { + assert(Shape.ABI == coro::ABI::Retcon || + Shape.ABI == coro::ABI::RetconOnce); + + auto NewS = VMap[ActiveSuspend]; + if (NewS->use_empty()) return; + + // Copy out all the continuation arguments after the buffer pointer into + // an easily-indexed data structure for convenience. + SmallVector<Value*, 8> Args; + for (auto I = std::next(NewF->arg_begin()), E = NewF->arg_end(); I != E; ++I) + Args.push_back(&*I); + + // If the suspend returns a single scalar value, we can just do a simple + // replacement. + if (!isa<StructType>(NewS->getType())) { + assert(Args.size() == 1); + NewS->replaceAllUsesWith(Args.front()); + return; + } + + // Try to peephole extracts of an aggregate return. 
+ for (auto UI = NewS->use_begin(), UE = NewS->use_end(); UI != UE; ) { + auto EVI = dyn_cast<ExtractValueInst>((UI++)->getUser()); + if (!EVI || EVI->getNumIndices() != 1) + continue; + + EVI->replaceAllUsesWith(Args[EVI->getIndices().front()]); + EVI->eraseFromParent(); + } + + // If we have no remaining uses, we're done. + if (NewS->use_empty()) return; + + // Otherwise, we need to create an aggregate. + Value *Agg = UndefValue::get(NewS->getType()); + for (size_t I = 0, E = Args.size(); I != E; ++I) + Agg = Builder.CreateInsertValue(Agg, Args[I], I); + + NewS->replaceAllUsesWith(Agg); +} + +void CoroCloner::replaceCoroSuspends() { + Value *SuspendResult; + + switch (Shape.ABI) { + // In switch lowering, replace coro.suspend with the appropriate value + // for the type of function we're extracting. + // Replacing coro.suspend with (0) will result in control flow proceeding to + // a resume label associated with a suspend point, replacing it with (1) will + // result in control flow proceeding to a cleanup label associated with this + // suspend point. + case coro::ABI::Switch: + SuspendResult = Builder.getInt8(isSwitchDestroyFunction() ? 1 : 0); + break; + + // In returned-continuation lowering, the arguments from earlier + // continuations are theoretically arbitrary, and they should have been + // spilled. + case coro::ABI::RetconOnce: + case coro::ABI::Retcon: + return; + } + + for (AnyCoroSuspendInst *CS : Shape.CoroSuspends) { + // The active suspend was handled earlier. + if (CS == ActiveSuspend) continue; + + auto *MappedCS = cast<AnyCoroSuspendInst>(VMap[CS]); + MappedCS->replaceAllUsesWith(SuspendResult); + MappedCS->eraseFromParent(); + } +} + +void CoroCloner::replaceCoroEnds() { + for (CoroEndInst *CE : Shape.CoroEnds) { + // We use a null call graph because there's no call graph node for + // the cloned function yet. We'll just be rebuilding that later. + auto NewCE = cast<CoroEndInst>(VMap[CE]); + replaceCoroEnd(NewCE, Shape, NewFramePtr, /*in resume*/ true, nullptr); + } +} + +static void replaceSwiftErrorOps(Function &F, coro::Shape &Shape, + ValueToValueMapTy *VMap) { + Value *CachedSlot = nullptr; + auto getSwiftErrorSlot = [&](Type *ValueTy) -> Value * { + if (CachedSlot) { + assert(CachedSlot->getType()->getPointerElementType() == ValueTy && + "multiple swifterror slots in function with different types"); + return CachedSlot; + } + + // Check if the function has a swifterror argument. + for (auto &Arg : F.args()) { + if (Arg.isSwiftError()) { + CachedSlot = &Arg; + assert(Arg.getType()->getPointerElementType() == ValueTy && + "swifterror argument does not have expected type"); + return &Arg; + } + } + + // Create a swifterror alloca. + IRBuilder<> Builder(F.getEntryBlock().getFirstNonPHIOrDbg()); + auto Alloca = Builder.CreateAlloca(ValueTy); + Alloca->setSwiftError(true); + + CachedSlot = Alloca; + return Alloca; + }; + + for (CallInst *Op : Shape.SwiftErrorOps) { + auto MappedOp = VMap ? cast<CallInst>((*VMap)[Op]) : Op; + IRBuilder<> Builder(MappedOp); + + // If there are no arguments, this is a 'get' operation. 
+ Value *MappedResult; + if (Op->getNumArgOperands() == 0) { + auto ValueTy = Op->getType(); + auto Slot = getSwiftErrorSlot(ValueTy); + MappedResult = Builder.CreateLoad(ValueTy, Slot); + } else { + assert(Op->getNumArgOperands() == 1); + auto Value = MappedOp->getArgOperand(0); + auto ValueTy = Value->getType(); + auto Slot = getSwiftErrorSlot(ValueTy); + Builder.CreateStore(Value, Slot); + MappedResult = Slot; + } + + MappedOp->replaceAllUsesWith(MappedResult); + MappedOp->eraseFromParent(); + } + + // If we're updating the original function, we've invalidated SwiftErrorOps. + if (VMap == nullptr) { + Shape.SwiftErrorOps.clear(); + } +} + +void CoroCloner::replaceSwiftErrorOps() { + ::replaceSwiftErrorOps(*NewF, Shape, &VMap); +} + +void CoroCloner::replaceEntryBlock() { + // In the original function, the AllocaSpillBlock is a block immediately + // following the allocation of the frame object which defines GEPs for + // all the allocas that have been moved into the frame, and it ends by + // branching to the original beginning of the coroutine. Make this + // the entry block of the cloned function. + auto *Entry = cast<BasicBlock>(VMap[Shape.AllocaSpillBlock]); + Entry->setName("entry" + Suffix); + Entry->moveBefore(&NewF->getEntryBlock()); + Entry->getTerminator()->eraseFromParent(); + + // Clear all predecessors of the new entry block. There should be + // exactly one predecessor, which we created when splitting out + // AllocaSpillBlock to begin with. + assert(Entry->hasOneUse()); + auto BranchToEntry = cast<BranchInst>(Entry->user_back()); + assert(BranchToEntry->isUnconditional()); + Builder.SetInsertPoint(BranchToEntry); + Builder.CreateUnreachable(); + BranchToEntry->eraseFromParent(); + + // TODO: move any allocas into Entry that weren't moved into the frame. + // (Currently we move all allocas into the frame.) + + // Branch from the entry to the appropriate place. + Builder.SetInsertPoint(Entry); + switch (Shape.ABI) { + case coro::ABI::Switch: { + // In switch-lowering, we built a resume-entry block in the original + // function. Make the entry block branch to this. + auto *SwitchBB = + cast<BasicBlock>(VMap[Shape.SwitchLowering.ResumeEntryBlock]); + Builder.CreateBr(SwitchBB); + break; + } + + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: { + // In continuation ABIs, we want to branch to immediately after the + // active suspend point. Earlier phases will have put the suspend in its + // own basic block, so just thread our jump directly to its successor. + auto MappedCS = cast<CoroSuspendRetconInst>(VMap[ActiveSuspend]); + auto Branch = cast<BranchInst>(MappedCS->getNextNode()); + assert(Branch->isUnconditional()); + Builder.CreateBr(Branch->getSuccessor(0)); + break; + } + } +} + +/// Derive the value of the new frame pointer. +Value *CoroCloner::deriveNewFramePointer() { + // Builder should be inserting to the front of the new entry block. + + switch (Shape.ABI) { + // In switch-lowering, the argument is the frame pointer. + case coro::ABI::Switch: + return &*NewF->arg_begin(); + + // In continuation-lowering, the argument is the opaque storage. + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: { + Argument *NewStorage = &*NewF->arg_begin(); + auto FramePtrTy = Shape.FrameTy->getPointerTo(); + + // If the storage is inline, just bitcast to the storage to the frame type. + if (Shape.RetconLowering.IsFrameInlineInStorage) + return Builder.CreateBitCast(NewStorage, FramePtrTy); + + // Otherwise, load the real frame from the opaque storage. 
+ auto FramePtrPtr = + Builder.CreateBitCast(NewStorage, FramePtrTy->getPointerTo()); + return Builder.CreateLoad(FramePtrPtr); + } + } + llvm_unreachable("bad ABI"); +} + +/// Clone the body of the original function into a resume function of +/// some sort. +void CoroCloner::create() { + // Create the new function if we don't already have one. + if (!NewF) { + NewF = createCloneDeclaration(OrigF, Shape, Suffix, + OrigF.getParent()->end()); + } + // Replace all args with undefs. The buildCoroutineFrame algorithm already // rewritten access to the args that occurs after suspend points with loads // and stores to/from the coroutine frame. - for (Argument &A : F.args()) + for (Argument &A : OrigF.args()) VMap[&A] = UndefValue::get(A.getType()); SmallVector<ReturnInst *, 4> Returns; - CloneFunctionInto(NewF, &F, VMap, /*ModuleLevelChanges=*/true, Returns); - NewF->setLinkage(GlobalValue::LinkageTypes::InternalLinkage); + // Ignore attempts to change certain attributes of the function. + // TODO: maybe there should be a way to suppress this during cloning? + auto savedVisibility = NewF->getVisibility(); + auto savedUnnamedAddr = NewF->getUnnamedAddr(); + auto savedDLLStorageClass = NewF->getDLLStorageClass(); + + // NewF's linkage (which CloneFunctionInto does *not* change) might not + // be compatible with the visibility of OrigF (which it *does* change), + // so protect against that. + auto savedLinkage = NewF->getLinkage(); + NewF->setLinkage(llvm::GlobalValue::ExternalLinkage); + + CloneFunctionInto(NewF, &OrigF, VMap, /*ModuleLevelChanges=*/true, Returns); + + NewF->setLinkage(savedLinkage); + NewF->setVisibility(savedVisibility); + NewF->setUnnamedAddr(savedUnnamedAddr); + NewF->setDLLStorageClass(savedDLLStorageClass); + + auto &Context = NewF->getContext(); + + // Replace the attributes of the new function: + auto OrigAttrs = NewF->getAttributes(); + auto NewAttrs = AttributeList(); + + switch (Shape.ABI) { + case coro::ABI::Switch: + // Bootstrap attributes by copying function attributes from the + // original function. This should include optimization settings and so on. + NewAttrs = NewAttrs.addAttributes(Context, AttributeList::FunctionIndex, + OrigAttrs.getFnAttributes()); + break; + + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: + // If we have a continuation prototype, just use its attributes, + // full-stop. + NewAttrs = Shape.RetconLowering.ResumePrototype->getAttributes(); + break; + } - // Remove old returns. - for (ReturnInst *Return : Returns) - changeToUnreachable(Return, /*UseLLVMTrap=*/false); + // Make the frame parameter nonnull and noalias. + NewAttrs = NewAttrs.addParamAttribute(Context, 0, Attribute::NonNull); + NewAttrs = NewAttrs.addParamAttribute(Context, 0, Attribute::NoAlias); + + switch (Shape.ABI) { + // In these ABIs, the cloned functions always return 'void', and the + // existing return sites are meaningless. Note that for unique + // continuations, this includes the returns associated with suspends; + // this is fine because we can't suspend twice. + case coro::ABI::Switch: + case coro::ABI::RetconOnce: + // Remove old returns. + for (ReturnInst *Return : Returns) + changeToUnreachable(Return, /*UseLLVMTrap=*/false); + break; + + // With multi-suspend continuations, we'll already have eliminated the + // original returns and inserted returns before all the suspend points, + // so we want to leave any returns in place. + case coro::ABI::Retcon: + break; + } - // Remove old return attributes. 
- NewF->removeAttributes( - AttributeList::ReturnIndex, - AttributeFuncs::typeIncompatible(NewF->getReturnType())); + NewF->setAttributes(NewAttrs); + NewF->setCallingConv(Shape.getResumeFunctionCC()); - // Make AllocaSpillBlock the new entry block. - auto *SwitchBB = cast<BasicBlock>(VMap[ResumeEntry]); - auto *Entry = cast<BasicBlock>(VMap[Shape.AllocaSpillBlock]); - Entry->moveBefore(&NewF->getEntryBlock()); - Entry->getTerminator()->eraseFromParent(); - BranchInst::Create(SwitchBB, Entry); - Entry->setName("entry" + Suffix); - - // Clear all predecessors of the new entry block. - auto *Switch = cast<SwitchInst>(VMap[Shape.ResumeSwitch]); - Entry->replaceAllUsesWith(Switch->getDefaultDest()); + // Set up the new entry block. + replaceEntryBlock(); - IRBuilder<> Builder(&NewF->getEntryBlock().front()); + Builder.SetInsertPoint(&NewF->getEntryBlock().front()); + NewFramePtr = deriveNewFramePointer(); // Remap frame pointer. - Argument *NewFramePtr = &*NewF->arg_begin(); - Value *OldFramePtr = cast<Value>(VMap[Shape.FramePtr]); + Value *OldFramePtr = VMap[Shape.FramePtr]; NewFramePtr->takeName(OldFramePtr); OldFramePtr->replaceAllUsesWith(NewFramePtr); @@ -302,50 +737,55 @@ static Function *createClone(Function &F, Twine Suffix, coro::Shape &Shape, Value *OldVFrame = cast<Value>(VMap[Shape.CoroBegin]); OldVFrame->replaceAllUsesWith(NewVFrame); - // Rewrite final suspend handling as it is not done via switch (allows to - // remove final case from the switch, since it is undefined behavior to resume - // the coroutine suspended at the final suspend point. - if (Shape.HasFinalSuspend) { - auto *Switch = cast<SwitchInst>(VMap[Shape.ResumeSwitch]); - bool IsDestroy = FnIndex != 0; - handleFinalSuspend(Builder, NewFramePtr, Shape, Switch, IsDestroy); + switch (Shape.ABI) { + case coro::ABI::Switch: + // Rewrite final suspend handling as it is not done via switch (allows to + // remove final case from the switch, since it is undefined behavior to + // resume the coroutine suspended at the final suspend point. + if (Shape.SwitchLowering.HasFinalSuspend) + handleFinalSuspend(); + break; + + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: + // Replace uses of the active suspend with the corresponding + // continuation-function arguments. + assert(ActiveSuspend != nullptr && + "no active suspend when lowering a continuation-style coroutine"); + replaceRetconSuspendUses(); + break; } - // Replace coro suspend with the appropriate resume index. - // Replacing coro.suspend with (0) will result in control flow proceeding to - // a resume label associated with a suspend point, replacing it with (1) will - // result in control flow proceeding to a cleanup label associated with this - // suspend point. - auto *NewValue = Builder.getInt8(FnIndex ? 1 : 0); - for (CoroSuspendInst *CS : Shape.CoroSuspends) { - auto *MappedCS = cast<CoroSuspendInst>(VMap[CS]); - MappedCS->replaceAllUsesWith(NewValue); - MappedCS->eraseFromParent(); - } + // Handle suspends. + replaceCoroSuspends(); + + // Handle swifterror. + replaceSwiftErrorOps(); // Remove coro.end intrinsics. - replaceFallthroughCoroEnd(Shape.CoroEnds.front(), VMap); - replaceUnwindCoroEnds(Shape, VMap); + replaceCoroEnds(); + // Eliminate coro.free from the clones, replacing it with 'null' in cleanup, // to suppress deallocation code. 
- coro::replaceCoroFree(cast<CoroIdInst>(VMap[Shape.CoroBegin->getId()]), - /*Elide=*/FnIndex == 2); - - NewF->setCallingConv(CallingConv::Fast); - - return NewF; + if (Shape.ABI == coro::ABI::Switch) + coro::replaceCoroFree(cast<CoroIdInst>(VMap[Shape.CoroBegin->getId()]), + /*Elide=*/ FKind == CoroCloner::Kind::SwitchCleanup); } -static void removeCoroEnds(coro::Shape &Shape) { - if (Shape.CoroEnds.empty()) - return; - - LLVMContext &Context = Shape.CoroEnds.front()->getContext(); - auto *False = ConstantInt::getFalse(Context); +// Create a resume clone by cloning the body of the original function, setting +// new entry block and replacing coro.suspend an appropriate value to force +// resume or cleanup pass for every suspend point. +static Function *createClone(Function &F, const Twine &Suffix, + coro::Shape &Shape, CoroCloner::Kind FKind) { + CoroCloner Cloner(F, Suffix, Shape, FKind); + Cloner.create(); + return Cloner.getFunction(); +} - for (CoroEndInst *CE : Shape.CoroEnds) { - CE->replaceAllUsesWith(False); - CE->eraseFromParent(); +/// Remove calls to llvm.coro.end in the original function. +static void removeCoroEnds(const coro::Shape &Shape, CallGraph *CG) { + for (auto End : Shape.CoroEnds) { + replaceCoroEnd(End, Shape, Shape.FramePtr, /*in resume*/ false, CG); } } @@ -377,8 +817,12 @@ static void replaceFrameSize(coro::Shape &Shape) { // i8* bitcast([2 x void(%f.frame*)*] * @f.resumers to i8*)) // // Assumes that all the functions have the same signature. -static void setCoroInfo(Function &F, CoroBeginInst *CoroBegin, - std::initializer_list<Function *> Fns) { +static void setCoroInfo(Function &F, coro::Shape &Shape, + ArrayRef<Function *> Fns) { + // This only works under the switch-lowering ABI because coro elision + // only works on the switch-lowering ABI. + assert(Shape.ABI == coro::ABI::Switch); + SmallVector<Constant *, 4> Args(Fns.begin(), Fns.end()); assert(!Args.empty()); Function *Part = *Fns.begin(); @@ -393,38 +837,45 @@ static void setCoroInfo(Function &F, CoroBeginInst *CoroBegin, // Update coro.begin instruction to refer to this constant. LLVMContext &C = F.getContext(); auto *BC = ConstantExpr::getPointerCast(GV, Type::getInt8PtrTy(C)); - CoroBegin->getId()->setInfo(BC); + Shape.getSwitchCoroId()->setInfo(BC); } // Store addresses of Resume/Destroy/Cleanup functions in the coroutine frame. static void updateCoroFrame(coro::Shape &Shape, Function *ResumeFn, Function *DestroyFn, Function *CleanupFn) { + assert(Shape.ABI == coro::ABI::Switch); + IRBuilder<> Builder(Shape.FramePtr->getNextNode()); - auto *ResumeAddr = Builder.CreateConstInBoundsGEP2_32( - Shape.FrameTy, Shape.FramePtr, 0, coro::Shape::ResumeField, + auto *ResumeAddr = Builder.CreateStructGEP( + Shape.FrameTy, Shape.FramePtr, coro::Shape::SwitchFieldIndex::Resume, "resume.addr"); Builder.CreateStore(ResumeFn, ResumeAddr); Value *DestroyOrCleanupFn = DestroyFn; - CoroIdInst *CoroId = Shape.CoroBegin->getId(); + CoroIdInst *CoroId = Shape.getSwitchCoroId(); if (CoroAllocInst *CA = CoroId->getCoroAlloc()) { // If there is a CoroAlloc and it returns false (meaning we elide the // allocation, use CleanupFn instead of DestroyFn). 
DestroyOrCleanupFn = Builder.CreateSelect(CA, DestroyFn, CleanupFn); } - auto *DestroyAddr = Builder.CreateConstInBoundsGEP2_32( - Shape.FrameTy, Shape.FramePtr, 0, coro::Shape::DestroyField, + auto *DestroyAddr = Builder.CreateStructGEP( + Shape.FrameTy, Shape.FramePtr, coro::Shape::SwitchFieldIndex::Destroy, "destroy.addr"); Builder.CreateStore(DestroyOrCleanupFn, DestroyAddr); } static void postSplitCleanup(Function &F) { removeUnreachableBlocks(F); + + // For now, we do a mandatory verification step because we don't + // entirely trust this pass. Note that we don't want to add a verifier + // pass to FPM below because it will also verify all the global data. + verifyFunction(F); + legacy::FunctionPassManager FPM(F.getParent()); - FPM.add(createVerifierPass()); FPM.add(createSCCPPass()); FPM.add(createCFGSimplificationPass()); FPM.add(createEarlyCSEPass()); @@ -457,17 +908,29 @@ scanPHIsAndUpdateValueMap(Instruction *Prev, BasicBlock *NewBlock, // values and select the correct case successor when possible. static bool simplifyTerminatorLeadingToRet(Instruction *InitialInst) { DenseMap<Value *, Value *> ResolvedValues; + BasicBlock *UnconditionalSucc = nullptr; Instruction *I = InitialInst; while (I->isTerminator()) { if (isa<ReturnInst>(I)) { - if (I != InitialInst) + if (I != InitialInst) { + // If InitialInst is an unconditional branch, + // remove PHI values that come from basic block of InitialInst + if (UnconditionalSucc) + for (PHINode &PN : UnconditionalSucc->phis()) { + int idx = PN.getBasicBlockIndex(InitialInst->getParent()); + if (idx != -1) + PN.removeIncomingValue(idx); + } ReplaceInstWithInst(InitialInst, I->clone()); + } return true; } if (auto *BR = dyn_cast<BranchInst>(I)) { if (BR->isUnconditional()) { BasicBlock *BB = BR->getSuccessor(0); + if (I == InitialInst) + UnconditionalSucc = BB; scanPHIsAndUpdateValueMap(I, BB, ResolvedValues); I = BB->getFirstNonPHIOrDbgOrLifetime(); continue; @@ -520,21 +983,34 @@ static void addMustTailToCoroResumes(Function &F) { // Coroutine has no suspend points. Remove heap allocation for the coroutine // frame if possible. -static void handleNoSuspendCoroutine(CoroBeginInst *CoroBegin, Type *FrameTy) { +static void handleNoSuspendCoroutine(coro::Shape &Shape) { + auto *CoroBegin = Shape.CoroBegin; auto *CoroId = CoroBegin->getId(); auto *AllocInst = CoroId->getCoroAlloc(); - coro::replaceCoroFree(CoroId, /*Elide=*/AllocInst != nullptr); - if (AllocInst) { - IRBuilder<> Builder(AllocInst); - // FIXME: Need to handle overaligned members. - auto *Frame = Builder.CreateAlloca(FrameTy); - auto *VFrame = Builder.CreateBitCast(Frame, Builder.getInt8PtrTy()); - AllocInst->replaceAllUsesWith(Builder.getFalse()); - AllocInst->eraseFromParent(); - CoroBegin->replaceAllUsesWith(VFrame); - } else { - CoroBegin->replaceAllUsesWith(CoroBegin->getMem()); + switch (Shape.ABI) { + case coro::ABI::Switch: { + auto SwitchId = cast<CoroIdInst>(CoroId); + coro::replaceCoroFree(SwitchId, /*Elide=*/AllocInst != nullptr); + if (AllocInst) { + IRBuilder<> Builder(AllocInst); + // FIXME: Need to handle overaligned members. 
+ auto *Frame = Builder.CreateAlloca(Shape.FrameTy); + auto *VFrame = Builder.CreateBitCast(Frame, Builder.getInt8PtrTy()); + AllocInst->replaceAllUsesWith(Builder.getFalse()); + AllocInst->eraseFromParent(); + CoroBegin->replaceAllUsesWith(VFrame); + } else { + CoroBegin->replaceAllUsesWith(CoroBegin->getMem()); + } + break; } + + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: + CoroBegin->replaceAllUsesWith(UndefValue::get(CoroBegin->getType())); + break; + } + CoroBegin->eraseFromParent(); } @@ -670,12 +1146,16 @@ static bool simplifySuspendPoint(CoroSuspendInst *Suspend, // Remove suspend points that are simplified. static void simplifySuspendPoints(coro::Shape &Shape) { + // Currently, the only simplification we do is switch-lowering-specific. + if (Shape.ABI != coro::ABI::Switch) + return; + auto &S = Shape.CoroSuspends; size_t I = 0, N = S.size(); if (N == 0) return; while (true) { - if (simplifySuspendPoint(S[I], Shape.CoroBegin)) { + if (simplifySuspendPoint(cast<CoroSuspendInst>(S[I]), Shape.CoroBegin)) { if (--N == I) break; std::swap(S[I], S[N]); @@ -687,142 +1167,227 @@ static void simplifySuspendPoints(coro::Shape &Shape) { S.resize(N); } -static SmallPtrSet<BasicBlock *, 4> getCoroBeginPredBlocks(CoroBeginInst *CB) { - // Collect all blocks that we need to look for instructions to relocate. - SmallPtrSet<BasicBlock *, 4> RelocBlocks; - SmallVector<BasicBlock *, 4> Work; - Work.push_back(CB->getParent()); +static void splitSwitchCoroutine(Function &F, coro::Shape &Shape, + SmallVectorImpl<Function *> &Clones) { + assert(Shape.ABI == coro::ABI::Switch); - do { - BasicBlock *Current = Work.pop_back_val(); - for (BasicBlock *BB : predecessors(Current)) - if (RelocBlocks.count(BB) == 0) { - RelocBlocks.insert(BB); - Work.push_back(BB); - } - } while (!Work.empty()); - return RelocBlocks; -} - -static SmallPtrSet<Instruction *, 8> -getNotRelocatableInstructions(CoroBeginInst *CoroBegin, - SmallPtrSetImpl<BasicBlock *> &RelocBlocks) { - SmallPtrSet<Instruction *, 8> DoNotRelocate; - // Collect all instructions that we should not relocate - SmallVector<Instruction *, 8> Work; - - // Start with CoroBegin and terminators of all preceding blocks. - Work.push_back(CoroBegin); - BasicBlock *CoroBeginBB = CoroBegin->getParent(); - for (BasicBlock *BB : RelocBlocks) - if (BB != CoroBeginBB) - Work.push_back(BB->getTerminator()); - - // For every instruction in the Work list, place its operands in DoNotRelocate - // set. - do { - Instruction *Current = Work.pop_back_val(); - LLVM_DEBUG(dbgs() << "CoroSplit: Will not relocate: " << *Current << "\n"); - DoNotRelocate.insert(Current); - for (Value *U : Current->operands()) { - auto *I = dyn_cast<Instruction>(U); - if (!I) - continue; + createResumeEntryBlock(F, Shape); + auto ResumeClone = createClone(F, ".resume", Shape, + CoroCloner::Kind::SwitchResume); + auto DestroyClone = createClone(F, ".destroy", Shape, + CoroCloner::Kind::SwitchUnwind); + auto CleanupClone = createClone(F, ".cleanup", Shape, + CoroCloner::Kind::SwitchCleanup); - if (auto *A = dyn_cast<AllocaInst>(I)) { - // Stores to alloca instructions that occur before the coroutine frame - // is allocated should not be moved; the stored values may be used by - // the coroutine frame allocator. The operands to those stores must also - // remain in place. 
- for (const auto &User : A->users()) - if (auto *SI = dyn_cast<llvm::StoreInst>(User)) - if (RelocBlocks.count(SI->getParent()) != 0 && - DoNotRelocate.count(SI) == 0) { - Work.push_back(SI); - DoNotRelocate.insert(SI); - } - continue; - } + postSplitCleanup(*ResumeClone); + postSplitCleanup(*DestroyClone); + postSplitCleanup(*CleanupClone); + + addMustTailToCoroResumes(*ResumeClone); + + // Store addresses resume/destroy/cleanup functions in the coroutine frame. + updateCoroFrame(Shape, ResumeClone, DestroyClone, CleanupClone); + + assert(Clones.empty()); + Clones.push_back(ResumeClone); + Clones.push_back(DestroyClone); + Clones.push_back(CleanupClone); + + // Create a constant array referring to resume/destroy/clone functions pointed + // by the last argument of @llvm.coro.info, so that CoroElide pass can + // determined correct function to call. + setCoroInfo(F, Shape, Clones); +} + +static void splitRetconCoroutine(Function &F, coro::Shape &Shape, + SmallVectorImpl<Function *> &Clones) { + assert(Shape.ABI == coro::ABI::Retcon || + Shape.ABI == coro::ABI::RetconOnce); + assert(Clones.empty()); + + // Reset various things that the optimizer might have decided it + // "knows" about the coroutine function due to not seeing a return. + F.removeFnAttr(Attribute::NoReturn); + F.removeAttribute(AttributeList::ReturnIndex, Attribute::NoAlias); + F.removeAttribute(AttributeList::ReturnIndex, Attribute::NonNull); + + // Allocate the frame. + auto *Id = cast<AnyCoroIdRetconInst>(Shape.CoroBegin->getId()); + Value *RawFramePtr; + if (Shape.RetconLowering.IsFrameInlineInStorage) { + RawFramePtr = Id->getStorage(); + } else { + IRBuilder<> Builder(Id); + + // Determine the size of the frame. + const DataLayout &DL = F.getParent()->getDataLayout(); + auto Size = DL.getTypeAllocSize(Shape.FrameTy); + + // Allocate. We don't need to update the call graph node because we're + // going to recompute it from scratch after splitting. + RawFramePtr = Shape.emitAlloc(Builder, Builder.getInt64(Size), nullptr); + RawFramePtr = + Builder.CreateBitCast(RawFramePtr, Shape.CoroBegin->getType()); + + // Stash the allocated frame pointer in the continuation storage. + auto Dest = Builder.CreateBitCast(Id->getStorage(), + RawFramePtr->getType()->getPointerTo()); + Builder.CreateStore(RawFramePtr, Dest); + } - if (DoNotRelocate.count(I) == 0) { - Work.push_back(I); - DoNotRelocate.insert(I); + // Map all uses of llvm.coro.begin to the allocated frame pointer. + { + // Make sure we don't invalidate Shape.FramePtr. + TrackingVH<Instruction> Handle(Shape.FramePtr); + Shape.CoroBegin->replaceAllUsesWith(RawFramePtr); + Shape.FramePtr = Handle.getValPtr(); + } + + // Create a unique return block. + BasicBlock *ReturnBB = nullptr; + SmallVector<PHINode *, 4> ReturnPHIs; + + // Create all the functions in order after the main function. + auto NextF = std::next(F.getIterator()); + + // Create a continuation function for each of the suspend points. + Clones.reserve(Shape.CoroSuspends.size()); + for (size_t i = 0, e = Shape.CoroSuspends.size(); i != e; ++i) { + auto Suspend = cast<CoroSuspendRetconInst>(Shape.CoroSuspends[i]); + + // Create the clone declaration. + auto Continuation = + createCloneDeclaration(F, Shape, ".resume." + Twine(i), NextF); + Clones.push_back(Continuation); + + // Insert a branch to the unified return block immediately before + // the suspend point. 
+ auto SuspendBB = Suspend->getParent(); + auto NewSuspendBB = SuspendBB->splitBasicBlock(Suspend); + auto Branch = cast<BranchInst>(SuspendBB->getTerminator()); + + // Create the unified return block. + if (!ReturnBB) { + // Place it before the first suspend. + ReturnBB = BasicBlock::Create(F.getContext(), "coro.return", &F, + NewSuspendBB); + Shape.RetconLowering.ReturnBlock = ReturnBB; + + IRBuilder<> Builder(ReturnBB); + + // Create PHIs for all the return values. + assert(ReturnPHIs.empty()); + + // First, the continuation. + ReturnPHIs.push_back(Builder.CreatePHI(Continuation->getType(), + Shape.CoroSuspends.size())); + + // Next, all the directly-yielded values. + for (auto ResultTy : Shape.getRetconResultTypes()) + ReturnPHIs.push_back(Builder.CreatePHI(ResultTy, + Shape.CoroSuspends.size())); + + // Build the return value. + auto RetTy = F.getReturnType(); + + // Cast the continuation value if necessary. + // We can't rely on the types matching up because that type would + // have to be infinite. + auto CastedContinuationTy = + (ReturnPHIs.size() == 1 ? RetTy : RetTy->getStructElementType(0)); + auto *CastedContinuation = + Builder.CreateBitCast(ReturnPHIs[0], CastedContinuationTy); + + Value *RetV; + if (ReturnPHIs.size() == 1) { + RetV = CastedContinuation; + } else { + RetV = UndefValue::get(RetTy); + RetV = Builder.CreateInsertValue(RetV, CastedContinuation, 0); + for (size_t I = 1, E = ReturnPHIs.size(); I != E; ++I) + RetV = Builder.CreateInsertValue(RetV, ReturnPHIs[I], I); } + + Builder.CreateRet(RetV); } - } while (!Work.empty()); - return DoNotRelocate; -} -static void relocateInstructionBefore(CoroBeginInst *CoroBegin, Function &F) { - // Analyze which non-alloca instructions are needed for allocation and - // relocate the rest to after coro.begin. We need to do it, since some of the - // targets of those instructions may be placed into coroutine frame memory - // for which becomes available after coro.begin intrinsic. + // Branch to the return block. + Branch->setSuccessor(0, ReturnBB); + ReturnPHIs[0]->addIncoming(Continuation, SuspendBB); + size_t NextPHIIndex = 1; + for (auto &VUse : Suspend->value_operands()) + ReturnPHIs[NextPHIIndex++]->addIncoming(&*VUse, SuspendBB); + assert(NextPHIIndex == ReturnPHIs.size()); + } - auto BlockSet = getCoroBeginPredBlocks(CoroBegin); - auto DoNotRelocateSet = getNotRelocatableInstructions(CoroBegin, BlockSet); + assert(Clones.size() == Shape.CoroSuspends.size()); + for (size_t i = 0, e = Shape.CoroSuspends.size(); i != e; ++i) { + auto Suspend = Shape.CoroSuspends[i]; + auto Clone = Clones[i]; - Instruction *InsertPt = CoroBegin->getNextNode(); - BasicBlock &BB = F.getEntryBlock(); // TODO: Look at other blocks as well. - for (auto B = BB.begin(), E = BB.end(); B != E;) { - Instruction &I = *B++; - if (isa<AllocaInst>(&I)) - continue; - if (&I == CoroBegin) - break; - if (DoNotRelocateSet.count(&I)) - continue; - I.moveBefore(InsertPt); + CoroCloner(F, "resume." 
+ Twine(i), Shape, Clone, Suspend).create(); + } +} + +namespace { + class PrettyStackTraceFunction : public PrettyStackTraceEntry { + Function &F; + public: + PrettyStackTraceFunction(Function &F) : F(F) {} + void print(raw_ostream &OS) const override { + OS << "While splitting coroutine "; + F.printAsOperand(OS, /*print type*/ false, F.getParent()); + OS << "\n"; + } + }; +} + +static void splitCoroutine(Function &F, coro::Shape &Shape, + SmallVectorImpl<Function *> &Clones) { + switch (Shape.ABI) { + case coro::ABI::Switch: + return splitSwitchCoroutine(F, Shape, Clones); + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: + return splitRetconCoroutine(F, Shape, Clones); } + llvm_unreachable("bad ABI kind"); } static void splitCoroutine(Function &F, CallGraph &CG, CallGraphSCC &SCC) { - EliminateUnreachableBlocks(F); + PrettyStackTraceFunction prettyStackTrace(F); + + // The suspend-crossing algorithm in buildCoroutineFrame get tripped + // up by uses in unreachable blocks, so remove them as a first pass. + removeUnreachableBlocks(F); coro::Shape Shape(F); if (!Shape.CoroBegin) return; simplifySuspendPoints(Shape); - relocateInstructionBefore(Shape.CoroBegin, F); buildCoroutineFrame(F, Shape); replaceFrameSize(Shape); + SmallVector<Function*, 4> Clones; + // If there are no suspend points, no split required, just remove // the allocation and deallocation blocks, they are not needed. if (Shape.CoroSuspends.empty()) { - handleNoSuspendCoroutine(Shape.CoroBegin, Shape.FrameTy); - removeCoroEnds(Shape); - postSplitCleanup(F); - coro::updateCallGraph(F, {}, CG, SCC); - return; + handleNoSuspendCoroutine(Shape); + } else { + splitCoroutine(F, Shape, Clones); } - auto *ResumeEntry = createResumeEntryBlock(F, Shape); - auto ResumeClone = createClone(F, ".resume", Shape, ResumeEntry, 0); - auto DestroyClone = createClone(F, ".destroy", Shape, ResumeEntry, 1); - auto CleanupClone = createClone(F, ".cleanup", Shape, ResumeEntry, 2); - - // We no longer need coro.end in F. - removeCoroEnds(Shape); + // Replace all the swifterror operations in the original function. + // This invalidates SwiftErrorOps in the Shape. + replaceSwiftErrorOps(F, Shape, nullptr); + removeCoroEnds(Shape, &CG); postSplitCleanup(F); - postSplitCleanup(*ResumeClone); - postSplitCleanup(*DestroyClone); - postSplitCleanup(*CleanupClone); - - addMustTailToCoroResumes(*ResumeClone); - - // Store addresses resume/destroy/cleanup functions in the coroutine frame. - updateCoroFrame(Shape, ResumeClone, DestroyClone, CleanupClone); - - // Create a constant array referring to resume/destroy/clone functions pointed - // by the last argument of @llvm.coro.info, so that CoroElide pass can - // determined correct function to call. - setCoroInfo(F, Shape.CoroBegin, {ResumeClone, DestroyClone, CleanupClone}); // Update call graph and add the functions we created to the SCC. - coro::updateCallGraph(F, {ResumeClone, DestroyClone, CleanupClone}, CG, SCC); + coro::updateCallGraph(F, Clones, CG, SCC); } // When we see the coroutine the first time, we insert an indirect call to a @@ -856,9 +1421,10 @@ static void prepareForSplit(Function &F, CallGraph &CG) { CG[&F]->addCalledFunction(IndirectCall, CG.getCallsExternalNode()); } -// Make sure that there is a devirtualization trigger function that CoroSplit -// pass uses the force restart CGSCC pipeline. If devirt trigger function is not -// found, we will create one and add it to the current SCC. 
+// Make sure that there is a devirtualization trigger function that the
+// coro-split pass uses to force a restart of the CGSCC pipeline. If the devirt
+// trigger function is not found, we will create one and add it to the current
+// SCC.
 static void createDevirtTriggerFunc(CallGraph &CG, CallGraphSCC &SCC) {
   Module &M = CG.getModule();
   if (M.getFunction(CORO_DEVIRT_TRIGGER_FN))
@@ -881,17 +1447,91 @@ static void createDevirtTriggerFunc(CallGraph &CG, CallGraphSCC &SCC) {
   SCC.initialize(Nodes);
 }
+/// Replace a call to llvm.coro.prepare.retcon.
+static void replacePrepare(CallInst *Prepare, CallGraph &CG) {
+  auto CastFn = Prepare->getArgOperand(0); // as an i8*
+  auto Fn = CastFn->stripPointerCasts(); // as its original type
+
+  // Find call graph nodes for the preparation.
+  CallGraphNode *PrepareUserNode = nullptr, *FnNode = nullptr;
+  if (auto ConcreteFn = dyn_cast<Function>(Fn)) {
+    PrepareUserNode = CG[Prepare->getFunction()];
+    FnNode = CG[ConcreteFn];
+  }
+
+  // Attempt to peephole this pattern:
+  //   %0 = bitcast [[TYPE]] @some_function to i8*
+  //   %1 = call @llvm.coro.prepare.retcon(i8* %0)
+  //   %2 = bitcast %1 to [[TYPE]]
+  // ==>
+  //   %2 = @some_function
+  for (auto UI = Prepare->use_begin(), UE = Prepare->use_end();
+       UI != UE; ) {
+    // Look for bitcasts back to the original function type.
+    auto *Cast = dyn_cast<BitCastInst>((UI++)->getUser());
+    if (!Cast || Cast->getType() != Fn->getType()) continue;
+
+    // Check whether the replacement will introduce new direct calls.
+    // If so, we'll need to update the call graph.
+    if (PrepareUserNode) {
+      for (auto &Use : Cast->uses()) {
+        if (auto *CB = dyn_cast<CallBase>(Use.getUser())) {
+          if (!CB->isCallee(&Use))
+            continue;
+          PrepareUserNode->removeCallEdgeFor(*CB);
+          PrepareUserNode->addCalledFunction(CB, FnNode);
+        }
+      }
+    }
+
+    // Replace and remove the cast.
+    Cast->replaceAllUsesWith(Fn);
+    Cast->eraseFromParent();
+  }
+
+  // Replace any remaining uses with the function as an i8*.
+  // This can never directly be a callee, so we don't need to update CG.
+  Prepare->replaceAllUsesWith(CastFn);
+  Prepare->eraseFromParent();
+
+  // Kill dead bitcasts.
+  while (auto *Cast = dyn_cast<BitCastInst>(CastFn)) {
+    if (!Cast->use_empty()) break;
+    CastFn = Cast->getOperand(0);
+    Cast->eraseFromParent();
+  }
+}
+
+/// Remove calls to llvm.coro.prepare.retcon, a barrier meant to prevent
+/// IPO from operating on calls to a retcon coroutine before it's been
+/// split. This is only safe to do after we've split all retcon
+/// coroutines in the module. We can do this in this pass because
+/// this pass does promise to split all retcon coroutines (as opposed to
+/// switch coroutines, which are lowered in multiple stages).
+static bool replaceAllPrepares(Function *PrepareFn, CallGraph &CG) {
+  bool Changed = false;
+  for (auto PI = PrepareFn->use_begin(), PE = PrepareFn->use_end();
+       PI != PE; ) {
+    // Intrinsics can only be used in calls.
+ auto *Prepare = cast<CallInst>((PI++)->getUser()); + replacePrepare(Prepare, CG); + Changed = true; + } + + return Changed; +} + //===----------------------------------------------------------------------===// // Top Level Driver //===----------------------------------------------------------------------===// namespace { -struct CoroSplit : public CallGraphSCCPass { +struct CoroSplitLegacy : public CallGraphSCCPass { static char ID; // Pass identification, replacement for typeid - CoroSplit() : CallGraphSCCPass(ID) { - initializeCoroSplitPass(*PassRegistry::getPassRegistry()); + CoroSplitLegacy() : CallGraphSCCPass(ID) { + initializeCoroSplitLegacyPass(*PassRegistry::getPassRegistry()); } bool Run = false; @@ -899,7 +1539,9 @@ struct CoroSplit : public CallGraphSCCPass { // A coroutine is identified by the presence of coro.begin intrinsic, if // we don't have any, this pass has nothing to do. bool doInitialization(CallGraph &CG) override { - Run = coro::declaresIntrinsics(CG.getModule(), {"llvm.coro.begin"}); + Run = coro::declaresIntrinsics(CG.getModule(), + {"llvm.coro.begin", + "llvm.coro.prepare.retcon"}); return CallGraphSCCPass::doInitialization(CG); } @@ -907,6 +1549,12 @@ struct CoroSplit : public CallGraphSCCPass { if (!Run) return false; + // Check for uses of llvm.coro.prepare.retcon. + auto PrepareFn = + SCC.getCallGraph().getModule().getFunction("llvm.coro.prepare.retcon"); + if (PrepareFn && PrepareFn->use_empty()) + PrepareFn = nullptr; + // Find coroutines for processing. SmallVector<Function *, 4> Coroutines; for (CallGraphNode *CGN : SCC) @@ -914,12 +1562,17 @@ struct CoroSplit : public CallGraphSCCPass { if (F->hasFnAttribute(CORO_PRESPLIT_ATTR)) Coroutines.push_back(F); - if (Coroutines.empty()) + if (Coroutines.empty() && !PrepareFn) return false; CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph(); + + if (Coroutines.empty()) + return replaceAllPrepares(PrepareFn, CG); + createDevirtTriggerFunc(CG, SCC); + // Split all the coroutines. 
for (Function *F : Coroutines) { Attribute Attr = F->getFnAttribute(CORO_PRESPLIT_ATTR); StringRef Value = Attr.getValueAsString(); @@ -932,6 +1585,10 @@ struct CoroSplit : public CallGraphSCCPass { F->removeFnAttr(CORO_PRESPLIT_ATTR); splitCoroutine(*F, CG, SCC); } + + if (PrepareFn) + replaceAllPrepares(PrepareFn, CG); + return true; } @@ -944,16 +1601,16 @@ struct CoroSplit : public CallGraphSCCPass { } // end anonymous namespace -char CoroSplit::ID = 0; +char CoroSplitLegacy::ID = 0; INITIALIZE_PASS_BEGIN( - CoroSplit, "coro-split", + CoroSplitLegacy, "coro-split", "Split coroutine into a set of functions driving its state machine", false, false) INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) INITIALIZE_PASS_END( - CoroSplit, "coro-split", + CoroSplitLegacy, "coro-split", "Split coroutine into a set of functions driving its state machine", false, false) -Pass *llvm::createCoroSplitPass() { return new CoroSplit(); } +Pass *llvm::createCoroSplitLegacyPass() { return new CoroSplitLegacy(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/Coroutines.cpp index a581d1d21169..02d11af3303f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/Coroutines.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/Coroutines.cpp @@ -11,14 +11,13 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Coroutines.h" -#include "llvm-c/Transforms/Coroutines.h" #include "CoroInstr.h" #include "CoroInternal.h" +#include "llvm-c/Transforms/Coroutines.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" @@ -31,10 +30,12 @@ #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/PassManagerBuilder.h" +#include "llvm/Transforms/Utils/Local.h" #include <cassert> #include <cstddef> #include <utility> @@ -42,39 +43,39 @@ using namespace llvm; void llvm::initializeCoroutines(PassRegistry &Registry) { - initializeCoroEarlyPass(Registry); - initializeCoroSplitPass(Registry); - initializeCoroElidePass(Registry); - initializeCoroCleanupPass(Registry); + initializeCoroEarlyLegacyPass(Registry); + initializeCoroSplitLegacyPass(Registry); + initializeCoroElideLegacyPass(Registry); + initializeCoroCleanupLegacyPass(Registry); } static void addCoroutineOpt0Passes(const PassManagerBuilder &Builder, legacy::PassManagerBase &PM) { - PM.add(createCoroSplitPass()); - PM.add(createCoroElidePass()); + PM.add(createCoroSplitLegacyPass()); + PM.add(createCoroElideLegacyPass()); PM.add(createBarrierNoopPass()); - PM.add(createCoroCleanupPass()); + PM.add(createCoroCleanupLegacyPass()); } static void addCoroutineEarlyPasses(const PassManagerBuilder &Builder, legacy::PassManagerBase &PM) { - PM.add(createCoroEarlyPass()); + PM.add(createCoroEarlyLegacyPass()); } static void addCoroutineScalarOptimizerPasses(const PassManagerBuilder &Builder, legacy::PassManagerBase &PM) { - PM.add(createCoroElidePass()); + PM.add(createCoroElideLegacyPass()); } static void addCoroutineSCCPasses(const PassManagerBuilder &Builder, legacy::PassManagerBase &PM) { 
- PM.add(createCoroSplitPass()); + PM.add(createCoroSplitLegacyPass()); } static void addCoroutineOptimizerLastPasses(const PassManagerBuilder &Builder, legacy::PassManagerBase &PM) { - PM.add(createCoroCleanupPass()); + PM.add(createCoroCleanupLegacyPass()); } void llvm::addCoroutinePassesToExtensionPoints(PassManagerBuilder &Builder) { @@ -123,12 +124,26 @@ Value *coro::LowererBase::makeSubFnCall(Value *Arg, int Index, static bool isCoroutineIntrinsicName(StringRef Name) { // NOTE: Must be sorted! static const char *const CoroIntrinsics[] = { - "llvm.coro.alloc", "llvm.coro.begin", "llvm.coro.destroy", - "llvm.coro.done", "llvm.coro.end", "llvm.coro.frame", - "llvm.coro.free", "llvm.coro.id", "llvm.coro.noop", - "llvm.coro.param", "llvm.coro.promise", "llvm.coro.resume", - "llvm.coro.save", "llvm.coro.size", "llvm.coro.subfn.addr", + "llvm.coro.alloc", + "llvm.coro.begin", + "llvm.coro.destroy", + "llvm.coro.done", + "llvm.coro.end", + "llvm.coro.frame", + "llvm.coro.free", + "llvm.coro.id", + "llvm.coro.id.retcon", + "llvm.coro.id.retcon.once", + "llvm.coro.noop", + "llvm.coro.param", + "llvm.coro.prepare.retcon", + "llvm.coro.promise", + "llvm.coro.resume", + "llvm.coro.save", + "llvm.coro.size", + "llvm.coro.subfn.addr", "llvm.coro.suspend", + "llvm.coro.suspend.retcon", }; return Intrinsic::lookupLLVMIntrinsicByName(CoroIntrinsics, Name) != -1; } @@ -136,8 +151,8 @@ static bool isCoroutineIntrinsicName(StringRef Name) { // Verifies if a module has named values listed. Also, in debug mode verifies // that names are intrinsic names. -bool coro::declaresIntrinsics(Module &M, - std::initializer_list<StringRef> List) { +bool coro::declaresIntrinsics(const Module &M, + const std::initializer_list<StringRef> List) { for (StringRef Name : List) { assert(isCoroutineIntrinsicName(Name) && "not a coroutine intrinsic"); if (M.getNamedValue(Name)) @@ -217,9 +232,6 @@ static void clear(coro::Shape &Shape) { Shape.FrameTy = nullptr; Shape.FramePtr = nullptr; Shape.AllocaSpillBlock = nullptr; - Shape.ResumeSwitch = nullptr; - Shape.PromiseAlloca = nullptr; - Shape.HasFinalSuspend = false; } static CoroSaveInst *createCoroSave(CoroBeginInst *CoroBegin, @@ -235,6 +247,7 @@ static CoroSaveInst *createCoroSave(CoroBeginInst *CoroBegin, // Collect "interesting" coroutine intrinsics. void coro::Shape::buildFrom(Function &F) { + bool HasFinalSuspend = false; size_t FinalSuspendIndex = 0; clear(*this); SmallVector<CoroFrameInst *, 8> CoroFrames; @@ -257,9 +270,15 @@ void coro::Shape::buildFrom(Function &F) { if (II->use_empty()) UnusedCoroSaves.push_back(cast<CoroSaveInst>(II)); break; - case Intrinsic::coro_suspend: - CoroSuspends.push_back(cast<CoroSuspendInst>(II)); - if (CoroSuspends.back()->isFinal()) { + case Intrinsic::coro_suspend_retcon: { + auto Suspend = cast<CoroSuspendRetconInst>(II); + CoroSuspends.push_back(Suspend); + break; + } + case Intrinsic::coro_suspend: { + auto Suspend = cast<CoroSuspendInst>(II); + CoroSuspends.push_back(Suspend); + if (Suspend->isFinal()) { if (HasFinalSuspend) report_fatal_error( "Only one suspend point can be marked as final"); @@ -267,18 +286,23 @@ void coro::Shape::buildFrom(Function &F) { FinalSuspendIndex = CoroSuspends.size() - 1; } break; + } case Intrinsic::coro_begin: { auto CB = cast<CoroBeginInst>(II); - if (CB->getId()->getInfo().isPreSplit()) { - if (CoroBegin) - report_fatal_error( + + // Ignore coro id's that aren't pre-split. 
+ auto Id = dyn_cast<CoroIdInst>(CB->getId()); + if (Id && !Id->getInfo().isPreSplit()) + break; + + if (CoroBegin) + report_fatal_error( "coroutine should have exactly one defining @llvm.coro.begin"); - CB->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull); - CB->addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias); - CB->removeAttribute(AttributeList::FunctionIndex, - Attribute::NoDuplicate); - CoroBegin = CB; - } + CB->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull); + CB->addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias); + CB->removeAttribute(AttributeList::FunctionIndex, + Attribute::NoDuplicate); + CoroBegin = CB; break; } case Intrinsic::coro_end: @@ -310,7 +334,7 @@ void coro::Shape::buildFrom(Function &F) { // Replace all coro.suspend with undef and remove related coro.saves if // present. - for (CoroSuspendInst *CS : CoroSuspends) { + for (AnyCoroSuspendInst *CS : CoroSuspends) { CS->replaceAllUsesWith(UndefValue::get(CS->getType())); CS->eraseFromParent(); if (auto *CoroSave = CS->getCoroSave()) @@ -324,19 +348,136 @@ void coro::Shape::buildFrom(Function &F) { return; } + auto Id = CoroBegin->getId(); + switch (auto IdIntrinsic = Id->getIntrinsicID()) { + case Intrinsic::coro_id: { + auto SwitchId = cast<CoroIdInst>(Id); + this->ABI = coro::ABI::Switch; + this->SwitchLowering.HasFinalSuspend = HasFinalSuspend; + this->SwitchLowering.ResumeSwitch = nullptr; + this->SwitchLowering.PromiseAlloca = SwitchId->getPromise(); + this->SwitchLowering.ResumeEntryBlock = nullptr; + + for (auto AnySuspend : CoroSuspends) { + auto Suspend = dyn_cast<CoroSuspendInst>(AnySuspend); + if (!Suspend) { +#ifndef NDEBUG + AnySuspend->dump(); +#endif + report_fatal_error("coro.id must be paired with coro.suspend"); + } + + if (!Suspend->getCoroSave()) + createCoroSave(CoroBegin, Suspend); + } + break; + } + + case Intrinsic::coro_id_retcon: + case Intrinsic::coro_id_retcon_once: { + auto ContinuationId = cast<AnyCoroIdRetconInst>(Id); + ContinuationId->checkWellFormed(); + this->ABI = (IdIntrinsic == Intrinsic::coro_id_retcon + ? coro::ABI::Retcon + : coro::ABI::RetconOnce); + auto Prototype = ContinuationId->getPrototype(); + this->RetconLowering.ResumePrototype = Prototype; + this->RetconLowering.Alloc = ContinuationId->getAllocFunction(); + this->RetconLowering.Dealloc = ContinuationId->getDeallocFunction(); + this->RetconLowering.ReturnBlock = nullptr; + this->RetconLowering.IsFrameInlineInStorage = false; + + // Determine the result value types, and make sure they match up with + // the values passed to the suspends. + auto ResultTys = getRetconResultTypes(); + auto ResumeTys = getRetconResumeTypes(); + + for (auto AnySuspend : CoroSuspends) { + auto Suspend = dyn_cast<CoroSuspendRetconInst>(AnySuspend); + if (!Suspend) { +#ifndef NDEBUG + AnySuspend->dump(); +#endif + report_fatal_error("coro.id.retcon.* must be paired with " + "coro.suspend.retcon"); + } + + // Check that the argument types of the suspend match the results. + auto SI = Suspend->value_begin(), SE = Suspend->value_end(); + auto RI = ResultTys.begin(), RE = ResultTys.end(); + for (; SI != SE && RI != RE; ++SI, ++RI) { + auto SrcTy = (*SI)->getType(); + if (SrcTy != *RI) { + // The optimizer likes to eliminate bitcasts leading into variadic + // calls, but that messes with our invariants. Re-insert the + // bitcast and ignore this type mismatch. 
+ if (CastInst::isBitCastable(SrcTy, *RI)) { + auto BCI = new BitCastInst(*SI, *RI, "", Suspend); + SI->set(BCI); + continue; + } + +#ifndef NDEBUG + Suspend->dump(); + Prototype->getFunctionType()->dump(); +#endif + report_fatal_error("argument to coro.suspend.retcon does not " + "match corresponding prototype function result"); + } + } + if (SI != SE || RI != RE) { +#ifndef NDEBUG + Suspend->dump(); + Prototype->getFunctionType()->dump(); +#endif + report_fatal_error("wrong number of arguments to coro.suspend.retcon"); + } + + // Check that the result type of the suspend matches the resume types. + Type *SResultTy = Suspend->getType(); + ArrayRef<Type*> SuspendResultTys; + if (SResultTy->isVoidTy()) { + // leave as empty array + } else if (auto SResultStructTy = dyn_cast<StructType>(SResultTy)) { + SuspendResultTys = SResultStructTy->elements(); + } else { + // forms an ArrayRef using SResultTy, be careful + SuspendResultTys = SResultTy; + } + if (SuspendResultTys.size() != ResumeTys.size()) { +#ifndef NDEBUG + Suspend->dump(); + Prototype->getFunctionType()->dump(); +#endif + report_fatal_error("wrong number of results from coro.suspend.retcon"); + } + for (size_t I = 0, E = ResumeTys.size(); I != E; ++I) { + if (SuspendResultTys[I] != ResumeTys[I]) { +#ifndef NDEBUG + Suspend->dump(); + Prototype->getFunctionType()->dump(); +#endif + report_fatal_error("result from coro.suspend.retcon does not " + "match corresponding prototype function param"); + } + } + } + break; + } + + default: + llvm_unreachable("coro.begin is not dependent on a coro.id call"); + } + // The coro.free intrinsic is always lowered to the result of coro.begin. for (CoroFrameInst *CF : CoroFrames) { CF->replaceAllUsesWith(CoroBegin); CF->eraseFromParent(); } - // Canonicalize coro.suspend by inserting a coro.save if needed. - for (CoroSuspendInst *CS : CoroSuspends) - if (!CS->getCoroSave()) - createCoroSave(CoroBegin, CS); - // Move final suspend to be the last element in the CoroSuspends vector. - if (HasFinalSuspend && + if (ABI == coro::ABI::Switch && + SwitchLowering.HasFinalSuspend && FinalSuspendIndex != CoroSuspends.size() - 1) std::swap(CoroSuspends[FinalSuspendIndex], CoroSuspends.back()); @@ -345,18 +486,166 @@ void coro::Shape::buildFrom(Function &F) { CoroSave->eraseFromParent(); } +static void propagateCallAttrsFromCallee(CallInst *Call, Function *Callee) { + Call->setCallingConv(Callee->getCallingConv()); + // TODO: attributes? 
+}
+
+static void addCallToCallGraph(CallGraph *CG, CallInst *Call, Function *Callee){
+  if (CG)
+    (*CG)[Call->getFunction()]->addCalledFunction(Call, (*CG)[Callee]);
+}
+
+Value *coro::Shape::emitAlloc(IRBuilder<> &Builder, Value *Size,
+                              CallGraph *CG) const {
+  switch (ABI) {
+  case coro::ABI::Switch:
+    llvm_unreachable("can't allocate memory in coro switch-lowering");
+
+  case coro::ABI::Retcon:
+  case coro::ABI::RetconOnce: {
+    auto Alloc = RetconLowering.Alloc;
+    Size = Builder.CreateIntCast(Size,
+                                 Alloc->getFunctionType()->getParamType(0),
+                                 /*is signed*/ false);
+    auto *Call = Builder.CreateCall(Alloc, Size);
+    propagateCallAttrsFromCallee(Call, Alloc);
+    addCallToCallGraph(CG, Call, Alloc);
+    return Call;
+  }
+  }
+  llvm_unreachable("Unknown coro::ABI enum");
+}
+
+void coro::Shape::emitDealloc(IRBuilder<> &Builder, Value *Ptr,
+                              CallGraph *CG) const {
+  switch (ABI) {
+  case coro::ABI::Switch:
+    llvm_unreachable("can't deallocate memory in coro switch-lowering");
+
+  case coro::ABI::Retcon:
+  case coro::ABI::RetconOnce: {
+    auto Dealloc = RetconLowering.Dealloc;
+    Ptr = Builder.CreateBitCast(Ptr,
+                                Dealloc->getFunctionType()->getParamType(0));
+    auto *Call = Builder.CreateCall(Dealloc, Ptr);
+    propagateCallAttrsFromCallee(Call, Dealloc);
+    addCallToCallGraph(CG, Call, Dealloc);
+    return;
+  }
+  }
+  llvm_unreachable("Unknown coro::ABI enum");
+}
+
+LLVM_ATTRIBUTE_NORETURN
+static void fail(const Instruction *I, const char *Reason, Value *V) {
+#ifndef NDEBUG
+  I->dump();
+  if (V) {
+    errs() << "  Value: ";
+    V->printAsOperand(llvm::errs());
+    errs() << '\n';
+  }
+#endif
+  report_fatal_error(Reason);
+}
+
+/// Check that the given value is a well-formed prototype for the
+/// llvm.coro.id.retcon.* intrinsics.
+static void checkWFRetconPrototype(const AnyCoroIdRetconInst *I, Value *V) {
+  auto F = dyn_cast<Function>(V->stripPointerCasts());
+  if (!F)
+    fail(I, "llvm.coro.id.retcon.* prototype not a Function", V);
+
+  auto FT = F->getFunctionType();
+
+  if (isa<CoroIdRetconInst>(I)) {
+    bool ResultOkay;
+    if (FT->getReturnType()->isPointerTy()) {
+      ResultOkay = true;
+    } else if (auto SRetTy = dyn_cast<StructType>(FT->getReturnType())) {
+      ResultOkay = (!SRetTy->isOpaque() &&
+                    SRetTy->getNumElements() > 0 &&
+                    SRetTy->getElementType(0)->isPointerTy());
+    } else {
+      ResultOkay = false;
+    }
+    if (!ResultOkay)
+      fail(I, "llvm.coro.id.retcon prototype must return pointer as first "
+              "result", F);
+
+    if (FT->getReturnType() !=
+          I->getFunction()->getFunctionType()->getReturnType())
+      fail(I, "llvm.coro.id.retcon prototype return type must be same as "
+              "current function return type", F);
+  } else {
+    // No meaningful validation to do here for llvm.coro.id.retcon.once.
+  }
+
+  if (FT->getNumParams() == 0 || !FT->getParamType(0)->isPointerTy())
+    fail(I, "llvm.coro.id.retcon.* prototype must take pointer as "
+            "its first parameter", F);
+}
+
+/// Check that the given value is a well-formed allocator.
+static void checkWFAlloc(const Instruction *I, Value *V) {
+  auto F = dyn_cast<Function>(V->stripPointerCasts());
+  if (!F)
+    fail(I, "llvm.coro.* allocator not a Function", V);
+
+  auto FT = F->getFunctionType();
+  if (!FT->getReturnType()->isPointerTy())
+    fail(I, "llvm.coro.* allocator must return a pointer", F);
+
+  if (FT->getNumParams() != 1 ||
+      !FT->getParamType(0)->isIntegerTy())
+    fail(I, "llvm.coro.* allocator must take integer as only param", F);
+}
+
+/// Check that the given value is a well-formed deallocator.
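// Illustrative sketch only (the names below are hypothetical, not from LLVM):
// taken together, checkWFRetconPrototype, checkWFAlloc, and the deallocator
// check that follows expect user-supplied functions of roughly this shape for
// llvm.coro.id.retcon.*:
//
//   // Continuation prototype: returns a pointer (or a struct whose first
//   // element is a pointer) and takes the frame pointer as its first param.
//   SomeResult my_continuation_prototype(void *frame /*, yielded values... */);
//
//   // Allocator: a single integer parameter (the size), pointer result.
//   void *my_retcon_alloc(uint64_t size);
//
//   // Deallocator: a single pointer parameter, void result.
//   void my_retcon_dealloc(void *frame);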
+static void checkWFDealloc(const Instruction *I, Value *V) { + auto F = dyn_cast<Function>(V->stripPointerCasts()); + if (!F) + fail(I, "llvm.coro.* deallocator not a Function", V); + + auto FT = F->getFunctionType(); + if (!FT->getReturnType()->isVoidTy()) + fail(I, "llvm.coro.* deallocator must return void", F); + + if (FT->getNumParams() != 1 || + !FT->getParamType(0)->isPointerTy()) + fail(I, "llvm.coro.* deallocator must take pointer as only param", F); +} + +static void checkConstantInt(const Instruction *I, Value *V, + const char *Reason) { + if (!isa<ConstantInt>(V)) { + fail(I, Reason, V); + } +} + +void AnyCoroIdRetconInst::checkWellFormed() const { + checkConstantInt(this, getArgOperand(SizeArg), + "size argument to coro.id.retcon.* must be constant"); + checkConstantInt(this, getArgOperand(AlignArg), + "alignment argument to coro.id.retcon.* must be constant"); + checkWFRetconPrototype(this, getArgOperand(PrototypeArg)); + checkWFAlloc(this, getArgOperand(AllocArg)); + checkWFDealloc(this, getArgOperand(DeallocArg)); +} + void LLVMAddCoroEarlyPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createCoroEarlyPass()); + unwrap(PM)->add(createCoroEarlyLegacyPass()); } void LLVMAddCoroSplitPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createCoroSplitPass()); + unwrap(PM)->add(createCoroSplitLegacyPass()); } void LLVMAddCoroElidePass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createCoroElidePass()); + unwrap(PM)->add(createCoroElideLegacyPass()); } void LLVMAddCoroCleanupPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createCoroCleanupPass()); + unwrap(PM)->add(createCoroCleanupLegacyPass()); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/AlwaysInliner.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/AlwaysInliner.cpp index c50805692b98..06d1763353f4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/AlwaysInliner.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/AlwaysInliner.cpp @@ -22,6 +22,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/Inliner.h" #include "llvm/Transforms/Utils/Cloning.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp index 95a9f31cced3..cdf8a2eb598e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -70,6 +70,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" @@ -304,7 +305,7 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote, // of the previous load. LoadInst *newLoad = IRB.CreateLoad(OrigLoad->getType(), V, V->getName() + ".val"); - newLoad->setAlignment(OrigLoad->getAlignment()); + newLoad->setAlignment(MaybeAlign(OrigLoad->getAlignment())); // Transfer the AA info too. AAMDNodes AAInfo; OrigLoad->getAAMetadata(AAInfo); @@ -386,8 +387,9 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote, // Just add all the struct element types. 
Type *AgTy = cast<PointerType>(I->getType())->getElementType(); - Value *TheAlloca = new AllocaInst(AgTy, DL.getAllocaAddrSpace(), nullptr, - I->getParamAlignment(), "", InsertPt); + Value *TheAlloca = + new AllocaInst(AgTy, DL.getAllocaAddrSpace(), nullptr, + MaybeAlign(I->getParamAlignment()), "", InsertPt); StructType *STy = cast<StructType>(AgTy); Value *Idxs[2] = {ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), nullptr}; diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/Attributor.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/Attributor.cpp index 2a52c6b9b4ad..f2995817eaf8 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/Attributor.cpp @@ -16,20 +16,31 @@ #include "llvm/Transforms/IPO/Attributor.h" #include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/LazyValueInfo.h" +#include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/CFG.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" + #include <cassert> using namespace llvm; @@ -46,19 +57,82 @@ STATISTIC(NumAttributesValidFixpoint, "Number of abstract attributes in a valid fixpoint state"); STATISTIC(NumAttributesManifested, "Number of abstract attributes manifested in IR"); -STATISTIC(NumFnNoUnwind, "Number of functions marked nounwind"); - -STATISTIC(NumFnUniqueReturned, "Number of function with unique return"); -STATISTIC(NumFnKnownReturns, "Number of function with known return values"); -STATISTIC(NumFnArgumentReturned, - "Number of function arguments marked returned"); -STATISTIC(NumFnNoSync, "Number of functions marked nosync"); -STATISTIC(NumFnNoFree, "Number of functions marked nofree"); -STATISTIC(NumFnReturnedNonNull, - "Number of function return values marked nonnull"); -STATISTIC(NumFnArgumentNonNull, "Number of function arguments marked nonnull"); -STATISTIC(NumCSArgumentNonNull, "Number of call site arguments marked nonnull"); -STATISTIC(NumFnWillReturn, "Number of functions marked willreturn"); +STATISTIC(NumAttributesFixedDueToRequiredDependences, + "Number of abstract attributes fixed due to required dependences"); + +// Some helper macros to deal with statistics tracking. +// +// Usage: +// For simple IR attribute tracking overload trackStatistics in the abstract +// attribute and choose the right STATS_DECLTRACK_********* macro, +// e.g.,: +// void trackStatistics() const override { +// STATS_DECLTRACK_ARG_ATTR(returned) +// } +// If there is a single "increment" side one can use the macro +// STATS_DECLTRACK with a custom message. If there are multiple increment +// sides, STATS_DECL and STATS_TRACK can also be used separatly. 
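// Illustrative expansion (a sketch, not part of the macros themselves): with
// the definitions that follow, a use such as
//
//   STATS_DECLTRACK_ARG_ATTR(returned)
//
// expands roughly to
//
//   {
//     STATISTIC(NumIRArguments_returned,
//               "Number of arguments marked 'returned'");
//     ++(NumIRArguments_returned);
//   }
//
// i.e. BUILD_STAT_NAME pastes the position kind and attribute name into the
// statistic variable, and BUILD_STAT_MSG_IR_ATTR builds its description.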
+// +#define BUILD_STAT_MSG_IR_ATTR(TYPE, NAME) \ + ("Number of " #TYPE " marked '" #NAME "'") +#define BUILD_STAT_NAME(NAME, TYPE) NumIR##TYPE##_##NAME +#define STATS_DECL_(NAME, MSG) STATISTIC(NAME, MSG); +#define STATS_DECL(NAME, TYPE, MSG) \ + STATS_DECL_(BUILD_STAT_NAME(NAME, TYPE), MSG); +#define STATS_TRACK(NAME, TYPE) ++(BUILD_STAT_NAME(NAME, TYPE)); +#define STATS_DECLTRACK(NAME, TYPE, MSG) \ + { \ + STATS_DECL(NAME, TYPE, MSG) \ + STATS_TRACK(NAME, TYPE) \ + } +#define STATS_DECLTRACK_ARG_ATTR(NAME) \ + STATS_DECLTRACK(NAME, Arguments, BUILD_STAT_MSG_IR_ATTR(arguments, NAME)) +#define STATS_DECLTRACK_CSARG_ATTR(NAME) \ + STATS_DECLTRACK(NAME, CSArguments, \ + BUILD_STAT_MSG_IR_ATTR(call site arguments, NAME)) +#define STATS_DECLTRACK_FN_ATTR(NAME) \ + STATS_DECLTRACK(NAME, Function, BUILD_STAT_MSG_IR_ATTR(functions, NAME)) +#define STATS_DECLTRACK_CS_ATTR(NAME) \ + STATS_DECLTRACK(NAME, CS, BUILD_STAT_MSG_IR_ATTR(call site, NAME)) +#define STATS_DECLTRACK_FNRET_ATTR(NAME) \ + STATS_DECLTRACK(NAME, FunctionReturn, \ + BUILD_STAT_MSG_IR_ATTR(function returns, NAME)) +#define STATS_DECLTRACK_CSRET_ATTR(NAME) \ + STATS_DECLTRACK(NAME, CSReturn, \ + BUILD_STAT_MSG_IR_ATTR(call site returns, NAME)) +#define STATS_DECLTRACK_FLOATING_ATTR(NAME) \ + STATS_DECLTRACK(NAME, Floating, \ + ("Number of floating values known to be '" #NAME "'")) + +// Specialization of the operator<< for abstract attributes subclasses. This +// disambiguates situations where multiple operators are applicable. +namespace llvm { +#define PIPE_OPERATOR(CLASS) \ + raw_ostream &operator<<(raw_ostream &OS, const CLASS &AA) { \ + return OS << static_cast<const AbstractAttribute &>(AA); \ + } + +PIPE_OPERATOR(AAIsDead) +PIPE_OPERATOR(AANoUnwind) +PIPE_OPERATOR(AANoSync) +PIPE_OPERATOR(AANoRecurse) +PIPE_OPERATOR(AAWillReturn) +PIPE_OPERATOR(AANoReturn) +PIPE_OPERATOR(AAReturnedValues) +PIPE_OPERATOR(AANonNull) +PIPE_OPERATOR(AANoAlias) +PIPE_OPERATOR(AADereferenceable) +PIPE_OPERATOR(AAAlign) +PIPE_OPERATOR(AANoCapture) +PIPE_OPERATOR(AAValueSimplify) +PIPE_OPERATOR(AANoFree) +PIPE_OPERATOR(AAHeapToStack) +PIPE_OPERATOR(AAReachability) +PIPE_OPERATOR(AAMemoryBehavior) +PIPE_OPERATOR(AAValueConstantRange) + +#undef PIPE_OPERATOR +} // namespace llvm // TODO: Determine a good default value. 
// @@ -72,18 +146,36 @@ static cl::opt<unsigned> MaxFixpointIterations("attributor-max-iterations", cl::Hidden, cl::desc("Maximal number of fixpoint iterations."), cl::init(32)); +static cl::opt<bool> VerifyMaxFixpointIterations( + "attributor-max-iterations-verify", cl::Hidden, + cl::desc("Verify that max-iterations is a tight bound for a fixpoint"), + cl::init(false)); static cl::opt<bool> DisableAttributor( "attributor-disable", cl::Hidden, cl::desc("Disable the attributor inter-procedural deduction pass."), cl::init(true)); -static cl::opt<bool> VerifyAttributor( - "attributor-verify", cl::Hidden, - cl::desc("Verify the Attributor deduction and " - "manifestation of attributes -- may issue false-positive errors"), +static cl::opt<bool> AnnotateDeclarationCallSites( + "attributor-annotate-decl-cs", cl::Hidden, + cl::desc("Annotate call sites of function declarations."), cl::init(false)); + +static cl::opt<bool> ManifestInternal( + "attributor-manifest-internal", cl::Hidden, + cl::desc("Manifest Attributor internal string attributes."), cl::init(false)); +static cl::opt<unsigned> DepRecInterval( + "attributor-dependence-recompute-interval", cl::Hidden, + cl::desc("Number of iterations until dependences are recomputed."), + cl::init(4)); + +static cl::opt<bool> EnableHeapToStack("enable-heap-to-stack-conversion", + cl::init(true), cl::Hidden); + +static cl::opt<int> MaxHeapToStackSize("max-heap-to-stack-size", cl::init(128), + cl::Hidden); + /// Logic operators for the change status enum class. /// ///{ @@ -95,78 +187,98 @@ ChangeStatus llvm::operator&(ChangeStatus l, ChangeStatus r) { } ///} -/// Helper to adjust the statistics. -static void bookkeeping(AbstractAttribute::ManifestPosition MP, - const Attribute &Attr) { - if (!AreStatisticsEnabled()) - return; +Argument *IRPosition::getAssociatedArgument() const { + if (getPositionKind() == IRP_ARGUMENT) + return cast<Argument>(&getAnchorValue()); + + // Not an Argument and no argument number means this is not a call site + // argument, thus we cannot find a callback argument to return. + int ArgNo = getArgNo(); + if (ArgNo < 0) + return nullptr; + + // Use abstract call sites to make the connection between the call site + // values and the ones in callbacks. If a callback was found that makes use + // of the underlying call site operand, we want the corresponding callback + // callee argument and not the direct callee argument. + Optional<Argument *> CBCandidateArg; + SmallVector<const Use *, 4> CBUses; + ImmutableCallSite ICS(&getAnchorValue()); + AbstractCallSite::getCallbackUses(ICS, CBUses); + for (const Use *U : CBUses) { + AbstractCallSite ACS(U); + assert(ACS && ACS.isCallbackCall()); + if (!ACS.getCalledFunction()) + continue; - if (!Attr.isEnumAttribute()) - return; - switch (Attr.getKindAsEnum()) { - case Attribute::NoUnwind: - NumFnNoUnwind++; - return; - case Attribute::Returned: - NumFnArgumentReturned++; - return; - case Attribute::NoSync: - NumFnNoSync++; - break; - case Attribute::NoFree: - NumFnNoFree++; - break; - case Attribute::NonNull: - switch (MP) { - case AbstractAttribute::MP_RETURNED: - NumFnReturnedNonNull++; - break; - case AbstractAttribute::MP_ARGUMENT: - NumFnArgumentNonNull++; - break; - case AbstractAttribute::MP_CALL_SITE_ARGUMENT: - NumCSArgumentNonNull++; - break; - default: - break; + for (unsigned u = 0, e = ACS.getNumArgOperands(); u < e; u++) { + + // Test if the underlying call site operand is argument number u of the + // callback callee. 
+ if (ACS.getCallArgOperandNo(u) != ArgNo) + continue; + + assert(ACS.getCalledFunction()->arg_size() > u && + "ACS mapped into var-args arguments!"); + if (CBCandidateArg.hasValue()) { + CBCandidateArg = nullptr; + break; + } + CBCandidateArg = ACS.getCalledFunction()->getArg(u); } - break; - case Attribute::WillReturn: - NumFnWillReturn++; - break; - default: - return; } + + // If we found a unique callback candidate argument, return it. + if (CBCandidateArg.hasValue() && CBCandidateArg.getValue()) + return CBCandidateArg.getValue(); + + // If no callbacks were found, or none used the underlying call site operand + // exclusively, use the direct callee argument if available. + const Function *Callee = ICS.getCalledFunction(); + if (Callee && Callee->arg_size() > unsigned(ArgNo)) + return Callee->getArg(ArgNo); + + return nullptr; } -template <typename StateTy> -using followValueCB_t = std::function<bool(Value *, StateTy &State)>; -template <typename StateTy> -using visitValueCB_t = std::function<void(Value *, StateTy &State)>; +/// For calls (and invokes) we will only replace instruction uses to not disturb +/// the old style call graph. +/// TODO: Remove this once we get rid of the old PM. +static void replaceAllInstructionUsesWith(Value &Old, Value &New) { + if (!isa<CallBase>(Old)) + return Old.replaceAllUsesWith(&New); + SmallVector<Use *, 8> Uses; + for (Use &U : Old.uses()) + if (isa<Instruction>(U.getUser())) + Uses.push_back(&U); + for (Use *U : Uses) + U->set(&New); +} -/// Recursively visit all values that might become \p InitV at some point. This +/// Recursively visit all values that might become \p IRP at some point. This /// will be done by looking through cast instructions, selects, phis, and calls -/// with the "returned" attribute. The callback \p FollowValueCB is asked before -/// a potential origin value is looked at. If no \p FollowValueCB is passed, a -/// default one is used that will make sure we visit every value only once. Once -/// we cannot look through the value any further, the callback \p VisitValueCB -/// is invoked and passed the current value and the \p State. To limit how much -/// effort is invested, we will never visit more than \p MaxValues values. -template <typename StateTy> +/// with the "returned" attribute. Once we cannot look through the value any +/// further, the callback \p VisitValueCB is invoked and passed the current +/// value, the \p State, and a flag to indicate if we stripped anything. To +/// limit how much effort is invested, we will never visit more values than +/// specified by \p MaxValues. 
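// Minimal usage sketch (illustrative only; the concrete AA type and the state
// accumulation are hypothetical): a caller of genericValueTraversal, declared
// just below, passes a state object plus a callback that folds each leaf value
// into it, e.g.
//
//   BooleanState S;
//   auto VisitValueCB = [](Value &V, BooleanState &S, bool Stripped) -> bool {
//     // ... fold what is known about the leaf value V into S ...
//     return S.isValidState(); // returning false aborts the whole traversal
//   };
//   genericValueTraversal<AASomething, BooleanState>(
//       A, getIRPosition(), *this, S, VisitValueCB);
//
// The querying attribute is only needed so the traversal can consult liveness
// (AAIsDead) and record an optional dependence on it.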
+template <typename AAType, typename StateTy> static bool genericValueTraversal( - Value *InitV, StateTy &State, visitValueCB_t<StateTy> &VisitValueCB, - followValueCB_t<StateTy> *FollowValueCB = nullptr, int MaxValues = 8) { - + Attributor &A, IRPosition IRP, const AAType &QueryingAA, StateTy &State, + const function_ref<bool(Value &, StateTy &, bool)> &VisitValueCB, + int MaxValues = 8) { + + const AAIsDead *LivenessAA = nullptr; + if (IRP.getAnchorScope()) + LivenessAA = &A.getAAFor<AAIsDead>( + QueryingAA, IRPosition::function(*IRP.getAnchorScope()), + /* TrackDependence */ false); + bool AnyDead = false; + + // TODO: Use Positions here to allow context sensitivity in VisitValueCB SmallPtrSet<Value *, 16> Visited; - followValueCB_t<bool> DefaultFollowValueCB = [&](Value *Val, bool &) { - return Visited.insert(Val).second; - }; - - if (!FollowValueCB) - FollowValueCB = &DefaultFollowValueCB; - SmallVector<Value *, 16> Worklist; - Worklist.push_back(InitV); + Worklist.push_back(&IRP.getAssociatedValue()); int Iteration = 0; do { @@ -174,7 +286,7 @@ static bool genericValueTraversal( // Check if we should process the current value. To prevent endless // recursion keep a record of the values we followed! - if (!(*FollowValueCB)(V, State)) + if (!Visited.insert(V).second) continue; // Make sure we limit the compile time for complex expressions. @@ -183,23 +295,23 @@ static bool genericValueTraversal( // Explicitly look through calls with a "returned" attribute if we do // not have a pointer as stripPointerCasts only works on them. + Value *NewV = nullptr; if (V->getType()->isPointerTy()) { - V = V->stripPointerCasts(); + NewV = V->stripPointerCasts(); } else { CallSite CS(V); if (CS && CS.getCalledFunction()) { - Value *NewV = nullptr; for (Argument &Arg : CS.getCalledFunction()->args()) if (Arg.hasReturnedAttr()) { NewV = CS.getArgOperand(Arg.getArgNo()); break; } - if (NewV) { - Worklist.push_back(NewV); - continue; - } } } + if (NewV && NewV != V) { + Worklist.push_back(NewV); + continue; + } // Look through select instructions, visit both potential values. if (auto *SI = dyn_cast<SelectInst>(V)) { @@ -208,35 +320,34 @@ static bool genericValueTraversal( continue; } - // Look through phi nodes, visit all operands. + // Look through phi nodes, visit all live operands. if (auto *PHI = dyn_cast<PHINode>(V)) { - Worklist.append(PHI->op_begin(), PHI->op_end()); + assert(LivenessAA && + "Expected liveness in the presence of instructions!"); + for (unsigned u = 0, e = PHI->getNumIncomingValues(); u < e; u++) { + const BasicBlock *IncomingBB = PHI->getIncomingBlock(u); + if (LivenessAA->isAssumedDead(IncomingBB->getTerminator())) { + AnyDead = true; + continue; + } + Worklist.push_back(PHI->getIncomingValue(u)); + } continue; } // Once a leaf is reached we inform the user through the callback. - VisitValueCB(V, State); + if (!VisitValueCB(*V, State, Iteration > 1)) + return false; } while (!Worklist.empty()); + // If we actually used liveness information so we have to record a dependence. + if (AnyDead) + A.recordDependence(*LivenessAA, QueryingAA, DepClassTy::OPTIONAL); + // All values have been visited. return true; } -/// Helper to identify the correct offset into an attribute list. 
-static unsigned getAttrIndex(AbstractAttribute::ManifestPosition MP, - unsigned ArgNo = 0) { - switch (MP) { - case AbstractAttribute::MP_ARGUMENT: - case AbstractAttribute::MP_CALL_SITE_ARGUMENT: - return ArgNo + AttributeList::FirstArgIndex; - case AbstractAttribute::MP_FUNCTION: - return AttributeList::FunctionIndex; - case AbstractAttribute::MP_RETURNED: - return AttributeList::ReturnIndex; - } - llvm_unreachable("Unknown manifest position!"); -} - /// Return true if \p New is equal or worse than \p Old. static bool isEqualOrWorse(const Attribute &New, const Attribute &Old) { if (!Old.isIntAttribute()) @@ -247,12 +358,9 @@ static bool isEqualOrWorse(const Attribute &New, const Attribute &Old) { /// Return true if the information provided by \p Attr was added to the /// attribute list \p Attrs. This is only the case if it was not already present -/// in \p Attrs at the position describe by \p MP and \p ArgNo. +/// in \p Attrs at the position describe by \p PK and \p AttrIdx. static bool addIfNotExistent(LLVMContext &Ctx, const Attribute &Attr, - AttributeList &Attrs, - AbstractAttribute::ManifestPosition MP, - unsigned ArgNo = 0) { - unsigned AttrIdx = getAttrIndex(MP, ArgNo); + AttributeList &Attrs, int AttrIdx) { if (Attr.isEnumAttribute()) { Attribute::AttrKind Kind = Attr.getKindAsEnum(); @@ -270,10 +378,32 @@ static bool addIfNotExistent(LLVMContext &Ctx, const Attribute &Attr, Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr); return true; } + if (Attr.isIntAttribute()) { + Attribute::AttrKind Kind = Attr.getKindAsEnum(); + if (Attrs.hasAttribute(AttrIdx, Kind)) + if (isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind))) + return false; + Attrs = Attrs.removeAttribute(Ctx, AttrIdx, Kind); + Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr); + return true; + } llvm_unreachable("Expected enum or string attribute!"); } +static const Value * +getBasePointerOfAccessPointerOperand(const Instruction *I, int64_t &BytesOffset, + const DataLayout &DL, + bool AllowNonInbounds = false) { + const Value *Ptr = + Attributor::getPointerOperand(I, /* AllowVolatile */ false); + if (!Ptr) + return nullptr; + + return GetPointerBaseWithConstantOffset(Ptr, BytesOffset, DL, + AllowNonInbounds); +} + ChangeStatus AbstractAttribute::update(Attributor &A) { ChangeStatus HasChanged = ChangeStatus::UNCHANGED; if (getState().isAtFixpoint()) @@ -289,143 +419,515 @@ ChangeStatus AbstractAttribute::update(Attributor &A) { return HasChanged; } -ChangeStatus AbstractAttribute::manifest(Attributor &A) { - assert(getState().isValidState() && - "Attempted to manifest an invalid state!"); - assert(getAssociatedValue() && - "Attempted to manifest an attribute without associated value!"); - - ChangeStatus HasChanged = ChangeStatus::UNCHANGED; - SmallVector<Attribute, 4> DeducedAttrs; - getDeducedAttributes(DeducedAttrs); - - Function &ScopeFn = getAnchorScope(); - LLVMContext &Ctx = ScopeFn.getContext(); - ManifestPosition MP = getManifestPosition(); - - AttributeList Attrs; - SmallVector<unsigned, 4> ArgNos; +ChangeStatus +IRAttributeManifest::manifestAttrs(Attributor &A, const IRPosition &IRP, + const ArrayRef<Attribute> &DeducedAttrs) { + Function *ScopeFn = IRP.getAssociatedFunction(); + IRPosition::Kind PK = IRP.getPositionKind(); // In the following some generic code that will manifest attributes in // DeducedAttrs if they improve the current IR. Due to the different // annotation positions we use the underlying AttributeList interface. - // Note that MP_CALL_SITE_ARGUMENT can annotate multiple locations. 
- switch (MP) { - case MP_ARGUMENT: - ArgNos.push_back(cast<Argument>(getAssociatedValue())->getArgNo()); - Attrs = ScopeFn.getAttributes(); + AttributeList Attrs; + switch (PK) { + case IRPosition::IRP_INVALID: + case IRPosition::IRP_FLOAT: + return ChangeStatus::UNCHANGED; + case IRPosition::IRP_ARGUMENT: + case IRPosition::IRP_FUNCTION: + case IRPosition::IRP_RETURNED: + Attrs = ScopeFn->getAttributes(); break; - case MP_FUNCTION: - case MP_RETURNED: - ArgNos.push_back(0); - Attrs = ScopeFn.getAttributes(); + case IRPosition::IRP_CALL_SITE: + case IRPosition::IRP_CALL_SITE_RETURNED: + case IRPosition::IRP_CALL_SITE_ARGUMENT: + Attrs = ImmutableCallSite(&IRP.getAnchorValue()).getAttributes(); break; - case MP_CALL_SITE_ARGUMENT: { - CallSite CS(&getAnchoredValue()); - for (unsigned u = 0, e = CS.getNumArgOperands(); u != e; u++) - if (CS.getArgOperand(u) == getAssociatedValue()) - ArgNos.push_back(u); - Attrs = CS.getAttributes(); - } } + ChangeStatus HasChanged = ChangeStatus::UNCHANGED; + LLVMContext &Ctx = IRP.getAnchorValue().getContext(); for (const Attribute &Attr : DeducedAttrs) { - for (unsigned ArgNo : ArgNos) { - if (!addIfNotExistent(Ctx, Attr, Attrs, MP, ArgNo)) - continue; + if (!addIfNotExistent(Ctx, Attr, Attrs, IRP.getAttrIdx())) + continue; - HasChanged = ChangeStatus::CHANGED; - bookkeeping(MP, Attr); - } + HasChanged = ChangeStatus::CHANGED; } if (HasChanged == ChangeStatus::UNCHANGED) return HasChanged; - switch (MP) { - case MP_ARGUMENT: - case MP_FUNCTION: - case MP_RETURNED: - ScopeFn.setAttributes(Attrs); + switch (PK) { + case IRPosition::IRP_ARGUMENT: + case IRPosition::IRP_FUNCTION: + case IRPosition::IRP_RETURNED: + ScopeFn->setAttributes(Attrs); + break; + case IRPosition::IRP_CALL_SITE: + case IRPosition::IRP_CALL_SITE_RETURNED: + case IRPosition::IRP_CALL_SITE_ARGUMENT: + CallSite(&IRP.getAnchorValue()).setAttributes(Attrs); + break; + case IRPosition::IRP_INVALID: + case IRPosition::IRP_FLOAT: break; - case MP_CALL_SITE_ARGUMENT: - CallSite(&getAnchoredValue()).setAttributes(Attrs); } return HasChanged; } -Function &AbstractAttribute::getAnchorScope() { - Value &V = getAnchoredValue(); - if (isa<Function>(V)) - return cast<Function>(V); - if (isa<Argument>(V)) - return *cast<Argument>(V).getParent(); - if (isa<Instruction>(V)) - return *cast<Instruction>(V).getFunction(); - llvm_unreachable("No scope for anchored value found!"); +const IRPosition IRPosition::EmptyKey(255); +const IRPosition IRPosition::TombstoneKey(256); + +SubsumingPositionIterator::SubsumingPositionIterator(const IRPosition &IRP) { + IRPositions.emplace_back(IRP); + + ImmutableCallSite ICS(&IRP.getAnchorValue()); + switch (IRP.getPositionKind()) { + case IRPosition::IRP_INVALID: + case IRPosition::IRP_FLOAT: + case IRPosition::IRP_FUNCTION: + return; + case IRPosition::IRP_ARGUMENT: + case IRPosition::IRP_RETURNED: + IRPositions.emplace_back( + IRPosition::function(*IRP.getAssociatedFunction())); + return; + case IRPosition::IRP_CALL_SITE: + assert(ICS && "Expected call site!"); + // TODO: We need to look at the operand bundles similar to the redirection + // in CallBase. + if (!ICS.hasOperandBundles()) + if (const Function *Callee = ICS.getCalledFunction()) + IRPositions.emplace_back(IRPosition::function(*Callee)); + return; + case IRPosition::IRP_CALL_SITE_RETURNED: + assert(ICS && "Expected call site!"); + // TODO: We need to look at the operand bundles similar to the redirection + // in CallBase. 
+ if (!ICS.hasOperandBundles()) { + if (const Function *Callee = ICS.getCalledFunction()) { + IRPositions.emplace_back(IRPosition::returned(*Callee)); + IRPositions.emplace_back(IRPosition::function(*Callee)); + } + } + IRPositions.emplace_back( + IRPosition::callsite_function(cast<CallBase>(*ICS.getInstruction()))); + return; + case IRPosition::IRP_CALL_SITE_ARGUMENT: { + int ArgNo = IRP.getArgNo(); + assert(ICS && ArgNo >= 0 && "Expected call site!"); + // TODO: We need to look at the operand bundles similar to the redirection + // in CallBase. + if (!ICS.hasOperandBundles()) { + const Function *Callee = ICS.getCalledFunction(); + if (Callee && Callee->arg_size() > unsigned(ArgNo)) + IRPositions.emplace_back(IRPosition::argument(*Callee->getArg(ArgNo))); + if (Callee) + IRPositions.emplace_back(IRPosition::function(*Callee)); + } + IRPositions.emplace_back(IRPosition::value(IRP.getAssociatedValue())); + return; + } + } } -const Function &AbstractAttribute::getAnchorScope() const { - return const_cast<AbstractAttribute *>(this)->getAnchorScope(); +bool IRPosition::hasAttr(ArrayRef<Attribute::AttrKind> AKs, + bool IgnoreSubsumingPositions) const { + for (const IRPosition &EquivIRP : SubsumingPositionIterator(*this)) { + for (Attribute::AttrKind AK : AKs) + if (EquivIRP.getAttr(AK).getKindAsEnum() == AK) + return true; + // The first position returned by the SubsumingPositionIterator is + // always the position itself. If we ignore subsuming positions we + // are done after the first iteration. + if (IgnoreSubsumingPositions) + break; + } + return false; } -/// -----------------------NoUnwind Function Attribute-------------------------- +void IRPosition::getAttrs(ArrayRef<Attribute::AttrKind> AKs, + SmallVectorImpl<Attribute> &Attrs, + bool IgnoreSubsumingPositions) const { + for (const IRPosition &EquivIRP : SubsumingPositionIterator(*this)) { + for (Attribute::AttrKind AK : AKs) { + const Attribute &Attr = EquivIRP.getAttr(AK); + if (Attr.getKindAsEnum() == AK) + Attrs.push_back(Attr); + } + // The first position returned by the SubsumingPositionIterator is + // always the position itself. If we ignore subsuming positions we + // are done after the first iteration. 
+ if (IgnoreSubsumingPositions) + break; + } +} -struct AANoUnwindFunction : AANoUnwind, BooleanState { +void IRPosition::verify() { + switch (KindOrArgNo) { + default: + assert(KindOrArgNo >= 0 && "Expected argument or call site argument!"); + assert((isa<CallBase>(AnchorVal) || isa<Argument>(AnchorVal)) && + "Expected call base or argument for positive attribute index!"); + if (isa<Argument>(AnchorVal)) { + assert(cast<Argument>(AnchorVal)->getArgNo() == unsigned(getArgNo()) && + "Argument number mismatch!"); + assert(cast<Argument>(AnchorVal) == &getAssociatedValue() && + "Associated value mismatch!"); + } else { + assert(cast<CallBase>(*AnchorVal).arg_size() > unsigned(getArgNo()) && + "Call site argument number mismatch!"); + assert(cast<CallBase>(*AnchorVal).getArgOperand(getArgNo()) == + &getAssociatedValue() && + "Associated value mismatch!"); + } + break; + case IRP_INVALID: + assert(!AnchorVal && "Expected no value for an invalid position!"); + break; + case IRP_FLOAT: + assert((!isa<CallBase>(&getAssociatedValue()) && + !isa<Argument>(&getAssociatedValue())) && + "Expected specialized kind for call base and argument values!"); + break; + case IRP_RETURNED: + assert(isa<Function>(AnchorVal) && + "Expected function for a 'returned' position!"); + assert(AnchorVal == &getAssociatedValue() && "Associated value mismatch!"); + break; + case IRP_CALL_SITE_RETURNED: + assert((isa<CallBase>(AnchorVal)) && + "Expected call base for 'call site returned' position!"); + assert(AnchorVal == &getAssociatedValue() && "Associated value mismatch!"); + break; + case IRP_CALL_SITE: + assert((isa<CallBase>(AnchorVal)) && + "Expected call base for 'call site function' position!"); + assert(AnchorVal == &getAssociatedValue() && "Associated value mismatch!"); + break; + case IRP_FUNCTION: + assert(isa<Function>(AnchorVal) && + "Expected function for a 'function' position!"); + assert(AnchorVal == &getAssociatedValue() && "Associated value mismatch!"); + break; + } +} - AANoUnwindFunction(Function &F, InformationCache &InfoCache) - : AANoUnwind(F, InfoCache) {} +namespace { +/// Helper function to clamp a state \p S of type \p StateType with the +/// information in \p R and indicate/return if \p S did change (as-in update is +/// required to be run again). +template <typename StateType> +ChangeStatus clampStateAndIndicateChange(StateType &S, const StateType &R) { + auto Assumed = S.getAssumed(); + S ^= R; + return Assumed == S.getAssumed() ? ChangeStatus::UNCHANGED + : ChangeStatus::CHANGED; +} - /// See AbstractAttribute::getState() - /// { - AbstractState &getState() override { return *this; } - const AbstractState &getState() const override { return *this; } - /// } +/// Clamp the information known for all returned values of a function +/// (identified by \p QueryingAA) into \p S. +template <typename AAType, typename StateType = typename AAType::StateType> +static void clampReturnedValueStates(Attributor &A, const AAType &QueryingAA, + StateType &S) { + LLVM_DEBUG(dbgs() << "[Attributor] Clamp return value states for " + << QueryingAA << " into " << S << "\n"); + + assert((QueryingAA.getIRPosition().getPositionKind() == + IRPosition::IRP_RETURNED || + QueryingAA.getIRPosition().getPositionKind() == + IRPosition::IRP_CALL_SITE_RETURNED) && + "Can only clamp returned value states for a function returned or call " + "site returned position!"); + + // Use an optional state as there might not be any return values and we want + // to join (IntegerState::operator&) the state of all there are. 
+  Optional<StateType> T;
+
+  // Callback for each possibly returned value.
+  auto CheckReturnValue = [&](Value &RV) -> bool {
+    const IRPosition &RVPos = IRPosition::value(RV);
+    const AAType &AA = A.getAAFor<AAType>(QueryingAA, RVPos);
+    LLVM_DEBUG(dbgs() << "[Attributor] RV: " << RV << " AA: " << AA.getAsStr()
+                      << " @ " << RVPos << "\n");
+    const StateType &AAS = static_cast<const StateType &>(AA.getState());
+    if (T.hasValue())
+      *T &= AAS;
+    else
+      T = AAS;
+    LLVM_DEBUG(dbgs() << "[Attributor] AA State: " << AAS << " RV State: " << T
+                      << "\n");
+    return T->isValidState();
+  };
-  /// See AbstractAttribute::getManifestPosition().
-  ManifestPosition getManifestPosition() const override { return MP_FUNCTION; }
+  if (!A.checkForAllReturnedValues(CheckReturnValue, QueryingAA))
+    S.indicatePessimisticFixpoint();
+  else if (T.hasValue())
+    S ^= *T;
+}
-  const std::string getAsStr() const override {
-    return getAssumed() ? "nounwind" : "may-unwind";
+/// Helper class to compose two generic deductions.
+template <typename AAType, typename Base, typename StateType,
+          template <typename...> class F, template <typename...> class G>
+struct AAComposeTwoGenericDeduction
+    : public F<AAType, G<AAType, Base, StateType>, StateType> {
+  AAComposeTwoGenericDeduction(const IRPosition &IRP)
+      : F<AAType, G<AAType, Base, StateType>, StateType>(IRP) {}
+
+  /// See AbstractAttribute::updateImpl(...).
+  ChangeStatus updateImpl(Attributor &A) override {
+    ChangeStatus ChangedF =
+        F<AAType, G<AAType, Base, StateType>, StateType>::updateImpl(A);
+    ChangeStatus ChangedG = G<AAType, Base, StateType>::updateImpl(A);
+    return ChangedF | ChangedG;
+  }
+};
+
+/// Helper class for generic deduction: return value -> returned position.
+template <typename AAType, typename Base,
+          typename StateType = typename AAType::StateType>
+struct AAReturnedFromReturnedValues : public Base {
+  AAReturnedFromReturnedValues(const IRPosition &IRP) : Base(IRP) {}
   /// See AbstractAttribute::updateImpl(...).
-  ChangeStatus updateImpl(Attributor &A) override;
+  ChangeStatus updateImpl(Attributor &A) override {
+    StateType S;
+    clampReturnedValueStates<AAType, StateType>(A, *this, S);
+    // TODO: If we know we visited all returned values, thus none are assumed
+    // dead, we can take the known information from the state T.
+    return clampStateAndIndicateChange<StateType>(this->getState(), S);
+  }
+};
+
+/// Clamp the information known at all call sites for a given argument
+/// (identified by \p QueryingAA) into \p S.
+template <typename AAType, typename StateType = typename AAType::StateType>
+static void clampCallSiteArgumentStates(Attributor &A, const AAType &QueryingAA,
+                                        StateType &S) {
+  LLVM_DEBUG(dbgs() << "[Attributor] Clamp call site argument states for "
+                    << QueryingAA << " into " << S << "\n");
+
+  assert(QueryingAA.getIRPosition().getPositionKind() ==
+             IRPosition::IRP_ARGUMENT &&
+         "Can only clamp call site argument states for an argument position!");
+
+  // Use an optional state as there might not be any return values and we want
+  // to join (IntegerState::operator&) the state of all there are.
+  Optional<StateType> T;
+
+  // The argument number which is also the call site argument number.
+  unsigned ArgNo = QueryingAA.getIRPosition().getArgNo();
+
+  auto CallSiteCheck = [&](AbstractCallSite ACS) {
+    const IRPosition &ACSArgPos = IRPosition::callsite_argument(ACS, ArgNo);
+    // Check if a corresponding argument was found or if it is not associated
+    // (which can happen for callback calls).
+ if (ACSArgPos.getPositionKind() == IRPosition::IRP_INVALID) + return false; + + const AAType &AA = A.getAAFor<AAType>(QueryingAA, ACSArgPos); + LLVM_DEBUG(dbgs() << "[Attributor] ACS: " << *ACS.getInstruction() + << " AA: " << AA.getAsStr() << " @" << ACSArgPos << "\n"); + const StateType &AAS = static_cast<const StateType &>(AA.getState()); + if (T.hasValue()) + *T &= AAS; + else + T = AAS; + LLVM_DEBUG(dbgs() << "[Attributor] AA State: " << AAS << " CSA State: " << T + << "\n"); + return T->isValidState(); + }; + + if (!A.checkForAllCallSites(CallSiteCheck, QueryingAA, true)) + S.indicatePessimisticFixpoint(); + else if (T.hasValue()) + S ^= *T; +} - /// See AANoUnwind::isAssumedNoUnwind(). - bool isAssumedNoUnwind() const override { return getAssumed(); } +/// Helper class for generic deduction: call site argument -> argument position. +template <typename AAType, typename Base, + typename StateType = typename AAType::StateType> +struct AAArgumentFromCallSiteArguments : public Base { + AAArgumentFromCallSiteArguments(const IRPosition &IRP) : Base(IRP) {} - /// See AANoUnwind::isKnownNoUnwind(). - bool isKnownNoUnwind() const override { return getKnown(); } + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + StateType S; + clampCallSiteArgumentStates<AAType, StateType>(A, *this, S); + // TODO: If we know we visited all incoming values, thus no are assumed + // dead, we can take the known information from the state T. + return clampStateAndIndicateChange<StateType>(this->getState(), S); + } }; -ChangeStatus AANoUnwindFunction::updateImpl(Attributor &A) { - Function &F = getAnchorScope(); +/// Helper class for generic replication: function returned -> cs returned. +template <typename AAType, typename Base, + typename StateType = typename AAType::StateType> +struct AACallSiteReturnedFromReturned : public Base { + AACallSiteReturnedFromReturned(const IRPosition &IRP) : Base(IRP) {} - // The map from instruction opcodes to those instructions in the function. - auto &OpcodeInstMap = InfoCache.getOpcodeInstMapForFunction(F); - auto Opcodes = { - (unsigned)Instruction::Invoke, (unsigned)Instruction::CallBr, - (unsigned)Instruction::Call, (unsigned)Instruction::CleanupRet, - (unsigned)Instruction::CatchSwitch, (unsigned)Instruction::Resume}; + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + assert(this->getIRPosition().getPositionKind() == + IRPosition::IRP_CALL_SITE_RETURNED && + "Can only wrap function returned positions for call site returned " + "positions!"); + auto &S = this->getState(); + + const Function *AssociatedFunction = + this->getIRPosition().getAssociatedFunction(); + if (!AssociatedFunction) + return S.indicatePessimisticFixpoint(); + + IRPosition FnPos = IRPosition::returned(*AssociatedFunction); + const AAType &AA = A.getAAFor<AAType>(*this, FnPos); + return clampStateAndIndicateChange( + S, static_cast<const typename AAType::StateType &>(AA.getState())); + } +}; - for (unsigned Opcode : Opcodes) { - for (Instruction *I : OpcodeInstMap[Opcode]) { - if (!I->mayThrow()) - continue; +/// Helper class for generic deduction using must-be-executed-context +/// Base class is required to have `followUse` method. - auto *NoUnwindAA = A.getAAFor<AANoUnwind>(*this, *I); +/// bool followUse(Attributor &A, const Use *U, const Instruction *I) +/// U - Underlying use. +/// I - The user of the \p U. +/// `followUse` returns true if the value should be tracked transitively. 
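// Illustrative sketch (hypothetical, not prescribed by the interface beyond
// its signature): a Base class plugged into AAFromMustBeExecutedContext below
// would provide something like
//
//   bool followUse(Attributor &A, const Use *U, const Instruction *I) {
//     // Record that *I, which uses the associated value via *U, is known to
//     // execute whenever the associated value is reached, and update the
//     // state accordingly (hypothetical state update).
//     return true; // keep expanding into the uses of I transitively
//   }
//
// Returning false stops the transitive tracking for that particular user.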
- if (!NoUnwindAA || !NoUnwindAA->isAssumedNoUnwind()) { - indicatePessimisticFixpoint(); - return ChangeStatus::CHANGED; +template <typename AAType, typename Base, + typename StateType = typename AAType::StateType> +struct AAFromMustBeExecutedContext : public Base { + AAFromMustBeExecutedContext(const IRPosition &IRP) : Base(IRP) {} + + void initialize(Attributor &A) override { + Base::initialize(A); + const IRPosition &IRP = this->getIRPosition(); + Instruction *CtxI = IRP.getCtxI(); + + if (!CtxI) + return; + + for (const Use &U : IRP.getAssociatedValue().uses()) + Uses.insert(&U); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + auto BeforeState = this->getState(); + auto &S = this->getState(); + Instruction *CtxI = this->getIRPosition().getCtxI(); + if (!CtxI) + return ChangeStatus::UNCHANGED; + + MustBeExecutedContextExplorer &Explorer = + A.getInfoCache().getMustBeExecutedContextExplorer(); + + auto EIt = Explorer.begin(CtxI), EEnd = Explorer.end(CtxI); + for (unsigned u = 0; u < Uses.size(); ++u) { + const Use *U = Uses[u]; + if (const Instruction *UserI = dyn_cast<Instruction>(U->getUser())) { + bool Found = Explorer.findInContextOf(UserI, EIt, EEnd); + if (Found && Base::followUse(A, U, UserI)) + for (const Use &Us : UserI->uses()) + Uses.insert(&Us); } } + + return BeforeState == S ? ChangeStatus::UNCHANGED : ChangeStatus::CHANGED; } - return ChangeStatus::UNCHANGED; -} + +private: + /// Container for (transitive) uses of the associated value. + SetVector<const Use *> Uses; +}; + +template <typename AAType, typename Base, + typename StateType = typename AAType::StateType> +using AAArgumentFromCallSiteArgumentsAndMustBeExecutedContext = + AAComposeTwoGenericDeduction<AAType, Base, StateType, + AAFromMustBeExecutedContext, + AAArgumentFromCallSiteArguments>; + +template <typename AAType, typename Base, + typename StateType = typename AAType::StateType> +using AACallSiteReturnedFromReturnedAndMustBeExecutedContext = + AAComposeTwoGenericDeduction<AAType, Base, StateType, + AAFromMustBeExecutedContext, + AACallSiteReturnedFromReturned>; + +/// -----------------------NoUnwind Function Attribute-------------------------- + +struct AANoUnwindImpl : AANoUnwind { + AANoUnwindImpl(const IRPosition &IRP) : AANoUnwind(IRP) {} + + const std::string getAsStr() const override { + return getAssumed() ? "nounwind" : "may-unwind"; + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + auto Opcodes = { + (unsigned)Instruction::Invoke, (unsigned)Instruction::CallBr, + (unsigned)Instruction::Call, (unsigned)Instruction::CleanupRet, + (unsigned)Instruction::CatchSwitch, (unsigned)Instruction::Resume}; + + auto CheckForNoUnwind = [&](Instruction &I) { + if (!I.mayThrow()) + return true; + + if (ImmutableCallSite ICS = ImmutableCallSite(&I)) { + const auto &NoUnwindAA = + A.getAAFor<AANoUnwind>(*this, IRPosition::callsite_function(ICS)); + return NoUnwindAA.isAssumedNoUnwind(); + } + return false; + }; + + if (!A.checkForAllInstructions(CheckForNoUnwind, *this, Opcodes)) + return indicatePessimisticFixpoint(); + + return ChangeStatus::UNCHANGED; + } +}; + +struct AANoUnwindFunction final : public AANoUnwindImpl { + AANoUnwindFunction(const IRPosition &IRP) : AANoUnwindImpl(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(nounwind) } +}; + +/// NoUnwind attribute deduction for a call sites. 
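+/// For now this simply mirrors the nounwind state of the callee function.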
+struct AANoUnwindCallSite final : AANoUnwindImpl { + AANoUnwindCallSite(const IRPosition &IRP) : AANoUnwindImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AANoUnwindImpl::initialize(A); + Function *F = getAssociatedFunction(); + if (!F) + indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. + Function *F = getAssociatedFunction(); + const IRPosition &FnPos = IRPosition::function(*F); + auto &FnAA = A.getAAFor<AANoUnwind>(*this, FnPos); + return clampStateAndIndicateChange( + getState(), + static_cast<const AANoUnwind::StateType &>(FnAA.getState())); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(nounwind); } +}; /// --------------------- Function Return Values ------------------------------- @@ -434,68 +936,48 @@ ChangeStatus AANoUnwindFunction::updateImpl(Attributor &A) { /// /// If there is a unique returned value R, the manifest method will: /// - mark R with the "returned" attribute, if R is an argument. -class AAReturnedValuesImpl final : public AAReturnedValues, AbstractState { +class AAReturnedValuesImpl : public AAReturnedValues, public AbstractState { /// Mapping of values potentially returned by the associated function to the /// return instructions that might return them. - DenseMap<Value *, SmallPtrSet<ReturnInst *, 2>> ReturnedValues; + MapVector<Value *, SmallSetVector<ReturnInst *, 4>> ReturnedValues; + + /// Mapping to remember the number of returned values for a call site such + /// that we can avoid updates if nothing changed. + DenseMap<const CallBase *, unsigned> NumReturnedValuesPerKnownAA; + + /// Set of unresolved calls returned by the associated function. + SmallSetVector<CallBase *, 4> UnresolvedCalls; /// State flags /// ///{ - bool IsFixed; - bool IsValidState; - bool HasOverdefinedReturnedCalls; + bool IsFixed = false; + bool IsValidState = true; ///} - /// Collect values that could become \p V in the set \p Values, each mapped to - /// \p ReturnInsts. - void collectValuesRecursively( - Attributor &A, Value *V, SmallPtrSetImpl<ReturnInst *> &ReturnInsts, - DenseMap<Value *, SmallPtrSet<ReturnInst *, 2>> &Values) { - - visitValueCB_t<bool> VisitValueCB = [&](Value *Val, bool &) { - assert(!isa<Instruction>(Val) || - &getAnchorScope() == cast<Instruction>(Val)->getFunction()); - Values[Val].insert(ReturnInsts.begin(), ReturnInsts.end()); - }; - - bool UnusedBool; - bool Success = genericValueTraversal(V, UnusedBool, VisitValueCB); - - // If we did abort the above traversal we haven't see all the values. - // Consequently, we cannot know if the information we would derive is - // accurate so we give up early. - if (!Success) - indicatePessimisticFixpoint(); - } - public: - /// See AbstractAttribute::AbstractAttribute(...). - AAReturnedValuesImpl(Function &F, InformationCache &InfoCache) - : AAReturnedValues(F, InfoCache) { - // We do not have an associated argument yet. - AssociatedVal = nullptr; - } + AAReturnedValuesImpl(const IRPosition &IRP) : AAReturnedValues(IRP) {} /// See AbstractAttribute::initialize(...). void initialize(Attributor &A) override { // Reset the state. 
- AssociatedVal = nullptr; IsFixed = false; IsValidState = true; - HasOverdefinedReturnedCalls = false; ReturnedValues.clear(); - Function &F = cast<Function>(getAnchoredValue()); + Function *F = getAssociatedFunction(); + if (!F) { + indicatePessimisticFixpoint(); + return; + } // The map from instruction opcodes to those instructions in the function. - auto &OpcodeInstMap = InfoCache.getOpcodeInstMapForFunction(F); + auto &OpcodeInstMap = A.getInfoCache().getOpcodeInstMapForFunction(*F); // Look through all arguments, if one is marked as returned we are done. - for (Argument &Arg : F.args()) { + for (Argument &Arg : F->args()) { if (Arg.hasReturnedAttr()) { - auto &ReturnInstSet = ReturnedValues[&Arg]; for (Instruction *RI : OpcodeInstMap[Instruction::Ret]) ReturnInstSet.insert(cast<ReturnInst>(RI)); @@ -505,13 +987,8 @@ public: } } - // If no argument was marked as returned we look at all return instructions - // and collect potentially returned values. - for (Instruction *RI : OpcodeInstMap[Instruction::Ret]) { - SmallPtrSet<ReturnInst *, 1> RISet({cast<ReturnInst>(RI)}); - collectValuesRecursively(A, cast<ReturnInst>(RI)->getReturnValue(), RISet, - ReturnedValues); - } + if (!F->hasExactDefinition()) + indicatePessimisticFixpoint(); } /// See AbstractAttribute::manifest(...). @@ -523,25 +1000,35 @@ public: /// See AbstractAttribute::getState(...). const AbstractState &getState() const override { return *this; } - /// See AbstractAttribute::getManifestPosition(). - ManifestPosition getManifestPosition() const override { return MP_ARGUMENT; } - /// See AbstractAttribute::updateImpl(Attributor &A). ChangeStatus updateImpl(Attributor &A) override; + llvm::iterator_range<iterator> returned_values() override { + return llvm::make_range(ReturnedValues.begin(), ReturnedValues.end()); + } + + llvm::iterator_range<const_iterator> returned_values() const override { + return llvm::make_range(ReturnedValues.begin(), ReturnedValues.end()); + } + + const SmallSetVector<CallBase *, 4> &getUnresolvedCalls() const override { + return UnresolvedCalls; + } + /// Return the number of potential return values, -1 if unknown. - size_t getNumReturnValues() const { + size_t getNumReturnValues() const override { return isValidState() ? ReturnedValues.size() : -1; } /// Return an assumed unique return value if a single candidate is found. If /// there cannot be one, return a nullptr. If it is not clear yet, return the /// Optional::NoneType. - Optional<Value *> getAssumedUniqueReturnValue() const; + Optional<Value *> getAssumedUniqueReturnValue(Attributor &A) const; - /// See AbstractState::checkForallReturnedValues(...). - bool - checkForallReturnedValues(std::function<bool(Value &)> &Pred) const override; + /// See AbstractState::checkForAllReturnedValues(...). + bool checkForAllReturnedValuesAndReturnInsts( + const function_ref<bool(Value &, const SmallSetVector<ReturnInst *, 4> &)> + &Pred) const override; /// Pretty print the attribute similar to the IR representation. const std::string getAsStr() const override; @@ -553,13 +1040,15 @@ public: bool isValidState() const override { return IsValidState; } /// See AbstractState::indicateOptimisticFixpoint(...). 
- void indicateOptimisticFixpoint() override { + ChangeStatus indicateOptimisticFixpoint() override { IsFixed = true; - IsValidState &= true; + return ChangeStatus::UNCHANGED; } - void indicatePessimisticFixpoint() override { + + ChangeStatus indicatePessimisticFixpoint() override { IsFixed = true; IsValidState = false; + return ChangeStatus::CHANGED; } }; @@ -568,21 +1057,58 @@ ChangeStatus AAReturnedValuesImpl::manifest(Attributor &A) { // Bookkeeping. assert(isValidState()); - NumFnKnownReturns++; + STATS_DECLTRACK(KnownReturnValues, FunctionReturn, + "Number of function with known return values"); // Check if we have an assumed unique return value that we could manifest. - Optional<Value *> UniqueRV = getAssumedUniqueReturnValue(); + Optional<Value *> UniqueRV = getAssumedUniqueReturnValue(A); if (!UniqueRV.hasValue() || !UniqueRV.getValue()) return Changed; // Bookkeeping. - NumFnUniqueReturned++; + STATS_DECLTRACK(UniqueReturnValue, FunctionReturn, + "Number of function with unique return"); + + // Callback to replace the uses of CB with the constant C. + auto ReplaceCallSiteUsersWith = [](CallBase &CB, Constant &C) { + if (CB.getNumUses() == 0 || CB.isMustTailCall()) + return ChangeStatus::UNCHANGED; + replaceAllInstructionUsesWith(CB, C); + return ChangeStatus::CHANGED; + }; // If the assumed unique return value is an argument, annotate it. if (auto *UniqueRVArg = dyn_cast<Argument>(UniqueRV.getValue())) { - AssociatedVal = UniqueRVArg; - Changed = AbstractAttribute::manifest(A) | Changed; + // TODO: This should be handled differently! + this->AnchorVal = UniqueRVArg; + this->KindOrArgNo = UniqueRVArg->getArgNo(); + Changed = IRAttribute::manifest(A); + } else if (auto *RVC = dyn_cast<Constant>(UniqueRV.getValue())) { + // We can replace the returned value with the unique returned constant. + Value &AnchorValue = getAnchorValue(); + if (Function *F = dyn_cast<Function>(&AnchorValue)) { + for (const Use &U : F->uses()) + if (CallBase *CB = dyn_cast<CallBase>(U.getUser())) + if (CB->isCallee(&U)) { + Constant *RVCCast = + CB->getType() == RVC->getType() + ? RVC + : ConstantExpr::getTruncOrBitCast(RVC, CB->getType()); + Changed = ReplaceCallSiteUsersWith(*CB, *RVCCast) | Changed; + } + } else { + assert(isa<CallBase>(AnchorValue) && + "Expcected a function or call base anchor!"); + Constant *RVCCast = + AnchorValue.getType() == RVC->getType() + ? RVC + : ConstantExpr::getTruncOrBitCast(RVC, AnchorValue.getType()); + Changed = ReplaceCallSiteUsersWith(cast<CallBase>(AnchorValue), *RVCCast); + } + if (Changed == ChangeStatus::CHANGED) + STATS_DECLTRACK(UniqueConstantReturnValue, FunctionReturn, + "Number of function returns replaced by constant return"); } return Changed; @@ -590,18 +1116,20 @@ ChangeStatus AAReturnedValuesImpl::manifest(Attributor &A) { const std::string AAReturnedValuesImpl::getAsStr() const { return (isAtFixpoint() ? "returns(#" : "may-return(#") + - (isValidState() ? std::to_string(getNumReturnValues()) : "?") + ")"; + (isValidState() ? 
std::to_string(getNumReturnValues()) : "?") + + ")[#UC: " + std::to_string(UnresolvedCalls.size()) + "]"; } -Optional<Value *> AAReturnedValuesImpl::getAssumedUniqueReturnValue() const { - // If checkForallReturnedValues provides a unique value, ignoring potential +Optional<Value *> +AAReturnedValuesImpl::getAssumedUniqueReturnValue(Attributor &A) const { + // If checkForAllReturnedValues provides a unique value, ignoring potential // undef values that can also be present, it is assumed to be the actual // return value and forwarded to the caller of this method. If there are // multiple, a nullptr is returned indicating there cannot be a unique // returned value. Optional<Value *> UniqueRV; - std::function<bool(Value &)> Pred = [&](Value &RV) -> bool { + auto Pred = [&](Value &RV) -> bool { // If we found a second returned value and neither the current nor the saved // one is an undef, there is no unique returned value. Undefs are special // since we can pretend they have any value. @@ -618,14 +1146,15 @@ Optional<Value *> AAReturnedValuesImpl::getAssumedUniqueReturnValue() const { return true; }; - if (!checkForallReturnedValues(Pred)) + if (!A.checkForAllReturnedValues(Pred, *this)) UniqueRV = nullptr; return UniqueRV; } -bool AAReturnedValuesImpl::checkForallReturnedValues( - std::function<bool(Value &)> &Pred) const { +bool AAReturnedValuesImpl::checkForAllReturnedValuesAndReturnInsts( + const function_ref<bool(Value &, const SmallSetVector<ReturnInst *, 4> &)> + &Pred) const { if (!isValidState()) return false; @@ -634,11 +1163,11 @@ bool AAReturnedValuesImpl::checkForallReturnedValues( for (auto &It : ReturnedValues) { Value *RV = It.first; - ImmutableCallSite ICS(RV); - if (ICS && !HasOverdefinedReturnedCalls) + CallBase *CB = dyn_cast<CallBase>(RV); + if (CB && !UnresolvedCalls.count(CB)) continue; - if (!Pred(*RV)) + if (!Pred(*RV, It.second)) return false; } @@ -646,125 +1175,195 @@ bool AAReturnedValuesImpl::checkForallReturnedValues( } ChangeStatus AAReturnedValuesImpl::updateImpl(Attributor &A) { + size_t NumUnresolvedCalls = UnresolvedCalls.size(); + bool Changed = false; + + // State used in the value traversals starting in returned values. + struct RVState { + // The map in which we collect return values -> return instrs. + decltype(ReturnedValues) &RetValsMap; + // The flag to indicate a change. + bool &Changed; + // The return instrs we come from. + SmallSetVector<ReturnInst *, 4> RetInsts; + }; - // Check if we know of any values returned by the associated function, - // if not, we are done. - if (getNumReturnValues() == 0) { - indicateOptimisticFixpoint(); - return ChangeStatus::UNCHANGED; - } + // Callback for a leaf value returned by the associated function. + auto VisitValueCB = [](Value &Val, RVState &RVS, bool) -> bool { + auto Size = RVS.RetValsMap[&Val].size(); + RVS.RetValsMap[&Val].insert(RVS.RetInsts.begin(), RVS.RetInsts.end()); + bool Inserted = RVS.RetValsMap[&Val].size() != Size; + RVS.Changed |= Inserted; + LLVM_DEBUG({ + if (Inserted) + dbgs() << "[AAReturnedValues] 1 Add new returned value " << Val + << " => " << RVS.RetInsts.size() << "\n"; + }); + return true; + }; - // Check if any of the returned values is a call site we can refine. - decltype(ReturnedValues) AddRVs; - bool HasCallSite = false; + // Helper method to invoke the generic value traversal. 
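+ // It walks every value that may reach a return and feeds each leaf it finds
+ // into the VisitValueCB callback defined above.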
+ auto VisitReturnedValue = [&](Value &RV, RVState &RVS) { + IRPosition RetValPos = IRPosition::value(RV); + return genericValueTraversal<AAReturnedValues, RVState>(A, RetValPos, *this, + RVS, VisitValueCB); + }; - // Look at all returned call sites. - for (auto &It : ReturnedValues) { - SmallPtrSet<ReturnInst *, 2> &ReturnInsts = It.second; - Value *RV = It.first; - LLVM_DEBUG(dbgs() << "[AAReturnedValues] Potentially returned value " << *RV - << "\n"); + // Callback for all "return intructions" live in the associated function. + auto CheckReturnInst = [this, &VisitReturnedValue, &Changed](Instruction &I) { + ReturnInst &Ret = cast<ReturnInst>(I); + RVState RVS({ReturnedValues, Changed, {}}); + RVS.RetInsts.insert(&Ret); + return VisitReturnedValue(*Ret.getReturnValue(), RVS); + }; - // Only call sites can change during an update, ignore the rest. - CallSite RetCS(RV); - if (!RetCS) + // Start by discovering returned values from all live returned instructions in + // the associated function. + if (!A.checkForAllInstructions(CheckReturnInst, *this, {Instruction::Ret})) + return indicatePessimisticFixpoint(); + + // Once returned values "directly" present in the code are handled we try to + // resolve returned calls. + decltype(ReturnedValues) NewRVsMap; + for (auto &It : ReturnedValues) { + LLVM_DEBUG(dbgs() << "[AAReturnedValues] Returned value: " << *It.first + << " by #" << It.second.size() << " RIs\n"); + CallBase *CB = dyn_cast<CallBase>(It.first); + if (!CB || UnresolvedCalls.count(CB)) continue; - // For now, any call site we see will prevent us from directly fixing the - // state. However, if the information on the callees is fixed, the call - // sites will be removed and we will fix the information for this state. - HasCallSite = true; - - // Try to find a assumed unique return value for the called function. - auto *RetCSAA = A.getAAFor<AAReturnedValuesImpl>(*this, *RV); - if (!RetCSAA) { - HasOverdefinedReturnedCalls = true; - LLVM_DEBUG(dbgs() << "[AAReturnedValues] Returned call site (" << *RV - << ") with " << (RetCSAA ? "invalid" : "no") - << " associated state\n"); + if (!CB->getCalledFunction()) { + LLVM_DEBUG(dbgs() << "[AAReturnedValues] Unresolved call: " << *CB + << "\n"); + UnresolvedCalls.insert(CB); continue; } - // Try to find a assumed unique return value for the called function. - Optional<Value *> AssumedUniqueRV = RetCSAA->getAssumedUniqueReturnValue(); - - // If no assumed unique return value was found due to the lack of - // candidates, we may need to resolve more calls (through more update - // iterations) or the called function will not return. Either way, we simply - // stick with the call sites as return values. Because there were not - // multiple possibilities, we do not treat it as overdefined. - if (!AssumedUniqueRV.hasValue()) + // TODO: use the function scope once we have call site AAReturnedValues. + const auto &RetValAA = A.getAAFor<AAReturnedValues>( + *this, IRPosition::function(*CB->getCalledFunction())); + LLVM_DEBUG(dbgs() << "[AAReturnedValues] Found another AAReturnedValues: " + << RetValAA << "\n"); + + // Skip dead ends, thus if we do not know anything about the returned + // call we mark it as unresolved and it will stay that way. + if (!RetValAA.getState().isValidState()) { + LLVM_DEBUG(dbgs() << "[AAReturnedValues] Unresolved call: " << *CB + << "\n"); + UnresolvedCalls.insert(CB); continue; + } - // If multiple, non-refinable values were found, there cannot be a unique - // return value for the called function. 
The returned call is overdefined! - if (!AssumedUniqueRV.getValue()) { - HasOverdefinedReturnedCalls = true; - LLVM_DEBUG(dbgs() << "[AAReturnedValues] Returned call site has multiple " - "potentially returned values\n"); + // Do not try to learn partial information. If the callee has unresolved + // return values we will treat the call as unresolved/opaque. + auto &RetValAAUnresolvedCalls = RetValAA.getUnresolvedCalls(); + if (!RetValAAUnresolvedCalls.empty()) { + UnresolvedCalls.insert(CB); continue; } - LLVM_DEBUG({ - bool UniqueRVIsKnown = RetCSAA->isAtFixpoint(); - dbgs() << "[AAReturnedValues] Returned call site " - << (UniqueRVIsKnown ? "known" : "assumed") - << " unique return value: " << *AssumedUniqueRV << "\n"; - }); + // Now check if we can track transitively returned values. If possible, thus + // if all return value can be represented in the current scope, do so. + bool Unresolved = false; + for (auto &RetValAAIt : RetValAA.returned_values()) { + Value *RetVal = RetValAAIt.first; + if (isa<Argument>(RetVal) || isa<CallBase>(RetVal) || + isa<Constant>(RetVal)) + continue; + // Anything that did not fit in the above categories cannot be resolved, + // mark the call as unresolved. + LLVM_DEBUG(dbgs() << "[AAReturnedValues] transitively returned value " + "cannot be translated: " + << *RetVal << "\n"); + UnresolvedCalls.insert(CB); + Unresolved = true; + break; + } - // The assumed unique return value. - Value *AssumedRetVal = AssumedUniqueRV.getValue(); - - // If the assumed unique return value is an argument, lookup the matching - // call site operand and recursively collect new returned values. - // If it is not an argument, it is just put into the set of returned values - // as we would have already looked through casts, phis, and similar values. - if (Argument *AssumedRetArg = dyn_cast<Argument>(AssumedRetVal)) - collectValuesRecursively(A, - RetCS.getArgOperand(AssumedRetArg->getArgNo()), - ReturnInsts, AddRVs); - else - AddRVs[AssumedRetVal].insert(ReturnInsts.begin(), ReturnInsts.end()); - } + if (Unresolved) + continue; - // Keep track of any change to trigger updates on dependent attributes. - ChangeStatus Changed = ChangeStatus::UNCHANGED; + // Now track transitively returned values. + unsigned &NumRetAA = NumReturnedValuesPerKnownAA[CB]; + if (NumRetAA == RetValAA.getNumReturnValues()) { + LLVM_DEBUG(dbgs() << "[AAReturnedValues] Skip call as it has not " + "changed since it was seen last\n"); + continue; + } + NumRetAA = RetValAA.getNumReturnValues(); + + for (auto &RetValAAIt : RetValAA.returned_values()) { + Value *RetVal = RetValAAIt.first; + if (Argument *Arg = dyn_cast<Argument>(RetVal)) { + // Arguments are mapped to call site operands and we begin the traversal + // again. + bool Unused = false; + RVState RVS({NewRVsMap, Unused, RetValAAIt.second}); + VisitReturnedValue(*CB->getArgOperand(Arg->getArgNo()), RVS); + continue; + } else if (isa<CallBase>(RetVal)) { + // Call sites are resolved by the callee attribute over time, no need to + // do anything for us. + continue; + } else if (isa<Constant>(RetVal)) { + // Constants are valid everywhere, we can simply take them. + NewRVsMap[RetVal].insert(It.second.begin(), It.second.end()); + continue; + } + } + } - for (auto &It : AddRVs) { + // To avoid modifications to the ReturnedValues map while we iterate over it + // we kept record of potential new entries in a copy map, NewRVsMap. 
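+ // Merge the recorded entries into ReturnedValues now that the iteration
+ // above is finished.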
+ for (auto &It : NewRVsMap) { assert(!It.second.empty() && "Entry does not add anything."); auto &ReturnInsts = ReturnedValues[It.first]; for (ReturnInst *RI : It.second) - if (ReturnInsts.insert(RI).second) { + if (ReturnInsts.insert(RI)) { LLVM_DEBUG(dbgs() << "[AAReturnedValues] Add new returned value " << *It.first << " => " << *RI << "\n"); - Changed = ChangeStatus::CHANGED; + Changed = true; } } - // If there is no call site in the returned values we are done. - if (!HasCallSite) { - indicateOptimisticFixpoint(); - return ChangeStatus::CHANGED; - } - - return Changed; + Changed |= (NumUnresolvedCalls != UnresolvedCalls.size()); + return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED; } -/// ------------------------ NoSync Function Attribute ------------------------- +struct AAReturnedValuesFunction final : public AAReturnedValuesImpl { + AAReturnedValuesFunction(const IRPosition &IRP) : AAReturnedValuesImpl(IRP) {} -struct AANoSyncFunction : AANoSync, BooleanState { + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(returned) } +}; - AANoSyncFunction(Function &F, InformationCache &InfoCache) - : AANoSync(F, InfoCache) {} +/// Returned values information for a call sites. +struct AAReturnedValuesCallSite final : AAReturnedValuesImpl { + AAReturnedValuesCallSite(const IRPosition &IRP) : AAReturnedValuesImpl(IRP) {} - /// See AbstractAttribute::getState() - /// { - AbstractState &getState() override { return *this; } - const AbstractState &getState() const override { return *this; } - /// } + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites instead of + // redirecting requests to the callee. + llvm_unreachable("Abstract attributes for returned values are not " + "supported for call sites yet!"); + } - /// See AbstractAttribute::getManifestPosition(). - ManifestPosition getManifestPosition() const override { return MP_FUNCTION; } + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + return indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override {} +}; + +/// ------------------------ NoSync Function Attribute ------------------------- + +struct AANoSyncImpl : AANoSync { + AANoSyncImpl(const IRPosition &IRP) : AANoSync(IRP) {} const std::string getAsStr() const override { return getAssumed() ? "nosync" : "may-sync"; @@ -773,12 +1372,6 @@ struct AANoSyncFunction : AANoSync, BooleanState { /// See AbstractAttribute::updateImpl(...). ChangeStatus updateImpl(Attributor &A) override; - /// See AANoSync::isAssumedNoSync() - bool isAssumedNoSync() const override { return getAssumed(); } - - /// See AANoSync::isKnownNoSync() - bool isKnownNoSync() const override { return getKnown(); } - /// Helper function used to determine whether an instruction is non-relaxed /// atomic. 
In other words, if an atomic instruction does not have unordered /// or monotonic ordering @@ -792,7 +1385,7 @@ struct AANoSyncFunction : AANoSync, BooleanState { static bool isNoSyncIntrinsic(Instruction *I); }; -bool AANoSyncFunction::isNonRelaxedAtomic(Instruction *I) { +bool AANoSyncImpl::isNonRelaxedAtomic(Instruction *I) { if (!I->isAtomic()) return false; @@ -841,7 +1434,7 @@ bool AANoSyncFunction::isNonRelaxedAtomic(Instruction *I) { /// Checks if an intrinsic is nosync. Currently only checks mem* intrinsics. /// FIXME: We should ipmrove the handling of intrinsics. -bool AANoSyncFunction::isNoSyncIntrinsic(Instruction *I) { +bool AANoSyncImpl::isNoSyncIntrinsic(Instruction *I) { if (auto *II = dyn_cast<IntrinsicInst>(I)) { switch (II->getIntrinsicID()) { /// Element wise atomic memory intrinsics are can only be unordered, @@ -863,7 +1456,7 @@ bool AANoSyncFunction::isNoSyncIntrinsic(Instruction *I) { return false; } -bool AANoSyncFunction::isVolatile(Instruction *I) { +bool AANoSyncImpl::isVolatile(Instruction *I) { assert(!ImmutableCallSite(I) && !isa<CallBase>(I) && "Calls should not be checked here"); @@ -881,482 +1474,4377 @@ bool AANoSyncFunction::isVolatile(Instruction *I) { } } -ChangeStatus AANoSyncFunction::updateImpl(Attributor &A) { - Function &F = getAnchorScope(); +ChangeStatus AANoSyncImpl::updateImpl(Attributor &A) { - /// We are looking for volatile instructions or Non-Relaxed atomics. - /// FIXME: We should ipmrove the handling of intrinsics. - for (Instruction *I : InfoCache.getReadOrWriteInstsForFunction(F)) { - ImmutableCallSite ICS(I); - auto *NoSyncAA = A.getAAFor<AANoSyncFunction>(*this, *I); + auto CheckRWInstForNoSync = [&](Instruction &I) { + /// We are looking for volatile instructions or Non-Relaxed atomics. + /// FIXME: We should improve the handling of intrinsics. - if (isa<IntrinsicInst>(I) && isNoSyncIntrinsic(I)) - continue; + if (isa<IntrinsicInst>(&I) && isNoSyncIntrinsic(&I)) + return true; - if (ICS && (!NoSyncAA || !NoSyncAA->isAssumedNoSync()) && - !ICS.hasFnAttr(Attribute::NoSync)) { + if (ImmutableCallSite ICS = ImmutableCallSite(&I)) { + if (ICS.hasFnAttr(Attribute::NoSync)) + return true; + + const auto &NoSyncAA = + A.getAAFor<AANoSync>(*this, IRPosition::callsite_function(ICS)); + if (NoSyncAA.isAssumedNoSync()) + return true; + return false; + } + + if (!isVolatile(&I) && !isNonRelaxedAtomic(&I)) + return true; + + return false; + }; + + auto CheckForNoSync = [&](Instruction &I) { + // At this point we handled all read/write effects and they are all + // nosync, so they can be skipped. + if (I.mayReadOrWriteMemory()) + return true; + + // non-convergent and readnone imply nosync. + return !ImmutableCallSite(&I).isConvergent(); + }; + + if (!A.checkForAllReadWriteInstructions(CheckRWInstForNoSync, *this) || + !A.checkForAllCallLikeInstructions(CheckForNoSync, *this)) + return indicatePessimisticFixpoint(); + + return ChangeStatus::UNCHANGED; +} + +struct AANoSyncFunction final : public AANoSyncImpl { + AANoSyncFunction(const IRPosition &IRP) : AANoSyncImpl(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(nosync) } +}; + +/// NoSync attribute deduction for a call sites. +struct AANoSyncCallSite final : AANoSyncImpl { + AANoSyncCallSite(const IRPosition &IRP) : AANoSyncImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). 
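+ /// Without a known callee there is nothing to query, so give up right away.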
+ void initialize(Attributor &A) override { + AANoSyncImpl::initialize(A); + Function *F = getAssociatedFunction(); + if (!F) indicatePessimisticFixpoint(); - return ChangeStatus::CHANGED; + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. + Function *F = getAssociatedFunction(); + const IRPosition &FnPos = IRPosition::function(*F); + auto &FnAA = A.getAAFor<AANoSync>(*this, FnPos); + return clampStateAndIndicateChange( + getState(), static_cast<const AANoSync::StateType &>(FnAA.getState())); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(nosync); } +}; + +/// ------------------------ No-Free Attributes ---------------------------- + +struct AANoFreeImpl : public AANoFree { + AANoFreeImpl(const IRPosition &IRP) : AANoFree(IRP) {} + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + auto CheckForNoFree = [&](Instruction &I) { + ImmutableCallSite ICS(&I); + if (ICS.hasFnAttr(Attribute::NoFree)) + return true; + + const auto &NoFreeAA = + A.getAAFor<AANoFree>(*this, IRPosition::callsite_function(ICS)); + return NoFreeAA.isAssumedNoFree(); + }; + + if (!A.checkForAllCallLikeInstructions(CheckForNoFree, *this)) + return indicatePessimisticFixpoint(); + return ChangeStatus::UNCHANGED; + } + + /// See AbstractAttribute::getAsStr(). + const std::string getAsStr() const override { + return getAssumed() ? "nofree" : "may-free"; + } +}; + +struct AANoFreeFunction final : public AANoFreeImpl { + AANoFreeFunction(const IRPosition &IRP) : AANoFreeImpl(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(nofree) } +}; + +/// NoFree attribute deduction for a call sites. +struct AANoFreeCallSite final : AANoFreeImpl { + AANoFreeCallSite(const IRPosition &IRP) : AANoFreeImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AANoFreeImpl::initialize(A); + Function *F = getAssociatedFunction(); + if (!F) + indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. + Function *F = getAssociatedFunction(); + const IRPosition &FnPos = IRPosition::function(*F); + auto &FnAA = A.getAAFor<AANoFree>(*this, FnPos); + return clampStateAndIndicateChange( + getState(), static_cast<const AANoFree::StateType &>(FnAA.getState())); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(nofree); } +}; + +/// NoFree attribute for floating values. +struct AANoFreeFloating : AANoFreeImpl { + AANoFreeFloating(const IRPosition &IRP) : AANoFreeImpl(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override{STATS_DECLTRACK_FLOATING_ATTR(nofree)} + + /// See Abstract Attribute::updateImpl(...). 
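+ /// The update follows the transitive uses of the value: call site argument
+ /// uses must themselves be nofree, while GEPs, casts, PHIs, and selects are
+ /// looked through.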
+ ChangeStatus updateImpl(Attributor &A) override { + const IRPosition &IRP = getIRPosition(); + + const auto &NoFreeAA = + A.getAAFor<AANoFree>(*this, IRPosition::function_scope(IRP)); + if (NoFreeAA.isAssumedNoFree()) + return ChangeStatus::UNCHANGED; + + Value &AssociatedValue = getIRPosition().getAssociatedValue(); + auto Pred = [&](const Use &U, bool &Follow) -> bool { + Instruction *UserI = cast<Instruction>(U.getUser()); + if (auto *CB = dyn_cast<CallBase>(UserI)) { + if (CB->isBundleOperand(&U)) + return false; + if (!CB->isArgOperand(&U)) + return true; + unsigned ArgNo = CB->getArgOperandNo(&U); + + const auto &NoFreeArg = A.getAAFor<AANoFree>( + *this, IRPosition::callsite_argument(*CB, ArgNo)); + return NoFreeArg.isAssumedNoFree(); + } + + if (isa<GetElementPtrInst>(UserI) || isa<BitCastInst>(UserI) || + isa<PHINode>(UserI) || isa<SelectInst>(UserI)) { + Follow = true; + return true; + } + + // Unknown user. + return false; + }; + if (!A.checkForAllUses(Pred, *this, AssociatedValue)) + return indicatePessimisticFixpoint(); + + return ChangeStatus::UNCHANGED; + } +}; + +/// NoFree attribute for a call site argument. +struct AANoFreeArgument final : AANoFreeFloating { + AANoFreeArgument(const IRPosition &IRP) : AANoFreeFloating(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(nofree) } +}; + +/// NoFree attribute for call site arguments. +struct AANoFreeCallSiteArgument final : AANoFreeFloating { + AANoFreeCallSiteArgument(const IRPosition &IRP) : AANoFreeFloating(IRP) {} + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. + Argument *Arg = getAssociatedArgument(); + if (!Arg) + return indicatePessimisticFixpoint(); + const IRPosition &ArgPos = IRPosition::argument(*Arg); + auto &ArgAA = A.getAAFor<AANoFree>(*this, ArgPos); + return clampStateAndIndicateChange( + getState(), static_cast<const AANoFree::StateType &>(ArgAA.getState())); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override{STATS_DECLTRACK_CSARG_ATTR(nofree)}; +}; + +/// NoFree attribute for function return value. +struct AANoFreeReturned final : AANoFreeFloating { + AANoFreeReturned(const IRPosition &IRP) : AANoFreeFloating(IRP) { + llvm_unreachable("NoFree is not applicable to function returns!"); + } + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + llvm_unreachable("NoFree is not applicable to function returns!"); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + llvm_unreachable("NoFree is not applicable to function returns!"); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override {} +}; + +/// NoFree attribute deduction for a call site return value. 
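+/// nofree cannot be attached to a call site return value in the IR, so
+/// manifest is overridden to do nothing; the deduced state is still available
+/// to other attributes.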
+struct AANoFreeCallSiteReturned final : AANoFreeFloating { + AANoFreeCallSiteReturned(const IRPosition &IRP) : AANoFreeFloating(IRP) {} + + ChangeStatus manifest(Attributor &A) override { + return ChangeStatus::UNCHANGED; + } + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(nofree) } +}; + +/// ------------------------ NonNull Argument Attribute ------------------------ +static int64_t getKnownNonNullAndDerefBytesForUse( + Attributor &A, AbstractAttribute &QueryingAA, Value &AssociatedValue, + const Use *U, const Instruction *I, bool &IsNonNull, bool &TrackUse) { + TrackUse = false; + + const Value *UseV = U->get(); + if (!UseV->getType()->isPointerTy()) + return 0; + + Type *PtrTy = UseV->getType(); + const Function *F = I->getFunction(); + bool NullPointerIsDefined = + F ? llvm::NullPointerIsDefined(F, PtrTy->getPointerAddressSpace()) : true; + const DataLayout &DL = A.getInfoCache().getDL(); + if (ImmutableCallSite ICS = ImmutableCallSite(I)) { + if (ICS.isBundleOperand(U)) + return 0; + + if (ICS.isCallee(U)) { + IsNonNull |= !NullPointerIsDefined; + return 0; } - if (ICS) - continue; + unsigned ArgNo = ICS.getArgumentNo(U); + IRPosition IRP = IRPosition::callsite_argument(ICS, ArgNo); + // As long as we only use known information there is no need to track + // dependences here. + auto &DerefAA = A.getAAFor<AADereferenceable>(QueryingAA, IRP, + /* TrackDependence */ false); + IsNonNull |= DerefAA.isKnownNonNull(); + return DerefAA.getKnownDereferenceableBytes(); + } - if (!isVolatile(I) && !isNonRelaxedAtomic(I)) - continue; + // We need to follow common pointer manipulation uses to the accesses they + // feed into. We can try to be smart to avoid looking through things we do not + // like for now, e.g., non-inbounds GEPs. + if (isa<CastInst>(I)) { + TrackUse = true; + return 0; + } + if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) + if (GEP->hasAllConstantIndices()) { + TrackUse = true; + return 0; + } + + int64_t Offset; + if (const Value *Base = getBasePointerOfAccessPointerOperand(I, Offset, DL)) { + if (Base == &AssociatedValue && + Attributor::getPointerOperand(I, /* AllowVolatile */ false) == UseV) { + int64_t DerefBytes = + (int64_t)DL.getTypeStoreSize(PtrTy->getPointerElementType()) + Offset; + IsNonNull |= !NullPointerIsDefined; + return std::max(int64_t(0), DerefBytes); + } + } + + /// Corner case when an offset is 0. + if (const Value *Base = getBasePointerOfAccessPointerOperand( + I, Offset, DL, /*AllowNonInbounds*/ true)) { + if (Offset == 0 && Base == &AssociatedValue && + Attributor::getPointerOperand(I, /* AllowVolatile */ false) == UseV) { + int64_t DerefBytes = + (int64_t)DL.getTypeStoreSize(PtrTy->getPointerElementType()); + IsNonNull |= !NullPointerIsDefined; + return std::max(int64_t(0), DerefBytes); + } + } + + return 0; +} + +struct AANonNullImpl : AANonNull { + AANonNullImpl(const IRPosition &IRP) + : AANonNull(IRP), + NullIsDefined(NullPointerIsDefined( + getAnchorScope(), + getAssociatedValue().getType()->getPointerAddressSpace())) {} + + /// See AbstractAttribute::initialize(...). 
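+ /// Values already annotated nonnull or dereferenceable are fixed
+ /// optimistically (unless null is a valid pointer in their address space);
+ /// constant null pointers are fixed pessimistically.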
+ void initialize(Attributor &A) override { + if (!NullIsDefined && + hasAttr({Attribute::NonNull, Attribute::Dereferenceable})) + indicateOptimisticFixpoint(); + else if (isa<ConstantPointerNull>(getAssociatedValue())) + indicatePessimisticFixpoint(); + else + AANonNull::initialize(A); + } + + /// See AAFromMustBeExecutedContext + bool followUse(Attributor &A, const Use *U, const Instruction *I) { + bool IsNonNull = false; + bool TrackUse = false; + getKnownNonNullAndDerefBytesForUse(A, *this, getAssociatedValue(), U, I, + IsNonNull, TrackUse); + setKnown(IsNonNull); + return TrackUse; + } + + /// See AbstractAttribute::getAsStr(). + const std::string getAsStr() const override { + return getAssumed() ? "nonnull" : "may-null"; + } + + /// Flag to determine if the underlying value can be null and still allow + /// valid accesses. + const bool NullIsDefined; +}; + +/// NonNull attribute for a floating value. +struct AANonNullFloating + : AAFromMustBeExecutedContext<AANonNull, AANonNullImpl> { + using Base = AAFromMustBeExecutedContext<AANonNull, AANonNullImpl>; + AANonNullFloating(const IRPosition &IRP) : Base(IRP) {} + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + ChangeStatus Change = Base::updateImpl(A); + if (isKnownNonNull()) + return Change; + + if (!NullIsDefined) { + const auto &DerefAA = + A.getAAFor<AADereferenceable>(*this, getIRPosition()); + if (DerefAA.getAssumedDereferenceableBytes()) + return Change; + } + + const DataLayout &DL = A.getDataLayout(); + + DominatorTree *DT = nullptr; + InformationCache &InfoCache = A.getInfoCache(); + if (const Function *Fn = getAnchorScope()) + DT = InfoCache.getAnalysisResultForFunction<DominatorTreeAnalysis>(*Fn); + + auto VisitValueCB = [&](Value &V, AANonNull::StateType &T, + bool Stripped) -> bool { + const auto &AA = A.getAAFor<AANonNull>(*this, IRPosition::value(V)); + if (!Stripped && this == &AA) { + if (!isKnownNonZero(&V, DL, 0, /* TODO: AC */ nullptr, getCtxI(), DT)) + T.indicatePessimisticFixpoint(); + } else { + // Use abstract attribute information. + const AANonNull::StateType &NS = + static_cast<const AANonNull::StateType &>(AA.getState()); + T ^= NS; + } + return T.isValidState(); + }; + + StateType T; + if (!genericValueTraversal<AANonNull, StateType>(A, getIRPosition(), *this, + T, VisitValueCB)) + return indicatePessimisticFixpoint(); + + return clampStateAndIndicateChange(getState(), T); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(nonnull) } +}; + +/// NonNull attribute for function return value. +struct AANonNullReturned final + : AAReturnedFromReturnedValues<AANonNull, AANonNullImpl> { + AANonNullReturned(const IRPosition &IRP) + : AAReturnedFromReturnedValues<AANonNull, AANonNullImpl>(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(nonnull) } +}; + +/// NonNull attribute for function argument. 
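+/// Deduced by clamping the nonnull states of all call site arguments and by
+/// inspecting the must-be-executed context of the argument.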
+struct AANonNullArgument final + : AAArgumentFromCallSiteArgumentsAndMustBeExecutedContext<AANonNull, + AANonNullImpl> { + AANonNullArgument(const IRPosition &IRP) + : AAArgumentFromCallSiteArgumentsAndMustBeExecutedContext<AANonNull, + AANonNullImpl>( + IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(nonnull) } +}; + +struct AANonNullCallSiteArgument final : AANonNullFloating { + AANonNullCallSiteArgument(const IRPosition &IRP) : AANonNullFloating(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(nonnull) } +}; + +/// NonNull attribute for a call site return position. +struct AANonNullCallSiteReturned final + : AACallSiteReturnedFromReturnedAndMustBeExecutedContext<AANonNull, + AANonNullImpl> { + AANonNullCallSiteReturned(const IRPosition &IRP) + : AACallSiteReturnedFromReturnedAndMustBeExecutedContext<AANonNull, + AANonNullImpl>( + IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(nonnull) } +}; + +/// ------------------------ No-Recurse Attributes ---------------------------- + +struct AANoRecurseImpl : public AANoRecurse { + AANoRecurseImpl(const IRPosition &IRP) : AANoRecurse(IRP) {} + + /// See AbstractAttribute::getAsStr() + const std::string getAsStr() const override { + return getAssumed() ? "norecurse" : "may-recurse"; + } +}; + +struct AANoRecurseFunction final : AANoRecurseImpl { + AANoRecurseFunction(const IRPosition &IRP) : AANoRecurseImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AANoRecurseImpl::initialize(A); + if (const Function *F = getAnchorScope()) + if (A.getInfoCache().getSccSize(*F) == 1) + return; indicatePessimisticFixpoint(); - return ChangeStatus::CHANGED; } - auto &OpcodeInstMap = InfoCache.getOpcodeInstMapForFunction(F); - auto Opcodes = {(unsigned)Instruction::Invoke, (unsigned)Instruction::CallBr, - (unsigned)Instruction::Call}; + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { - for (unsigned Opcode : Opcodes) { - for (Instruction *I : OpcodeInstMap[Opcode]) { - // At this point we handled all read/write effects and they are all - // nosync, so they can be skipped. - if (I->mayReadOrWriteMemory()) - continue; + auto CheckForNoRecurse = [&](Instruction &I) { + ImmutableCallSite ICS(&I); + if (ICS.hasFnAttr(Attribute::NoRecurse)) + return true; - ImmutableCallSite ICS(I); + const auto &NoRecurseAA = + A.getAAFor<AANoRecurse>(*this, IRPosition::callsite_function(ICS)); + if (!NoRecurseAA.isAssumedNoRecurse()) + return false; - // non-convergent and readnone imply nosync. - if (!ICS.isConvergent()) - continue; + // Recursion to the same function + if (ICS.getCalledFunction() == getAnchorScope()) + return false; + + return true; + }; + if (!A.checkForAllCallLikeInstructions(CheckForNoRecurse, *this)) + return indicatePessimisticFixpoint(); + return ChangeStatus::UNCHANGED; + } + + void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(norecurse) } +}; + +/// NoRecurse attribute deduction for a call sites. +struct AANoRecurseCallSite final : AANoRecurseImpl { + AANoRecurseCallSite(const IRPosition &IRP) : AANoRecurseImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). 
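+ /// As for the other call site attributes, bail if the callee is unknown.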
+ void initialize(Attributor &A) override { + AANoRecurseImpl::initialize(A); + Function *F = getAssociatedFunction(); + if (!F) indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. + Function *F = getAssociatedFunction(); + const IRPosition &FnPos = IRPosition::function(*F); + auto &FnAA = A.getAAFor<AANoRecurse>(*this, FnPos); + return clampStateAndIndicateChange( + getState(), + static_cast<const AANoRecurse::StateType &>(FnAA.getState())); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(norecurse); } +}; + +/// -------------------- Undefined-Behavior Attributes ------------------------ + +struct AAUndefinedBehaviorImpl : public AAUndefinedBehavior { + AAUndefinedBehaviorImpl(const IRPosition &IRP) : AAUndefinedBehavior(IRP) {} + + /// See AbstractAttribute::updateImpl(...). + // through a pointer (i.e. also branches etc.) + ChangeStatus updateImpl(Attributor &A) override { + const size_t UBPrevSize = KnownUBInsts.size(); + const size_t NoUBPrevSize = AssumedNoUBInsts.size(); + + auto InspectMemAccessInstForUB = [&](Instruction &I) { + // Skip instructions that are already saved. + if (AssumedNoUBInsts.count(&I) || KnownUBInsts.count(&I)) + return true; + + // If we reach here, we know we have an instruction + // that accesses memory through a pointer operand, + // for which getPointerOperand() should give it to us. + const Value *PtrOp = + Attributor::getPointerOperand(&I, /* AllowVolatile */ true); + assert(PtrOp && + "Expected pointer operand of memory accessing instruction"); + + // A memory access through a pointer is considered UB + // only if the pointer has constant null value. + // TODO: Expand it to not only check constant values. + if (!isa<ConstantPointerNull>(PtrOp)) { + AssumedNoUBInsts.insert(&I); + return true; + } + const Type *PtrTy = PtrOp->getType(); + + // Because we only consider instructions inside functions, + // assume that a parent function exists. + const Function *F = I.getFunction(); + + // A memory access using constant null pointer is only considered UB + // if null pointer is _not_ defined for the target platform. + if (llvm::NullPointerIsDefined(F, PtrTy->getPointerAddressSpace())) + AssumedNoUBInsts.insert(&I); + else + KnownUBInsts.insert(&I); + return true; + }; + + auto InspectBrInstForUB = [&](Instruction &I) { + // A conditional branch instruction is considered UB if it has `undef` + // condition. + + // Skip instructions that are already saved. + if (AssumedNoUBInsts.count(&I) || KnownUBInsts.count(&I)) + return true; + + // We know we have a branch instruction. + auto BrInst = cast<BranchInst>(&I); + + // Unconditional branches are never considered UB. + if (BrInst->isUnconditional()) + return true; + + // Either we stopped and the appropriate action was taken, + // or we got back a simplified value to continue. 
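+ // Undef conditions are recorded as known UB inside the helper; when a
+ // simplified condition is returned, the branch is remembered as UB-free.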
+ Optional<Value *> SimplifiedCond = + stopOnUndefOrAssumed(A, BrInst->getCondition(), BrInst); + if (!SimplifiedCond.hasValue()) + return true; + AssumedNoUBInsts.insert(&I); + return true; + }; + + A.checkForAllInstructions(InspectMemAccessInstForUB, *this, + {Instruction::Load, Instruction::Store, + Instruction::AtomicCmpXchg, + Instruction::AtomicRMW}); + A.checkForAllInstructions(InspectBrInstForUB, *this, {Instruction::Br}); + if (NoUBPrevSize != AssumedNoUBInsts.size() || + UBPrevSize != KnownUBInsts.size()) return ChangeStatus::CHANGED; + return ChangeStatus::UNCHANGED; + } + + bool isKnownToCauseUB(Instruction *I) const override { + return KnownUBInsts.count(I); + } + + bool isAssumedToCauseUB(Instruction *I) const override { + // In simple words, if an instruction is not in the assumed to _not_ + // cause UB, then it is assumed UB (that includes those + // in the KnownUBInsts set). The rest is boilerplate + // is to ensure that it is one of the instructions we test + // for UB. + + switch (I->getOpcode()) { + case Instruction::Load: + case Instruction::Store: + case Instruction::AtomicCmpXchg: + case Instruction::AtomicRMW: + return !AssumedNoUBInsts.count(I); + case Instruction::Br: { + auto BrInst = cast<BranchInst>(I); + if (BrInst->isUnconditional()) + return false; + return !AssumedNoUBInsts.count(I); + } break; + default: + return false; } + return false; } - return ChangeStatus::UNCHANGED; + ChangeStatus manifest(Attributor &A) override { + if (KnownUBInsts.empty()) + return ChangeStatus::UNCHANGED; + for (Instruction *I : KnownUBInsts) + A.changeToUnreachableAfterManifest(I); + return ChangeStatus::CHANGED; + } + + /// See AbstractAttribute::getAsStr() + const std::string getAsStr() const override { + return getAssumed() ? "undefined-behavior" : "no-ub"; + } + + /// Note: The correctness of this analysis depends on the fact that the + /// following 2 sets will stop changing after some point. + /// "Change" here means that their size changes. + /// The size of each set is monotonically increasing + /// (we only add items to them) and it is upper bounded by the number of + /// instructions in the processed function (we can never save more + /// elements in either set than this number). Hence, at some point, + /// they will stop increasing. + /// Consequently, at some point, both sets will have stopped + /// changing, effectively making the analysis reach a fixpoint. + + /// Note: These 2 sets are disjoint and an instruction can be considered + /// one of 3 things: + /// 1) Known to cause UB (AAUndefinedBehavior could prove it) and put it in + /// the KnownUBInsts set. + /// 2) Assumed to cause UB (in every updateImpl, AAUndefinedBehavior + /// has a reason to assume it). + /// 3) Assumed to not cause UB. very other instruction - AAUndefinedBehavior + /// could not find a reason to assume or prove that it can cause UB, + /// hence it assumes it doesn't. We have a set for these instructions + /// so that we don't reprocess them in every update. + /// Note however that instructions in this set may cause UB. + +protected: + /// A set of all live instructions _known_ to cause UB. + SmallPtrSet<Instruction *, 8> KnownUBInsts; + +private: + /// A set of all the (live) instructions that are assumed to _not_ cause UB. + SmallPtrSet<Instruction *, 8> AssumedNoUBInsts; + + // Should be called on updates in which if we're processing an instruction + // \p I that depends on a value \p V, one of the following has to happen: + // - If the value is assumed, then stop. 
+ // - If the value is known but undef, then consider it UB. + // - Otherwise, do specific processing with the simplified value. + // We return None in the first 2 cases to signify that an appropriate + // action was taken and the caller should stop. + // Otherwise, we return the simplified value that the caller should + // use for specific processing. + Optional<Value *> stopOnUndefOrAssumed(Attributor &A, const Value *V, + Instruction *I) { + const auto &ValueSimplifyAA = + A.getAAFor<AAValueSimplify>(*this, IRPosition::value(*V)); + Optional<Value *> SimplifiedV = + ValueSimplifyAA.getAssumedSimplifiedValue(A); + if (!ValueSimplifyAA.isKnown()) { + // Don't depend on assumed values. + return llvm::None; + } + if (!SimplifiedV.hasValue()) { + // If it is known (which we tested above) but it doesn't have a value, + // then we can assume `undef` and hence the instruction is UB. + KnownUBInsts.insert(I); + return llvm::None; + } + Value *Val = SimplifiedV.getValue(); + if (isa<UndefValue>(Val)) { + KnownUBInsts.insert(I); + return llvm::None; + } + return Val; + } +}; + +struct AAUndefinedBehaviorFunction final : AAUndefinedBehaviorImpl { + AAUndefinedBehaviorFunction(const IRPosition &IRP) + : AAUndefinedBehaviorImpl(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECL(UndefinedBehaviorInstruction, Instruction, + "Number of instructions known to have UB"); + BUILD_STAT_NAME(UndefinedBehaviorInstruction, Instruction) += + KnownUBInsts.size(); + } +}; + +/// ------------------------ Will-Return Attributes ---------------------------- + +// Helper function that checks whether a function has any cycle. +// TODO: Replace with more efficent code +static bool containsCycle(Function &F) { + SmallPtrSet<BasicBlock *, 32> Visited; + + // Traverse BB by dfs and check whether successor is already visited. + for (BasicBlock *BB : depth_first(&F)) { + Visited.insert(BB); + for (auto *SuccBB : successors(BB)) { + if (Visited.count(SuccBB)) + return true; + } + } + return false; } -/// ------------------------ No-Free Attributes ---------------------------- +// Helper function that checks the function have a loop which might become an +// endless loop +// FIXME: Any cycle is regarded as endless loop for now. +// We have to allow some patterns. +static bool containsPossiblyEndlessLoop(Function *F) { + return !F || !F->hasExactDefinition() || containsCycle(*F); +} -struct AANoFreeFunction : AbstractAttribute, BooleanState { +struct AAWillReturnImpl : public AAWillReturn { + AAWillReturnImpl(const IRPosition &IRP) : AAWillReturn(IRP) {} - /// See AbstractAttribute::AbstractAttribute(...). - AANoFreeFunction(Function &F, InformationCache &InfoCache) - : AbstractAttribute(F, InfoCache) {} + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AAWillReturn::initialize(A); - /// See AbstractAttribute::getState() - ///{ - AbstractState &getState() override { return *this; } - const AbstractState &getState() const override { return *this; } - ///} + Function *F = getAssociatedFunction(); + if (containsPossiblyEndlessLoop(F)) + indicatePessimisticFixpoint(); + } - /// See AbstractAttribute::getManifestPosition(). - ManifestPosition getManifestPosition() const override { return MP_FUNCTION; } + /// See AbstractAttribute::updateImpl(...). 
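+ /// Every call-like instruction has to be assumed willreturn, and unless that
+ /// is already known it additionally has to be assumed norecurse.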
+ ChangeStatus updateImpl(Attributor &A) override { + auto CheckForWillReturn = [&](Instruction &I) { + IRPosition IPos = IRPosition::callsite_function(ImmutableCallSite(&I)); + const auto &WillReturnAA = A.getAAFor<AAWillReturn>(*this, IPos); + if (WillReturnAA.isKnownWillReturn()) + return true; + if (!WillReturnAA.isAssumedWillReturn()) + return false; + const auto &NoRecurseAA = A.getAAFor<AANoRecurse>(*this, IPos); + return NoRecurseAA.isAssumedNoRecurse(); + }; - /// See AbstractAttribute::getAsStr(). + if (!A.checkForAllCallLikeInstructions(CheckForWillReturn, *this)) + return indicatePessimisticFixpoint(); + + return ChangeStatus::UNCHANGED; + } + + /// See AbstractAttribute::getAsStr() const std::string getAsStr() const override { - return getAssumed() ? "nofree" : "may-free"; + return getAssumed() ? "willreturn" : "may-noreturn"; + } +}; + +struct AAWillReturnFunction final : AAWillReturnImpl { + AAWillReturnFunction(const IRPosition &IRP) : AAWillReturnImpl(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(willreturn) } +}; + +/// WillReturn attribute deduction for a call sites. +struct AAWillReturnCallSite final : AAWillReturnImpl { + AAWillReturnCallSite(const IRPosition &IRP) : AAWillReturnImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AAWillReturnImpl::initialize(A); + Function *F = getAssociatedFunction(); + if (!F) + indicatePessimisticFixpoint(); } /// See AbstractAttribute::updateImpl(...). - ChangeStatus updateImpl(Attributor &A) override; + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. + Function *F = getAssociatedFunction(); + const IRPosition &FnPos = IRPosition::function(*F); + auto &FnAA = A.getAAFor<AAWillReturn>(*this, FnPos); + return clampStateAndIndicateChange( + getState(), + static_cast<const AAWillReturn::StateType &>(FnAA.getState())); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(willreturn); } +}; - /// See AbstractAttribute::getAttrKind(). - Attribute::AttrKind getAttrKind() const override { return ID; } +/// -------------------AAReachability Attribute-------------------------- - /// Return true if "nofree" is assumed. - bool isAssumedNoFree() const { return getAssumed(); } +struct AAReachabilityImpl : AAReachability { + AAReachabilityImpl(const IRPosition &IRP) : AAReachability(IRP) {} - /// Return true if "nofree" is known. - bool isKnownNoFree() const { return getKnown(); } + const std::string getAsStr() const override { + // TODO: Return the number of reachable queries. + return "reachable"; + } + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { indicatePessimisticFixpoint(); } - /// The identifier used by the Attributor for this class of attributes. - static constexpr Attribute::AttrKind ID = Attribute::NoFree; + /// See AbstractAttribute::updateImpl(...). 
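+ /// Reachability is not computed yet; both initialize and update fall back to
+ /// the pessimistic fixpoint.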
+ ChangeStatus updateImpl(Attributor &A) override { + return indicatePessimisticFixpoint(); + } }; -ChangeStatus AANoFreeFunction::updateImpl(Attributor &A) { - Function &F = getAnchorScope(); +struct AAReachabilityFunction final : public AAReachabilityImpl { + AAReachabilityFunction(const IRPosition &IRP) : AAReachabilityImpl(IRP) {} - // The map from instruction opcodes to those instructions in the function. - auto &OpcodeInstMap = InfoCache.getOpcodeInstMapForFunction(F); + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(reachable); } +}; - for (unsigned Opcode : - {(unsigned)Instruction::Invoke, (unsigned)Instruction::CallBr, - (unsigned)Instruction::Call}) { - for (Instruction *I : OpcodeInstMap[Opcode]) { +/// ------------------------ NoAlias Argument Attribute ------------------------ + +struct AANoAliasImpl : AANoAlias { + AANoAliasImpl(const IRPosition &IRP) : AANoAlias(IRP) {} + + const std::string getAsStr() const override { + return getAssumed() ? "noalias" : "may-alias"; + } +}; + +/// NoAlias attribute for a floating value. +struct AANoAliasFloating final : AANoAliasImpl { + AANoAliasFloating(const IRPosition &IRP) : AANoAliasImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AANoAliasImpl::initialize(A); + Value &Val = getAssociatedValue(); + if (isa<AllocaInst>(Val)) + indicateOptimisticFixpoint(); + if (isa<ConstantPointerNull>(Val) && + Val.getType()->getPointerAddressSpace() == 0) + indicateOptimisticFixpoint(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Implement this. + return indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_FLOATING_ATTR(noalias) + } +}; + +/// NoAlias attribute for an argument. +struct AANoAliasArgument final + : AAArgumentFromCallSiteArguments<AANoAlias, AANoAliasImpl> { + using Base = AAArgumentFromCallSiteArguments<AANoAlias, AANoAliasImpl>; + AANoAliasArgument(const IRPosition &IRP) : Base(IRP) {} + + /// See AbstractAttribute::update(...). + ChangeStatus updateImpl(Attributor &A) override { + // We have to make sure no-alias on the argument does not break + // synchronization when this is a callback argument, see also [1] below. + // If synchronization cannot be affected, we delegate to the base updateImpl + // function, otherwise we give up for now. + + // If the function is no-sync, no-alias cannot break synchronization. + const auto &NoSyncAA = A.getAAFor<AANoSync>( + *this, IRPosition::function_scope(getIRPosition())); + if (NoSyncAA.isAssumedNoSync()) + return Base::updateImpl(A); + + // If the argument is read-only, no-alias cannot break synchronization. + const auto &MemBehaviorAA = + A.getAAFor<AAMemoryBehavior>(*this, getIRPosition()); + if (MemBehaviorAA.isAssumedReadOnly()) + return Base::updateImpl(A); + + // If the argument is never passed through callbacks, no-alias cannot break + // synchronization. + if (A.checkForAllCallSites( + [](AbstractCallSite ACS) { return !ACS.isCallbackCall(); }, *this, + true)) + return Base::updateImpl(A); + + // TODO: add no-alias but make sure it doesn't break synchronization by + // introducing fake uses. See: + // [1] Compiler Optimizations for OpenMP, J. Doerfert and H. 
Finkel, + // International Workshop on OpenMP 2018, + // http://compilers.cs.uni-saarland.de/people/doerfert/par_opt18.pdf + + return indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(noalias) } +}; + +struct AANoAliasCallSiteArgument final : AANoAliasImpl { + AANoAliasCallSiteArgument(const IRPosition &IRP) : AANoAliasImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + // See callsite argument attribute and callee argument attribute. + ImmutableCallSite ICS(&getAnchorValue()); + if (ICS.paramHasAttr(getArgNo(), Attribute::NoAlias)) + indicateOptimisticFixpoint(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // We can deduce "noalias" if the following conditions hold. + // (i) Associated value is assumed to be noalias in the definition. + // (ii) Associated value is assumed to be no-capture in all the uses + // possibly executed before this callsite. + // (iii) There is no other pointer argument which could alias with the + // value. + + const Value &V = getAssociatedValue(); + const IRPosition IRP = IRPosition::value(V); + + // (i) Check whether noalias holds in the definition. + + auto &NoAliasAA = A.getAAFor<AANoAlias>(*this, IRP); + LLVM_DEBUG(dbgs() << "[Attributor][AANoAliasCSArg] check definition: " << V + << " :: " << NoAliasAA << "\n"); + + if (!NoAliasAA.isAssumedNoAlias()) + return indicatePessimisticFixpoint(); + + LLVM_DEBUG(dbgs() << "[Attributor][AANoAliasCSArg] " << V + << " is assumed NoAlias in the definition\n"); + + // (ii) Check whether the value is captured in the scope using AANoCapture. + // FIXME: This is conservative though, it is better to look at CFG and + // check only uses possibly executed before this callsite. + + auto &NoCaptureAA = A.getAAFor<AANoCapture>(*this, IRP); + if (!NoCaptureAA.isAssumedNoCaptureMaybeReturned()) { + LLVM_DEBUG( + dbgs() << "[Attributor][AANoAliasCSArg] " << V + << " cannot be noalias as it is potentially captured\n"); + return indicatePessimisticFixpoint(); + } + + // (iii) Check there is no other pointer argument which could alias with the + // value. + // TODO: AbstractCallSite + ImmutableCallSite ICS(&getAnchorValue()); + for (unsigned i = 0; i < ICS.getNumArgOperands(); i++) { + if (getArgNo() == (int)i) + continue; + const Value *ArgOp = ICS.getArgOperand(i); + if (!ArgOp->getType()->isPointerTy()) + continue; + + if (const Function *F = getAnchorScope()) { + if (AAResults *AAR = A.getInfoCache().getAAResultsForFunction(*F)) { + bool IsAliasing = !AAR->isNoAlias(&getAssociatedValue(), ArgOp); + LLVM_DEBUG(dbgs() + << "[Attributor][NoAliasCSArg] Check alias between " + "callsite arguments " + << AAR->isNoAlias(&getAssociatedValue(), ArgOp) << " " + << getAssociatedValue() << " " << *ArgOp << " => " + << (IsAliasing ? "" : "no-") << "alias \n"); + + if (!IsAliasing) + continue; + } + } + return indicatePessimisticFixpoint(); + } + + return ChangeStatus::UNCHANGED; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(noalias) } +}; + +/// NoAlias attribute for function return value. +struct AANoAliasReturned final : AANoAliasImpl { + AANoAliasReturned(const IRPosition &IRP) : AANoAliasImpl(IRP) {} + + /// See AbstractAttribute::updateImpl(...). 
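Check (iii) above scans the other call operands and gives up as soon as any other pointer argument may alias the value; only a proven no-alias answer lets the loop continue. A standalone sketch of that loop shape, with a caller-supplied may-alias oracle standing in for AAResults; the types and names are illustrative only.

#include <cstddef>
#include <functional>
#include <vector>

struct ArgInfo {
  const void *Ptr;   // the argument value (null for non-pointer operands)
};

// Returns true if no *other* pointer argument of the call may alias the
// argument at index ArgNo, according to MayAlias. Any possible alias is
// a reason to give up, mirroring the structure of check (iii).
static bool noOtherArgAliases(
    const std::vector<ArgInfo> &Args, size_t ArgNo,
    const std::function<bool(const void *, const void *)> &MayAlias) {
  const void *V = Args[ArgNo].Ptr;
  for (size_t I = 0, E = Args.size(); I != E; ++I) {
    if (I == ArgNo)
      continue;                  // skip the argument itself
    const void *Other = Args[I].Ptr;
    if (!Other)
      continue;                  // non-pointer operands cannot alias
    if (MayAlias(V, Other))
      return false;              // pessimistic fixpoint in the pass
  }
  return true;
}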
+ virtual ChangeStatus updateImpl(Attributor &A) override { - auto ICS = ImmutableCallSite(I); - auto *NoFreeAA = A.getAAFor<AANoFreeFunction>(*this, *I); + auto CheckReturnValue = [&](Value &RV) -> bool { + if (Constant *C = dyn_cast<Constant>(&RV)) + if (C->isNullValue() || isa<UndefValue>(C)) + return true; - if ((!NoFreeAA || !NoFreeAA->isAssumedNoFree()) && - !ICS.hasFnAttr(Attribute::NoFree)) { + /// For now, we can only deduce noalias if we have call sites. + /// FIXME: add more support. + ImmutableCallSite ICS(&RV); + if (!ICS) + return false; + + const IRPosition &RVPos = IRPosition::value(RV); + const auto &NoAliasAA = A.getAAFor<AANoAlias>(*this, RVPos); + if (!NoAliasAA.isAssumedNoAlias()) + return false; + + const auto &NoCaptureAA = A.getAAFor<AANoCapture>(*this, RVPos); + return NoCaptureAA.isAssumedNoCaptureMaybeReturned(); + }; + + if (!A.checkForAllReturnedValues(CheckReturnValue, *this)) + return indicatePessimisticFixpoint(); + + return ChangeStatus::UNCHANGED; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(noalias) } +}; + +/// NoAlias attribute deduction for a call site return value. +struct AANoAliasCallSiteReturned final : AANoAliasImpl { + AANoAliasCallSiteReturned(const IRPosition &IRP) : AANoAliasImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AANoAliasImpl::initialize(A); + Function *F = getAssociatedFunction(); + if (!F) + indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. + Function *F = getAssociatedFunction(); + const IRPosition &FnPos = IRPosition::returned(*F); + auto &FnAA = A.getAAFor<AANoAlias>(*this, FnPos); + return clampStateAndIndicateChange( + getState(), static_cast<const AANoAlias::StateType &>(FnAA.getState())); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(noalias); } +}; + +/// -------------------AAIsDead Function Attribute----------------------- + +struct AAIsDeadValueImpl : public AAIsDead { + AAIsDeadValueImpl(const IRPosition &IRP) : AAIsDead(IRP) {} + + /// See AAIsDead::isAssumedDead(). + bool isAssumedDead() const override { return getAssumed(); } + + /// See AAIsDead::isAssumedDead(BasicBlock *). + bool isAssumedDead(const BasicBlock *BB) const override { return false; } + + /// See AAIsDead::isKnownDead(BasicBlock *). + bool isKnownDead(const BasicBlock *BB) const override { return false; } + + /// See AAIsDead::isAssumedDead(Instruction *I). + bool isAssumedDead(const Instruction *I) const override { + return I == getCtxI() && isAssumedDead(); + } + + /// See AAIsDead::isKnownDead(Instruction *I). + bool isKnownDead(const Instruction *I) const override { + return I == getCtxI() && getKnown(); + } + + /// See AbstractAttribute::getAsStr(). + const std::string getAsStr() const override { + return isAssumedDead() ? "assumed-dead" : "assumed-live"; + } +}; + +struct AAIsDeadFloating : public AAIsDeadValueImpl { + AAIsDeadFloating(const IRPosition &IRP) : AAIsDeadValueImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). 
+ void initialize(Attributor &A) override { + if (Instruction *I = dyn_cast<Instruction>(&getAssociatedValue())) + if (!wouldInstructionBeTriviallyDead(I)) indicatePessimisticFixpoint(); + if (isa<UndefValue>(getAssociatedValue())) + indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + auto UsePred = [&](const Use &U, bool &Follow) { + Instruction *UserI = cast<Instruction>(U.getUser()); + if (CallSite CS = CallSite(UserI)) { + if (!CS.isArgOperand(&U)) + return false; + const IRPosition &CSArgPos = + IRPosition::callsite_argument(CS, CS.getArgumentNo(&U)); + const auto &CSArgIsDead = A.getAAFor<AAIsDead>(*this, CSArgPos); + return CSArgIsDead.isAssumedDead(); + } + if (ReturnInst *RI = dyn_cast<ReturnInst>(UserI)) { + const IRPosition &RetPos = IRPosition::returned(*RI->getFunction()); + const auto &RetIsDeadAA = A.getAAFor<AAIsDead>(*this, RetPos); + return RetIsDeadAA.isAssumedDead(); + } + Follow = true; + return wouldInstructionBeTriviallyDead(UserI); + }; + + if (!A.checkForAllUses(UsePred, *this, getAssociatedValue())) + return indicatePessimisticFixpoint(); + return ChangeStatus::UNCHANGED; + } + + /// See AbstractAttribute::manifest(...). + ChangeStatus manifest(Attributor &A) override { + Value &V = getAssociatedValue(); + if (auto *I = dyn_cast<Instruction>(&V)) + if (wouldInstructionBeTriviallyDead(I)) { + A.deleteAfterManifest(*I); + return ChangeStatus::CHANGED; + } + + if (V.use_empty()) + return ChangeStatus::UNCHANGED; + + UndefValue &UV = *UndefValue::get(V.getType()); + bool AnyChange = A.changeValueAfterManifest(V, UV); + return AnyChange ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_FLOATING_ATTR(IsDead) + } +}; + +struct AAIsDeadArgument : public AAIsDeadFloating { + AAIsDeadArgument(const IRPosition &IRP) : AAIsDeadFloating(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + if (!getAssociatedFunction()->hasExactDefinition()) + indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::manifest(...). + ChangeStatus manifest(Attributor &A) override { + ChangeStatus Changed = AAIsDeadFloating::manifest(A); + Argument &Arg = *getAssociatedArgument(); + if (Arg.getParent()->hasLocalLinkage()) + if (A.registerFunctionSignatureRewrite( + Arg, /* ReplacementTypes */ {}, + Attributor::ArgumentReplacementInfo::CalleeRepairCBTy{}, + Attributor::ArgumentReplacementInfo::ACSRepairCBTy{})) return ChangeStatus::CHANGED; + return Changed; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(IsDead) } +}; + +struct AAIsDeadCallSiteArgument : public AAIsDeadValueImpl { + AAIsDeadCallSiteArgument(const IRPosition &IRP) : AAIsDeadValueImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + if (isa<UndefValue>(getAssociatedValue())) + indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. 
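The use predicate above classifies every user of the value: a call operand is fine if the matching call-site argument is assumed dead, a return is fine if the function's return value is assumed dead, and any other user must itself be trivially dead, in which case its own uses are followed as well. A toy sketch of that recursion over an index-based use graph; the types are made up for illustration.

#include <vector>

enum class UserKind { CallArg, Return, Other };

struct ToyUser {
  UserKind Kind;
  bool TargetAssumedDead;        // dead call-site argument / dead return value?
  bool TriviallyDead;            // would this user itself be trivially dead?
  std::vector<int> UsersOfUser;  // indices of the users of this user
};

// Returns true if user `Idx` (and, for the Other case, everything it
// feeds) still allows the "value is dead" assumption.
static bool useKeepsDeadAssumption(const std::vector<ToyUser> &Users, int Idx) {
  const ToyUser &U = Users[Idx];
  switch (U.Kind) {
  case UserKind::CallArg:   // dead iff the callee argument is assumed dead
  case UserKind::Return:    // dead iff the function's return value is dead
    return U.TargetAssumedDead;
  case UserKind::Other:
    // Follow = true in the pass: the user must be trivially dead and all
    // of its own users must allow the assumption as well.
    if (!U.TriviallyDead)
      return false;
    for (int Next : U.UsersOfUser)
      if (!useKeepsDeadAssumption(Users, Next))
        return false;
    return true;
  }
  return false;
}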
+ Argument *Arg = getAssociatedArgument(); + if (!Arg) + return indicatePessimisticFixpoint(); + const IRPosition &ArgPos = IRPosition::argument(*Arg); + auto &ArgAA = A.getAAFor<AAIsDead>(*this, ArgPos); + return clampStateAndIndicateChange( + getState(), static_cast<const AAIsDead::StateType &>(ArgAA.getState())); + } + + /// See AbstractAttribute::manifest(...). + ChangeStatus manifest(Attributor &A) override { + CallBase &CB = cast<CallBase>(getAnchorValue()); + Use &U = CB.getArgOperandUse(getArgNo()); + assert(!isa<UndefValue>(U.get()) && + "Expected undef values to be filtered out!"); + UndefValue &UV = *UndefValue::get(U->getType()); + if (A.changeUseAfterManifest(U, UV)) + return ChangeStatus::CHANGED; + return ChangeStatus::UNCHANGED; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(IsDead) } +}; + +struct AAIsDeadReturned : public AAIsDeadValueImpl { + AAIsDeadReturned(const IRPosition &IRP) : AAIsDeadValueImpl(IRP) {} + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + + auto PredForCallSite = [&](AbstractCallSite ACS) { + if (ACS.isCallbackCall()) + return false; + const IRPosition &CSRetPos = + IRPosition::callsite_returned(ACS.getCallSite()); + const auto &RetIsDeadAA = A.getAAFor<AAIsDead>(*this, CSRetPos); + return RetIsDeadAA.isAssumedDead(); + }; + + if (!A.checkForAllCallSites(PredForCallSite, *this, true)) + return indicatePessimisticFixpoint(); + + return ChangeStatus::UNCHANGED; + } + + /// See AbstractAttribute::manifest(...). + ChangeStatus manifest(Attributor &A) override { + // TODO: Rewrite the signature to return void? + bool AnyChange = false; + UndefValue &UV = *UndefValue::get(getAssociatedFunction()->getReturnType()); + auto RetInstPred = [&](Instruction &I) { + ReturnInst &RI = cast<ReturnInst>(I); + if (!isa<UndefValue>(RI.getReturnValue())) + AnyChange |= A.changeUseAfterManifest(RI.getOperandUse(0), UV); + return true; + }; + A.checkForAllInstructions(RetInstPred, *this, {Instruction::Ret}); + return AnyChange ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(IsDead) } +}; + +struct AAIsDeadCallSiteReturned : public AAIsDeadFloating { + AAIsDeadCallSiteReturned(const IRPosition &IRP) : AAIsDeadFloating(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(IsDead) } +}; + +struct AAIsDeadFunction : public AAIsDead { + AAIsDeadFunction(const IRPosition &IRP) : AAIsDead(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + const Function *F = getAssociatedFunction(); + if (F && !F->isDeclaration()) { + ToBeExploredFrom.insert(&F->getEntryBlock().front()); + assumeLive(A, F->getEntryBlock()); + } + } + + /// See AbstractAttribute::getAsStr(). + const std::string getAsStr() const override { + return "Live[#BB " + std::to_string(AssumedLiveBlocks.size()) + "/" + + std::to_string(getAssociatedFunction()->size()) + "][#TBEP " + + std::to_string(ToBeExploredFrom.size()) + "][#KDE " + + std::to_string(KnownDeadEnds.size()) + "]"; + } + + /// See AbstractAttribute::manifest(...). 
+ ChangeStatus manifest(Attributor &A) override { + assert(getState().isValidState() && + "Attempted to manifest an invalid state!"); + + ChangeStatus HasChanged = ChangeStatus::UNCHANGED; + Function &F = *getAssociatedFunction(); + + if (AssumedLiveBlocks.empty()) { + A.deleteAfterManifest(F); + return ChangeStatus::CHANGED; + } + + // Flag to determine if we can change an invoke to a call assuming the + // callee is nounwind. This is not possible if the personality of the + // function allows to catch asynchronous exceptions. + bool Invoke2CallAllowed = !mayCatchAsynchronousExceptions(F); + + KnownDeadEnds.set_union(ToBeExploredFrom); + for (const Instruction *DeadEndI : KnownDeadEnds) { + auto *CB = dyn_cast<CallBase>(DeadEndI); + if (!CB) + continue; + const auto &NoReturnAA = + A.getAAFor<AANoReturn>(*this, IRPosition::callsite_function(*CB)); + bool MayReturn = !NoReturnAA.isAssumedNoReturn(); + if (MayReturn && (!Invoke2CallAllowed || !isa<InvokeInst>(CB))) + continue; + + if (auto *II = dyn_cast<InvokeInst>(DeadEndI)) + A.registerInvokeWithDeadSuccessor(const_cast<InvokeInst &>(*II)); + else + A.changeToUnreachableAfterManifest( + const_cast<Instruction *>(DeadEndI->getNextNode())); + HasChanged = ChangeStatus::CHANGED; + } + + for (BasicBlock &BB : F) + if (!AssumedLiveBlocks.count(&BB)) + A.deleteAfterManifest(BB); + + return HasChanged; + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override; + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override {} + + /// Returns true if the function is assumed dead. + bool isAssumedDead() const override { return false; } + + /// See AAIsDead::isAssumedDead(BasicBlock *). + bool isAssumedDead(const BasicBlock *BB) const override { + assert(BB->getParent() == getAssociatedFunction() && + "BB must be in the same anchor scope function."); + + if (!getAssumed()) + return false; + return !AssumedLiveBlocks.count(BB); + } + + /// See AAIsDead::isKnownDead(BasicBlock *). + bool isKnownDead(const BasicBlock *BB) const override { + return getKnown() && isAssumedDead(BB); + } + + /// See AAIsDead::isAssumed(Instruction *I). + bool isAssumedDead(const Instruction *I) const override { + assert(I->getParent()->getParent() == getAssociatedFunction() && + "Instruction must be in the same anchor scope function."); + + if (!getAssumed()) + return false; + + // If it is not in AssumedLiveBlocks then it for sure dead. + // Otherwise, it can still be after noreturn call in a live block. + if (!AssumedLiveBlocks.count(I->getParent())) + return true; + + // If it is not after a liveness barrier it is live. + const Instruction *PrevI = I->getPrevNode(); + while (PrevI) { + if (KnownDeadEnds.count(PrevI) || ToBeExploredFrom.count(PrevI)) + return true; + PrevI = PrevI->getPrevNode(); + } + return false; + } + + /// See AAIsDead::isKnownDead(Instruction *I). + bool isKnownDead(const Instruction *I) const override { + return getKnown() && isAssumedDead(I); + } + + /// Determine if \p F might catch asynchronous exceptions. + static bool mayCatchAsynchronousExceptions(const Function &F) { + return F.hasPersonalityFn() && !canSimplifyInvokeNoUnwind(&F); + } + + /// Assume \p BB is (partially) live now and indicate to the Attributor \p A + /// that internal function called from \p BB should now be looked at. 
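The per-instruction query above has two parts: an instruction in a block that is not assumed live is dead outright, and an instruction in a live block is dead exactly when some earlier instruction in the same block is a liveness barrier (a known dead end or a pending exploration point). A standalone sketch over instruction positions in one block; the names are illustrative.

#include <cstddef>
#include <unordered_set>

// Instructions in one block are numbered 0..N-1 in program order.
// Barriers holds the positions known (or assumed) not to transfer
// control to the next instruction.
static bool instructionAssumedDead(bool BlockAssumedLive, size_t InstPos,
                                   const std::unordered_set<size_t> &Barriers) {
  if (!BlockAssumedLive)
    return true;              // whole block unreachable
  // Scan backwards: anything after a barrier in a live block is dead.
  for (size_t P = InstPos; P-- > 0;)
    if (Barriers.count(P))
      return true;
  return false;               // no barrier before it: assumed live
}

// Example: with a noreturn call at position 2, position 3 is dead while
// positions 0..2 are still treated as live:
//   instructionAssumedDead(true, 3, {2}) == true
//   instructionAssumedDead(true, 2, {2}) == false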
+ bool assumeLive(Attributor &A, const BasicBlock &BB) { + if (!AssumedLiveBlocks.insert(&BB).second) + return false; + + // We assume that all of BB is (probably) live now and if there are calls to + // internal functions we will assume that those are now live as well. This + // is a performance optimization for blocks with calls to a lot of internal + // functions. It can however cause dead functions to be treated as live. + for (const Instruction &I : BB) + if (ImmutableCallSite ICS = ImmutableCallSite(&I)) + if (const Function *F = ICS.getCalledFunction()) + if (F->hasLocalLinkage()) + A.markLiveInternalFunction(*F); + return true; + } + + /// Collection of instructions that need to be explored again, e.g., we + /// did assume they do not transfer control to (one of their) successors. + SmallSetVector<const Instruction *, 8> ToBeExploredFrom; + + /// Collection of instructions that are known to not transfer control. + SmallSetVector<const Instruction *, 8> KnownDeadEnds; + + /// Collection of all assumed live BasicBlocks. + DenseSet<const BasicBlock *> AssumedLiveBlocks; +}; + +static bool +identifyAliveSuccessors(Attributor &A, const CallBase &CB, + AbstractAttribute &AA, + SmallVectorImpl<const Instruction *> &AliveSuccessors) { + const IRPosition &IPos = IRPosition::callsite_function(CB); + + const auto &NoReturnAA = A.getAAFor<AANoReturn>(AA, IPos); + if (NoReturnAA.isAssumedNoReturn()) + return !NoReturnAA.isKnownNoReturn(); + if (CB.isTerminator()) + AliveSuccessors.push_back(&CB.getSuccessor(0)->front()); + else + AliveSuccessors.push_back(CB.getNextNode()); + return false; +} + +static bool +identifyAliveSuccessors(Attributor &A, const InvokeInst &II, + AbstractAttribute &AA, + SmallVectorImpl<const Instruction *> &AliveSuccessors) { + bool UsedAssumedInformation = + identifyAliveSuccessors(A, cast<CallBase>(II), AA, AliveSuccessors); + + // First, determine if we can change an invoke to a call assuming the + // callee is nounwind. This is not possible if the personality of the + // function allows to catch asynchronous exceptions. 
+ if (AAIsDeadFunction::mayCatchAsynchronousExceptions(*II.getFunction())) { + AliveSuccessors.push_back(&II.getUnwindDest()->front()); + } else { + const IRPosition &IPos = IRPosition::callsite_function(II); + const auto &AANoUnw = A.getAAFor<AANoUnwind>(AA, IPos); + if (AANoUnw.isAssumedNoUnwind()) { + UsedAssumedInformation |= !AANoUnw.isKnownNoUnwind(); + } else { + AliveSuccessors.push_back(&II.getUnwindDest()->front()); + } + } + return UsedAssumedInformation; +} + +static Optional<ConstantInt *> +getAssumedConstant(Attributor &A, const Value &V, AbstractAttribute &AA, + bool &UsedAssumedInformation) { + const auto &ValueSimplifyAA = + A.getAAFor<AAValueSimplify>(AA, IRPosition::value(V)); + Optional<Value *> SimplifiedV = ValueSimplifyAA.getAssumedSimplifiedValue(A); + UsedAssumedInformation |= !ValueSimplifyAA.isKnown(); + if (!SimplifiedV.hasValue()) + return llvm::None; + if (isa_and_nonnull<UndefValue>(SimplifiedV.getValue())) + return llvm::None; + return dyn_cast_or_null<ConstantInt>(SimplifiedV.getValue()); +} + +static bool +identifyAliveSuccessors(Attributor &A, const BranchInst &BI, + AbstractAttribute &AA, + SmallVectorImpl<const Instruction *> &AliveSuccessors) { + bool UsedAssumedInformation = false; + if (BI.getNumSuccessors() == 1) { + AliveSuccessors.push_back(&BI.getSuccessor(0)->front()); + } else { + Optional<ConstantInt *> CI = + getAssumedConstant(A, *BI.getCondition(), AA, UsedAssumedInformation); + if (!CI.hasValue()) { + // No value yet, assume both edges are dead. + } else if (CI.getValue()) { + const BasicBlock *SuccBB = + BI.getSuccessor(1 - CI.getValue()->getZExtValue()); + AliveSuccessors.push_back(&SuccBB->front()); + } else { + AliveSuccessors.push_back(&BI.getSuccessor(0)->front()); + AliveSuccessors.push_back(&BI.getSuccessor(1)->front()); + UsedAssumedInformation = false; + } + } + return UsedAssumedInformation; +} + +static bool +identifyAliveSuccessors(Attributor &A, const SwitchInst &SI, + AbstractAttribute &AA, + SmallVectorImpl<const Instruction *> &AliveSuccessors) { + bool UsedAssumedInformation = false; + Optional<ConstantInt *> CI = + getAssumedConstant(A, *SI.getCondition(), AA, UsedAssumedInformation); + if (!CI.hasValue()) { + // No value yet, assume all edges are dead. + } else if (CI.getValue()) { + for (auto &CaseIt : SI.cases()) { + if (CaseIt.getCaseValue() == CI.getValue()) { + AliveSuccessors.push_back(&CaseIt.getCaseSuccessor()->front()); + return UsedAssumedInformation; } } + AliveSuccessors.push_back(&SI.getDefaultDest()->front()); + return UsedAssumedInformation; + } else { + for (const BasicBlock *SuccBB : successors(SI.getParent())) + AliveSuccessors.push_back(&SuccBB->front()); } - return ChangeStatus::UNCHANGED; + return UsedAssumedInformation; } -/// ------------------------ NonNull Argument Attribute ------------------------ -struct AANonNullImpl : AANonNull, BooleanState { +ChangeStatus AAIsDeadFunction::updateImpl(Attributor &A) { + ChangeStatus Change = ChangeStatus::UNCHANGED; + + LLVM_DEBUG(dbgs() << "[AAIsDead] Live [" << AssumedLiveBlocks.size() << "/" + << getAssociatedFunction()->size() << "] BBs and " + << ToBeExploredFrom.size() << " exploration points and " + << KnownDeadEnds.size() << " known dead ends\n"); - AANonNullImpl(Value &V, InformationCache &InfoCache) - : AANonNull(V, InfoCache) {} + // Copy and clear the list of instructions we need to explore from. It is + // refilled with instructions the next update has to look at. 
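For a conditional branch, successor 0 is the taken ("true") edge and successor 1 the "false" edge, so once the condition simplifies to a constant C only successor 1 - C stays alive; if no constant is known yet, both edges are optimistically left dead for now, and if the condition is simply not a constant, both stay alive. A minimal sketch of that selection in plain C++ (no LLVM types).

#include <vector>

// State of the branch condition after value simplification.
enum class CondInfo { Unknown, ConstTrue, ConstFalse, NotConstant };

// Indices of the successors that have to be treated as alive.
// Successor 0 is the taken ("true") edge, successor 1 the fall-through
// ("false") edge, matching BranchInst successor numbering.
static std::vector<int> aliveBranchSuccessors(CondInfo Cond) {
  switch (Cond) {
  case CondInfo::Unknown:
    return {};          // no value yet: optimistically assume both dead
  case CondInfo::ConstTrue:
    return {0};         // i1 true  -> successor (1 - 1) == 0
  case CondInfo::ConstFalse:
    return {1};         // i1 false -> successor (1 - 0) == 1
  case CondInfo::NotConstant:
    return {0, 1};      // unknown condition: both edges stay alive
  }
  return {0, 1};
}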
+ SmallVector<const Instruction *, 8> Worklist(ToBeExploredFrom.begin(), + ToBeExploredFrom.end()); + decltype(ToBeExploredFrom) NewToBeExploredFrom; - AANonNullImpl(Value *AssociatedVal, Value &AnchoredValue, - InformationCache &InfoCache) - : AANonNull(AssociatedVal, AnchoredValue, InfoCache) {} + SmallVector<const Instruction *, 8> AliveSuccessors; + while (!Worklist.empty()) { + const Instruction *I = Worklist.pop_back_val(); + LLVM_DEBUG(dbgs() << "[AAIsDead] Exploration inst: " << *I << "\n"); + + AliveSuccessors.clear(); + + bool UsedAssumedInformation = false; + switch (I->getOpcode()) { + // TODO: look for (assumed) UB to backwards propagate "deadness". + default: + if (I->isTerminator()) { + for (const BasicBlock *SuccBB : successors(I->getParent())) + AliveSuccessors.push_back(&SuccBB->front()); + } else { + AliveSuccessors.push_back(I->getNextNode()); + } + break; + case Instruction::Call: + UsedAssumedInformation = identifyAliveSuccessors(A, cast<CallInst>(*I), + *this, AliveSuccessors); + break; + case Instruction::Invoke: + UsedAssumedInformation = identifyAliveSuccessors(A, cast<InvokeInst>(*I), + *this, AliveSuccessors); + break; + case Instruction::Br: + UsedAssumedInformation = identifyAliveSuccessors(A, cast<BranchInst>(*I), + *this, AliveSuccessors); + break; + case Instruction::Switch: + UsedAssumedInformation = identifyAliveSuccessors(A, cast<SwitchInst>(*I), + *this, AliveSuccessors); + break; + } + + if (UsedAssumedInformation) { + NewToBeExploredFrom.insert(I); + } else { + Change = ChangeStatus::CHANGED; + if (AliveSuccessors.empty() || + (I->isTerminator() && AliveSuccessors.size() < I->getNumSuccessors())) + KnownDeadEnds.insert(I); + } + + LLVM_DEBUG(dbgs() << "[AAIsDead] #AliveSuccessors: " + << AliveSuccessors.size() << " UsedAssumedInformation: " + << UsedAssumedInformation << "\n"); + + for (const Instruction *AliveSuccessor : AliveSuccessors) { + if (!I->isTerminator()) { + assert(AliveSuccessors.size() == 1 && + "Non-terminator expected to have a single successor!"); + Worklist.push_back(AliveSuccessor); + } else { + if (assumeLive(A, *AliveSuccessor->getParent())) + Worklist.push_back(AliveSuccessor); + } + } + } + + ToBeExploredFrom = std::move(NewToBeExploredFrom); + + // If we know everything is live there is no need to query for liveness. + // Instead, indicating a pessimistic fixpoint will cause the state to be + // "invalid" and all queries to be answered conservatively without lookups. + // To be in this state we have to (1) finished the exploration and (3) not + // discovered any non-trivial dead end and (2) not ruled unreachable code + // dead. + if (ToBeExploredFrom.empty() && + getAssociatedFunction()->size() == AssumedLiveBlocks.size() && + llvm::all_of(KnownDeadEnds, [](const Instruction *DeadEndI) { + return DeadEndI->isTerminator() && DeadEndI->getNumSuccessors() == 0; + })) + return indicatePessimisticFixpoint(); + return Change; +} + +/// Liveness information for a call sites. +struct AAIsDeadCallSite final : AAIsDeadFunction { + AAIsDeadCallSite(const IRPosition &IRP) : AAIsDeadFunction(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites instead of + // redirecting requests to the callee. 
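The update above is a worklist fixpoint: it pops exploration points, asks each for its alive successors, marks newly reached blocks live, and re-queues any point whose answer relied on assumed rather than known information so it is revisited in the next update. A compact standalone sketch of that shape over a toy block-indexed CFG, with the query callback standing in for identifyAliveSuccessors; all names are illustrative.

#include <functional>
#include <unordered_set>
#include <vector>

struct ExploreResult {
  std::vector<int> AliveSuccs;   // blocks reached from this point
  bool UsedAssumedInfo;          // answer may still change next round
};

// One round of liveness exploration; returns the blocks assumed live
// and refills ToExplore with the points that must be revisited.
static std::unordered_set<int>
exploreLiveBlocks(int Entry, std::unordered_set<int> &ToExplore,
                  const std::function<ExploreResult(int)> &Query) {
  std::unordered_set<int> AssumedLive{Entry};
  std::vector<int> Worklist(ToExplore.begin(), ToExplore.end());
  std::unordered_set<int> NextToExplore;

  while (!Worklist.empty()) {
    int BB = Worklist.back();
    Worklist.pop_back();
    ExploreResult R = Query(BB);
    if (R.UsedAssumedInfo)
      NextToExplore.insert(BB);          // ask again in the next update
    for (int Succ : R.AliveSuccs)
      if (AssumedLive.insert(Succ).second)
        Worklist.push_back(Succ);        // newly live: explore onwards
  }
  ToExplore = std::move(NextToExplore);
  return AssumedLive;
}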
+ llvm_unreachable("Abstract attributes for liveness are not " + "supported for call sites yet!"); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + return indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override {} +}; + +/// -------------------- Dereferenceable Argument Attribute -------------------- + +template <> +ChangeStatus clampStateAndIndicateChange<DerefState>(DerefState &S, + const DerefState &R) { + ChangeStatus CS0 = + clampStateAndIndicateChange(S.DerefBytesState, R.DerefBytesState); + ChangeStatus CS1 = clampStateAndIndicateChange(S.GlobalState, R.GlobalState); + return CS0 | CS1; +} + +struct AADereferenceableImpl : AADereferenceable { + AADereferenceableImpl(const IRPosition &IRP) : AADereferenceable(IRP) {} + using StateType = DerefState; + + void initialize(Attributor &A) override { + SmallVector<Attribute, 4> Attrs; + getAttrs({Attribute::Dereferenceable, Attribute::DereferenceableOrNull}, + Attrs); + for (const Attribute &Attr : Attrs) + takeKnownDerefBytesMaximum(Attr.getValueAsInt()); + + NonNullAA = &A.getAAFor<AANonNull>(*this, getIRPosition()); + + const IRPosition &IRP = this->getIRPosition(); + bool IsFnInterface = IRP.isFnInterfaceKind(); + const Function *FnScope = IRP.getAnchorScope(); + if (IsFnInterface && (!FnScope || !FnScope->hasExactDefinition())) + indicatePessimisticFixpoint(); + } /// See AbstractAttribute::getState() /// { - AbstractState &getState() override { return *this; } - const AbstractState &getState() const override { return *this; } + StateType &getState() override { return *this; } + const StateType &getState() const override { return *this; } /// } + /// Helper function for collecting accessed bytes in must-be-executed-context + void addAccessedBytesForUse(Attributor &A, const Use *U, + const Instruction *I) { + const Value *UseV = U->get(); + if (!UseV->getType()->isPointerTy()) + return; + + Type *PtrTy = UseV->getType(); + const DataLayout &DL = A.getDataLayout(); + int64_t Offset; + if (const Value *Base = getBasePointerOfAccessPointerOperand( + I, Offset, DL, /*AllowNonInbounds*/ true)) { + if (Base == &getAssociatedValue() && + Attributor::getPointerOperand(I, /* AllowVolatile */ false) == UseV) { + uint64_t Size = DL.getTypeStoreSize(PtrTy->getPointerElementType()); + addAccessedBytes(Offset, Size); + } + } + return; + } + + /// See AAFromMustBeExecutedContext + bool followUse(Attributor &A, const Use *U, const Instruction *I) { + bool IsNonNull = false; + bool TrackUse = false; + int64_t DerefBytes = getKnownNonNullAndDerefBytesForUse( + A, *this, getAssociatedValue(), U, I, IsNonNull, TrackUse); + + addAccessedBytesForUse(A, U, I); + takeKnownDerefBytesMaximum(DerefBytes); + return TrackUse; + } + + /// See AbstractAttribute::manifest(...). 
+ ChangeStatus manifest(Attributor &A) override { + ChangeStatus Change = AADereferenceable::manifest(A); + if (isAssumedNonNull() && hasAttr(Attribute::DereferenceableOrNull)) { + removeAttrs({Attribute::DereferenceableOrNull}); + return ChangeStatus::CHANGED; + } + return Change; + } + + void getDeducedAttributes(LLVMContext &Ctx, + SmallVectorImpl<Attribute> &Attrs) const override { + // TODO: Add *_globally support + if (isAssumedNonNull()) + Attrs.emplace_back(Attribute::getWithDereferenceableBytes( + Ctx, getAssumedDereferenceableBytes())); + else + Attrs.emplace_back(Attribute::getWithDereferenceableOrNullBytes( + Ctx, getAssumedDereferenceableBytes())); + } + /// See AbstractAttribute::getAsStr(). const std::string getAsStr() const override { - return getAssumed() ? "nonnull" : "may-null"; + if (!getAssumedDereferenceableBytes()) + return "unknown-dereferenceable"; + return std::string("dereferenceable") + + (isAssumedNonNull() ? "" : "_or_null") + + (isAssumedGlobal() ? "_globally" : "") + "<" + + std::to_string(getKnownDereferenceableBytes()) + "-" + + std::to_string(getAssumedDereferenceableBytes()) + ">"; } +}; - /// See AANonNull::isAssumedNonNull(). - bool isAssumedNonNull() const override { return getAssumed(); } +/// Dereferenceable attribute for a floating value. +struct AADereferenceableFloating + : AAFromMustBeExecutedContext<AADereferenceable, AADereferenceableImpl> { + using Base = + AAFromMustBeExecutedContext<AADereferenceable, AADereferenceableImpl>; + AADereferenceableFloating(const IRPosition &IRP) : Base(IRP) {} - /// See AANonNull::isKnownNonNull(). - bool isKnownNonNull() const override { return getKnown(); } + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + ChangeStatus Change = Base::updateImpl(A); + + const DataLayout &DL = A.getDataLayout(); + + auto VisitValueCB = [&](Value &V, DerefState &T, bool Stripped) -> bool { + unsigned IdxWidth = + DL.getIndexSizeInBits(V.getType()->getPointerAddressSpace()); + APInt Offset(IdxWidth, 0); + const Value *Base = + V.stripAndAccumulateInBoundsConstantOffsets(DL, Offset); + + const auto &AA = + A.getAAFor<AADereferenceable>(*this, IRPosition::value(*Base)); + int64_t DerefBytes = 0; + if (!Stripped && this == &AA) { + // Use IR information if we did not strip anything. + // TODO: track globally. + bool CanBeNull; + DerefBytes = Base->getPointerDereferenceableBytes(DL, CanBeNull); + T.GlobalState.indicatePessimisticFixpoint(); + } else { + const DerefState &DS = static_cast<const DerefState &>(AA.getState()); + DerefBytes = DS.DerefBytesState.getAssumed(); + T.GlobalState &= DS.GlobalState; + } + + // TODO: Use `AAConstantRange` to infer dereferenceable bytes. + + // For now we do not try to "increase" dereferenceability due to negative + // indices as we first have to come up with code to deal with loops and + // for overflows of the dereferenceable bytes. + int64_t OffsetSExt = Offset.getSExtValue(); + if (OffsetSExt < 0) + OffsetSExt = 0; + + T.takeAssumedDerefBytesMinimum( + std::max(int64_t(0), DerefBytes - OffsetSExt)); + + if (this == &AA) { + if (!Stripped) { + // If nothing was stripped IR information is all we got. + T.takeKnownDerefBytesMaximum( + std::max(int64_t(0), DerefBytes - OffsetSExt)); + T.indicatePessimisticFixpoint(); + } else if (OffsetSExt > 0) { + // If something was stripped but there is circular reasoning we look + // for the offset. 
If it is positive we basically decrease the + // dereferenceable bytes in a circluar loop now, which will simply + // drive them down to the known value in a very slow way which we + // can accelerate. + T.indicatePessimisticFixpoint(); + } + } + + return T.isValidState(); + }; - /// Generate a predicate that checks if a given value is assumed nonnull. - /// The generated function returns true if a value satisfies any of - /// following conditions. - /// (i) A value is known nonZero(=nonnull). - /// (ii) A value is associated with AANonNull and its isAssumedNonNull() is - /// true. - std::function<bool(Value &)> generatePredicate(Attributor &); + DerefState T; + if (!genericValueTraversal<AADereferenceable, DerefState>( + A, getIRPosition(), *this, T, VisitValueCB)) + return indicatePessimisticFixpoint(); + + return Change | clampStateAndIndicateChange(getState(), T); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_FLOATING_ATTR(dereferenceable) + } }; -std::function<bool(Value &)> AANonNullImpl::generatePredicate(Attributor &A) { - // FIXME: The `AAReturnedValues` should provide the predicate with the - // `ReturnInst` vector as well such that we can use the control flow sensitive - // version of `isKnownNonZero`. This should fix `test11` in - // `test/Transforms/FunctionAttrs/nonnull.ll` +/// Dereferenceable attribute for a return value. +struct AADereferenceableReturned final + : AAReturnedFromReturnedValues<AADereferenceable, AADereferenceableImpl, + DerefState> { + AADereferenceableReturned(const IRPosition &IRP) + : AAReturnedFromReturnedValues<AADereferenceable, AADereferenceableImpl, + DerefState>(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_FNRET_ATTR(dereferenceable) + } +}; - std::function<bool(Value &)> Pred = [&](Value &RV) -> bool { - if (isKnownNonZero(&RV, getAnchorScope().getParent()->getDataLayout())) - return true; +/// Dereferenceable attribute for an argument +struct AADereferenceableArgument final + : AAArgumentFromCallSiteArgumentsAndMustBeExecutedContext< + AADereferenceable, AADereferenceableImpl, DerefState> { + using Base = AAArgumentFromCallSiteArgumentsAndMustBeExecutedContext< + AADereferenceable, AADereferenceableImpl, DerefState>; + AADereferenceableArgument(const IRPosition &IRP) : Base(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_ARG_ATTR(dereferenceable) + } +}; - auto *NonNullAA = A.getAAFor<AANonNull>(*this, RV); +/// Dereferenceable attribute for a call site argument. +struct AADereferenceableCallSiteArgument final : AADereferenceableFloating { + AADereferenceableCallSiteArgument(const IRPosition &IRP) + : AADereferenceableFloating(IRP) {} - ImmutableCallSite ICS(&RV); + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_CSARG_ATTR(dereferenceable) + } +}; - if ((!NonNullAA || !NonNullAA->isAssumedNonNull()) && - (!ICS || !ICS.hasRetAttr(Attribute::NonNull))) - return false; +/// Dereferenceable attribute deduction for a call site return value. 
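The traversal above strips a constant in-bounds offset down to a base pointer, reads how many bytes the base is dereferenceable, and credits the derived pointer with the remainder, clamping both a negative offset and a negative result to zero. A tiny sketch of just that arithmetic (not the LLVM API).

#include <algorithm>
#include <cstdint>

// Dereferenceable bytes that can be claimed for `base + Offset` when
// `base` is known dereferenceable for BaseDerefBytes bytes.
static int64_t derefBytesAtOffset(int64_t BaseDerefBytes, int64_t Offset) {
  // Negative offsets are not used to "grow" dereferenceability; they
  // are conservatively treated like offset 0.
  if (Offset < 0)
    Offset = 0;
  return std::max<int64_t>(0, BaseDerefBytes - Offset);
}

// Example: a base dereferenceable for 16 bytes accessed at +4 yields
// dereferenceable(12); at +32 nothing can be claimed:
//   derefBytesAtOffset(16, 4)  == 12
//   derefBytesAtOffset(16, 32) == 0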
+struct AADereferenceableCallSiteReturned final + : AACallSiteReturnedFromReturnedAndMustBeExecutedContext< + AADereferenceable, AADereferenceableImpl> { + using Base = AACallSiteReturnedFromReturnedAndMustBeExecutedContext< + AADereferenceable, AADereferenceableImpl>; + AADereferenceableCallSiteReturned(const IRPosition &IRP) : Base(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_CS_ATTR(dereferenceable); + } +}; - return true; - }; +// ------------------------ Align Argument Attribute ------------------------ + +static unsigned int getKnownAlignForUse(Attributor &A, + AbstractAttribute &QueryingAA, + Value &AssociatedValue, const Use *U, + const Instruction *I, bool &TrackUse) { + // We need to follow common pointer manipulation uses to the accesses they + // feed into. + if (isa<CastInst>(I)) { + // Follow all but ptr2int casts. + TrackUse = !isa<PtrToIntInst>(I); + return 0; + } + if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) { + if (GEP->hasAllConstantIndices()) { + TrackUse = true; + return 0; + } + } + + unsigned Alignment = 0; + if (ImmutableCallSite ICS = ImmutableCallSite(I)) { + if (ICS.isBundleOperand(U) || ICS.isCallee(U)) + return 0; + + unsigned ArgNo = ICS.getArgumentNo(U); + IRPosition IRP = IRPosition::callsite_argument(ICS, ArgNo); + // As long as we only use known information there is no need to track + // dependences here. + auto &AlignAA = A.getAAFor<AAAlign>(QueryingAA, IRP, + /* TrackDependence */ false); + Alignment = AlignAA.getKnownAlign(); + } + + const Value *UseV = U->get(); + if (auto *SI = dyn_cast<StoreInst>(I)) + Alignment = SI->getAlignment(); + else if (auto *LI = dyn_cast<LoadInst>(I)) + Alignment = LI->getAlignment(); - return Pred; + if (Alignment <= 1) + return 0; + + auto &DL = A.getDataLayout(); + int64_t Offset; + + if (const Value *Base = GetPointerBaseWithConstantOffset(UseV, Offset, DL)) { + if (Base == &AssociatedValue) { + // BasePointerAddr + Offset = Alignment * Q for some integer Q. + // So we can say that the maximum power of two which is a divisor of + // gcd(Offset, Alignment) is an alignment. + + uint32_t gcd = + greatestCommonDivisor(uint32_t(abs((int32_t)Offset)), Alignment); + Alignment = llvm::PowerOf2Floor(gcd); + } + } + + return Alignment; } +struct AAAlignImpl : AAAlign { + AAAlignImpl(const IRPosition &IRP) : AAAlign(IRP) {} -/// NonNull attribute for function return value. -struct AANonNullReturned : AANonNullImpl { + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + SmallVector<Attribute, 4> Attrs; + getAttrs({Attribute::Alignment}, Attrs); + for (const Attribute &Attr : Attrs) + takeKnownMaximum(Attr.getValueAsInt()); + + if (getIRPosition().isFnInterfaceKind() && + (!getAssociatedFunction() || + !getAssociatedFunction()->hasExactDefinition())) + indicatePessimisticFixpoint(); + } - AANonNullReturned(Function &F, InformationCache &InfoCache) - : AANonNullImpl(F, InfoCache) {} + /// See AbstractAttribute::manifest(...). + ChangeStatus manifest(Attributor &A) override { + ChangeStatus Changed = ChangeStatus::UNCHANGED; + + // Check for users that allow alignment annotations. 
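getKnownAlignForUse above reasons as follows: if the access at base + Offset is known to be Align-aligned, then base differs from a multiple of Align by exactly Offset, so base is at least aligned to the largest power of two dividing gcd(|Offset|, Align). A standalone sketch of exactly that computation, with local stand-ins for greatestCommonDivisor and PowerOf2Floor.

#include <cstdint>

static uint32_t gcdU32(uint32_t A, uint32_t B) {
  while (B != 0) {
    uint32_t T = A % B;
    A = B;
    B = T;
  }
  return A;
}

static uint32_t powerOf2Floor(uint32_t X) {
  uint32_t P = 0;
  for (uint32_t C = 1; C != 0 && C <= X; C <<= 1)
    P = C;                       // largest power of two <= X
  return P;
}

// Alignment that can be claimed for the base pointer when the access
// `base + Offset` is known to be Align-aligned.
static uint32_t alignFromUse(int64_t Offset, uint32_t Align) {
  uint32_t AbsOffset = uint32_t(Offset < 0 ? -Offset : Offset);
  return powerOf2Floor(gcdU32(AbsOffset, Align));
}

// Example: a 16-byte aligned store to base+4 only proves the base is
// 4-byte aligned, while a store to base+0 keeps the full 16:
//   alignFromUse(4, 16) == 4
//   alignFromUse(0, 16) == 16   (gcd(0, 16) == 16)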
+ Value &AnchorVal = getIRPosition().getAnchorValue(); + for (const Use &U : AnchorVal.uses()) { + if (auto *SI = dyn_cast<StoreInst>(U.getUser())) { + if (SI->getPointerOperand() == &AnchorVal) + if (SI->getAlignment() < getAssumedAlign()) { + STATS_DECLTRACK(AAAlign, Store, + "Number of times alignment added to a store"); + SI->setAlignment(Align(getAssumedAlign())); + Changed = ChangeStatus::CHANGED; + } + } else if (auto *LI = dyn_cast<LoadInst>(U.getUser())) { + if (LI->getPointerOperand() == &AnchorVal) + if (LI->getAlignment() < getAssumedAlign()) { + LI->setAlignment(Align(getAssumedAlign())); + STATS_DECLTRACK(AAAlign, Load, + "Number of times alignment added to a load"); + Changed = ChangeStatus::CHANGED; + } + } + } + + return AAAlign::manifest(A) | Changed; + } + + // TODO: Provide a helper to determine the implied ABI alignment and check in + // the existing manifest method and a new one for AAAlignImpl that value + // to avoid making the alignment explicit if it did not improve. + + /// See AbstractAttribute::getDeducedAttributes + virtual void + getDeducedAttributes(LLVMContext &Ctx, + SmallVectorImpl<Attribute> &Attrs) const override { + if (getAssumedAlign() > 1) + Attrs.emplace_back( + Attribute::getWithAlignment(Ctx, Align(getAssumedAlign()))); + } + /// See AAFromMustBeExecutedContext + bool followUse(Attributor &A, const Use *U, const Instruction *I) { + bool TrackUse = false; + + unsigned int KnownAlign = + getKnownAlignForUse(A, *this, getAssociatedValue(), U, I, TrackUse); + takeKnownMaximum(KnownAlign); + + return TrackUse; + } + + /// See AbstractAttribute::getAsStr(). + const std::string getAsStr() const override { + return getAssumedAlign() ? ("align<" + std::to_string(getKnownAlign()) + + "-" + std::to_string(getAssumedAlign()) + ">") + : "unknown-align"; + } +}; + +/// Align attribute for a floating value. +struct AAAlignFloating : AAFromMustBeExecutedContext<AAAlign, AAAlignImpl> { + using Base = AAFromMustBeExecutedContext<AAAlign, AAAlignImpl>; + AAAlignFloating(const IRPosition &IRP) : Base(IRP) {} - /// See AbstractAttribute::getManifestPosition(). - ManifestPosition getManifestPosition() const override { return MP_RETURNED; } + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + Base::updateImpl(A); + + const DataLayout &DL = A.getDataLayout(); + + auto VisitValueCB = [&](Value &V, AAAlign::StateType &T, + bool Stripped) -> bool { + const auto &AA = A.getAAFor<AAAlign>(*this, IRPosition::value(V)); + if (!Stripped && this == &AA) { + // Use only IR information if we did not strip anything. + const MaybeAlign PA = V.getPointerAlignment(DL); + T.takeKnownMaximum(PA ? PA->value() : 0); + T.indicatePessimisticFixpoint(); + } else { + // Use abstract attribute information. + const AAAlign::StateType &DS = + static_cast<const AAAlign::StateType &>(AA.getState()); + T ^= DS; + } + return T.isValidState(); + }; + + StateType T; + if (!genericValueTraversal<AAAlign, StateType>(A, getIRPosition(), *this, T, + VisitValueCB)) + return indicatePessimisticFixpoint(); + + // TODO: If we know we visited all incoming values, thus no are assumed + // dead, we can take the known information from the state T. + return clampStateAndIndicateChange(getState(), T); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FLOATING_ATTR(align) } +}; - /// See AbstractAttriubute::initialize(...). +/// Align attribute for function return value. 
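When the floating align attribute combines several underlying values (for example the incoming values of a PHI) it can only keep a guarantee that holds for every one of them, so the effective meet on the assumed alignment is a minimum. A sketch of that effective combination, assuming plain integers instead of the pass's IntegerState machinery (which additionally tracks known bounds).

#include <algorithm>
#include <cstdint>
#include <vector>

// Alignment that may be announced for a value that can be any one of
// several underlying pointers: only what holds for every incoming
// value survives the meet.
static uint64_t combineIncomingAlignments(const std::vector<uint64_t> &Aligns) {
  if (Aligns.empty())
    return 1;                      // nothing to learn from
  uint64_t Result = UINT64_MAX;    // optimistic "best" state
  for (uint64_t A : Aligns)
    Result = std::min(Result, A == 0 ? 1 : A);   // unknown counts as align 1
  return Result;
}

// Example: a PHI of a 16-byte aligned alloca and an 8-byte aligned
// argument can only be announced as align 8:
//   combineIncomingAlignments({16, 8}) == 8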
+struct AAAlignReturned final + : AAReturnedFromReturnedValues<AAAlign, AAAlignImpl> { + AAAlignReturned(const IRPosition &IRP) + : AAReturnedFromReturnedValues<AAAlign, AAAlignImpl>(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(aligned) } +}; + +/// Align attribute for function argument. +struct AAAlignArgument final + : AAArgumentFromCallSiteArgumentsAndMustBeExecutedContext<AAAlign, + AAAlignImpl> { + AAAlignArgument(const IRPosition &IRP) + : AAArgumentFromCallSiteArgumentsAndMustBeExecutedContext<AAAlign, + AAAlignImpl>( + IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(aligned) } +}; + +struct AAAlignCallSiteArgument final : AAAlignFloating { + AAAlignCallSiteArgument(const IRPosition &IRP) : AAAlignFloating(IRP) {} + + /// See AbstractAttribute::manifest(...). + ChangeStatus manifest(Attributor &A) override { + return AAAlignImpl::manifest(A); + } + + /// See AbstractAttribute::updateImpl(Attributor &A). + ChangeStatus updateImpl(Attributor &A) override { + ChangeStatus Changed = AAAlignFloating::updateImpl(A); + if (Argument *Arg = getAssociatedArgument()) { + const auto &ArgAlignAA = A.getAAFor<AAAlign>( + *this, IRPosition::argument(*Arg), /* TrackDependence */ false, + DepClassTy::OPTIONAL); + takeKnownMaximum(ArgAlignAA.getKnownAlign()); + } + return Changed; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(aligned) } +}; + +/// Align attribute deduction for a call site return value. +struct AAAlignCallSiteReturned final + : AACallSiteReturnedFromReturnedAndMustBeExecutedContext<AAAlign, + AAAlignImpl> { + using Base = + AACallSiteReturnedFromReturnedAndMustBeExecutedContext<AAAlign, + AAAlignImpl>; + AAAlignCallSiteReturned(const IRPosition &IRP) : Base(IRP) {} + + /// See AbstractAttribute::initialize(...). void initialize(Attributor &A) override { - Function &F = getAnchorScope(); + Base::initialize(A); + Function *F = getAssociatedFunction(); + if (!F) + indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(align); } +}; - // Already nonnull. - if (F.getAttributes().hasAttribute(AttributeList::ReturnIndex, - Attribute::NonNull)) +/// ------------------ Function No-Return Attribute ---------------------------- +struct AANoReturnImpl : public AANoReturn { + AANoReturnImpl(const IRPosition &IRP) : AANoReturn(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AANoReturn::initialize(A); + Function *F = getAssociatedFunction(); + if (!F) + indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::getAsStr(). + const std::string getAsStr() const override { + return getAssumed() ? "noreturn" : "may-return"; + } + + /// See AbstractAttribute::updateImpl(Attributor &A). 
+ virtual ChangeStatus updateImpl(Attributor &A) override { + auto CheckForNoReturn = [](Instruction &) { return false; }; + if (!A.checkForAllInstructions(CheckForNoReturn, *this, + {(unsigned)Instruction::Ret})) + return indicatePessimisticFixpoint(); + return ChangeStatus::UNCHANGED; + } +}; + +struct AANoReturnFunction final : AANoReturnImpl { + AANoReturnFunction(const IRPosition &IRP) : AANoReturnImpl(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(noreturn) } +}; + +/// NoReturn attribute deduction for a call sites. +struct AANoReturnCallSite final : AANoReturnImpl { + AANoReturnCallSite(const IRPosition &IRP) : AANoReturnImpl(IRP) {} + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. + Function *F = getAssociatedFunction(); + const IRPosition &FnPos = IRPosition::function(*F); + auto &FnAA = A.getAAFor<AANoReturn>(*this, FnPos); + return clampStateAndIndicateChange( + getState(), + static_cast<const AANoReturn::StateType &>(FnAA.getState())); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(noreturn); } +}; + +/// ----------------------- Variable Capturing --------------------------------- + +/// A class to hold the state of for no-capture attributes. +struct AANoCaptureImpl : public AANoCapture { + AANoCaptureImpl(const IRPosition &IRP) : AANoCapture(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + if (hasAttr(getAttrKind(), /* IgnoreSubsumingPositions */ true)) { indicateOptimisticFixpoint(); + return; + } + Function *AnchorScope = getAnchorScope(); + if (isFnInterfaceKind() && + (!AnchorScope || !AnchorScope->hasExactDefinition())) { + indicatePessimisticFixpoint(); + return; + } + + // You cannot "capture" null in the default address space. + if (isa<ConstantPointerNull>(getAssociatedValue()) && + getAssociatedValue().getType()->getPointerAddressSpace() == 0) { + indicateOptimisticFixpoint(); + return; + } + + const Function *F = getArgNo() >= 0 ? getAssociatedFunction() : AnchorScope; + + // Check what state the associated function can actually capture. + if (F) + determineFunctionCaptureCapabilities(getIRPosition(), *F, *this); + else + indicatePessimisticFixpoint(); } /// See AbstractAttribute::updateImpl(...). ChangeStatus updateImpl(Attributor &A) override; + + /// see AbstractAttribute::isAssumedNoCaptureMaybeReturned(...). + virtual void + getDeducedAttributes(LLVMContext &Ctx, + SmallVectorImpl<Attribute> &Attrs) const override { + if (!isAssumedNoCaptureMaybeReturned()) + return; + + if (getArgNo() >= 0) { + if (isAssumedNoCapture()) + Attrs.emplace_back(Attribute::get(Ctx, Attribute::NoCapture)); + else if (ManifestInternal) + Attrs.emplace_back(Attribute::get(Ctx, "no-capture-maybe-returned")); + } + } + + /// Set the NOT_CAPTURED_IN_MEM and NOT_CAPTURED_IN_RET bits in \p Known + /// depending on the ability of the function associated with \p IRP to capture + /// state in memory and through "returning/throwing", respectively. 
+ static void determineFunctionCaptureCapabilities(const IRPosition &IRP, + const Function &F, + BitIntegerState &State) { + // TODO: Once we have memory behavior attributes we should use them here. + + // If we know we cannot communicate or write to memory, we do not care about + // ptr2int anymore. + if (F.onlyReadsMemory() && F.doesNotThrow() && + F.getReturnType()->isVoidTy()) { + State.addKnownBits(NO_CAPTURE); + return; + } + + // A function cannot capture state in memory if it only reads memory, it can + // however return/throw state and the state might be influenced by the + // pointer value, e.g., loading from a returned pointer might reveal a bit. + if (F.onlyReadsMemory()) + State.addKnownBits(NOT_CAPTURED_IN_MEM); + + // A function cannot communicate state back if it does not through + // exceptions and doesn not return values. + if (F.doesNotThrow() && F.getReturnType()->isVoidTy()) + State.addKnownBits(NOT_CAPTURED_IN_RET); + + // Check existing "returned" attributes. + int ArgNo = IRP.getArgNo(); + if (F.doesNotThrow() && ArgNo >= 0) { + for (unsigned u = 0, e = F.arg_size(); u < e; ++u) + if (F.hasParamAttribute(u, Attribute::Returned)) { + if (u == unsigned(ArgNo)) + State.removeAssumedBits(NOT_CAPTURED_IN_RET); + else if (F.onlyReadsMemory()) + State.addKnownBits(NO_CAPTURE); + else + State.addKnownBits(NOT_CAPTURED_IN_RET); + break; + } + } + } + + /// See AbstractState::getAsStr(). + const std::string getAsStr() const override { + if (isKnownNoCapture()) + return "known not-captured"; + if (isAssumedNoCapture()) + return "assumed not-captured"; + if (isKnownNoCaptureMaybeReturned()) + return "known not-captured-maybe-returned"; + if (isAssumedNoCaptureMaybeReturned()) + return "assumed not-captured-maybe-returned"; + return "assumed-captured"; + } }; -ChangeStatus AANonNullReturned::updateImpl(Attributor &A) { - Function &F = getAnchorScope(); +/// Attributor-aware capture tracker. +struct AACaptureUseTracker final : public CaptureTracker { - auto *AARetVal = A.getAAFor<AAReturnedValues>(*this, F); - if (!AARetVal) { - indicatePessimisticFixpoint(); - return ChangeStatus::CHANGED; + /// Create a capture tracker that can lookup in-flight abstract attributes + /// through the Attributor \p A. + /// + /// If a use leads to a potential capture, \p CapturedInMemory is set and the + /// search is stopped. If a use leads to a return instruction, + /// \p CommunicatedBack is set to true and \p CapturedInMemory is not changed. + /// If a use leads to a ptr2int which may capture the value, + /// \p CapturedInInteger is set. If a use is found that is currently assumed + /// "no-capture-maybe-returned", the user is added to the \p PotentialCopies + /// set. All values in \p PotentialCopies are later tracked as well. For every + /// explored use we decrement \p RemainingUsesToExplore. Once it reaches 0, + /// the search is stopped with \p CapturedInMemory and \p CapturedInInteger + /// conservatively set to true. + AACaptureUseTracker(Attributor &A, AANoCapture &NoCaptureAA, + const AAIsDead &IsDeadAA, AANoCapture::StateType &State, + SmallVectorImpl<const Value *> &PotentialCopies, + unsigned &RemainingUsesToExplore) + : A(A), NoCaptureAA(NoCaptureAA), IsDeadAA(IsDeadAA), State(State), + PotentialCopies(PotentialCopies), + RemainingUsesToExplore(RemainingUsesToExplore) {} + + /// Determine if \p V maybe captured. 
*Also updates the state!* + bool valueMayBeCaptured(const Value *V) { + if (V->getType()->isPointerTy()) { + PointerMayBeCaptured(V, this); + } else { + State.indicatePessimisticFixpoint(); + } + return State.isAssumed(AANoCapture::NO_CAPTURE_MAYBE_RETURNED); } - std::function<bool(Value &)> Pred = this->generatePredicate(A); - if (!AARetVal->checkForallReturnedValues(Pred)) { - indicatePessimisticFixpoint(); - return ChangeStatus::CHANGED; + /// See CaptureTracker::tooManyUses(). + void tooManyUses() override { + State.removeAssumedBits(AANoCapture::NO_CAPTURE); } - return ChangeStatus::UNCHANGED; + + bool isDereferenceableOrNull(Value *O, const DataLayout &DL) override { + if (CaptureTracker::isDereferenceableOrNull(O, DL)) + return true; + const auto &DerefAA = + A.getAAFor<AADereferenceable>(NoCaptureAA, IRPosition::value(*O)); + return DerefAA.getAssumedDereferenceableBytes(); + } + + /// See CaptureTracker::captured(...). + bool captured(const Use *U) override { + Instruction *UInst = cast<Instruction>(U->getUser()); + LLVM_DEBUG(dbgs() << "Check use: " << *U->get() << " in " << *UInst + << "\n"); + + // Because we may reuse the tracker multiple times we keep track of the + // number of explored uses ourselves as well. + if (RemainingUsesToExplore-- == 0) { + LLVM_DEBUG(dbgs() << " - too many uses to explore!\n"); + return isCapturedIn(/* Memory */ true, /* Integer */ true, + /* Return */ true); + } + + // Deal with ptr2int by following uses. + if (isa<PtrToIntInst>(UInst)) { + LLVM_DEBUG(dbgs() << " - ptr2int assume the worst!\n"); + return valueMayBeCaptured(UInst); + } + + // Explicitly catch return instructions. + if (isa<ReturnInst>(UInst)) + return isCapturedIn(/* Memory */ false, /* Integer */ false, + /* Return */ true); + + // For now we only use special logic for call sites. However, the tracker + // itself knows about a lot of other non-capturing cases already. + CallSite CS(UInst); + if (!CS || !CS.isArgOperand(U)) + return isCapturedIn(/* Memory */ true, /* Integer */ true, + /* Return */ true); + + unsigned ArgNo = CS.getArgumentNo(U); + const IRPosition &CSArgPos = IRPosition::callsite_argument(CS, ArgNo); + // If we have a abstract no-capture attribute for the argument we can use + // it to justify a non-capture attribute here. This allows recursion! + auto &ArgNoCaptureAA = A.getAAFor<AANoCapture>(NoCaptureAA, CSArgPos); + if (ArgNoCaptureAA.isAssumedNoCapture()) + return isCapturedIn(/* Memory */ false, /* Integer */ false, + /* Return */ false); + if (ArgNoCaptureAA.isAssumedNoCaptureMaybeReturned()) { + addPotentialCopy(CS); + return isCapturedIn(/* Memory */ false, /* Integer */ false, + /* Return */ false); + } + + // Lastly, we could not find a reason no-capture can be assumed so we don't. + return isCapturedIn(/* Memory */ true, /* Integer */ true, + /* Return */ true); + } + + /// Register \p CS as potential copy of the value we are checking. + void addPotentialCopy(CallSite CS) { + PotentialCopies.push_back(CS.getInstruction()); + } + + /// See CaptureTracker::shouldExplore(...). + bool shouldExplore(const Use *U) override { + // Check liveness. + return !IsDeadAA.isAssumedDead(cast<Instruction>(U->getUser())); + } + + /// Update the state according to \p CapturedInMem, \p CapturedInInt, and + /// \p CapturedInRet, then return the appropriate value for use in the + /// CaptureTracker::captured() interface. 
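captured() above classifies each use with a shrinking budget: once RemainingUsesToExplore hits zero everything is treated as captured, a return only removes the "not captured in return" bit, a call operand whose callee argument is nocapture (or nocapture-maybe-returned, in which case the call result is tracked as a potential copy) costs nothing, and every use the tracker does not understand captures in all three ways. A simplified standalone sketch of that classification; the real code also stops as soon as even the weaker maybe-returned state is gone, and the names below are illustrative.

// Per-value capture facts tracked while walking uses; all three start
// out assumed "not captured" and bits are knocked out per use.
struct CaptureState {
  bool InMem = false, InInt = false, InRet = false;
};

enum class UseKind {
  Return,                // value flows into a return instruction
  CallArgNoCapture,      // call operand, callee argument is nocapture
  CallArgMaybeReturned,  // nocapture-maybe-returned: track the call result
  Worst,                 // anything the tracker does not understand
};

// Process one use; Budget stands in for RemainingUsesToExplore.
// Returns true when the walk may stop because nothing of the
// no-capture assumption is left anyway.
static bool processUse(UseKind Kind, unsigned &Budget, CaptureState &S) {
  if (Budget == 0) {                        // too many uses: give up
    S.InMem = S.InInt = S.InRet = true;
    return true;
  }
  --Budget;
  switch (Kind) {
  case UseKind::Return:
    S.InRet = true;                         // escapes only via the return
    return false;                           // keep looking at other uses
  case UseKind::CallArgNoCapture:           // callee promises not to capture
  case UseKind::CallArgMaybeReturned:       // caller also walks the call result
    return false;
  case UseKind::Worst:
    S.InMem = S.InInt = S.InRet = true;     // assume every kind of escape
    return true;
  }
  return true;
}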
+ bool isCapturedIn(bool CapturedInMem, bool CapturedInInt, + bool CapturedInRet) { + LLVM_DEBUG(dbgs() << " - captures [Mem " << CapturedInMem << "|Int " + << CapturedInInt << "|Ret " << CapturedInRet << "]\n"); + if (CapturedInMem) + State.removeAssumedBits(AANoCapture::NOT_CAPTURED_IN_MEM); + if (CapturedInInt) + State.removeAssumedBits(AANoCapture::NOT_CAPTURED_IN_INT); + if (CapturedInRet) + State.removeAssumedBits(AANoCapture::NOT_CAPTURED_IN_RET); + return !State.isAssumed(AANoCapture::NO_CAPTURE_MAYBE_RETURNED); + } + +private: + /// The attributor providing in-flight abstract attributes. + Attributor &A; + + /// The abstract attribute currently updated. + AANoCapture &NoCaptureAA; + + /// The abstract liveness state. + const AAIsDead &IsDeadAA; + + /// The state currently updated. + AANoCapture::StateType &State; + + /// Set of potential copies of the tracked value. + SmallVectorImpl<const Value *> &PotentialCopies; + + /// Global counter to limit the number of explored uses. + unsigned &RemainingUsesToExplore; +}; + +ChangeStatus AANoCaptureImpl::updateImpl(Attributor &A) { + const IRPosition &IRP = getIRPosition(); + const Value *V = + getArgNo() >= 0 ? IRP.getAssociatedArgument() : &IRP.getAssociatedValue(); + if (!V) + return indicatePessimisticFixpoint(); + + const Function *F = + getArgNo() >= 0 ? IRP.getAssociatedFunction() : IRP.getAnchorScope(); + assert(F && "Expected a function!"); + const IRPosition &FnPos = IRPosition::function(*F); + const auto &IsDeadAA = A.getAAFor<AAIsDead>(*this, FnPos); + + AANoCapture::StateType T; + + // Readonly means we cannot capture through memory. + const auto &FnMemAA = A.getAAFor<AAMemoryBehavior>(*this, FnPos); + if (FnMemAA.isAssumedReadOnly()) { + T.addKnownBits(NOT_CAPTURED_IN_MEM); + if (FnMemAA.isKnownReadOnly()) + addKnownBits(NOT_CAPTURED_IN_MEM); + } + + // Make sure all returned values are different than the underlying value. + // TODO: we could do this in a more sophisticated way inside + // AAReturnedValues, e.g., track all values that escape through returns + // directly somehow. + auto CheckReturnedArgs = [&](const AAReturnedValues &RVAA) { + bool SeenConstant = false; + for (auto &It : RVAA.returned_values()) { + if (isa<Constant>(It.first)) { + if (SeenConstant) + return false; + SeenConstant = true; + } else if (!isa<Argument>(It.first) || + It.first == getAssociatedArgument()) + return false; + } + return true; + }; + + const auto &NoUnwindAA = A.getAAFor<AANoUnwind>(*this, FnPos); + if (NoUnwindAA.isAssumedNoUnwind()) { + bool IsVoidTy = F->getReturnType()->isVoidTy(); + const AAReturnedValues *RVAA = + IsVoidTy ? nullptr : &A.getAAFor<AAReturnedValues>(*this, FnPos); + if (IsVoidTy || CheckReturnedArgs(*RVAA)) { + T.addKnownBits(NOT_CAPTURED_IN_RET); + if (T.isKnown(NOT_CAPTURED_IN_MEM)) + return ChangeStatus::UNCHANGED; + if (NoUnwindAA.isKnownNoUnwind() && + (IsVoidTy || RVAA->getState().isAtFixpoint())) { + addKnownBits(NOT_CAPTURED_IN_RET); + if (isKnown(NOT_CAPTURED_IN_MEM)) + return indicateOptimisticFixpoint(); + } + } + } + + // Use the CaptureTracker interface and logic with the specialized tracker, + // defined in AACaptureUseTracker, that can look at in-flight abstract + // attributes and directly updates the assumed state. 
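The tracker above operates on a known/assumed bit lattice: known bits only ever grow, assumed bits only ever shrink, and NO_CAPTURE is the conjunction of the three NOT_CAPTURED_IN_* bits. The following standalone sketch (plain C++, no LLVM headers; the CaptureState type and main() are simplified stand-ins, not the real Attributor interfaces) models how isCapturedIn() clears assumed bits and when the use walk can stop early.

#include <cstdint>
#include <cstdio>

// Simplified model of a BitIntegerState: "known" bits only ever grow,
// "assumed" bits only ever shrink, and known implies assumed.
struct CaptureState {
  static constexpr uint32_t NOT_CAPTURED_IN_MEM = 1u << 0;
  static constexpr uint32_t NOT_CAPTURED_IN_INT = 1u << 1;
  static constexpr uint32_t NOT_CAPTURED_IN_RET = 1u << 2;
  static constexpr uint32_t NO_CAPTURE =
      NOT_CAPTURED_IN_MEM | NOT_CAPTURED_IN_INT | NOT_CAPTURED_IN_RET;

  uint32_t Known = 0;            // proven facts, monotonically increasing
  uint32_t Assumed = NO_CAPTURE; // optimistic start, monotonically decreasing

  void addKnownBits(uint32_t Bits) { Known |= Bits; Assumed |= Bits; }
  void removeAssumedBits(uint32_t Bits) { Assumed &= ~Bits; }
  bool isAssumed(uint32_t Bits) const { return (Assumed & Bits) == Bits; }
};

// Mirrors the effect of isCapturedIn(): each kind of capture clears the
// corresponding "not captured" assumption. The return value models aborting
// the walk once even "no-capture-maybe-returned" (taken here to be the
// memory and integer bits) can no longer be assumed.
bool markCapturedIn(CaptureState &S, bool InMem, bool InInt, bool InRet) {
  if (InMem) S.removeAssumedBits(CaptureState::NOT_CAPTURED_IN_MEM);
  if (InInt) S.removeAssumedBits(CaptureState::NOT_CAPTURED_IN_INT);
  if (InRet) S.removeAssumedBits(CaptureState::NOT_CAPTURED_IN_RET);
  return !S.isAssumed(CaptureState::NOT_CAPTURED_IN_MEM |
                      CaptureState::NOT_CAPTURED_IN_INT);
}

int main() {
  CaptureState S;
  markCapturedIn(S, /*InMem=*/false, /*InInt=*/false, /*InRet=*/true);
  std::printf("no-capture assumed: %d\n",
              S.isAssumed(CaptureState::NO_CAPTURE));          // 0
  std::printf("not-captured-in-memory assumed: %d\n",
              S.isAssumed(CaptureState::NOT_CAPTURED_IN_MEM)); // 1
}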
+ SmallVector<const Value *, 4> PotentialCopies; + unsigned RemainingUsesToExplore = DefaultMaxUsesToExplore; + AACaptureUseTracker Tracker(A, *this, IsDeadAA, T, PotentialCopies, + RemainingUsesToExplore); + + // Check all potential copies of the associated value until we can assume + // none will be captured or we have to assume at least one might be. + unsigned Idx = 0; + PotentialCopies.push_back(V); + while (T.isAssumed(NO_CAPTURE_MAYBE_RETURNED) && Idx < PotentialCopies.size()) + Tracker.valueMayBeCaptured(PotentialCopies[Idx++]); + + AANoCapture::StateType &S = getState(); + auto Assumed = S.getAssumed(); + S.intersectAssumedBits(T.getAssumed()); + if (!isAssumedNoCaptureMaybeReturned()) + return indicatePessimisticFixpoint(); + return Assumed == S.getAssumed() ? ChangeStatus::UNCHANGED + : ChangeStatus::CHANGED; } -/// NonNull attribute for function argument. -struct AANonNullArgument : AANonNullImpl { +/// NoCapture attribute for function arguments. +struct AANoCaptureArgument final : AANoCaptureImpl { + AANoCaptureArgument(const IRPosition &IRP) : AANoCaptureImpl(IRP) {} - AANonNullArgument(Argument &A, InformationCache &InfoCache) - : AANonNullImpl(A, InfoCache) {} + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(nocapture) } +}; - /// See AbstractAttribute::getManifestPosition(). - ManifestPosition getManifestPosition() const override { return MP_ARGUMENT; } +/// NoCapture attribute for call site arguments. +struct AANoCaptureCallSiteArgument final : AANoCaptureImpl { + AANoCaptureCallSiteArgument(const IRPosition &IRP) : AANoCaptureImpl(IRP) {} - /// See AbstractAttriubute::initialize(...). + /// See AbstractAttribute::initialize(...). void initialize(Attributor &A) override { - Argument *Arg = cast<Argument>(getAssociatedValue()); - if (Arg->hasNonNullAttr()) - indicateOptimisticFixpoint(); + if (Argument *Arg = getAssociatedArgument()) + if (Arg->hasByValAttr()) + indicateOptimisticFixpoint(); + AANoCaptureImpl::initialize(A); } /// See AbstractAttribute::updateImpl(...). - ChangeStatus updateImpl(Attributor &A) override; + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. + Argument *Arg = getAssociatedArgument(); + if (!Arg) + return indicatePessimisticFixpoint(); + const IRPosition &ArgPos = IRPosition::argument(*Arg); + auto &ArgAA = A.getAAFor<AANoCapture>(*this, ArgPos); + return clampStateAndIndicateChange( + getState(), + static_cast<const AANoCapture::StateType &>(ArgAA.getState())); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override{STATS_DECLTRACK_CSARG_ATTR(nocapture)}; }; -/// NonNull attribute for a call site argument. -struct AANonNullCallSiteArgument : AANonNullImpl { +/// NoCapture attribute for floating values. +struct AANoCaptureFloating final : AANoCaptureImpl { + AANoCaptureFloating(const IRPosition &IRP) : AANoCaptureImpl(IRP) {} - /// See AANonNullImpl::AANonNullImpl(...). 
- AANonNullCallSiteArgument(CallSite CS, unsigned ArgNo, - InformationCache &InfoCache) - : AANonNullImpl(CS.getArgOperand(ArgNo), *CS.getInstruction(), InfoCache), - ArgNo(ArgNo) {} + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_FLOATING_ATTR(nocapture) + } +}; + +/// NoCapture attribute for function return value. +struct AANoCaptureReturned final : AANoCaptureImpl { + AANoCaptureReturned(const IRPosition &IRP) : AANoCaptureImpl(IRP) { + llvm_unreachable("NoCapture is not applicable to function returns!"); + } /// See AbstractAttribute::initialize(...). void initialize(Attributor &A) override { - CallSite CS(&getAnchoredValue()); - if (isKnownNonZero(getAssociatedValue(), - getAnchorScope().getParent()->getDataLayout()) || - CS.paramHasAttr(ArgNo, getAttrKind())) - indicateOptimisticFixpoint(); + llvm_unreachable("NoCapture is not applicable to function returns!"); } - /// See AbstractAttribute::updateImpl(Attributor &A). - ChangeStatus updateImpl(Attributor &A) override; + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + llvm_unreachable("NoCapture is not applicable to function returns!"); + } - /// See AbstractAttribute::getManifestPosition(). - ManifestPosition getManifestPosition() const override { - return MP_CALL_SITE_ARGUMENT; - }; + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override {} +}; - // Return argument index of associated value. - int getArgNo() const { return ArgNo; } +/// NoCapture attribute deduction for a call site return value. +struct AANoCaptureCallSiteReturned final : AANoCaptureImpl { + AANoCaptureCallSiteReturned(const IRPosition &IRP) : AANoCaptureImpl(IRP) {} -private: - unsigned ArgNo; + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_CSRET_ATTR(nocapture) + } }; -ChangeStatus AANonNullArgument::updateImpl(Attributor &A) { - Function &F = getAnchorScope(); - Argument &Arg = cast<Argument>(getAnchoredValue()); - unsigned ArgNo = Arg.getArgNo(); +/// ------------------ Value Simplify Attribute ---------------------------- +struct AAValueSimplifyImpl : AAValueSimplify { + AAValueSimplifyImpl(const IRPosition &IRP) : AAValueSimplify(IRP) {} + + /// See AbstractAttribute::getAsStr(). + const std::string getAsStr() const override { + return getAssumed() ? (getKnown() ? "simplified" : "maybe-simple") + : "not-simple"; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override {} + + /// See AAValueSimplify::getAssumedSimplifiedValue() + Optional<Value *> getAssumedSimplifiedValue(Attributor &A) const override { + if (!getAssumed()) + return const_cast<Value *>(&getAssociatedValue()); + return SimplifiedAssociatedValue; + } + void initialize(Attributor &A) override {} + + /// Helper function for querying AAValueSimplify and updating candicate. + /// \param QueryingValue Value trying to unify with SimplifiedValue + /// \param AccumulatedSimplifiedValue Current simplification result. + static bool checkAndUpdate(Attributor &A, const AbstractAttribute &QueryingAA, + Value &QueryingValue, + Optional<Value *> &AccumulatedSimplifiedValue) { + // FIXME: Add a typecast support. 
- // Callback function - std::function<bool(CallSite)> CallSiteCheck = [&](CallSite CS) { - assert(CS && "Sanity check: Call site was not initialized properly!"); + auto &ValueSimpifyAA = A.getAAFor<AAValueSimplify>( + QueryingAA, IRPosition::value(QueryingValue)); - auto *NonNullAA = A.getAAFor<AANonNull>(*this, *CS.getInstruction(), ArgNo); + Optional<Value *> QueryingValueSimplified = + ValueSimpifyAA.getAssumedSimplifiedValue(A); - // Check that NonNullAA is AANonNullCallSiteArgument. - if (NonNullAA) { - ImmutableCallSite ICS(&NonNullAA->getAnchoredValue()); - if (ICS && CS.getInstruction() == ICS.getInstruction()) - return NonNullAA->isAssumedNonNull(); + if (!QueryingValueSimplified.hasValue()) + return true; + + if (!QueryingValueSimplified.getValue()) return false; + + Value &QueryingValueSimplifiedUnwrapped = + *QueryingValueSimplified.getValue(); + + if (isa<UndefValue>(QueryingValueSimplifiedUnwrapped)) + return true; + + if (AccumulatedSimplifiedValue.hasValue()) + return AccumulatedSimplifiedValue == QueryingValueSimplified; + + LLVM_DEBUG(dbgs() << "[Attributor][ValueSimplify] " << QueryingValue + << " is assumed to be " + << QueryingValueSimplifiedUnwrapped << "\n"); + + AccumulatedSimplifiedValue = QueryingValueSimplified; + return true; + } + + bool askSimplifiedValueForAAValueConstantRange(Attributor &A) { + if (!getAssociatedValue().getType()->isIntegerTy()) + return false; + + const auto &ValueConstantRangeAA = + A.getAAFor<AAValueConstantRange>(*this, getIRPosition()); + + Optional<ConstantInt *> COpt = + ValueConstantRangeAA.getAssumedConstantInt(A); + if (COpt.hasValue()) { + if (auto *C = COpt.getValue()) + SimplifiedAssociatedValue = C; + else + return false; + } else { + // FIXME: It should be llvm::None but if you set llvm::None, + // values are mistakenly infered as `undef` now. + SimplifiedAssociatedValue = &getAssociatedValue(); + } + return true; + } + + /// See AbstractAttribute::manifest(...). + ChangeStatus manifest(Attributor &A) override { + ChangeStatus Changed = ChangeStatus::UNCHANGED; + + if (!SimplifiedAssociatedValue.hasValue() || + !SimplifiedAssociatedValue.getValue()) + return Changed; + + if (auto *C = dyn_cast<Constant>(SimplifiedAssociatedValue.getValue())) { + // We can replace the AssociatedValue with the constant. + Value &V = getAssociatedValue(); + if (!V.user_empty() && &V != C && V.getType() == C->getType()) { + LLVM_DEBUG(dbgs() << "[Attributor][ValueSimplify] " << V << " -> " << *C + << "\n"); + A.changeValueAfterManifest(V, *C); + Changed = ChangeStatus::CHANGED; + } + } + + return Changed | AAValueSimplify::manifest(A); + } + + /// See AbstractState::indicatePessimisticFixpoint(...). + ChangeStatus indicatePessimisticFixpoint() override { + // NOTE: Associated value will be returned in a pessimistic fixpoint and is + // regarded as known. That's why`indicateOptimisticFixpoint` is called. + SimplifiedAssociatedValue = &getAssociatedValue(); + indicateOptimisticFixpoint(); + return ChangeStatus::CHANGED; + } + +protected: + // An assumed simplified value. Initially, it is set to Optional::None, which + // means that the value is not clear under current assumption. If in the + // pessimistic state, getAssumedSimplifiedValue doesn't return this value but + // returns orignal associated value. 
+ Optional<Value *> SimplifiedAssociatedValue; +}; + +struct AAValueSimplifyArgument final : AAValueSimplifyImpl { + AAValueSimplifyArgument(const IRPosition &IRP) : AAValueSimplifyImpl(IRP) {} + + void initialize(Attributor &A) override { + AAValueSimplifyImpl::initialize(A); + if (!getAssociatedFunction() || getAssociatedFunction()->isDeclaration()) + indicatePessimisticFixpoint(); + if (hasAttr({Attribute::InAlloca, Attribute::StructRet, Attribute::Nest}, + /* IgnoreSubsumingPositions */ true)) + indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // Byval is only replacable if it is readonly otherwise we would write into + // the replaced value and not the copy that byval creates implicitly. + Argument *Arg = getAssociatedArgument(); + if (Arg->hasByValAttr()) { + const auto &MemAA = A.getAAFor<AAMemoryBehavior>(*this, getIRPosition()); + if (!MemAA.isAssumedReadOnly()) + return indicatePessimisticFixpoint(); + } + + bool HasValueBefore = SimplifiedAssociatedValue.hasValue(); + + auto PredForCallSite = [&](AbstractCallSite ACS) { + // Check if we have an associated argument or not (which can happen for + // callback calls). + Value *ArgOp = ACS.getCallArgOperand(getArgNo()); + if (!ArgOp) + return false; + // We can only propagate thread independent values through callbacks. + // This is different to direct/indirect call sites because for them we + // know the thread executing the caller and callee is the same. For + // callbacks this is not guaranteed, thus a thread dependent value could + // be different for the caller and callee, making it invalid to propagate. + if (ACS.isCallbackCall()) + if (auto *C = dyn_cast<Constant>(ArgOp)) + if (C->isThreadDependent()) + return false; + return checkAndUpdate(A, *this, *ArgOp, SimplifiedAssociatedValue); + }; + + if (!A.checkForAllCallSites(PredForCallSite, *this, true)) + if (!askSimplifiedValueForAAValueConstantRange(A)) + return indicatePessimisticFixpoint(); + + // If a candicate was found in this update, return CHANGED. + return HasValueBefore == SimplifiedAssociatedValue.hasValue() + ? ChangeStatus::UNCHANGED + : ChangeStatus ::CHANGED; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_ARG_ATTR(value_simplify) + } +}; + +struct AAValueSimplifyReturned : AAValueSimplifyImpl { + AAValueSimplifyReturned(const IRPosition &IRP) : AAValueSimplifyImpl(IRP) {} + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + bool HasValueBefore = SimplifiedAssociatedValue.hasValue(); + + auto PredForReturned = [&](Value &V) { + return checkAndUpdate(A, *this, V, SimplifiedAssociatedValue); + }; + + if (!A.checkForAllReturnedValues(PredForReturned, *this)) + if (!askSimplifiedValueForAAValueConstantRange(A)) + return indicatePessimisticFixpoint(); + + // If a candicate was found in this update, return CHANGED. + return HasValueBefore == SimplifiedAssociatedValue.hasValue() + ? ChangeStatus::UNCHANGED + : ChangeStatus ::CHANGED; + } + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_FNRET_ATTR(value_simplify) + } +}; + +struct AAValueSimplifyFloating : AAValueSimplifyImpl { + AAValueSimplifyFloating(const IRPosition &IRP) : AAValueSimplifyImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). 
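All AAValueSimplify variants funnel their candidates through checkAndUpdate(), which implements a small three-valued lattice over Optional<Value *>: still undecided, a single agreed-upon candidate, or a conflict that ends simplification. Below is a minimal sketch of that unification step, with std::optional<std::string> standing in for Optional<Value *> and an empty string standing in for undef; both stand-ins are illustrative only and do not match the real types.

#include <cassert>
#include <optional>
#include <string>

// Three-valued candidate lattice used when unifying simplified values:
//  - std::nullopt : nothing decided yet (still optimistic),
//  - a value      : every source agreed on this single candidate,
//  - conflict     : two sources disagreed, simplification is impossible.
using Value = std::string; // stand-in for llvm::Value*

bool checkAndUpdate(const std::optional<Value> &Incoming,
                    std::optional<Value> &Accumulated) {
  if (!Incoming)            // incoming source is still undecided: keep going
    return true;
  if (Incoming->empty())    // models "undef": compatible with anything
    return true;
  if (Accumulated)          // already have a candidate: must match exactly
    return Accumulated == Incoming;
  Accumulated = Incoming;   // first concrete candidate becomes the current one
  return true;
}

int main() {
  std::optional<Value> Acc;
  assert(checkAndUpdate(Value("42"), Acc) && Acc && *Acc == "42");
  assert(checkAndUpdate(std::nullopt, Acc)); // undecided source is fine
  assert(!checkAndUpdate(Value("7"), Acc));  // conflicting candidate: give up
  return 0;
}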
+ void initialize(Attributor &A) override { + Value &V = getAnchorValue(); + + // TODO: add other stuffs + if (isa<Constant>(V)) + indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + bool HasValueBefore = SimplifiedAssociatedValue.hasValue(); + + auto VisitValueCB = [&](Value &V, BooleanState, bool Stripped) -> bool { + auto &AA = A.getAAFor<AAValueSimplify>(*this, IRPosition::value(V)); + if (!Stripped && this == &AA) { + // TODO: Look the instruction and check recursively. + + LLVM_DEBUG( + dbgs() << "[Attributor][ValueSimplify] Can't be stripped more : " + << V << "\n"); + return false; + } + return checkAndUpdate(A, *this, V, SimplifiedAssociatedValue); + }; + + if (!genericValueTraversal<AAValueSimplify, BooleanState>( + A, getIRPosition(), *this, static_cast<BooleanState &>(*this), + VisitValueCB)) + if (!askSimplifiedValueForAAValueConstantRange(A)) + return indicatePessimisticFixpoint(); + + // If a candicate was found in this update, return CHANGED. + + return HasValueBefore == SimplifiedAssociatedValue.hasValue() + ? ChangeStatus::UNCHANGED + : ChangeStatus ::CHANGED; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_FLOATING_ATTR(value_simplify) + } +}; + +struct AAValueSimplifyFunction : AAValueSimplifyImpl { + AAValueSimplifyFunction(const IRPosition &IRP) : AAValueSimplifyImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + SimplifiedAssociatedValue = &getAnchorValue(); + indicateOptimisticFixpoint(); + } + /// See AbstractAttribute::initialize(...). + ChangeStatus updateImpl(Attributor &A) override { + llvm_unreachable( + "AAValueSimplify(Function|CallSite)::updateImpl will not be called"); + } + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_FN_ATTR(value_simplify) + } +}; + +struct AAValueSimplifyCallSite : AAValueSimplifyFunction { + AAValueSimplifyCallSite(const IRPosition &IRP) + : AAValueSimplifyFunction(IRP) {} + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_CS_ATTR(value_simplify) + } +}; + +struct AAValueSimplifyCallSiteReturned : AAValueSimplifyReturned { + AAValueSimplifyCallSiteReturned(const IRPosition &IRP) + : AAValueSimplifyReturned(IRP) {} + + void trackStatistics() const override { + STATS_DECLTRACK_CSRET_ATTR(value_simplify) + } +}; +struct AAValueSimplifyCallSiteArgument : AAValueSimplifyFloating { + AAValueSimplifyCallSiteArgument(const IRPosition &IRP) + : AAValueSimplifyFloating(IRP) {} + + void trackStatistics() const override { + STATS_DECLTRACK_CSARG_ATTR(value_simplify) + } +}; + +/// ----------------------- Heap-To-Stack Conversion --------------------------- +struct AAHeapToStackImpl : public AAHeapToStack { + AAHeapToStackImpl(const IRPosition &IRP) : AAHeapToStack(IRP) {} + + const std::string getAsStr() const override { + return "[H2S] Mallocs: " + std::to_string(MallocCalls.size()); + } + + ChangeStatus manifest(Attributor &A) override { + assert(getState().isValidState() && + "Attempted to manifest an invalid state!"); + + ChangeStatus HasChanged = ChangeStatus::UNCHANGED; + Function *F = getAssociatedFunction(); + const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F); + + for (Instruction *MallocCall : MallocCalls) { + // This malloc cannot be replaced. 
+ if (BadMallocCalls.count(MallocCall)) + continue; + + for (Instruction *FreeCall : FreesForMalloc[MallocCall]) { + LLVM_DEBUG(dbgs() << "H2S: Removing free call: " << *FreeCall << "\n"); + A.deleteAfterManifest(*FreeCall); + HasChanged = ChangeStatus::CHANGED; + } + + LLVM_DEBUG(dbgs() << "H2S: Removing malloc call: " << *MallocCall + << "\n"); + + Constant *Size; + if (isCallocLikeFn(MallocCall, TLI)) { + auto *Num = cast<ConstantInt>(MallocCall->getOperand(0)); + auto *SizeT = dyn_cast<ConstantInt>(MallocCall->getOperand(1)); + APInt TotalSize = SizeT->getValue() * Num->getValue(); + Size = + ConstantInt::get(MallocCall->getOperand(0)->getType(), TotalSize); + } else { + Size = cast<ConstantInt>(MallocCall->getOperand(0)); + } + + unsigned AS = cast<PointerType>(MallocCall->getType())->getAddressSpace(); + Instruction *AI = new AllocaInst(Type::getInt8Ty(F->getContext()), AS, + Size, "", MallocCall->getNextNode()); + + if (AI->getType() != MallocCall->getType()) + AI = new BitCastInst(AI, MallocCall->getType(), "malloc_bc", + AI->getNextNode()); + + replaceAllInstructionUsesWith(*MallocCall, *AI); + + if (auto *II = dyn_cast<InvokeInst>(MallocCall)) { + auto *NBB = II->getNormalDest(); + BranchInst::Create(NBB, MallocCall->getParent()); + A.deleteAfterManifest(*MallocCall); + } else { + A.deleteAfterManifest(*MallocCall); + } + + if (isCallocLikeFn(MallocCall, TLI)) { + auto *BI = new BitCastInst(AI, MallocCall->getType(), "calloc_bc", + AI->getNextNode()); + Value *Ops[] = { + BI, ConstantInt::get(F->getContext(), APInt(8, 0, false)), Size, + ConstantInt::get(Type::getInt1Ty(F->getContext()), false)}; + + Type *Tys[] = {BI->getType(), MallocCall->getOperand(0)->getType()}; + Module *M = F->getParent(); + Function *Fn = Intrinsic::getDeclaration(M, Intrinsic::memset, Tys); + CallInst::Create(Fn, Ops, "", BI->getNextNode()); + } + HasChanged = ChangeStatus::CHANGED; } - if (CS.paramHasAttr(ArgNo, Attribute::NonNull)) + return HasChanged; + } + + /// Collection of all malloc calls in a function. + SmallSetVector<Instruction *, 4> MallocCalls; + + /// Collection of malloc calls that cannot be converted. + DenseSet<const Instruction *> BadMallocCalls; + + /// A map for each malloc call to the set of associated free calls. + DenseMap<Instruction *, SmallPtrSet<Instruction *, 4>> FreesForMalloc; + + ChangeStatus updateImpl(Attributor &A) override; +}; + +ChangeStatus AAHeapToStackImpl::updateImpl(Attributor &A) { + const Function *F = getAssociatedFunction(); + const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F); + + MustBeExecutedContextExplorer &Explorer = + A.getInfoCache().getMustBeExecutedContextExplorer(); + + auto FreeCheck = [&](Instruction &I) { + const auto &Frees = FreesForMalloc.lookup(&I); + if (Frees.size() != 1) + return false; + Instruction *UniqueFree = *Frees.begin(); + return Explorer.findInContextOf(UniqueFree, I.getNextNode()); + }; + + auto UsesCheck = [&](Instruction &I) { + bool ValidUsesOnly = true; + bool MustUse = true; + auto Pred = [&](const Use &U, bool &Follow) -> bool { + Instruction *UserI = cast<Instruction>(U.getUser()); + if (isa<LoadInst>(UserI)) + return true; + if (auto *SI = dyn_cast<StoreInst>(UserI)) { + if (SI->getValueOperand() == U.get()) { + LLVM_DEBUG(dbgs() + << "[H2S] escaping store to memory: " << *UserI << "\n"); + ValidUsesOnly = false; + } else { + // A store into the malloc'ed memory is fine. 
+ } + return true; + } + if (auto *CB = dyn_cast<CallBase>(UserI)) { + if (!CB->isArgOperand(&U) || CB->isLifetimeStartOrEnd()) + return true; + // Record malloc. + if (isFreeCall(UserI, TLI)) { + if (MustUse) { + FreesForMalloc[&I].insert(UserI); + } else { + LLVM_DEBUG(dbgs() << "[H2S] free potentially on different mallocs: " + << *UserI << "\n"); + ValidUsesOnly = false; + } + return true; + } + + unsigned ArgNo = CB->getArgOperandNo(&U); + + const auto &NoCaptureAA = A.getAAFor<AANoCapture>( + *this, IRPosition::callsite_argument(*CB, ArgNo)); + + // If a callsite argument use is nofree, we are fine. + const auto &ArgNoFreeAA = A.getAAFor<AANoFree>( + *this, IRPosition::callsite_argument(*CB, ArgNo)); + + if (!NoCaptureAA.isAssumedNoCapture() || + !ArgNoFreeAA.isAssumedNoFree()) { + LLVM_DEBUG(dbgs() << "[H2S] Bad user: " << *UserI << "\n"); + ValidUsesOnly = false; + } + return true; + } + + if (isa<GetElementPtrInst>(UserI) || isa<BitCastInst>(UserI) || + isa<PHINode>(UserI) || isa<SelectInst>(UserI)) { + MustUse &= !(isa<PHINode>(UserI) || isa<SelectInst>(UserI)); + Follow = true; + return true; + } + // Unknown user for which we can not track uses further (in a way that + // makes sense). + LLVM_DEBUG(dbgs() << "[H2S] Unknown user: " << *UserI << "\n"); + ValidUsesOnly = false; return true; + }; + A.checkForAllUses(Pred, *this, I); + return ValidUsesOnly; + }; - Value *V = CS.getArgOperand(ArgNo); - if (isKnownNonZero(V, getAnchorScope().getParent()->getDataLayout())) + auto MallocCallocCheck = [&](Instruction &I) { + if (BadMallocCalls.count(&I)) return true; - return false; + bool IsMalloc = isMallocLikeFn(&I, TLI); + bool IsCalloc = !IsMalloc && isCallocLikeFn(&I, TLI); + if (!IsMalloc && !IsCalloc) { + BadMallocCalls.insert(&I); + return true; + } + + if (IsMalloc) { + if (auto *Size = dyn_cast<ConstantInt>(I.getOperand(0))) + if (Size->getValue().ule(MaxHeapToStackSize)) + if (UsesCheck(I) || FreeCheck(I)) { + MallocCalls.insert(&I); + return true; + } + } else if (IsCalloc) { + bool Overflow = false; + if (auto *Num = dyn_cast<ConstantInt>(I.getOperand(0))) + if (auto *Size = dyn_cast<ConstantInt>(I.getOperand(1))) + if ((Size->getValue().umul_ov(Num->getValue(), Overflow)) + .ule(MaxHeapToStackSize)) + if (!Overflow && (UsesCheck(I) || FreeCheck(I))) { + MallocCalls.insert(&I); + return true; + } + } + + BadMallocCalls.insert(&I); + return true; }; - if (!A.checkForAllCallSites(F, CallSiteCheck, true)) { - indicatePessimisticFixpoint(); + + size_t NumBadMallocs = BadMallocCalls.size(); + + A.checkForAllCallLikeInstructions(MallocCallocCheck, *this); + + if (NumBadMallocs != BadMallocCalls.size()) return ChangeStatus::CHANGED; - } + return ChangeStatus::UNCHANGED; } -ChangeStatus AANonNullCallSiteArgument::updateImpl(Attributor &A) { - // NOTE: Never look at the argument of the callee in this method. - // If we do this, "nonnull" is always deduced because of the assumption. 
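The calloc path above only fires when num * size stays within the MaxHeapToStackSize threshold and the multiplication does not overflow (APInt::umul_ov). A standalone sketch of that guard follows; it uses the GCC/Clang __builtin_mul_overflow intrinsic and a made-up threshold value purely for illustration, not the real command-line option.

#include <cstdint>
#include <cstdio>

// Overflow-checked "num * size <= limit" test, mirroring the calloc case
// above. MaxHeapToStackSize is a stand-in for the real threshold.
constexpr uint64_t MaxHeapToStackSize = 128;

bool fitsOnStack(uint64_t Num, uint64_t Size) {
  uint64_t Total;
  // __builtin_mul_overflow is a GCC/Clang builtin; the LLVM code uses
  // APInt::umul_ov for the same purpose.
  if (__builtin_mul_overflow(Num, Size, &Total))
    return false;                       // overflow: never convert
  return Total <= MaxHeapToStackSize;   // only small allocations qualify
}

int main() {
  std::printf("%d\n", fitsOnStack(16, 8));               // 1: 128 bytes fits
  std::printf("%d\n", fitsOnStack(1u << 20, 1u << 20));  // 0: too large
  std::printf("%d\n", fitsOnStack(UINT64_MAX, 2));       // 0: overflow
}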
+struct AAHeapToStackFunction final : public AAHeapToStackImpl { + AAHeapToStackFunction(const IRPosition &IRP) : AAHeapToStackImpl(IRP) {} - Value &V = *getAssociatedValue(); + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECL(MallocCalls, Function, + "Number of malloc calls converted to allocas"); + for (auto *C : MallocCalls) + if (!BadMallocCalls.count(C)) + ++BUILD_STAT_NAME(MallocCalls, Function); + } +}; - auto *NonNullAA = A.getAAFor<AANonNull>(*this, V); +/// -------------------- Memory Behavior Attributes ---------------------------- +/// Includes read-none, read-only, and write-only. +/// ---------------------------------------------------------------------------- +struct AAMemoryBehaviorImpl : public AAMemoryBehavior { + AAMemoryBehaviorImpl(const IRPosition &IRP) : AAMemoryBehavior(IRP) {} - if (!NonNullAA || !NonNullAA->isAssumedNonNull()) { - indicatePessimisticFixpoint(); - return ChangeStatus::CHANGED; + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + intersectAssumedBits(BEST_STATE); + getKnownStateFromValue(getIRPosition(), getState()); + IRAttribute::initialize(A); } - return ChangeStatus::UNCHANGED; -} + /// Return the memory behavior information encoded in the IR for \p IRP. + static void getKnownStateFromValue(const IRPosition &IRP, + BitIntegerState &State, + bool IgnoreSubsumingPositions = false) { + SmallVector<Attribute, 2> Attrs; + IRP.getAttrs(AttrKinds, Attrs, IgnoreSubsumingPositions); + for (const Attribute &Attr : Attrs) { + switch (Attr.getKindAsEnum()) { + case Attribute::ReadNone: + State.addKnownBits(NO_ACCESSES); + break; + case Attribute::ReadOnly: + State.addKnownBits(NO_WRITES); + break; + case Attribute::WriteOnly: + State.addKnownBits(NO_READS); + break; + default: + llvm_unreachable("Unexpcted attribute!"); + } + } -/// ------------------------ Will-Return Attributes ---------------------------- + if (auto *I = dyn_cast<Instruction>(&IRP.getAnchorValue())) { + if (!I->mayReadFromMemory()) + State.addKnownBits(NO_READS); + if (!I->mayWriteToMemory()) + State.addKnownBits(NO_WRITES); + } + } -struct AAWillReturnImpl : public AAWillReturn, BooleanState { + /// See AbstractAttribute::getDeducedAttributes(...). + void getDeducedAttributes(LLVMContext &Ctx, + SmallVectorImpl<Attribute> &Attrs) const override { + assert(Attrs.size() == 0); + if (isAssumedReadNone()) + Attrs.push_back(Attribute::get(Ctx, Attribute::ReadNone)); + else if (isAssumedReadOnly()) + Attrs.push_back(Attribute::get(Ctx, Attribute::ReadOnly)); + else if (isAssumedWriteOnly()) + Attrs.push_back(Attribute::get(Ctx, Attribute::WriteOnly)); + assert(Attrs.size() <= 1); + } - /// See AbstractAttribute::AbstractAttribute(...). - AAWillReturnImpl(Function &F, InformationCache &InfoCache) - : AAWillReturn(F, InfoCache) {} + /// See AbstractAttribute::manifest(...). + ChangeStatus manifest(Attributor &A) override { + const IRPosition &IRP = getIRPosition(); + + // Check if we would improve the existing attributes first. + SmallVector<Attribute, 4> DeducedAttrs; + getDeducedAttributes(IRP.getAnchorValue().getContext(), DeducedAttrs); + if (llvm::all_of(DeducedAttrs, [&](const Attribute &Attr) { + return IRP.hasAttr(Attr.getKindAsEnum(), + /* IgnoreSubsumingPositions */ true); + })) + return ChangeStatus::UNCHANGED; + + // Clear existing attributes. + IRP.removeAttrs(AttrKinds); + + // Use the generic manifest method. + return IRAttribute::manifest(A); + } - /// See AAWillReturn::isKnownWillReturn(). 
- bool isKnownWillReturn() const override { return getKnown(); } + /// See AbstractState::getAsStr(). + const std::string getAsStr() const override { + if (isAssumedReadNone()) + return "readnone"; + if (isAssumedReadOnly()) + return "readonly"; + if (isAssumedWriteOnly()) + return "writeonly"; + return "may-read/write"; + } - /// See AAWillReturn::isAssumedWillReturn(). - bool isAssumedWillReturn() const override { return getAssumed(); } + /// The set of IR attributes AAMemoryBehavior deals with. + static const Attribute::AttrKind AttrKinds[3]; +}; - /// See AbstractAttribute::getState(...). - AbstractState &getState() override { return *this; } +const Attribute::AttrKind AAMemoryBehaviorImpl::AttrKinds[] = { + Attribute::ReadNone, Attribute::ReadOnly, Attribute::WriteOnly}; - /// See AbstractAttribute::getState(...). - const AbstractState &getState() const override { return *this; } +/// Memory behavior attribute for a floating value. +struct AAMemoryBehaviorFloating : AAMemoryBehaviorImpl { + AAMemoryBehaviorFloating(const IRPosition &IRP) : AAMemoryBehaviorImpl(IRP) {} - /// See AbstractAttribute::getAsStr() - const std::string getAsStr() const override { - return getAssumed() ? "willreturn" : "may-noreturn"; + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AAMemoryBehaviorImpl::initialize(A); + // Initialize the use vector with all direct uses of the associated value. + for (const Use &U : getAssociatedValue().uses()) + Uses.insert(&U); } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override; + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + if (isAssumedReadNone()) + STATS_DECLTRACK_FLOATING_ATTR(readnone) + else if (isAssumedReadOnly()) + STATS_DECLTRACK_FLOATING_ATTR(readonly) + else if (isAssumedWriteOnly()) + STATS_DECLTRACK_FLOATING_ATTR(writeonly) + } + +private: + /// Return true if users of \p UserI might access the underlying + /// variable/location described by \p U and should therefore be analyzed. + bool followUsersOfUseIn(Attributor &A, const Use *U, + const Instruction *UserI); + + /// Update the state according to the effect of use \p U in \p UserI. + void analyzeUseIn(Attributor &A, const Use *U, const Instruction *UserI); + +protected: + /// Container for (transitive) uses of the associated argument. + SetVector<const Use *> Uses; }; -struct AAWillReturnFunction final : AAWillReturnImpl { +/// Memory behavior attribute for function argument. +struct AAMemoryBehaviorArgument : AAMemoryBehaviorFloating { + AAMemoryBehaviorArgument(const IRPosition &IRP) + : AAMemoryBehaviorFloating(IRP) {} - /// See AbstractAttribute::AbstractAttribute(...). - AAWillReturnFunction(Function &F, InformationCache &InfoCache) - : AAWillReturnImpl(F, InfoCache) {} + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + intersectAssumedBits(BEST_STATE); + const IRPosition &IRP = getIRPosition(); + // TODO: Make IgnoreSubsumingPositions a property of an IRAttribute so we + // can query it when we use has/getAttr. That would allow us to reuse the + // initialize of the base class here. + bool HasByVal = + IRP.hasAttr({Attribute::ByVal}, /* IgnoreSubsumingPositions */ true); + getKnownStateFromValue(IRP, getState(), + /* IgnoreSubsumingPositions */ HasByVal); + + // Initialize the use vector with all direct uses of the associated value. 
+ Argument *Arg = getAssociatedArgument(); + if (!Arg || !Arg->getParent()->hasExactDefinition()) { + indicatePessimisticFixpoint(); + } else { + // Initialize the use vector with all direct uses of the associated value. + for (const Use &U : Arg->uses()) + Uses.insert(&U); + } + } - /// See AbstractAttribute::getManifestPosition(). - ManifestPosition getManifestPosition() const override { - return MP_FUNCTION; + ChangeStatus manifest(Attributor &A) override { + // TODO: From readattrs.ll: "inalloca parameters are always + // considered written" + if (hasAttr({Attribute::InAlloca})) { + removeKnownBits(NO_WRITES); + removeAssumedBits(NO_WRITES); + } + return AAMemoryBehaviorFloating::manifest(A); } + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + if (isAssumedReadNone()) + STATS_DECLTRACK_ARG_ATTR(readnone) + else if (isAssumedReadOnly()) + STATS_DECLTRACK_ARG_ATTR(readonly) + else if (isAssumedWriteOnly()) + STATS_DECLTRACK_ARG_ATTR(writeonly) + } +}; + +struct AAMemoryBehaviorCallSiteArgument final : AAMemoryBehaviorArgument { + AAMemoryBehaviorCallSiteArgument(const IRPosition &IRP) + : AAMemoryBehaviorArgument(IRP) {} + /// See AbstractAttribute::initialize(...). - void initialize(Attributor &A) override; + void initialize(Attributor &A) override { + if (Argument *Arg = getAssociatedArgument()) { + if (Arg->hasByValAttr()) { + addKnownBits(NO_WRITES); + removeKnownBits(NO_READS); + removeAssumedBits(NO_READS); + } + } else { + } + AAMemoryBehaviorArgument::initialize(A); + } /// See AbstractAttribute::updateImpl(...). - ChangeStatus updateImpl(Attributor &A) override; + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. + Argument *Arg = getAssociatedArgument(); + const IRPosition &ArgPos = IRPosition::argument(*Arg); + auto &ArgAA = A.getAAFor<AAMemoryBehavior>(*this, ArgPos); + return clampStateAndIndicateChange( + getState(), + static_cast<const AAMemoryBehavior::StateType &>(ArgAA.getState())); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + if (isAssumedReadNone()) + STATS_DECLTRACK_CSARG_ATTR(readnone) + else if (isAssumedReadOnly()) + STATS_DECLTRACK_CSARG_ATTR(readonly) + else if (isAssumedWriteOnly()) + STATS_DECLTRACK_CSARG_ATTR(writeonly) + } }; -// Helper function that checks whether a function has any cycle. -// TODO: Replace with more efficent code -bool containsCycle(Function &F) { - SmallPtrSet<BasicBlock *, 32> Visited; +/// Memory behavior attribute for a call site return position. +struct AAMemoryBehaviorCallSiteReturned final : AAMemoryBehaviorFloating { + AAMemoryBehaviorCallSiteReturned(const IRPosition &IRP) + : AAMemoryBehaviorFloating(IRP) {} - // Traverse BB by dfs and check whether successor is already visited. - for (BasicBlock *BB : depth_first(&F)) { - Visited.insert(BB); - for (auto *SuccBB : successors(BB)) { - if (Visited.count(SuccBB)) - return true; + /// See AbstractAttribute::manifest(...). + ChangeStatus manifest(Attributor &A) override { + // We do not annotate returned values. + return ChangeStatus::UNCHANGED; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override {} +}; + +/// An AA to represent the memory behavior function attributes. 
+struct AAMemoryBehaviorFunction final : public AAMemoryBehaviorImpl { + AAMemoryBehaviorFunction(const IRPosition &IRP) : AAMemoryBehaviorImpl(IRP) {} + + /// See AbstractAttribute::updateImpl(Attributor &A). + virtual ChangeStatus updateImpl(Attributor &A) override; + + /// See AbstractAttribute::manifest(...). + ChangeStatus manifest(Attributor &A) override { + Function &F = cast<Function>(getAnchorValue()); + if (isAssumedReadNone()) { + F.removeFnAttr(Attribute::ArgMemOnly); + F.removeFnAttr(Attribute::InaccessibleMemOnly); + F.removeFnAttr(Attribute::InaccessibleMemOrArgMemOnly); } + return AAMemoryBehaviorImpl::manifest(A); } - return false; + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + if (isAssumedReadNone()) + STATS_DECLTRACK_FN_ATTR(readnone) + else if (isAssumedReadOnly()) + STATS_DECLTRACK_FN_ATTR(readonly) + else if (isAssumedWriteOnly()) + STATS_DECLTRACK_FN_ATTR(writeonly) + } +}; + +/// AAMemoryBehavior attribute for call sites. +struct AAMemoryBehaviorCallSite final : AAMemoryBehaviorImpl { + AAMemoryBehaviorCallSite(const IRPosition &IRP) : AAMemoryBehaviorImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AAMemoryBehaviorImpl::initialize(A); + Function *F = getAssociatedFunction(); + if (!F || !F->hasExactDefinition()) + indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. + Function *F = getAssociatedFunction(); + const IRPosition &FnPos = IRPosition::function(*F); + auto &FnAA = A.getAAFor<AAMemoryBehavior>(*this, FnPos); + return clampStateAndIndicateChange( + getState(), + static_cast<const AAMemoryBehavior::StateType &>(FnAA.getState())); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + if (isAssumedReadNone()) + STATS_DECLTRACK_CS_ATTR(readnone) + else if (isAssumedReadOnly()) + STATS_DECLTRACK_CS_ATTR(readonly) + else if (isAssumedWriteOnly()) + STATS_DECLTRACK_CS_ATTR(writeonly) + } +}; +} // namespace + +ChangeStatus AAMemoryBehaviorFunction::updateImpl(Attributor &A) { + + // The current assumed state used to determine a change. + auto AssumedState = getAssumed(); + + auto CheckRWInst = [&](Instruction &I) { + // If the instruction has an own memory behavior state, use it to restrict + // the local state. No further analysis is required as the other memory + // state is as optimistic as it gets. + if (ImmutableCallSite ICS = ImmutableCallSite(&I)) { + const auto &MemBehaviorAA = A.getAAFor<AAMemoryBehavior>( + *this, IRPosition::callsite_function(ICS)); + intersectAssumedBits(MemBehaviorAA.getAssumed()); + return !isAtFixpoint(); + } + + // Remove access kind modifiers if necessary. + if (I.mayReadFromMemory()) + removeAssumedBits(NO_READS); + if (I.mayWriteToMemory()) + removeAssumedBits(NO_WRITES); + return !isAtFixpoint(); + }; + + if (!A.checkForAllReadWriteInstructions(CheckRWInst, *this)) + return indicatePessimisticFixpoint(); + + return (AssumedState != getAssumed()) ? 
ChangeStatus::CHANGED + : ChangeStatus::UNCHANGED; } -// Helper function that checks the function have a loop which might become an -// endless loop -// FIXME: Any cycle is regarded as endless loop for now. -// We have to allow some patterns. -bool containsPossiblyEndlessLoop(Function &F) { return containsCycle(F); } +ChangeStatus AAMemoryBehaviorFloating::updateImpl(Attributor &A) { + + const IRPosition &IRP = getIRPosition(); + const IRPosition &FnPos = IRPosition::function_scope(IRP); + AAMemoryBehavior::StateType &S = getState(); + + // First, check the function scope. We take the known information and we avoid + // work if the assumed information implies the current assumed information for + // this attribute. This is a valid for all but byval arguments. + Argument *Arg = IRP.getAssociatedArgument(); + AAMemoryBehavior::base_t FnMemAssumedState = + AAMemoryBehavior::StateType::getWorstState(); + if (!Arg || !Arg->hasByValAttr()) { + const auto &FnMemAA = A.getAAFor<AAMemoryBehavior>(*this, FnPos); + FnMemAssumedState = FnMemAA.getAssumed(); + S.addKnownBits(FnMemAA.getKnown()); + if ((S.getAssumed() & FnMemAA.getAssumed()) == S.getAssumed()) + return ChangeStatus::UNCHANGED; + } -void AAWillReturnFunction::initialize(Attributor &A) { - Function &F = getAnchorScope(); + // Make sure the value is not captured (except through "return"), if + // it is, any information derived would be irrelevant anyway as we cannot + // check the potential aliases introduced by the capture. However, no need + // to fall back to anythign less optimistic than the function state. + const auto &ArgNoCaptureAA = A.getAAFor<AANoCapture>( + *this, IRP, /* TrackDependence */ true, DepClassTy::OPTIONAL); + if (!ArgNoCaptureAA.isAssumedNoCaptureMaybeReturned()) { + S.intersectAssumedBits(FnMemAssumedState); + return ChangeStatus::CHANGED; + } - if (containsPossiblyEndlessLoop(F)) - indicatePessimisticFixpoint(); + // The current assumed state used to determine a change. + auto AssumedState = S.getAssumed(); + + // Liveness information to exclude dead users. + // TODO: Take the FnPos once we have call site specific liveness information. + const auto &LivenessAA = A.getAAFor<AAIsDead>( + *this, IRPosition::function(*IRP.getAssociatedFunction())); + + // Visit and expand uses until all are analyzed or a fixpoint is reached. + for (unsigned i = 0; i < Uses.size() && !isAtFixpoint(); i++) { + const Use *U = Uses[i]; + Instruction *UserI = cast<Instruction>(U->getUser()); + LLVM_DEBUG(dbgs() << "[AAMemoryBehavior] Use: " << **U << " in " << *UserI + << " [Dead: " << (LivenessAA.isAssumedDead(UserI)) + << "]\n"); + if (LivenessAA.isAssumedDead(UserI)) + continue; + + // Check if the users of UserI should also be visited. + if (followUsersOfUseIn(A, U, UserI)) + for (const Use &UserIUse : UserI->uses()) + Uses.insert(&UserIUse); + + // If UserI might touch memory we analyze the use in detail. + if (UserI->mayReadOrWriteMemory()) + analyzeUseIn(A, U, UserI); + } + + return (AssumedState != getAssumed()) ? ChangeStatus::CHANGED + : ChangeStatus::UNCHANGED; } -ChangeStatus AAWillReturnFunction::updateImpl(Attributor &A) { - Function &F = getAnchorScope(); +bool AAMemoryBehaviorFloating::followUsersOfUseIn(Attributor &A, const Use *U, + const Instruction *UserI) { + // The loaded value is unrelated to the pointer argument, no need to + // follow the users of the load. + if (isa<LoadInst>(UserI)) + return false; - // The map from instruction opcodes to those instructions in the function. 
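The function-level update above starts from the most optimistic assumption (readnone) and drops NO_READS/NO_WRITES for every reaching instruction that may read or write. A self-contained sketch of that deduction loop follows; the Instr struct is only a stand-in for querying mayReadFromMemory()/mayWriteToMemory() on a real instruction, and the string results mirror getAsStr() above.

#include <cstdint>
#include <cstdio>

// Memory-behavior lattice: two independent "no access" bits that can only
// be removed, never added back, as instructions are visited.
enum : uint32_t { NO_READS = 1u << 0, NO_WRITES = 1u << 1,
                  NO_ACCESSES = NO_READS | NO_WRITES };

struct Instr { bool MayRead, MayWrite; }; // stand-in for llvm::Instruction

const char *deduceMemoryBehavior(const Instr *Body, unsigned N) {
  uint32_t Assumed = NO_ACCESSES;          // optimistic start: readnone
  for (unsigned i = 0; i < N && Assumed; ++i) {
    if (Body[i].MayRead)  Assumed &= ~NO_READS;
    if (Body[i].MayWrite) Assumed &= ~NO_WRITES;
  }
  if (Assumed == NO_ACCESSES) return "readnone";
  if (Assumed & NO_WRITES)    return "readonly";
  if (Assumed & NO_READS)     return "writeonly";
  return "may-read/write";
}

int main() {
  Instr Body[] = {{true, false}, {true, false}};       // two loads
  std::printf("%s\n", deduceMemoryBehavior(Body, 2));  // readonly
}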
- auto &OpcodeInstMap = InfoCache.getOpcodeInstMapForFunction(F); + // By default we follow all uses assuming UserI might leak information on U, + // we have special handling for call sites operands though. + ImmutableCallSite ICS(UserI); + if (!ICS || !ICS.isArgOperand(U)) + return true; - for (unsigned Opcode : - {(unsigned)Instruction::Invoke, (unsigned)Instruction::CallBr, - (unsigned)Instruction::Call}) { - for (Instruction *I : OpcodeInstMap[Opcode]) { - auto ICS = ImmutableCallSite(I); + // If the use is a call argument known not to be captured, the users of + // the call do not need to be visited because they have to be unrelated to + // the input. Note that this check is not trivial even though we disallow + // general capturing of the underlying argument. The reason is that the + // call might the argument "through return", which we allow and for which we + // need to check call users. + unsigned ArgNo = ICS.getArgumentNo(U); + const auto &ArgNoCaptureAA = + A.getAAFor<AANoCapture>(*this, IRPosition::callsite_argument(ICS, ArgNo)); + return !ArgNoCaptureAA.isAssumedNoCapture(); +} - if (ICS.hasFnAttr(Attribute::WillReturn)) - continue; +void AAMemoryBehaviorFloating::analyzeUseIn(Attributor &A, const Use *U, + const Instruction *UserI) { + assert(UserI->mayReadOrWriteMemory()); - auto *WillReturnAA = A.getAAFor<AAWillReturn>(*this, *I); - if (!WillReturnAA || !WillReturnAA->isAssumedWillReturn()) { - indicatePessimisticFixpoint(); - return ChangeStatus::CHANGED; - } + switch (UserI->getOpcode()) { + default: + // TODO: Handle all atomics and other side-effect operations we know of. + break; + case Instruction::Load: + // Loads cause the NO_READS property to disappear. + removeAssumedBits(NO_READS); + return; + + case Instruction::Store: + // Stores cause the NO_WRITES property to disappear if the use is the + // pointer operand. Note that we do assume that capturing was taken care of + // somewhere else. + if (cast<StoreInst>(UserI)->getPointerOperand() == U->get()) + removeAssumedBits(NO_WRITES); + return; - auto *NoRecurseAA = A.getAAFor<AANoRecurse>(*this, *I); + case Instruction::Call: + case Instruction::CallBr: + case Instruction::Invoke: { + // For call sites we look at the argument memory behavior attribute (this + // could be recursive!) in order to restrict our own state. + ImmutableCallSite ICS(UserI); - // FIXME: (i) Prohibit any recursion for now. - // (ii) AANoRecurse isn't implemented yet so currently any call is - // regarded as having recursion. - // Code below should be - // if ((!NoRecurseAA || !NoRecurseAA->isAssumedNoRecurse()) && - if (!NoRecurseAA && !ICS.hasFnAttr(Attribute::NoRecurse)) { - indicatePessimisticFixpoint(); - return ChangeStatus::CHANGED; - } + // Give up on operand bundles. + if (ICS.isBundleOperand(U)) { + indicatePessimisticFixpoint(); + return; } + + // Calling a function does read the function pointer, maybe write it if the + // function is self-modifying. + if (ICS.isCallee(U)) { + removeAssumedBits(NO_READS); + break; + } + + // Adjust the possible access behavior based on the information on the + // argument. + unsigned ArgNo = ICS.getArgumentNo(U); + const IRPosition &ArgPos = IRPosition::callsite_argument(ICS, ArgNo); + const auto &MemBehaviorAA = A.getAAFor<AAMemoryBehavior>(*this, ArgPos); + // "assumed" has at most the same bits as the MemBehaviorAA assumed + // and at least "known". 
+ intersectAssumedBits(MemBehaviorAA.getAssumed()); + return; } + }; - return ChangeStatus::UNCHANGED; + // Generally, look at the "may-properties" and adjust the assumed state if we + // did not trigger special handling before. + if (UserI->mayReadFromMemory()) + removeAssumedBits(NO_READS); + if (UserI->mayWriteToMemory()) + removeAssumedBits(NO_WRITES); } +/// ------------------ Value Constant Range Attribute ------------------------- + +struct AAValueConstantRangeImpl : AAValueConstantRange { + using StateType = IntegerRangeState; + AAValueConstantRangeImpl(const IRPosition &IRP) : AAValueConstantRange(IRP) {} + + /// See AbstractAttribute::getAsStr(). + const std::string getAsStr() const override { + std::string Str; + llvm::raw_string_ostream OS(Str); + OS << "range(" << getBitWidth() << ")<"; + getKnown().print(OS); + OS << " / "; + getAssumed().print(OS); + OS << ">"; + return OS.str(); + } + + /// Helper function to get a SCEV expr for the associated value at program + /// point \p I. + const SCEV *getSCEV(Attributor &A, const Instruction *I = nullptr) const { + if (!getAnchorScope()) + return nullptr; + + ScalarEvolution *SE = + A.getInfoCache().getAnalysisResultForFunction<ScalarEvolutionAnalysis>( + *getAnchorScope()); + + LoopInfo *LI = A.getInfoCache().getAnalysisResultForFunction<LoopAnalysis>( + *getAnchorScope()); + + if (!SE || !LI) + return nullptr; + + const SCEV *S = SE->getSCEV(&getAssociatedValue()); + if (!I) + return S; + + return SE->getSCEVAtScope(S, LI->getLoopFor(I->getParent())); + } + + /// Helper function to get a range from SCEV for the associated value at + /// program point \p I. + ConstantRange getConstantRangeFromSCEV(Attributor &A, + const Instruction *I = nullptr) const { + if (!getAnchorScope()) + return getWorstState(getBitWidth()); + + ScalarEvolution *SE = + A.getInfoCache().getAnalysisResultForFunction<ScalarEvolutionAnalysis>( + *getAnchorScope()); + + const SCEV *S = getSCEV(A, I); + if (!SE || !S) + return getWorstState(getBitWidth()); + + return SE->getUnsignedRange(S); + } + + /// Helper function to get a range from LVI for the associated value at + /// program point \p I. + ConstantRange + getConstantRangeFromLVI(Attributor &A, + const Instruction *CtxI = nullptr) const { + if (!getAnchorScope()) + return getWorstState(getBitWidth()); + + LazyValueInfo *LVI = + A.getInfoCache().getAnalysisResultForFunction<LazyValueAnalysis>( + *getAnchorScope()); + + if (!LVI || !CtxI) + return getWorstState(getBitWidth()); + return LVI->getConstantRange(&getAssociatedValue(), + const_cast<BasicBlock *>(CtxI->getParent()), + const_cast<Instruction *>(CtxI)); + } + + /// See AAValueConstantRange::getKnownConstantRange(..). + ConstantRange + getKnownConstantRange(Attributor &A, + const Instruction *CtxI = nullptr) const override { + if (!CtxI || CtxI == getCtxI()) + return getKnown(); + + ConstantRange LVIR = getConstantRangeFromLVI(A, CtxI); + ConstantRange SCEVR = getConstantRangeFromSCEV(A, CtxI); + return getKnown().intersectWith(SCEVR).intersectWith(LVIR); + } + + /// See AAValueConstantRange::getAssumedConstantRange(..). + ConstantRange + getAssumedConstantRange(Attributor &A, + const Instruction *CtxI = nullptr) const override { + // TODO: Make SCEV use Attributor assumption. + // We may be able to bound a variable range via assumptions in + // Attributor. ex.) If x is assumed to be in [1, 3] and y is known to + // evolve to x^2 + x, then we can say that y is in [2, 12]. 
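Both the known and the assumed range are refined by intersecting whatever the different sources (the IR state, SCEV, LVI) can individually prove, as in getKnownConstantRange() above. A minimal sketch with a plain, non-wrapping interval type follows; llvm::ConstantRange additionally handles wrapping ranges, arbitrary bit widths, and full/empty sets, so this is only an illustration of the intersection step.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Non-wrapping stand-in for llvm::ConstantRange, just to show how ranges
// from different sources are combined by intersection.
struct Range {
  int64_t Lo, Hi; // inclusive bounds; Lo > Hi encodes the empty set

  Range intersectWith(const Range &O) const {
    return {std::max(Lo, O.Lo), std::min(Hi, O.Hi)};
  }
  bool isEmpty() const { return Lo > Hi; }
};

int main() {
  Range FromState{0, 100};  // what the attribute currently knows
  Range FromSCEV{10, 300};  // bound derived from SCEV at the program point
  Range FromLVI{-5, 40};    // bound derived from LazyValueInfo

  Range Known = FromState.intersectWith(FromSCEV).intersectWith(FromLVI);
  std::printf("known range: [%lld, %lld] empty=%d\n",
              (long long)Known.Lo, (long long)Known.Hi, Known.isEmpty());
  // known range: [10, 40] empty=0
}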
+ + if (!CtxI || CtxI == getCtxI()) + return getAssumed(); + + ConstantRange LVIR = getConstantRangeFromLVI(A, CtxI); + ConstantRange SCEVR = getConstantRangeFromSCEV(A, CtxI); + return getAssumed().intersectWith(SCEVR).intersectWith(LVIR); + } + + /// See AbstractAttribute::initialize(..). + void initialize(Attributor &A) override { + // Intersect a range given by SCEV. + intersectKnown(getConstantRangeFromSCEV(A, getCtxI())); + + // Intersect a range given by LVI. + intersectKnown(getConstantRangeFromLVI(A, getCtxI())); + } + + /// Helper function to create MDNode for range metadata. + static MDNode * + getMDNodeForConstantRange(Type *Ty, LLVMContext &Ctx, + const ConstantRange &AssumedConstantRange) { + Metadata *LowAndHigh[] = {ConstantAsMetadata::get(ConstantInt::get( + Ty, AssumedConstantRange.getLower())), + ConstantAsMetadata::get(ConstantInt::get( + Ty, AssumedConstantRange.getUpper()))}; + return MDNode::get(Ctx, LowAndHigh); + } + + /// Return true if \p Assumed is included in \p KnownRanges. + static bool isBetterRange(const ConstantRange &Assumed, MDNode *KnownRanges) { + + if (Assumed.isFullSet()) + return false; + + if (!KnownRanges) + return true; + + // If multiple ranges are annotated in IR, we give up to annotate assumed + // range for now. + + // TODO: If there exists a known range which containts assumed range, we + // can say assumed range is better. + if (KnownRanges->getNumOperands() > 2) + return false; + + ConstantInt *Lower = + mdconst::extract<ConstantInt>(KnownRanges->getOperand(0)); + ConstantInt *Upper = + mdconst::extract<ConstantInt>(KnownRanges->getOperand(1)); + + ConstantRange Known(Lower->getValue(), Upper->getValue()); + return Known.contains(Assumed) && Known != Assumed; + } + + /// Helper function to set range metadata. + static bool + setRangeMetadataIfisBetterRange(Instruction *I, + const ConstantRange &AssumedConstantRange) { + auto *OldRangeMD = I->getMetadata(LLVMContext::MD_range); + if (isBetterRange(AssumedConstantRange, OldRangeMD)) { + if (!AssumedConstantRange.isEmptySet()) { + I->setMetadata(LLVMContext::MD_range, + getMDNodeForConstantRange(I->getType(), I->getContext(), + AssumedConstantRange)); + return true; + } + } + return false; + } + + /// See AbstractAttribute::manifest() + ChangeStatus manifest(Attributor &A) override { + ChangeStatus Changed = ChangeStatus::UNCHANGED; + ConstantRange AssumedConstantRange = getAssumedConstantRange(A); + assert(!AssumedConstantRange.isFullSet() && "Invalid state"); + + auto &V = getAssociatedValue(); + if (!AssumedConstantRange.isEmptySet() && + !AssumedConstantRange.isSingleElement()) { + if (Instruction *I = dyn_cast<Instruction>(&V)) + if (isa<CallInst>(I) || isa<LoadInst>(I)) + if (setRangeMetadataIfisBetterRange(I, AssumedConstantRange)) + Changed = ChangeStatus::CHANGED; + } + + return Changed; + } +}; + +struct AAValueConstantRangeArgument final : public AAValueConstantRangeImpl { + + AAValueConstantRangeArgument(const IRPosition &IRP) + : AAValueConstantRangeImpl(IRP) {} + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Use AAArgumentFromCallSiteArguments + + IntegerRangeState S(getBitWidth()); + clampCallSiteArgumentStates<AAValueConstantRange, IntegerRangeState>( + A, *this, S); + + // TODO: If we know we visited all incoming values, thus no are assumed + // dead, we can take the known information from the state T. 
+ return clampStateAndIndicateChange<IntegerRangeState>(this->getState(), S); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_ARG_ATTR(value_range) + } +}; + +struct AAValueConstantRangeReturned : AAValueConstantRangeImpl { + AAValueConstantRangeReturned(const IRPosition &IRP) + : AAValueConstantRangeImpl(IRP) {} + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Use AAReturnedFromReturnedValues + + // TODO: If we know we visited all returned values, thus no are assumed + // dead, we can take the known information from the state T. + + IntegerRangeState S(getBitWidth()); + + clampReturnedValueStates<AAValueConstantRange, IntegerRangeState>(A, *this, + S); + return clampStateAndIndicateChange<StateType>(this->getState(), S); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_FNRET_ATTR(value_range) + } +}; + +struct AAValueConstantRangeFloating : AAValueConstantRangeImpl { + AAValueConstantRangeFloating(const IRPosition &IRP) + : AAValueConstantRangeImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AAValueConstantRange::initialize(A); + Value &V = getAssociatedValue(); + + if (auto *C = dyn_cast<ConstantInt>(&V)) { + unionAssumed(ConstantRange(C->getValue())); + indicateOptimisticFixpoint(); + return; + } + + if (isa<UndefValue>(&V)) { + indicateOptimisticFixpoint(); + return; + } + + if (auto *I = dyn_cast<Instruction>(&V)) + if (isa<BinaryOperator>(I) || isa<CmpInst>(I)) { + Value *LHS = I->getOperand(0); + Value *RHS = I->getOperand(1); + + if (LHS->getType()->isIntegerTy() && RHS->getType()->isIntegerTy()) + return; + } + + // If it is a load instruction with range metadata, use it. + if (LoadInst *LI = dyn_cast<LoadInst>(&V)) + if (auto *RangeMD = LI->getMetadata(LLVMContext::MD_range)) { + intersectKnown(getConstantRangeFromMetadata(*RangeMD)); + return; + } + + // Otherwise we give up. + indicatePessimisticFixpoint(); + + LLVM_DEBUG(dbgs() << "[Attributor][AAValueConstantRange] We give up: " + << getAssociatedValue()); + } + + bool calculateBinaryOperator(Attributor &A, BinaryOperator *BinOp, + IntegerRangeState &T, Instruction *CtxI) { + Value *LHS = BinOp->getOperand(0); + Value *RHS = BinOp->getOperand(1); + + auto &LHSAA = + A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*LHS)); + auto LHSAARange = LHSAA.getAssumedConstantRange(A, CtxI); + + auto &RHSAA = + A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*RHS)); + auto RHSAARange = RHSAA.getAssumedConstantRange(A, CtxI); + + auto AssumedRange = LHSAARange.binaryOp(BinOp->getOpcode(), RHSAARange); + + T.unionAssumed(AssumedRange); + + // TODO: Track a known state too. + + return T.isValidState(); + } + + bool calculateCmpInst(Attributor &A, CmpInst *CmpI, IntegerRangeState &T, + Instruction *CtxI) { + Value *LHS = CmpI->getOperand(0); + Value *RHS = CmpI->getOperand(1); + + auto &LHSAA = + A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*LHS)); + auto &RHSAA = + A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*RHS)); + + auto LHSAARange = LHSAA.getAssumedConstantRange(A, CtxI); + auto RHSAARange = RHSAA.getAssumedConstantRange(A, CtxI); + // If one of them is empty set, we can't decide. 
+ if (LHSAARange.isEmptySet() || RHSAARange.isEmptySet()) + return true; + + bool MustTrue = false, MustFalse = false; + + auto AllowedRegion = + ConstantRange::makeAllowedICmpRegion(CmpI->getPredicate(), RHSAARange); + + auto SatisfyingRegion = ConstantRange::makeSatisfyingICmpRegion( + CmpI->getPredicate(), RHSAARange); + + if (AllowedRegion.intersectWith(LHSAARange).isEmptySet()) + MustFalse = true; + + if (SatisfyingRegion.contains(LHSAARange)) + MustTrue = true; + + assert((!MustTrue || !MustFalse) && + "Either MustTrue or MustFalse should be false!"); + + if (MustTrue) + T.unionAssumed(ConstantRange(APInt(/* numBits */ 1, /* val */ 1))); + else if (MustFalse) + T.unionAssumed(ConstantRange(APInt(/* numBits */ 1, /* val */ 0))); + else + T.unionAssumed(ConstantRange(/* BitWidth */ 1, /* isFullSet */ true)); + + LLVM_DEBUG(dbgs() << "[AAValueConstantRange] " << *CmpI << " " << LHSAA + << " " << RHSAA << "\n"); + + // TODO: Track a known state too. + return T.isValidState(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + Instruction *CtxI = getCtxI(); + auto VisitValueCB = [&](Value &V, IntegerRangeState &T, + bool Stripped) -> bool { + Instruction *I = dyn_cast<Instruction>(&V); + if (!I) { + + // If the value is not instruction, we query AA to Attributor. + const auto &AA = + A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(V)); + + // Clamp operator is not used to utilize a program point CtxI. + T.unionAssumed(AA.getAssumedConstantRange(A, CtxI)); + + return T.isValidState(); + } + + if (auto *BinOp = dyn_cast<BinaryOperator>(I)) + return calculateBinaryOperator(A, BinOp, T, CtxI); + else if (auto *CmpI = dyn_cast<CmpInst>(I)) + return calculateCmpInst(A, CmpI, T, CtxI); + else { + // Give up with other instructions. + // TODO: Add other instructions + + T.indicatePessimisticFixpoint(); + return false; + } + }; + + IntegerRangeState T(getBitWidth()); + + if (!genericValueTraversal<AAValueConstantRange, IntegerRangeState>( + A, getIRPosition(), *this, T, VisitValueCB)) + return indicatePessimisticFixpoint(); + + return clampStateAndIndicateChange(getState(), T); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_FLOATING_ATTR(value_range) + } +}; + +struct AAValueConstantRangeFunction : AAValueConstantRangeImpl { + AAValueConstantRangeFunction(const IRPosition &IRP) + : AAValueConstantRangeImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + ChangeStatus updateImpl(Attributor &A) override { + llvm_unreachable("AAValueConstantRange(Function|CallSite)::updateImpl will " + "not be called"); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(value_range) } +}; + +struct AAValueConstantRangeCallSite : AAValueConstantRangeFunction { + AAValueConstantRangeCallSite(const IRPosition &IRP) + : AAValueConstantRangeFunction(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(value_range) } +}; + +struct AAValueConstantRangeCallSiteReturned : AAValueConstantRangeReturned { + AAValueConstantRangeCallSiteReturned(const IRPosition &IRP) + : AAValueConstantRangeReturned(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + // If it is a load instruction with range metadata, use the metadata. 
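// Illustrative sketch, not part of the patch: how calculateCmpInst() above
// can fold a compare once the operand ranges are known not to overlap.
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/InstrTypes.h"

static bool cmpMustBeTrueExample() {
  using namespace llvm;
  ConstantRange LHS(APInt(32, 0), APInt(32, 10));  // [0, 10)
  ConstantRange RHS(APInt(32, 10), APInt(32, 20)); // [10, 20)
  ConstantRange Satisfying =
      ConstantRange::makeSatisfyingICmpRegion(CmpInst::ICMP_ULT, RHS);
  // Every value in [0, 10) is ult every value in [10, 20), so the icmp is
  // known to evaluate to true.
  return Satisfying.contains(LHS);
}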
+ if (CallInst *CI = dyn_cast<CallInst>(&getAssociatedValue())) + if (auto *RangeMD = CI->getMetadata(LLVMContext::MD_range)) + intersectKnown(getConstantRangeFromMetadata(*RangeMD)); + + AAValueConstantRangeReturned::initialize(A); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_CSRET_ATTR(value_range) + } +}; +struct AAValueConstantRangeCallSiteArgument : AAValueConstantRangeFloating { + AAValueConstantRangeCallSiteArgument(const IRPosition &IRP) + : AAValueConstantRangeFloating(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_CSARG_ATTR(value_range) + } +}; /// ---------------------------------------------------------------------------- /// Attributor /// ---------------------------------------------------------------------------- -bool Attributor::checkForAllCallSites(Function &F, - std::function<bool(CallSite)> &Pred, - bool RequireAllCallSites) { +bool Attributor::isAssumedDead(const AbstractAttribute &AA, + const AAIsDead *LivenessAA) { + const Instruction *CtxI = AA.getIRPosition().getCtxI(); + if (!CtxI) + return false; + + // TODO: Find a good way to utilize fine and coarse grained liveness + // information. + if (!LivenessAA) + LivenessAA = + &getAAFor<AAIsDead>(AA, IRPosition::function(*CtxI->getFunction()), + /* TrackDependence */ false); + + // Don't check liveness for AAIsDead. + if (&AA == LivenessAA) + return false; + + if (!LivenessAA->isAssumedDead(CtxI)) + return false; + + // We actually used liveness information so we have to record a dependence. + recordDependence(*LivenessAA, AA, DepClassTy::OPTIONAL); + + return true; +} + +bool Attributor::checkForAllUses( + const function_ref<bool(const Use &, bool &)> &Pred, + const AbstractAttribute &QueryingAA, const Value &V) { + const IRPosition &IRP = QueryingAA.getIRPosition(); + SmallVector<const Use *, 16> Worklist; + SmallPtrSet<const Use *, 16> Visited; + + for (const Use &U : V.uses()) + Worklist.push_back(&U); + + LLVM_DEBUG(dbgs() << "[Attributor] Got " << Worklist.size() + << " initial uses to check\n"); + + if (Worklist.empty()) + return true; + + bool AnyDead = false; + const Function *ScopeFn = IRP.getAnchorScope(); + const auto *LivenessAA = + ScopeFn ? &getAAFor<AAIsDead>(QueryingAA, IRPosition::function(*ScopeFn), + /* TrackDependence */ false) + : nullptr; + + while (!Worklist.empty()) { + const Use *U = Worklist.pop_back_val(); + if (!Visited.insert(U).second) + continue; + LLVM_DEBUG(dbgs() << "[Attributor] Check use: " << **U << "\n"); + if (Instruction *UserI = dyn_cast<Instruction>(U->getUser())) + if (LivenessAA && LivenessAA->isAssumedDead(UserI)) { + LLVM_DEBUG(dbgs() << "[Attributor] Dead user: " << *UserI << ": " + << *LivenessAA << "\n"); + AnyDead = true; + continue; + } + + bool Follow = false; + if (!Pred(*U, Follow)) + return false; + if (!Follow) + continue; + for (const Use &UU : U->getUser()->uses()) + Worklist.push_back(&UU); + } + + if (AnyDead) + recordDependence(*LivenessAA, QueryingAA, DepClassTy::OPTIONAL); + + return true; +} + +bool Attributor::checkForAllCallSites( + const function_ref<bool(AbstractCallSite)> &Pred, + const AbstractAttribute &QueryingAA, bool RequireAllCallSites) { // We can try to determine information from // the call sites. However, this is only possible all call sites are known, // hence the function has internal linkage. 
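// Hypothetical caller of the checkForAllUses() API defined above; the helper
// name and the property it queries are invented for illustration.
#include "llvm/IR/InstrTypes.h"
#include "llvm/Transforms/IPO/Attributor.h"

static bool neverPassedToCall(llvm::Attributor &A,
                              const llvm::AbstractAttribute &QueryingAA,
                              llvm::Value &V) {
  using namespace llvm;
  return A.checkForAllUses(
      [](const Use &U, bool &Follow) {
        Follow = false; // only inspect direct uses in this sketch
        return !isa<CallBase>(U.getUser());
      },
      QueryingAA, V);
}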
- if (RequireAllCallSites && !F.hasInternalLinkage()) { + const IRPosition &IRP = QueryingAA.getIRPosition(); + const Function *AssociatedFunction = IRP.getAssociatedFunction(); + if (!AssociatedFunction) { + LLVM_DEBUG(dbgs() << "[Attributor] No function associated with " << IRP + << "\n"); + return false; + } + + return checkForAllCallSites(Pred, *AssociatedFunction, RequireAllCallSites, + &QueryingAA); +} + +bool Attributor::checkForAllCallSites( + const function_ref<bool(AbstractCallSite)> &Pred, const Function &Fn, + bool RequireAllCallSites, const AbstractAttribute *QueryingAA) { + if (RequireAllCallSites && !Fn.hasLocalLinkage()) { LLVM_DEBUG( dbgs() - << "Attributor: Function " << F.getName() + << "[Attributor] Function " << Fn.getName() << " has no internal linkage, hence not all call sites are known\n"); return false; } - for (const Use &U : F.uses()) { + for (const Use &U : Fn.uses()) { + AbstractCallSite ACS(&U); + if (!ACS) { + LLVM_DEBUG(dbgs() << "[Attributor] Function " << Fn.getName() + << " has non call site use " << *U.get() << " in " + << *U.getUser() << "\n"); + // BlockAddress users are allowed. + if (isa<BlockAddress>(U.getUser())) + continue; + return false; + } + + Instruction *I = ACS.getInstruction(); + Function *Caller = I->getFunction(); + + const auto *LivenessAA = + lookupAAFor<AAIsDead>(IRPosition::function(*Caller), QueryingAA, + /* TrackDependence */ false); + + // Skip dead calls. + if (LivenessAA && LivenessAA->isAssumedDead(I)) { + // We actually used liveness information so we have to record a + // dependence. + if (QueryingAA) + recordDependence(*LivenessAA, *QueryingAA, DepClassTy::OPTIONAL); + continue; + } - CallSite CS(U.getUser()); - if (!CS || !CS.isCallee(&U) || !CS.getCaller()->hasExactDefinition()) { + const Use *EffectiveUse = + ACS.isCallbackCall() ? &ACS.getCalleeUseForCallback() : &U; + if (!ACS.isCallee(EffectiveUse)) { if (!RequireAllCallSites) continue; - - LLVM_DEBUG(dbgs() << "Attributor: User " << *U.getUser() - << " is an invalid use of " << F.getName() << "\n"); + LLVM_DEBUG(dbgs() << "[Attributor] User " << EffectiveUse->getUser() + << " is an invalid use of " << Fn.getName() << "\n"); return false; } - if (Pred(CS)) + if (Pred(ACS)) continue; - LLVM_DEBUG(dbgs() << "Attributor: Call site callback failed for " - << *CS.getInstruction() << "\n"); + LLVM_DEBUG(dbgs() << "[Attributor] Call site callback failed for " + << *ACS.getInstruction() << "\n"); return false; } return true; } -ChangeStatus Attributor::run() { - // Initialize all abstract attributes. - for (AbstractAttribute *AA : AllAbstractAttributes) - AA->initialize(*this); +bool Attributor::checkForAllReturnedValuesAndReturnInsts( + const function_ref<bool(Value &, const SmallSetVector<ReturnInst *, 4> &)> + &Pred, + const AbstractAttribute &QueryingAA) { + + const IRPosition &IRP = QueryingAA.getIRPosition(); + // Since we need to provide return instructions we have to have an exact + // definition. + const Function *AssociatedFunction = IRP.getAssociatedFunction(); + if (!AssociatedFunction) + return false; + + // If this is a call site query we use the call site specific return values + // and liveness information. + // TODO: use the function scope once we have call site AAReturnedValues. 
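// Hypothetical use of the call-site walker defined above: require that no
// known call site of Fn is a callback call.  Illustration only.
#include "llvm/Transforms/IPO/Attributor.h"

static bool onlyDirectCallSites(llvm::Attributor &A,
                                const llvm::Function &Fn) {
  return A.checkForAllCallSites(
      [](llvm::AbstractCallSite ACS) { return !ACS.isCallbackCall(); }, Fn,
      /* RequireAllCallSites */ true, /* QueryingAA */ nullptr);
}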
+ const IRPosition &QueryIRP = IRPosition::function(*AssociatedFunction); + const auto &AARetVal = getAAFor<AAReturnedValues>(QueryingAA, QueryIRP); + if (!AARetVal.getState().isValidState()) + return false; + + return AARetVal.checkForAllReturnedValuesAndReturnInsts(Pred); +} + +bool Attributor::checkForAllReturnedValues( + const function_ref<bool(Value &)> &Pred, + const AbstractAttribute &QueryingAA) { + + const IRPosition &IRP = QueryingAA.getIRPosition(); + const Function *AssociatedFunction = IRP.getAssociatedFunction(); + if (!AssociatedFunction) + return false; + + // TODO: use the function scope once we have call site AAReturnedValues. + const IRPosition &QueryIRP = IRPosition::function(*AssociatedFunction); + const auto &AARetVal = getAAFor<AAReturnedValues>(QueryingAA, QueryIRP); + if (!AARetVal.getState().isValidState()) + return false; + + return AARetVal.checkForAllReturnedValuesAndReturnInsts( + [&](Value &RV, const SmallSetVector<ReturnInst *, 4> &) { + return Pred(RV); + }); +} + +static bool +checkForAllInstructionsImpl(InformationCache::OpcodeInstMapTy &OpcodeInstMap, + const function_ref<bool(Instruction &)> &Pred, + const AAIsDead *LivenessAA, bool &AnyDead, + const ArrayRef<unsigned> &Opcodes) { + for (unsigned Opcode : Opcodes) { + for (Instruction *I : OpcodeInstMap[Opcode]) { + // Skip dead instructions. + if (LivenessAA && LivenessAA->isAssumedDead(I)) { + AnyDead = true; + continue; + } + + if (!Pred(*I)) + return false; + } + } + return true; +} + +bool Attributor::checkForAllInstructions( + const llvm::function_ref<bool(Instruction &)> &Pred, + const AbstractAttribute &QueryingAA, const ArrayRef<unsigned> &Opcodes) { + + const IRPosition &IRP = QueryingAA.getIRPosition(); + // Since we need to provide instructions we have to have an exact definition. + const Function *AssociatedFunction = IRP.getAssociatedFunction(); + if (!AssociatedFunction) + return false; + + // TODO: use the function scope once we have call site AAReturnedValues. + const IRPosition &QueryIRP = IRPosition::function(*AssociatedFunction); + const auto &LivenessAA = + getAAFor<AAIsDead>(QueryingAA, QueryIRP, /* TrackDependence */ false); + bool AnyDead = false; + + auto &OpcodeInstMap = + InfoCache.getOpcodeInstMapForFunction(*AssociatedFunction); + if (!checkForAllInstructionsImpl(OpcodeInstMap, Pred, &LivenessAA, AnyDead, + Opcodes)) + return false; + + // If we actually used liveness information so we have to record a dependence. + if (AnyDead) + recordDependence(LivenessAA, QueryingAA, DepClassTy::OPTIONAL); + + return true; +} + +bool Attributor::checkForAllReadWriteInstructions( + const llvm::function_ref<bool(Instruction &)> &Pred, + AbstractAttribute &QueryingAA) { + + const Function *AssociatedFunction = + QueryingAA.getIRPosition().getAssociatedFunction(); + if (!AssociatedFunction) + return false; + + // TODO: use the function scope once we have call site AAReturnedValues. + const IRPosition &QueryIRP = IRPosition::function(*AssociatedFunction); + const auto &LivenessAA = + getAAFor<AAIsDead>(QueryingAA, QueryIRP, /* TrackDependence */ false); + bool AnyDead = false; + + for (Instruction *I : + InfoCache.getReadOrWriteInstsForFunction(*AssociatedFunction)) { + // Skip dead instructions. + if (LivenessAA.isAssumedDead(I)) { + AnyDead = true; + continue; + } + + if (!Pred(*I)) + return false; + } + // If we actually used liveness information so we have to record a dependence. 
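// Hypothetical query built on the checkForAllInstructions() API above: every
// (assumed-live) return in the associated function must not return undef.
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/IPO/Attributor.h"

static bool noUndefReturns(llvm::Attributor &A,
                           const llvm::AbstractAttribute &QueryingAA) {
  using namespace llvm;
  return A.checkForAllInstructions(
      [](Instruction &I) {
        Value *RV = cast<ReturnInst>(I).getReturnValue();
        return !RV || !isa<UndefValue>(RV);
      },
      QueryingAA, {(unsigned)Instruction::Ret});
}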
+ if (AnyDead) + recordDependence(LivenessAA, QueryingAA, DepClassTy::OPTIONAL); + + return true; +} + +ChangeStatus Attributor::run(Module &M) { LLVM_DEBUG(dbgs() << "[Attributor] Identified and initialized " << AllAbstractAttributes.size() << " abstract attributes.\n"); @@ -1367,41 +5855,107 @@ ChangeStatus Attributor::run() { unsigned IterationCounter = 1; SmallVector<AbstractAttribute *, 64> ChangedAAs; - SetVector<AbstractAttribute *> Worklist; + SetVector<AbstractAttribute *> Worklist, InvalidAAs; Worklist.insert(AllAbstractAttributes.begin(), AllAbstractAttributes.end()); + bool RecomputeDependences = false; + do { + // Remember the size to determine new attributes. + size_t NumAAs = AllAbstractAttributes.size(); LLVM_DEBUG(dbgs() << "\n\n[Attributor] #Iteration: " << IterationCounter << ", Worklist size: " << Worklist.size() << "\n"); + // For invalid AAs we can fix dependent AAs that have a required dependence, + // thereby folding long dependence chains in a single step without the need + // to run updates. + for (unsigned u = 0; u < InvalidAAs.size(); ++u) { + AbstractAttribute *InvalidAA = InvalidAAs[u]; + auto &QuerriedAAs = QueryMap[InvalidAA]; + LLVM_DEBUG(dbgs() << "[Attributor] InvalidAA: " << *InvalidAA << " has " + << QuerriedAAs.RequiredAAs.size() << "/" + << QuerriedAAs.OptionalAAs.size() + << " required/optional dependences\n"); + for (AbstractAttribute *DepOnInvalidAA : QuerriedAAs.RequiredAAs) { + AbstractState &DOIAAState = DepOnInvalidAA->getState(); + DOIAAState.indicatePessimisticFixpoint(); + ++NumAttributesFixedDueToRequiredDependences; + assert(DOIAAState.isAtFixpoint() && "Expected fixpoint state!"); + if (!DOIAAState.isValidState()) + InvalidAAs.insert(DepOnInvalidAA); + } + if (!RecomputeDependences) + Worklist.insert(QuerriedAAs.OptionalAAs.begin(), + QuerriedAAs.OptionalAAs.end()); + } + + // If dependences (=QueryMap) are recomputed we have to look at all abstract + // attributes again, regardless of what changed in the last iteration. + if (RecomputeDependences) { + LLVM_DEBUG( + dbgs() << "[Attributor] Run all AAs to recompute dependences\n"); + QueryMap.clear(); + ChangedAAs.clear(); + Worklist.insert(AllAbstractAttributes.begin(), + AllAbstractAttributes.end()); + } + // Add all abstract attributes that are potentially dependent on one that // changed to the work list. for (AbstractAttribute *ChangedAA : ChangedAAs) { auto &QuerriedAAs = QueryMap[ChangedAA]; - Worklist.insert(QuerriedAAs.begin(), QuerriedAAs.end()); + Worklist.insert(QuerriedAAs.OptionalAAs.begin(), + QuerriedAAs.OptionalAAs.end()); + Worklist.insert(QuerriedAAs.RequiredAAs.begin(), + QuerriedAAs.RequiredAAs.end()); } - // Reset the changed set. + LLVM_DEBUG(dbgs() << "[Attributor] #Iteration: " << IterationCounter + << ", Worklist+Dependent size: " << Worklist.size() + << "\n"); + + // Reset the changed and invalid set. ChangedAAs.clear(); + InvalidAAs.clear(); // Update all abstract attribute in the work list and record the ones that // changed. for (AbstractAttribute *AA : Worklist) - if (AA->update(*this) == ChangeStatus::CHANGED) - ChangedAAs.push_back(AA); + if (!AA->getState().isAtFixpoint() && !isAssumedDead(*AA, nullptr)) { + QueriedNonFixAA = false; + if (AA->update(*this) == ChangeStatus::CHANGED) { + ChangedAAs.push_back(AA); + if (!AA->getState().isValidState()) + InvalidAAs.insert(AA); + } else if (!QueriedNonFixAA) { + // If the attribute did not query any non-fix information, the state + // will not change and we can indicate that right away. 
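// Stand-alone sketch of the dependence-driven worklist scheme used above:
// only states whose dependences changed are re-run.  The Node type and its
// monotone update are invented for illustration.
#include <algorithm>
#include <map>
#include <set>
#include <vector>

struct Node {
  int State = 0;              // grows monotonically up to a fixed cap
  std::vector<Node *> Inputs; // states this node queries during update()
  bool update() {
    int Old = State;
    for (Node *In : Inputs)
      State = std::max(State, std::min(In->State + 1, 8));
    return State != Old;      // "changed" means dependents must be re-run
  }
};

static void runToFixpoint(std::vector<Node *> &All,
                          std::map<Node *, std::set<Node *>> &Dependents) {
  std::set<Node *> Worklist(All.begin(), All.end());
  while (!Worklist.empty()) {
    std::set<Node *> Next;
    for (Node *N : Worklist)
      if (N->update())
        for (Node *Dep : Dependents[N])
          Next.insert(Dep);
    Worklist.swap(Next);
  }
}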
+ AA->getState().indicateOptimisticFixpoint(); + } + } + + // Check if we recompute the dependences in the next iteration. + RecomputeDependences = (DepRecomputeInterval > 0 && + IterationCounter % DepRecomputeInterval == 0); + + // Add attributes to the changed set if they have been created in the last + // iteration. + ChangedAAs.append(AllAbstractAttributes.begin() + NumAAs, + AllAbstractAttributes.end()); // Reset the work list and repopulate with the changed abstract attributes. // Note that dependent ones are added above. Worklist.clear(); Worklist.insert(ChangedAAs.begin(), ChangedAAs.end()); - } while (!Worklist.empty() && ++IterationCounter < MaxFixpointIterations); + } while (!Worklist.empty() && (IterationCounter++ < MaxFixpointIterations || + VerifyMaxFixpointIterations)); LLVM_DEBUG(dbgs() << "\n[Attributor] Fixpoint iteration done after: " << IterationCounter << "/" << MaxFixpointIterations << " iterations\n"); - bool FinishedAtFixpoint = Worklist.empty(); + size_t NumFinalAAs = AllAbstractAttributes.size(); // Reset abstract arguments not settled in a sound fixpoint by now. This // happens when we stopped the fixpoint iteration early. Note that only the @@ -1422,7 +5976,10 @@ ChangeStatus Attributor::run() { } auto &QuerriedAAs = QueryMap[ChangedAA]; - ChangedAAs.append(QuerriedAAs.begin(), QuerriedAAs.end()); + ChangedAAs.append(QuerriedAAs.OptionalAAs.begin(), + QuerriedAAs.OptionalAAs.end()); + ChangedAAs.append(QuerriedAAs.RequiredAAs.begin(), + QuerriedAAs.RequiredAAs.end()); } LLVM_DEBUG({ @@ -1448,8 +6005,14 @@ ChangeStatus Attributor::run() { if (!State.isValidState()) continue; + // Skip dead code. + if (isAssumedDead(*AA, nullptr)) + continue; // Manifest the state and record if we changed the IR. ChangeStatus LocalChange = AA->manifest(*this); + if (LocalChange == ChangeStatus::CHANGED && AreStatisticsEnabled()) + AA->trackStatistics(); + ManifestChange = ManifestChange | LocalChange; NumAtFixpoint++; @@ -1462,69 +6025,407 @@ ChangeStatus Attributor::run() { << " arguments while " << NumAtFixpoint << " were in a valid fixpoint state\n"); - // If verification is requested, we finished this run at a fixpoint, and the - // IR was changed, we re-run the whole fixpoint analysis, starting at - // re-initialization of the arguments. This re-run should not result in an IR - // change. Though, the (virtual) state of attributes at the end of the re-run - // might be more optimistic than the known state or the IR state if the better - // state cannot be manifested. - if (VerifyAttributor && FinishedAtFixpoint && - ManifestChange == ChangeStatus::CHANGED) { - VerifyAttributor = false; - ChangeStatus VerifyStatus = run(); - if (VerifyStatus != ChangeStatus::UNCHANGED) - llvm_unreachable( - "Attributor verification failed, re-run did result in an IR change " - "even after a fixpoint was reached in the original run. (False " - "positives possible!)"); - VerifyAttributor = true; - } - NumAttributesManifested += NumManifested; NumAttributesValidFixpoint += NumAtFixpoint; + (void)NumFinalAAs; + assert( + NumFinalAAs == AllAbstractAttributes.size() && + "Expected the final number of abstract attributes to remain unchanged!"); + + // Delete stuff at the end to avoid invalid references and a nice order. 
+ { + LLVM_DEBUG(dbgs() << "\n[Attributor] Delete at least " + << ToBeDeletedFunctions.size() << " functions and " + << ToBeDeletedBlocks.size() << " blocks and " + << ToBeDeletedInsts.size() << " instructions and " + << ToBeChangedUses.size() << " uses\n"); + + SmallVector<Instruction *, 32> DeadInsts; + SmallVector<Instruction *, 32> TerminatorsToFold; + + for (auto &It : ToBeChangedUses) { + Use *U = It.first; + Value *NewV = It.second; + Value *OldV = U->get(); + LLVM_DEBUG(dbgs() << "Use " << *NewV << " in " << *U->getUser() + << " instead of " << *OldV << "\n"); + U->set(NewV); + if (Instruction *I = dyn_cast<Instruction>(OldV)) + if (!isa<PHINode>(I) && !ToBeDeletedInsts.count(I) && + isInstructionTriviallyDead(I)) { + DeadInsts.push_back(I); + } + if (isa<Constant>(NewV) && isa<BranchInst>(U->getUser())) { + Instruction *UserI = cast<Instruction>(U->getUser()); + if (isa<UndefValue>(NewV)) { + ToBeChangedToUnreachableInsts.insert(UserI); + } else { + TerminatorsToFold.push_back(UserI); + } + } + } + for (auto &V : InvokeWithDeadSuccessor) + if (InvokeInst *II = dyn_cast_or_null<InvokeInst>(V)) { + bool UnwindBBIsDead = II->hasFnAttr(Attribute::NoUnwind); + bool NormalBBIsDead = II->hasFnAttr(Attribute::NoReturn); + bool Invoke2CallAllowed = + !AAIsDeadFunction::mayCatchAsynchronousExceptions( + *II->getFunction()); + assert((UnwindBBIsDead || NormalBBIsDead) && + "Invoke does not have dead successors!"); + BasicBlock *BB = II->getParent(); + BasicBlock *NormalDestBB = II->getNormalDest(); + if (UnwindBBIsDead) { + Instruction *NormalNextIP = &NormalDestBB->front(); + if (Invoke2CallAllowed) { + changeToCall(II); + NormalNextIP = BB->getTerminator(); + } + if (NormalBBIsDead) + ToBeChangedToUnreachableInsts.insert(NormalNextIP); + } else { + assert(NormalBBIsDead && "Broken invariant!"); + if (!NormalDestBB->getUniquePredecessor()) + NormalDestBB = SplitBlockPredecessors(NormalDestBB, {BB}, ".dead"); + ToBeChangedToUnreachableInsts.insert(&NormalDestBB->front()); + } + } + for (auto &V : ToBeChangedToUnreachableInsts) + if (Instruction *I = dyn_cast_or_null<Instruction>(V)) + changeToUnreachable(I, /* UseLLVMTrap */ false); + for (Instruction *I : TerminatorsToFold) + ConstantFoldTerminator(I->getParent()); + + for (Instruction *I : ToBeDeletedInsts) { + I->replaceAllUsesWith(UndefValue::get(I->getType())); + if (!isa<PHINode>(I) && isInstructionTriviallyDead(I)) + DeadInsts.push_back(I); + else + I->eraseFromParent(); + } + + RecursivelyDeleteTriviallyDeadInstructions(DeadInsts); + + if (unsigned NumDeadBlocks = ToBeDeletedBlocks.size()) { + SmallVector<BasicBlock *, 8> ToBeDeletedBBs; + ToBeDeletedBBs.reserve(NumDeadBlocks); + ToBeDeletedBBs.append(ToBeDeletedBlocks.begin(), ToBeDeletedBlocks.end()); + // Actually we do not delete the blocks but squash them into a single + // unreachable but untangling branches that jump here is something we need + // to do in a more generic way. + DetatchDeadBlocks(ToBeDeletedBBs, nullptr); + STATS_DECL(AAIsDead, BasicBlock, "Number of dead basic blocks deleted."); + BUILD_STAT_NAME(AAIsDead, BasicBlock) += ToBeDeletedBlocks.size(); + } + + // Identify dead internal functions and delete them. This happens outside + // the other fixpoint analysis as we might treat potentially dead functions + // as live to lower the number of iterations. If they happen to be dead, the + // below fixpoint loop will identify and eliminate them. 
+ SmallVector<Function *, 8> InternalFns; + for (Function &F : M) + if (F.hasLocalLinkage()) + InternalFns.push_back(&F); + + bool FoundDeadFn = true; + while (FoundDeadFn) { + FoundDeadFn = false; + for (unsigned u = 0, e = InternalFns.size(); u < e; ++u) { + Function *F = InternalFns[u]; + if (!F) + continue; + + if (!checkForAllCallSites( + [this](AbstractCallSite ACS) { + return ToBeDeletedFunctions.count( + ACS.getInstruction()->getFunction()); + }, + *F, true, nullptr)) + continue; + + ToBeDeletedFunctions.insert(F); + InternalFns[u] = nullptr; + FoundDeadFn = true; + } + } + } + + STATS_DECL(AAIsDead, Function, "Number of dead functions deleted."); + BUILD_STAT_NAME(AAIsDead, Function) += ToBeDeletedFunctions.size(); + + // Rewrite the functions as requested during manifest. + ManifestChange = ManifestChange | rewriteFunctionSignatures(); + + for (Function *Fn : ToBeDeletedFunctions) { + Fn->deleteBody(); + Fn->replaceAllUsesWith(UndefValue::get(Fn->getType())); + Fn->eraseFromParent(); + } + + if (VerifyMaxFixpointIterations && + IterationCounter != MaxFixpointIterations) { + errs() << "\n[Attributor] Fixpoint iteration done after: " + << IterationCounter << "/" << MaxFixpointIterations + << " iterations\n"; + llvm_unreachable("The fixpoint was not reached with exactly the number of " + "specified iterations!"); + } + return ManifestChange; } -void Attributor::identifyDefaultAbstractAttributes( - Function &F, InformationCache &InfoCache, - DenseSet</* Attribute::AttrKind */ unsigned> *Whitelist) { +bool Attributor::registerFunctionSignatureRewrite( + Argument &Arg, ArrayRef<Type *> ReplacementTypes, + ArgumentReplacementInfo::CalleeRepairCBTy &&CalleeRepairCB, + ArgumentReplacementInfo::ACSRepairCBTy &&ACSRepairCB) { - // Every function can be nounwind. - registerAA(*new AANoUnwindFunction(F, InfoCache)); + auto CallSiteCanBeChanged = [](AbstractCallSite ACS) { + // Forbid must-tail calls for now. + return !ACS.isCallbackCall() && !ACS.getCallSite().isMustTailCall(); + }; - // Every function might be marked "nosync" - registerAA(*new AANoSyncFunction(F, InfoCache)); + Function *Fn = Arg.getParent(); + // Avoid var-arg functions for now. + if (Fn->isVarArg()) { + LLVM_DEBUG(dbgs() << "[Attributor] Cannot rewrite var-args functions\n"); + return false; + } - // Every function might be "no-free". - registerAA(*new AANoFreeFunction(F, InfoCache)); + // Avoid functions with complicated argument passing semantics. + AttributeList FnAttributeList = Fn->getAttributes(); + if (FnAttributeList.hasAttrSomewhere(Attribute::Nest) || + FnAttributeList.hasAttrSomewhere(Attribute::StructRet) || + FnAttributeList.hasAttrSomewhere(Attribute::InAlloca)) { + LLVM_DEBUG( + dbgs() << "[Attributor] Cannot rewrite due to complex attribute\n"); + return false; + } - // Return attributes are only appropriate if the return type is non void. - Type *ReturnType = F.getReturnType(); - if (!ReturnType->isVoidTy()) { - // Argument attribute "returned" --- Create only one per function even - // though it is an argument attribute. - if (!Whitelist || Whitelist->count(AAReturnedValues::ID)) - registerAA(*new AAReturnedValuesImpl(F, InfoCache)); + // Avoid callbacks for now. 
+ if (!checkForAllCallSites(CallSiteCanBeChanged, *Fn, true, nullptr)) { + LLVM_DEBUG(dbgs() << "[Attributor] Cannot rewrite all call sites\n"); + return false; + } + + auto InstPred = [](Instruction &I) { + if (auto *CI = dyn_cast<CallInst>(&I)) + return !CI->isMustTailCall(); + return true; + }; - // Every function with pointer return type might be marked nonnull. - if (ReturnType->isPointerTy() && - (!Whitelist || Whitelist->count(AANonNullReturned::ID))) - registerAA(*new AANonNullReturned(F, InfoCache)); + // Forbid must-tail calls for now. + // TODO: + bool AnyDead; + auto &OpcodeInstMap = InfoCache.getOpcodeInstMapForFunction(*Fn); + if (!checkForAllInstructionsImpl(OpcodeInstMap, InstPred, nullptr, AnyDead, + {Instruction::Call})) { + LLVM_DEBUG(dbgs() << "[Attributor] Cannot rewrite due to instructions\n"); + return false; } - // Every argument with pointer type might be marked nonnull. - for (Argument &Arg : F.args()) { - if (Arg.getType()->isPointerTy()) - registerAA(*new AANonNullArgument(Arg, InfoCache)); + SmallVectorImpl<ArgumentReplacementInfo *> &ARIs = ArgumentReplacementMap[Fn]; + if (ARIs.size() == 0) + ARIs.resize(Fn->arg_size()); + + // If we have a replacement already with less than or equal new arguments, + // ignore this request. + ArgumentReplacementInfo *&ARI = ARIs[Arg.getArgNo()]; + if (ARI && ARI->getNumReplacementArgs() <= ReplacementTypes.size()) { + LLVM_DEBUG(dbgs() << "[Attributor] Existing rewrite is preferred\n"); + return false; } - // Every function might be "will-return". - registerAA(*new AAWillReturnFunction(F, InfoCache)); + // If we have a replacement already but we like the new one better, delete + // the old. + if (ARI) + delete ARI; + + // Remember the replacement. + ARI = new ArgumentReplacementInfo(*this, Arg, ReplacementTypes, + std::move(CalleeRepairCB), + std::move(ACSRepairCB)); + + return true; +} + +ChangeStatus Attributor::rewriteFunctionSignatures() { + ChangeStatus Changed = ChangeStatus::UNCHANGED; + + for (auto &It : ArgumentReplacementMap) { + Function *OldFn = It.getFirst(); + + // Deleted functions do not require rewrites. + if (ToBeDeletedFunctions.count(OldFn)) + continue; + + const SmallVectorImpl<ArgumentReplacementInfo *> &ARIs = It.getSecond(); + assert(ARIs.size() == OldFn->arg_size() && "Inconsistent state!"); + + SmallVector<Type *, 16> NewArgumentTypes; + SmallVector<AttributeSet, 16> NewArgumentAttributes; + + // Collect replacement argument types and copy over existing attributes. + AttributeList OldFnAttributeList = OldFn->getAttributes(); + for (Argument &Arg : OldFn->args()) { + if (ArgumentReplacementInfo *ARI = ARIs[Arg.getArgNo()]) { + NewArgumentTypes.append(ARI->ReplacementTypes.begin(), + ARI->ReplacementTypes.end()); + NewArgumentAttributes.append(ARI->getNumReplacementArgs(), + AttributeSet()); + } else { + NewArgumentTypes.push_back(Arg.getType()); + NewArgumentAttributes.push_back( + OldFnAttributeList.getParamAttributes(Arg.getArgNo())); + } + } + + FunctionType *OldFnTy = OldFn->getFunctionType(); + Type *RetTy = OldFnTy->getReturnType(); + + // Construct the new function type using the new arguments types. + FunctionType *NewFnTy = + FunctionType::get(RetTy, NewArgumentTypes, OldFnTy->isVarArg()); + + LLVM_DEBUG(dbgs() << "[Attributor] Function rewrite '" << OldFn->getName() + << "' from " << *OldFn->getFunctionType() << " to " + << *NewFnTy << "\n"); + + // Create the new function body and insert it into the module. 
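// Illustrative sketch, not part of the patch: building the kind of expanded
// parameter list the rewrite above assembles from ReplacementTypes, here
// splitting every pointer parameter into two hypothetical i32 pieces.
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/DerivedTypes.h"

static llvm::FunctionType *expandPointerParams(llvm::LLVMContext &Ctx,
                                               llvm::FunctionType *OldTy) {
  using namespace llvm;
  SmallVector<Type *, 8> Params;
  for (Type *P : OldTy->params()) {
    if (P->isPointerTy()) {
      Params.push_back(Type::getInt32Ty(Ctx)); // invented replacement types
      Params.push_back(Type::getInt32Ty(Ctx));
    } else
      Params.push_back(P);
  }
  return FunctionType::get(OldTy->getReturnType(), Params, OldTy->isVarArg());
}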
+ Function *NewFn = Function::Create(NewFnTy, OldFn->getLinkage(), + OldFn->getAddressSpace(), ""); + OldFn->getParent()->getFunctionList().insert(OldFn->getIterator(), NewFn); + NewFn->takeName(OldFn); + NewFn->copyAttributesFrom(OldFn); + + // Patch the pointer to LLVM function in debug info descriptor. + NewFn->setSubprogram(OldFn->getSubprogram()); + OldFn->setSubprogram(nullptr); + + // Recompute the parameter attributes list based on the new arguments for + // the function. + LLVMContext &Ctx = OldFn->getContext(); + NewFn->setAttributes(AttributeList::get( + Ctx, OldFnAttributeList.getFnAttributes(), + OldFnAttributeList.getRetAttributes(), NewArgumentAttributes)); + + // Since we have now created the new function, splice the body of the old + // function right into the new function, leaving the old rotting hulk of the + // function empty. + NewFn->getBasicBlockList().splice(NewFn->begin(), + OldFn->getBasicBlockList()); + + // Set of all "call-like" instructions that invoke the old function mapped + // to their new replacements. + SmallVector<std::pair<CallBase *, CallBase *>, 8> CallSitePairs; + + // Callback to create a new "call-like" instruction for a given one. + auto CallSiteReplacementCreator = [&](AbstractCallSite ACS) { + CallBase *OldCB = cast<CallBase>(ACS.getInstruction()); + const AttributeList &OldCallAttributeList = OldCB->getAttributes(); + + // Collect the new argument operands for the replacement call site. + SmallVector<Value *, 16> NewArgOperands; + SmallVector<AttributeSet, 16> NewArgOperandAttributes; + for (unsigned OldArgNum = 0; OldArgNum < ARIs.size(); ++OldArgNum) { + unsigned NewFirstArgNum = NewArgOperands.size(); + (void)NewFirstArgNum; // only used inside assert. + if (ArgumentReplacementInfo *ARI = ARIs[OldArgNum]) { + if (ARI->ACSRepairCB) + ARI->ACSRepairCB(*ARI, ACS, NewArgOperands); + assert(ARI->getNumReplacementArgs() + NewFirstArgNum == + NewArgOperands.size() && + "ACS repair callback did not provide as many operand as new " + "types were registered!"); + // TODO: Exose the attribute set to the ACS repair callback + NewArgOperandAttributes.append(ARI->ReplacementTypes.size(), + AttributeSet()); + } else { + NewArgOperands.push_back(ACS.getCallArgOperand(OldArgNum)); + NewArgOperandAttributes.push_back( + OldCallAttributeList.getParamAttributes(OldArgNum)); + } + } + + assert(NewArgOperands.size() == NewArgOperandAttributes.size() && + "Mismatch # argument operands vs. # argument operand attributes!"); + assert(NewArgOperands.size() == NewFn->arg_size() && + "Mismatch # argument operands vs. # function arguments!"); + + SmallVector<OperandBundleDef, 4> OperandBundleDefs; + OldCB->getOperandBundlesAsDefs(OperandBundleDefs); + + // Create a new call or invoke instruction to replace the old one. + CallBase *NewCB; + if (InvokeInst *II = dyn_cast<InvokeInst>(OldCB)) { + NewCB = + InvokeInst::Create(NewFn, II->getNormalDest(), II->getUnwindDest(), + NewArgOperands, OperandBundleDefs, "", OldCB); + } else { + auto *NewCI = CallInst::Create(NewFn, NewArgOperands, OperandBundleDefs, + "", OldCB); + NewCI->setTailCallKind(cast<CallInst>(OldCB)->getTailCallKind()); + NewCB = NewCI; + } + + // Copy over various properties and the new attributes. 
+ uint64_t W; + if (OldCB->extractProfTotalWeight(W)) + NewCB->setProfWeight(W); + NewCB->setCallingConv(OldCB->getCallingConv()); + NewCB->setDebugLoc(OldCB->getDebugLoc()); + NewCB->takeName(OldCB); + NewCB->setAttributes(AttributeList::get( + Ctx, OldCallAttributeList.getFnAttributes(), + OldCallAttributeList.getRetAttributes(), NewArgOperandAttributes)); + + CallSitePairs.push_back({OldCB, NewCB}); + return true; + }; + + // Use the CallSiteReplacementCreator to create replacement call sites. + bool Success = + checkForAllCallSites(CallSiteReplacementCreator, *OldFn, true, nullptr); + (void)Success; + assert(Success && "Assumed call site replacement to succeed!"); + + // Rewire the arguments. + auto OldFnArgIt = OldFn->arg_begin(); + auto NewFnArgIt = NewFn->arg_begin(); + for (unsigned OldArgNum = 0; OldArgNum < ARIs.size(); + ++OldArgNum, ++OldFnArgIt) { + if (ArgumentReplacementInfo *ARI = ARIs[OldArgNum]) { + if (ARI->CalleeRepairCB) + ARI->CalleeRepairCB(*ARI, *NewFn, NewFnArgIt); + NewFnArgIt += ARI->ReplacementTypes.size(); + } else { + NewFnArgIt->takeName(&*OldFnArgIt); + OldFnArgIt->replaceAllUsesWith(&*NewFnArgIt); + ++NewFnArgIt; + } + } + + // Eliminate the instructions *after* we visited all of them. + for (auto &CallSitePair : CallSitePairs) { + CallBase &OldCB = *CallSitePair.first; + CallBase &NewCB = *CallSitePair.second; + OldCB.replaceAllUsesWith(&NewCB); + OldCB.eraseFromParent(); + } + + ToBeDeletedFunctions.insert(OldFn); + + Changed = ChangeStatus::CHANGED; + } + + return Changed; +} - // Walk all instructions to find more attribute opportunities and also - // interesting instructions that might be queried by abstract attributes - // during their initialization or update. +void Attributor::initializeInformationCache(Function &F) { + + // Walk all instructions to find interesting instructions that might be + // queried by abstract attributes during their initialization or update. + // This has to happen before we create attributes. auto &ReadOrWriteInsts = InfoCache.FuncRWInstsMap[&F]; auto &InstOpcodeMap = InfoCache.FuncInstOpcodeMap[&F]; @@ -1540,13 +6441,20 @@ void Attributor::identifyDefaultAbstractAttributes( default: assert((!ImmutableCallSite(&I)) && (!isa<CallBase>(&I)) && "New call site/base instruction type needs to be known int the " - "attributor."); + "Attributor."); break; + case Instruction::Load: + // The alignment of a pointer is interesting for loads. + case Instruction::Store: + // The alignment of a pointer is interesting for stores. 
case Instruction::Call: case Instruction::CallBr: case Instruction::Invoke: case Instruction::CleanupRet: case Instruction::CatchSwitch: + case Instruction::AtomicRMW: + case Instruction::AtomicCmpXchg: + case Instruction::Br: case Instruction::Resume: case Instruction::Ret: IsInterestingOpcode = true; @@ -1555,18 +6463,209 @@ void Attributor::identifyDefaultAbstractAttributes( InstOpcodeMap[I.getOpcode()].push_back(&I); if (I.mayReadOrWriteMemory()) ReadOrWriteInsts.push_back(&I); + } +} + +void Attributor::recordDependence(const AbstractAttribute &FromAA, + const AbstractAttribute &ToAA, + DepClassTy DepClass) { + if (FromAA.getState().isAtFixpoint()) + return; + + if (DepClass == DepClassTy::REQUIRED) + QueryMap[&FromAA].RequiredAAs.insert( + const_cast<AbstractAttribute *>(&ToAA)); + else + QueryMap[&FromAA].OptionalAAs.insert( + const_cast<AbstractAttribute *>(&ToAA)); + QueriedNonFixAA = true; +} + +void Attributor::identifyDefaultAbstractAttributes(Function &F) { + if (!VisitedFunctions.insert(&F).second) + return; + if (F.isDeclaration()) + return; + + IRPosition FPos = IRPosition::function(F); + // Check for dead BasicBlocks in every function. + // We need dead instruction detection because we do not want to deal with + // broken IR in which SSA rules do not apply. + getOrCreateAAFor<AAIsDead>(FPos); + + // Every function might be "will-return". + getOrCreateAAFor<AAWillReturn>(FPos); + + // Every function might contain instructions that cause "undefined behavior". + getOrCreateAAFor<AAUndefinedBehavior>(FPos); + + // Every function can be nounwind. + getOrCreateAAFor<AANoUnwind>(FPos); + + // Every function might be marked "nosync" + getOrCreateAAFor<AANoSync>(FPos); + + // Every function might be "no-free". + getOrCreateAAFor<AANoFree>(FPos); + + // Every function might be "no-return". + getOrCreateAAFor<AANoReturn>(FPos); + + // Every function might be "no-recurse". + getOrCreateAAFor<AANoRecurse>(FPos); + + // Every function might be "readnone/readonly/writeonly/...". + getOrCreateAAFor<AAMemoryBehavior>(FPos); + + // Every function might be applicable for Heap-To-Stack conversion. + if (EnableHeapToStack) + getOrCreateAAFor<AAHeapToStack>(FPos); + + // Return attributes are only appropriate if the return type is non void. + Type *ReturnType = F.getReturnType(); + if (!ReturnType->isVoidTy()) { + // Argument attribute "returned" --- Create only one per function even + // though it is an argument attribute. + getOrCreateAAFor<AAReturnedValues>(FPos); + + IRPosition RetPos = IRPosition::returned(F); + + // Every returned value might be dead. + getOrCreateAAFor<AAIsDead>(RetPos); + + // Every function might be simplified. + getOrCreateAAFor<AAValueSimplify>(RetPos); + + if (ReturnType->isPointerTy()) { + + // Every function with pointer return type might be marked align. + getOrCreateAAFor<AAAlign>(RetPos); + + // Every function with pointer return type might be marked nonnull. + getOrCreateAAFor<AANonNull>(RetPos); + + // Every function with pointer return type might be marked noalias. + getOrCreateAAFor<AANoAlias>(RetPos); + + // Every function with pointer return type might be marked + // dereferenceable. + getOrCreateAAFor<AADereferenceable>(RetPos); + } + } + + for (Argument &Arg : F.args()) { + IRPosition ArgPos = IRPosition::argument(Arg); + + // Every argument might be simplified. + getOrCreateAAFor<AAValueSimplify>(ArgPos); + + if (Arg.getType()->isPointerTy()) { + // Every argument with pointer type might be marked nonnull. 
+ getOrCreateAAFor<AANonNull>(ArgPos); + + // Every argument with pointer type might be marked noalias. + getOrCreateAAFor<AANoAlias>(ArgPos); + + // Every argument with pointer type might be marked dereferenceable. + getOrCreateAAFor<AADereferenceable>(ArgPos); + + // Every argument with pointer type might be marked align. + getOrCreateAAFor<AAAlign>(ArgPos); + + // Every argument with pointer type might be marked nocapture. + getOrCreateAAFor<AANoCapture>(ArgPos); + + // Every argument with pointer type might be marked + // "readnone/readonly/writeonly/..." + getOrCreateAAFor<AAMemoryBehavior>(ArgPos); + + // Every argument with pointer type might be marked nofree. + getOrCreateAAFor<AANoFree>(ArgPos); + } + } + + auto CallSitePred = [&](Instruction &I) -> bool { CallSite CS(&I); - if (CS && CS.getCalledFunction()) { - for (int i = 0, e = CS.getCalledFunction()->arg_size(); i < e; i++) { + if (Function *Callee = CS.getCalledFunction()) { + // Skip declerations except if annotations on their call sites were + // explicitly requested. + if (!AnnotateDeclarationCallSites && Callee->isDeclaration() && + !Callee->hasMetadata(LLVMContext::MD_callback)) + return true; + + if (!Callee->getReturnType()->isVoidTy() && !CS->use_empty()) { + + IRPosition CSRetPos = IRPosition::callsite_returned(CS); + + // Call site return values might be dead. + getOrCreateAAFor<AAIsDead>(CSRetPos); + + // Call site return integer values might be limited by a constant range. + if (Callee->getReturnType()->isIntegerTy()) { + getOrCreateAAFor<AAValueConstantRange>(CSRetPos); + } + } + + for (int i = 0, e = CS.getNumArgOperands(); i < e; i++) { + + IRPosition CSArgPos = IRPosition::callsite_argument(CS, i); + + // Every call site argument might be dead. + getOrCreateAAFor<AAIsDead>(CSArgPos); + + // Call site argument might be simplified. + getOrCreateAAFor<AAValueSimplify>(CSArgPos); + if (!CS.getArgument(i)->getType()->isPointerTy()) continue; // Call site argument attribute "non-null". - registerAA(*new AANonNullCallSiteArgument(CS, i, InfoCache), i); + getOrCreateAAFor<AANonNull>(CSArgPos); + + // Call site argument attribute "no-alias". + getOrCreateAAFor<AANoAlias>(CSArgPos); + + // Call site argument attribute "dereferenceable". + getOrCreateAAFor<AADereferenceable>(CSArgPos); + + // Call site argument attribute "align". + getOrCreateAAFor<AAAlign>(CSArgPos); + + // Call site argument attribute + // "readnone/readonly/writeonly/..." + getOrCreateAAFor<AAMemoryBehavior>(CSArgPos); + + // Call site argument attribute "nofree". 
+ getOrCreateAAFor<AANoFree>(CSArgPos); } } - } + return true; + }; + + auto &OpcodeInstMap = InfoCache.getOpcodeInstMapForFunction(F); + bool Success, AnyDead = false; + Success = checkForAllInstructionsImpl( + OpcodeInstMap, CallSitePred, nullptr, AnyDead, + {(unsigned)Instruction::Invoke, (unsigned)Instruction::CallBr, + (unsigned)Instruction::Call}); + (void)Success; + assert(Success && !AnyDead && "Expected the check call to be successful!"); + + auto LoadStorePred = [&](Instruction &I) -> bool { + if (isa<LoadInst>(I)) + getOrCreateAAFor<AAAlign>( + IRPosition::value(*cast<LoadInst>(I).getPointerOperand())); + else + getOrCreateAAFor<AAAlign>( + IRPosition::value(*cast<StoreInst>(I).getPointerOperand())); + return true; + }; + Success = checkForAllInstructionsImpl( + OpcodeInstMap, LoadStorePred, nullptr, AnyDead, + {(unsigned)Instruction::Load, (unsigned)Instruction::Store}); + (void)Success; + assert(Success && !AnyDead && "Expected the check call to be successful!"); } /// Helpers to ease debugging through output streams and print calls. @@ -1576,21 +6675,52 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, ChangeStatus S) { return OS << (S == ChangeStatus::CHANGED ? "changed" : "unchanged"); } -raw_ostream &llvm::operator<<(raw_ostream &OS, - AbstractAttribute::ManifestPosition AP) { +raw_ostream &llvm::operator<<(raw_ostream &OS, IRPosition::Kind AP) { switch (AP) { - case AbstractAttribute::MP_ARGUMENT: + case IRPosition::IRP_INVALID: + return OS << "inv"; + case IRPosition::IRP_FLOAT: + return OS << "flt"; + case IRPosition::IRP_RETURNED: + return OS << "fn_ret"; + case IRPosition::IRP_CALL_SITE_RETURNED: + return OS << "cs_ret"; + case IRPosition::IRP_FUNCTION: + return OS << "fn"; + case IRPosition::IRP_CALL_SITE: + return OS << "cs"; + case IRPosition::IRP_ARGUMENT: return OS << "arg"; - case AbstractAttribute::MP_CALL_SITE_ARGUMENT: + case IRPosition::IRP_CALL_SITE_ARGUMENT: return OS << "cs_arg"; - case AbstractAttribute::MP_FUNCTION: - return OS << "fn"; - case AbstractAttribute::MP_RETURNED: - return OS << "fn_ret"; } llvm_unreachable("Unknown attribute position!"); } +raw_ostream &llvm::operator<<(raw_ostream &OS, const IRPosition &Pos) { + const Value &AV = Pos.getAssociatedValue(); + return OS << "{" << Pos.getPositionKind() << ":" << AV.getName() << " [" + << Pos.getAnchorValue().getName() << "@" << Pos.getArgNo() << "]}"; +} + +template <typename base_ty, base_ty BestState, base_ty WorstState> +raw_ostream & +llvm::operator<<(raw_ostream &OS, + const IntegerStateBase<base_ty, BestState, WorstState> &S) { + return OS << "(" << S.getKnown() << "-" << S.getAssumed() << ")" + << static_cast<const AbstractState &>(S); +} + +raw_ostream &llvm::operator<<(raw_ostream &OS, const IntegerRangeState &S) { + OS << "range-state(" << S.getBitWidth() << ")<"; + S.getKnown().print(OS); + OS << " / "; + S.getAssumed().print(OS); + OS << ">"; + + return OS << static_cast<const AbstractState &>(S); +} + raw_ostream &llvm::operator<<(raw_ostream &OS, const AbstractState &S) { return OS << (!S.isValidState() ? "top" : (S.isAtFixpoint() ? 
"fix" : "")); } @@ -1601,8 +6731,8 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const AbstractAttribute &AA) { } void AbstractAttribute::print(raw_ostream &OS) const { - OS << "[" << getManifestPosition() << "][" << getAsStr() << "][" - << AnchoredVal.getName() << "]"; + OS << "[P: " << getIRPosition() << "][" << getAsStr() << "][S: " << getState() + << "]"; } ///} @@ -1610,7 +6740,7 @@ void AbstractAttribute::print(raw_ostream &OS) const { /// Pass (Manager) Boilerplate /// ---------------------------------------------------------------------------- -static bool runAttributorOnModule(Module &M) { +static bool runAttributorOnModule(Module &M, AnalysisGetter &AG) { if (DisableAttributor) return false; @@ -1619,39 +6749,41 @@ static bool runAttributorOnModule(Module &M) { // Create an Attributor and initially empty information cache that is filled // while we identify default attribute opportunities. - Attributor A; - InformationCache InfoCache; + InformationCache InfoCache(M, AG); + Attributor A(InfoCache, DepRecInterval); + + for (Function &F : M) + A.initializeInformationCache(F); for (Function &F : M) { - // TODO: Not all attributes require an exact definition. Find a way to - // enable deduction for some but not all attributes in case the - // definition might be changed at runtime, see also - // http://lists.llvm.org/pipermail/llvm-dev/2018-February/121275.html. - // TODO: We could always determine abstract attributes and if sufficient - // information was found we could duplicate the functions that do not - // have an exact definition. - if (!F.hasExactDefinition()) { + if (F.hasExactDefinition()) + NumFnWithExactDefinition++; + else NumFnWithoutExactDefinition++; - continue; - } - - // For now we ignore naked and optnone functions. - if (F.hasFnAttribute(Attribute::Naked) || - F.hasFnAttribute(Attribute::OptimizeNone)) - continue; - NumFnWithExactDefinition++; + // We look at internal functions only on-demand but if any use is not a + // direct call, we have to do it eagerly. + if (F.hasLocalLinkage()) { + if (llvm::all_of(F.uses(), [](const Use &U) { + return ImmutableCallSite(U.getUser()) && + ImmutableCallSite(U.getUser()).isCallee(&U); + })) + continue; + } // Populate the Attributor with abstract attribute opportunities in the // function and the information cache with IR information. - A.identifyDefaultAbstractAttributes(F, InfoCache); + A.identifyDefaultAbstractAttributes(F); } - return A.run() == ChangeStatus::CHANGED; + bool Changed = A.run(M) == ChangeStatus::CHANGED; + assert(!verifyModule(M, &errs()) && "Module verification failed!"); + return Changed; } PreservedAnalyses AttributorPass::run(Module &M, ModuleAnalysisManager &AM) { - if (runAttributorOnModule(M)) { + AnalysisGetter AG(AM); + if (runAttributorOnModule(M, AG)) { // FIXME: Think about passes we will preserve and add them here. return PreservedAnalyses::none(); } @@ -1670,12 +6802,14 @@ struct AttributorLegacyPass : public ModulePass { bool runOnModule(Module &M) override { if (skipModule(M)) return false; - return runAttributorOnModule(M); + + AnalysisGetter AG; + return runAttributorOnModule(M, AG); } void getAnalysisUsage(AnalysisUsage &AU) const override { // FIXME: Think about passes we will preserve and add them here. 
- AU.setPreservesCFG(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } }; @@ -1684,7 +6818,153 @@ struct AttributorLegacyPass : public ModulePass { Pass *llvm::createAttributorLegacyPass() { return new AttributorLegacyPass(); } char AttributorLegacyPass::ID = 0; + +const char AAReturnedValues::ID = 0; +const char AANoUnwind::ID = 0; +const char AANoSync::ID = 0; +const char AANoFree::ID = 0; +const char AANonNull::ID = 0; +const char AANoRecurse::ID = 0; +const char AAWillReturn::ID = 0; +const char AAUndefinedBehavior::ID = 0; +const char AANoAlias::ID = 0; +const char AAReachability::ID = 0; +const char AANoReturn::ID = 0; +const char AAIsDead::ID = 0; +const char AADereferenceable::ID = 0; +const char AAAlign::ID = 0; +const char AANoCapture::ID = 0; +const char AAValueSimplify::ID = 0; +const char AAHeapToStack::ID = 0; +const char AAMemoryBehavior::ID = 0; +const char AAValueConstantRange::ID = 0; + +// Macro magic to create the static generator function for attributes that +// follow the naming scheme. + +#define SWITCH_PK_INV(CLASS, PK, POS_NAME) \ + case IRPosition::PK: \ + llvm_unreachable("Cannot create " #CLASS " for a " POS_NAME " position!"); + +#define SWITCH_PK_CREATE(CLASS, IRP, PK, SUFFIX) \ + case IRPosition::PK: \ + AA = new CLASS##SUFFIX(IRP); \ + break; + +#define CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \ + CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \ + CLASS *AA = nullptr; \ + switch (IRP.getPositionKind()) { \ + SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \ + SWITCH_PK_INV(CLASS, IRP_FLOAT, "floating") \ + SWITCH_PK_INV(CLASS, IRP_ARGUMENT, "argument") \ + SWITCH_PK_INV(CLASS, IRP_RETURNED, "returned") \ + SWITCH_PK_INV(CLASS, IRP_CALL_SITE_RETURNED, "call site returned") \ + SWITCH_PK_INV(CLASS, IRP_CALL_SITE_ARGUMENT, "call site argument") \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE, CallSite) \ + } \ + return *AA; \ + } + +#define CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \ + CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \ + CLASS *AA = nullptr; \ + switch (IRP.getPositionKind()) { \ + SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \ + SWITCH_PK_INV(CLASS, IRP_FUNCTION, "function") \ + SWITCH_PK_INV(CLASS, IRP_CALL_SITE, "call site") \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_FLOAT, Floating) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_ARGUMENT, Argument) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_RETURNED, Returned) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_RETURNED, CallSiteReturned) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_ARGUMENT, CallSiteArgument) \ + } \ + return *AA; \ + } + +#define CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \ + CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \ + CLASS *AA = nullptr; \ + switch (IRP.getPositionKind()) { \ + SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE, CallSite) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_FLOAT, Floating) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_ARGUMENT, Argument) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_RETURNED, Returned) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_RETURNED, CallSiteReturned) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_ARGUMENT, CallSiteArgument) \ + } \ + return *AA; \ + } + +#define CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \ + CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \ + CLASS 
*AA = nullptr; \ + switch (IRP.getPositionKind()) { \ + SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \ + SWITCH_PK_INV(CLASS, IRP_ARGUMENT, "argument") \ + SWITCH_PK_INV(CLASS, IRP_FLOAT, "floating") \ + SWITCH_PK_INV(CLASS, IRP_RETURNED, "returned") \ + SWITCH_PK_INV(CLASS, IRP_CALL_SITE_RETURNED, "call site returned") \ + SWITCH_PK_INV(CLASS, IRP_CALL_SITE_ARGUMENT, "call site argument") \ + SWITCH_PK_INV(CLASS, IRP_CALL_SITE, "call site") \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \ + } \ + return *AA; \ + } + +#define CREATE_NON_RET_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \ + CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \ + CLASS *AA = nullptr; \ + switch (IRP.getPositionKind()) { \ + SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \ + SWITCH_PK_INV(CLASS, IRP_RETURNED, "returned") \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE, CallSite) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_FLOAT, Floating) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_ARGUMENT, Argument) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_RETURNED, CallSiteReturned) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_ARGUMENT, CallSiteArgument) \ + } \ + return *AA; \ + } + +CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoUnwind) +CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoSync) +CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoRecurse) +CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAWillReturn) +CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoReturn) +CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAReturnedValues) + +CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANonNull) +CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoAlias) +CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AADereferenceable) +CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAAlign) +CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoCapture) +CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAValueConstantRange) + +CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAValueSimplify) +CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAIsDead) +CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoFree) + +CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAHeapToStack) +CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAReachability) +CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAUndefinedBehavior) + +CREATE_NON_RET_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAMemoryBehavior) + +#undef CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION +#undef CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION +#undef CREATE_NON_RET_ABSTRACT_ATTRIBUTE_FOR_POSITION +#undef CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION +#undef CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION +#undef SWITCH_PK_CREATE +#undef SWITCH_PK_INV + INITIALIZE_PASS_BEGIN(AttributorLegacyPass, "attributor", "Deduce and propagate attributes", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(AttributorLegacyPass, "attributor", "Deduce and propagate attributes", false, false) diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/BarrierNoopPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/BarrierNoopPass.cpp index 6b68aa90c567..b49a92ad16b3 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/BarrierNoopPass.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/BarrierNoopPass.cpp @@ -17,6 +17,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/IPO.h" using 
namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/BlockExtractor.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/BlockExtractor.cpp index 6c365f3f3cbe..aec470ffadc4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/BlockExtractor.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/BlockExtractor.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -119,6 +120,8 @@ void BlockExtractor::loadFile() { /*KeepEmpty=*/false); if (LineSplit.empty()) continue; + if (LineSplit.size()!=2) + report_fatal_error("Invalid line format, expecting lines like: 'funcname bb1[;bb2..]'"); SmallVector<StringRef, 4> BBNames; LineSplit[1].split(BBNames, ';', /*MaxSplit=*/-1, /*KeepEmpty=*/false); @@ -204,7 +207,8 @@ bool BlockExtractor::runOnModule(Module &M) { ++NumExtracted; Changed = true; } - Function *F = CodeExtractor(BlocksToExtractVec).extractCodeRegion(); + CodeExtractorAnalysisCache CEAC(*BBs[0]->getParent()); + Function *F = CodeExtractor(BlocksToExtractVec).extractCodeRegion(CEAC); if (F) LLVM_DEBUG(dbgs() << "Extracted group '" << (*BBs.begin())->getName() << "' in: " << F->getName() << '\n'); diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/CalledValuePropagation.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/CalledValuePropagation.cpp index 20cb3213628e..f28a399b1779 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/CalledValuePropagation.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/CalledValuePropagation.cpp @@ -21,6 +21,8 @@ #include "llvm/Analysis/ValueLatticeUtils.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/IPO.h" using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/ConstantMerge.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/ConstantMerge.cpp index ad877ae1786c..ea1278aa108f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/ConstantMerge.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/ConstantMerge.cpp @@ -28,6 +28,7 @@ #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Transforms/IPO.h" @@ -48,7 +49,7 @@ static void FindUsedValues(GlobalVariable *LLVMUsed, ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer()); for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i) { - Value *Operand = Inits->getOperand(i)->stripPointerCastsNoFollowAliases(); + Value *Operand = Inits->getOperand(i)->stripPointerCasts(); GlobalValue *GV = cast<GlobalValue>(Operand); UsedValues.insert(GV); } @@ -120,7 +121,7 @@ static void replace(Module &M, GlobalVariable *Old, GlobalVariable *New) { // Bump the alignment if necessary. 
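// Illustrative sketch, not part of the patch: setAlignment() now takes the
// llvm::Align wrapper, which must hold a non-zero power of two, hence the
// guard in the surrounding code before the maximum is wrapped.
#include "llvm/Support/Alignment.h"
#include <algorithm>

static llvm::Align pickLargerAlign(unsigned A, unsigned B) {
  return llvm::Align(std::max(A, B)); // asserts unless the value is 1,2,4,...
}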
if (Old->getAlignment() || New->getAlignment()) - New->setAlignment(std::max(getAlignment(Old), getAlignment(New))); + New->setAlignment(Align(std::max(getAlignment(Old), getAlignment(New)))); copyDebugLocMetadata(Old, New); Old->replaceAllUsesWith(NewConstant); diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp index e30b33aa4872..2fe9a59ad210 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp @@ -26,6 +26,7 @@ #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -84,13 +85,9 @@ void CrossDSOCFI::buildCFICheck(Module &M) { for (GlobalObject &GO : M.global_objects()) { Types.clear(); GO.getMetadata(LLVMContext::MD_type, Types); - for (MDNode *Type : Types) { - // Sanity check. GO must not be a function declaration. - assert(!isa<Function>(&GO) || !cast<Function>(&GO)->isDeclaration()); - + for (MDNode *Type : Types) if (ConstantInt *TypeId = extractNumericTypeId(Type)) TypeIds.insert(TypeId->getZExtValue()); - } } NamedMDNode *CfiFunctionsMD = M.getNamedMetadata("cfi.functions"); @@ -108,11 +105,11 @@ void CrossDSOCFI::buildCFICheck(Module &M) { FunctionCallee C = M.getOrInsertFunction( "__cfi_check", Type::getVoidTy(Ctx), Type::getInt64Ty(Ctx), Type::getInt8PtrTy(Ctx), Type::getInt8PtrTy(Ctx)); - Function *F = dyn_cast<Function>(C.getCallee()); + Function *F = cast<Function>(C.getCallee()); // Take over the existing function. The frontend emits a weak stub so that the // linker knows about the symbol; this pass replaces the function body. 
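The dyn_cast<Function> to cast<Function> switch just above is a fix rather than style: the result feeds straight into the deleteBody() call that follows, so a null return could only crash later anyway, and cast<> turns that latent null dereference into an immediate assertion. A hedged sketch of the takeover idiom (takeOverCfiCheck is a hypothetical name):

#include "llvm/IR/Module.h"
#include "llvm/Support/Alignment.h"
using namespace llvm;

// Sketch: take over a weak stub the frontend emitted, so the linker already
// knows the symbol. cast<> asserts that the callee really is a Function.
static Function *takeOverCfiCheck(Module &M) {
  LLVMContext &Ctx = M.getContext();
  FunctionCallee C = M.getOrInsertFunction(
      "__cfi_check", Type::getVoidTy(Ctx), Type::getInt64Ty(Ctx),
      Type::getInt8PtrTy(Ctx), Type::getInt8PtrTy(Ctx));
  Function *F = cast<Function>(C.getCallee());
  F->deleteBody();              // drop the stub body before rebuilding it
  F->setAlignment(Align(4096)); // matches the alignment set in the hunk below
  return F;
}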
F->deleteBody(); - F->setAlignment(4096); + F->setAlignment(Align(4096)); Triple T(M.getTargetTriple()); if (T.isARM() || T.isThumb()) diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp index 968a13110b16..61d519d8ae88 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -37,6 +37,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp index fc52db562c62..7f138d206fac 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp @@ -18,6 +18,7 @@ #include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Utils/GlobalStatus.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp index b38cb6d0ed3f..2cb184e8d4f4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp @@ -11,6 +11,8 @@ #include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index 5ccd8bc4b0fb..b6d0b2e35694 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -48,6 +48,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -78,11 +79,8 @@ STATISTIC(NumNoRecurse, "Number of functions marked as norecurse"); STATISTIC(NumNoUnwind, "Number of functions marked as nounwind"); STATISTIC(NumNoFree, "Number of functions marked as nofree"); -// FIXME: This is disabled by default to avoid exposing security vulnerabilities -// in C/C++ code compiled by clang: -// http://lists.llvm.org/pipermail/cfe-dev/2017-January/052066.html static cl::opt<bool> EnableNonnullArgPropagation( - "enable-nonnull-arg-prop", cl::Hidden, + "enable-nonnull-arg-prop", cl::init(true), cl::Hidden, cl::desc("Try to propagate nonnull argument attributes from callsites to " "caller functions.")); @@ -664,6 +662,25 @@ static bool addArgumentAttrsFromCallsites(Function &F) { return Changed; } +static bool addReadAttr(Argument *A, Attribute::AttrKind R) { + assert((R == Attribute::ReadOnly || R == Attribute::ReadNone) + && "Must be a Read attribute."); + assert(A && "Argument must not be null."); + + // If the argument already has the attribute, nothing needs to be done. 
+ if (A->hasAttribute(R)) + return false; + + // Otherwise, remove potentially conflicting attribute, add the new one, + // and update statistics. + A->removeAttr(Attribute::WriteOnly); + A->removeAttr(Attribute::ReadOnly); + A->removeAttr(Attribute::ReadNone); + A->addAttr(R); + R == Attribute::ReadOnly ? ++NumReadOnlyArg : ++NumReadNoneArg; + return true; +} + /// Deduce nocapture attributes for the SCC. static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { bool Changed = false; @@ -732,11 +749,8 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { SmallPtrSet<Argument *, 8> Self; Self.insert(&*A); Attribute::AttrKind R = determinePointerReadAttrs(&*A, Self); - if (R != Attribute::None) { - A->addAttr(R); - Changed = true; - R == Attribute::ReadOnly ? ++NumReadOnlyArg : ++NumReadNoneArg; - } + if (R != Attribute::None) + Changed = addReadAttr(A, R); } } } @@ -833,12 +847,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { if (ReadAttr != Attribute::None) { for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) { Argument *A = ArgumentSCC[i]->Definition; - // Clear out existing readonly/readnone attributes - A->removeAttr(Attribute::ReadOnly); - A->removeAttr(Attribute::ReadNone); - A->addAttr(ReadAttr); - ReadAttr == Attribute::ReadOnly ? ++NumReadOnlyArg : ++NumReadNoneArg; - Changed = true; + Changed = addReadAttr(A, ReadAttr); } } } diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionImport.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionImport.cpp index 62c7fbd07223..be0446a946ec 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionImport.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -31,6 +31,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/ModuleSummaryIndex.h" #include "llvm/IRReader/IRReader.h" +#include "llvm/InitializePasses.h" #include "llvm/Linker/IRMover.h" #include "llvm/Object/ModuleSymbolTable.h" #include "llvm/Object/SymbolicFile.h" @@ -231,7 +232,8 @@ selectCallee(const ModuleSummaryIndex &Index, return false; } - if (Summary->instCount() > Threshold) { + if ((Summary->instCount() > Threshold) && + !Summary->fflags().AlwaysInline) { Reason = FunctionImporter::ImportFailureReason::TooLarge; return false; } @@ -280,7 +282,8 @@ updateValueInfoForIndirectCalls(const ModuleSummaryIndex &Index, ValueInfo VI) { } static void computeImportForReferencedGlobals( - const FunctionSummary &Summary, const GVSummaryMapTy &DefinedGVSummaries, + const FunctionSummary &Summary, const ModuleSummaryIndex &Index, + const GVSummaryMapTy &DefinedGVSummaries, FunctionImporter::ImportMapTy &ImportList, StringMap<FunctionImporter::ExportSetTy> *ExportLists) { for (auto &VI : Summary.refs()) { @@ -303,16 +306,28 @@ static void computeImportForReferencedGlobals( RefSummary->modulePath() != Summary.modulePath(); }; + auto MarkExported = [&](const ValueInfo &VI, const GlobalValueSummary *S) { + if (ExportLists) + (*ExportLists)[S->modulePath()].insert(VI); + }; + for (auto &RefSummary : VI.getSummaryList()) if (isa<GlobalVarSummary>(RefSummary.get()) && - canImportGlobalVar(RefSummary.get()) && + Index.canImportGlobalVar(RefSummary.get(), /* AnalyzeRefs */ true) && !LocalNotInModule(RefSummary.get())) { auto ILI = ImportList[RefSummary->modulePath()].insert(VI.getGUID()); // Only update stat if we haven't already imported this variable. 
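Two FunctionImport changes here work as a pair: selectCallee above now waives the instruction-count threshold for callees whose summary carries the AlwaysInline flag (refusing to import them could leave a mandatory inline site without a definition to inline), and the assert later in computeImportForFunction is relaxed to accept the same exemption. The admission rule, sketched:

#include "llvm/IR/ModuleSummaryIndex.h"

// Sketch: size alone only disqualifies a candidate that is not marked
// always_inline in its summary flags.
static bool rejectedForSize(const llvm::FunctionSummary &S,
                            unsigned Threshold) {
  return S.instCount() > Threshold && !S.fflags().AlwaysInline;
}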
if (ILI.second) NumImportedGlobalVarsThinLink++; - if (ExportLists) - (*ExportLists)[RefSummary->modulePath()].insert(VI.getGUID()); + MarkExported(VI, RefSummary.get()); + // Promote referenced functions and variables. We don't promote + // objects referenced by writeonly variable initializer, because + // we convert such variables initializers to "zeroinitializer". + // See processGlobalForThinLTO. + if (!Index.isWriteOnly(cast<GlobalVarSummary>(RefSummary.get()))) + for (const auto &VI : RefSummary->refs()) + for (const auto &RefFn : VI.getSummaryList()) + MarkExported(VI, RefFn.get()); break; } } @@ -351,8 +366,8 @@ static void computeImportForFunction( FunctionImporter::ImportMapTy &ImportList, StringMap<FunctionImporter::ExportSetTy> *ExportLists, FunctionImporter::ImportThresholdsTy &ImportThresholds) { - computeImportForReferencedGlobals(Summary, DefinedGVSummaries, ImportList, - ExportLists); + computeImportForReferencedGlobals(Summary, Index, DefinedGVSummaries, + ImportList, ExportLists); static int ImportCount = 0; for (auto &Edge : Summary.calls()) { ValueInfo VI = Edge.first; @@ -450,7 +465,7 @@ static void computeImportForFunction( } else if (PrintImportFailures) { assert(!FailureInfo && "Expected no FailureInfo for newly rejected candidate"); - FailureInfo = llvm::make_unique<FunctionImporter::ImportFailureInfo>( + FailureInfo = std::make_unique<FunctionImporter::ImportFailureInfo>( VI, Edge.second.getHotness(), Reason, 1); } LLVM_DEBUG( @@ -462,7 +477,8 @@ static void computeImportForFunction( CalleeSummary = CalleeSummary->getBaseObject(); ResolvedCalleeSummary = cast<FunctionSummary>(CalleeSummary); - assert(ResolvedCalleeSummary->instCount() <= NewThreshold && + assert((ResolvedCalleeSummary->fflags().AlwaysInline || + (ResolvedCalleeSummary->instCount() <= NewThreshold)) && "selectCallee() didn't honor the threshold"); auto ExportModulePath = ResolvedCalleeSummary->modulePath(); @@ -481,7 +497,7 @@ static void computeImportForFunction( // Make exports in the source module. if (ExportLists) { auto &ExportList = (*ExportLists)[ExportModulePath]; - ExportList.insert(VI.getGUID()); + ExportList.insert(VI); if (!PreviouslyImported) { // This is the first time this function was exported from its source // module, so mark all functions and globals it references as exported @@ -489,14 +505,11 @@ static void computeImportForFunction( // For efficiency, we unconditionally add all the referenced GUIDs // to the ExportList for this module, and will prune out any not // defined in the module later in a single pass. - for (auto &Edge : ResolvedCalleeSummary->calls()) { - auto CalleeGUID = Edge.first.getGUID(); - ExportList.insert(CalleeGUID); - } - for (auto &Ref : ResolvedCalleeSummary->refs()) { - auto GUID = Ref.getGUID(); - ExportList.insert(GUID); - } + for (auto &Edge : ResolvedCalleeSummary->calls()) + ExportList.insert(Edge.first); + + for (auto &Ref : ResolvedCalleeSummary->refs()) + ExportList.insert(Ref); } } } @@ -591,29 +604,64 @@ static void ComputeImportForModule( } #ifndef NDEBUG +static bool isGlobalVarSummary(const ModuleSummaryIndex &Index, ValueInfo VI) { + auto SL = VI.getSummaryList(); + return SL.empty() + ? 
false + : SL[0]->getSummaryKind() == GlobalValueSummary::GlobalVarKind; +} + static bool isGlobalVarSummary(const ModuleSummaryIndex &Index, GlobalValue::GUID G) { - if (const auto &VI = Index.getValueInfo(G)) { - auto SL = VI.getSummaryList(); - if (!SL.empty()) - return SL[0]->getSummaryKind() == GlobalValueSummary::GlobalVarKind; - } + if (const auto &VI = Index.getValueInfo(G)) + return isGlobalVarSummary(Index, VI); return false; } -static GlobalValue::GUID getGUID(GlobalValue::GUID G) { return G; } - template <class T> static unsigned numGlobalVarSummaries(const ModuleSummaryIndex &Index, T &Cont) { unsigned NumGVS = 0; for (auto &V : Cont) - if (isGlobalVarSummary(Index, getGUID(V))) + if (isGlobalVarSummary(Index, V)) ++NumGVS; return NumGVS; } #endif +#ifndef NDEBUG +static bool +checkVariableImport(const ModuleSummaryIndex &Index, + StringMap<FunctionImporter::ImportMapTy> &ImportLists, + StringMap<FunctionImporter::ExportSetTy> &ExportLists) { + + DenseSet<GlobalValue::GUID> FlattenedImports; + + for (auto &ImportPerModule : ImportLists) + for (auto &ExportPerModule : ImportPerModule.second) + FlattenedImports.insert(ExportPerModule.second.begin(), + ExportPerModule.second.end()); + + // Checks that all GUIDs of read/writeonly vars we see in export lists + // are also in the import lists. Otherwise we may face linker undefs, + // because readonly and writeonly vars are internalized in their + // source modules. + auto IsReadOrWriteOnlyVar = [&](StringRef ModulePath, const ValueInfo &VI) { + auto *GVS = dyn_cast_or_null<GlobalVarSummary>( + Index.findSummaryInModule(VI, ModulePath)); + return GVS && (Index.isReadOnly(GVS) || Index.isWriteOnly(GVS)); + }; + + for (auto &ExportPerModule : ExportLists) + for (auto &VI : ExportPerModule.second) + if (!FlattenedImports.count(VI.getGUID()) && + IsReadOrWriteOnlyVar(ExportPerModule.first(), VI)) + return false; + + return true; +} +#endif + /// Compute all the import and export for every module using the Index. void llvm::ComputeCrossModuleImport( const ModuleSummaryIndex &Index, @@ -639,13 +687,14 @@ const auto &DefinedGVSummaries = ModuleToDefinedGVSummaries.lookup(ELI.first()); for (auto EI = ELI.second.begin(); EI != ELI.second.end();) { - if (!DefinedGVSummaries.count(*EI)) - EI = ELI.second.erase(EI); + if (!DefinedGVSummaries.count(EI->getGUID())) + ELI.second.erase(EI++); else ++EI; } } + assert(checkVariableImport(Index, ImportLists, ExportLists)); #ifndef NDEBUG LLVM_DEBUG(dbgs() << "Import/Export lists for " << ImportLists.size() << " modules:\n"); @@ -764,7 +813,7 @@ void llvm::computeDeadSymbols( } // Make value live and add it to the worklist if it was not live before. - auto visit = [&](ValueInfo VI) { + auto visit = [&](ValueInfo VI, bool IsAliasee) { // FIXME: If we knew which edges were created for indirect call profiles, // we could skip them here. Any that are live should be reached via // other edges, e.g. reference edges.
Otherwise, using a profile collected @@ -800,12 +849,15 @@ void llvm::computeDeadSymbols( Interposable = true; } - if (!KeepAliveLinkage) - return; + if (!IsAliasee) { + if (!KeepAliveLinkage) + return; - if (Interposable) - report_fatal_error( - "Interposable and available_externally/linkonce_odr/weak_odr symbol"); + if (Interposable) + report_fatal_error( + "Interposable and available_externally/linkonce_odr/weak_odr " + "symbol"); + } } for (auto &S : VI.getSummaryList()) @@ -821,16 +873,16 @@ void llvm::computeDeadSymbols( // If this is an alias, visit the aliasee VI to ensure that all copies // are marked live and it is added to the worklist for further // processing of its references. - visit(AS->getAliaseeVI()); + visit(AS->getAliaseeVI(), true); continue; } Summary->setLive(true); for (auto Ref : Summary->refs()) - visit(Ref); + visit(Ref, false); if (auto *FS = dyn_cast<FunctionSummary>(Summary.get())) for (auto Call : FS->calls()) - visit(Call.first); + visit(Call.first, false); } } Index.setWithGlobalValueDeadStripping(); @@ -849,18 +901,8 @@ void llvm::computeDeadSymbolsWithConstProp( function_ref<PrevailingType(GlobalValue::GUID)> isPrevailing, bool ImportEnabled) { computeDeadSymbols(Index, GUIDPreservedSymbols, isPrevailing); - if (ImportEnabled) { + if (ImportEnabled) Index.propagateAttributes(GUIDPreservedSymbols); - } else { - // If import is disabled we should drop read/write-only attribute - // from all summaries to prevent internalization. - for (auto &P : Index) - for (auto &S : P.second.SummaryList) - if (auto *GVS = dyn_cast<GlobalVarSummary>(S.get())) { - GVS->setReadOnly(false); - GVS->setWriteOnly(false); - } - } } /// Compute the set of summaries needed for a ThinLTO backend compilation of @@ -892,7 +934,7 @@ std::error_code llvm::EmitImportsFiles( StringRef ModulePath, StringRef OutputFilename, const std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex) { std::error_code EC; - raw_fd_ostream ImportsOS(OutputFilename, EC, sys::fs::OpenFlags::F_None); + raw_fd_ostream ImportsOS(OutputFilename, EC, sys::fs::OpenFlags::OF_None); if (EC) return EC; for (auto &ILI : ModuleToSummariesForIndex) @@ -948,23 +990,15 @@ void llvm::thinLTOResolvePrevailingInModule( auto NewLinkage = GS->second->linkage(); if (NewLinkage == GV.getLinkage()) return; - - // Switch the linkage to weakany if asked for, e.g. we do this for - // linker redefined symbols (via --wrap or --defsym). - // We record that the visibility should be changed here in `addThinLTO` - // as we need access to the resolution vectors for each input file in - // order to find which symbols have been redefined. - // We may consider reorganizing this code and moving the linkage recording - // somewhere else, e.g. in thinLTOResolvePrevailingInIndex. - if (NewLinkage == GlobalValue::WeakAnyLinkage) { - GV.setLinkage(NewLinkage); - return; - } - if (GlobalValue::isLocalLinkage(GV.getLinkage()) || + // Don't internalize anything here, because the code below + // lacks necessary correctness checks. Leave this job to + // LLVM 'internalize' pass. + GlobalValue::isLocalLinkage(NewLinkage) || // In case it was dead and already converted to declaration. GV.isDeclaration()) return; + // Check for a non-prevailing def that has interposable linkage // (e.g. non-odr weak or linkonce). 
In that case we can't simply // convert to available_externally, since it would lose the diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalDCE.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalDCE.cpp index 86b7f3e49ee6..72b8d7522f04 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalDCE.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalDCE.cpp @@ -17,10 +17,14 @@ #include "llvm/Transforms/IPO/GlobalDCE.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/TypeMetadataUtils.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Utils/CtorUtils.h" #include "llvm/Transforms/Utils/GlobalStatus.h" @@ -29,10 +33,15 @@ using namespace llvm; #define DEBUG_TYPE "globaldce" +static cl::opt<bool> + ClEnableVFE("enable-vfe", cl::Hidden, cl::init(true), cl::ZeroOrMore, + cl::desc("Enable virtual function elimination")); + STATISTIC(NumAliases , "Number of global aliases removed"); STATISTIC(NumFunctions, "Number of functions removed"); STATISTIC(NumIFuncs, "Number of indirect functions removed"); STATISTIC(NumVariables, "Number of global variables removed"); +STATISTIC(NumVFuncs, "Number of virtual functions removed"); namespace { class GlobalDCELegacyPass : public ModulePass { @@ -118,6 +127,15 @@ void GlobalDCEPass::UpdateGVDependencies(GlobalValue &GV) { ComputeDependencies(User, Deps); Deps.erase(&GV); // Remove self-reference. for (GlobalValue *GVU : Deps) { + // If this is a dep from a vtable to a virtual function, and we have + // complete information about all virtual call sites which could call + // through this vtable, then skip it, because the call site information will + // be more precise. + if (VFESafeVTables.count(GVU) && isa<Function>(&GV)) { + LLVM_DEBUG(dbgs() << "Ignoring dep " << GVU->getName() << " -> " + << GV.getName() << "\n"); + continue; + } GVDependencies[GVU].insert(&GV); } } @@ -132,12 +150,133 @@ void GlobalDCEPass::MarkLive(GlobalValue &GV, if (Updates) Updates->push_back(&GV); if (Comdat *C = GV.getComdat()) { - for (auto &&CM : make_range(ComdatMembers.equal_range(C))) + for (auto &&CM : make_range(ComdatMembers.equal_range(C))) { MarkLive(*CM.second, Updates); // Recursion depth is only two because only // globals in the same comdat are visited. + } + } +} + +void GlobalDCEPass::ScanVTables(Module &M) { + SmallVector<MDNode *, 2> Types; + LLVM_DEBUG(dbgs() << "Building type info -> vtable map\n"); + + auto *LTOPostLinkMD = + cast_or_null<ConstantAsMetadata>(M.getModuleFlag("LTOPostLink")); + bool LTOPostLink = + LTOPostLinkMD && + (cast<ConstantInt>(LTOPostLinkMD->getValue())->getZExtValue() != 0); + + for (GlobalVariable &GV : M.globals()) { + Types.clear(); + GV.getMetadata(LLVMContext::MD_type, Types); + if (GV.isDeclaration() || Types.empty()) + continue; + + // Use the typeid metadata on the vtable to build a mapping from typeids to + // the list of (GV, offset) pairs which are the possible vtables for that + // typeid.
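Whether a vtable participates in virtual function elimination at all hinges on its virtual-call visibility: translation-unit visibility always qualifies, and linkage-unit visibility qualifies once the LTOPostLink module flag shows that every linkage-unit user has been merged in. The rule applied a few lines below in ScanVTables, condensed into a sketch (isVFESafe is a hypothetical name):

#include "llvm/IR/GlobalObject.h"

// Hypothetical predicate condensing the eligibility check in ScanVTables.
static bool isVFESafe(const llvm::GlobalObject &GO, bool LTOPostLink) {
  auto Vis = GO.getVCallVisibility();
  return Vis == llvm::GlobalObject::VCallVisibilityTranslationUnit ||
         (LTOPostLink &&
          Vis == llvm::GlobalObject::VCallVisibilityLinkageUnit);
}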
+ for (MDNode *Type : Types) { + Metadata *TypeID = Type->getOperand(1).get(); + + uint64_t Offset = + cast<ConstantInt>( + cast<ConstantAsMetadata>(Type->getOperand(0))->getValue()) + ->getZExtValue(); + + TypeIdMap[TypeID].insert(std::make_pair(&GV, Offset)); + } + + // If the type corresponding to the vtable is private to this translation + // unit, we know that we can see all virtual functions which might use it, + // so VFE is safe. + if (auto GO = dyn_cast<GlobalObject>(&GV)) { + GlobalObject::VCallVisibility TypeVis = GO->getVCallVisibility(); + if (TypeVis == GlobalObject::VCallVisibilityTranslationUnit || + (LTOPostLink && + TypeVis == GlobalObject::VCallVisibilityLinkageUnit)) { + LLVM_DEBUG(dbgs() << GV.getName() << " is safe for VFE\n"); + VFESafeVTables.insert(&GV); + } + } + } +} + +void GlobalDCEPass::ScanVTableLoad(Function *Caller, Metadata *TypeId, + uint64_t CallOffset) { + for (auto &VTableInfo : TypeIdMap[TypeId]) { + GlobalVariable *VTable = VTableInfo.first; + uint64_t VTableOffset = VTableInfo.second; + + Constant *Ptr = + getPointerAtOffset(VTable->getInitializer(), VTableOffset + CallOffset, + *Caller->getParent()); + if (!Ptr) { + LLVM_DEBUG(dbgs() << "can't find pointer in vtable!\n"); + VFESafeVTables.erase(VTable); + return; + } + + auto Callee = dyn_cast<Function>(Ptr->stripPointerCasts()); + if (!Callee) { + LLVM_DEBUG(dbgs() << "vtable entry is not function pointer!\n"); + VFESafeVTables.erase(VTable); + return; + } + + LLVM_DEBUG(dbgs() << "vfunc dep " << Caller->getName() << " -> " + << Callee->getName() << "\n"); + GVDependencies[Caller].insert(Callee); } } +void GlobalDCEPass::ScanTypeCheckedLoadIntrinsics(Module &M) { + LLVM_DEBUG(dbgs() << "Scanning type.checked.load intrinsics\n"); + Function *TypeCheckedLoadFunc = + M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load)); + + if (!TypeCheckedLoadFunc) + return; + + for (auto U : TypeCheckedLoadFunc->users()) { + auto CI = dyn_cast<CallInst>(U); + if (!CI) + continue; + + auto *Offset = dyn_cast<ConstantInt>(CI->getArgOperand(1)); + Value *TypeIdValue = CI->getArgOperand(2); + auto *TypeId = cast<MetadataAsValue>(TypeIdValue)->getMetadata(); + + if (Offset) { + ScanVTableLoad(CI->getFunction(), TypeId, Offset->getZExtValue()); + } else { + // type.checked.load with a non-constant offset, so assume every entry in + // every matching vtable is used. + for (auto &VTableInfo : TypeIdMap[TypeId]) { + VFESafeVTables.erase(VTableInfo.first); + } + } + } +} + +void GlobalDCEPass::AddVirtualFunctionDependencies(Module &M) { + if (!ClEnableVFE) + return; + + ScanVTables(M); + + if (VFESafeVTables.empty()) + return; + + ScanTypeCheckedLoadIntrinsics(M); + + LLVM_DEBUG( + dbgs() << "VFE safe vtables:\n"; + for (auto *VTable : VFESafeVTables) + dbgs() << " " << VTable->getName() << "\n"; + ); +} + PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) { bool Changed = false; @@ -163,6 +302,10 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) { if (Comdat *C = GA.getComdat()) ComdatMembers.insert(std::make_pair(C, &GA)); + // Add dependencies between virtual call sites and the virtual functions they + // might call, if we have that information. + AddVirtualFunctionDependencies(M); + // Loop over the module, adding globals which are obviously necessary. 
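The intrinsic scan above is conservative by construction: a llvm.type.checked.load call whose offset operand is not a ConstantInt could read any slot, so every vtable registered for that typeid has to be dropped from the safe set. The classification step, sketched with hypothetical callbacks:

#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Metadata.h"

// Sketch: split a type.checked.load call into the precise case (constant
// offset, one known vtable slot) and the conservative case (unknown offset,
// un-mark every vtable registered for the typeid).
template <typename PreciseFn, typename UnknownFn>
static void classifyCheckedLoad(llvm::CallInst *CI, PreciseFn OnSlot,
                                UnknownFn OnUnknownOffset) {
  auto *Offset = llvm::dyn_cast<llvm::ConstantInt>(CI->getArgOperand(1));
  llvm::Metadata *TypeId =
      llvm::cast<llvm::MetadataAsValue>(CI->getArgOperand(2))->getMetadata();
  if (Offset)
    OnSlot(TypeId, Offset->getZExtValue());
  else
    OnUnknownOffset(TypeId);
}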
for (GlobalObject &GO : M.global_objects()) { Changed |= RemoveUnusedGlobalValue(GO); @@ -257,8 +400,17 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) { }; NumFunctions += DeadFunctions.size(); - for (Function *F : DeadFunctions) + for (Function *F : DeadFunctions) { + if (!F->use_empty()) { + // Virtual functions might still be referenced by one or more vtables, + // but if we've proven them to be unused then it's safe to replace the + // virtual function pointers with null, allowing us to remove the + // function itself. + ++NumVFuncs; + F->replaceNonMetadataUsesWith(ConstantPointerNull::get(F->getType())); + } EraseUnusedGlobalValue(F); + } NumVariables += DeadGlobalVars.size(); for (GlobalVariable *GV : DeadGlobalVars) @@ -277,6 +429,8 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) { ConstantDependenciesCache.clear(); GVDependencies.clear(); ComdatMembers.clear(); + TypeIdMap.clear(); + VFESafeVTables.clear(); if (Changed) return PreservedAnalyses::none(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp index c4fb3ce77f6e..0fd966457ece 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -25,7 +25,6 @@ #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" @@ -53,6 +52,7 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Casting.h" @@ -65,6 +65,7 @@ #include "llvm/Transforms/Utils/CtorUtils.h" #include "llvm/Transforms/Utils/Evaluator.h" #include "llvm/Transforms/Utils/GlobalStatus.h" +#include "llvm/Transforms/Utils/Local.h" #include <cassert> #include <cstdint> #include <utility> @@ -155,7 +156,8 @@ static bool isLeakCheckerRoot(GlobalVariable *GV) { /// Given a value that is stored to a global but never read, determine whether /// it's safe to remove the store and the chain of computation that feeds the /// store. -static bool IsSafeComputationToRemove(Value *V, const TargetLibraryInfo *TLI) { +static bool IsSafeComputationToRemove( + Value *V, function_ref<TargetLibraryInfo &(Function &)> GetTLI) { do { if (isa<Constant>(V)) return true; @@ -164,7 +166,7 @@ static bool IsSafeComputationToRemove(Value *V, const TargetLibraryInfo *TLI) { if (isa<LoadInst>(V) || isa<InvokeInst>(V) || isa<Argument>(V) || isa<GlobalValue>(V)) return false; - if (isAllocationFn(V, TLI)) + if (isAllocationFn(V, GetTLI)) return true; Instruction *I = cast<Instruction>(V); @@ -184,8 +186,9 @@ static bool IsSafeComputationToRemove(Value *V, const TargetLibraryInfo *TLI) { /// This GV is a pointer root. Loop over all users of the global and clean up /// any that obviously don't assign the global a value that isn't dynamically /// allocated. -static bool CleanupPointerRootUsers(GlobalVariable *GV, - const TargetLibraryInfo *TLI) { +static bool +CleanupPointerRootUsers(GlobalVariable *GV, + function_ref<TargetLibraryInfo &(Function &)> GetTLI) { // A brief explanation of leak checkers. The goal is to find bugs where // pointers are forgotten, causing an accumulating growth in memory // usage over time. 
The common strategy for leak checkers is to whitelist the @@ -241,18 +244,18 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV, C->destroyConstant(); // This could have invalidated UI, start over from scratch. Dead.clear(); - CleanupPointerRootUsers(GV, TLI); + CleanupPointerRootUsers(GV, GetTLI); return true; } } } for (int i = 0, e = Dead.size(); i != e; ++i) { - if (IsSafeComputationToRemove(Dead[i].first, TLI)) { + if (IsSafeComputationToRemove(Dead[i].first, GetTLI)) { Dead[i].second->eraseFromParent(); Instruction *I = Dead[i].first; do { - if (isAllocationFn(I, TLI)) + if (isAllocationFn(I, GetTLI)) break; Instruction *J = dyn_cast<Instruction>(I->getOperand(0)); if (!J) @@ -270,9 +273,9 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV, /// We just marked GV constant. Loop over all users of the global, cleaning up /// the obvious ones. This is largely just a quick scan over the use list to /// clean up the easy and obvious cruft. This returns true if it made a change. -static bool CleanupConstantGlobalUsers(Value *V, Constant *Init, - const DataLayout &DL, - TargetLibraryInfo *TLI) { +static bool CleanupConstantGlobalUsers( + Value *V, Constant *Init, const DataLayout &DL, + function_ref<TargetLibraryInfo &(Function &)> GetTLI) { bool Changed = false; // Note that we need to use a weak value handle for the worklist items. When // we delete a constant array, we may also be holding pointer to one of its @@ -302,12 +305,12 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init, Constant *SubInit = nullptr; if (Init) SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE); - Changed |= CleanupConstantGlobalUsers(CE, SubInit, DL, TLI); + Changed |= CleanupConstantGlobalUsers(CE, SubInit, DL, GetTLI); } else if ((CE->getOpcode() == Instruction::BitCast && CE->getType()->isPointerTy()) || CE->getOpcode() == Instruction::AddrSpaceCast) { // Pointer cast, delete any stores and memsets to the global. - Changed |= CleanupConstantGlobalUsers(CE, nullptr, DL, TLI); + Changed |= CleanupConstantGlobalUsers(CE, nullptr, DL, GetTLI); } if (CE->use_empty()) { @@ -321,7 +324,7 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init, Constant *SubInit = nullptr; if (!isa<ConstantExpr>(GEP->getOperand(0))) { ConstantExpr *CE = dyn_cast_or_null<ConstantExpr>( - ConstantFoldInstruction(GEP, DL, TLI)); + ConstantFoldInstruction(GEP, DL, &GetTLI(*GEP->getFunction()))); if (Init && CE && CE->getOpcode() == Instruction::GetElementPtr) SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE); @@ -331,7 +334,7 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init, if (Init && isa<ConstantAggregateZero>(Init) && GEP->isInBounds()) SubInit = Constant::getNullValue(GEP->getResultElementType()); } - Changed |= CleanupConstantGlobalUsers(GEP, SubInit, DL, TLI); + Changed |= CleanupConstantGlobalUsers(GEP, SubInit, DL, GetTLI); if (GEP->use_empty()) { GEP->eraseFromParent(); @@ -348,7 +351,7 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init, // us, and if they are all dead, nuke them without remorse. 
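The change threaded through every helper in this file is mechanical but worth naming: a single module-wide TargetLibraryInfo pointer becomes a function_ref lookup, because TargetLibraryInfoWrapperPass::getTLI now takes a Function and the answer can differ per function. The resulting shape, sketched with a hypothetical helper:

#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Function.h"

// Sketch: helpers accept the lookup and resolve TLI at the point of use.
static bool usesMallocSketch(
    llvm::Function &F,
    llvm::function_ref<llvm::TargetLibraryInfo &(llvm::Function &)> GetTLI) {
  return GetTLI(F).has(llvm::LibFunc_malloc);
}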
if (isSafeToDestroyConstant(C)) { C->destroyConstant(); - CleanupConstantGlobalUsers(V, Init, DL, TLI); + CleanupConstantGlobalUsers(V, Init, DL, GetTLI); return true; } } @@ -430,6 +433,20 @@ static bool GlobalUsersSafeToSRA(GlobalValue *GV) { return true; } +static bool CanDoGlobalSRA(GlobalVariable *GV) { + Constant *Init = GV->getInitializer(); + + if (isa<StructType>(Init->getType())) { + // nothing to check + } else if (SequentialType *STy = dyn_cast<SequentialType>(Init->getType())) { + if (STy->getNumElements() > 16 && GV->hasNUsesOrMore(16)) + return false; // It's not worth it. + } else + return false; + + return GlobalUsersSafeToSRA(GV); +} + /// Copy over the debug info for a variable to its SRA replacements. static void transferSRADebugInfo(GlobalVariable *GV, GlobalVariable *NGV, uint64_t FragmentOffsetInBits, @@ -459,88 +476,94 @@ static void transferSRADebugInfo(GlobalVariable *GV, GlobalVariable *NGV, /// insert so that the caller can reprocess it. static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { // Make sure this global only has simple uses that we can SRA. - if (!GlobalUsersSafeToSRA(GV)) + if (!CanDoGlobalSRA(GV)) return nullptr; assert(GV->hasLocalLinkage()); Constant *Init = GV->getInitializer(); Type *Ty = Init->getType(); - std::vector<GlobalVariable *> NewGlobals; - Module::GlobalListType &Globals = GV->getParent()->getGlobalList(); + std::map<unsigned, GlobalVariable *> NewGlobals; // Get the alignment of the global, either explicit or target-specific. unsigned StartAlignment = GV->getAlignment(); if (StartAlignment == 0) StartAlignment = DL.getABITypeAlignment(GV->getType()); - if (StructType *STy = dyn_cast<StructType>(Ty)) { - unsigned NumElements = STy->getNumElements(); - NewGlobals.reserve(NumElements); - const StructLayout &Layout = *DL.getStructLayout(STy); - for (unsigned i = 0, e = NumElements; i != e; ++i) { - Constant *In = Init->getAggregateElement(i); - assert(In && "Couldn't get element of initializer?"); - GlobalVariable *NGV = new GlobalVariable(STy->getElementType(i), false, - GlobalVariable::InternalLinkage, - In, GV->getName()+"."+Twine(i), - GV->getThreadLocalMode(), - GV->getType()->getAddressSpace()); - NGV->setExternallyInitialized(GV->isExternallyInitialized()); - NGV->copyAttributesFrom(GV); - Globals.push_back(NGV); - NewGlobals.push_back(NGV); + // Loop over all users and create replacement variables for used aggregate + // elements. + for (User *GEP : GV->users()) { + assert(((isa<ConstantExpr>(GEP) && cast<ConstantExpr>(GEP)->getOpcode() == + Instruction::GetElementPtr) || + isa<GetElementPtrInst>(GEP)) && + "NonGEP CE's are not SRAable!"); + + // Ignore the 1th operand, which has to be zero or else the program is quite + // broken (undefined). Get the 2nd operand, which is the structure or array + // index. + unsigned ElementIdx = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue(); + if (NewGlobals.count(ElementIdx) == 1) + continue; // we've already created a replacement variable + assert(NewGlobals.count(ElementIdx) == 0); + + Type *ElTy = nullptr; + if (StructType *STy = dyn_cast<StructType>(Ty)) + ElTy = STy->getElementType(ElementIdx); + else if (SequentialType *STy = dyn_cast<SequentialType>(Ty)) + ElTy = STy->getElementType(); + assert(ElTy); + + Constant *In = Init->getAggregateElement(ElementIdx); + assert(In && "Couldn't get element of initializer?"); + + GlobalVariable *NGV = new GlobalVariable( + ElTy, false, GlobalVariable::InternalLinkage, In, + GV->getName() + "."
+ Twine(ElementIdx), GV->getThreadLocalMode(), + GV->getType()->getAddressSpace()); + NGV->setExternallyInitialized(GV->isExternallyInitialized()); + NGV->copyAttributesFrom(GV); + NewGlobals.insert(std::make_pair(ElementIdx, NGV)); + + if (StructType *STy = dyn_cast<StructType>(Ty)) { + const StructLayout &Layout = *DL.getStructLayout(STy); // Calculate the known alignment of the field. If the original aggregate // had 256 byte alignment for example, something might depend on that: // propagate info to each field. - uint64_t FieldOffset = Layout.getElementOffset(i); - unsigned NewAlign = (unsigned)MinAlign(StartAlignment, FieldOffset); - if (NewAlign > DL.getABITypeAlignment(STy->getElementType(i))) + uint64_t FieldOffset = Layout.getElementOffset(ElementIdx); + Align NewAlign(MinAlign(StartAlignment, FieldOffset)); + if (NewAlign > + Align(DL.getABITypeAlignment(STy->getElementType(ElementIdx)))) NGV->setAlignment(NewAlign); // Copy over the debug info for the variable. uint64_t Size = DL.getTypeAllocSizeInBits(NGV->getValueType()); - uint64_t FragmentOffsetInBits = Layout.getElementOffsetInBits(i); - transferSRADebugInfo(GV, NGV, FragmentOffsetInBits, Size, NumElements); - } - } else if (SequentialType *STy = dyn_cast<SequentialType>(Ty)) { - unsigned NumElements = STy->getNumElements(); - if (NumElements > 16 && GV->hasNUsesOrMore(16)) - return nullptr; // It's not worth it. - NewGlobals.reserve(NumElements); - auto ElTy = STy->getElementType(); - uint64_t EltSize = DL.getTypeAllocSize(ElTy); - unsigned EltAlign = DL.getABITypeAlignment(ElTy); - uint64_t FragmentSizeInBits = DL.getTypeAllocSizeInBits(ElTy); - for (unsigned i = 0, e = NumElements; i != e; ++i) { - Constant *In = Init->getAggregateElement(i); - assert(In && "Couldn't get element of initializer?"); - - GlobalVariable *NGV = new GlobalVariable(STy->getElementType(), false, - GlobalVariable::InternalLinkage, - In, GV->getName()+"."+Twine(i), - GV->getThreadLocalMode(), - GV->getType()->getAddressSpace()); - NGV->setExternallyInitialized(GV->isExternallyInitialized()); - NGV->copyAttributesFrom(GV); - Globals.push_back(NGV); - NewGlobals.push_back(NGV); + uint64_t FragmentOffsetInBits = Layout.getElementOffsetInBits(ElementIdx); + transferSRADebugInfo(GV, NGV, FragmentOffsetInBits, Size, + STy->getNumElements()); + } else if (SequentialType *STy = dyn_cast<SequentialType>(Ty)) { + uint64_t EltSize = DL.getTypeAllocSize(ElTy); + Align EltAlign(DL.getABITypeAlignment(ElTy)); + uint64_t FragmentSizeInBits = DL.getTypeAllocSizeInBits(ElTy); // Calculate the known alignment of the field. If the original aggregate // had 256 byte alignment for example, something might depend on that: // propagate info to each field. 
- unsigned NewAlign = (unsigned)MinAlign(StartAlignment, EltSize*i); + Align NewAlign(MinAlign(StartAlignment, EltSize * ElementIdx)); if (NewAlign > EltAlign) NGV->setAlignment(NewAlign); - transferSRADebugInfo(GV, NGV, FragmentSizeInBits * i, FragmentSizeInBits, - NumElements); + transferSRADebugInfo(GV, NGV, FragmentSizeInBits * ElementIdx, + FragmentSizeInBits, STy->getNumElements()); } } if (NewGlobals.empty()) return nullptr; + Module::GlobalListType &Globals = GV->getParent()->getGlobalList(); + for (auto NewGlobalVar : NewGlobals) + Globals.push_back(NewGlobalVar.second); + LLVM_DEBUG(dbgs() << "PERFORMING GLOBAL SRA ON: " << *GV << "\n"); Constant *NullInt =Constant::getNullValue(Type::getInt32Ty(GV->getContext())); @@ -556,11 +579,11 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { // Ignore the 1th operand, which has to be zero or else the program is quite // broken (undefined). Get the 2nd operand, which is the structure or array // index. - unsigned Val = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue(); - if (Val >= NewGlobals.size()) Val = 0; // Out of bound array access. + unsigned ElementIdx = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue(); + assert(NewGlobals.count(ElementIdx) == 1); - Value *NewPtr = NewGlobals[Val]; - Type *NewTy = NewGlobals[Val]->getValueType(); + Value *NewPtr = NewGlobals[ElementIdx]; + Type *NewTy = NewGlobals[ElementIdx]->getValueType(); // Form a shorter GEP if needed. if (GEP->getNumOperands() > 3) { @@ -578,7 +601,8 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { for (unsigned i = 3, e = GEPI->getNumOperands(); i != e; ++i) Idxs.push_back(GEPI->getOperand(i)); NewPtr = GetElementPtrInst::Create( - NewTy, NewPtr, Idxs, GEPI->getName() + "." + Twine(Val), GEPI); + NewTy, NewPtr, Idxs, GEPI->getName() + "." + Twine(ElementIdx), + GEPI); } } GEP->replaceAllUsesWith(NewPtr); @@ -593,17 +617,8 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { Globals.erase(GV); ++NumSRA; - // Loop over the new globals array deleting any globals that are obviously - // dead. This can arise due to scalarization of a structure or an array that - // has elements that are dead. - unsigned FirstGlobal = 0; - for (unsigned i = 0, e = NewGlobals.size(); i != e; ++i) - if (NewGlobals[i]->use_empty()) { - Globals.erase(NewGlobals[i]); - if (FirstGlobal == i) ++FirstGlobal; - } - - return FirstGlobal != NewGlobals.size() ? NewGlobals[FirstGlobal] : nullptr; + assert(NewGlobals.size() > 0); + return NewGlobals.begin()->second; } /// Return true if all users of the specified value will trap if the value is @@ -745,9 +760,9 @@ static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) { /// are uses of the loaded value that would trap if the loaded value is /// dynamically null, then we know that they cannot be reachable with a null /// optimize away the load. -static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV, - const DataLayout &DL, - TargetLibraryInfo *TLI) { +static bool OptimizeAwayTrappingUsesOfLoads( + GlobalVariable *GV, Constant *LV, const DataLayout &DL, + function_ref<TargetLibraryInfo &(Function &)> GetTLI) { bool Changed = false; // Keep track of whether we are able to remove all the uses of the global @@ -793,10 +808,10 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV, // nor is the global. 
if (AllNonStoreUsesGone) { if (isLeakCheckerRoot(GV)) { - Changed |= CleanupPointerRootUsers(GV, TLI); + Changed |= CleanupPointerRootUsers(GV, GetTLI); } else { Changed = true; - CleanupConstantGlobalUsers(GV, nullptr, DL, TLI); + CleanupConstantGlobalUsers(GV, nullptr, DL, GetTLI); } if (GV->use_empty()) { LLVM_DEBUG(dbgs() << " *** GLOBAL NOW DEAD!\n"); @@ -889,8 +904,8 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy, while (!GV->use_empty()) { if (StoreInst *SI = dyn_cast<StoreInst>(GV->user_back())) { // The global is initialized when the store to it occurs. - new StoreInst(ConstantInt::getTrue(GV->getContext()), InitBool, false, 0, - SI->getOrdering(), SI->getSyncScopeID(), SI); + new StoreInst(ConstantInt::getTrue(GV->getContext()), InitBool, false, + None, SI->getOrdering(), SI->getSyncScopeID(), SI); SI->eraseFromParent(); continue; } @@ -907,7 +922,7 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy, // Replace the cmp X, 0 with a use of the bool value. // Sink the load to where the compare was, if atomic rules allow us to. Value *LV = new LoadInst(InitBool->getValueType(), InitBool, - InitBool->getName() + ".val", false, 0, + InitBool->getName() + ".val", false, None, LI->getOrdering(), LI->getSyncScopeID(), LI->isUnordered() ? (Instruction *)ICI : LI); InitBoolUsed = true; @@ -1562,10 +1577,10 @@ static bool tryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI, // Try to optimize globals based on the knowledge that only one value (besides // its initializer) is ever stored to the global. -static bool optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, - AtomicOrdering Ordering, - const DataLayout &DL, - TargetLibraryInfo *TLI) { +static bool +optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, + AtomicOrdering Ordering, const DataLayout &DL, + function_ref<TargetLibraryInfo &(Function &)> GetTLI) { // Ignore no-op GEPs and bitcasts. StoredOnceVal = StoredOnceVal->stripPointerCasts(); @@ -1583,9 +1598,10 @@ static bool optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, SOVC = ConstantExpr::getBitCast(SOVC, GV->getInitializer()->getType()); // Optimize away any trapping uses of the loaded value. - if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC, DL, TLI)) + if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC, DL, GetTLI)) return true; - } else if (CallInst *CI = extractMallocCall(StoredOnceVal, TLI)) { + } else if (CallInst *CI = extractMallocCall(StoredOnceVal, GetTLI)) { + auto *TLI = &GetTLI(*CI->getFunction()); Type *MallocType = getMallocAllocatedType(CI, TLI); if (MallocType && tryToOptimizeStoreOfMallocToGlobal(GV, CI, MallocType, Ordering, DL, TLI)) @@ -1643,10 +1659,12 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { // instead of a select to synthesize the desired value. 
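The getActiveBits() tests added in the next hunk guard the later getZExtValue() calls: APInt::getZExtValue asserts once a value needs more than 64 bits, so the shrink-to-boolean rewrite must reject wide integer globals (an i128 initializer, say) before doing its ValInit/ValOther arithmetic. The guard, sketched:

#include "llvm/IR/Constants.h"

// Sketch: only narrow to uint64_t when the APInt provably fits.
static bool fitsInUInt64(const llvm::ConstantInt *CI) {
  return CI && CI->getValue().getActiveBits() <= 64;
}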
bool IsOneZero = false; bool EmitOneOrZero = true; - if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)){ + auto *CI = dyn_cast<ConstantInt>(OtherVal); + if (CI && CI->getValue().getActiveBits() <= 64) { IsOneZero = InitVal->isNullValue() && CI->isOne(); - if (ConstantInt *CIInit = dyn_cast<ConstantInt>(GV->getInitializer())){ + auto *CIInit = dyn_cast<ConstantInt>(GV->getInitializer()); + if (CIInit && CIInit->getValue().getActiveBits() <= 64) { uint64_t ValInit = CIInit->getZExtValue(); uint64_t ValOther = CI->getZExtValue(); uint64_t ValMinus = ValOther - ValInit; @@ -1711,7 +1729,7 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { assert(LI->getOperand(0) == GV && "Not a copy!"); // Insert a new load, to preserve the saved value. StoreVal = new LoadInst(NewGV->getValueType(), NewGV, - LI->getName() + ".b", false, 0, + LI->getName() + ".b", false, None, LI->getOrdering(), LI->getSyncScopeID(), LI); } else { assert((isa<CastInst>(StoredVal) || isa<SelectInst>(StoredVal)) && @@ -1721,15 +1739,15 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { } } StoreInst *NSI = - new StoreInst(StoreVal, NewGV, false, 0, SI->getOrdering(), + new StoreInst(StoreVal, NewGV, false, None, SI->getOrdering(), SI->getSyncScopeID(), SI); NSI->setDebugLoc(SI->getDebugLoc()); } else { // Change the load into a load of bool then a select. LoadInst *LI = cast<LoadInst>(UI); - LoadInst *NLI = - new LoadInst(NewGV->getValueType(), NewGV, LI->getName() + ".b", - false, 0, LI->getOrdering(), LI->getSyncScopeID(), LI); + LoadInst *NLI = new LoadInst(NewGV->getValueType(), NewGV, + LI->getName() + ".b", false, None, + LI->getOrdering(), LI->getSyncScopeID(), LI); Instruction *NSI; if (IsOneZero) NSI = new ZExtInst(NLI, LI->getType(), "", LI); @@ -1914,9 +1932,10 @@ static void makeAllConstantUsesInstructions(Constant *C) { /// Analyze the specified global variable and optimize /// it if possible. If we make a change, return true. -static bool processInternalGlobal( - GlobalVariable *GV, const GlobalStatus &GS, TargetLibraryInfo *TLI, - function_ref<DominatorTree &(Function &)> LookupDomTree) { +static bool +processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS, + function_ref<TargetLibraryInfo &(Function &)> GetTLI, + function_ref<DominatorTree &(Function &)> LookupDomTree) { auto &DL = GV->getParent()->getDataLayout(); // If this is a first class global and has only one accessing function and // this function is non-recursive, we replace the global with a local alloca @@ -1963,11 +1982,12 @@ static bool processInternalGlobal( bool Changed; if (isLeakCheckerRoot(GV)) { // Delete any constant stores to the global. - Changed = CleanupPointerRootUsers(GV, TLI); + Changed = CleanupPointerRootUsers(GV, GetTLI); } else { // Delete any stores we can find to the global. We may not be able to // make it completely dead though. - Changed = CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, TLI); + Changed = + CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI); } // If the global is dead now, delete it. @@ -1989,7 +2009,7 @@ static bool processInternalGlobal( GV->setConstant(true); // Clean up any obviously simplifiable users now. - CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, TLI); + CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI); // If the global is dead now, just nuke it. 
if (GV->use_empty()) { @@ -2019,7 +2039,7 @@ static bool processInternalGlobal( GV->setInitializer(SOVConstant); // Clean up any obviously simplifiable users now. - CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, TLI); + CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI); if (GV->use_empty()) { LLVM_DEBUG(dbgs() << " *** Substituting initializer allowed us to " @@ -2033,7 +2053,8 @@ static bool processInternalGlobal( // Try to optimize globals based on the knowledge that only one value // (besides its initializer) is ever stored to the global. - if (optimizeOnceStoredGlobal(GV, GS.StoredOnceValue, GS.Ordering, DL, TLI)) + if (optimizeOnceStoredGlobal(GV, GS.StoredOnceValue, GS.Ordering, DL, + GetTLI)) return true; // Otherwise, if the global was not a boolean, we can shrink it to be a @@ -2054,7 +2075,8 @@ static bool processInternalGlobal( /// Analyze the specified global variable and optimize it if possible. If we /// make a change, return true. static bool -processGlobal(GlobalValue &GV, TargetLibraryInfo *TLI, +processGlobal(GlobalValue &GV, + function_ref<TargetLibraryInfo &(Function &)> GetTLI, function_ref<DominatorTree &(Function &)> LookupDomTree) { if (GV.getName().startswith("llvm.")) return false; @@ -2086,7 +2108,7 @@ processGlobal(GlobalValue &GV, TargetLibraryInfo *TLI, if (GVar->isConstant() || !GVar->hasInitializer()) return Changed; - return processInternalGlobal(GVar, GS, TLI, LookupDomTree) || Changed; + return processInternalGlobal(GVar, GS, GetTLI, LookupDomTree) || Changed; } /// Walk all of the direct calls of the specified function, changing them to @@ -2234,7 +2256,8 @@ hasOnlyColdCalls(Function &F, } static bool -OptimizeFunctions(Module &M, TargetLibraryInfo *TLI, +OptimizeFunctions(Module &M, + function_ref<TargetLibraryInfo &(Function &)> GetTLI, function_ref<TargetTransformInfo &(Function &)> GetTTI, function_ref<BlockFrequencyInfo &(Function &)> GetBFI, function_ref<DominatorTree &(Function &)> LookupDomTree, @@ -2285,7 +2308,7 @@ OptimizeFunctions(Module &M, TargetLibraryInfo *TLI, } } - Changed |= processGlobal(*F, TLI, LookupDomTree); + Changed |= processGlobal(*F, GetTLI, LookupDomTree); if (!F->hasLocalLinkage()) continue; @@ -2342,7 +2365,8 @@ OptimizeFunctions(Module &M, TargetLibraryInfo *TLI, } static bool -OptimizeGlobalVars(Module &M, TargetLibraryInfo *TLI, +OptimizeGlobalVars(Module &M, + function_ref<TargetLibraryInfo &(Function &)> GetTLI, function_ref<DominatorTree &(Function &)> LookupDomTree, SmallPtrSetImpl<const Comdat *> &NotDiscardableComdats) { bool Changed = false; @@ -2357,7 +2381,10 @@ OptimizeGlobalVars(Module &M, TargetLibraryInfo *TLI, if (GV->hasInitializer()) if (auto *C = dyn_cast<Constant>(GV->getInitializer())) { auto &DL = M.getDataLayout(); - Constant *New = ConstantFoldConstant(C, DL, TLI); + // TLI is not used in the case of a Constant, so use default nullptr + // for that optional parameter, since we don't have a Function to + // provide GetTLI anyway. 
+ Constant *New = ConstantFoldConstant(C, DL, /*TLI*/ nullptr); if (New && New != C) GV->setInitializer(New); } @@ -2367,7 +2394,7 @@ OptimizeGlobalVars(Module &M, TargetLibraryInfo *TLI, continue; } - Changed |= processGlobal(*GV, TLI, LookupDomTree); + Changed |= processGlobal(*GV, GetTLI, LookupDomTree); } return Changed; } @@ -2581,8 +2608,8 @@ static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL, } static int compareNames(Constant *const *A, Constant *const *B) { - Value *AStripped = (*A)->stripPointerCastsNoFollowAliases(); - Value *BStripped = (*B)->stripPointerCastsNoFollowAliases(); + Value *AStripped = (*A)->stripPointerCasts(); + Value *BStripped = (*B)->stripPointerCasts(); return AStripped->getName().compare(BStripped->getName()); } @@ -2809,7 +2836,14 @@ OptimizeGlobalAliases(Module &M, return Changed; } -static Function *FindCXAAtExit(Module &M, TargetLibraryInfo *TLI) { +static Function * +FindCXAAtExit(Module &M, function_ref<TargetLibraryInfo &(Function &)> GetTLI) { + // Hack to get a default TLI before we have actual Function. + auto FuncIter = M.begin(); + if (FuncIter == M.end()) + return nullptr; + auto *TLI = &GetTLI(*FuncIter); + LibFunc F = LibFunc_cxa_atexit; if (!TLI->has(F)) return nullptr; @@ -2818,6 +2852,9 @@ static Function *FindCXAAtExit(Module &M, TargetLibraryInfo *TLI) { if (!Fn) return nullptr; + // Now get the actual TLI for Fn. + TLI = &GetTLI(*Fn); + // Make sure that the function has the correct prototype. if (!TLI->getLibFunc(*Fn, F) || F != LibFunc_cxa_atexit) return nullptr; @@ -2889,7 +2926,8 @@ static bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) { } static bool optimizeGlobalsInModule( - Module &M, const DataLayout &DL, TargetLibraryInfo *TLI, + Module &M, const DataLayout &DL, + function_ref<TargetLibraryInfo &(Function &)> GetTLI, function_ref<TargetTransformInfo &(Function &)> GetTTI, function_ref<BlockFrequencyInfo &(Function &)> GetBFI, function_ref<DominatorTree &(Function &)> LookupDomTree) { @@ -2914,24 +2952,24 @@ static bool optimizeGlobalsInModule( NotDiscardableComdats.insert(C); // Delete functions that are trivially dead, ccc -> fastcc - LocalChange |= OptimizeFunctions(M, TLI, GetTTI, GetBFI, LookupDomTree, + LocalChange |= OptimizeFunctions(M, GetTLI, GetTTI, GetBFI, LookupDomTree, NotDiscardableComdats); // Optimize global_ctors list. LocalChange |= optimizeGlobalCtorsList(M, [&](Function *F) { - return EvaluateStaticConstructor(F, DL, TLI); + return EvaluateStaticConstructor(F, DL, &GetTLI(*F)); }); // Optimize non-address-taken globals. - LocalChange |= OptimizeGlobalVars(M, TLI, LookupDomTree, - NotDiscardableComdats); + LocalChange |= + OptimizeGlobalVars(M, GetTLI, LookupDomTree, NotDiscardableComdats); // Resolve aliases, when possible. LocalChange |= OptimizeGlobalAliases(M, NotDiscardableComdats); // Try to remove trivial global destructors if they are not removed // already. 
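FindCXAAtExit above shows the awkward corner of the per-function TLI model: a module-level question (is __cxa_atexit available on this target?) needs some function to seed the lookup, so the code borrows the module's first function and then re-resolves TLI against the real callee once it is found. Condensed into a sketch (findCXAAtExitSketch is a hypothetical name):

#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// Sketch of the two-stage lookup: seed TLI from any function, then query
// again against __cxa_atexit itself before checking its prototype.
static Function *findCXAAtExitSketch(
    Module &M, function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
  if (M.begin() == M.end())
    return nullptr; // no function available to seed the lookup
  TargetLibraryInfo *TLI = &GetTLI(*M.begin());
  if (!TLI->has(LibFunc_cxa_atexit))
    return nullptr;
  Function *Fn = M.getFunction(TLI->getName(LibFunc_cxa_atexit));
  if (!Fn)
    return nullptr;
  TLI = &GetTLI(*Fn); // the per-function answer may differ
  LibFunc F;
  return (TLI->getLibFunc(*Fn, F) && F == LibFunc_cxa_atexit) ? Fn : nullptr;
}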
- Function *CXAAtExitFn = FindCXAAtExit(M, TLI); + Function *CXAAtExitFn = FindCXAAtExit(M, GetTLI); if (CXAAtExitFn) LocalChange |= OptimizeEmptyGlobalCXXDtors(CXAAtExitFn); @@ -2946,12 +2984,14 @@ static bool optimizeGlobalsInModule( PreservedAnalyses GlobalOptPass::run(Module &M, ModuleAnalysisManager &AM) { auto &DL = M.getDataLayout(); - auto &TLI = AM.getResult<TargetLibraryAnalysis>(M); auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); auto LookupDomTree = [&FAM](Function &F) -> DominatorTree &{ return FAM.getResult<DominatorTreeAnalysis>(F); }; + auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & { + return FAM.getResult<TargetLibraryAnalysis>(F); + }; auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & { return FAM.getResult<TargetIRAnalysis>(F); }; @@ -2960,7 +3000,7 @@ PreservedAnalyses GlobalOptPass::run(Module &M, ModuleAnalysisManager &AM) { return FAM.getResult<BlockFrequencyAnalysis>(F); }; - if (!optimizeGlobalsInModule(M, DL, &TLI, GetTTI, GetBFI, LookupDomTree)) + if (!optimizeGlobalsInModule(M, DL, GetTLI, GetTTI, GetBFI, LookupDomTree)) return PreservedAnalyses::all(); return PreservedAnalyses::none(); } @@ -2979,10 +3019,12 @@ struct GlobalOptLegacyPass : public ModulePass { return false; auto &DL = M.getDataLayout(); - auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); auto LookupDomTree = [this](Function &F) -> DominatorTree & { return this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree(); }; + auto GetTLI = [this](Function &F) -> TargetLibraryInfo & { + return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); + }; auto GetTTI = [this](Function &F) -> TargetTransformInfo & { return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); }; @@ -2991,7 +3033,8 @@ struct GlobalOptLegacyPass : public ModulePass { return this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI(); }; - return optimizeGlobalsInModule(M, DL, TLI, GetTTI, GetBFI, LookupDomTree); + return optimizeGlobalsInModule(M, DL, GetTLI, GetTTI, GetBFI, + LookupDomTree); } void getAnalysisUsage(AnalysisUsage &AU) const override { diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalSplit.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalSplit.cpp index 060043a40b89..4a319ead23c0 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalSplit.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalSplit.cpp @@ -29,6 +29,7 @@ #include "llvm/IR/Operator.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Transforms/IPO.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/HotColdSplitting.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/HotColdSplitting.cpp index ab1a9a79cad6..5e690714bfdf 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/HotColdSplitting.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/HotColdSplitting.cpp @@ -25,6 +25,7 @@ /// //===----------------------------------------------------------------------===// +#include "llvm/Transforms/IPO/HotColdSplitting.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -53,13 +54,14 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/BlockFrequency.h" #include "llvm/Support/BranchProbability.h" +#include "llvm/Support/CommandLine.h" #include 
"llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" -#include "llvm/Transforms/IPO/HotColdSplitting.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" @@ -85,12 +87,6 @@ static cl::opt<int> "multiple of TCC_Basic)")); namespace { - -/// A sequence of basic blocks. -/// -/// A 0-sized SmallVector is slightly cheaper to move than a std::vector. -using BlockSequence = SmallVector<BasicBlock *, 0>; - // Same as blockEndsInUnreachable in CodeGen/BranchFolding.cpp. Do not modify // this function unless you modify the MBB version as well. // @@ -169,31 +165,6 @@ static bool markFunctionCold(Function &F, bool UpdateEntryCount = false) { return Changed; } -class HotColdSplitting { -public: - HotColdSplitting(ProfileSummaryInfo *ProfSI, - function_ref<BlockFrequencyInfo *(Function &)> GBFI, - function_ref<TargetTransformInfo &(Function &)> GTTI, - std::function<OptimizationRemarkEmitter &(Function &)> *GORE, - function_ref<AssumptionCache *(Function &)> LAC) - : PSI(ProfSI), GetBFI(GBFI), GetTTI(GTTI), GetORE(GORE), LookupAC(LAC) {} - bool run(Module &M); - -private: - bool isFunctionCold(const Function &F) const; - bool shouldOutlineFrom(const Function &F) const; - bool outlineColdRegions(Function &F, bool HasProfileSummary); - Function *extractColdRegion(const BlockSequence &Region, DominatorTree &DT, - BlockFrequencyInfo *BFI, TargetTransformInfo &TTI, - OptimizationRemarkEmitter &ORE, - AssumptionCache *AC, unsigned Count); - ProfileSummaryInfo *PSI; - function_ref<BlockFrequencyInfo *(Function &)> GetBFI; - function_ref<TargetTransformInfo &(Function &)> GetTTI; - std::function<OptimizationRemarkEmitter &(Function &)> *GetORE; - function_ref<AssumptionCache *(Function &)> LookupAC; -}; - class HotColdSplittingLegacyPass : public ModulePass { public: static char ID; @@ -236,6 +207,11 @@ bool HotColdSplitting::shouldOutlineFrom(const Function &F) const { if (F.hasFnAttribute(Attribute::NoInline)) return false; + // A function marked `noreturn` may contain unreachable terminators: these + // should not be considered cold, as the function may be a trampoline. + if (F.hasFnAttribute(Attribute::NoReturn)) + return false; + if (F.hasFnAttribute(Attribute::SanitizeAddress) || F.hasFnAttribute(Attribute::SanitizeHWAddress) || F.hasFnAttribute(Attribute::SanitizeThread) || @@ -321,13 +297,10 @@ static int getOutliningPenalty(ArrayRef<BasicBlock *> Region, return Penalty; } -Function *HotColdSplitting::extractColdRegion(const BlockSequence &Region, - DominatorTree &DT, - BlockFrequencyInfo *BFI, - TargetTransformInfo &TTI, - OptimizationRemarkEmitter &ORE, - AssumptionCache *AC, - unsigned Count) { +Function *HotColdSplitting::extractColdRegion( + const BlockSequence &Region, const CodeExtractorAnalysisCache &CEAC, + DominatorTree &DT, BlockFrequencyInfo *BFI, TargetTransformInfo &TTI, + OptimizationRemarkEmitter &ORE, AssumptionCache *AC, unsigned Count) { assert(!Region.empty()); // TODO: Pass BFI and BPI to update profile information. 
@@ -349,7 +322,7 @@ Function *HotColdSplitting::extractColdRegion(const BlockSequence &Region, return nullptr; Function *OrigF = Region[0]->getParent(); - if (Function *OutF = CE.extractCodeRegion()) { + if (Function *OutF = CE.extractCodeRegion(CEAC)) { User *U = *OutF->user_begin(); CallInst *CI = cast<CallInst>(U); CallSite CS(CI); @@ -360,6 +333,9 @@ Function *HotColdSplitting::extractColdRegion(const BlockSequence &Region, } CI->setIsNoInline(); + if (OrigF->hasSection()) + OutF->setSection(OrigF->getSection()); + markFunctionCold(*OutF, BFI != nullptr); LLVM_DEBUG(llvm::dbgs() << "Outlined Region: " << *OutF); @@ -607,9 +583,9 @@ bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) { }); if (!DT) - DT = make_unique<DominatorTree>(F); + DT = std::make_unique<DominatorTree>(F); if (!PDT) - PDT = make_unique<PostDominatorTree>(F); + PDT = std::make_unique<PostDominatorTree>(F); auto Regions = OutliningRegion::create(*BB, *DT, *PDT); for (OutliningRegion &Region : Regions) { @@ -637,9 +613,14 @@ bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) { } } + if (OutliningWorklist.empty()) + return Changed; + // Outline single-entry cold regions, splitting up larger regions as needed. unsigned OutlinedFunctionID = 1; - while (!OutliningWorklist.empty()) { + // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time. + CodeExtractorAnalysisCache CEAC(F); + do { OutliningRegion Region = OutliningWorklist.pop_back_val(); assert(!Region.empty() && "Empty outlining region in worklist"); do { @@ -650,14 +631,14 @@ bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) { BB->dump(); }); - Function *Outlined = extractColdRegion(SubRegion, *DT, BFI, TTI, ORE, AC, - OutlinedFunctionID); + Function *Outlined = extractColdRegion(SubRegion, CEAC, *DT, BFI, TTI, + ORE, AC, OutlinedFunctionID); if (Outlined) { ++OutlinedFunctionID; Changed = true; } } while (!Region.empty()); - } + } while (!OutliningWorklist.empty()); return Changed; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp index 7dc4d9ee9e34..1bda13a9bdd8 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp @@ -21,6 +21,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/IPO.h" using namespace llvm; @@ -254,7 +255,7 @@ static bool PropagateConstantReturn(Function &F) { // Find the index of the retval to replace with int index = -1; if (ExtractValueInst *EV = dyn_cast<ExtractValueInst>(Ins)) - if (EV->hasIndices()) + if (EV->getNumIndices() == 1) index = *EV->idx_begin(); // If this use uses a specific return value, and we have a replacement, diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/IPO.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/IPO.cpp index 34db75dd8b03..8a15800cbdb5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/IPO.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/IPO.cpp @@ -43,7 +43,7 @@ void llvm::initializeIPO(PassRegistry &Registry) { initializeBlockExtractorPass(Registry); initializeSingleLoopExtractorPass(Registry); initializeLowerTypeTestsPass(Registry); - initializeMergeFunctionsPass(Registry); + initializeMergeFunctionsLegacyPassPass(Registry); 
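The restructured worklist loop above builds one CodeExtractorAnalysisCache per function, after the early return for an empty worklist, and feeds it to every extractCodeRegion call; the in-tree comment names the point of this: avoid O(n^2) compile time from rescanning the function once per region. A self-contained sketch of the same hoist-the-analysis idea, where the "analysis" is just a scan counter rather than LLVM's cache:

#include <cstdio>
#include <vector>

// Pretend analysis: an expensive whole-function scan whose result is reusable.
struct AnalysisCache {
  int ScansPerformed = 0;
  explicit AnalysisCache(const std::vector<int> &Func) { ++ScansPerformed; (void)Func; }
};

// One extraction consumes the shared cache instead of rebuilding it.
bool extractRegion(int Region, const AnalysisCache &Cache) {
  (void)Cache;
  return Region % 2 == 0; // placeholder decision
}

int main() {
  std::vector<int> Body(1000, 0);
  std::vector<int> Worklist{1, 2, 3, 4, 5};

  if (Worklist.empty())
    return 0;

  // Hoisted: built once per function and reused for every region, instead of
  // once per region (which would be quadratic over all regions).
  AnalysisCache Cache(Body);

  int Outlined = 0;
  do {
    int Region = Worklist.back();
    Worklist.pop_back();
    if (extractRegion(Region, Cache))
      ++Outlined;
  } while (!Worklist.empty());

  std::printf("outlined %d regions with %d scans\n", Outlined, Cache.ScansPerformed);
}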
initializePartialInlinerLegacyPassPass(Registry); initializeAttributorLegacyPassPass(Registry); initializePostOrderFunctionAttrsLegacyPassPass(Registry); @@ -114,6 +114,10 @@ void LLVMAddIPSCCPPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createIPSCCPPass()); } +void LLVMAddMergeFunctionsPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createMergeFunctionsPass()); +} + void LLVMAddInternalizePass(LLVMPassManagerRef PM, unsigned AllButMain) { auto PreserveMain = [=](const GlobalValue &GV) { return AllButMain && GV.getName() == "main"; @@ -121,6 +125,15 @@ void LLVMAddInternalizePass(LLVMPassManagerRef PM, unsigned AllButMain) { unwrap(PM)->add(createInternalizePass(PreserveMain)); } +void LLVMAddInternalizePassWithMustPreservePredicate( + LLVMPassManagerRef PM, + void *Context, + LLVMBool (*Pred)(LLVMValueRef, void *)) { + unwrap(PM)->add(createInternalizePass([=](const GlobalValue &GV) { + return Pred(wrap(&GV), Context) == 0 ? false : true; + })); +} + void LLVMAddStripDeadPrototypesPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createStripDeadPrototypesPass()); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp index 7f5511e008e1..685f8f7d7a00 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp @@ -11,6 +11,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" @@ -18,24 +19,28 @@ using namespace llvm; #define DEBUG_TYPE "inferattrs" -static bool inferAllPrototypeAttributes(Module &M, - const TargetLibraryInfo &TLI) { +static bool inferAllPrototypeAttributes( + Module &M, function_ref<TargetLibraryInfo &(Function &)> GetTLI) { bool Changed = false; for (Function &F : M.functions()) // We only infer things using the prototype and the name; we don't need // definitions. if (F.isDeclaration() && !F.hasOptNone()) - Changed |= inferLibFuncAttributes(F, TLI); + Changed |= inferLibFuncAttributes(F, GetTLI(F)); return Changed; } PreservedAnalyses InferFunctionAttrsPass::run(Module &M, ModuleAnalysisManager &AM) { - auto &TLI = AM.getResult<TargetLibraryAnalysis>(M); + FunctionAnalysisManager &FAM = + AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & { + return FAM.getResult<TargetLibraryAnalysis>(F); + }; - if (!inferAllPrototypeAttributes(M, TLI)) + if (!inferAllPrototypeAttributes(M, GetTLI)) // If we didn't infer anything, preserve all analyses. 
return PreservedAnalyses::all(); @@ -60,8 +65,10 @@ struct InferFunctionAttrsLegacyPass : public ModulePass { if (skipModule(M)) return false; - auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - return inferAllPrototypeAttributes(M, TLI); + auto GetTLI = [this](Function &F) -> TargetLibraryInfo & { + return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); + }; + return inferAllPrototypeAttributes(M, GetTLI); } }; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/InlineSimple.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/InlineSimple.cpp index efb71b73cbb7..e818743544e6 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/InlineSimple.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/InlineSimple.cpp @@ -21,6 +21,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/Inliner.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/Inliner.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/Inliner.cpp index 945f8affae6e..4b72261131c1 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/Inliner.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/Inliner.cpp @@ -239,7 +239,7 @@ static void mergeInlinedArrayAllocas( } if (Align1 > Align2) - AvailableAlloca->setAlignment(AI->getAlignment()); + AvailableAlloca->setAlignment(MaybeAlign(AI->getAlignment())); } AI->eraseFromParent(); @@ -527,7 +527,8 @@ static void setInlineRemark(CallSite &CS, StringRef message) { static bool inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG, std::function<AssumptionCache &(Function &)> GetAssumptionCache, - ProfileSummaryInfo *PSI, TargetLibraryInfo &TLI, + ProfileSummaryInfo *PSI, + std::function<TargetLibraryInfo &(Function &)> GetTLI, bool InsertLifetime, function_ref<InlineCost(CallSite CS)> GetInlineCost, function_ref<AAResults &(Function &)> AARGetter, @@ -626,7 +627,8 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG, Instruction *Instr = CS.getInstruction(); - bool IsTriviallyDead = isInstructionTriviallyDead(Instr, &TLI); + bool IsTriviallyDead = + isInstructionTriviallyDead(Instr, &GetTLI(*Caller)); int InlineHistoryID; if (!IsTriviallyDead) { @@ -757,13 +759,16 @@ bool LegacyInlinerBase::inlineCalls(CallGraphSCC &SCC) { CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph(); ACT = &getAnalysis<AssumptionCacheTracker>(); PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); - auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto GetTLI = [&](Function &F) -> TargetLibraryInfo & { + return getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); + }; auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & { return ACT->getAssumptionCache(F); }; - return inlineCallsImpl(SCC, CG, GetAssumptionCache, PSI, TLI, InsertLifetime, - [this](CallSite CS) { return getInlineCost(CS); }, - LegacyAARGetter(*this), ImportedFunctionsStats); + return inlineCallsImpl( + SCC, CG, GetAssumptionCache, PSI, GetTLI, InsertLifetime, + [this](CallSite CS) { return getInlineCost(CS); }, LegacyAARGetter(*this), + ImportedFunctionsStats); } /// Remove now-dead linkonce functions at the end of @@ -879,7 +884,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, if (!ImportedFunctionsStats && InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No) { ImportedFunctionsStats = - llvm::make_unique<ImportedFunctionsInliningStatistics>(); + 
std::make_unique<ImportedFunctionsInliningStatistics>(); ImportedFunctionsStats->setModuleInfo(M); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/Internalize.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/Internalize.cpp index 2e269604e379..e1644819af61 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/Internalize.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/Internalize.cpp @@ -24,6 +24,7 @@ #include "llvm/ADT/StringSet.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/LoopExtractor.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/LoopExtractor.cpp index 91c7b5f5f135..f7108e8002ac 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/LoopExtractor.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/LoopExtractor.cpp @@ -19,6 +19,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/IPO.h" @@ -141,10 +142,12 @@ bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) { if (NumLoops == 0) return Changed; --NumLoops; AssumptionCache *AC = nullptr; + Function &Func = *L->getHeader()->getParent(); if (auto *ACT = getAnalysisIfAvailable<AssumptionCacheTracker>()) - AC = ACT->lookupAssumptionCache(*L->getHeader()->getParent()); + AC = ACT->lookupAssumptionCache(Func); + CodeExtractorAnalysisCache CEAC(Func); CodeExtractor Extractor(DT, *L, false, nullptr, nullptr, AC); - if (Extractor.extractCodeRegion() != nullptr) { + if (Extractor.extractCodeRegion(CEAC) != nullptr) { Changed = true; // After extraction, the loop is replaced by a function call, so // we shouldn't try to run any more loop passes on it. diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index f7371284f47e..fa664966faf7 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -54,6 +54,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/Casting.h" @@ -230,6 +231,16 @@ void ByteArrayBuilder::allocate(const std::set<uint64_t> &Bits, Bytes[AllocByteOffset + B] |= AllocMask; } +bool lowertypetests::isJumpTableCanonical(Function *F) { + if (F->isDeclarationForLinker()) + return false; + auto *CI = mdconst::extract_or_null<ConstantInt>( + F->getParent()->getModuleFlag("CFI Canonical Jump Tables")); + if (!CI || CI->getZExtValue() != 0) + return true; + return F->hasFnAttribute("cfi-canonical-jump-table"); +} + namespace { struct ByteArrayInfo { @@ -251,9 +262,12 @@ class GlobalTypeMember final : TrailingObjects<GlobalTypeMember, MDNode *> { GlobalObject *GO; size_t NTypes; - // For functions: true if this is a definition (either in the merged module or - // in one of the thinlto modules). - bool IsDefinition; + // For functions: true if the jump table is canonical. This essentially means + // whether the canonical address (i.e. the symbol table entry) of the function + // is provided by the local jump table. 
This is normally the same as whether + // the function is defined locally, but if canonical jump tables are disabled + // by the user then the jump table never provides a canonical definition. + bool IsJumpTableCanonical; // For functions: true if this function is either defined or used in a thinlto // module and its jumptable entry needs to be exported to thinlto backends. @@ -263,13 +277,13 @@ class GlobalTypeMember final : TrailingObjects<GlobalTypeMember, MDNode *> { public: static GlobalTypeMember *create(BumpPtrAllocator &Alloc, GlobalObject *GO, - bool IsDefinition, bool IsExported, + bool IsJumpTableCanonical, bool IsExported, ArrayRef<MDNode *> Types) { auto *GTM = static_cast<GlobalTypeMember *>(Alloc.Allocate( totalSizeToAlloc<MDNode *>(Types.size()), alignof(GlobalTypeMember))); GTM->GO = GO; GTM->NTypes = Types.size(); - GTM->IsDefinition = IsDefinition; + GTM->IsJumpTableCanonical = IsJumpTableCanonical; GTM->IsExported = IsExported; std::uninitialized_copy(Types.begin(), Types.end(), GTM->getTrailingObjects<MDNode *>()); @@ -280,8 +294,8 @@ public: return GO; } - bool isDefinition() const { - return IsDefinition; + bool isJumpTableCanonical() const { + return IsJumpTableCanonical; } bool isExported() const { @@ -320,6 +334,49 @@ private: size_t NTargets; }; +struct ScopedSaveAliaseesAndUsed { + Module &M; + SmallPtrSet<GlobalValue *, 16> Used, CompilerUsed; + std::vector<std::pair<GlobalIndirectSymbol *, Function *>> FunctionAliases; + + ScopedSaveAliaseesAndUsed(Module &M) : M(M) { + // The users of this class want to replace all function references except + // for aliases and llvm.used/llvm.compiler.used with references to a jump + // table. We avoid replacing aliases in order to avoid introducing a double + // indirection (or an alias pointing to a declaration in ThinLTO mode), and + // we avoid replacing llvm.used/llvm.compiler.used because these global + // variables describe properties of the global, not the jump table (besides, + // offseted references to the jump table in llvm.used are invalid). + // Unfortunately, LLVM doesn't have a "RAUW except for these (possibly + // indirect) users", so what we do is save the list of globals referenced by + // llvm.used/llvm.compiler.used and aliases, erase the used lists, let RAUW + // replace the aliasees and then set them back to their original values at + // the end. + if (GlobalVariable *GV = collectUsedGlobalVariables(M, Used, false)) + GV->eraseFromParent(); + if (GlobalVariable *GV = collectUsedGlobalVariables(M, CompilerUsed, true)) + GV->eraseFromParent(); + + for (auto &GIS : concat<GlobalIndirectSymbol>(M.aliases(), M.ifuncs())) { + // FIXME: This should look past all aliases not just interposable ones, + // see discussion on D65118. 
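The ScopedSaveAliaseesAndUsed comment above describes a save/erase/restore dance: stash what llvm.used, llvm.compiler.used, and aliases refer to, drop them so a blanket RAUW cannot rewrite them, then put everything back in the destructor. A toy RAII version of that pattern over a made-up module type (the ToyModule/ScopedSaveUsed names are illustrative, not LLVM API):

#include <cassert>
#include <string>
#include <utility>
#include <vector>

// Toy IR: ordinary references that a bulk replacement should rewrite, plus a
// protected list (think llvm.used) whose entries must stay untouched.
struct ToyModule {
  std::vector<std::string> Uses;
  std::vector<std::string> UsedList;
};

// RAII guard: stash the protected entries, hide them from the bulk rewrite,
// restore them on scope exit.
struct ScopedSaveUsed {
  ToyModule &M;
  std::vector<std::string> Saved;
  explicit ScopedSaveUsed(ToyModule &Mod) : M(Mod), Saved(std::move(Mod.UsedList)) {
    M.UsedList.clear();
  }
  ~ScopedSaveUsed() { M.UsedList = std::move(Saved); }
};

void replaceAllUses(ToyModule &M, const std::string &Old, const std::string &New) {
  for (auto &U : M.Uses)
    if (U == Old)
      U = New;
  for (auto &U : M.UsedList) // would clobber protected entries if they were visible
    if (U == Old)
      U = New;
}

int main() {
  ToyModule M{{"f", "g", "f"}, {"f"}};
  {
    ScopedSaveUsed Guard(M);
    replaceAllUses(M, "f", "f.cfi_jt");
  }
  assert(M.Uses[0] == "f.cfi_jt" && M.UsedList[0] == "f"); // protected entry survived
}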
+ if (auto *F = + dyn_cast<Function>(GIS.getIndirectSymbol()->stripPointerCasts())) + FunctionAliases.push_back({&GIS, F}); + } + } + + ~ScopedSaveAliaseesAndUsed() { + appendToUsed(M, std::vector<GlobalValue *>(Used.begin(), Used.end())); + appendToCompilerUsed(M, std::vector<GlobalValue *>(CompilerUsed.begin(), + CompilerUsed.end())); + + for (auto P : FunctionAliases) + P.first->setIndirectSymbol( + ConstantExpr::getBitCast(P.second, P.first->getType())); + } +}; + class LowerTypeTestsModule { Module &M; @@ -387,7 +444,8 @@ class LowerTypeTestsModule { uint8_t *exportTypeId(StringRef TypeId, const TypeIdLowering &TIL); TypeIdLowering importTypeId(StringRef TypeId); void importTypeTest(CallInst *CI); - void importFunction(Function *F, bool isDefinition); + void importFunction(Function *F, bool isJumpTableCanonical, + std::vector<GlobalAlias *> &AliasesToErase); BitSetInfo buildBitSet(Metadata *TypeId, @@ -421,7 +479,8 @@ class LowerTypeTestsModule { ArrayRef<GlobalTypeMember *> Globals, ArrayRef<ICallBranchFunnel *> ICallBranchFunnels); - void replaceWeakDeclarationWithJumpTablePtr(Function *F, Constant *JT, bool IsDefinition); + void replaceWeakDeclarationWithJumpTablePtr(Function *F, Constant *JT, + bool IsJumpTableCanonical); void moveInitializerToModuleConstructor(GlobalVariable *GV); void findGlobalVariableUsersOf(Constant *C, SmallSetVector<GlobalVariable *, 8> &Out); @@ -433,7 +492,7 @@ class LowerTypeTestsModule { /// the block. 'This's use list is expected to have at least one element. /// Unlike replaceAllUsesWith this function skips blockaddr and direct call /// uses. - void replaceCfiUses(Function *Old, Value *New, bool IsDefinition); + void replaceCfiUses(Function *Old, Value *New, bool IsJumpTableCanonical); /// replaceDirectCalls - Go through the uses list for this definition and /// replace each use, which is a direct function call. @@ -759,43 +818,50 @@ void LowerTypeTestsModule::buildBitSetsFromGlobalVariables( // Build a new global with the combined contents of the referenced globals. // This global is a struct whose even-indexed elements contain the original // contents of the referenced globals and whose odd-indexed elements contain - // any padding required to align the next element to the next power of 2. + // any padding required to align the next element to the next power of 2 plus + // any additional padding required to meet its alignment requirements. std::vector<Constant *> GlobalInits; const DataLayout &DL = M.getDataLayout(); + DenseMap<GlobalTypeMember *, uint64_t> GlobalLayout; + Align MaxAlign; + uint64_t CurOffset = 0; + uint64_t DesiredPadding = 0; for (GlobalTypeMember *G : Globals) { - GlobalVariable *GV = cast<GlobalVariable>(G->getGlobal()); + auto *GV = cast<GlobalVariable>(G->getGlobal()); + MaybeAlign Alignment(GV->getAlignment()); + if (!Alignment) + Alignment = Align(DL.getABITypeAlignment(GV->getValueType())); + MaxAlign = std::max(MaxAlign, *Alignment); + uint64_t GVOffset = alignTo(CurOffset + DesiredPadding, *Alignment); + GlobalLayout[G] = GVOffset; + if (GVOffset != 0) { + uint64_t Padding = GVOffset - CurOffset; + GlobalInits.push_back( + ConstantAggregateZero::get(ArrayType::get(Int8Ty, Padding))); + } + GlobalInits.push_back(GV->getInitializer()); uint64_t InitSize = DL.getTypeAllocSize(GV->getValueType()); + CurOffset = GVOffset + InitSize; - // Compute the amount of padding required. - uint64_t Padding = NextPowerOf2(InitSize - 1) - InitSize; + // Compute the amount of padding that we'd like for the next element. 
+ DesiredPadding = NextPowerOf2(InitSize - 1) - InitSize; // Experiments of different caps with Chromium on both x64 and ARM64 // have shown that the 32-byte cap generates the smallest binary on // both platforms while different caps yield similar performance. // (see https://lists.llvm.org/pipermail/llvm-dev/2018-July/124694.html) - if (Padding > 32) - Padding = alignTo(InitSize, 32) - InitSize; - - GlobalInits.push_back( - ConstantAggregateZero::get(ArrayType::get(Int8Ty, Padding))); + if (DesiredPadding > 32) + DesiredPadding = alignTo(InitSize, 32) - InitSize; } - if (!GlobalInits.empty()) - GlobalInits.pop_back(); + Constant *NewInit = ConstantStruct::getAnon(M.getContext(), GlobalInits); auto *CombinedGlobal = new GlobalVariable(M, NewInit->getType(), /*isConstant=*/true, GlobalValue::PrivateLinkage, NewInit); + CombinedGlobal->setAlignment(MaxAlign); StructType *NewTy = cast<StructType>(NewInit->getType()); - const StructLayout *CombinedGlobalLayout = DL.getStructLayout(NewTy); - - // Compute the offsets of the original globals within the new global. - DenseMap<GlobalTypeMember *, uint64_t> GlobalLayout; - for (unsigned I = 0; I != Globals.size(); ++I) - // Multiply by 2 to account for padding elements. - GlobalLayout[Globals[I]] = CombinedGlobalLayout->getElementOffset(I * 2); - lowerTypeTestCalls(TypeIds, CombinedGlobal, GlobalLayout); // Build aliases pointing to offsets into the combined global for each @@ -975,14 +1041,16 @@ void LowerTypeTestsModule::importTypeTest(CallInst *CI) { } // ThinLTO backend: the function F has a jump table entry; update this module -// accordingly. isDefinition describes the type of the jump table entry. -void LowerTypeTestsModule::importFunction(Function *F, bool isDefinition) { +// accordingly. isJumpTableCanonical describes the type of the jump table entry. +void LowerTypeTestsModule::importFunction( + Function *F, bool isJumpTableCanonical, + std::vector<GlobalAlias *> &AliasesToErase) { assert(F->getType()->getAddressSpace() == 0); GlobalValue::VisibilityTypes Visibility = F->getVisibility(); std::string Name = F->getName(); - if (F->isDeclarationForLinker() && isDefinition) { + if (F->isDeclarationForLinker() && isJumpTableCanonical) { // Non-dso_local functions may be overriden at run time, // don't short curcuit them if (F->isDSOLocal()) { @@ -997,12 +1065,13 @@ void LowerTypeTestsModule::importFunction(Function *F, bool isDefinition) { } Function *FDecl; - if (F->isDeclarationForLinker() && !isDefinition) { - // Declaration of an external function. + if (!isJumpTableCanonical) { + // Either a declaration of an external function or a reference to a locally + // defined jump table. FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage, F->getAddressSpace(), Name + ".cfi_jt", &M); FDecl->setVisibility(GlobalValue::HiddenVisibility); - } else if (isDefinition) { + } else { F->setName(Name + ".cfi"); F->setLinkage(GlobalValue::ExternalLinkage); FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage, @@ -1011,8 +1080,8 @@ void LowerTypeTestsModule::importFunction(Function *F, bool isDefinition) { Visibility = GlobalValue::HiddenVisibility; // Delete aliases pointing to this function, they'll be re-created in the - // merged output - SmallVector<GlobalAlias*, 4> ToErase; + // merged output. Don't do it yet though because ScopedSaveAliaseesAndUsed + // will want to reset the aliasees first. 
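The buildBitSetsFromGlobalVariables hunk above replaces the old even/odd struct layout with an explicit running-offset computation: each member lands at the next offset satisfying both its own alignment and the power-of-2 padding requested after the previous member, with that requested padding capped at 32 bytes. A standalone sketch of the layout loop under those rules (alignTo/nextPowerOf2 here are local helpers written to match the semantics the code relies on, not the LLVM functions):

#include <cstdint>
#include <cstdio>
#include <vector>

// Round V up to the next multiple of A (A must be a power of two).
static uint64_t alignTo(uint64_t V, uint64_t A) { return (V + A - 1) & ~(A - 1); }

// Smallest power of two strictly greater than V, mirroring llvm::NextPowerOf2.
static uint64_t nextPowerOf2(uint64_t V) {
  uint64_t P = 1;
  while (P <= V)
    P <<= 1;
  return P;
}

struct GlobalDesc { uint64_t Size, Align; };

std::vector<uint64_t> layoutCombined(const std::vector<GlobalDesc> &Globals) {
  std::vector<uint64_t> Offsets;
  uint64_t CurOffset = 0, DesiredPadding = 0;
  for (const GlobalDesc &G : Globals) {
    uint64_t Offset = alignTo(CurOffset + DesiredPadding, G.Align);
    Offsets.push_back(Offset);
    CurOffset = Offset + G.Size;
    // Ask to pad the member out to a power of two, but never by more than the
    // 32-byte cap the Chromium experiments settled on.
    DesiredPadding = nextPowerOf2(G.Size - 1) - G.Size;
    if (DesiredPadding > 32)
      DesiredPadding = alignTo(G.Size, 32) - G.Size;
  }
  return Offsets;
}

int main() {
  for (uint64_t Off : layoutCombined({{5, 4}, {200, 8}, {3, 1}}))
    std::printf("%llu ", (unsigned long long)Off); // expected: 0 8 232
  std::printf("\n");
}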
for (auto &U : F->uses()) { if (auto *A = dyn_cast<GlobalAlias>(U.getUser())) { Function *AliasDecl = Function::Create( @@ -1020,24 +1089,15 @@ void LowerTypeTestsModule::importFunction(Function *F, bool isDefinition) { F->getAddressSpace(), "", &M); AliasDecl->takeName(A); A->replaceAllUsesWith(AliasDecl); - ToErase.push_back(A); + AliasesToErase.push_back(A); } } - for (auto *A : ToErase) - A->eraseFromParent(); - } else { - // Function definition without type metadata, where some other translation - // unit contained a declaration with type metadata. This normally happens - // during mixed CFI + non-CFI compilation. We do nothing with the function - // so that it is treated the same way as a function defined outside of the - // LTO unit. - return; } - if (F->isWeakForLinker()) - replaceWeakDeclarationWithJumpTablePtr(F, FDecl, isDefinition); + if (F->hasExternalWeakLinkage()) + replaceWeakDeclarationWithJumpTablePtr(F, FDecl, isJumpTableCanonical); else - replaceCfiUses(F, FDecl, isDefinition); + replaceCfiUses(F, FDecl, isJumpTableCanonical); // Set visibility late because it's used in replaceCfiUses() to determine // whether uses need to to be replaced. @@ -1225,7 +1285,7 @@ void LowerTypeTestsModule::findGlobalVariableUsersOf( // Replace all uses of F with (F ? JT : 0). void LowerTypeTestsModule::replaceWeakDeclarationWithJumpTablePtr( - Function *F, Constant *JT, bool IsDefinition) { + Function *F, Constant *JT, bool IsJumpTableCanonical) { // The target expression can not appear in a constant initializer on most // (all?) targets. Switch to a runtime initializer. SmallSetVector<GlobalVariable *, 8> GlobalVarUsers; @@ -1239,7 +1299,7 @@ void LowerTypeTestsModule::replaceWeakDeclarationWithJumpTablePtr( Function::Create(cast<FunctionType>(F->getValueType()), GlobalValue::ExternalWeakLinkage, F->getAddressSpace(), "", &M); - replaceCfiUses(F, PlaceholderFn, IsDefinition); + replaceCfiUses(F, PlaceholderFn, IsJumpTableCanonical); Constant *Target = ConstantExpr::getSelect( ConstantExpr::getICmp(CmpInst::ICMP_NE, F, @@ -1276,8 +1336,9 @@ selectJumpTableArmEncoding(ArrayRef<GlobalTypeMember *> Functions, unsigned ArmCount = 0, ThumbCount = 0; for (const auto GTM : Functions) { - if (!GTM->isDefinition()) { + if (!GTM->isJumpTableCanonical()) { // PLT stubs are always ARM. + // FIXME: This is the wrong heuristic for non-canonical jump tables. ++ArmCount; continue; } @@ -1303,7 +1364,7 @@ void LowerTypeTestsModule::createJumpTable( cast<Function>(Functions[I]->getGlobal())); // Align the whole table by entry size. - F->setAlignment(getJumpTableEntrySize()); + F->setAlignment(Align(getJumpTableEntrySize())); // Skip prologue. // Disabled on win32 due to https://llvm.org/bugs/show_bug.cgi?id=28641#c3. // Luckily, this function does not get any prologue even without the @@ -1438,47 +1499,53 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative( lowerTypeTestCalls(TypeIds, JumpTable, GlobalLayout); - // Build aliases pointing to offsets into the jump table, and replace - // references to the original functions with references to the aliases. 
- for (unsigned I = 0; I != Functions.size(); ++I) { - Function *F = cast<Function>(Functions[I]->getGlobal()); - bool IsDefinition = Functions[I]->isDefinition(); - - Constant *CombinedGlobalElemPtr = ConstantExpr::getBitCast( - ConstantExpr::getInBoundsGetElementPtr( - JumpTableType, JumpTable, - ArrayRef<Constant *>{ConstantInt::get(IntPtrTy, 0), - ConstantInt::get(IntPtrTy, I)}), - F->getType()); - if (Functions[I]->isExported()) { - if (IsDefinition) { - ExportSummary->cfiFunctionDefs().insert(F->getName()); + { + ScopedSaveAliaseesAndUsed S(M); + + // Build aliases pointing to offsets into the jump table, and replace + // references to the original functions with references to the aliases. + for (unsigned I = 0; I != Functions.size(); ++I) { + Function *F = cast<Function>(Functions[I]->getGlobal()); + bool IsJumpTableCanonical = Functions[I]->isJumpTableCanonical(); + + Constant *CombinedGlobalElemPtr = ConstantExpr::getBitCast( + ConstantExpr::getInBoundsGetElementPtr( + JumpTableType, JumpTable, + ArrayRef<Constant *>{ConstantInt::get(IntPtrTy, 0), + ConstantInt::get(IntPtrTy, I)}), + F->getType()); + if (Functions[I]->isExported()) { + if (IsJumpTableCanonical) { + ExportSummary->cfiFunctionDefs().insert(F->getName()); + } else { + GlobalAlias *JtAlias = GlobalAlias::create( + F->getValueType(), 0, GlobalValue::ExternalLinkage, + F->getName() + ".cfi_jt", CombinedGlobalElemPtr, &M); + JtAlias->setVisibility(GlobalValue::HiddenVisibility); + ExportSummary->cfiFunctionDecls().insert(F->getName()); + } + } + if (!IsJumpTableCanonical) { + if (F->hasExternalWeakLinkage()) + replaceWeakDeclarationWithJumpTablePtr(F, CombinedGlobalElemPtr, + IsJumpTableCanonical); + else + replaceCfiUses(F, CombinedGlobalElemPtr, IsJumpTableCanonical); } else { - GlobalAlias *JtAlias = GlobalAlias::create( - F->getValueType(), 0, GlobalValue::ExternalLinkage, - F->getName() + ".cfi_jt", CombinedGlobalElemPtr, &M); - JtAlias->setVisibility(GlobalValue::HiddenVisibility); - ExportSummary->cfiFunctionDecls().insert(F->getName()); + assert(F->getType()->getAddressSpace() == 0); + + GlobalAlias *FAlias = + GlobalAlias::create(F->getValueType(), 0, F->getLinkage(), "", + CombinedGlobalElemPtr, &M); + FAlias->setVisibility(F->getVisibility()); + FAlias->takeName(F); + if (FAlias->hasName()) + F->setName(FAlias->getName() + ".cfi"); + replaceCfiUses(F, FAlias, IsJumpTableCanonical); + if (!F->hasLocalLinkage()) + F->setVisibility(GlobalVariable::HiddenVisibility); } } - if (!IsDefinition) { - if (F->isWeakForLinker()) - replaceWeakDeclarationWithJumpTablePtr(F, CombinedGlobalElemPtr, IsDefinition); - else - replaceCfiUses(F, CombinedGlobalElemPtr, IsDefinition); - } else { - assert(F->getType()->getAddressSpace() == 0); - - GlobalAlias *FAlias = GlobalAlias::create( - F->getValueType(), 0, F->getLinkage(), "", CombinedGlobalElemPtr, &M); - FAlias->setVisibility(F->getVisibility()); - FAlias->takeName(F); - if (FAlias->hasName()) - F->setName(FAlias->getName() + ".cfi"); - replaceCfiUses(F, FAlias, IsDefinition); - if (!F->hasLocalLinkage()) - F->setVisibility(GlobalVariable::HiddenVisibility); - } } createJumpTable(JumpTableFn, Functions); @@ -1623,7 +1690,7 @@ bool LowerTypeTestsModule::runForTesting(Module &M) { ExitOnError ExitOnErr("-lowertypetests-write-summary: " + ClWriteSummary + ": "); std::error_code EC; - raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::F_Text); + raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::OF_Text); ExitOnErr(errorCodeToError(EC)); yaml::Output Out(OS); @@ -1643,7 +1710,8 @@ static 
bool isDirectCall(Use& U) { return false; } -void LowerTypeTestsModule::replaceCfiUses(Function *Old, Value *New, bool IsDefinition) { +void LowerTypeTestsModule::replaceCfiUses(Function *Old, Value *New, + bool IsJumpTableCanonical) { SmallSetVector<Constant *, 4> Constants; auto UI = Old->use_begin(), E = Old->use_end(); for (; UI != E;) { @@ -1655,7 +1723,7 @@ void LowerTypeTestsModule::replaceCfiUses(Function *Old, Value *New, bool IsDefi continue; // Skip direct calls to externally defined or non-dso_local functions - if (isDirectCall(U) && (Old->isDSOLocal() || !IsDefinition)) + if (isDirectCall(U) && (Old->isDSOLocal() || !IsJumpTableCanonical)) continue; // Must handle Constants specially, we cannot call replaceUsesOfWith on a @@ -1678,16 +1746,7 @@ void LowerTypeTestsModule::replaceCfiUses(Function *Old, Value *New, bool IsDefi } void LowerTypeTestsModule::replaceDirectCalls(Value *Old, Value *New) { - auto UI = Old->use_begin(), E = Old->use_end(); - for (; UI != E;) { - Use &U = *UI; - ++UI; - - if (!isDirectCall(U)) - continue; - - U.set(New); - } + Old->replaceUsesWithIf(New, [](Use &U) { return isDirectCall(U); }); } bool LowerTypeTestsModule::lower() { @@ -1734,10 +1793,16 @@ bool LowerTypeTestsModule::lower() { Decls.push_back(&F); } - for (auto F : Defs) - importFunction(F, /*isDefinition*/ true); - for (auto F : Decls) - importFunction(F, /*isDefinition*/ false); + std::vector<GlobalAlias *> AliasesToErase; + { + ScopedSaveAliaseesAndUsed S(M); + for (auto F : Defs) + importFunction(F, /*isJumpTableCanonical*/ true, AliasesToErase); + for (auto F : Decls) + importFunction(F, /*isJumpTableCanonical*/ false, AliasesToErase); + } + for (GlobalAlias *GA : AliasesToErase) + GA->eraseFromParent(); return true; } @@ -1746,7 +1811,7 @@ bool LowerTypeTestsModule::lower() { // reference them. This is used to partition the set of type identifiers in // the module into disjoint sets. using GlobalClassesTy = EquivalenceClasses< - PointerUnion3<GlobalTypeMember *, Metadata *, ICallBranchFunnel *>>; + PointerUnion<GlobalTypeMember *, Metadata *, ICallBranchFunnel *>>; GlobalClassesTy GlobalClasses; // Verify the type metadata and build a few data structures to let us @@ -1823,6 +1888,17 @@ bool LowerTypeTestsModule::lower() { CfiFunctionLinkage Linkage = P.second.Linkage; MDNode *FuncMD = P.second.FuncMD; Function *F = M.getFunction(FunctionName); + if (F && F->hasLocalLinkage()) { + // Locally defined function that happens to have the same name as a + // function defined in a ThinLTO module. Rename it to move it out of + // the way of the external reference that we're about to create. + // Note that setName will find a unique name for the function, so even + // if there is an existing function with the suffix there won't be a + // name collision. 
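The replaceDirectCalls hunk above collapses a hand-written use-list walk into a single Old->replaceUsesWithIf(New, predicate) call. A standalone sketch of that predicate-filtered replacement over a toy value/use model (this is not LLVM's Use/Value machinery, just the same shape):

#include <functional>
#include <iostream>
#include <vector>

struct Value;
struct Use { Value *V; bool IsDirectCall; };

struct Value {
  std::vector<Use *> Uses;
  // Rewrite only the uses the predicate accepts; leave the rest alone.
  void replaceUsesWithIf(Value *New, const std::function<bool(Use &)> &ShouldReplace) {
    for (auto It = Uses.begin(); It != Uses.end();) {
      Use *U = *It;
      if (ShouldReplace(*U)) {
        U->V = New;
        New->Uses.push_back(U);
        It = Uses.erase(It); // the use now belongs to New's use list
      } else {
        ++It;
      }
    }
  }
};

int main() {
  Value Old, New;
  Use Call{&Old, /*IsDirectCall=*/true}, Addr{&Old, /*IsDirectCall=*/false};
  Old.Uses = {&Call, &Addr};
  Old.replaceUsesWithIf(&New, [](Use &U) { return U.IsDirectCall; });
  std::cout << "old uses left: " << Old.Uses.size()      // 1: the address-taken use
            << ", new uses: " << New.Uses.size() << "\n"; // 1: the direct call
}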
+ F->setName(F->getName() + ".1"); + F = nullptr; + } + if (!F) F = Function::Create( FunctionType::get(Type::getVoidTy(M.getContext()), false), @@ -1871,24 +1947,26 @@ bool LowerTypeTestsModule::lower() { Types.clear(); GO.getMetadata(LLVMContext::MD_type, Types); - bool IsDefinition = !GO.isDeclarationForLinker(); + bool IsJumpTableCanonical = false; bool IsExported = false; if (Function *F = dyn_cast<Function>(&GO)) { + IsJumpTableCanonical = isJumpTableCanonical(F); if (ExportedFunctions.count(F->getName())) { - IsDefinition |= ExportedFunctions[F->getName()].Linkage == CFL_Definition; + IsJumpTableCanonical |= + ExportedFunctions[F->getName()].Linkage == CFL_Definition; IsExported = true; // TODO: The logic here checks only that the function is address taken, // not that the address takers are live. This can be updated to check // their liveness and emit fewer jumptable entries once monolithic LTO // builds also emit summaries. } else if (!F->hasAddressTaken()) { - if (!CrossDsoCfi || !IsDefinition || F->hasLocalLinkage()) + if (!CrossDsoCfi || !IsJumpTableCanonical || F->hasLocalLinkage()) continue; } } - auto *GTM = - GlobalTypeMember::create(Alloc, &GO, IsDefinition, IsExported, Types); + auto *GTM = GlobalTypeMember::create(Alloc, &GO, IsJumpTableCanonical, + IsExported, Types); GlobalTypeMembers[&GO] = GTM; for (MDNode *Type : Types) { verifyTypeMDNode(&GO, Type); diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/MergeFunctions.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/MergeFunctions.cpp index 3a08069dcd4a..06d2a2f31941 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/MergeFunctions.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/MergeFunctions.cpp @@ -115,12 +115,14 @@ #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/IR/ValueMap.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/MergeFunctions.h" #include "llvm/Transforms/Utils/FunctionComparator.h" #include <algorithm> #include <cassert> @@ -195,16 +197,12 @@ public: /// by considering all pointer types to be equivalent. Once identified, /// MergeFunctions will fold them by replacing a call to one to a call to a /// bitcast of the other. 
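The MergeFunctions comment just above describes the overall strategy: detect functions that are equivalent up to pointer types and fold one onto the other. To keep that affordable, the pass keys candidates by a cheap hash so only equal-hash functions ever see the expensive comparator. A standalone sketch of that bucket-then-compare idea over plain strings (the hash and equality here are toys standing in for FunctionComparator):

#include <functional>
#include <iostream>
#include <map>
#include <string>
#include <vector>

using Body = std::string;

static size_t cheapHash(const Body &B) { return std::hash<std::string>{}(B); }
static bool deepEqual(const Body &A, const Body &B) { return A == B; } // expensive in real life

// Group by hash first; only bodies sharing a hash are compared pairwise, and
// duplicates fold onto the first (canonical) body in their group.
std::vector<int> foldDuplicates(const std::vector<Body> &Funcs) {
  std::map<size_t, std::vector<int>> Buckets;
  std::vector<int> CanonicalOf(Funcs.size());
  for (int I = 0; I < (int)Funcs.size(); ++I) {
    CanonicalOf[I] = I;
    auto &Bucket = Buckets[cheapHash(Funcs[I])];
    for (int J : Bucket)
      if (deepEqual(Funcs[I], Funcs[J])) { CanonicalOf[I] = J; break; }
    if (CanonicalOf[I] == I)
      Bucket.push_back(I);
  }
  return CanonicalOf;
}

int main() {
  std::vector<Body> Funcs{"add", "mul", "add"};
  for (int C : foldDuplicates(Funcs))
    std::cout << C << ' '; // prints "0 1 0": the third body folds onto the first
  std::cout << '\n';
}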
-class MergeFunctions : public ModulePass { +class MergeFunctions { public: - static char ID; - - MergeFunctions() - : ModulePass(ID), FnTree(FunctionNodeCmp(&GlobalNumbers)) { - initializeMergeFunctionsPass(*PassRegistry::getPassRegistry()); + MergeFunctions() : FnTree(FunctionNodeCmp(&GlobalNumbers)) { } - bool runOnModule(Module &M) override; + bool runOnModule(Module &M); private: // The function comparison operator is provided here so that FunctionNodes do @@ -297,14 +295,39 @@ private: DenseMap<AssertingVH<Function>, FnTreeType::iterator> FNodesInTree; }; -} // end anonymous namespace +class MergeFunctionsLegacyPass : public ModulePass { +public: + static char ID; + + MergeFunctionsLegacyPass(): ModulePass(ID) { + initializeMergeFunctionsLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override { + if (skipModule(M)) + return false; -char MergeFunctions::ID = 0; + MergeFunctions MF; + return MF.runOnModule(M); + } +}; -INITIALIZE_PASS(MergeFunctions, "mergefunc", "Merge Functions", false, false) +} // end anonymous namespace + +char MergeFunctionsLegacyPass::ID = 0; +INITIALIZE_PASS(MergeFunctionsLegacyPass, "mergefunc", + "Merge Functions", false, false) ModulePass *llvm::createMergeFunctionsPass() { - return new MergeFunctions(); + return new MergeFunctionsLegacyPass(); +} + +PreservedAnalyses MergeFunctionsPass::run(Module &M, + ModuleAnalysisManager &AM) { + MergeFunctions MF; + if (!MF.runOnModule(M)) + return PreservedAnalyses::all(); + return PreservedAnalyses::none(); } #ifndef NDEBUG @@ -386,9 +409,6 @@ static bool isEligibleForMerging(Function &F) { } bool MergeFunctions::runOnModule(Module &M) { - if (skipModule(M)) - return false; - bool Changed = false; // All functions in the module, ordered by hash. Functions with a unique @@ -449,28 +469,10 @@ void MergeFunctions::replaceDirectCallers(Function *Old, Function *New) { ++UI; CallSite CS(U->getUser()); if (CS && CS.isCallee(U)) { - // Transfer the called function's attributes to the call site. Due to the - // bitcast we will 'lose' ABI changing attributes because the 'called - // function' is no longer a Function* but the bitcast. Code that looks up - // the attributes from the called function will fail. - - // FIXME: This is not actually true, at least not anymore. The callsite - // will always have the same ABI affecting attributes as the callee, - // because otherwise the original input has UB. Note that Old and New - // always have matching ABI, so no attributes need to be changed. - // Transferring other attributes may help other optimizations, but that - // should be done uniformly and not in this ad-hoc way. - auto &Context = New->getContext(); - auto NewPAL = New->getAttributes(); - SmallVector<AttributeSet, 4> NewArgAttrs; - for (unsigned argIdx = 0; argIdx < CS.arg_size(); argIdx++) - NewArgAttrs.push_back(NewPAL.getParamAttributes(argIdx)); - // Don't transfer attributes from the function to the callee. Function - // attributes typically aren't relevant to the calling convention or ABI. - CS.setAttributes(AttributeList::get(Context, /*FnAttrs=*/AttributeSet(), - NewPAL.getRetAttributes(), - NewArgAttrs)); - + // Do not copy attributes from the called function to the call-site. + // Function comparison ensures that the attributes are the same up to + // type congruences in byval(), in which case we need to keep the byval + // type of the call-site, not the callee function. 
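The MergeFunctions hunk above pulls the transform out of the ModulePass subclass into a plain class, then wraps it twice: a legacy-PM MergeFunctionsLegacyPass that forwards runOnModule, and a new-PM MergeFunctionsPass::run that converts the changed/unchanged result into PreservedAnalyses. A minimal sketch of that "one implementation, two pass-manager wrappers" shape; the Module and PreservedAnalyses types below are stubs, not the LLVM ones:

#include <iostream>

struct Module { int NumFunctions = 3; };
struct PreservedAnalyses {
  bool All;
  static PreservedAnalyses all() { return {true}; }
  static PreservedAnalyses none() { return {false}; }
};

// The transform lives in a plain class that knows nothing about pass managers,
// so both wrappers below can share it.
class MergeFunctionsImpl {
public:
  bool runOnModule(Module &M) { return M.NumFunctions > 1; } // "did we change anything"
};

// Legacy-PM style wrapper: just forwards.
class MergeFunctionsLegacyPass {
public:
  bool runOnModule(Module &M) {
    MergeFunctionsImpl MF;
    return MF.runOnModule(M);
  }
};

// New-PM style wrapper: same forwarding, different reporting convention.
class MergeFunctionsNewPMPass {
public:
  PreservedAnalyses run(Module &M) {
    MergeFunctionsImpl MF;
    return MF.runOnModule(M) ? PreservedAnalyses::none() : PreservedAnalyses::all();
  }
};

int main() {
  Module M;
  std::cout << MergeFunctionsLegacyPass().runOnModule(M) << ' '
            << !MergeFunctionsNewPMPass().run(M).All << '\n'; // both report a change: "1 1"
}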
remove(CS.getInstruction()->getFunction()); U->set(BitcastNew); } @@ -769,7 +771,7 @@ void MergeFunctions::writeAlias(Function *F, Function *G) { PtrType->getElementType(), PtrType->getAddressSpace(), G->getLinkage(), "", BitcastF, G->getParent()); - F->setAlignment(std::max(F->getAlignment(), G->getAlignment())); + F->setAlignment(MaybeAlign(std::max(F->getAlignment(), G->getAlignment()))); GA->takeName(G); GA->setVisibility(G->getVisibility()); GA->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); @@ -816,7 +818,7 @@ void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) { removeUsers(F); F->replaceAllUsesWith(NewF); - unsigned MaxAlignment = std::max(G->getAlignment(), NewF->getAlignment()); + MaybeAlign MaxAlignment(std::max(G->getAlignment(), NewF->getAlignment())); writeThunkOrAlias(F, G); writeThunkOrAlias(F, NewF); diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp index 733782e8764d..cd3701e90308 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp @@ -42,6 +42,7 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/IR/User.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/BlockFrequency.h" #include "llvm/Support/BranchProbability.h" @@ -409,7 +410,7 @@ PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F, return std::unique_ptr<FunctionOutliningMultiRegionInfo>(); std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo = - llvm::make_unique<FunctionOutliningMultiRegionInfo>(); + std::make_unique<FunctionOutliningMultiRegionInfo>(); auto IsSingleEntry = [](SmallVectorImpl<BasicBlock *> &BlockList) { BasicBlock *Dom = BlockList.front(); @@ -589,7 +590,7 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) { }; std::unique_ptr<FunctionOutliningInfo> OutliningInfo = - llvm::make_unique<FunctionOutliningInfo>(); + std::make_unique<FunctionOutliningInfo>(); BasicBlock *CurrEntry = EntryBlock; bool CandidateFound = false; @@ -701,7 +702,7 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) { return OutliningInfo; } -// Check if there is PGO data or user annoated branch data: +// Check if there is PGO data or user annotated branch data: static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) { if (F->hasProfileData()) return true; @@ -966,7 +967,7 @@ PartialInlinerImpl::FunctionCloner::FunctionCloner( Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE, function_ref<AssumptionCache *(Function &)> LookupAC) : OrigFunc(F), ORE(ORE), LookupAC(LookupAC) { - ClonedOI = llvm::make_unique<FunctionOutliningInfo>(); + ClonedOI = std::make_unique<FunctionOutliningInfo>(); // Clone the function, so that we can hack away on it. ValueToValueMapTy VMap; @@ -991,7 +992,7 @@ PartialInlinerImpl::FunctionCloner::FunctionCloner( OptimizationRemarkEmitter &ORE, function_ref<AssumptionCache *(Function &)> LookupAC) : OrigFunc(F), ORE(ORE), LookupAC(LookupAC) { - ClonedOMRI = llvm::make_unique<FunctionOutliningMultiRegionInfo>(); + ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>(); // Clone the function, so that we can hack away on it. 
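Several hunks in this section (Inliner, MergeFunctions, LowerTypeTests) wrap raw unsigned alignments in Align or MaybeAlign before calling setAlignment. The point of those wrapper types, as this diff uses them, is to keep the old "0 means unspecified" convention while making it explicit in the type and enforcing that a specified alignment is a non-zero power of two. A simplified stand-in pair of types showing that distinction (not the actual llvm/Support/Alignment.h definitions):

#include <cassert>
#include <cstdint>
#include <iostream>
#include <optional>

// Align: always a non-zero power of two.
struct Align {
  uint64_t Value;
  explicit Align(uint64_t V) : Value(V) {
    assert(V != 0 && (V & (V - 1)) == 0 && "alignment must be a non-zero power of two");
  }
};

// MaybeAlign: additionally represents "not specified" (0 in the old unsigned API).
struct MaybeAlign : std::optional<Align> {
  MaybeAlign() = default;
  explicit MaybeAlign(uint64_t V) { if (V != 0) emplace(V); }
};

int main() {
  MaybeAlign FromZero(0), FromEight(8);
  std::cout << FromZero.has_value() << ' ' << FromEight->Value << '\n'; // "0 8"
}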
ValueToValueMapTy VMap; @@ -1122,6 +1123,9 @@ bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() { BranchProbabilityInfo BPI(*ClonedFunc, LI); ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI)); + // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time. + CodeExtractorAnalysisCache CEAC(*ClonedFunc); + SetVector<Value *> Inputs, Outputs, Sinks; for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo : ClonedOMRI->ORI) { @@ -1148,7 +1152,7 @@ bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() { if (Outputs.size() > 0 && !ForceLiveExit) continue; - Function *OutlinedFunc = CE.extractCodeRegion(); + Function *OutlinedFunc = CE.extractCodeRegion(CEAC); if (OutlinedFunc) { CallSite OCS = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc); @@ -1210,11 +1214,12 @@ PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() { } // Extract the body of the if. + CodeExtractorAnalysisCache CEAC(*ClonedFunc); Function *OutlinedFunc = CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc), /* AllowVarargs */ true) - .extractCodeRegion(); + .extractCodeRegion(CEAC); if (OutlinedFunc) { BasicBlock *OutliningCallBB = @@ -1264,7 +1269,7 @@ std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) { if (PSI->isFunctionEntryCold(F)) return {false, nullptr}; - if (empty(F->users())) + if (F->users().empty()) return {false, nullptr}; OptimizationRemarkEmitter ORE(F); @@ -1370,7 +1375,7 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) { return false; } - assert(empty(Cloner.OrigFunc->users()) && + assert(Cloner.OrigFunc->users().empty() && "F's users should all be replaced!"); std::vector<User *> Users(Cloner.ClonedFunc->user_begin(), diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index 3ea77f08fd3c..7cfc29f7bf7a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -13,6 +13,7 @@ #include "llvm/Transforms/IPO/PassManagerBuilder.h" #include "llvm-c/Transforms/PassManagerBuilder.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/CFLAndersAliasAnalysis.h" @@ -147,6 +148,10 @@ cl::opt<bool> EnableOrderFileInstrumentation( "enable-order-file-instrumentation", cl::init(false), cl::Hidden, cl::desc("Enable order file instrumentation (default = off)")); +static cl::opt<bool> + EnableMatrix("enable-matrix", cl::init(false), cl::Hidden, + cl::desc("Enable lowering of the matrix intrinsics")); + PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; SizeLevel = 0; @@ -183,8 +188,13 @@ PassManagerBuilder::~PassManagerBuilder() { } /// Set of global extensions, automatically added as part of the standard set. -static ManagedStatic<SmallVector<std::pair<PassManagerBuilder::ExtensionPointTy, - PassManagerBuilder::ExtensionFn>, 8> > GlobalExtensions; +static ManagedStatic< + SmallVector<std::tuple<PassManagerBuilder::ExtensionPointTy, + PassManagerBuilder::ExtensionFn, + PassManagerBuilder::GlobalExtensionID>, + 8>> + GlobalExtensions; +static PassManagerBuilder::GlobalExtensionID GlobalExtensionsCounter; /// Check if GlobalExtensions is constructed and not empty. 
/// Since GlobalExtensions is a managed static, calling 'empty()' will trigger @@ -193,10 +203,29 @@ static bool GlobalExtensionsNotEmpty() { return GlobalExtensions.isConstructed() && !GlobalExtensions->empty(); } -void PassManagerBuilder::addGlobalExtension( - PassManagerBuilder::ExtensionPointTy Ty, - PassManagerBuilder::ExtensionFn Fn) { - GlobalExtensions->push_back(std::make_pair(Ty, std::move(Fn))); +PassManagerBuilder::GlobalExtensionID +PassManagerBuilder::addGlobalExtension(PassManagerBuilder::ExtensionPointTy Ty, + PassManagerBuilder::ExtensionFn Fn) { + auto ExtensionID = GlobalExtensionsCounter++; + GlobalExtensions->push_back(std::make_tuple(Ty, std::move(Fn), ExtensionID)); + return ExtensionID; +} + +void PassManagerBuilder::removeGlobalExtension( + PassManagerBuilder::GlobalExtensionID ExtensionID) { + // RegisterStandardPasses may try to call this function after GlobalExtensions + // has already been destroyed; doing so should not generate an error. + if (!GlobalExtensions.isConstructed()) + return; + + auto GlobalExtension = + llvm::find_if(*GlobalExtensions, [ExtensionID](const auto &elem) { + return std::get<2>(elem) == ExtensionID; + }); + assert(GlobalExtension != GlobalExtensions->end() && + "The extension ID to be removed should always be valid."); + + GlobalExtensions->erase(GlobalExtension); } void PassManagerBuilder::addExtension(ExtensionPointTy Ty, ExtensionFn Fn) { @@ -207,8 +236,8 @@ void PassManagerBuilder::addExtensionsToPM(ExtensionPointTy ETy, legacy::PassManagerBase &PM) const { if (GlobalExtensionsNotEmpty()) { for (auto &Ext : *GlobalExtensions) { - if (Ext.first == ETy) - Ext.second(*this, PM); + if (std::get<0>(Ext) == ETy) + std::get<1>(Ext)(*this, PM); } } for (unsigned i = 0, e = Extensions.size(); i != e; ++i) @@ -320,19 +349,26 @@ void PassManagerBuilder::addFunctionSimplificationPasses( legacy::PassManagerBase &MPM) { // Start of function pass. // Break up aggregate allocas, using SSAUpdater. + assert(OptLevel >= 1 && "Calling function optimizer with no optimization level!"); MPM.add(createSROAPass()); MPM.add(createEarlyCSEPass(true /* Enable mem-ssa. */)); // Catch trivial redundancies - if (EnableGVNHoist) - MPM.add(createGVNHoistPass()); - if (EnableGVNSink) { - MPM.add(createGVNSinkPass()); - MPM.add(createCFGSimplificationPass()); + + if (OptLevel > 1) { + if (EnableGVNHoist) + MPM.add(createGVNHoistPass()); + if (EnableGVNSink) { + MPM.add(createGVNSinkPass()); + MPM.add(createCFGSimplificationPass()); + } } - // Speculative execution if the target has divergent branches; otherwise nop. - MPM.add(createSpeculativeExecutionIfHasBranchDivergencePass()); - MPM.add(createJumpThreadingPass()); // Thread jumps. - MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals + if (OptLevel > 1) { + // Speculative execution if the target has divergent branches; otherwise nop. + MPM.add(createSpeculativeExecutionIfHasBranchDivergencePass()); + + MPM.add(createJumpThreadingPass()); // Thread jumps. + MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals + } MPM.add(createCFGSimplificationPass()); // Merge & remove BBs // Combine silly seq's if (OptLevel > 2) @@ -346,8 +382,10 @@ void PassManagerBuilder::addFunctionSimplificationPasses( if (SizeLevel == 0) MPM.add(createPGOMemOPSizeOptLegacyPass()); - MPM.add(createTailCallEliminationPass()); // Eliminate tail calls - MPM.add(createCFGSimplificationPass()); // Merge & remove BBs + // TODO: Investigate the cost/benefit of tail call elimination on debugging. 
+ if (OptLevel > 1) + MPM.add(createTailCallEliminationPass()); // Eliminate tail calls + MPM.add(createCFGSimplificationPass()); // Merge & remove BBs MPM.add(createReassociatePass()); // Reassociate expressions // Begin the loop pass pipeline. @@ -360,6 +398,7 @@ void PassManagerBuilder::addFunctionSimplificationPasses( } // Rotate Loop - disable header duplication at -Oz MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1)); + // TODO: Investigate promotion cap for O1. MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); if (EnableSimpleLoopUnswitch) MPM.add(createSimpleLoopUnswitchLegacyPass()); @@ -402,16 +441,19 @@ void PassManagerBuilder::addFunctionSimplificationPasses( // opened up by them. addInstructionCombiningPass(MPM); addExtensionsToPM(EP_Peephole, MPM); - MPM.add(createJumpThreadingPass()); // Thread jumps - MPM.add(createCorrelatedValuePropagationPass()); - MPM.add(createDeadStoreEliminationPass()); // Delete dead stores - MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + if (OptLevel > 1) { + MPM.add(createJumpThreadingPass()); // Thread jumps + MPM.add(createCorrelatedValuePropagationPass()); + MPM.add(createDeadStoreEliminationPass()); // Delete dead stores + MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + } addExtensionsToPM(EP_ScalarOptimizerLate, MPM); if (RerollLoops) MPM.add(createLoopRerollPass()); + // TODO: Investigate if this is too expensive at O1. MPM.add(createAggressiveDCEPass()); // Delete dead instructions MPM.add(createCFGSimplificationPass()); // Merge & remove BBs // Clean up after everything. @@ -654,6 +696,15 @@ void PassManagerBuilder::populateModulePassManager( MPM.add(createGlobalsAAWrapperPass()); MPM.add(createFloat2IntPass()); + MPM.add(createLowerConstantIntrinsicsPass()); + + if (EnableMatrix) { + MPM.add(createLowerMatrixIntrinsicsPass()); + // CSE the pointer arithmetic of the column vectors. This allows alias + // analysis to establish no-aliasing between loads and stores of different + // columns of the same matrix. + MPM.add(createEarlyCSEPass(false)); + } addExtensionsToPM(EP_VectorizerStart, MPM); @@ -898,7 +949,8 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { // LTO provides additional opportunities for tailcall elimination due to // link-time inlining, and visibility of nocapture attribute. - PM.add(createTailCallEliminationPass()); + if (OptLevel > 1) + PM.add(createTailCallEliminationPass()); // Infer attributes on declarations, call sites, arguments, etc. PM.add(createPostOrderFunctionAttrsLegacyPass()); // Add nocapture. 
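The PassManagerBuilder hunks above turn the global extension list into tuples of (extension point, callback, ID): addGlobalExtension now returns a GlobalExtensionID minted from a counter, and removeGlobalExtension erases the matching tuple so RegisterStandardPasses-style registrations can be undone. A standalone sketch of that handle-based registry (plain globals here rather than a ManagedStatic; the names are illustrative):

#include <algorithm>
#include <cstdint>
#include <functional>
#include <iostream>
#include <tuple>
#include <vector>

using ExtensionFn = std::function<void(int /*ExtensionPoint*/)>;
using ExtensionID = uint64_t;

static std::vector<std::tuple<int, ExtensionFn, ExtensionID>> Extensions;
static ExtensionID NextID = 0;

ExtensionID addExtension(int Point, ExtensionFn Fn) {
  ExtensionID ID = NextID++;
  Extensions.emplace_back(Point, std::move(Fn), ID);
  return ID; // handle the caller can use to unregister later
}

void removeExtension(ExtensionID ID) {
  auto It = std::find_if(Extensions.begin(), Extensions.end(),
                         [ID](const auto &E) { return std::get<2>(E) == ID; });
  if (It != Extensions.end())
    Extensions.erase(It);
}

void runExtensions(int Point) {
  for (const auto &E : Extensions)
    if (std::get<0>(E) == Point)
      std::get<1>(E)(Point);
}

int main() {
  ExtensionID A = addExtension(1, [](int) { std::cout << "A "; });
  addExtension(1, [](int) { std::cout << "B "; });
  runExtensions(1);   // A B
  removeExtension(A); // unregister via the ID handed back at registration
  runExtensions(1);   // B
  std::cout << '\n';
}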
diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/PruneEH.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/PruneEH.cpp index cb3915dfb678..45a0ce20eb17 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/PruneEH.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/PruneEH.cpp @@ -18,15 +18,16 @@ #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/Analysis/EHPersonalities.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/SCCP.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/SCCP.cpp index 7be3608bd2ec..fdffffba0c2d 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/SCCP.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/SCCP.cpp @@ -2,6 +2,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/InitializePasses.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Scalar/SCCP.h" @@ -9,16 +10,18 @@ using namespace llvm; PreservedAnalyses IPSCCPPass::run(Module &M, ModuleAnalysisManager &AM) { const DataLayout &DL = M.getDataLayout(); - auto &TLI = AM.getResult<TargetLibraryAnalysis>(M); auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + auto GetTLI = [&FAM](Function &F) -> const TargetLibraryInfo & { + return FAM.getResult<TargetLibraryAnalysis>(F); + }; auto getAnalysis = [&FAM](Function &F) -> AnalysisResultsForFn { DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F); return { - make_unique<PredicateInfo>(F, DT, FAM.getResult<AssumptionAnalysis>(F)), + std::make_unique<PredicateInfo>(F, DT, FAM.getResult<AssumptionAnalysis>(F)), &DT, FAM.getCachedResult<PostDominatorTreeAnalysis>(F)}; }; - if (!runIPSCCP(M, DL, &TLI, getAnalysis)) + if (!runIPSCCP(M, DL, GetTLI, getAnalysis)) return PreservedAnalyses::all(); PreservedAnalyses PA; @@ -47,14 +50,14 @@ public: if (skipModule(M)) return false; const DataLayout &DL = M.getDataLayout(); - const TargetLibraryInfo *TLI = - &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - + auto GetTLI = [this](Function &F) -> const TargetLibraryInfo & { + return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); + }; auto getAnalysis = [this](Function &F) -> AnalysisResultsForFn { DominatorTree &DT = this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree(); return { - make_unique<PredicateInfo>( + std::make_unique<PredicateInfo>( F, DT, this->getAnalysis<AssumptionCacheTracker>().getAssumptionCache( F)), @@ -62,7 +65,7 @@ public: nullptr}; // manager, so set them to nullptr. 
}; - return runIPSCCP(M, DL, TLI, getAnalysis); + return runIPSCCP(M, DL, GetTLI, getAnalysis); } void getAnalysisUsage(AnalysisUsage &AU) const override { diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp index 877d20e72ffc..a1fbb1adc412 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -26,13 +26,17 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/None.h" +#include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" @@ -57,6 +61,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/ValueSymbolTable.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/ProfileData/SampleProf.h" @@ -72,6 +77,7 @@ #include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Utils/CallPromotionUtils.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/MisExpect.h" #include <algorithm> #include <cassert> #include <cstdint> @@ -79,6 +85,7 @@ #include <limits> #include <map> #include <memory> +#include <queue> #include <string> #include <system_error> #include <utility> @@ -88,6 +95,12 @@ using namespace llvm; using namespace sampleprof; using ProfileCount = Function::ProfileCount; #define DEBUG_TYPE "sample-profile" +#define CSINLINE_DEBUG DEBUG_TYPE "-inline" + +STATISTIC(NumCSInlined, + "Number of functions inlined with context sensitive profile"); +STATISTIC(NumCSNotInlined, + "Number of functions not inlined with context sensitive profile"); // Command line option to specify the file to read samples from. This is // mainly used for debugging. @@ -128,6 +141,31 @@ static cl::opt<bool> ProfileSampleAccurate( "callsite and function as having 0 samples. Otherwise, treat " "un-sampled callsites and functions conservatively as unknown. ")); +static cl::opt<bool> ProfileAccurateForSymsInList( + "profile-accurate-for-symsinlist", cl::Hidden, cl::ZeroOrMore, + cl::init(true), + cl::desc("For symbols in profile symbol list, regard their profiles to " + "be accurate. It may be overriden by profile-sample-accurate. 
")); + +static cl::opt<bool> ProfileMergeInlinee( + "sample-profile-merge-inlinee", cl::Hidden, cl::init(false), + cl::desc("Merge past inlinee's profile to outline version if sample " + "profile loader decided not to inline a call site.")); + +static cl::opt<bool> ProfileTopDownLoad( + "sample-profile-top-down-load", cl::Hidden, cl::init(false), + cl::desc("Do profile annotation and inlining for functions in top-down " + "order of call graph during sample profile loading.")); + +static cl::opt<bool> ProfileSizeInline( + "sample-profile-inline-size", cl::Hidden, cl::init(false), + cl::desc("Inline cold call sites in profile loader if it's beneficial " + "for code size.")); + +static cl::opt<int> SampleColdCallSiteThreshold( + "sample-profile-cold-inline-threshold", cl::Hidden, cl::init(45), + cl::desc("Threshold for inlining cold callsites")); + namespace { using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>; @@ -137,9 +175,11 @@ using EdgeWeightMap = DenseMap<Edge, uint64_t>; using BlockEdgeMap = DenseMap<const BasicBlock *, SmallVector<const BasicBlock *, 8>>; +class SampleProfileLoader; + class SampleCoverageTracker { public: - SampleCoverageTracker() = default; + SampleCoverageTracker(SampleProfileLoader &SPL) : SPLoader(SPL){}; bool markSamplesUsed(const FunctionSamples *FS, uint32_t LineOffset, uint32_t Discriminator, uint64_t Samples); @@ -185,6 +225,76 @@ private: /// keyed by FunctionSamples pointers, but these stats are cleared after /// every function, so we just need to keep a single counter. uint64_t TotalUsedSamples = 0; + + SampleProfileLoader &SPLoader; +}; + +class GUIDToFuncNameMapper { +public: + GUIDToFuncNameMapper(Module &M, SampleProfileReader &Reader, + DenseMap<uint64_t, StringRef> &GUIDToFuncNameMap) + : CurrentReader(Reader), CurrentModule(M), + CurrentGUIDToFuncNameMap(GUIDToFuncNameMap) { + if (CurrentReader.getFormat() != SPF_Compact_Binary) + return; + + for (const auto &F : CurrentModule) { + StringRef OrigName = F.getName(); + CurrentGUIDToFuncNameMap.insert( + {Function::getGUID(OrigName), OrigName}); + + // Local to global var promotion used by optimization like thinlto + // will rename the var and add suffix like ".llvm.xxx" to the + // original local name. In sample profile, the suffixes of function + // names are all stripped. Since it is possible that the mapper is + // built in post-thin-link phase and var promotion has been done, + // we need to add the substring of function name without the suffix + // into the GUIDToFuncNameMap. + StringRef CanonName = FunctionSamples::getCanonicalFnName(F); + if (CanonName != OrigName) + CurrentGUIDToFuncNameMap.insert( + {Function::getGUID(CanonName), CanonName}); + } + + // Update GUIDToFuncNameMap for each function including inlinees. + SetGUIDToFuncNameMapForAll(&CurrentGUIDToFuncNameMap); + } + + ~GUIDToFuncNameMapper() { + if (CurrentReader.getFormat() != SPF_Compact_Binary) + return; + + CurrentGUIDToFuncNameMap.clear(); + + // Reset GUIDToFuncNameMap for of each function as they're no + // longer valid at this point. 
+ SetGUIDToFuncNameMapForAll(nullptr); + } + +private: + void SetGUIDToFuncNameMapForAll(DenseMap<uint64_t, StringRef> *Map) { + std::queue<FunctionSamples *> FSToUpdate; + for (auto &IFS : CurrentReader.getProfiles()) { + FSToUpdate.push(&IFS.second); + } + + while (!FSToUpdate.empty()) { + FunctionSamples *FS = FSToUpdate.front(); + FSToUpdate.pop(); + FS->GUIDToFuncNameMap = Map; + for (const auto &ICS : FS->getCallsiteSamples()) { + const FunctionSamplesMap &FSMap = ICS.second; + for (auto &IFS : FSMap) { + FunctionSamples &FS = const_cast<FunctionSamples &>(IFS.second); + FSToUpdate.push(&FS); + } + } + } + } + + SampleProfileReader &CurrentReader; + Module &CurrentModule; + DenseMap<uint64_t, StringRef> &CurrentGUIDToFuncNameMap; }; /// Sample profile pass. @@ -199,16 +309,19 @@ public: std::function<AssumptionCache &(Function &)> GetAssumptionCache, std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo) : GetAC(std::move(GetAssumptionCache)), - GetTTI(std::move(GetTargetTransformInfo)), Filename(Name), - RemappingFilename(RemapName), IsThinLTOPreLink(IsThinLTOPreLink) {} + GetTTI(std::move(GetTargetTransformInfo)), CoverageTracker(*this), + Filename(Name), RemappingFilename(RemapName), + IsThinLTOPreLink(IsThinLTOPreLink) {} bool doInitialization(Module &M); bool runOnModule(Module &M, ModuleAnalysisManager *AM, - ProfileSummaryInfo *_PSI); + ProfileSummaryInfo *_PSI, CallGraph *CG); void dump() { Reader->dump(); } protected: + friend class SampleCoverageTracker; + bool runOnFunction(Function &F, ModuleAnalysisManager *AM); unsigned getFunctionLoc(Function &F); bool emitAnnotations(Function &F); @@ -222,6 +335,10 @@ protected: bool inlineCallInstruction(Instruction *I); bool inlineHotFunctions(Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs); + // Inline cold/small functions in addition to hot ones + bool shouldInlineColdCallee(Instruction &CallInst); + void emitOptimizationRemarksForInlineCandidates( + const SmallVector<Instruction *, 10> &Candidates, const Function &F, bool Hot); void printEdgeWeight(raw_ostream &OS, Edge E); void printBlockWeight(raw_ostream &OS, const BasicBlock *BB) const; void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB); @@ -234,9 +351,12 @@ protected: void propagateWeights(Function &F); uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge); void buildEdges(Function &F); + std::vector<Function *> buildFunctionOrder(Module &M, CallGraph *CG); bool propagateThroughEdges(Function &F, bool UpdateBlockCount); void computeDominanceAndLoopInfo(Function &F); void clearFunctionData(); + bool callsiteIsHot(const FunctionSamples *CallsiteFS, + ProfileSummaryInfo *PSI); /// Map basic blocks to their computed weights. /// @@ -310,6 +430,10 @@ protected: /// Profile Summary Info computed from sample profile. ProfileSummaryInfo *PSI = nullptr; + /// Profle Symbol list tells whether a function name appears in the binary + /// used to generate the current profile. + std::unique_ptr<ProfileSymbolList> PSL; + /// Total number of samples collected in this profile. /// /// This is the sum of all the samples collected in all the functions executed @@ -326,6 +450,21 @@ protected: uint64_t entryCount; }; DenseMap<Function *, NotInlinedProfileInfo> notInlinedCallInfo; + + // GUIDToFuncNameMap saves the mapping from GUID to the symbol name, for + // all the function symbols defined or declared in current module. 
+ DenseMap<uint64_t, StringRef> GUIDToFuncNameMap; + + // All the Names used in FunctionSamples including outline function + // names, inline instance names and call target names. + StringSet<> NamesInProfile; + + // For symbol in profile symbol list, whether to regard their profiles + // to be accurate. It is mainly decided by existance of profile symbol + // list and -profile-accurate-for-symsinlist flag, but it can be + // overriden by -profile-sample-accurate or profile-sample-accurate + // attribute. + bool ProfAccForSymsInList; }; class SampleProfileLoaderLegacyPass : public ModulePass { @@ -381,14 +520,23 @@ private: /// To decide whether an inlined callsite is hot, we compare the callsite /// sample count with the hot cutoff computed by ProfileSummaryInfo, it is /// regarded as hot if the count is above the cutoff value. -static bool callsiteIsHot(const FunctionSamples *CallsiteFS, - ProfileSummaryInfo *PSI) { +/// +/// When ProfileAccurateForSymsInList is enabled and profile symbol list +/// is present, functions in the profile symbol list but without profile will +/// be regarded as cold and much less inlining will happen in CGSCC inlining +/// pass, so we tend to lower the hot criteria here to allow more early +/// inlining to happen for warm callsites and it is helpful for performance. +bool SampleProfileLoader::callsiteIsHot(const FunctionSamples *CallsiteFS, + ProfileSummaryInfo *PSI) { if (!CallsiteFS) return false; // The callsite was not inlined in the original binary. assert(PSI && "PSI is expected to be non null"); uint64_t CallsiteTotalSamples = CallsiteFS->getTotalSamples(); - return PSI->isHotCount(CallsiteTotalSamples); + if (ProfAccForSymsInList) + return !PSI->isColdCount(CallsiteTotalSamples); + else + return PSI->isHotCount(CallsiteTotalSamples); } /// Mark as used the sample record for the given function samples at @@ -425,7 +573,7 @@ SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS, for (const auto &I : FS->getCallsiteSamples()) for (const auto &J : I.second) { const FunctionSamples *CalleeSamples = &J.second; - if (callsiteIsHot(CalleeSamples, PSI)) + if (SPLoader.callsiteIsHot(CalleeSamples, PSI)) Count += countUsedRecords(CalleeSamples, PSI); } @@ -444,7 +592,7 @@ SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS, for (const auto &I : FS->getCallsiteSamples()) for (const auto &J : I.second) { const FunctionSamples *CalleeSamples = &J.second; - if (callsiteIsHot(CalleeSamples, PSI)) + if (SPLoader.callsiteIsHot(CalleeSamples, PSI)) Count += countBodyRecords(CalleeSamples, PSI); } @@ -465,7 +613,7 @@ SampleCoverageTracker::countBodySamples(const FunctionSamples *FS, for (const auto &I : FS->getCallsiteSamples()) for (const auto &J : I.second) { const FunctionSamples *CalleeSamples = &J.second; - if (callsiteIsHot(CalleeSamples, PSI)) + if (SPLoader.callsiteIsHot(CalleeSamples, PSI)) Total += countBodySamples(CalleeSamples, PSI); } @@ -756,21 +904,52 @@ bool SampleProfileLoader::inlineCallInstruction(Instruction *I) { getInlineCost(cast<CallBase>(*I), Params, GetTTI(*CalledFunction), GetAC, None, nullptr, nullptr); if (Cost.isNever()) { - ORE->emit(OptimizationRemark(DEBUG_TYPE, "Not inline", DLoc, BB) + ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "InlineFail", DLoc, BB) << "incompatible inlining"); return false; } InlineFunctionInfo IFI(nullptr, &GetAC); if (InlineFunction(CS, IFI)) { // The call to InlineFunction erases I, so we can't pass it here. 
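The rewritten callsiteIsHot above keeps the ProfileSummaryInfo cutoffs but, when profiles for symbols in the profile symbol list are treated as accurate, accepts any callsite that is not provably cold. A minimal sketch of that shape, with assumed cutoff values rather than real summary data:

#include <cstdint>
#include <iostream>

// Assumed stand-ins for the ProfileSummaryInfo cutoffs; the real values
// come from the profile summary.
constexpr uint64_t HotCutoff = 1000;
constexpr uint64_t ColdCutoff = 10;

static bool isHotCount(uint64_t C) { return C >= HotCutoff; }
static bool isColdCount(uint64_t C) { return C <= ColdCutoff; }

// Shape of the relaxed test: when listed symbols' profiles are trusted,
// a callsite that is not provably cold already qualifies for early inlining.
static bool callsiteIsHot(uint64_t TotalSamples, bool ProfAccForSymsInList) {
  if (ProfAccForSymsInList)
    return !isColdCount(TotalSamples);
  return isHotCount(TotalSamples);
}

int main() {
  std::cout << callsiteIsHot(500, false) << "\n"; // 0: warm, below the hot cutoff
  std::cout << callsiteIsHot(500, true) << "\n";  // 1: warm is now enough
}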
- ORE->emit(OptimizationRemark(DEBUG_TYPE, "HotInline", DLoc, BB) - << "inlined hot callee '" << ore::NV("Callee", CalledFunction) + ORE->emit(OptimizationRemark(CSINLINE_DEBUG, "InlineSuccess", DLoc, BB) + << "inlined callee '" << ore::NV("Callee", CalledFunction) << "' into '" << ore::NV("Caller", BB->getParent()) << "'"); return true; } return false; } +bool SampleProfileLoader::shouldInlineColdCallee(Instruction &CallInst) { + if (!ProfileSizeInline) + return false; + + Function *Callee = CallSite(&CallInst).getCalledFunction(); + if (Callee == nullptr) + return false; + + InlineCost Cost = + getInlineCost(cast<CallBase>(CallInst), getInlineParams(), + GetTTI(*Callee), GetAC, None, nullptr, nullptr); + + return Cost.getCost() <= SampleColdCallSiteThreshold; +} + +void SampleProfileLoader::emitOptimizationRemarksForInlineCandidates( + const SmallVector<Instruction *, 10> &Candidates, const Function &F, + bool Hot) { + for (auto I : Candidates) { + Function *CalledFunction = CallSite(I).getCalledFunction(); + if (CalledFunction) { + ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "InlineAttempt", + I->getDebugLoc(), I->getParent()) + << "previous inlining reattempted for " + << (Hot ? "hotness: '" : "size: '") + << ore::NV("Callee", CalledFunction) << "' into '" + << ore::NV("Caller", &F) << "'"); + } + } +} + /// Iteratively inline hot callsites of a function. /// /// Iteratively traverse all callsites of the function \p F, and find if @@ -788,6 +967,14 @@ bool SampleProfileLoader::inlineHotFunctions( Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) { DenseSet<Instruction *> PromotedInsns; + // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure + // Profile symbol list is ignored when profile-sample-accurate is on. 
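The size-based gate added in shouldInlineColdCallee above reduces to a threshold comparison on the estimated inline cost. Sketched standalone with the default value from the new -sample-profile-cold-inline-threshold option:

#include <iostream>

constexpr int SampleColdCallSiteThreshold = 45; // default from the new option

// A cold callsite is only inlined for size when the feature is enabled and
// the estimated inline cost stays at or below the threshold.
static bool shouldInlineColdCallee(int EstimatedInlineCost,
                                   bool ProfileSizeInline) {
  if (!ProfileSizeInline)
    return false;
  return EstimatedInlineCost <= SampleColdCallSiteThreshold;
}

int main() {
  std::cout << shouldInlineColdCallee(30, true) << "\n";  // 1: small enough
  std::cout << shouldInlineColdCallee(120, true) << "\n"; // 0: too large
  std::cout << shouldInlineColdCallee(30, false) << "\n"; // 0: feature off
}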
+ assert((!ProfAccForSymsInList || + (!ProfileSampleAccurate && + !F.hasFnAttribute("profile-sample-accurate"))) && + "ProfAccForSymsInList should be false when profile-sample-accurate " + "is enabled"); + DenseMap<Instruction *, const FunctionSamples *> localNotInlinedCallSites; bool Changed = false; while (true) { @@ -795,20 +982,28 @@ bool SampleProfileLoader::inlineHotFunctions( SmallVector<Instruction *, 10> CIS; for (auto &BB : F) { bool Hot = false; - SmallVector<Instruction *, 10> Candidates; + SmallVector<Instruction *, 10> AllCandidates; + SmallVector<Instruction *, 10> ColdCandidates; for (auto &I : BB.getInstList()) { const FunctionSamples *FS = nullptr; if ((isa<CallInst>(I) || isa<InvokeInst>(I)) && !isa<IntrinsicInst>(I) && (FS = findCalleeFunctionSamples(I))) { - Candidates.push_back(&I); + AllCandidates.push_back(&I); if (FS->getEntrySamples() > 0) localNotInlinedCallSites.try_emplace(&I, FS); if (callsiteIsHot(FS, PSI)) Hot = true; + else if (shouldInlineColdCallee(I)) + ColdCandidates.push_back(&I); } } if (Hot) { - CIS.insert(CIS.begin(), Candidates.begin(), Candidates.end()); + CIS.insert(CIS.begin(), AllCandidates.begin(), AllCandidates.end()); + emitOptimizationRemarksForInlineCandidates(AllCandidates, F, true); + } + else { + CIS.insert(CIS.begin(), ColdCandidates.begin(), ColdCandidates.end()); + emitOptimizationRemarksForInlineCandidates(ColdCandidates, F, false); } } for (auto I : CIS) { @@ -854,6 +1049,7 @@ bool SampleProfileLoader::inlineHotFunctions( inlineCallInstruction(DI)) { localNotInlinedCallSites.erase(I); LocalChanged = true; + ++NumCSInlined; } } else { LLVM_DEBUG(dbgs() @@ -866,6 +1062,7 @@ bool SampleProfileLoader::inlineHotFunctions( if (inlineCallInstruction(I)) { localNotInlinedCallSites.erase(I); LocalChanged = true; + ++NumCSInlined; } } else if (IsThinLTOPreLink) { findCalleeFunctionSamples(*I)->findInlinedFunctions( @@ -885,10 +1082,35 @@ bool SampleProfileLoader::inlineHotFunctions( Function *Callee = CallSite(I).getCalledFunction(); if (!Callee || Callee->isDeclaration()) continue; + + ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "NotInline", + I->getDebugLoc(), I->getParent()) + << "previous inlining not repeated: '" + << ore::NV("Callee", Callee) << "' into '" + << ore::NV("Caller", &F) << "'"); + + ++NumCSNotInlined; const FunctionSamples *FS = Pair.getSecond(); - auto pair = - notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0}); - pair.first->second.entryCount += FS->getEntrySamples(); + if (FS->getTotalSamples() == 0 && FS->getEntrySamples() == 0) { + continue; + } + + if (ProfileMergeInlinee) { + // Use entry samples as head samples during the merge, as inlinees + // don't have head samples. + assert(FS->getHeadSamples() == 0 && "Expect 0 head sample for inlinee"); + const_cast<FunctionSamples *>(FS)->addHeadSamples(FS->getEntrySamples()); + + // Note that we have to do the merge right after processing function. + // This allows OutlineFS's profile to be used for annotation during + // top-down processing of functions' annotation. + FunctionSamples *OutlineFS = Reader->getOrCreateSamplesFor(*Callee); + OutlineFS->merge(*FS); + } else { + auto pair = + notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0}); + pair.first->second.entryCount += FS->getEntrySamples(); + } } return Changed; } @@ -1219,17 +1441,12 @@ void SampleProfileLoader::buildEdges(Function &F) { } /// Returns the sorted CallTargetMap \p M by count in descending order. 
-static SmallVector<InstrProfValueData, 2> SortCallTargets( - const SampleRecord::CallTargetMap &M) { +static SmallVector<InstrProfValueData, 2> GetSortedValueDataFromCallTargets( + const SampleRecord::CallTargetMap & M) { SmallVector<InstrProfValueData, 2> R; - for (auto I = M.begin(); I != M.end(); ++I) - R.push_back({FunctionSamples::getGUID(I->getKey()), I->getValue()}); - llvm::sort(R, [](const InstrProfValueData &L, const InstrProfValueData &R) { - if (L.Count == R.Count) - return L.Value > R.Value; - else - return L.Count > R.Count; - }); + for (const auto &I : SampleRecord::SortCallTargets(M)) { + R.emplace_back(InstrProfValueData{FunctionSamples::getGUID(I.first), I.second}); + } return R; } @@ -1324,7 +1541,7 @@ void SampleProfileLoader::propagateWeights(Function &F) { if (!T || T.get().empty()) continue; SmallVector<InstrProfValueData, 2> SortedCallTargets = - SortCallTargets(T.get()); + GetSortedValueDataFromCallTargets(T.get()); uint64_t Sum; findIndirectCallFunctionSamples(I, Sum); annotateValueSite(*I.getParent()->getParent()->getParent(), I, @@ -1374,6 +1591,8 @@ void SampleProfileLoader::propagateWeights(Function &F) { } } + misexpect::verifyMisExpect(TI, Weights, TI->getContext()); + uint64_t TempWeight; // Only set weights if there is at least one non-zero weight. // In any other case, let the analyzer set weights. @@ -1555,32 +1774,58 @@ INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile", "Sample Profile loader", false, false) +std::vector<Function *> +SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) { + std::vector<Function *> FunctionOrderList; + FunctionOrderList.reserve(M.size()); + + if (!ProfileTopDownLoad || CG == nullptr) { + for (Function &F : M) + if (!F.isDeclaration()) + FunctionOrderList.push_back(&F); + return FunctionOrderList; + } + + assert(&CG->getModule() == &M); + scc_iterator<CallGraph *> CGI = scc_begin(CG); + while (!CGI.isAtEnd()) { + for (CallGraphNode *node : *CGI) { + auto F = node->getFunction(); + if (F && !F->isDeclaration()) + FunctionOrderList.push_back(F); + } + ++CGI; + } + + std::reverse(FunctionOrderList.begin(), FunctionOrderList.end()); + return FunctionOrderList; +} + bool SampleProfileLoader::doInitialization(Module &M) { auto &Ctx = M.getContext(); - auto ReaderOrErr = SampleProfileReader::create(Filename, Ctx); + + std::unique_ptr<SampleProfileReaderItaniumRemapper> RemapReader; + auto ReaderOrErr = + SampleProfileReader::create(Filename, Ctx, RemappingFilename); if (std::error_code EC = ReaderOrErr.getError()) { std::string Msg = "Could not open profile: " + EC.message(); Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg)); return false; } Reader = std::move(ReaderOrErr.get()); - Reader->collectFuncsToUse(M); + Reader->collectFuncsFrom(M); ProfileIsValid = (Reader->read() == sampleprof_error::success); - - if (!RemappingFilename.empty()) { - // Apply profile remappings to the loaded profile data if requested. - // For now, we only support remapping symbols encoded using the Itanium - // C++ ABI's name mangling scheme. 
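With -sample-profile-top-down-load, buildFunctionOrder above walks the call graph's strongly connected components in post-order and reverses the list so callers are annotated before their callees. The toy below shows the same idea on an acyclic caller-to-callee graph; the real code uses scc_iterator so cycles are also handled.

#include <algorithm>
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

using ToyCallGraph = std::map<std::string, std::vector<std::string>>;

// Depth-first post-order: callees are emitted before their callers.
static void postOrder(const std::string &F, const ToyCallGraph &CG,
                      std::set<std::string> &Seen,
                      std::vector<std::string> &Out) {
  if (!Seen.insert(F).second)
    return;
  auto It = CG.find(F);
  if (It != CG.end())
    for (const auto &Callee : It->second)
      postOrder(Callee, CG, Seen, Out);
  Out.push_back(F);
}

int main() {
  ToyCallGraph CG = {{"main", {"a", "b"}}, {"a", {"leaf"}}, {"b", {"leaf"}}};
  std::set<std::string> Seen;
  std::vector<std::string> Order;
  for (const auto &KV : CG)
    postOrder(KV.first, CG, Seen, Order);
  // Reversing the post-order yields a top-down order: callers first.
  std::reverse(Order.begin(), Order.end());
  for (const auto &F : Order)
    std::cout << F << "\n"; // main, b, a, leaf
}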
- ReaderOrErr = SampleProfileReaderItaniumRemapper::create( - RemappingFilename, Ctx, std::move(Reader)); - if (std::error_code EC = ReaderOrErr.getError()) { - std::string Msg = "Could not open profile remapping file: " + EC.message(); - Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg)); - return false; - } - Reader = std::move(ReaderOrErr.get()); - ProfileIsValid = (Reader->read() == sampleprof_error::success); + PSL = Reader->getProfileSymbolList(); + + // While profile-sample-accurate is on, ignore symbol list. + ProfAccForSymsInList = + ProfileAccurateForSymsInList && PSL && !ProfileSampleAccurate; + if (ProfAccForSymsInList) { + NamesInProfile.clear(); + if (auto NameTable = Reader->getNameTable()) + NamesInProfile.insert(NameTable->begin(), NameTable->end()); } + return true; } @@ -1593,8 +1838,8 @@ ModulePass *llvm::createSampleProfileLoaderPass(StringRef Name) { } bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM, - ProfileSummaryInfo *_PSI) { - FunctionSamples::GUIDToFuncNameMapper Mapper(M); + ProfileSummaryInfo *_PSI, CallGraph *CG) { + GUIDToFuncNameMapper Mapper(M, *Reader, GUIDToFuncNameMap); if (!ProfileIsValid) return false; @@ -1628,11 +1873,11 @@ bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM, } bool retval = false; - for (auto &F : M) - if (!F.isDeclaration()) { - clearFunctionData(); - retval |= runOnFunction(F, AM); - } + for (auto F : buildFunctionOrder(M, CG)) { + assert(!F->isDeclaration()); + clearFunctionData(); + retval |= runOnFunction(*F, AM); + } // Account for cold calls not inlined.... for (const std::pair<Function *, NotInlinedProfileInfo> &pair : @@ -1647,23 +1892,52 @@ bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) { TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>(); ProfileSummaryInfo *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); - return SampleLoader.runOnModule(M, nullptr, PSI); + return SampleLoader.runOnModule(M, nullptr, PSI, nullptr); } bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) { - + DILocation2SampleMap.clear(); // By default the entry count is initialized to -1, which will be treated // conservatively by getEntryCount as the same as unknown (None). This is // to avoid newly added code to be treated as cold. If we have samples // this will be overwritten in emitAnnotations. - // If ProfileSampleAccurate is true or F has profile-sample-accurate - // attribute, initialize the entry count to 0 so callsites or functions - // unsampled will be treated as cold. - uint64_t initialEntryCount = - (ProfileSampleAccurate || F.hasFnAttribute("profile-sample-accurate")) - ? 0 - : -1; + uint64_t initialEntryCount = -1; + + ProfAccForSymsInList = ProfileAccurateForSymsInList && PSL; + if (ProfileSampleAccurate || F.hasFnAttribute("profile-sample-accurate")) { + // initialize all the function entry counts to 0. It means all the + // functions without profile will be regarded as cold. + initialEntryCount = 0; + // profile-sample-accurate is a user assertion which has a higher precedence + // than symbol list. When profile-sample-accurate is on, ignore symbol list. + ProfAccForSymsInList = false; + } + + // PSL -- profile symbol list include all the symbols in sampled binary. + // If ProfileAccurateForSymsInList is enabled, PSL is used to treat + // old functions without samples being cold, without having to worry + // about new and hot functions being mistakenly treated as cold. 
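The doInitialization and runOnFunction changes combine three signals when picking an initial entry count: the profile-sample-accurate assertion, membership in the profile symbol list, and whether the name appears anywhere in the profile. The helper below is an illustrative summary of that policy only; names are made up and the real pass also honors the function attribute and canonical names.

#include <cstdint>
#include <iostream>
#include <set>
#include <string>

static uint64_t initialEntryCount(const std::string &Fn,
                                  bool ProfileSampleAccurate,
                                  const std::set<std::string> *PSL,
                                  const std::set<std::string> &NamesInProfile) {
  uint64_t Count = static_cast<uint64_t>(-1); // unknown by default
  if (ProfileSampleAccurate)
    return 0; // profile asserted accurate: unsampled code is cold
  if (PSL && PSL->count(Fn)) {
    Count = 0; // present in the sampled binary but has no samples: cold
    if (NamesInProfile.count(Fn))
      Count = static_cast<uint64_t>(-1); // shows up in the profile: not cold
  }
  return Count;
}

int main() {
  std::set<std::string> PSL = {"old_helper", "warm_fn"};
  std::set<std::string> InProfile = {"warm_fn"};
  std::cout << initialEntryCount("old_helper", false, &PSL, InProfile) << "\n"; // 0
  std::cout << initialEntryCount("warm_fn", false, &PSL, InProfile) << "\n";    // max: unknown
  std::cout << initialEntryCount("new_fn", false, &PSL, InProfile) << "\n";     // max: unknown
}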
+ if (ProfAccForSymsInList) { + // Initialize the entry count to 0 for functions in the list. + if (PSL->contains(F.getName())) + initialEntryCount = 0; + + // Function in the symbol list but without sample will be regarded as + // cold. To minimize the potential negative performance impact it could + // have, we want to be a little conservative here saying if a function + // shows up in the profile, no matter as outline function, inline instance + // or call targets, treat the function as not being cold. This will handle + // the cases such as most callsites of a function are inlined in sampled + // binary but not inlined in current build (because of source code drift, + // imprecise debug information, or the callsites are all cold individually + // but not cold accumulatively...), so the outline function showing up as + // cold in sampled binary will actually not be cold after current build. + StringRef CanonName = FunctionSamples::getCanonicalFnName(F); + if (NamesInProfile.count(CanonName)) + initialEntryCount = -1; + } + F.setEntryCount(ProfileCount(initialEntryCount, Function::PCT_Real)); std::unique_ptr<OptimizationRemarkEmitter> OwnedORE; if (AM) { @@ -1672,7 +1946,7 @@ bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) .getManager(); ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); } else { - OwnedORE = make_unique<OptimizationRemarkEmitter>(&F); + OwnedORE = std::make_unique<OptimizationRemarkEmitter>(&F); ORE = OwnedORE.get(); } Samples = Reader->getSamplesFor(F); @@ -1699,10 +1973,12 @@ PreservedAnalyses SampleProfileLoaderPass::run(Module &M, : ProfileRemappingFileName, IsThinLTOPreLink, GetAssumptionCache, GetTTI); - SampleLoader.doInitialization(M); + if (!SampleLoader.doInitialization(M)) + return PreservedAnalyses::all(); ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M); - if (!SampleLoader.runOnModule(M, &AM, PSI)) + CallGraph &CG = AM.getResult<CallGraphAnalysis>(M); + if (!SampleLoader.runOnModule(M, &AM, PSI, &CG)) return PreservedAnalyses::all(); return PreservedAnalyses::none(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp index 106db3c8bd9d..655a7a404951 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp @@ -16,6 +16,7 @@ #include "llvm/Transforms/IPO/StripDeadPrototypes.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/IPO.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/StripSymbols.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/StripSymbols.cpp index 67a473612fc1..6ce00714523b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/StripSymbols.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/StripSymbols.cpp @@ -20,7 +20,6 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" @@ -28,8 +27,10 @@ #include "llvm/IR/Module.h" #include "llvm/IR/TypeFinder.h" #include "llvm/IR/ValueSymbolTable.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; namespace { diff --git 
a/contrib/llvm-project/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp index 24c476376c14..87a18171787f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp @@ -17,6 +17,7 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" +#include "llvm/InitializePasses.h" #include "llvm/Object/ModuleSymbolTable.h" #include "llvm/Pass.h" #include "llvm/Support/ScopedPrinter.h" @@ -24,6 +25,7 @@ #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/FunctionAttrs.h" #include "llvm/Transforms/IPO/FunctionImport.h" +#include "llvm/Transforms/IPO/LowerTypeTests.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/ModuleUtils.h" using namespace llvm; @@ -218,10 +220,18 @@ void splitAndWriteThinLTOBitcode( promoteTypeIds(M, ModuleId); - // Returns whether a global has attached type metadata. Such globals may - // participate in CFI or whole-program devirtualization, so they need to - // appear in the merged module instead of the thin LTO module. + // Returns whether a global or its associated global has attached type + // metadata. The former may participate in CFI or whole-program + // devirtualization, so they need to appear in the merged module instead of + // the thin LTO module. Similarly, globals that are associated with globals + // with type metadata need to appear in the merged module because they will + // reference the global's section directly. auto HasTypeMetadata = [](const GlobalObject *GO) { + if (MDNode *MD = GO->getMetadata(LLVMContext::MD_associated)) + if (auto *AssocVM = dyn_cast_or_null<ValueAsMetadata>(MD->getOperand(0))) + if (auto *AssocGO = dyn_cast<GlobalObject>(AssocVM->getValue())) + if (AssocGO->hasMetadata(LLVMContext::MD_type)) + return true; return GO->hasMetadata(LLVMContext::MD_type); }; @@ -315,9 +325,9 @@ void splitAndWriteThinLTOBitcode( SmallVector<Metadata *, 4> Elts; Elts.push_back(MDString::get(Ctx, F.getName())); CfiFunctionLinkage Linkage; - if (!F.isDeclarationForLinker()) + if (lowertypetests::isJumpTableCanonical(&F)) Linkage = CFL_Definition; - else if (F.isWeakForLinker()) + else if (F.hasExternalWeakLinkage()) Linkage = CFL_WeakDeclaration; else Linkage = CFL_Declaration; @@ -457,7 +467,7 @@ void writeThinLTOBitcode(raw_ostream &OS, raw_ostream *ThinLinkOS, // splitAndWriteThinLTOBitcode). Just always build it once via the // buildModuleSummaryIndex when Module(s) are ready. ProfileSummaryInfo PSI(M); - NewIndex = llvm::make_unique<ModuleSummaryIndex>( + NewIndex = std::make_unique<ModuleSummaryIndex>( buildModuleSummaryIndex(M, nullptr, &PSI)); Index = NewIndex.get(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index 6b6dd6194e17..5ccfb29b01a1 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -24,12 +24,14 @@ // returns 0, or a single vtable's function returns 1, replace each virtual // call with a comparison of the vptr against that vtable's address. // -// This pass is intended to be used during the regular and thin LTO pipelines. 
+// This pass is intended to be used during the regular and thin LTO pipelines: +// // During regular LTO, the pass determines the best optimization for each // virtual call and applies the resolutions directly to virtual calls that are // eligible for virtual call optimization (i.e. calls that use either of the -// llvm.assume(llvm.type.test) or llvm.type.checked.load intrinsics). During -// ThinLTO, the pass operates in two phases: +// llvm.assume(llvm.type.test) or llvm.type.checked.load intrinsics). +// +// During hybrid Regular/ThinLTO, the pass operates in two phases: // - Export phase: this is run during the thin link over a single merged module // that contains all vtables with !type metadata that participate in the link. // The pass computes a resolution for each virtual call and stores it in the @@ -38,6 +40,14 @@ // modules. The pass applies the resolutions previously computed during the // import phase to each eligible virtual call. // +// During ThinLTO, the pass operates in two phases: +// - Export phase: this is run during the thin link over the index which +// contains a summary of all vtables with !type metadata that participate in +// the link. It computes a resolution for each virtual call and stores it in +// the type identifier summary. Only single implementation devirtualization +// is supported. +// - Import phase: (same as with hybrid case above). +// //===----------------------------------------------------------------------===// #include "llvm/Transforms/IPO/WholeProgramDevirt.h" @@ -70,10 +80,12 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/ModuleSummaryIndexYAML.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/PassRegistry.h" #include "llvm/PassSupport.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/MathExtras.h" @@ -117,6 +129,11 @@ static cl::opt<unsigned> cl::desc("Maximum number of call targets per " "call site to enable branch funnels")); +static cl::opt<bool> + PrintSummaryDevirt("wholeprogramdevirt-print-index-based", cl::Hidden, + cl::init(false), cl::ZeroOrMore, + cl::desc("Print index-based devirtualization messages")); + // Find the minimum offset that we may store a value of size Size bits at. If // IsAfter is set, look for an offset before the object, otherwise look for an // offset after the object. @@ -265,6 +282,25 @@ template <> struct DenseMapInfo<VTableSlot> { } }; +template <> struct DenseMapInfo<VTableSlotSummary> { + static VTableSlotSummary getEmptyKey() { + return {DenseMapInfo<StringRef>::getEmptyKey(), + DenseMapInfo<uint64_t>::getEmptyKey()}; + } + static VTableSlotSummary getTombstoneKey() { + return {DenseMapInfo<StringRef>::getTombstoneKey(), + DenseMapInfo<uint64_t>::getTombstoneKey()}; + } + static unsigned getHashValue(const VTableSlotSummary &I) { + return DenseMapInfo<StringRef>::getHashValue(I.TypeID) ^ + DenseMapInfo<uint64_t>::getHashValue(I.ByteOffset); + } + static bool isEqual(const VTableSlotSummary &LHS, + const VTableSlotSummary &RHS) { + return LHS.TypeID == RHS.TypeID && LHS.ByteOffset == RHS.ByteOffset; + } +}; + } // end namespace llvm namespace { @@ -342,19 +378,21 @@ struct CallSiteInfo { /// pass the vector is non-empty, we will need to add a use of llvm.type.test /// to each of the function summaries in the vector. 
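The new DenseMapInfo<VTableSlotSummary> keys call slots by a (type identifier string, byte offset) pair and XORs the two member hashes. An equivalent keying scheme expressed with std::unordered_map, for illustration only:

#include <cstdint>
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>

struct VTableSlotKey {
  std::string TypeID;
  uint64_t ByteOffset;
  bool operator==(const VTableSlotKey &O) const {
    return TypeID == O.TypeID && ByteOffset == O.ByteOffset;
  }
};

struct SlotKeyHash {
  size_t operator()(const VTableSlotKey &K) const {
    // Same shape as the DenseMapInfo added in this patch: combine the two
    // member hashes (the patch also uses XOR).
    return std::hash<std::string>{}(K.TypeID) ^
           std::hash<uint64_t>{}(K.ByteOffset);
  }
};

int main() {
  std::unordered_map<VTableSlotKey, int, SlotKeyHash> CallSlots;
  CallSlots[{"_ZTS4Base", 0}] += 1;
  CallSlots[{"_ZTS4Base", 8}] += 1;
  CallSlots[{"_ZTS4Base", 0}] += 1;
  std::cout << CallSlots.size() << "\n"; // 2 distinct slots
}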
std::vector<FunctionSummary *> SummaryTypeCheckedLoadUsers; + std::vector<FunctionSummary *> SummaryTypeTestAssumeUsers; bool isExported() const { return SummaryHasTypeTestAssumeUsers || !SummaryTypeCheckedLoadUsers.empty(); } - void markSummaryHasTypeTestAssumeUsers() { - SummaryHasTypeTestAssumeUsers = true; + void addSummaryTypeCheckedLoadUser(FunctionSummary *FS) { + SummaryTypeCheckedLoadUsers.push_back(FS); AllCallSitesDevirted = false; } - void addSummaryTypeCheckedLoadUser(FunctionSummary *FS) { - SummaryTypeCheckedLoadUsers.push_back(FS); + void addSummaryTypeTestAssumeUser(FunctionSummary *FS) { + SummaryTypeTestAssumeUsers.push_back(FS); + SummaryHasTypeTestAssumeUsers = true; AllCallSitesDevirted = false; } @@ -450,13 +488,12 @@ struct DevirtModule { bool areRemarksEnabled(); - void scanTypeTestUsers(Function *TypeTestFunc, Function *AssumeFunc); + void scanTypeTestUsers(Function *TypeTestFunc); void scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc); void buildTypeIdentifierMap( std::vector<VTableBits> &Bits, DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap); - Constant *getPointerAtOffset(Constant *I, uint64_t Offset); bool tryFindVirtualCallTargets(std::vector<VirtualCallTarget> &TargetsForSlot, const std::set<TypeMemberInfo> &TypeMemberInfos, @@ -464,7 +501,8 @@ struct DevirtModule { void applySingleImplDevirt(VTableSlotInfo &SlotInfo, Constant *TheFn, bool &IsExported); - bool trySingleImplDevirt(MutableArrayRef<VirtualCallTarget> TargetsForSlot, + bool trySingleImplDevirt(ModuleSummaryIndex *ExportSummary, + MutableArrayRef<VirtualCallTarget> TargetsForSlot, VTableSlotInfo &SlotInfo, WholeProgramDevirtResolution *Res); @@ -542,13 +580,45 @@ struct DevirtModule { function_ref<DominatorTree &(Function &)> LookupDomTree); }; +struct DevirtIndex { + ModuleSummaryIndex &ExportSummary; + // The set in which to record GUIDs exported from their module by + // devirtualization, used by client to ensure they are not internalized. + std::set<GlobalValue::GUID> &ExportedGUIDs; + // A map in which to record the information necessary to locate the WPD + // resolution for local targets in case they are exported by cross module + // importing. + std::map<ValueInfo, std::vector<VTableSlotSummary>> &LocalWPDTargetsMap; + + MapVector<VTableSlotSummary, VTableSlotInfo> CallSlots; + + DevirtIndex( + ModuleSummaryIndex &ExportSummary, + std::set<GlobalValue::GUID> &ExportedGUIDs, + std::map<ValueInfo, std::vector<VTableSlotSummary>> &LocalWPDTargetsMap) + : ExportSummary(ExportSummary), ExportedGUIDs(ExportedGUIDs), + LocalWPDTargetsMap(LocalWPDTargetsMap) {} + + bool tryFindVirtualCallTargets(std::vector<ValueInfo> &TargetsForSlot, + const TypeIdCompatibleVtableInfo TIdInfo, + uint64_t ByteOffset); + + bool trySingleImplDevirt(MutableArrayRef<ValueInfo> TargetsForSlot, + VTableSlotSummary &SlotSummary, + VTableSlotInfo &SlotInfo, + WholeProgramDevirtResolution *Res, + std::set<ValueInfo> &DevirtTargets); + + void run(); +}; + struct WholeProgramDevirt : public ModulePass { static char ID; bool UseCommandLine = false; - ModuleSummaryIndex *ExportSummary; - const ModuleSummaryIndex *ImportSummary; + ModuleSummaryIndex *ExportSummary = nullptr; + const ModuleSummaryIndex *ImportSummary = nullptr; WholeProgramDevirt() : ModulePass(ID), UseCommandLine(true) { initializeWholeProgramDevirtPass(*PassRegistry::getPassRegistry()); @@ -572,7 +642,7 @@ struct WholeProgramDevirt : public ModulePass { // an optimization remark emitter on the fly, when we need it. 
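As the comment above notes, the legacy WPD pass builds its remark emitter lazily through a getter lambda. A stripped-down, non-LLVM version of that pattern:

#include <iostream>
#include <memory>
#include <string>

struct RemarkEmitter {
  explicit RemarkEmitter(std::string Fn) : Fn(std::move(Fn)) {}
  void emit(const std::string &Msg) const {
    std::cout << Fn << ": " << Msg << "\n";
  }
  std::string Fn;
};

int main() {
  std::unique_ptr<RemarkEmitter> ORE;
  // Construct the emitter only when a caller actually asks for it.
  auto OREGetter = [&](const std::string &Fn) -> RemarkEmitter & {
    ORE = std::make_unique<RemarkEmitter>(Fn);
    return *ORE;
  };
  OREGetter("foo").emit("devirtualized a call");
  OREGetter("bar").emit("devirtualized a call");
}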
std::unique_ptr<OptimizationRemarkEmitter> ORE; auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & { - ORE = make_unique<OptimizationRemarkEmitter>(F); + ORE = std::make_unique<OptimizationRemarkEmitter>(F); return *ORE; }; @@ -632,6 +702,41 @@ PreservedAnalyses WholeProgramDevirtPass::run(Module &M, return PreservedAnalyses::none(); } +namespace llvm { +void runWholeProgramDevirtOnIndex( + ModuleSummaryIndex &Summary, std::set<GlobalValue::GUID> &ExportedGUIDs, + std::map<ValueInfo, std::vector<VTableSlotSummary>> &LocalWPDTargetsMap) { + DevirtIndex(Summary, ExportedGUIDs, LocalWPDTargetsMap).run(); +} + +void updateIndexWPDForExports( + ModuleSummaryIndex &Summary, + function_ref<bool(StringRef, ValueInfo)> isExported, + std::map<ValueInfo, std::vector<VTableSlotSummary>> &LocalWPDTargetsMap) { + for (auto &T : LocalWPDTargetsMap) { + auto &VI = T.first; + // This was enforced earlier during trySingleImplDevirt. + assert(VI.getSummaryList().size() == 1 && + "Devirt of local target has more than one copy"); + auto &S = VI.getSummaryList()[0]; + if (!isExported(S->modulePath(), VI)) + continue; + + // It's been exported by a cross module import. + for (auto &SlotSummary : T.second) { + auto *TIdSum = Summary.getTypeIdSummary(SlotSummary.TypeID); + assert(TIdSum); + auto WPDRes = TIdSum->WPDRes.find(SlotSummary.ByteOffset); + assert(WPDRes != TIdSum->WPDRes.end()); + WPDRes->second.SingleImplName = ModuleSummaryIndex::getGlobalNameForLocal( + WPDRes->second.SingleImplName, + Summary.getModuleHash(S->modulePath())); + } + } +} + +} // end namespace llvm + bool DevirtModule::runForTesting( Module &M, function_ref<AAResults &(Function &)> AARGetter, function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter, @@ -662,7 +767,7 @@ bool DevirtModule::runForTesting( ExitOnError ExitOnErr( "-wholeprogramdevirt-write-summary: " + ClWriteSummary + ": "); std::error_code EC; - raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::F_Text); + raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::OF_Text); ExitOnErr(errorCodeToError(EC)); yaml::Output Out(OS); @@ -706,38 +811,6 @@ void DevirtModule::buildTypeIdentifierMap( } } -Constant *DevirtModule::getPointerAtOffset(Constant *I, uint64_t Offset) { - if (I->getType()->isPointerTy()) { - if (Offset == 0) - return I; - return nullptr; - } - - const DataLayout &DL = M.getDataLayout(); - - if (auto *C = dyn_cast<ConstantStruct>(I)) { - const StructLayout *SL = DL.getStructLayout(C->getType()); - if (Offset >= SL->getSizeInBytes()) - return nullptr; - - unsigned Op = SL->getElementContainingOffset(Offset); - return getPointerAtOffset(cast<Constant>(I->getOperand(Op)), - Offset - SL->getElementOffset(Op)); - } - if (auto *C = dyn_cast<ConstantArray>(I)) { - ArrayType *VTableTy = C->getType(); - uint64_t ElemSize = DL.getTypeAllocSize(VTableTy->getElementType()); - - unsigned Op = Offset / ElemSize; - if (Op >= C->getNumOperands()) - return nullptr; - - return getPointerAtOffset(cast<Constant>(I->getOperand(Op)), - Offset % ElemSize); - } - return nullptr; -} - bool DevirtModule::tryFindVirtualCallTargets( std::vector<VirtualCallTarget> &TargetsForSlot, const std::set<TypeMemberInfo> &TypeMemberInfos, uint64_t ByteOffset) { @@ -746,7 +819,7 @@ bool DevirtModule::tryFindVirtualCallTargets( return false; Constant *Ptr = getPointerAtOffset(TM.Bits->GV->getInitializer(), - TM.Offset + ByteOffset); + TM.Offset + ByteOffset, M); if (!Ptr) return false; @@ -766,6 +839,47 @@ bool DevirtModule::tryFindVirtualCallTargets( return !TargetsForSlot.empty(); } +bool 
DevirtIndex::tryFindVirtualCallTargets( + std::vector<ValueInfo> &TargetsForSlot, const TypeIdCompatibleVtableInfo TIdInfo, + uint64_t ByteOffset) { + for (const TypeIdOffsetVtableInfo &P : TIdInfo) { + // Find the first non-available_externally linkage vtable initializer. + // We can have multiple available_externally, linkonce_odr and weak_odr + // vtable initializers, however we want to skip available_externally as they + // do not have type metadata attached, and therefore the summary will not + // contain any vtable functions. We can also have multiple external + // vtable initializers in the case of comdats, which we cannot check here. + // The linker should give an error in this case. + // + // Also, handle the case of same-named local Vtables with the same path + // and therefore the same GUID. This can happen if there isn't enough + // distinguishing path when compiling the source file. In that case we + // conservatively return false early. + const GlobalVarSummary *VS = nullptr; + bool LocalFound = false; + for (auto &S : P.VTableVI.getSummaryList()) { + if (GlobalValue::isLocalLinkage(S->linkage())) { + if (LocalFound) + return false; + LocalFound = true; + } + if (!GlobalValue::isAvailableExternallyLinkage(S->linkage())) + VS = cast<GlobalVarSummary>(S->getBaseObject()); + } + if (!VS->isLive()) + continue; + for (auto VTP : VS->vTableFuncs()) { + if (VTP.VTableOffset != P.AddressPointOffset + ByteOffset) + continue; + + TargetsForSlot.push_back(VTP.FuncVI); + } + } + + // Give up if we couldn't find any targets. + return !TargetsForSlot.empty(); +} + void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo, Constant *TheFn, bool &IsExported) { auto Apply = [&](CallSiteInfo &CSInfo) { @@ -788,9 +902,38 @@ void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo, Apply(P.second); } +static bool AddCalls(VTableSlotInfo &SlotInfo, const ValueInfo &Callee) { + // We can't add calls if we haven't seen a definition + if (Callee.getSummaryList().empty()) + return false; + + // Insert calls into the summary index so that the devirtualized targets + // are eligible for import. + // FIXME: Annotate type tests with hotness. For now, mark these as hot + // to better ensure we have the opportunity to inline them. + bool IsExported = false; + auto &S = Callee.getSummaryList()[0]; + CalleeInfo CI(CalleeInfo::HotnessType::Hot, /* RelBF = */ 0); + auto AddCalls = [&](CallSiteInfo &CSInfo) { + for (auto *FS : CSInfo.SummaryTypeCheckedLoadUsers) { + FS->addCall({Callee, CI}); + IsExported |= S->modulePath() != FS->modulePath(); + } + for (auto *FS : CSInfo.SummaryTypeTestAssumeUsers) { + FS->addCall({Callee, CI}); + IsExported |= S->modulePath() != FS->modulePath(); + } + }; + AddCalls(SlotInfo.CSInfo); + for (auto &P : SlotInfo.ConstCSInfo) + AddCalls(P.second); + return IsExported; +} + bool DevirtModule::trySingleImplDevirt( - MutableArrayRef<VirtualCallTarget> TargetsForSlot, - VTableSlotInfo &SlotInfo, WholeProgramDevirtResolution *Res) { + ModuleSummaryIndex *ExportSummary, + MutableArrayRef<VirtualCallTarget> TargetsForSlot, VTableSlotInfo &SlotInfo, + WholeProgramDevirtResolution *Res) { // See if the program contains a single implementation of this virtual // function. 
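Both trySingleImplDevirt overloads begin with the check described by the comment above: the slot can be devirtualized to a direct call only if every recorded target resolves to one and the same implementation. Spelled out on plain strings as a sketch:

#include <iostream>
#include <string>
#include <vector>

// True when every possible target of a vtable slot is the same function.
static bool hasSingleImplementation(const std::vector<std::string> &Targets) {
  if (Targets.empty())
    return false;
  for (const auto &T : Targets)
    if (T != Targets.front())
      return false;
  return true;
}

int main() {
  std::cout << hasSingleImplementation({"Derived::foo", "Derived::foo"}) << "\n"; // 1
  std::cout << hasSingleImplementation({"A::foo", "B::foo"}) << "\n";             // 0
}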
Function *TheFn = TargetsForSlot[0].Fn; @@ -830,6 +973,10 @@ bool DevirtModule::trySingleImplDevirt( TheFn->setVisibility(GlobalValue::HiddenVisibility); TheFn->setName(NewName); } + if (ValueInfo TheFnVI = ExportSummary->getValueInfo(TheFn->getGUID())) + // Any needed promotion of 'TheFn' has already been done during + // LTO unit split, so we can ignore return value of AddCalls. + AddCalls(SlotInfo, TheFnVI); Res->TheKind = WholeProgramDevirtResolution::SingleImpl; Res->SingleImplName = TheFn->getName(); @@ -837,6 +984,63 @@ bool DevirtModule::trySingleImplDevirt( return true; } +bool DevirtIndex::trySingleImplDevirt(MutableArrayRef<ValueInfo> TargetsForSlot, + VTableSlotSummary &SlotSummary, + VTableSlotInfo &SlotInfo, + WholeProgramDevirtResolution *Res, + std::set<ValueInfo> &DevirtTargets) { + // See if the program contains a single implementation of this virtual + // function. + auto TheFn = TargetsForSlot[0]; + for (auto &&Target : TargetsForSlot) + if (TheFn != Target) + return false; + + // Don't devirtualize if we don't have target definition. + auto Size = TheFn.getSummaryList().size(); + if (!Size) + return false; + + // If the summary list contains multiple summaries where at least one is + // a local, give up, as we won't know which (possibly promoted) name to use. + for (auto &S : TheFn.getSummaryList()) + if (GlobalValue::isLocalLinkage(S->linkage()) && Size > 1) + return false; + + // Collect functions devirtualized at least for one call site for stats. + if (PrintSummaryDevirt) + DevirtTargets.insert(TheFn); + + auto &S = TheFn.getSummaryList()[0]; + bool IsExported = AddCalls(SlotInfo, TheFn); + if (IsExported) + ExportedGUIDs.insert(TheFn.getGUID()); + + // Record in summary for use in devirtualization during the ThinLTO import + // step. + Res->TheKind = WholeProgramDevirtResolution::SingleImpl; + if (GlobalValue::isLocalLinkage(S->linkage())) { + if (IsExported) + // If target is a local function and we are exporting it by + // devirtualizing a call in another module, we need to record the + // promoted name. + Res->SingleImplName = ModuleSummaryIndex::getGlobalNameForLocal( + TheFn.name(), ExportSummary.getModuleHash(S->modulePath())); + else { + LocalWPDTargetsMap[TheFn].push_back(SlotSummary); + Res->SingleImplName = TheFn.name(); + } + } else + Res->SingleImplName = TheFn.name(); + + // Name will be empty if this thin link driven off of serialized combined + // index (e.g. llvm-lto). However, WPD is not supported/invoked for the + // legacy LTO API anyway. + assert(!Res->SingleImplName.empty()); + + return true; +} + void DevirtModule::tryICallBranchFunnel( MutableArrayRef<VirtualCallTarget> TargetsForSlot, VTableSlotInfo &SlotInfo, WholeProgramDevirtResolution *Res, VTableSlot Slot) { @@ -1044,8 +1248,7 @@ std::string DevirtModule::getGlobalName(VTableSlot Slot, bool DevirtModule::shouldExportConstantsAsAbsoluteSymbols() { Triple T(M.getTargetTriple()); - return (T.getArch() == Triple::x86 || T.getArch() == Triple::x86_64) && - T.getObjectFormat() == Triple::ELF; + return T.isX86() && T.getObjectFormat() == Triple::ELF; } void DevirtModule::exportGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args, @@ -1302,10 +1505,13 @@ void DevirtModule::rebuildGlobal(VTableBits &B) { if (B.Before.Bytes.empty() && B.After.Bytes.empty()) return; - // Align each byte array to pointer width. 
- unsigned PointerSize = M.getDataLayout().getPointerSize(); - B.Before.Bytes.resize(alignTo(B.Before.Bytes.size(), PointerSize)); - B.After.Bytes.resize(alignTo(B.After.Bytes.size(), PointerSize)); + // Align the before byte array to the global's minimum alignment so that we + // don't break any alignment requirements on the global. + MaybeAlign Alignment(B.GV->getAlignment()); + if (!Alignment) + Alignment = + Align(M.getDataLayout().getABITypeAlignment(B.GV->getValueType())); + B.Before.Bytes.resize(alignTo(B.Before.Bytes.size(), Alignment)); // Before was stored in reverse order; flip it now. for (size_t I = 0, Size = B.Before.Bytes.size(); I != Size / 2; ++I) @@ -1322,6 +1528,7 @@ void DevirtModule::rebuildGlobal(VTableBits &B) { GlobalVariable::PrivateLinkage, NewInit, "", B.GV); NewGV->setSection(B.GV->getSection()); NewGV->setComdat(B.GV->getComdat()); + NewGV->setAlignment(MaybeAlign(B.GV->getAlignment())); // Copy the original vtable's metadata to the anonymous global, adjusting // offsets as required. @@ -1355,8 +1562,7 @@ bool DevirtModule::areRemarksEnabled() { return false; } -void DevirtModule::scanTypeTestUsers(Function *TypeTestFunc, - Function *AssumeFunc) { +void DevirtModule::scanTypeTestUsers(Function *TypeTestFunc) { // Find all virtual calls via a virtual table pointer %p under an assumption // of the form llvm.assume(llvm.type.test(%p, %md)). This indicates that %p // points to a member of the type identifier %md. Group calls by (type ID, @@ -1483,8 +1689,11 @@ void DevirtModule::scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc) { } void DevirtModule::importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo) { + auto *TypeId = dyn_cast<MDString>(Slot.TypeID); + if (!TypeId) + return; const TypeIdSummary *TidSummary = - ImportSummary->getTypeIdSummary(cast<MDString>(Slot.TypeID)->getString()); + ImportSummary->getTypeIdSummary(TypeId->getString()); if (!TidSummary) return; auto ResI = TidSummary->WPDRes.find(Slot.ByteOffset); @@ -1493,6 +1702,7 @@ void DevirtModule::importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo) { const WholeProgramDevirtResolution &Res = ResI->second; if (Res.TheKind == WholeProgramDevirtResolution::SingleImpl) { + assert(!Res.SingleImplName.empty()); // The type of the function in the declaration is irrelevant because every // call site will cast it to the correct type. Constant *SingleImpl = @@ -1587,7 +1797,7 @@ bool DevirtModule::run() { return false; if (TypeTestFunc && AssumeFunc) - scanTypeTestUsers(TypeTestFunc, AssumeFunc); + scanTypeTestUsers(TypeTestFunc); if (TypeCheckedLoadFunc) scanTypeCheckedLoadUsers(TypeCheckedLoadFunc); @@ -1627,8 +1837,7 @@ bool DevirtModule::run() { // FIXME: Only add live functions. 
for (FunctionSummary::VFuncId VF : FS->type_test_assume_vcalls()) { for (Metadata *MD : MetadataByGUID[VF.GUID]) { - CallSlots[{MD, VF.Offset}] - .CSInfo.markSummaryHasTypeTestAssumeUsers(); + CallSlots[{MD, VF.Offset}].CSInfo.addSummaryTypeTestAssumeUser(FS); } } for (FunctionSummary::VFuncId VF : FS->type_checked_load_vcalls()) { @@ -1641,7 +1850,7 @@ bool DevirtModule::run() { for (Metadata *MD : MetadataByGUID[VC.VFunc.GUID]) { CallSlots[{MD, VC.VFunc.Offset}] .ConstCSInfo[VC.Args] - .markSummaryHasTypeTestAssumeUsers(); + .addSummaryTypeTestAssumeUser(FS); } } for (const FunctionSummary::ConstVCall &VC : @@ -1673,7 +1882,7 @@ bool DevirtModule::run() { cast<MDString>(S.first.TypeID)->getString()) .WPDRes[S.first.ByteOffset]; - if (!trySingleImplDevirt(TargetsForSlot, S.second, Res)) { + if (!trySingleImplDevirt(ExportSummary, TargetsForSlot, S.second, Res)) { DidVirtualConstProp |= tryVirtualConstProp(TargetsForSlot, S.second, Res, S.first); @@ -1710,7 +1919,7 @@ bool DevirtModule::run() { using namespace ore; OREGetter(F).emit(OptimizationRemark(DEBUG_TYPE, "Devirtualized", F) << "devirtualized " - << NV("FunctionName", F->getName())); + << NV("FunctionName", DT.first)); } } @@ -1722,5 +1931,86 @@ bool DevirtModule::run() { for (VTableBits &B : Bits) rebuildGlobal(B); + // We have lowered or deleted the type checked load intrinsics, so we no + // longer have enough information to reason about the liveness of virtual + // function pointers in GlobalDCE. + for (GlobalVariable &GV : M.globals()) + GV.eraseMetadata(LLVMContext::MD_vcall_visibility); + return true; } + +void DevirtIndex::run() { + if (ExportSummary.typeIdCompatibleVtableMap().empty()) + return; + + DenseMap<GlobalValue::GUID, std::vector<StringRef>> NameByGUID; + for (auto &P : ExportSummary.typeIdCompatibleVtableMap()) { + NameByGUID[GlobalValue::getGUID(P.first)].push_back(P.first); + } + + // Collect information from summary about which calls to try to devirtualize. + for (auto &P : ExportSummary) { + for (auto &S : P.second.SummaryList) { + auto *FS = dyn_cast<FunctionSummary>(S.get()); + if (!FS) + continue; + // FIXME: Only add live functions. + for (FunctionSummary::VFuncId VF : FS->type_test_assume_vcalls()) { + for (StringRef Name : NameByGUID[VF.GUID]) { + CallSlots[{Name, VF.Offset}].CSInfo.addSummaryTypeTestAssumeUser(FS); + } + } + for (FunctionSummary::VFuncId VF : FS->type_checked_load_vcalls()) { + for (StringRef Name : NameByGUID[VF.GUID]) { + CallSlots[{Name, VF.Offset}].CSInfo.addSummaryTypeCheckedLoadUser(FS); + } + } + for (const FunctionSummary::ConstVCall &VC : + FS->type_test_assume_const_vcalls()) { + for (StringRef Name : NameByGUID[VC.VFunc.GUID]) { + CallSlots[{Name, VC.VFunc.Offset}] + .ConstCSInfo[VC.Args] + .addSummaryTypeTestAssumeUser(FS); + } + } + for (const FunctionSummary::ConstVCall &VC : + FS->type_checked_load_const_vcalls()) { + for (StringRef Name : NameByGUID[VC.VFunc.GUID]) { + CallSlots[{Name, VC.VFunc.Offset}] + .ConstCSInfo[VC.Args] + .addSummaryTypeCheckedLoadUser(FS); + } + } + } + } + + std::set<ValueInfo> DevirtTargets; + // For each (type, offset) pair: + for (auto &S : CallSlots) { + // Search each of the members of the type identifier for the virtual + // function implementation at offset S.first.ByteOffset, and add to + // TargetsForSlot. 
+ std::vector<ValueInfo> TargetsForSlot; + auto TidSummary = ExportSummary.getTypeIdCompatibleVtableSummary(S.first.TypeID); + assert(TidSummary); + if (tryFindVirtualCallTargets(TargetsForSlot, *TidSummary, + S.first.ByteOffset)) { + WholeProgramDevirtResolution *Res = + &ExportSummary.getOrInsertTypeIdSummary(S.first.TypeID) + .WPDRes[S.first.ByteOffset]; + + if (!trySingleImplDevirt(TargetsForSlot, S.first, S.second, Res, + DevirtTargets)) + continue; + } + } + + // Optionally have the thin link print message for each devirtualized + // function. + if (PrintSummaryDevirt) + for (const auto &DT : DevirtTargets) + errs() << "Devirtualized call to " << DT << "\n"; + + return; +} diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index ba15b023f2a3..ec976a971e3c 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -890,6 +890,10 @@ Instruction *InstCombiner::foldAddWithConstant(BinaryOperator &Add) { if (match(Op0, m_ZExt(m_Value(X))) && X->getType()->getScalarSizeInBits() == 1) return SelectInst::Create(X, AddOne(Op1C), Op1); + // sext(bool) + C -> bool ? C - 1 : C + if (match(Op0, m_SExt(m_Value(X))) && + X->getType()->getScalarSizeInBits() == 1) + return SelectInst::Create(X, SubOne(Op1C), Op1); // ~X + C --> (C-1) - X if (match(Op0, m_Not(m_Value(X)))) @@ -1097,6 +1101,107 @@ static Instruction *foldToUnsignedSaturatedAdd(BinaryOperator &I) { return nullptr; } +Instruction * +InstCombiner::canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract( + BinaryOperator &I) { + assert((I.getOpcode() == Instruction::Add || + I.getOpcode() == Instruction::Or || + I.getOpcode() == Instruction::Sub) && + "Expecting add/or/sub instruction"); + + // We have a subtraction/addition between a (potentially truncated) *logical* + // right-shift of X and a "select". + Value *X, *Select; + Instruction *LowBitsToSkip, *Extract; + if (!match(&I, m_c_BinOp(m_TruncOrSelf(m_CombineAnd( + m_LShr(m_Value(X), m_Instruction(LowBitsToSkip)), + m_Instruction(Extract))), + m_Value(Select)))) + return nullptr; + + // `add`/`or` is commutative; but for `sub`, "select" *must* be on RHS. + if (I.getOpcode() == Instruction::Sub && I.getOperand(1) != Select) + return nullptr; + + Type *XTy = X->getType(); + bool HadTrunc = I.getType() != XTy; + + // If there was a truncation of extracted value, then we'll need to produce + // one extra instruction, so we need to ensure one instruction will go away. + if (HadTrunc && !match(&I, m_c_BinOp(m_OneUse(m_Value()), m_Value()))) + return nullptr; + + // Extraction should extract high NBits bits, with shift amount calculated as: + // low bits to skip = shift bitwidth - high bits to extract + // The shift amount itself may be extended, and we need to look past zero-ext + // when matching NBits, that will matter for matching later. + Constant *C; + Value *NBits; + if (!match( + LowBitsToSkip, + m_ZExtOrSelf(m_Sub(m_Constant(C), m_ZExtOrSelf(m_Value(NBits))))) || + !match(C, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, + APInt(C->getType()->getScalarSizeInBits(), + X->getType()->getScalarSizeInBits())))) + return nullptr; + + // Sign-extending value can be zero-extended if we `sub`tract it, + // or sign-extended otherwise. 
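The InstCombineAddSub hunk above adds the fold "sext(bool) + C -> bool ? C - 1 : C", which is sound because sign-extending an i1 value of 1 yields -1. A quick standalone check of the equivalence:

#include <cstdint>
#include <iostream>

static int32_t sextBoolPlusC(bool B, int32_t C) {
  int32_t Ext = B ? -1 : 0; // sext i1 -> i32
  return Ext + C;
}

static int32_t folded(bool B, int32_t C) {
  return B ? C - 1 : C; // the select form InstCombine emits instead
}

int main() {
  for (bool B : {false, true})
    for (int32_t C : {0, 7, -5})
      std::cout << (sextBoolPlusC(B, C) == folded(B, C)) << " "; // all 1s
  std::cout << "\n";
}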
+ auto SkipExtInMagic = [&I](Value *&V) { + if (I.getOpcode() == Instruction::Sub) + match(V, m_ZExtOrSelf(m_Value(V))); + else + match(V, m_SExtOrSelf(m_Value(V))); + }; + + // Now, finally validate the sign-extending magic. + // `select` itself may be appropriately extended, look past that. + SkipExtInMagic(Select); + + ICmpInst::Predicate Pred; + const APInt *Thr; + Value *SignExtendingValue, *Zero; + bool ShouldSignext; + // It must be a select between two values we will later establish to be a + // sign-extending value and a zero constant. The condition guarding the + // sign-extension must be based on a sign bit of the same X we had in `lshr`. + if (!match(Select, m_Select(m_ICmp(Pred, m_Specific(X), m_APInt(Thr)), + m_Value(SignExtendingValue), m_Value(Zero))) || + !isSignBitCheck(Pred, *Thr, ShouldSignext)) + return nullptr; + + // icmp-select pair is commutative. + if (!ShouldSignext) + std::swap(SignExtendingValue, Zero); + + // If we should not perform sign-extension then we must add/or/subtract zero. + if (!match(Zero, m_Zero())) + return nullptr; + // Otherwise, it should be some constant, left-shifted by the same NBits we + // had in `lshr`. Said left-shift can also be appropriately extended. + // Again, we must look past zero-ext when looking for NBits. + SkipExtInMagic(SignExtendingValue); + Constant *SignExtendingValueBaseConstant; + if (!match(SignExtendingValue, + m_Shl(m_Constant(SignExtendingValueBaseConstant), + m_ZExtOrSelf(m_Specific(NBits))))) + return nullptr; + // If we `sub`, then the constant should be one, else it should be all-ones. + if (I.getOpcode() == Instruction::Sub + ? !match(SignExtendingValueBaseConstant, m_One()) + : !match(SignExtendingValueBaseConstant, m_AllOnes())) + return nullptr; + + auto *NewAShr = BinaryOperator::CreateAShr(X, LowBitsToSkip, + Extract->getName() + ".sext"); + NewAShr->copyIRFlags(Extract); // Preserve `exact`-ness. + if (!HadTrunc) + return NewAShr; + + Builder.Insert(NewAShr); + return TruncInst::CreateTruncOrBitCast(NewAShr, I.getType()); +} + Instruction *InstCombiner::visitAdd(BinaryOperator &I) { if (Value *V = SimplifyAddInst(I.getOperand(0), I.getOperand(1), I.hasNoSignedWrap(), I.hasNoUnsignedWrap(), @@ -1187,12 +1292,6 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { return BinaryOperator::CreateSub(RHS, A); } - // Canonicalize sext to zext for better value tracking potential. - // add A, sext(B) --> sub A, zext(B) - if (match(&I, m_c_Add(m_Value(A), m_OneUse(m_SExt(m_Value(B))))) && - B->getType()->isIntOrIntVectorTy(1)) - return BinaryOperator::CreateSub(A, Builder.CreateZExt(B, Ty)); - // A + -B --> A - B if (match(RHS, m_Neg(m_Value(B)))) return BinaryOperator::CreateSub(LHS, B); @@ -1302,12 +1401,32 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { if (Instruction *V = canonicalizeLowbitMask(I, Builder)) return V; + if (Instruction *V = + canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(I)) + return V; + if (Instruction *SatAdd = foldToUnsignedSaturatedAdd(I)) return SatAdd; return Changed ? &I : nullptr; } +/// Eliminate an op from a linear interpolation (lerp) pattern. 
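(A note on the algebra behind the factorizeLerp helper that follows: Y*(1 - Z) + X*Z rearranges to Y + Z*(X - Y). The two forms round differently in general, which is why this FP factorization is only attempted under the fast-math flags asserted in factorizeFAddFSub below. Illustrative sketch, not from the patch:)

#include <cassert>
#include <cmath>

double lerpExpanded(double X, double Y, double Z) { return Y * (1.0 - Z) + X * Z; }
double lerpFactored(double X, double Y, double Z) { return Y + Z * (X - Y); }

int main() {
  // Algebraically identical; with finite precision they only agree up to
  // rounding, hence the reassociation FMF requirement in the pass.
  for (double Z = 0.0; Z <= 1.0; Z += 0.125)
    assert(std::fabs(lerpExpanded(3.5, -2.0, Z) - lerpFactored(3.5, -2.0, Z)) < 1e-12);
}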
+static Instruction *factorizeLerp(BinaryOperator &I, + InstCombiner::BuilderTy &Builder) { + Value *X, *Y, *Z; + if (!match(&I, m_c_FAdd(m_OneUse(m_c_FMul(m_Value(Y), + m_OneUse(m_FSub(m_FPOne(), + m_Value(Z))))), + m_OneUse(m_c_FMul(m_Value(X), m_Deferred(Z)))))) + return nullptr; + + // (Y * (1.0 - Z)) + (X * Z) --> Y + Z * (X - Y) [8 commuted variants] + Value *XY = Builder.CreateFSubFMF(X, Y, &I); + Value *MulZ = Builder.CreateFMulFMF(Z, XY, &I); + return BinaryOperator::CreateFAddFMF(Y, MulZ, &I); +} + /// Factor a common operand out of fadd/fsub of fmul/fdiv. static Instruction *factorizeFAddFSub(BinaryOperator &I, InstCombiner::BuilderTy &Builder) { @@ -1315,6 +1434,10 @@ static Instruction *factorizeFAddFSub(BinaryOperator &I, I.getOpcode() == Instruction::FSub) && "Expecting fadd/fsub"); assert(I.hasAllowReassoc() && I.hasNoSignedZeros() && "FP factorization requires FMF"); + + if (Instruction *Lerp = factorizeLerp(I, Builder)) + return Lerp; + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); Value *X, *Y, *Z; bool IsFMul; @@ -1362,17 +1485,32 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { if (Instruction *FoldedFAdd = foldBinOpIntoSelectOrPhi(I)) return FoldedFAdd; - Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); - Value *X; // (-X) + Y --> Y - X - if (match(LHS, m_FNeg(m_Value(X)))) - return BinaryOperator::CreateFSubFMF(RHS, X, &I); - // Y + (-X) --> Y - X - if (match(RHS, m_FNeg(m_Value(X)))) - return BinaryOperator::CreateFSubFMF(LHS, X, &I); + Value *X, *Y; + if (match(&I, m_c_FAdd(m_FNeg(m_Value(X)), m_Value(Y)))) + return BinaryOperator::CreateFSubFMF(Y, X, &I); + + // Similar to above, but look through fmul/fdiv for the negated term. + // (-X * Y) + Z --> Z - (X * Y) [4 commuted variants] + Value *Z; + if (match(&I, m_c_FAdd(m_OneUse(m_c_FMul(m_FNeg(m_Value(X)), m_Value(Y))), + m_Value(Z)))) { + Value *XY = Builder.CreateFMulFMF(X, Y, &I); + return BinaryOperator::CreateFSubFMF(Z, XY, &I); + } + // (-X / Y) + Z --> Z - (X / Y) [2 commuted variants] + // (X / -Y) + Z --> Z - (X / Y) [2 commuted variants] + if (match(&I, m_c_FAdd(m_OneUse(m_FDiv(m_FNeg(m_Value(X)), m_Value(Y))), + m_Value(Z))) || + match(&I, m_c_FAdd(m_OneUse(m_FDiv(m_Value(X), m_FNeg(m_Value(Y)))), + m_Value(Z)))) { + Value *XY = Builder.CreateFDivFMF(X, Y, &I); + return BinaryOperator::CreateFSubFMF(Z, XY, &I); + } // Check for (fadd double (sitofp x), y), see if we can merge this into an // integer add followed by a promotion. + Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); if (SIToFPInst *LHSConv = dyn_cast<SIToFPInst>(LHS)) { Value *LHSIntVal = LHSConv->getOperand(0); Type *FPType = LHSConv->getType(); @@ -1447,7 +1585,7 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { /// &A[10] - &A[0]: we should compile this to "10". LHS/RHS are the pointer /// operands to the ptrtoint instructions for the LHS/RHS of the subtract. Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS, - Type *Ty) { + Type *Ty, bool IsNUW) { // If LHS is a gep based on RHS or RHS is a gep based on LHS, we can optimize // this. bool Swapped = false; @@ -1515,6 +1653,15 @@ Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS, // Emit the offset of the GEP and an intptr_t. Value *Result = EmitGEPOffset(GEP1); + // If this is a single inbounds GEP and the original sub was nuw, + // then the final multiplication is also nuw. We match an extra add zero + // here, because that's what EmitGEPOffset() generates. 
+ Instruction *I; + if (IsNUW && !GEP2 && !Swapped && GEP1->isInBounds() && + match(Result, m_Add(m_Instruction(I), m_Zero())) && + I->getOpcode() == Instruction::Mul) + I->setHasNoUnsignedWrap(); + // If we had a constant expression GEP on the other side offsetting the // pointer, subtract it from the offset we have. if (GEP2) { @@ -1631,37 +1778,50 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { const APInt *Op0C; if (match(Op0, m_APInt(Op0C))) { - unsigned BitWidth = I.getType()->getScalarSizeInBits(); - // -(X >>u 31) -> (X >>s 31) - // -(X >>s 31) -> (X >>u 31) if (Op0C->isNullValue()) { + Value *Op1Wide; + match(Op1, m_TruncOrSelf(m_Value(Op1Wide))); + bool HadTrunc = Op1Wide != Op1; + bool NoTruncOrTruncIsOneUse = !HadTrunc || Op1->hasOneUse(); + unsigned BitWidth = Op1Wide->getType()->getScalarSizeInBits(); + Value *X; const APInt *ShAmt; - if (match(Op1, m_LShr(m_Value(X), m_APInt(ShAmt))) && + // -(X >>u 31) -> (X >>s 31) + if (NoTruncOrTruncIsOneUse && + match(Op1Wide, m_LShr(m_Value(X), m_APInt(ShAmt))) && *ShAmt == BitWidth - 1) { - Value *ShAmtOp = cast<Instruction>(Op1)->getOperand(1); - return BinaryOperator::CreateAShr(X, ShAmtOp); + Value *ShAmtOp = cast<Instruction>(Op1Wide)->getOperand(1); + Instruction *NewShift = BinaryOperator::CreateAShr(X, ShAmtOp); + NewShift->copyIRFlags(Op1Wide); + if (!HadTrunc) + return NewShift; + Builder.Insert(NewShift); + return TruncInst::CreateTruncOrBitCast(NewShift, Op1->getType()); } - if (match(Op1, m_AShr(m_Value(X), m_APInt(ShAmt))) && + // -(X >>s 31) -> (X >>u 31) + if (NoTruncOrTruncIsOneUse && + match(Op1Wide, m_AShr(m_Value(X), m_APInt(ShAmt))) && *ShAmt == BitWidth - 1) { - Value *ShAmtOp = cast<Instruction>(Op1)->getOperand(1); - return BinaryOperator::CreateLShr(X, ShAmtOp); + Value *ShAmtOp = cast<Instruction>(Op1Wide)->getOperand(1); + Instruction *NewShift = BinaryOperator::CreateLShr(X, ShAmtOp); + NewShift->copyIRFlags(Op1Wide); + if (!HadTrunc) + return NewShift; + Builder.Insert(NewShift); + return TruncInst::CreateTruncOrBitCast(NewShift, Op1->getType()); } - if (Op1->hasOneUse()) { + if (!HadTrunc && Op1->hasOneUse()) { Value *LHS, *RHS; SelectPatternFlavor SPF = matchSelectPattern(Op1, LHS, RHS).Flavor; if (SPF == SPF_ABS || SPF == SPF_NABS) { // This is a negate of an ABS/NABS pattern. Just swap the operands // of the select. - SelectInst *SI = cast<SelectInst>(Op1); - Value *TrueVal = SI->getTrueValue(); - Value *FalseVal = SI->getFalseValue(); - SI->setTrueValue(FalseVal); - SI->setFalseValue(TrueVal); + cast<SelectInst>(Op1)->swapValues(); // Don't swap prof metadata, we didn't change the branch behavior. 
- return replaceInstUsesWith(I, SI); + return replaceInstUsesWith(I, Op1); } } } @@ -1686,6 +1846,23 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { return BinaryOperator::CreateNeg(Y); } + // (sub (or A, B) (and A, B)) --> (xor A, B) + { + Value *A, *B; + if (match(Op1, m_And(m_Value(A), m_Value(B))) && + match(Op0, m_c_Or(m_Specific(A), m_Specific(B)))) + return BinaryOperator::CreateXor(A, B); + } + + // (sub (and A, B) (or A, B)) --> neg (xor A, B) + { + Value *A, *B; + if (match(Op0, m_And(m_Value(A), m_Value(B))) && + match(Op1, m_c_Or(m_Specific(A), m_Specific(B))) && + (Op0->hasOneUse() || Op1->hasOneUse())) + return BinaryOperator::CreateNeg(Builder.CreateXor(A, B)); + } + // (sub (or A, B), (xor A, B)) --> (and A, B) { Value *A, *B; @@ -1694,6 +1871,15 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { return BinaryOperator::CreateAnd(A, B); } + // (sub (xor A, B) (or A, B)) --> neg (and A, B) + { + Value *A, *B; + if (match(Op0, m_Xor(m_Value(A), m_Value(B))) && + match(Op1, m_c_Or(m_Specific(A), m_Specific(B))) && + (Op0->hasOneUse() || Op1->hasOneUse())) + return BinaryOperator::CreateNeg(Builder.CreateAnd(A, B)); + } + { Value *Y; // ((X | Y) - X) --> (~X & Y) @@ -1702,6 +1888,74 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { Y, Builder.CreateNot(Op1, Op1->getName() + ".not")); } + { + // (sub (and Op1, (neg X)), Op1) --> neg (and Op1, (add X, -1)) + Value *X; + if (match(Op0, m_OneUse(m_c_And(m_Specific(Op1), + m_OneUse(m_Neg(m_Value(X))))))) { + return BinaryOperator::CreateNeg(Builder.CreateAnd( + Op1, Builder.CreateAdd(X, Constant::getAllOnesValue(I.getType())))); + } + } + + { + // (sub (and Op1, C), Op1) --> neg (and Op1, ~C) + Constant *C; + if (match(Op0, m_OneUse(m_And(m_Specific(Op1), m_Constant(C))))) { + return BinaryOperator::CreateNeg( + Builder.CreateAnd(Op1, Builder.CreateNot(C))); + } + } + + { + // If we have a subtraction between some value and a select between + // said value and something else, sink subtraction into select hands, i.e.: + // sub (select %Cond, %TrueVal, %FalseVal), %Op1 + // -> + // select %Cond, (sub %TrueVal, %Op1), (sub %FalseVal, %Op1) + // or + // sub %Op0, (select %Cond, %TrueVal, %FalseVal) + // -> + // select %Cond, (sub %Op0, %TrueVal), (sub %Op0, %FalseVal) + // This will result in select between new subtraction and 0. + auto SinkSubIntoSelect = + [Ty = I.getType()](Value *Select, Value *OtherHandOfSub, + auto SubBuilder) -> Instruction * { + Value *Cond, *TrueVal, *FalseVal; + if (!match(Select, m_OneUse(m_Select(m_Value(Cond), m_Value(TrueVal), + m_Value(FalseVal))))) + return nullptr; + if (OtherHandOfSub != TrueVal && OtherHandOfSub != FalseVal) + return nullptr; + // While it is really tempting to just create two subtractions and let + // InstCombine fold one of those to 0, it isn't possible to do so + // because of worklist visitation order. So ugly it is. + bool OtherHandOfSubIsTrueVal = OtherHandOfSub == TrueVal; + Value *NewSub = SubBuilder(OtherHandOfSubIsTrueVal ? FalseVal : TrueVal); + Constant *Zero = Constant::getNullValue(Ty); + SelectInst *NewSel = + SelectInst::Create(Cond, OtherHandOfSubIsTrueVal ? Zero : NewSub, + OtherHandOfSubIsTrueVal ? NewSub : Zero); + // Preserve prof metadata if any. 
+ NewSel->copyMetadata(cast<Instruction>(*Select)); + return NewSel; + }; + if (Instruction *NewSel = SinkSubIntoSelect( + /*Select=*/Op0, /*OtherHandOfSub=*/Op1, + [Builder = &Builder, Op1](Value *OtherHandOfSelect) { + return Builder->CreateSub(OtherHandOfSelect, + /*OtherHandOfSub=*/Op1); + })) + return NewSel; + if (Instruction *NewSel = SinkSubIntoSelect( + /*Select=*/Op1, /*OtherHandOfSub=*/Op0, + [Builder = &Builder, Op0](Value *OtherHandOfSelect) { + return Builder->CreateSub(/*OtherHandOfSub=*/Op0, + OtherHandOfSelect); + })) + return NewSel; + } + if (Op1->hasOneUse()) { Value *X = nullptr, *Y = nullptr, *Z = nullptr; Constant *C = nullptr; @@ -1717,14 +1971,16 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { Builder.CreateNot(Y, Y->getName() + ".not")); // 0 - (X sdiv C) -> (X sdiv -C) provided the negation doesn't overflow. - // TODO: This could be extended to match arbitrary vector constants. - const APInt *DivC; - if (match(Op0, m_Zero()) && match(Op1, m_SDiv(m_Value(X), m_APInt(DivC))) && - !DivC->isMinSignedValue() && *DivC != 1) { - Constant *NegDivC = ConstantInt::get(I.getType(), -(*DivC)); - Instruction *BO = BinaryOperator::CreateSDiv(X, NegDivC); - BO->setIsExact(cast<BinaryOperator>(Op1)->isExact()); - return BO; + if (match(Op0, m_Zero())) { + Constant *Op11C; + if (match(Op1, m_SDiv(m_Value(X), m_Constant(Op11C))) && + !Op11C->containsUndefElement() && Op11C->isNotMinSignedValue() && + Op11C->isNotOneValue()) { + Instruction *BO = + BinaryOperator::CreateSDiv(X, ConstantExpr::getNeg(Op11C)); + BO->setIsExact(cast<BinaryOperator>(Op1)->isExact()); + return BO; + } } // 0 - (X << Y) -> (-X << Y) when X is freely negatable. @@ -1742,6 +1998,14 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { Add->setHasNoSignedWrap(I.hasNoSignedWrap()); return Add; } + // sub [nsw] X, zext(bool Y) -> add [nsw] X, sext(bool Y) + // 'nuw' is dropped in favor of the canonical form. + if (match(Op1, m_ZExt(m_Value(Y))) && Y->getType()->isIntOrIntVectorTy(1)) { + Value *Sext = Builder.CreateSExt(Y, I.getType()); + BinaryOperator *Add = BinaryOperator::CreateAdd(Op0, Sext); + Add->setHasNoSignedWrap(I.hasNoSignedWrap()); + return Add; + } // X - A*-B -> X + A*B // X - -A*B -> X + A*B @@ -1778,7 +2042,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { std::swap(LHS, RHS); // LHS is now O above and expected to have at least 2 uses (the min/max) // NotA is epected to have 2 uses from the min/max and 1 from the sub. - if (IsFreeToInvert(LHS, !LHS->hasNUsesOrMore(3)) && + if (isFreeToInvert(LHS, !LHS->hasNUsesOrMore(3)) && !NotA->hasNUsesOrMore(4)) { // Note: We don't generate the inverse max/min, just create the not of // it and let other folds do the rest. @@ -1796,13 +2060,15 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { Value *LHSOp, *RHSOp; if (match(Op0, m_PtrToInt(m_Value(LHSOp))) && match(Op1, m_PtrToInt(m_Value(RHSOp)))) - if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType())) + if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType(), + I.hasNoUnsignedWrap())) return replaceInstUsesWith(I, Res); // trunc(p)-trunc(q) -> trunc(p-q) if (match(Op0, m_Trunc(m_PtrToInt(m_Value(LHSOp)))) && match(Op1, m_Trunc(m_PtrToInt(m_Value(RHSOp))))) - if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType())) + if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType(), + /* IsNUW */ false)) return replaceInstUsesWith(I, Res); // Canonicalize a shifty way to code absolute value to the common pattern. 
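(The or/and/xor subtraction folds added to visitSub above rest on the fact that, bitwise, A | B splits into the disjoint parts A ^ B and A & B, so (A | B) = (A ^ B) + (A & B) with no carries. A brute-force check of the identities at 8 bits, illustrative only:)

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned AI = 0; AI < 256; ++AI) {
    for (unsigned BI = 0; BI < 256; ++BI) {
      uint8_t a = AI, b = BI;
      // (sub (or A, B), (and A, B)) --> (xor A, B)
      assert(static_cast<uint8_t>((a | b) - (a & b)) == (a ^ b));
      // (sub (or A, B), (xor A, B)) --> (and A, B)
      assert(static_cast<uint8_t>((a | b) - (a ^ b)) == (a & b));
      // (sub (and A, B), (or A, B)) --> neg (xor A, B)
      assert(static_cast<uint8_t>((a & b) - (a | b)) ==
             static_cast<uint8_t>(-(a ^ b)));
    }
  }
}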
@@ -1826,6 +2092,10 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { return SelectInst::Create(Cmp, Neg, A); } + if (Instruction *V = + canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(I)) + return V; + if (Instruction *Ext = narrowMathIfNoOverflow(I)) return Ext; @@ -1865,6 +2135,22 @@ static Instruction *foldFNegIntoConstant(Instruction &I) { return nullptr; } +static Instruction *hoistFNegAboveFMulFDiv(Instruction &I, + InstCombiner::BuilderTy &Builder) { + Value *FNeg; + if (!match(&I, m_FNeg(m_Value(FNeg)))) + return nullptr; + + Value *X, *Y; + if (match(FNeg, m_OneUse(m_FMul(m_Value(X), m_Value(Y))))) + return BinaryOperator::CreateFMulFMF(Builder.CreateFNegFMF(X, &I), Y, &I); + + if (match(FNeg, m_OneUse(m_FDiv(m_Value(X), m_Value(Y))))) + return BinaryOperator::CreateFDivFMF(Builder.CreateFNegFMF(X, &I), Y, &I); + + return nullptr; +} + Instruction *InstCombiner::visitFNeg(UnaryOperator &I) { Value *Op = I.getOperand(0); @@ -1882,6 +2168,9 @@ Instruction *InstCombiner::visitFNeg(UnaryOperator &I) { match(Op, m_OneUse(m_FSub(m_Value(X), m_Value(Y))))) return BinaryOperator::CreateFSubFMF(Y, X, &I); + if (Instruction *R = hoistFNegAboveFMulFDiv(I, Builder)) + return R; + return nullptr; } @@ -1903,6 +2192,9 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) { if (Instruction *X = foldFNegIntoConstant(I)) return X; + if (Instruction *R = hoistFNegAboveFMulFDiv(I, Builder)) + return R; + Value *X, *Y; Constant *C; @@ -1944,6 +2236,21 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) { if (match(Op1, m_OneUse(m_FPExt(m_FNeg(m_Value(Y)))))) return BinaryOperator::CreateFAddFMF(Op0, Builder.CreateFPExt(Y, Ty), &I); + // Similar to above, but look through fmul/fdiv of the negated value: + // Op0 - (-X * Y) --> Op0 + (X * Y) + // Op0 - (Y * -X) --> Op0 + (X * Y) + if (match(Op1, m_OneUse(m_c_FMul(m_FNeg(m_Value(X)), m_Value(Y))))) { + Value *FMul = Builder.CreateFMulFMF(X, Y, &I); + return BinaryOperator::CreateFAddFMF(Op0, FMul, &I); + } + // Op0 - (-X / Y) --> Op0 + (X / Y) + // Op0 - (X / -Y) --> Op0 + (X / Y) + if (match(Op1, m_OneUse(m_FDiv(m_FNeg(m_Value(X)), m_Value(Y)))) || + match(Op1, m_OneUse(m_FDiv(m_Value(X), m_FNeg(m_Value(Y)))))) { + Value *FDiv = Builder.CreateFDivFMF(X, Y, &I); + return BinaryOperator::CreateFAddFMF(Op0, FDiv, &I); + } + // Handle special cases for FSub with selects feeding the operation if (Value *V = SimplifySelectsFeedingBinaryOp(I, Op0, Op1)) return replaceInstUsesWith(I, V); diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 2b9859b602f4..cc0a9127f8b1 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -160,16 +160,14 @@ Instruction *InstCombiner::OptAndOp(BinaryOperator *Op, } /// Emit a computation of: (V >= Lo && V < Hi) if Inside is true, otherwise -/// (V < Lo || V >= Hi). This method expects that Lo <= Hi. IsSigned indicates +/// (V < Lo || V >= Hi). This method expects that Lo < Hi. IsSigned indicates /// whether to treat V, Lo, and Hi as signed or not. Value *InstCombiner::insertRangeTest(Value *V, const APInt &Lo, const APInt &Hi, bool isSigned, bool Inside) { - assert((isSigned ? Lo.sle(Hi) : Lo.ule(Hi)) && - "Lo is not <= Hi in range emission code!"); + assert((isSigned ? 
Lo.slt(Hi) : Lo.ult(Hi)) && + "Lo is not < Hi in range emission code!"); Type *Ty = V->getType(); - if (Lo == Hi) - return Inside ? ConstantInt::getFalse(Ty) : ConstantInt::getTrue(Ty); // V >= Min && V < Hi --> V < Hi // V < Min || V >= Hi --> V >= Hi @@ -1051,9 +1049,103 @@ static Value *foldIsPowerOf2(ICmpInst *Cmp0, ICmpInst *Cmp1, bool JoinedByAnd, return nullptr; } +/// Commuted variants are assumed to be handled by calling this function again +/// with the parameters swapped. +static Value *foldUnsignedUnderflowCheck(ICmpInst *ZeroICmp, + ICmpInst *UnsignedICmp, bool IsAnd, + const SimplifyQuery &Q, + InstCombiner::BuilderTy &Builder) { + Value *ZeroCmpOp; + ICmpInst::Predicate EqPred; + if (!match(ZeroICmp, m_ICmp(EqPred, m_Value(ZeroCmpOp), m_Zero())) || + !ICmpInst::isEquality(EqPred)) + return nullptr; + + auto IsKnownNonZero = [&](Value *V) { + return isKnownNonZero(V, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT); + }; + + ICmpInst::Predicate UnsignedPred; + + Value *A, *B; + if (match(UnsignedICmp, + m_c_ICmp(UnsignedPred, m_Specific(ZeroCmpOp), m_Value(A))) && + match(ZeroCmpOp, m_c_Add(m_Specific(A), m_Value(B))) && + (ZeroICmp->hasOneUse() || UnsignedICmp->hasOneUse())) { + if (UnsignedICmp->getOperand(0) != ZeroCmpOp) + UnsignedPred = ICmpInst::getSwappedPredicate(UnsignedPred); + + auto GetKnownNonZeroAndOther = [&](Value *&NonZero, Value *&Other) { + if (!IsKnownNonZero(NonZero)) + std::swap(NonZero, Other); + return IsKnownNonZero(NonZero); + }; + + // Given ZeroCmpOp = (A + B) + // ZeroCmpOp <= A && ZeroCmpOp != 0 --> (0-B) < A + // ZeroCmpOp > A || ZeroCmpOp == 0 --> (0-B) >= A + // + // ZeroCmpOp < A && ZeroCmpOp != 0 --> (0-X) < Y iff + // ZeroCmpOp >= A || ZeroCmpOp == 0 --> (0-X) >= Y iff + // with X being the value (A/B) that is known to be non-zero, + // and Y being remaining value. 
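(One way to convince yourself of the first two rewrites listed above is to brute-force them at a narrow width. A standalone check assuming 8-bit unsigned arithmetic, illustrative only:)

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned AI = 0; AI < 256; ++AI) {
    for (unsigned BI = 0; BI < 256; ++BI) {
      uint8_t A = AI, B = BI;
      uint8_t Sum = static_cast<uint8_t>(A + B);  // ZeroCmpOp = A + B
      // Sum <= A && Sum != 0  <=>  (0 - B) < A
      assert(((Sum <= A) && (Sum != 0)) ==
             (static_cast<uint8_t>(0 - B) < A));
      // Sum > A || Sum == 0   <=>  (0 - B) >= A
      assert(((Sum > A) || (Sum == 0)) ==
             (static_cast<uint8_t>(0 - B) >= A));
    }
  }
}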
+ if (UnsignedPred == ICmpInst::ICMP_ULE && EqPred == ICmpInst::ICMP_NE && + IsAnd) + return Builder.CreateICmpULT(Builder.CreateNeg(B), A); + if (UnsignedPred == ICmpInst::ICMP_ULT && EqPred == ICmpInst::ICMP_NE && + IsAnd && GetKnownNonZeroAndOther(B, A)) + return Builder.CreateICmpULT(Builder.CreateNeg(B), A); + if (UnsignedPred == ICmpInst::ICMP_UGT && EqPred == ICmpInst::ICMP_EQ && + !IsAnd) + return Builder.CreateICmpUGE(Builder.CreateNeg(B), A); + if (UnsignedPred == ICmpInst::ICMP_UGE && EqPred == ICmpInst::ICMP_EQ && + !IsAnd && GetKnownNonZeroAndOther(B, A)) + return Builder.CreateICmpUGE(Builder.CreateNeg(B), A); + } + + Value *Base, *Offset; + if (!match(ZeroCmpOp, m_Sub(m_Value(Base), m_Value(Offset)))) + return nullptr; + + if (!match(UnsignedICmp, + m_c_ICmp(UnsignedPred, m_Specific(Base), m_Specific(Offset))) || + !ICmpInst::isUnsigned(UnsignedPred)) + return nullptr; + if (UnsignedICmp->getOperand(0) != Base) + UnsignedPred = ICmpInst::getSwappedPredicate(UnsignedPred); + + // Base >=/> Offset && (Base - Offset) != 0 <--> Base > Offset + // (no overflow and not null) + if ((UnsignedPred == ICmpInst::ICMP_UGE || + UnsignedPred == ICmpInst::ICMP_UGT) && + EqPred == ICmpInst::ICMP_NE && IsAnd) + return Builder.CreateICmpUGT(Base, Offset); + + // Base <=/< Offset || (Base - Offset) == 0 <--> Base <= Offset + // (overflow or null) + if ((UnsignedPred == ICmpInst::ICMP_ULE || + UnsignedPred == ICmpInst::ICMP_ULT) && + EqPred == ICmpInst::ICMP_EQ && !IsAnd) + return Builder.CreateICmpULE(Base, Offset); + + // Base <= Offset && (Base - Offset) != 0 --> Base < Offset + if (UnsignedPred == ICmpInst::ICMP_ULE && EqPred == ICmpInst::ICMP_NE && + IsAnd) + return Builder.CreateICmpULT(Base, Offset); + + // Base > Offset || (Base - Offset) == 0 --> Base >= Offset + if (UnsignedPred == ICmpInst::ICMP_UGT && EqPred == ICmpInst::ICMP_EQ && + !IsAnd) + return Builder.CreateICmpUGE(Base, Offset); + + return nullptr; +} + /// Fold (icmp)&(icmp) if possible. Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction &CxtI) { + const SimplifyQuery Q = SQ.getWithInstruction(&CxtI); + // Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2) // if K1 and K2 are a one-bit mask. if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, true, CxtI)) @@ -1096,6 +1188,13 @@ Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, if (Value *V = foldIsPowerOf2(LHS, RHS, true /* JoinedByAnd */, Builder)) return V; + if (Value *X = + foldUnsignedUnderflowCheck(LHS, RHS, /*IsAnd=*/true, Q, Builder)) + return X; + if (Value *X = + foldUnsignedUnderflowCheck(RHS, LHS, /*IsAnd=*/true, Q, Builder)) + return X; + // This only handles icmp of constants: (icmp1 A, C1) & (icmp2 B, C2). 
Value *LHS0 = LHS->getOperand(0), *RHS0 = RHS->getOperand(0); ConstantInt *LHSC = dyn_cast<ConstantInt>(LHS->getOperand(1)); @@ -1196,16 +1295,22 @@ Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, default: llvm_unreachable("Unknown integer condition code!"); case ICmpInst::ICMP_ULT: - if (LHSC == SubOne(RHSC)) // (X != 13 & X u< 14) -> X < 13 + // (X != 13 & X u< 14) -> X < 13 + if (LHSC->getValue() == (RHSC->getValue() - 1)) return Builder.CreateICmpULT(LHS0, LHSC); - if (LHSC->isZero()) // (X != 0 & X u< 14) -> X-1 u< 13 + if (LHSC->isZero()) // (X != 0 & X u< C) -> X-1 u< C-1 return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(), false, true); break; // (X != 13 & X u< 15) -> no change case ICmpInst::ICMP_SLT: - if (LHSC == SubOne(RHSC)) // (X != 13 & X s< 14) -> X < 13 + // (X != 13 & X s< 14) -> X < 13 + if (LHSC->getValue() == (RHSC->getValue() - 1)) return Builder.CreateICmpSLT(LHS0, LHSC); - break; // (X != 13 & X s< 15) -> no change + // (X != INT_MIN & X s< C) -> X-(INT_MIN+1) u< (C-(INT_MIN+1)) + if (LHSC->isMinValue(true)) + return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(), + true, true); + break; // (X != 13 & X s< 15) -> no change case ICmpInst::ICMP_NE: // Potential folds for this case should already be handled. break; @@ -1216,10 +1321,15 @@ Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, default: llvm_unreachable("Unknown integer condition code!"); case ICmpInst::ICMP_NE: - if (RHSC == AddOne(LHSC)) // (X u> 13 & X != 14) -> X u> 14 + // (X u> 13 & X != 14) -> X u> 14 + if (RHSC->getValue() == (LHSC->getValue() + 1)) return Builder.CreateICmp(PredL, LHS0, RHSC); + // X u> C & X != UINT_MAX -> (X-(C+1)) u< UINT_MAX-(C+1) + if (RHSC->isMaxValue(false)) + return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(), + false, true); break; // (X u> 13 & X != 15) -> no change - case ICmpInst::ICMP_ULT: // (X u> 13 & X u< 15) -> (X-14) <u 1 + case ICmpInst::ICMP_ULT: // (X u> 13 & X u< 15) -> (X-14) u< 1 return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(), false, true); } @@ -1229,10 +1339,15 @@ Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, default: llvm_unreachable("Unknown integer condition code!"); case ICmpInst::ICMP_NE: - if (RHSC == AddOne(LHSC)) // (X s> 13 & X != 14) -> X s> 14 + // (X s> 13 & X != 14) -> X s> 14 + if (RHSC->getValue() == (LHSC->getValue() + 1)) return Builder.CreateICmp(PredL, LHS0, RHSC); + // X s> C & X != INT_MAX -> (X-(C+1)) u< INT_MAX-(C+1) + if (RHSC->isMaxValue(true)) + return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(), + true, true); break; // (X s> 13 & X != 15) -> no change - case ICmpInst::ICMP_SLT: // (X s> 13 & X s< 15) -> (X-14) s< 1 + case ICmpInst::ICMP_SLT: // (X s> 13 & X s< 15) -> (X-14) u< 1 return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(), true, true); } @@ -1352,8 +1467,8 @@ static Instruction *matchDeMorgansLaws(BinaryOperator &I, Value *A, *B; if (match(I.getOperand(0), m_OneUse(m_Not(m_Value(A)))) && match(I.getOperand(1), m_OneUse(m_Not(m_Value(B)))) && - !IsFreeToInvert(A, A->hasOneUse()) && - !IsFreeToInvert(B, B->hasOneUse())) { + !isFreeToInvert(A, A->hasOneUse()) && + !isFreeToInvert(B, B->hasOneUse())) { Value *AndOr = Builder.CreateBinOp(Opcode, A, B, I.getName() + ".demorgan"); return BinaryOperator::CreateNot(AndOr); } @@ -1770,13 +1885,13 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { // (A ^ B) & ((B ^ C) ^ A) -> (A ^ B) & ~C if (match(Op0, m_Xor(m_Value(A), 
m_Value(B)))) if (match(Op1, m_Xor(m_Xor(m_Specific(B), m_Value(C)), m_Specific(A)))) - if (Op1->hasOneUse() || IsFreeToInvert(C, C->hasOneUse())) + if (Op1->hasOneUse() || isFreeToInvert(C, C->hasOneUse())) return BinaryOperator::CreateAnd(Op0, Builder.CreateNot(C)); // ((A ^ C) ^ B) & (B ^ A) -> (B ^ A) & ~C if (match(Op0, m_Xor(m_Xor(m_Value(A), m_Value(C)), m_Value(B)))) if (match(Op1, m_Xor(m_Specific(B), m_Specific(A)))) - if (Op0->hasOneUse() || IsFreeToInvert(C, C->hasOneUse())) + if (Op0->hasOneUse() || isFreeToInvert(C, C->hasOneUse())) return BinaryOperator::CreateAnd(Op1, Builder.CreateNot(C)); // (A | B) & ((~A) ^ B) -> (A & B) @@ -1844,6 +1959,20 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { A->getType()->isIntOrIntVectorTy(1)) return SelectInst::Create(A, Op0, Constant::getNullValue(I.getType())); + // and(ashr(subNSW(Y, X), ScalarSizeInBits(Y)-1), X) --> X s> Y ? X : 0. + { + Value *X, *Y; + const APInt *ShAmt; + Type *Ty = I.getType(); + if (match(&I, m_c_And(m_OneUse(m_AShr(m_NSWSub(m_Value(Y), m_Value(X)), + m_APInt(ShAmt))), + m_Deferred(X))) && + *ShAmt == Ty->getScalarSizeInBits() - 1) { + Value *NewICmpInst = Builder.CreateICmpSGT(X, Y); + return SelectInst::Create(NewICmpInst, X, ConstantInt::getNullValue(Ty)); + } + } + return nullptr; } @@ -2057,6 +2186,8 @@ Value *InstCombiner::matchSelectFromAndOr(Value *A, Value *C, Value *B, /// Fold (icmp)|(icmp) if possible. Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction &CxtI) { + const SimplifyQuery Q = SQ.getWithInstruction(&CxtI); + // Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2) // if K1 and K2 are a one-bit mask. if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, false, CxtI)) @@ -2182,6 +2313,13 @@ Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, if (Value *V = foldIsPowerOf2(LHS, RHS, false /* JoinedByAnd */, Builder)) return V; + if (Value *X = + foldUnsignedUnderflowCheck(LHS, RHS, /*IsAnd=*/false, Q, Builder)) + return X; + if (Value *X = + foldUnsignedUnderflowCheck(RHS, LHS, /*IsAnd=*/false, Q, Builder)) + return X; + // This only handles icmp of constants: (icmp1 A, C1) | (icmp2 B, C2). if (!LHSC || !RHSC) return nullptr; @@ -2251,8 +2389,19 @@ Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, case ICmpInst::ICMP_EQ: // Potential folds for this case should already be handled. 
break; - case ICmpInst::ICMP_UGT: // (X == 13 | X u> 14) -> no change - case ICmpInst::ICMP_SGT: // (X == 13 | X s> 14) -> no change + case ICmpInst::ICMP_UGT: + // (X == 0 || X u> C) -> (X-1) u>= C + if (LHSC->isMinValue(false)) + return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue() + 1, + false, false); + // (X == 13 | X u> 14) -> no change + break; + case ICmpInst::ICMP_SGT: + // (X == INT_MIN || X s> C) -> (X-(INT_MIN+1)) u>= C-INT_MIN + if (LHSC->isMinValue(true)) + return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue() + 1, + true, false); + // (X == 13 | X s> 14) -> no change break; } break; @@ -2261,6 +2410,10 @@ Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, default: llvm_unreachable("Unknown integer condition code!"); case ICmpInst::ICMP_EQ: // (X u< 13 | X == 14) -> no change + // (X u< C || X == UINT_MAX) => (X-C) u>= UINT_MAX-C + if (RHSC->isMaxValue(false)) + return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue(), + false, false); break; case ICmpInst::ICMP_UGT: // (X u< 13 | X u> 15) -> (X-13) u> 2 assert(!RHSC->isMaxValue(false) && "Missed icmp simplification"); @@ -2272,9 +2425,14 @@ Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, switch (PredR) { default: llvm_unreachable("Unknown integer condition code!"); - case ICmpInst::ICMP_EQ: // (X s< 13 | X == 14) -> no change + case ICmpInst::ICMP_EQ: + // (X s< C || X == INT_MAX) => (X-C) u>= INT_MAX-C + if (RHSC->isMaxValue(true)) + return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue(), + true, false); + // (X s< 13 | X == 14) -> no change break; - case ICmpInst::ICMP_SGT: // (X s< 13 | X s> 15) -> (X-13) s> 2 + case ICmpInst::ICMP_SGT: // (X s< 13 | X s> 15) -> (X-13) u> 2 assert(!RHSC->isMaxValue(true) && "Missed icmp simplification"); return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue() + 1, true, false); @@ -2552,6 +2710,25 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { } } + // or(ashr(subNSW(Y, X), ScalarSizeInBits(Y)-1), X) --> X s> Y ? -1 : X. + { + Value *X, *Y; + const APInt *ShAmt; + Type *Ty = I.getType(); + if (match(&I, m_c_Or(m_OneUse(m_AShr(m_NSWSub(m_Value(Y), m_Value(X)), + m_APInt(ShAmt))), + m_Deferred(X))) && + *ShAmt == Ty->getScalarSizeInBits() - 1) { + Value *NewICmpInst = Builder.CreateICmpSGT(X, Y); + return SelectInst::Create(NewICmpInst, ConstantInt::getAllOnesValue(Ty), + X); + } + } + + if (Instruction *V = + canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(I)) + return V; + return nullptr; } @@ -2617,7 +2794,11 @@ static Instruction *foldXorToXor(BinaryOperator &I, return nullptr; } -Value *InstCombiner::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS) { +Value *InstCombiner::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS, + BinaryOperator &I) { + assert(I.getOpcode() == Instruction::Xor && I.getOperand(0) == LHS && + I.getOperand(1) == RHS && "Should be 'xor' with these operands"); + if (predicatesFoldable(LHS->getPredicate(), RHS->getPredicate())) { if (LHS->getOperand(0) == RHS->getOperand(1) && LHS->getOperand(1) == RHS->getOperand(0)) @@ -2672,14 +2853,35 @@ Value *InstCombiner::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS) { // TODO: If OrICmp is false, the whole thing is false (InstSimplify?). if (Value *AndICmp = SimplifyBinOp(Instruction::And, LHS, RHS, SQ)) { // TODO: Independently handle cases where the 'and' side is a constant. 
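(Side note on the new equality-plus-relational folds above: subtracting the range's lower bound rotates it so it starts at zero, letting one unsigned comparison replace two. For the fold commented as "(X == 0 || X u> C) -> (X-1) u>= C", a brute-force sanity check at 8 bits, illustrative only:)

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned XI = 0; XI < 256; ++XI) {
    for (unsigned CI = 0; CI < 256; ++CI) {
      uint8_t X = XI, C = CI;
      // (X == 0 || X u> C)  <=>  (X - 1) u>= C
      assert(((X == 0) || (X > C)) == (static_cast<uint8_t>(X - 1) >= C));
    }
  }
}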
- if (OrICmp == LHS && AndICmp == RHS && RHS->hasOneUse()) { - // (LHS | RHS) & !(LHS & RHS) --> LHS & !RHS - RHS->setPredicate(RHS->getInversePredicate()); - return Builder.CreateAnd(LHS, RHS); + ICmpInst *X = nullptr, *Y = nullptr; + if (OrICmp == LHS && AndICmp == RHS) { + // (LHS | RHS) & !(LHS & RHS) --> LHS & !RHS --> X & !Y + X = LHS; + Y = RHS; } - if (OrICmp == RHS && AndICmp == LHS && LHS->hasOneUse()) { - // !(LHS & RHS) & (LHS | RHS) --> !LHS & RHS - LHS->setPredicate(LHS->getInversePredicate()); + if (OrICmp == RHS && AndICmp == LHS) { + // !(LHS & RHS) & (LHS | RHS) --> !LHS & RHS --> !Y & X + X = RHS; + Y = LHS; + } + if (X && Y && (Y->hasOneUse() || canFreelyInvertAllUsersOf(Y, &I))) { + // Invert the predicate of 'Y', thus inverting its output. + Y->setPredicate(Y->getInversePredicate()); + // So, are there other uses of Y? + if (!Y->hasOneUse()) { + // We need to adapt other uses of Y though. Get a value that matches + // the original value of Y before inversion. While this increases + // immediate instruction count, we have just ensured that all the + // users are freely-invertible, so that 'not' *will* get folded away. + BuilderTy::InsertPointGuard Guard(Builder); + // Set insertion point to right after the Y. + Builder.SetInsertPoint(Y->getParent(), ++(Y->getIterator())); + Value *NotY = Builder.CreateNot(Y, Y->getName() + ".not"); + // Replace all uses of Y (excluding the one in NotY!) with NotY. + Y->replaceUsesWithIf(NotY, + [NotY](Use &U) { return U.getUser() != NotY; }); + } + // All done. return Builder.CreateAnd(LHS, RHS); } } @@ -2747,9 +2949,9 @@ static Instruction *sinkNotIntoXor(BinaryOperator &I, return nullptr; // We only want to do the transform if it is free to do. - if (IsFreeToInvert(X, X->hasOneUse())) { + if (isFreeToInvert(X, X->hasOneUse())) { // Ok, good. - } else if (IsFreeToInvert(Y, Y->hasOneUse())) { + } else if (isFreeToInvert(Y, Y->hasOneUse())) { std::swap(X, Y); } else return nullptr; @@ -2827,9 +3029,9 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { // Apply DeMorgan's Law when inverts are free: // ~(X & Y) --> (~X | ~Y) // ~(X | Y) --> (~X & ~Y) - if (IsFreeToInvert(NotVal->getOperand(0), + if (isFreeToInvert(NotVal->getOperand(0), NotVal->getOperand(0)->hasOneUse()) && - IsFreeToInvert(NotVal->getOperand(1), + isFreeToInvert(NotVal->getOperand(1), NotVal->getOperand(1)->hasOneUse())) { Value *NotX = Builder.CreateNot(NotVal->getOperand(0), "notlhs"); Value *NotY = Builder.CreateNot(NotVal->getOperand(1), "notrhs"); @@ -3004,7 +3206,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { if (auto *LHS = dyn_cast<ICmpInst>(I.getOperand(0))) if (auto *RHS = dyn_cast<ICmpInst>(I.getOperand(1))) - if (Value *V = foldXorOfICmps(LHS, RHS)) + if (Value *V = foldXorOfICmps(LHS, RHS, I)) return replaceInstUsesWith(I, V); if (Instruction *CastedXor = foldCastedBitwiseLogic(I)) @@ -3052,7 +3254,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { if (SelectPatternResult::isMinOrMax(SPF)) { // It's possible we get here before the not has been simplified, so make // sure the input to the not isn't freely invertible. 
- if (match(LHS, m_Not(m_Value(X))) && !IsFreeToInvert(X, X->hasOneUse())) { + if (match(LHS, m_Not(m_Value(X))) && !isFreeToInvert(X, X->hasOneUse())) { Value *NotY = Builder.CreateNot(RHS); return SelectInst::Create( Builder.CreateICmp(getInverseMinMaxPred(SPF), X, NotY), X, NotY); @@ -3060,7 +3262,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { // It's possible we get here before the not has been simplified, so make // sure the input to the not isn't freely invertible. - if (match(RHS, m_Not(m_Value(Y))) && !IsFreeToInvert(Y, Y->hasOneUse())) { + if (match(RHS, m_Not(m_Value(Y))) && !isFreeToInvert(Y, Y->hasOneUse())) { Value *NotX = Builder.CreateNot(LHS); return SelectInst::Create( Builder.CreateICmp(getInverseMinMaxPred(SPF), NotX, Y), NotX, Y); @@ -3068,8 +3270,8 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { // If both sides are freely invertible, then we can get rid of the xor // completely. - if (IsFreeToInvert(LHS, !LHS->hasNUsesOrMore(3)) && - IsFreeToInvert(RHS, !RHS->hasNUsesOrMore(3))) { + if (isFreeToInvert(LHS, !LHS->hasNUsesOrMore(3)) && + isFreeToInvert(RHS, !RHS->hasNUsesOrMore(3))) { Value *NotLHS = Builder.CreateNot(LHS); Value *NotRHS = Builder.CreateNot(RHS); return SelectInst::Create( @@ -3077,6 +3279,23 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { NotLHS, NotRHS); } } + + // Pull 'not' into operands of select if both operands are one-use compares. + // Inverting the predicates eliminates the 'not' operation. + // Example: + // not (select ?, (cmp TPred, ?, ?), (cmp FPred, ?, ?) --> + // select ?, (cmp InvTPred, ?, ?), (cmp InvFPred, ?, ?) + // TODO: Canonicalize by hoisting 'not' into an arm of the select if only + // 1 select operand is a cmp? + if (auto *Sel = dyn_cast<SelectInst>(Op0)) { + auto *CmpT = dyn_cast<CmpInst>(Sel->getTrueValue()); + auto *CmpF = dyn_cast<CmpInst>(Sel->getFalseValue()); + if (CmpT && CmpF && CmpT->hasOneUse() && CmpF->hasOneUse()) { + CmpT->setPredicate(CmpT->getInversePredicate()); + CmpF->setPredicate(CmpF->getInversePredicate()); + return replaceInstUsesWith(I, Sel); + } + } } if (Instruction *NewXor = sinkNotIntoXor(I, Builder)) diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp index 5f37a00f56cf..825f4b468b0a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp @@ -124,7 +124,7 @@ Instruction *InstCombiner::visitAtomicRMWInst(AtomicRMWInst &RMWI) { auto *SI = new StoreInst(RMWI.getValOperand(), RMWI.getPointerOperand(), &RMWI); SI->setAtomic(Ordering, RMWI.getSyncScopeID()); - SI->setAlignment(DL.getABITypeAlignment(RMWI.getType())); + SI->setAlignment(MaybeAlign(DL.getABITypeAlignment(RMWI.getType()))); return eraseInstFromFunction(RMWI); } @@ -154,6 +154,6 @@ Instruction *InstCombiner::visitAtomicRMWInst(AtomicRMWInst &RMWI) { LoadInst *Load = new LoadInst(RMWI.getType(), RMWI.getPointerOperand()); Load->setAtomic(Ordering, RMWI.getSyncScopeID()); - Load->setAlignment(DL.getABITypeAlignment(RMWI.getType())); + Load->setAlignment(MaybeAlign(DL.getABITypeAlignment(RMWI.getType()))); return Load; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 4b3333affa72..f463c5fa1138 100644 --- 
a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -40,6 +40,12 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsX86.h" +#include "llvm/IR/IntrinsicsARM.h" +#include "llvm/IR/IntrinsicsAArch64.h" +#include "llvm/IR/IntrinsicsNVPTX.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/IntrinsicsPowerPC.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/PatternMatch.h" @@ -185,7 +191,8 @@ Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { Value *Dest = Builder.CreateBitCast(MI->getArgOperand(0), NewDstPtrTy); LoadInst *L = Builder.CreateLoad(IntType, Src); // Alignment from the mem intrinsic will be better, so use it. - L->setAlignment(CopySrcAlign); + L->setAlignment( + MaybeAlign(CopySrcAlign)); // FIXME: Check if we can use Align instead. if (CopyMD) L->setMetadata(LLVMContext::MD_tbaa, CopyMD); MDNode *LoopMemParallelMD = @@ -198,7 +205,8 @@ Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { StoreInst *S = Builder.CreateStore(L, Dest); // Alignment from the mem intrinsic will be better, so use it. - S->setAlignment(CopyDstAlign); + S->setAlignment( + MaybeAlign(CopyDstAlign)); // FIXME: Check if we can use Align instead. if (CopyMD) S->setMetadata(LLVMContext::MD_tbaa, CopyMD); if (LoopMemParallelMD) @@ -223,9 +231,10 @@ Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { } Instruction *InstCombiner::SimplifyAnyMemSet(AnyMemSetInst *MI) { - unsigned Alignment = getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT); - if (MI->getDestAlignment() < Alignment) { - MI->setDestAlignment(Alignment); + const unsigned KnownAlignment = + getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT); + if (MI->getDestAlignment() < KnownAlignment) { + MI->setDestAlignment(KnownAlignment); return MI; } @@ -243,13 +252,9 @@ Instruction *InstCombiner::SimplifyAnyMemSet(AnyMemSetInst *MI) { ConstantInt *FillC = dyn_cast<ConstantInt>(MI->getValue()); if (!LenC || !FillC || !FillC->getType()->isIntegerTy(8)) return nullptr; - uint64_t Len = LenC->getLimitedValue(); - Alignment = MI->getDestAlignment(); + const uint64_t Len = LenC->getLimitedValue(); assert(Len && "0-sized memory setting should be removed already."); - - // Alignment 0 is identity for alignment 1 for memset, but not store. - if (Alignment == 0) - Alignment = 1; + const Align Alignment = assumeAligned(MI->getDestAlignment()); // If it is an atomic and alignment is less than the size then we will // introduce the unaligned memory access which will be later transformed @@ -1060,9 +1065,9 @@ Value *InstCombiner::simplifyMaskedLoad(IntrinsicInst &II) { // If we can unconditionally load from this address, replace with a // load/select idiom. 
TODO: use DT for context sensitive query - if (isDereferenceableAndAlignedPointer(LoadPtr, II.getType(), Alignment, - II.getModule()->getDataLayout(), - &II, nullptr)) { + if (isDereferenceableAndAlignedPointer( + LoadPtr, II.getType(), MaybeAlign(Alignment), + II.getModule()->getDataLayout(), &II, nullptr)) { Value *LI = Builder.CreateAlignedLoad(II.getType(), LoadPtr, Alignment, "unmaskedload"); return Builder.CreateSelect(II.getArgOperand(2), LI, II.getArgOperand(3)); @@ -1086,7 +1091,8 @@ Instruction *InstCombiner::simplifyMaskedStore(IntrinsicInst &II) { // If the mask is all ones, this is a plain vector store of the 1st argument. if (ConstMask->isAllOnesValue()) { Value *StorePtr = II.getArgOperand(1); - unsigned Alignment = cast<ConstantInt>(II.getArgOperand(2))->getZExtValue(); + MaybeAlign Alignment( + cast<ConstantInt>(II.getArgOperand(2))->getZExtValue()); return new StoreInst(II.getArgOperand(0), StorePtr, false, Alignment); } @@ -2234,6 +2240,15 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return replaceInstUsesWith(*II, Add); } + // Try to simplify the underlying FMul. + if (Value *V = SimplifyFMulInst(II->getArgOperand(0), II->getArgOperand(1), + II->getFastMathFlags(), + SQ.getWithInstruction(II))) { + auto *FAdd = BinaryOperator::CreateFAdd(V, II->getArgOperand(2)); + FAdd->copyFastMathFlags(II); + return FAdd; + } + LLVM_FALLTHROUGH; } case Intrinsic::fma: { @@ -2258,15 +2273,47 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return II; } - // fma x, 1, z -> fadd x, z - if (match(Src1, m_FPOne())) { - auto *FAdd = BinaryOperator::CreateFAdd(Src0, II->getArgOperand(2)); + // Try to simplify the underlying FMul. We can only apply simplifications + // that do not require rounding. + if (Value *V = SimplifyFMAFMul(II->getArgOperand(0), II->getArgOperand(1), + II->getFastMathFlags(), + SQ.getWithInstruction(II))) { + auto *FAdd = BinaryOperator::CreateFAdd(V, II->getArgOperand(2)); FAdd->copyFastMathFlags(II); return FAdd; } break; } + case Intrinsic::copysign: { + if (SignBitMustBeZero(II->getArgOperand(1), &TLI)) { + // If we know that the sign argument is positive, reduce to FABS: + // copysign X, Pos --> fabs X + Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, + II->getArgOperand(0), II); + return replaceInstUsesWith(*II, Fabs); + } + // TODO: There should be a ValueTracking sibling like SignBitMustBeOne. + const APFloat *C; + if (match(II->getArgOperand(1), m_APFloat(C)) && C->isNegative()) { + // If we know that the sign argument is negative, reduce to FNABS: + // copysign X, Neg --> fneg (fabs X) + Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, + II->getArgOperand(0), II); + return replaceInstUsesWith(*II, Builder.CreateFNegFMF(Fabs, II)); + } + + // Propagate sign argument through nested calls: + // copysign X, (copysign ?, SignArg) --> copysign X, SignArg + Value *SignArg; + if (match(II->getArgOperand(1), + m_Intrinsic<Intrinsic::copysign>(m_Value(), m_Value(SignArg)))) { + II->setArgOperand(1, SignArg); + return II; + } + + break; + } case Intrinsic::fabs: { Value *Cond; Constant *LHS, *RHS; @@ -2331,7 +2378,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // Turn PPC VSX loads into normal loads. 
Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), PointerType::getUnqual(II->getType())); - return new LoadInst(II->getType(), Ptr, Twine(""), false, 1); + return new LoadInst(II->getType(), Ptr, Twine(""), false, Align::None()); } case Intrinsic::ppc_altivec_stvx: case Intrinsic::ppc_altivec_stvxl: @@ -2349,7 +2396,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // Turn PPC VSX stores into normal stores. Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(0)->getType()); Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); - return new StoreInst(II->getArgOperand(0), Ptr, false, 1); + return new StoreInst(II->getArgOperand(0), Ptr, false, Align::None()); } case Intrinsic::ppc_qpx_qvlfs: // Turn PPC QPX qvlfs -> load if the pointer is known aligned. @@ -2440,6 +2487,64 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // TODO should we convert this to an AND if the RHS is constant? } break; + case Intrinsic::x86_bmi_pext_32: + case Intrinsic::x86_bmi_pext_64: + if (auto *MaskC = dyn_cast<ConstantInt>(II->getArgOperand(1))) { + if (MaskC->isNullValue()) + return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0)); + if (MaskC->isAllOnesValue()) + return replaceInstUsesWith(CI, II->getArgOperand(0)); + + if (auto *SrcC = dyn_cast<ConstantInt>(II->getArgOperand(0))) { + uint64_t Src = SrcC->getZExtValue(); + uint64_t Mask = MaskC->getZExtValue(); + uint64_t Result = 0; + uint64_t BitToSet = 1; + + while (Mask) { + // Isolate lowest set bit. + uint64_t BitToTest = Mask & -Mask; + if (BitToTest & Src) + Result |= BitToSet; + + BitToSet <<= 1; + // Clear lowest set bit. + Mask &= Mask - 1; + } + + return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result)); + } + } + break; + case Intrinsic::x86_bmi_pdep_32: + case Intrinsic::x86_bmi_pdep_64: + if (auto *MaskC = dyn_cast<ConstantInt>(II->getArgOperand(1))) { + if (MaskC->isNullValue()) + return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0)); + if (MaskC->isAllOnesValue()) + return replaceInstUsesWith(CI, II->getArgOperand(0)); + + if (auto *SrcC = dyn_cast<ConstantInt>(II->getArgOperand(0))) { + uint64_t Src = SrcC->getZExtValue(); + uint64_t Mask = MaskC->getZExtValue(); + uint64_t Result = 0; + uint64_t BitToTest = 1; + + while (Mask) { + // Isolate lowest set bit. 
+ uint64_t BitToSet = Mask & -Mask; + if (BitToTest & Src) + Result |= BitToSet; + + BitToTest <<= 1; + // Clear lowest set bit; + Mask &= Mask - 1; + } + + return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result)); + } + } + break; case Intrinsic::x86_vcvtph2ps_128: case Intrinsic::x86_vcvtph2ps_256: { @@ -3296,6 +3401,60 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { } break; } + case Intrinsic::arm_mve_pred_i2v: { + Value *Arg = II->getArgOperand(0); + Value *ArgArg; + if (match(Arg, m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(m_Value(ArgArg))) && + II->getType() == ArgArg->getType()) + return replaceInstUsesWith(*II, ArgArg); + Constant *XorMask; + if (match(Arg, + m_Xor(m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(m_Value(ArgArg)), + m_Constant(XorMask))) && + II->getType() == ArgArg->getType()) { + if (auto *CI = dyn_cast<ConstantInt>(XorMask)) { + if (CI->getValue().trunc(16).isAllOnesValue()) { + auto TrueVector = Builder.CreateVectorSplat( + II->getType()->getVectorNumElements(), Builder.getTrue()); + return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector); + } + } + } + KnownBits ScalarKnown(32); + if (SimplifyDemandedBits(II, 0, APInt::getLowBitsSet(32, 16), + ScalarKnown, 0)) + return II; + break; + } + case Intrinsic::arm_mve_pred_v2i: { + Value *Arg = II->getArgOperand(0); + Value *ArgArg; + if (match(Arg, m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(m_Value(ArgArg)))) + return replaceInstUsesWith(*II, ArgArg); + if (!II->getMetadata(LLVMContext::MD_range)) { + Type *IntTy32 = Type::getInt32Ty(II->getContext()); + Metadata *M[] = { + ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)), + ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0xFFFF)) + }; + II->setMetadata(LLVMContext::MD_range, MDNode::get(II->getContext(), M)); + return II; + } + break; + } + case Intrinsic::arm_mve_vadc: + case Intrinsic::arm_mve_vadc_predicated: { + unsigned CarryOp = + (II->getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2; + assert(II->getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 && + "Bad type for intrinsic!"); + + KnownBits CarryKnown(32); + if (SimplifyDemandedBits(II, CarryOp, APInt::getOneBitSet(32, 29), + CarryKnown)) + return II; + break; + } case Intrinsic::amdgcn_rcp: { Value *Src = II->getArgOperand(0); @@ -3305,7 +3464,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) { const APFloat &ArgVal = C->getValueAPF(); - APFloat Val(ArgVal.getSemantics(), 1.0); + APFloat Val(ArgVal.getSemantics(), 1); APFloat::opStatus Status = Val.divide(ArgVal, APFloat::rmNearestTiesToEven); // Only do this if it was exact and therefore not dependent on the @@ -3860,7 +4019,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return eraseInstFromFunction(CI); // Bail if we cross over an intrinsic with side effects, such as - // llvm.stacksave, llvm.read_register, or llvm.setjmp. + // llvm.stacksave, or llvm.read_register. if (II2->mayHaveSideEffects()) { CannotRemove = true; break; @@ -3885,6 +4044,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // Asan needs to poison memory to detect invalid access which is possible // even for empty lifetime range. 
if (II->getFunction()->hasFnAttribute(Attribute::SanitizeAddress) || + II->getFunction()->hasFnAttribute(Attribute::SanitizeMemory) || II->getFunction()->hasFnAttribute(Attribute::SanitizeHWAddress)) break; @@ -3950,10 +4110,21 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } case Intrinsic::experimental_gc_relocate: { + auto &GCR = *cast<GCRelocateInst>(II); + + // If we have two copies of the same pointer in the statepoint argument + // list, canonicalize to one. This may let us common gc.relocates. + if (GCR.getBasePtr() == GCR.getDerivedPtr() && + GCR.getBasePtrIndex() != GCR.getDerivedPtrIndex()) { + auto *OpIntTy = GCR.getOperand(2)->getType(); + II->setOperand(2, ConstantInt::get(OpIntTy, GCR.getBasePtrIndex())); + return II; + } + // Translate facts known about a pointer before relocating into // facts about the relocate value, while being careful to // preserve relocation semantics. - Value *DerivedPtr = cast<GCRelocateInst>(II)->getDerivedPtr(); + Value *DerivedPtr = GCR.getDerivedPtr(); // Remove the relocation if unused, note that this check is required // to prevent the cases below from looping forever. @@ -3995,12 +4166,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // Is this guard followed by another guard? We scan forward over a small // fixed window of instructions to handle common cases with conditions // computed between guards. - Instruction *NextInst = II->getNextNode(); + Instruction *NextInst = II->getNextNonDebugInstruction(); for (unsigned i = 0; i < GuardWideningWindow; i++) { // Note: Using context-free form to avoid compile time blow up if (!isSafeToSpeculativelyExecute(NextInst)) break; - NextInst = NextInst->getNextNode(); + NextInst = NextInst->getNextNonDebugInstruction(); } Value *NextCond = nullptr; if (match(NextInst, @@ -4008,18 +4179,18 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { Value *CurrCond = II->getArgOperand(0); // Remove a guard that it is immediately preceded by an identical guard. - if (CurrCond == NextCond) - return eraseInstFromFunction(*NextInst); - // Otherwise canonicalize guard(a); guard(b) -> guard(a & b). - Instruction* MoveI = II->getNextNode(); - while (MoveI != NextInst) { - auto *Temp = MoveI; - MoveI = MoveI->getNextNode(); - Temp->moveBefore(II); + if (CurrCond != NextCond) { + Instruction *MoveI = II->getNextNonDebugInstruction(); + while (MoveI != NextInst) { + auto *Temp = MoveI; + MoveI = MoveI->getNextNonDebugInstruction(); + Temp->moveBefore(II); + } + II->setArgOperand(0, Builder.CreateAnd(CurrCond, NextCond)); } - II->setArgOperand(0, Builder.CreateAnd(CurrCond, NextCond)); - return eraseInstFromFunction(*NextInst); + eraseInstFromFunction(*NextInst); + return II; } break; } @@ -4177,10 +4348,58 @@ static IntrinsicInst *findInitTrampoline(Value *Callee) { return nullptr; } +static void annotateAnyAllocSite(CallBase &Call, const TargetLibraryInfo *TLI) { + unsigned NumArgs = Call.getNumArgOperands(); + ConstantInt *Op0C = dyn_cast<ConstantInt>(Call.getOperand(0)); + ConstantInt *Op1C = + (NumArgs == 1) ? nullptr : dyn_cast<ConstantInt>(Call.getOperand(1)); + // Bail out if the allocation size is zero. 
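(A rough scalar analogue of what annotateAnyAllocSite computes for calloc-like calls below, where the IR code uses APInt::umul_ov: the dereferenceable byte count is only attached when nmemb * size does not overflow. Hypothetical helper, not part of the patch:)

#include <cstdint>
#include <optional>

// Byte count to annotate for calloc(N, Size), or nothing if the unsigned
// multiplication overflows (in which case no attribute is added).
std::optional<uint64_t> callocDereferenceableBytes(uint64_t N, uint64_t Size) {
  if (Size != 0 && N > UINT64_MAX / Size)
    return std::nullopt; // overflow, mirrors the umul_ov overflow flag
  return N * Size;
}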
+ if ((Op0C && Op0C->isNullValue()) || (Op1C && Op1C->isNullValue())) + return; + + if (isMallocLikeFn(&Call, TLI) && Op0C) { + if (isOpNewLikeFn(&Call, TLI)) + Call.addAttribute(AttributeList::ReturnIndex, + Attribute::getWithDereferenceableBytes( + Call.getContext(), Op0C->getZExtValue())); + else + Call.addAttribute(AttributeList::ReturnIndex, + Attribute::getWithDereferenceableOrNullBytes( + Call.getContext(), Op0C->getZExtValue())); + } else if (isReallocLikeFn(&Call, TLI) && Op1C) { + Call.addAttribute(AttributeList::ReturnIndex, + Attribute::getWithDereferenceableOrNullBytes( + Call.getContext(), Op1C->getZExtValue())); + } else if (isCallocLikeFn(&Call, TLI) && Op0C && Op1C) { + bool Overflow; + const APInt &N = Op0C->getValue(); + APInt Size = N.umul_ov(Op1C->getValue(), Overflow); + if (!Overflow) + Call.addAttribute(AttributeList::ReturnIndex, + Attribute::getWithDereferenceableOrNullBytes( + Call.getContext(), Size.getZExtValue())); + } else if (isStrdupLikeFn(&Call, TLI)) { + uint64_t Len = GetStringLength(Call.getOperand(0)); + if (Len) { + // strdup + if (NumArgs == 1) + Call.addAttribute(AttributeList::ReturnIndex, + Attribute::getWithDereferenceableOrNullBytes( + Call.getContext(), Len)); + // strndup + else if (NumArgs == 2 && Op1C) + Call.addAttribute( + AttributeList::ReturnIndex, + Attribute::getWithDereferenceableOrNullBytes( + Call.getContext(), std::min(Len, Op1C->getZExtValue() + 1))); + } + } +} + /// Improvements for call, callbr and invoke instructions. Instruction *InstCombiner::visitCallBase(CallBase &Call) { - if (isAllocLikeFn(&Call, &TLI)) - return visitAllocSite(Call); + if (isAllocationFn(&Call, &TLI)) + annotateAnyAllocSite(Call, &TLI); bool Changed = false; @@ -4312,6 +4531,9 @@ Instruction *InstCombiner::visitCallBase(CallBase &Call) { if (I) return eraseInstFromFunction(*I); } + if (isAllocLikeFn(&Call, &TLI)) + return visitAllocSite(Call); + return Changed ? &Call : nullptr; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 2c9ba203fbf3..71b7f279e5fa 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -18,6 +18,7 @@ #include "llvm/IR/DIBuilder.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/KnownBits.h" +#include <numeric> using namespace llvm; using namespace PatternMatch; @@ -140,7 +141,7 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, } AllocaInst *New = AllocaBuilder.CreateAlloca(CastElTy, Amt); - New->setAlignment(AI.getAlignment()); + New->setAlignment(MaybeAlign(AI.getAlignment())); New->takeName(&AI); New->setUsedWithInAlloca(AI.isUsedWithInAlloca()); @@ -843,33 +844,33 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) { return nullptr; } -Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, ZExtInst &CI, +Instruction *InstCombiner::transformZExtICmp(ICmpInst *Cmp, ZExtInst &Zext, bool DoTransform) { // If we are just checking for a icmp eq of a single bit and zext'ing it // to an integer, then shift the bit to the appropriate place and then // cast to integer to avoid the comparison. const APInt *Op1CV; - if (match(ICI->getOperand(1), m_APInt(Op1CV))) { + if (match(Cmp->getOperand(1), m_APInt(Op1CV))) { // zext (x <s 0) to i32 --> x>>u31 true if signbit set. // zext (x >s -1) to i32 --> (x>>u31)^1 true if signbit clear. 
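(The two comments above translate directly into scalar code; a small check assuming 32-bit integers, illustrative only:)

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t X : {INT32_MIN, -7, -1, 0, 1, 42, INT32_MAX}) {
    uint32_t U = static_cast<uint32_t>(X);
    // zext (x <s 0)  --> x >>u 31
    assert(static_cast<uint32_t>(X < 0) == (U >> 31));
    // zext (x >s -1) --> (x >>u 31) ^ 1
    assert(static_cast<uint32_t>(X > -1) == ((U >> 31) ^ 1));
  }
}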
- if ((ICI->getPredicate() == ICmpInst::ICMP_SLT && Op1CV->isNullValue()) || - (ICI->getPredicate() == ICmpInst::ICMP_SGT && Op1CV->isAllOnesValue())) { - if (!DoTransform) return ICI; + if ((Cmp->getPredicate() == ICmpInst::ICMP_SLT && Op1CV->isNullValue()) || + (Cmp->getPredicate() == ICmpInst::ICMP_SGT && Op1CV->isAllOnesValue())) { + if (!DoTransform) return Cmp; - Value *In = ICI->getOperand(0); + Value *In = Cmp->getOperand(0); Value *Sh = ConstantInt::get(In->getType(), In->getType()->getScalarSizeInBits() - 1); In = Builder.CreateLShr(In, Sh, In->getName() + ".lobit"); - if (In->getType() != CI.getType()) - In = Builder.CreateIntCast(In, CI.getType(), false /*ZExt*/); + if (In->getType() != Zext.getType()) + In = Builder.CreateIntCast(In, Zext.getType(), false /*ZExt*/); - if (ICI->getPredicate() == ICmpInst::ICMP_SGT) { + if (Cmp->getPredicate() == ICmpInst::ICMP_SGT) { Constant *One = ConstantInt::get(In->getType(), 1); In = Builder.CreateXor(In, One, In->getName() + ".not"); } - return replaceInstUsesWith(CI, In); + return replaceInstUsesWith(Zext, In); } // zext (X == 0) to i32 --> X^1 iff X has only the low bit set. @@ -882,24 +883,24 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, ZExtInst &CI, // zext (X != 2) to i32 --> (X>>1)^1 iff X has only the 2nd bit set. if ((Op1CV->isNullValue() || Op1CV->isPowerOf2()) && // This only works for EQ and NE - ICI->isEquality()) { + Cmp->isEquality()) { // If Op1C some other power of two, convert: - KnownBits Known = computeKnownBits(ICI->getOperand(0), 0, &CI); + KnownBits Known = computeKnownBits(Cmp->getOperand(0), 0, &Zext); APInt KnownZeroMask(~Known.Zero); if (KnownZeroMask.isPowerOf2()) { // Exactly 1 possible 1? - if (!DoTransform) return ICI; + if (!DoTransform) return Cmp; - bool isNE = ICI->getPredicate() == ICmpInst::ICMP_NE; + bool isNE = Cmp->getPredicate() == ICmpInst::ICMP_NE; if (!Op1CV->isNullValue() && (*Op1CV != KnownZeroMask)) { // (X&4) == 2 --> false // (X&4) != 2 --> true - Constant *Res = ConstantInt::get(CI.getType(), isNE); - return replaceInstUsesWith(CI, Res); + Constant *Res = ConstantInt::get(Zext.getType(), isNE); + return replaceInstUsesWith(Zext, Res); } uint32_t ShAmt = KnownZeroMask.logBase2(); - Value *In = ICI->getOperand(0); + Value *In = Cmp->getOperand(0); if (ShAmt) { // Perform a logical shr by shiftamt. // Insert the shift to put the result in the low bit. @@ -912,11 +913,11 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, ZExtInst &CI, In = Builder.CreateXor(In, One); } - if (CI.getType() == In->getType()) - return replaceInstUsesWith(CI, In); + if (Zext.getType() == In->getType()) + return replaceInstUsesWith(Zext, In); - Value *IntCast = Builder.CreateIntCast(In, CI.getType(), false); - return replaceInstUsesWith(CI, IntCast); + Value *IntCast = Builder.CreateIntCast(In, Zext.getType(), false); + return replaceInstUsesWith(Zext, IntCast); } } } @@ -924,19 +925,19 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, ZExtInst &CI, // icmp ne A, B is equal to xor A, B when A and B only really have one bit. // It is also profitable to transform icmp eq into not(xor(A, B)) because that // may lead to additional simplifications. 
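// Illustrative aside (not part of the patch): when A and B are known to agree
// in every bit except one (bit 3 in this hypothetical), zext(A != B) is that
// bit of A ^ B shifted into the low position, and zext(A == B) is its xor with
// 1, matching the transform above.
#include <assert.h>
#include <stdint.h>
int main(void) {
  uint32_t vals[] = {0u, 8u}; // only bit 3 is unknown
  for (int i = 0; i < 2; ++i) {
    for (int j = 0; j < 2; ++j) {
      uint32_t a = vals[i], b = vals[j];
      assert((uint32_t)(a != b) == ((a ^ b) >> 3));
      assert((uint32_t)(a == b) == (((a ^ b) >> 3) ^ 1u));
    }
  }
  return 0;
}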
- if (ICI->isEquality() && CI.getType() == ICI->getOperand(0)->getType()) { - if (IntegerType *ITy = dyn_cast<IntegerType>(CI.getType())) { - Value *LHS = ICI->getOperand(0); - Value *RHS = ICI->getOperand(1); + if (Cmp->isEquality() && Zext.getType() == Cmp->getOperand(0)->getType()) { + if (IntegerType *ITy = dyn_cast<IntegerType>(Zext.getType())) { + Value *LHS = Cmp->getOperand(0); + Value *RHS = Cmp->getOperand(1); - KnownBits KnownLHS = computeKnownBits(LHS, 0, &CI); - KnownBits KnownRHS = computeKnownBits(RHS, 0, &CI); + KnownBits KnownLHS = computeKnownBits(LHS, 0, &Zext); + KnownBits KnownRHS = computeKnownBits(RHS, 0, &Zext); if (KnownLHS.Zero == KnownRHS.Zero && KnownLHS.One == KnownRHS.One) { APInt KnownBits = KnownLHS.Zero | KnownLHS.One; APInt UnknownBit = ~KnownBits; if (UnknownBit.countPopulation() == 1) { - if (!DoTransform) return ICI; + if (!DoTransform) return Cmp; Value *Result = Builder.CreateXor(LHS, RHS); @@ -949,10 +950,10 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, ZExtInst &CI, Result = Builder.CreateLShr( Result, ConstantInt::get(ITy, UnknownBit.countTrailingZeros())); - if (ICI->getPredicate() == ICmpInst::ICMP_EQ) + if (Cmp->getPredicate() == ICmpInst::ICMP_EQ) Result = Builder.CreateXor(Result, ConstantInt::get(ITy, 1)); - Result->takeName(ICI); - return replaceInstUsesWith(CI, Result); + Result->takeName(Cmp); + return replaceInstUsesWith(Zext, Result); } } } @@ -1172,8 +1173,8 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { } } - if (ICmpInst *ICI = dyn_cast<ICmpInst>(Src)) - return transformZExtICmp(ICI, CI); + if (ICmpInst *Cmp = dyn_cast<ICmpInst>(Src)) + return transformZExtICmp(Cmp, CI); BinaryOperator *SrcI = dyn_cast<BinaryOperator>(Src); if (SrcI && SrcI->getOpcode() == Instruction::Or) { @@ -1188,7 +1189,9 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { // zext (or icmp, icmp) -> or (zext icmp), (zext icmp) Value *LCast = Builder.CreateZExt(LHS, CI.getType(), LHS->getName()); Value *RCast = Builder.CreateZExt(RHS, CI.getType(), RHS->getName()); - BinaryOperator *Or = BinaryOperator::Create(Instruction::Or, LCast, RCast); + Value *Or = Builder.CreateOr(LCast, RCast, CI.getName()); + if (auto *OrInst = dyn_cast<Instruction>(Or)) + Builder.SetInsertPoint(OrInst); // Perform the elimination. if (auto *LZExt = dyn_cast<ZExtInst>(LCast)) @@ -1196,7 +1199,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { if (auto *RZExt = dyn_cast<ZExtInst>(RCast)) transformZExtICmp(RHS, *RZExt); - return Or; + return replaceInstUsesWith(CI, Or); } } @@ -1531,16 +1534,16 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) { // what we can and cannot do safely varies from operation to operation, and // is explained below in the various case statements. 
Type *Ty = FPT.getType(); - BinaryOperator *OpI = dyn_cast<BinaryOperator>(FPT.getOperand(0)); - if (OpI && OpI->hasOneUse()) { - Type *LHSMinType = getMinimumFPType(OpI->getOperand(0)); - Type *RHSMinType = getMinimumFPType(OpI->getOperand(1)); - unsigned OpWidth = OpI->getType()->getFPMantissaWidth(); + auto *BO = dyn_cast<BinaryOperator>(FPT.getOperand(0)); + if (BO && BO->hasOneUse()) { + Type *LHSMinType = getMinimumFPType(BO->getOperand(0)); + Type *RHSMinType = getMinimumFPType(BO->getOperand(1)); + unsigned OpWidth = BO->getType()->getFPMantissaWidth(); unsigned LHSWidth = LHSMinType->getFPMantissaWidth(); unsigned RHSWidth = RHSMinType->getFPMantissaWidth(); unsigned SrcWidth = std::max(LHSWidth, RHSWidth); unsigned DstWidth = Ty->getFPMantissaWidth(); - switch (OpI->getOpcode()) { + switch (BO->getOpcode()) { default: break; case Instruction::FAdd: case Instruction::FSub: @@ -1563,10 +1566,10 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) { // could be tightened for those cases, but they are rare (the main // case of interest here is (float)((double)float + float)). if (OpWidth >= 2*DstWidth+1 && DstWidth >= SrcWidth) { - Value *LHS = Builder.CreateFPTrunc(OpI->getOperand(0), Ty); - Value *RHS = Builder.CreateFPTrunc(OpI->getOperand(1), Ty); - Instruction *RI = BinaryOperator::Create(OpI->getOpcode(), LHS, RHS); - RI->copyFastMathFlags(OpI); + Value *LHS = Builder.CreateFPTrunc(BO->getOperand(0), Ty); + Value *RHS = Builder.CreateFPTrunc(BO->getOperand(1), Ty); + Instruction *RI = BinaryOperator::Create(BO->getOpcode(), LHS, RHS); + RI->copyFastMathFlags(BO); return RI; } break; @@ -1577,9 +1580,9 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) { // rounding can possibly occur; we can safely perform the operation // in the destination format if it can represent both sources. if (OpWidth >= LHSWidth + RHSWidth && DstWidth >= SrcWidth) { - Value *LHS = Builder.CreateFPTrunc(OpI->getOperand(0), Ty); - Value *RHS = Builder.CreateFPTrunc(OpI->getOperand(1), Ty); - return BinaryOperator::CreateFMulFMF(LHS, RHS, OpI); + Value *LHS = Builder.CreateFPTrunc(BO->getOperand(0), Ty); + Value *RHS = Builder.CreateFPTrunc(BO->getOperand(1), Ty); + return BinaryOperator::CreateFMulFMF(LHS, RHS, BO); } break; case Instruction::FDiv: @@ -1590,9 +1593,9 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) { // condition used here is a good conservative first pass. // TODO: Tighten bound via rigorous analysis of the unbalanced case. 
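// Illustrative aside (not part of the patch): the headline FDiv case narrows
// (float)((double)x / (double)y) to x / y because double's 53-bit mantissa
// satisfies OpWidth >= 2*DstWidth (53 >= 48). A randomized spot-check, not a
// proof; assumes plain IEEE float/double evaluation (FLT_EVAL_METHOD == 0).
#include <assert.h>
#include <stdlib.h>
int main(void) {
  srand(1);
  for (int i = 0; i < 100000; ++i) {
    float x = (float)(rand() % 100000) / 7.0f;
    float y = (float)(rand() % 99999 + 1) / 3.0f; // keep y nonzero
    float narrowed = (float)((double)x / (double)y);
    float direct = x / y;
    assert(narrowed == direct);
  }
  return 0;
}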
if (OpWidth >= 2*DstWidth && DstWidth >= SrcWidth) { - Value *LHS = Builder.CreateFPTrunc(OpI->getOperand(0), Ty); - Value *RHS = Builder.CreateFPTrunc(OpI->getOperand(1), Ty); - return BinaryOperator::CreateFDivFMF(LHS, RHS, OpI); + Value *LHS = Builder.CreateFPTrunc(BO->getOperand(0), Ty); + Value *RHS = Builder.CreateFPTrunc(BO->getOperand(1), Ty); + return BinaryOperator::CreateFDivFMF(LHS, RHS, BO); } break; case Instruction::FRem: { @@ -1604,14 +1607,14 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) { break; Value *LHS, *RHS; if (LHSWidth == SrcWidth) { - LHS = Builder.CreateFPTrunc(OpI->getOperand(0), LHSMinType); - RHS = Builder.CreateFPTrunc(OpI->getOperand(1), LHSMinType); + LHS = Builder.CreateFPTrunc(BO->getOperand(0), LHSMinType); + RHS = Builder.CreateFPTrunc(BO->getOperand(1), LHSMinType); } else { - LHS = Builder.CreateFPTrunc(OpI->getOperand(0), RHSMinType); - RHS = Builder.CreateFPTrunc(OpI->getOperand(1), RHSMinType); + LHS = Builder.CreateFPTrunc(BO->getOperand(0), RHSMinType); + RHS = Builder.CreateFPTrunc(BO->getOperand(1), RHSMinType); } - Value *ExactResult = Builder.CreateFRemFMF(LHS, RHS, OpI); + Value *ExactResult = Builder.CreateFRemFMF(LHS, RHS, BO); return CastInst::CreateFPCast(ExactResult, Ty); } } @@ -1621,6 +1624,11 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) { Value *X; Instruction *Op = dyn_cast<Instruction>(FPT.getOperand(0)); if (Op && Op->hasOneUse()) { + // FIXME: The FMF should propagate from the fptrunc, not the source op. + IRBuilder<>::FastMathFlagGuard FMFG(Builder); + if (isa<FPMathOperator>(Op)) + Builder.setFastMathFlags(Op->getFastMathFlags()); + if (match(Op, m_FNeg(m_Value(X)))) { Value *InnerTrunc = Builder.CreateFPTrunc(X, Ty); @@ -1630,6 +1638,24 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) { return BinaryOperator::CreateFNegFMF(InnerTrunc, Op); return UnaryOperator::CreateFNegFMF(InnerTrunc, Op); } + + // If we are truncating a select that has an extended operand, we can + // narrow the other operand and do the select as a narrow op. + Value *Cond, *X, *Y; + if (match(Op, m_Select(m_Value(Cond), m_FPExt(m_Value(X)), m_Value(Y))) && + X->getType() == Ty) { + // fptrunc (select Cond, (fpext X), Y --> select Cond, X, (fptrunc Y) + Value *NarrowY = Builder.CreateFPTrunc(Y, Ty); + Value *Sel = Builder.CreateSelect(Cond, X, NarrowY, "narrow.sel", Op); + return replaceInstUsesWith(FPT, Sel); + } + if (match(Op, m_Select(m_Value(Cond), m_Value(Y), m_FPExt(m_Value(X)))) && + X->getType() == Ty) { + // fptrunc (select Cond, Y, (fpext X) --> select Cond, (fptrunc Y), X + Value *NarrowY = Builder.CreateFPTrunc(Y, Ty); + Value *Sel = Builder.CreateSelect(Cond, NarrowY, X, "narrow.sel", Op); + return replaceInstUsesWith(FPT, Sel); + } } if (auto *II = dyn_cast<IntrinsicInst>(FPT.getOperand(0))) { @@ -1808,7 +1834,7 @@ Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) { Type *Ty = CI.getType(); unsigned AS = CI.getPointerAddressSpace(); - if (Ty->getScalarSizeInBits() == DL.getIndexSizeInBits(AS)) + if (Ty->getScalarSizeInBits() == DL.getPointerSizeInBits(AS)) return commonPointerCastTransforms(CI); Type *PtrTy = DL.getIntPtrType(CI.getContext(), AS); @@ -1820,12 +1846,24 @@ Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) { } /// This input value (which is known to have vector type) is being zero extended -/// or truncated to the specified vector type. +/// or truncated to the specified vector type. 
Since the zext/trunc is done +/// using an integer type, we have a (bitcast(cast(bitcast))) pattern, +/// endianness will impact which end of the vector that is extended or +/// truncated. +/// +/// A vector is always stored with index 0 at the lowest address, which +/// corresponds to the most significant bits for a big endian stored integer and +/// the least significant bits for little endian. A trunc/zext of an integer +/// impacts the big end of the integer. Thus, we need to add/remove elements at +/// the front of the vector for big endian targets, and the back of the vector +/// for little endian targets. +/// /// Try to replace it with a shuffle (and vector/vector bitcast) if possible. /// /// The source and destination vector types may have different element types. -static Instruction *optimizeVectorResize(Value *InVal, VectorType *DestTy, - InstCombiner &IC) { +static Instruction *optimizeVectorResizeWithIntegerBitCasts(Value *InVal, + VectorType *DestTy, + InstCombiner &IC) { // We can only do this optimization if the output is a multiple of the input // element size, or the input is a multiple of the output element size. // Convert the input type to have the same element type as the output. @@ -1844,31 +1882,53 @@ static Instruction *optimizeVectorResize(Value *InVal, VectorType *DestTy, InVal = IC.Builder.CreateBitCast(InVal, SrcTy); } + bool IsBigEndian = IC.getDataLayout().isBigEndian(); + unsigned SrcElts = SrcTy->getNumElements(); + unsigned DestElts = DestTy->getNumElements(); + + assert(SrcElts != DestElts && "Element counts should be different."); + // Now that the element types match, get the shuffle mask and RHS of the // shuffle to use, which depends on whether we're increasing or decreasing the // size of the input. - SmallVector<uint32_t, 16> ShuffleMask; + SmallVector<uint32_t, 16> ShuffleMaskStorage; + ArrayRef<uint32_t> ShuffleMask; Value *V2; - if (SrcTy->getNumElements() > DestTy->getNumElements()) { - // If we're shrinking the number of elements, just shuffle in the low - // elements from the input and use undef as the second shuffle input. - V2 = UndefValue::get(SrcTy); - for (unsigned i = 0, e = DestTy->getNumElements(); i != e; ++i) - ShuffleMask.push_back(i); + // Produce an identify shuffle mask for the src vector. + ShuffleMaskStorage.resize(SrcElts); + std::iota(ShuffleMaskStorage.begin(), ShuffleMaskStorage.end(), 0); + if (SrcElts > DestElts) { + // If we're shrinking the number of elements (rewriting an integer + // truncate), just shuffle in the elements corresponding to the least + // significant bits from the input and use undef as the second shuffle + // input. + V2 = UndefValue::get(SrcTy); + // Make sure the shuffle mask selects the "least significant bits" by + // keeping elements from back of the src vector for big endian, and from the + // front for little endian. + ShuffleMask = ShuffleMaskStorage; + if (IsBigEndian) + ShuffleMask = ShuffleMask.take_back(DestElts); + else + ShuffleMask = ShuffleMask.take_front(DestElts); } else { - // If we're increasing the number of elements, shuffle in all of the - // elements from InVal and fill the rest of the result elements with zeros - // from a constant zero. + // If we're increasing the number of elements (rewriting an integer zext), + // shuffle in all of the elements from InVal. Fill the rest of the result + // elements with zeros from a constant zero. 
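// Illustrative aside (not part of the patch): on a little-endian host, a zext
// keeps the original bytes at the low addresses (the front of the vector) and
// appends the zero bytes at the back, which is why the null elements are added
// at the end of the shuffle mask there, and at the front for big endian.
#include <assert.h>
#include <stdint.h>
#include <string.h>
int main(void) {
  uint32_t narrow = 0x11223344u;
  uint64_t wide = narrow; // zext i32 -> i64
  unsigned char n[4], w[8];
  memcpy(n, &narrow, sizeof(n));
  memcpy(w, &wide, sizeof(w));
  uint16_t probe = 1;
  if (*(unsigned char *)&probe == 1) { // little-endian host only
    assert(memcmp(w, n, 4) == 0);                             // front: original
    assert(w[4] == 0 && w[5] == 0 && w[6] == 0 && w[7] == 0); // back: zeros
  }
  return 0;
}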
V2 = Constant::getNullValue(SrcTy); - unsigned SrcElts = SrcTy->getNumElements(); - for (unsigned i = 0, e = SrcElts; i != e; ++i) - ShuffleMask.push_back(i); - - // The excess elements reference the first element of the zero input. - for (unsigned i = 0, e = DestTy->getNumElements()-SrcElts; i != e; ++i) - ShuffleMask.push_back(SrcElts); + // Use first elt from V2 when indicating zero in the shuffle mask. + uint32_t NullElt = SrcElts; + // Extend with null values in the "most significant bits" by adding elements + // in front of the src vector for big endian, and at the back for little + // endian. + unsigned DeltaElts = DestElts - SrcElts; + if (IsBigEndian) + ShuffleMaskStorage.insert(ShuffleMaskStorage.begin(), DeltaElts, NullElt); + else + ShuffleMaskStorage.append(DeltaElts, NullElt); + ShuffleMask = ShuffleMaskStorage; } return new ShuffleVectorInst(InVal, V2, @@ -2217,6 +2277,31 @@ Instruction *InstCombiner::optimizeBitCastFromPhi(CastInst &CI, PHINode *PN) { } } + // Check that each user of each old PHI node is something that we can + // rewrite, so that all of the old PHI nodes can be cleaned up afterwards. + for (auto *OldPN : OldPhiNodes) { + for (User *V : OldPN->users()) { + if (auto *SI = dyn_cast<StoreInst>(V)) { + if (!SI->isSimple() || SI->getOperand(0) != OldPN) + return nullptr; + } else if (auto *BCI = dyn_cast<BitCastInst>(V)) { + // Verify it's a B->A cast. + Type *TyB = BCI->getOperand(0)->getType(); + Type *TyA = BCI->getType(); + if (TyA != DestTy || TyB != SrcTy) + return nullptr; + } else if (auto *PHI = dyn_cast<PHINode>(V)) { + // As long as the user is another old PHI node, then even if we don't + // rewrite it, the PHI web we're considering won't have any users + // outside itself, so it'll be dead. + if (OldPhiNodes.count(PHI) == 0) + return nullptr; + } else { + return nullptr; + } + } + } + // For each old PHI node, create a corresponding new PHI node with a type A. SmallDenseMap<PHINode *, PHINode *> NewPNodes; for (auto *OldPN : OldPhiNodes) { @@ -2234,9 +2319,14 @@ Instruction *InstCombiner::optimizeBitCastFromPhi(CastInst &CI, PHINode *PN) { if (auto *C = dyn_cast<Constant>(V)) { NewV = ConstantExpr::getBitCast(C, DestTy); } else if (auto *LI = dyn_cast<LoadInst>(V)) { - Builder.SetInsertPoint(LI->getNextNode()); - NewV = Builder.CreateBitCast(LI, DestTy); - Worklist.Add(LI); + // Explicitly perform load combine to make sure no opposing transform + // can remove the bitcast in the meantime and trigger an infinite loop. + Builder.SetInsertPoint(LI); + NewV = combineLoadToNewType(*LI, DestTy); + // Remove the old load and its use in the old phi, which itself becomes + // dead once the whole transform finishes. + replaceInstUsesWith(*LI, UndefValue::get(LI->getType())); + eraseInstFromFunction(*LI); } else if (auto *BCI = dyn_cast<BitCastInst>(V)) { NewV = BCI->getOperand(0); } else if (auto *PrevPN = dyn_cast<PHINode>(V)) { @@ -2259,26 +2349,33 @@ Instruction *InstCombiner::optimizeBitCastFromPhi(CastInst &CI, PHINode *PN) { Instruction *RetVal = nullptr; for (auto *OldPN : OldPhiNodes) { PHINode *NewPN = NewPNodes[OldPN]; - for (User *V : OldPN->users()) { + for (auto It = OldPN->user_begin(), End = OldPN->user_end(); It != End; ) { + User *V = *It; + // We may remove this user, advance to avoid iterator invalidation. 
+ ++It; if (auto *SI = dyn_cast<StoreInst>(V)) { - if (SI->isSimple() && SI->getOperand(0) == OldPN) { - Builder.SetInsertPoint(SI); - auto *NewBC = - cast<BitCastInst>(Builder.CreateBitCast(NewPN, SrcTy)); - SI->setOperand(0, NewBC); - Worklist.Add(SI); - assert(hasStoreUsersOnly(*NewBC)); - } + assert(SI->isSimple() && SI->getOperand(0) == OldPN); + Builder.SetInsertPoint(SI); + auto *NewBC = + cast<BitCastInst>(Builder.CreateBitCast(NewPN, SrcTy)); + SI->setOperand(0, NewBC); + Worklist.Add(SI); + assert(hasStoreUsersOnly(*NewBC)); } else if (auto *BCI = dyn_cast<BitCastInst>(V)) { - // Verify it's a B->A cast. Type *TyB = BCI->getOperand(0)->getType(); Type *TyA = BCI->getType(); - if (TyA == DestTy && TyB == SrcTy) { - Instruction *I = replaceInstUsesWith(*BCI, NewPN); - if (BCI == &CI) - RetVal = I; - } + assert(TyA == DestTy && TyB == SrcTy); + (void) TyA; + (void) TyB; + Instruction *I = replaceInstUsesWith(*BCI, NewPN); + if (BCI == &CI) + RetVal = I; + } else if (auto *PHI = dyn_cast<PHINode>(V)) { + assert(OldPhiNodes.count(PHI) > 0); + (void) PHI; + } else { + llvm_unreachable("all uses should be handled"); } } } @@ -2338,8 +2435,23 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { // If we found a path from the src to dest, create the getelementptr now. if (SrcElTy == DstElTy) { SmallVector<Value *, 8> Idxs(NumZeros + 1, Builder.getInt32(0)); - return GetElementPtrInst::CreateInBounds(SrcPTy->getElementType(), Src, - Idxs); + GetElementPtrInst *GEP = + GetElementPtrInst::Create(SrcPTy->getElementType(), Src, Idxs); + + // If the source pointer is dereferenceable, then assume it points to an + // allocated object and apply "inbounds" to the GEP. + bool CanBeNull; + if (Src->getPointerDereferenceableBytes(DL, CanBeNull)) { + // In a non-default address space (not 0), a null pointer can not be + // assumed inbounds, so ignore that case (dereferenceable_or_null). + // The reason is that 'null' is not treated differently in these address + // spaces, and we consequently ignore the 'gep inbounds' special case + // for 'null' which allows 'inbounds' on 'null' if the indices are + // zeros. + if (SrcPTy->getAddressSpace() == 0 || !CanBeNull) + GEP->setIsInBounds(); + } + return GEP; } } @@ -2359,8 +2471,8 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { CastInst *SrcCast = cast<CastInst>(Src); if (BitCastInst *BCIn = dyn_cast<BitCastInst>(SrcCast->getOperand(0))) if (isa<VectorType>(BCIn->getOperand(0)->getType())) - if (Instruction *I = optimizeVectorResize(BCIn->getOperand(0), - cast<VectorType>(DestTy), *this)) + if (Instruction *I = optimizeVectorResizeWithIntegerBitCasts( + BCIn->getOperand(0), cast<VectorType>(DestTy), *this)) return I; } @@ -2391,28 +2503,47 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { } } - if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(Src)) { + if (auto *Shuf = dyn_cast<ShuffleVectorInst>(Src)) { // Okay, we have (bitcast (shuffle ..)). Check to see if this is // a bitcast to a vector with the same # elts. 
- if (SVI->hasOneUse() && DestTy->isVectorTy() && - DestTy->getVectorNumElements() == SVI->getType()->getNumElements() && - SVI->getType()->getNumElements() == - SVI->getOperand(0)->getType()->getVectorNumElements()) { + Value *ShufOp0 = Shuf->getOperand(0); + Value *ShufOp1 = Shuf->getOperand(1); + unsigned NumShufElts = Shuf->getType()->getVectorNumElements(); + unsigned NumSrcVecElts = ShufOp0->getType()->getVectorNumElements(); + if (Shuf->hasOneUse() && DestTy->isVectorTy() && + DestTy->getVectorNumElements() == NumShufElts && + NumShufElts == NumSrcVecElts) { BitCastInst *Tmp; // If either of the operands is a cast from CI.getType(), then // evaluating the shuffle in the casted destination's type will allow // us to eliminate at least one cast. - if (((Tmp = dyn_cast<BitCastInst>(SVI->getOperand(0))) && + if (((Tmp = dyn_cast<BitCastInst>(ShufOp0)) && Tmp->getOperand(0)->getType() == DestTy) || - ((Tmp = dyn_cast<BitCastInst>(SVI->getOperand(1))) && + ((Tmp = dyn_cast<BitCastInst>(ShufOp1)) && Tmp->getOperand(0)->getType() == DestTy)) { - Value *LHS = Builder.CreateBitCast(SVI->getOperand(0), DestTy); - Value *RHS = Builder.CreateBitCast(SVI->getOperand(1), DestTy); + Value *LHS = Builder.CreateBitCast(ShufOp0, DestTy); + Value *RHS = Builder.CreateBitCast(ShufOp1, DestTy); // Return a new shuffle vector. Use the same element ID's, as we // know the vector types match #elts. - return new ShuffleVectorInst(LHS, RHS, SVI->getOperand(2)); + return new ShuffleVectorInst(LHS, RHS, Shuf->getOperand(2)); } } + + // A bitcasted-to-scalar and byte-reversing shuffle is better recognized as + // a byte-swap: + // bitcast <N x i8> (shuf X, undef, <N, N-1,...0>) --> bswap (bitcast X) + // TODO: We should match the related pattern for bitreverse. + if (DestTy->isIntegerTy() && + DL.isLegalInteger(DestTy->getScalarSizeInBits()) && + SrcTy->getScalarSizeInBits() == 8 && NumShufElts % 2 == 0 && + Shuf->hasOneUse() && Shuf->isReverse()) { + assert(ShufOp0->getType() == SrcTy && "Unexpected shuffle mask"); + assert(isa<UndefValue>(ShufOp1) && "Unexpected shuffle op"); + Function *Bswap = + Intrinsic::getDeclaration(CI.getModule(), Intrinsic::bswap, DestTy); + Value *ScalarX = Builder.CreateBitCast(ShufOp0, DestTy); + return IntrinsicInst::Create(Bswap, { ScalarX }); + } } // Handle the A->B->A cast, and there is an intervening PHI node. diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 0ece3299754e..f38dc436722d 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -69,34 +69,6 @@ static bool hasBranchUse(ICmpInst &I) { return false; } -/// Given an exploded icmp instruction, return true if the comparison only -/// checks the sign bit. If it only checks the sign bit, set TrueIfSigned if the -/// result of the comparison is true when the input value is signed. 
-static bool isSignBitCheck(ICmpInst::Predicate Pred, const APInt &RHS, - bool &TrueIfSigned) { - switch (Pred) { - case ICmpInst::ICMP_SLT: // True if LHS s< 0 - TrueIfSigned = true; - return RHS.isNullValue(); - case ICmpInst::ICMP_SLE: // True if LHS s<= RHS and RHS == -1 - TrueIfSigned = true; - return RHS.isAllOnesValue(); - case ICmpInst::ICMP_SGT: // True if LHS s> -1 - TrueIfSigned = false; - return RHS.isAllOnesValue(); - case ICmpInst::ICMP_UGT: - // True if LHS u> RHS and RHS == high-bit-mask - 1 - TrueIfSigned = true; - return RHS.isMaxSignedValue(); - case ICmpInst::ICMP_UGE: - // True if LHS u>= RHS and RHS == high-bit-mask (2^7, 2^15, 2^31, etc) - TrueIfSigned = true; - return RHS.isSignMask(); - default: - return false; - } -} - /// Returns true if the exploded icmp can be expressed as a signed comparison /// to zero and updates the predicate accordingly. /// The signedness of the comparison is preserved. @@ -900,6 +872,37 @@ Instruction *InstCombiner::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, Offset = EmitGEPOffset(GEPLHS); return new ICmpInst(ICmpInst::getSignedPredicate(Cond), Offset, Constant::getNullValue(Offset->getType())); + } + + if (GEPLHS->isInBounds() && ICmpInst::isEquality(Cond) && + isa<Constant>(RHS) && cast<Constant>(RHS)->isNullValue() && + !NullPointerIsDefined(I.getFunction(), + RHS->getType()->getPointerAddressSpace())) { + // For most address spaces, an allocation can't be placed at null, but null + // itself is treated as a 0 size allocation in the in bounds rules. Thus, + // the only valid inbounds address derived from null, is null itself. + // Thus, we have four cases to consider: + // 1) Base == nullptr, Offset == 0 -> inbounds, null + // 2) Base == nullptr, Offset != 0 -> poison as the result is out of bounds + // 3) Base != nullptr, Offset == (-base) -> poison (crossing allocations) + // 4) Base != nullptr, Offset != (-base) -> nonnull (and possibly poison) + // + // (Note if we're indexing a type of size 0, that simply collapses into one + // of the buckets above.) + // + // In general, we're allowed to make values less poison (i.e. remove + // sources of full UB), so in this case, we just select between the two + // non-poison cases (1 and 4 above). + // + // For vectors, we apply the same reasoning on a per-lane basis. + auto *Base = GEPLHS->getPointerOperand(); + if (GEPLHS->getType()->isVectorTy() && Base->getType()->isPointerTy()) { + int NumElts = GEPLHS->getType()->getVectorNumElements(); + Base = Builder.CreateVectorSplat(NumElts, Base); + } + return new ICmpInst(Cond, Base, + ConstantExpr::getPointerBitCastOrAddrSpaceCast( + cast<Constant>(RHS), Base->getType())); } else if (GEPOperator *GEPRHS = dyn_cast<GEPOperator>(RHS)) { // If the base pointers are different, but the indices are the same, just // compare the base pointer. @@ -957,12 +960,14 @@ Instruction *InstCombiner::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, } // If one of the GEPs has all zero indices, recurse. - if (GEPLHS->hasAllZeroIndices()) + // FIXME: Handle vector of pointers. + if (!GEPLHS->getType()->isVectorTy() && GEPLHS->hasAllZeroIndices()) return foldGEPICmp(GEPRHS, GEPLHS->getOperand(0), ICmpInst::getSwappedPredicate(Cond), I); // If the other GEP has all zero indices, recurse. - if (GEPRHS->hasAllZeroIndices()) + // FIXME: Handle vector of pointers. 
+ if (!GEPRHS->getType()->isVectorTy() && GEPRHS->hasAllZeroIndices()) return foldGEPICmp(GEPLHS, GEPRHS->getOperand(0), Cond, I); bool GEPsInBounds = GEPLHS->isInBounds() && GEPRHS->isInBounds(); @@ -1330,6 +1335,59 @@ static Instruction *processUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B, return ExtractValueInst::Create(Call, 1, "sadd.overflow"); } +/// If we have: +/// icmp eq/ne (urem/srem %x, %y), 0 +/// iff %y is a power-of-two, we can replace this with a bit test: +/// icmp eq/ne (and %x, (add %y, -1)), 0 +Instruction *InstCombiner::foldIRemByPowerOfTwoToBitTest(ICmpInst &I) { + // This fold is only valid for equality predicates. + if (!I.isEquality()) + return nullptr; + ICmpInst::Predicate Pred; + Value *X, *Y, *Zero; + if (!match(&I, m_ICmp(Pred, m_OneUse(m_IRem(m_Value(X), m_Value(Y))), + m_CombineAnd(m_Zero(), m_Value(Zero))))) + return nullptr; + if (!isKnownToBeAPowerOfTwo(Y, /*OrZero*/ true, 0, &I)) + return nullptr; + // This may increase instruction count, we don't enforce that Y is a constant. + Value *Mask = Builder.CreateAdd(Y, Constant::getAllOnesValue(Y->getType())); + Value *Masked = Builder.CreateAnd(X, Mask); + return ICmpInst::Create(Instruction::ICmp, Pred, Masked, Zero); +} + +/// Fold equality-comparison between zero and any (maybe truncated) right-shift +/// by one-less-than-bitwidth into a sign test on the original value. +Instruction *InstCombiner::foldSignBitTest(ICmpInst &I) { + Instruction *Val; + ICmpInst::Predicate Pred; + if (!I.isEquality() || !match(&I, m_ICmp(Pred, m_Instruction(Val), m_Zero()))) + return nullptr; + + Value *X; + Type *XTy; + + Constant *C; + if (match(Val, m_TruncOrSelf(m_Shr(m_Value(X), m_Constant(C))))) { + XTy = X->getType(); + unsigned XBitWidth = XTy->getScalarSizeInBits(); + if (!match(C, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, + APInt(XBitWidth, XBitWidth - 1)))) + return nullptr; + } else if (isa<BinaryOperator>(Val) && + (X = reassociateShiftAmtsOfTwoSameDirectionShifts( + cast<BinaryOperator>(Val), SQ.getWithInstruction(Val), + /*AnalyzeForSignBitExtraction=*/true))) { + XTy = X->getType(); + } else + return nullptr; + + return ICmpInst::Create(Instruction::ICmp, + Pred == ICmpInst::ICMP_EQ ? ICmpInst::ICMP_SGE + : ICmpInst::ICMP_SLT, + X, ConstantInt::getNullValue(XTy)); +} + // Handle icmp pred X, 0 Instruction *InstCombiner::foldICmpWithZero(ICmpInst &Cmp) { CmpInst::Predicate Pred = Cmp.getPredicate(); @@ -1348,6 +1406,9 @@ Instruction *InstCombiner::foldICmpWithZero(ICmpInst &Cmp) { } } + if (Instruction *New = foldIRemByPowerOfTwoToBitTest(Cmp)) + return New; + // Given: // icmp eq/ne (urem %x, %y), 0 // Iff %x has 0 or 1 bits set, and %y has at least 2 bits set, omit 'urem': @@ -2192,6 +2253,44 @@ Instruction *InstCombiner::foldICmpShrConstant(ICmpInst &Cmp, return nullptr; } +Instruction *InstCombiner::foldICmpSRemConstant(ICmpInst &Cmp, + BinaryOperator *SRem, + const APInt &C) { + // Match an 'is positive' or 'is negative' comparison of remainder by a + // constant power-of-2 value: + // (X % pow2C) sgt/slt 0 + const ICmpInst::Predicate Pred = Cmp.getPredicate(); + if (Pred != ICmpInst::ICMP_SGT && Pred != ICmpInst::ICMP_SLT) + return nullptr; + + // TODO: The one-use check is standard because we do not typically want to + // create longer instruction sequences, but this might be a special-case + // because srem is not good for analysis or codegen. 
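// Illustrative aside (not part of the patch): exhaustive i8 check of the
// 'is positive' example given below, (i8 X % 32) s> 0 --> (X & 159) s> 0,
// where 159 is the sign bit (128) or'ed with the low-bit mask (32 - 1).
#include <assert.h>
int main(void) {
  for (int x = -128; x < 128; ++x) {
    int is_positive = (x % 32) > 0;       // (i8 X % 32) s> 0
    int masked = x & 0x9F;                // X & 159: sign bit plus low 5 bits
    int folded = (masked & 0x80) == 0 && masked != 0; // "s> 0" on the i8 result
    assert(is_positive == folded);
  }
  return 0;
}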
+ if (!SRem->hasOneUse()) + return nullptr; + + const APInt *DivisorC; + if (!C.isNullValue() || !match(SRem->getOperand(1), m_Power2(DivisorC))) + return nullptr; + + // Mask off the sign bit and the modulo bits (low-bits). + Type *Ty = SRem->getType(); + APInt SignMask = APInt::getSignMask(Ty->getScalarSizeInBits()); + Constant *MaskC = ConstantInt::get(Ty, SignMask | (*DivisorC - 1)); + Value *And = Builder.CreateAnd(SRem->getOperand(0), MaskC); + + // For 'is positive?' check that the sign-bit is clear and at least 1 masked + // bit is set. Example: + // (i8 X % 32) s> 0 --> (X & 159) s> 0 + if (Pred == ICmpInst::ICMP_SGT) + return new ICmpInst(ICmpInst::ICMP_SGT, And, ConstantInt::getNullValue(Ty)); + + // For 'is negative?' check that the sign-bit is set and at least 1 masked + // bit is set. Example: + // (i16 X % 4) s< 0 --> (X & 32771) u> 32768 + return new ICmpInst(ICmpInst::ICMP_UGT, And, ConstantInt::get(Ty, SignMask)); +} + /// Fold icmp (udiv X, Y), C. Instruction *InstCombiner::foldICmpUDivConstant(ICmpInst &Cmp, BinaryOperator *UDiv, @@ -2400,6 +2499,11 @@ Instruction *InstCombiner::foldICmpSubConstant(ICmpInst &Cmp, const APInt *C2; APInt SubResult; + // icmp eq/ne (sub C, Y), C -> icmp eq/ne Y, 0 + if (match(X, m_APInt(C2)) && *C2 == C && Cmp.isEquality()) + return new ICmpInst(Cmp.getPredicate(), Y, + ConstantInt::get(Y->getType(), 0)); + // (icmp P (sub nuw|nsw C2, Y), C) -> (icmp swap(P) Y, C2-C) if (match(X, m_APInt(C2)) && ((Cmp.isUnsigned() && Sub->hasNoUnsignedWrap()) || @@ -2462,9 +2566,6 @@ Instruction *InstCombiner::foldICmpAddConstant(ICmpInst &Cmp, Type *Ty = Add->getType(); CmpInst::Predicate Pred = Cmp.getPredicate(); - if (!Add->hasOneUse()) - return nullptr; - // If the add does not wrap, we can always adjust the compare by subtracting // the constants. Equality comparisons are handled elsewhere. SGE/SLE/UGE/ULE // are canonicalized to SGT/SLT/UGT/ULT. @@ -2498,6 +2599,9 @@ Instruction *InstCombiner::foldICmpAddConstant(ICmpInst &Cmp, return new ICmpInst(ICmpInst::ICMP_UGE, X, ConstantInt::get(Ty, Lower)); } + if (!Add->hasOneUse()) + return nullptr; + // X+C <u C2 -> (X & -C2) == C // iff C & (C2-1) == 0 // C2 is a power of 2 @@ -2522,20 +2626,49 @@ bool InstCombiner::matchThreeWayIntCompare(SelectInst *SI, Value *&LHS, // TODO: Generalize this to work with other comparison idioms or ensure // they get canonicalized into this form. - // select i1 (a == b), i32 Equal, i32 (select i1 (a < b), i32 Less, i32 - // Greater), where Equal, Less and Greater are placeholders for any three - // constants. - ICmpInst::Predicate PredA, PredB; - if (match(SI->getTrueValue(), m_ConstantInt(Equal)) && - match(SI->getCondition(), m_ICmp(PredA, m_Value(LHS), m_Value(RHS))) && - PredA == ICmpInst::ICMP_EQ && - match(SI->getFalseValue(), - m_Select(m_ICmp(PredB, m_Specific(LHS), m_Specific(RHS)), - m_ConstantInt(Less), m_ConstantInt(Greater))) && - PredB == ICmpInst::ICMP_SLT) { - return true; + // select i1 (a == b), + // i32 Equal, + // i32 (select i1 (a < b), i32 Less, i32 Greater) + // where Equal, Less and Greater are placeholders for any three constants. + ICmpInst::Predicate PredA; + if (!match(SI->getCondition(), m_ICmp(PredA, m_Value(LHS), m_Value(RHS))) || + !ICmpInst::isEquality(PredA)) + return false; + Value *EqualVal = SI->getTrueValue(); + Value *UnequalVal = SI->getFalseValue(); + // We still can get non-canonical predicate here, so canonicalize. 
+ if (PredA == ICmpInst::ICMP_NE) + std::swap(EqualVal, UnequalVal); + if (!match(EqualVal, m_ConstantInt(Equal))) + return false; + ICmpInst::Predicate PredB; + Value *LHS2, *RHS2; + if (!match(UnequalVal, m_Select(m_ICmp(PredB, m_Value(LHS2), m_Value(RHS2)), + m_ConstantInt(Less), m_ConstantInt(Greater)))) + return false; + // We can get predicate mismatch here, so canonicalize if possible: + // First, ensure that 'LHS' match. + if (LHS2 != LHS) { + // x sgt y <--> y slt x + std::swap(LHS2, RHS2); + PredB = ICmpInst::getSwappedPredicate(PredB); + } + if (LHS2 != LHS) + return false; + // We also need to canonicalize 'RHS'. + if (PredB == ICmpInst::ICMP_SGT && isa<Constant>(RHS2)) { + // x sgt C-1 <--> x sge C <--> not(x slt C) + auto FlippedStrictness = + getFlippedStrictnessPredicateAndConstant(PredB, cast<Constant>(RHS2)); + if (!FlippedStrictness) + return false; + assert(FlippedStrictness->first == ICmpInst::ICMP_SGE && "Sanity check"); + RHS2 = FlippedStrictness->second; + // And kind-of perform the result swap. + std::swap(Less, Greater); + PredB = ICmpInst::ICMP_SLT; } - return false; + return PredB == ICmpInst::ICMP_SLT && RHS == RHS2; } Instruction *InstCombiner::foldICmpSelectConstant(ICmpInst &Cmp, @@ -2715,6 +2848,10 @@ Instruction *InstCombiner::foldICmpInstWithConstant(ICmpInst &Cmp) { if (Instruction *I = foldICmpShrConstant(Cmp, BO, *C)) return I; break; + case Instruction::SRem: + if (Instruction *I = foldICmpSRemConstant(Cmp, BO, *C)) + return I; + break; case Instruction::UDiv: if (Instruction *I = foldICmpUDivConstant(Cmp, BO, *C)) return I; @@ -2939,6 +3076,28 @@ Instruction *InstCombiner::foldICmpEqIntrinsicWithConstant(ICmpInst &Cmp, } break; } + + case Intrinsic::uadd_sat: { + // uadd.sat(a, b) == 0 -> (a | b) == 0 + if (C.isNullValue()) { + Value *Or = Builder.CreateOr(II->getArgOperand(0), II->getArgOperand(1)); + return replaceInstUsesWith(Cmp, Builder.CreateICmp( + Cmp.getPredicate(), Or, Constant::getNullValue(Ty))); + + } + break; + } + + case Intrinsic::usub_sat: { + // usub.sat(a, b) == 0 -> a <= b + if (C.isNullValue()) { + ICmpInst::Predicate NewPred = Cmp.getPredicate() == ICmpInst::ICMP_EQ + ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_UGT; + return ICmpInst::Create(Instruction::ICmp, NewPred, + II->getArgOperand(0), II->getArgOperand(1)); + } + break; + } default: break; } @@ -3205,6 +3364,23 @@ static Value *foldICmpWithLowBitMaskedVal(ICmpInst &I, llvm_unreachable("All possible folds are handled."); } + // The mask value may be a vector constant that has undefined elements. But it + // may not be safe to propagate those undefs into the new compare, so replace + // those elements by copying an existing, defined, and safe scalar constant. + Type *OpTy = M->getType(); + auto *VecC = dyn_cast<Constant>(M); + if (OpTy->isVectorTy() && VecC && VecC->containsUndefElement()) { + Constant *SafeReplacementConstant = nullptr; + for (unsigned i = 0, e = OpTy->getVectorNumElements(); i != e; ++i) { + if (!isa<UndefValue>(VecC->getAggregateElement(i))) { + SafeReplacementConstant = VecC->getAggregateElement(i); + break; + } + } + assert(SafeReplacementConstant && "Failed to find undef replacement"); + M = Constant::replaceUndefsWith(VecC, SafeReplacementConstant); + } + return Builder.CreateICmp(DstPred, X, M); } @@ -3288,6 +3464,7 @@ foldICmpWithTruncSignExtendedVal(ICmpInst &I, // we should move shifts to the same hand of 'and', i.e. rewrite as // icmp eq/ne (and (x shift (Q+K)), y), 0 iff (Q+K) u< bitwidth(x) // We are only interested in opposite logical shifts here. 
+// One of the shifts can be truncated. // If we can, we want to end up creating 'lshr' shift. static Value * foldShiftIntoShiftInAnotherHandOfAndInICmp(ICmpInst &I, const SimplifyQuery SQ, @@ -3297,18 +3474,32 @@ foldShiftIntoShiftInAnotherHandOfAndInICmp(ICmpInst &I, const SimplifyQuery SQ, return nullptr; auto m_AnyLogicalShift = m_LogicalShift(m_Value(), m_Value()); - auto m_AnyLShr = m_LShr(m_Value(), m_Value()); - - // Look for an 'and' of two (opposite) logical shifts. - // Pick the single-use shift as XShift. - Instruction *XShift, *YShift; - if (!match(I.getOperand(0), - m_c_And(m_CombineAnd(m_AnyLogicalShift, m_Instruction(XShift)), - m_CombineAnd(m_AnyLogicalShift, m_Instruction(YShift))))) + + // Look for an 'and' of two logical shifts, one of which may be truncated. + // We use m_TruncOrSelf() on the RHS to correctly handle commutative case. + Instruction *XShift, *MaybeTruncation, *YShift; + if (!match( + I.getOperand(0), + m_c_And(m_CombineAnd(m_AnyLogicalShift, m_Instruction(XShift)), + m_CombineAnd(m_TruncOrSelf(m_CombineAnd( + m_AnyLogicalShift, m_Instruction(YShift))), + m_Instruction(MaybeTruncation))))) return nullptr; + // We potentially looked past 'trunc', but only when matching YShift, + // therefore YShift must have the widest type. + Instruction *WidestShift = YShift; + // Therefore XShift must have the shallowest type. + // Or they both have identical types if there was no truncation. + Instruction *NarrowestShift = XShift; + + Type *WidestTy = WidestShift->getType(); + assert(NarrowestShift->getType() == I.getOperand(0)->getType() && + "We did not look past any shifts while matching XShift though."); + bool HadTrunc = WidestTy != I.getOperand(0)->getType(); + // If YShift is a 'lshr', swap the shifts around. - if (match(YShift, m_AnyLShr)) + if (match(YShift, m_LShr(m_Value(), m_Value()))) std::swap(XShift, YShift); // The shifts must be in opposite directions. @@ -3317,44 +3508,181 @@ foldShiftIntoShiftInAnotherHandOfAndInICmp(ICmpInst &I, const SimplifyQuery SQ, return nullptr; // Do not care about same-direction shifts here. Value *X, *XShAmt, *Y, *YShAmt; - match(XShift, m_BinOp(m_Value(X), m_Value(XShAmt))); - match(YShift, m_BinOp(m_Value(Y), m_Value(YShAmt))); + match(XShift, m_BinOp(m_Value(X), m_ZExtOrSelf(m_Value(XShAmt)))); + match(YShift, m_BinOp(m_Value(Y), m_ZExtOrSelf(m_Value(YShAmt)))); // If one of the values being shifted is a constant, then we will end with - // and+icmp, and shift instr will be constant-folded. If they are not, + // and+icmp, and [zext+]shift instrs will be constant-folded. If they are not, // however, we will need to ensure that we won't increase instruction count. if (!isa<Constant>(X) && !isa<Constant>(Y)) { // At least one of the hands of the 'and' should be one-use shift. if (!match(I.getOperand(0), m_c_And(m_OneUse(m_AnyLogicalShift), m_Value()))) return nullptr; + if (HadTrunc) { + // Due to the 'trunc', we will need to widen X. For that either the old + // 'trunc' or the shift amt in the non-truncated shift should be one-use. + if (!MaybeTruncation->hasOneUse() && + !NarrowestShift->getOperand(1)->hasOneUse()) + return nullptr; + } } + // We have two shift amounts from two different shifts. The types of those + // shift amounts may not match. If that's the case let's bailout now. + if (XShAmt->getType() != YShAmt->getType()) + return nullptr; + // Can we fold (XShAmt+YShAmt) ? 
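// Illustrative aside (not part of the patch): a brute-force i8 check of one
// equivalent form of the rewrite described above, using opposite shifts by 2
// and 3 (so Q+K = 5 u< 8 as required).
#include <assert.h>
#include <stdint.h>
int main(void) {
  for (unsigned x = 0; x < 256; ++x) {
    for (unsigned y = 0; y < 256; ++y) {
      int original = ((uint8_t)(x << 2) & (uint8_t)(y >> 3)) != 0; // (x shl 2) & (y lshr 3)
      int rewritten = (((uint8_t)(y >> 5)) & (uint8_t)x) != 0;     // (y lshr 5) & x
      assert(original == rewritten);
    }
  }
  return 0;
}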
- Value *NewShAmt = SimplifyBinOp(Instruction::BinaryOps::Add, XShAmt, YShAmt, - SQ.getWithInstruction(&I)); + auto *NewShAmt = dyn_cast_or_null<Constant>( + SimplifyAddInst(XShAmt, YShAmt, /*isNSW=*/false, + /*isNUW=*/false, SQ.getWithInstruction(&I))); if (!NewShAmt) return nullptr; + NewShAmt = ConstantExpr::getZExtOrBitCast(NewShAmt, WidestTy); + unsigned WidestBitWidth = WidestTy->getScalarSizeInBits(); + // Is the new shift amount smaller than the bit width? // FIXME: could also rely on ConstantRange. - unsigned BitWidth = X->getType()->getScalarSizeInBits(); - if (!match(NewShAmt, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT, - APInt(BitWidth, BitWidth)))) + if (!match(NewShAmt, + m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT, + APInt(WidestBitWidth, WidestBitWidth)))) return nullptr; - // All good, we can do this fold. The shift is the same that was for X. + + // An extra legality check is needed if we had trunc-of-lshr. + if (HadTrunc && match(WidestShift, m_LShr(m_Value(), m_Value()))) { + auto CanFold = [NewShAmt, WidestBitWidth, NarrowestShift, SQ, + WidestShift]() { + // It isn't obvious whether it's worth it to analyze non-constants here. + // Also, let's basically give up on non-splat cases, pessimizing vectors. + // If *any* of these preconditions matches we can perform the fold. + Constant *NewShAmtSplat = NewShAmt->getType()->isVectorTy() + ? NewShAmt->getSplatValue() + : NewShAmt; + // If it's edge-case shift (by 0 or by WidestBitWidth-1) we can fold. + if (NewShAmtSplat && + (NewShAmtSplat->isNullValue() || + NewShAmtSplat->getUniqueInteger() == WidestBitWidth - 1)) + return true; + // We consider *min* leading zeros so a single outlier + // blocks the transform as opposed to allowing it. + if (auto *C = dyn_cast<Constant>(NarrowestShift->getOperand(0))) { + KnownBits Known = computeKnownBits(C, SQ.DL); + unsigned MinLeadZero = Known.countMinLeadingZeros(); + // If the value being shifted has at most lowest bit set we can fold. + unsigned MaxActiveBits = Known.getBitWidth() - MinLeadZero; + if (MaxActiveBits <= 1) + return true; + // Precondition: NewShAmt u<= countLeadingZeros(C) + if (NewShAmtSplat && NewShAmtSplat->getUniqueInteger().ule(MinLeadZero)) + return true; + } + if (auto *C = dyn_cast<Constant>(WidestShift->getOperand(0))) { + KnownBits Known = computeKnownBits(C, SQ.DL); + unsigned MinLeadZero = Known.countMinLeadingZeros(); + // If the value being shifted has at most lowest bit set we can fold. + unsigned MaxActiveBits = Known.getBitWidth() - MinLeadZero; + if (MaxActiveBits <= 1) + return true; + // Precondition: ((WidestBitWidth-1)-NewShAmt) u<= countLeadingZeros(C) + if (NewShAmtSplat) { + APInt AdjNewShAmt = + (WidestBitWidth - 1) - NewShAmtSplat->getUniqueInteger(); + if (AdjNewShAmt.ule(MinLeadZero)) + return true; + } + } + return false; // Can't tell if it's ok. + }; + if (!CanFold()) + return nullptr; + } + + // All good, we can do this fold. + X = Builder.CreateZExt(X, WidestTy); + Y = Builder.CreateZExt(Y, WidestTy); + // The shift is the same that was for X. Value *T0 = XShiftOpcode == Instruction::BinaryOps::LShr ? 
Builder.CreateLShr(X, NewShAmt) : Builder.CreateShl(X, NewShAmt); Value *T1 = Builder.CreateAnd(T0, Y); return Builder.CreateICmp(I.getPredicate(), T1, - Constant::getNullValue(X->getType())); + Constant::getNullValue(WidestTy)); +} + +/// Fold +/// (-1 u/ x) u< y +/// ((x * y) u/ x) != y +/// to +/// @llvm.umul.with.overflow(x, y) plus extraction of overflow bit +/// Note that the comparison is commutative, while inverted (u>=, ==) predicate +/// will mean that we are looking for the opposite answer. +Value *InstCombiner::foldUnsignedMultiplicationOverflowCheck(ICmpInst &I) { + ICmpInst::Predicate Pred; + Value *X, *Y; + Instruction *Mul; + bool NeedNegation; + // Look for: (-1 u/ x) u</u>= y + if (!I.isEquality() && + match(&I, m_c_ICmp(Pred, m_OneUse(m_UDiv(m_AllOnes(), m_Value(X))), + m_Value(Y)))) { + Mul = nullptr; + // Canonicalize as-if y was on RHS. + if (I.getOperand(1) != Y) + Pred = I.getSwappedPredicate(); + + // Are we checking that overflow does not happen, or does happen? + switch (Pred) { + case ICmpInst::Predicate::ICMP_ULT: + NeedNegation = false; + break; // OK + case ICmpInst::Predicate::ICMP_UGE: + NeedNegation = true; + break; // OK + default: + return nullptr; // Wrong predicate. + } + } else // Look for: ((x * y) u/ x) !=/== y + if (I.isEquality() && + match(&I, m_c_ICmp(Pred, m_Value(Y), + m_OneUse(m_UDiv(m_CombineAnd(m_c_Mul(m_Deferred(Y), + m_Value(X)), + m_Instruction(Mul)), + m_Deferred(X)))))) { + NeedNegation = Pred == ICmpInst::Predicate::ICMP_EQ; + } else + return nullptr; + + BuilderTy::InsertPointGuard Guard(Builder); + // If the pattern included (x * y), we'll want to insert new instructions + // right before that original multiplication so that we can replace it. + bool MulHadOtherUses = Mul && !Mul->hasOneUse(); + if (MulHadOtherUses) + Builder.SetInsertPoint(Mul); + + Function *F = Intrinsic::getDeclaration( + I.getModule(), Intrinsic::umul_with_overflow, X->getType()); + CallInst *Call = Builder.CreateCall(F, {X, Y}, "umul"); + + // If the multiplication was used elsewhere, to ensure that we don't leave + // "duplicate" instructions, replace uses of that original multiplication + // with the multiplication result from the with.overflow intrinsic. + if (MulHadOtherUses) + replaceInstUsesWith(*Mul, Builder.CreateExtractValue(Call, 0, "umul.val")); + + Value *Res = Builder.CreateExtractValue(Call, 1, "umul.ov"); + if (NeedNegation) // This technically increases instruction count. + Res = Builder.CreateNot(Res, "umul.not.ov"); + + return Res; } /// Try to fold icmp (binop), X or icmp X, (binop). /// TODO: A large part of this logic is duplicated in InstSimplify's /// simplifyICmpWithBinOp(). We should be able to share that and avoid the code /// duplication. -Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { +Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I, const SimplifyQuery &SQ) { + const SimplifyQuery Q = SQ.getWithInstruction(&I); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); // Special logic for binary operators. @@ -3367,13 +3695,13 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { Value *X; // Convert add-with-unsigned-overflow comparisons into a 'not' with compare. 
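// Illustrative aside (not part of the patch): exhaustive i8 check of the
// add-with-unsigned-overflow rewrite noted above, (a + b) u< a  <=>  ~a u< b.
#include <assert.h>
#include <stdint.h>
int main(void) {
  for (unsigned a = 0; a < 256; ++a) {
    for (unsigned b = 0; b < 256; ++b) {
      int wrapped = (uint8_t)(a + b) < a; // (Op1 + X) u< Op1
      int rewritten = (uint8_t)~a < b;    // ~Op1 u< X
      assert(wrapped == rewritten);
    }
  }
  return 0;
}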
- // (Op1 + X) <u Op1 --> ~Op1 <u X - // Op0 >u (Op0 + X) --> X >u ~Op0 + // (Op1 + X) u</u>= Op1 --> ~Op1 u</u>= X if (match(Op0, m_OneUse(m_c_Add(m_Specific(Op1), m_Value(X)))) && - Pred == ICmpInst::ICMP_ULT) + (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_UGE)) return new ICmpInst(Pred, Builder.CreateNot(Op1), X); + // Op0 u>/u<= (Op0 + X) --> X u>/u<= ~Op0 if (match(Op1, m_OneUse(m_c_Add(m_Specific(Op0), m_Value(X)))) && - Pred == ICmpInst::ICMP_UGT) + (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULE)) return new ICmpInst(Pred, X, Builder.CreateNot(Op0)); bool NoOp0WrapProblem = false, NoOp1WrapProblem = false; @@ -3400,21 +3728,21 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { D = BO1->getOperand(1); } - // icmp (X+Y), X -> icmp Y, 0 for equalities or if there is no overflow. + // icmp (A+B), A -> icmp B, 0 for equalities or if there is no overflow. + // icmp (A+B), B -> icmp A, 0 for equalities or if there is no overflow. if ((A == Op1 || B == Op1) && NoOp0WrapProblem) return new ICmpInst(Pred, A == Op1 ? B : A, Constant::getNullValue(Op1->getType())); - // icmp X, (X+Y) -> icmp 0, Y for equalities or if there is no overflow. + // icmp C, (C+D) -> icmp 0, D for equalities or if there is no overflow. + // icmp D, (C+D) -> icmp 0, C for equalities or if there is no overflow. if ((C == Op0 || D == Op0) && NoOp1WrapProblem) return new ICmpInst(Pred, Constant::getNullValue(Op0->getType()), C == Op0 ? D : C); - // icmp (X+Y), (X+Z) -> icmp Y, Z for equalities or if there is no overflow. + // icmp (A+B), (A+D) -> icmp B, D for equalities or if there is no overflow. if (A && C && (A == C || A == D || B == C || B == D) && NoOp0WrapProblem && - NoOp1WrapProblem && - // Try not to increase register pressure. - BO0->hasOneUse() && BO1->hasOneUse()) { + NoOp1WrapProblem) { // Determine Y and Z in the form icmp (X+Y), (X+Z). 
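// Illustrative aside (not part of the patch): the equality half of the fold
// above holds even when the additions wrap, since adding A is invertible
// modulo 2^N. Exhaustive i8 check that (A+B) == (A+D)  <=>  B == D.
#include <assert.h>
#include <stdint.h>
int main(void) {
  for (unsigned a = 0; a < 256; ++a)
    for (unsigned b = 0; b < 256; ++b)
      for (unsigned d = 0; d < 256; ++d)
        assert(((uint8_t)(a + b) == (uint8_t)(a + d)) == (b == d));
  return 0;
}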
Value *Y, *Z; if (A == C) { @@ -3438,39 +3766,39 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { return new ICmpInst(Pred, Y, Z); } - // icmp slt (X + -1), Y -> icmp sle X, Y + // icmp slt (A + -1), Op1 -> icmp sle A, Op1 if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SLT && match(B, m_AllOnes())) return new ICmpInst(CmpInst::ICMP_SLE, A, Op1); - // icmp sge (X + -1), Y -> icmp sgt X, Y + // icmp sge (A + -1), Op1 -> icmp sgt A, Op1 if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SGE && match(B, m_AllOnes())) return new ICmpInst(CmpInst::ICMP_SGT, A, Op1); - // icmp sle (X + 1), Y -> icmp slt X, Y + // icmp sle (A + 1), Op1 -> icmp slt A, Op1 if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SLE && match(B, m_One())) return new ICmpInst(CmpInst::ICMP_SLT, A, Op1); - // icmp sgt (X + 1), Y -> icmp sge X, Y + // icmp sgt (A + 1), Op1 -> icmp sge A, Op1 if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SGT && match(B, m_One())) return new ICmpInst(CmpInst::ICMP_SGE, A, Op1); - // icmp sgt X, (Y + -1) -> icmp sge X, Y + // icmp sgt Op0, (C + -1) -> icmp sge Op0, C if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGT && match(D, m_AllOnes())) return new ICmpInst(CmpInst::ICMP_SGE, Op0, C); - // icmp sle X, (Y + -1) -> icmp slt X, Y + // icmp sle Op0, (C + -1) -> icmp slt Op0, C if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLE && match(D, m_AllOnes())) return new ICmpInst(CmpInst::ICMP_SLT, Op0, C); - // icmp sge X, (Y + 1) -> icmp sgt X, Y + // icmp sge Op0, (C + 1) -> icmp sgt Op0, C if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGE && match(D, m_One())) return new ICmpInst(CmpInst::ICMP_SGT, Op0, C); - // icmp slt X, (Y + 1) -> icmp sle X, Y + // icmp slt Op0, (C + 1) -> icmp sle Op0, C if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLT && match(D, m_One())) return new ICmpInst(CmpInst::ICMP_SLE, Op0, C); @@ -3478,33 +3806,33 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { // canonicalization from (X -nuw 1) to (X + -1) means that the combinations // wouldn't happen even if they were implemented. // - // icmp ult (X - 1), Y -> icmp ule X, Y - // icmp uge (X - 1), Y -> icmp ugt X, Y - // icmp ugt X, (Y - 1) -> icmp uge X, Y - // icmp ule X, (Y - 1) -> icmp ult X, Y + // icmp ult (A - 1), Op1 -> icmp ule A, Op1 + // icmp uge (A - 1), Op1 -> icmp ugt A, Op1 + // icmp ugt Op0, (C - 1) -> icmp uge Op0, C + // icmp ule Op0, (C - 1) -> icmp ult Op0, C - // icmp ule (X + 1), Y -> icmp ult X, Y + // icmp ule (A + 1), Op0 -> icmp ult A, Op1 if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_ULE && match(B, m_One())) return new ICmpInst(CmpInst::ICMP_ULT, A, Op1); - // icmp ugt (X + 1), Y -> icmp uge X, Y + // icmp ugt (A + 1), Op0 -> icmp uge A, Op1 if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_UGT && match(B, m_One())) return new ICmpInst(CmpInst::ICMP_UGE, A, Op1); - // icmp uge X, (Y + 1) -> icmp ugt X, Y + // icmp uge Op0, (C + 1) -> icmp ugt Op0, C if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_UGE && match(D, m_One())) return new ICmpInst(CmpInst::ICMP_UGT, Op0, C); - // icmp ult X, (Y + 1) -> icmp ule X, Y + // icmp ult Op0, (C + 1) -> icmp ule Op0, C if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_ULT && match(D, m_One())) return new ICmpInst(CmpInst::ICMP_ULE, Op0, C); // if C1 has greater magnitude than C2: - // icmp (X + C1), (Y + C2) -> icmp (X + C3), Y + // icmp (A + C1), (C + C2) -> icmp (A + C3), C // s.t. 
C3 = C1 - C2 // // if C2 has greater magnitude than C1: - // icmp (X + C1), (Y + C2) -> icmp X, (Y + C3) + // icmp (A + C1), (C + C2) -> icmp A, (C + C3) // s.t. C3 = C2 - C1 if (A && C && NoOp0WrapProblem && NoOp1WrapProblem && (BO0->hasOneUse() || BO1->hasOneUse()) && !I.isUnsigned()) @@ -3542,29 +3870,35 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { D = BO1->getOperand(1); } - // icmp (X-Y), X -> icmp 0, Y for equalities or if there is no overflow. + // icmp (A-B), A -> icmp 0, B for equalities or if there is no overflow. if (A == Op1 && NoOp0WrapProblem) return new ICmpInst(Pred, Constant::getNullValue(Op1->getType()), B); - // icmp X, (X-Y) -> icmp Y, 0 for equalities or if there is no overflow. + // icmp C, (C-D) -> icmp D, 0 for equalities or if there is no overflow. if (C == Op0 && NoOp1WrapProblem) return new ICmpInst(Pred, D, Constant::getNullValue(Op0->getType())); - // (A - B) >u A --> A <u B - if (A == Op1 && Pred == ICmpInst::ICMP_UGT) - return new ICmpInst(ICmpInst::ICMP_ULT, A, B); - // C <u (C - D) --> C <u D - if (C == Op0 && Pred == ICmpInst::ICMP_ULT) - return new ICmpInst(ICmpInst::ICMP_ULT, C, D); - - // icmp (Y-X), (Z-X) -> icmp Y, Z for equalities or if there is no overflow. - if (B && D && B == D && NoOp0WrapProblem && NoOp1WrapProblem && - // Try not to increase register pressure. - BO0->hasOneUse() && BO1->hasOneUse()) + // Convert sub-with-unsigned-overflow comparisons into a comparison of args. + // (A - B) u>/u<= A --> B u>/u<= A + if (A == Op1 && (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULE)) + return new ICmpInst(Pred, B, A); + // C u</u>= (C - D) --> C u</u>= D + if (C == Op0 && (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_UGE)) + return new ICmpInst(Pred, C, D); + // (A - B) u>=/u< A --> B u>/u<= A iff B != 0 + if (A == Op1 && (Pred == ICmpInst::ICMP_UGE || Pred == ICmpInst::ICMP_ULT) && + isKnownNonZero(B, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT)) + return new ICmpInst(CmpInst::getFlippedStrictnessPredicate(Pred), B, A); + // C u<=/u> (C - D) --> C u</u>= D iff B != 0 + if (C == Op0 && (Pred == ICmpInst::ICMP_ULE || Pred == ICmpInst::ICMP_UGT) && + isKnownNonZero(D, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT)) + return new ICmpInst(CmpInst::getFlippedStrictnessPredicate(Pred), C, D); + + // icmp (A-B), (C-B) -> icmp A, C for equalities or if there is no overflow. + if (B && D && B == D && NoOp0WrapProblem && NoOp1WrapProblem) return new ICmpInst(Pred, A, C); - // icmp (X-Y), (X-Z) -> icmp Z, Y for equalities or if there is no overflow. - if (A && C && A == C && NoOp0WrapProblem && NoOp1WrapProblem && - // Try not to increase register pressure. - BO0->hasOneUse() && BO1->hasOneUse()) + + // icmp (A-B), (A-D) -> icmp D, B for equalities or if there is no overflow. + if (A && C && A == C && NoOp0WrapProblem && NoOp1WrapProblem) return new ICmpInst(Pred, D, B); // icmp (0-X) < cst --> x > -cst @@ -3699,6 +4033,9 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { } } + if (Value *V = foldUnsignedMultiplicationOverflowCheck(I)) + return replaceInstUsesWith(I, V); + if (Value *V = foldICmpWithLowBitMaskedVal(I, Builder)) return replaceInstUsesWith(I, V); @@ -3975,125 +4312,140 @@ Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) { return nullptr; } -/// Handle icmp (cast x to y), (cast/cst). We only handle extending casts so -/// far. 
-Instruction *InstCombiner::foldICmpWithCastAndCast(ICmpInst &ICmp) { - const CastInst *LHSCI = cast<CastInst>(ICmp.getOperand(0)); - Value *LHSCIOp = LHSCI->getOperand(0); - Type *SrcTy = LHSCIOp->getType(); - Type *DestTy = LHSCI->getType(); - - // Turn icmp (ptrtoint x), (ptrtoint/c) into a compare of the input if the - // integer type is the same size as the pointer type. - const auto& CompatibleSizes = [&](Type* SrcTy, Type* DestTy) -> bool { - if (isa<VectorType>(SrcTy)) { - SrcTy = cast<VectorType>(SrcTy)->getElementType(); - DestTy = cast<VectorType>(DestTy)->getElementType(); - } - return DL.getPointerTypeSizeInBits(SrcTy) == DestTy->getIntegerBitWidth(); - }; - if (LHSCI->getOpcode() == Instruction::PtrToInt && - CompatibleSizes(SrcTy, DestTy)) { - Value *RHSOp = nullptr; - if (auto *RHSC = dyn_cast<PtrToIntOperator>(ICmp.getOperand(1))) { - Value *RHSCIOp = RHSC->getOperand(0); - if (RHSCIOp->getType()->getPointerAddressSpace() == - LHSCIOp->getType()->getPointerAddressSpace()) { - RHSOp = RHSC->getOperand(0); - // If the pointer types don't match, insert a bitcast. - if (LHSCIOp->getType() != RHSOp->getType()) - RHSOp = Builder.CreateBitCast(RHSOp, LHSCIOp->getType()); - } - } else if (auto *RHSC = dyn_cast<Constant>(ICmp.getOperand(1))) { - RHSOp = ConstantExpr::getIntToPtr(RHSC, SrcTy); - } - - if (RHSOp) - return new ICmpInst(ICmp.getPredicate(), LHSCIOp, RHSOp); - } - - // The code below only handles extension cast instructions, so far. - // Enforce this. - if (LHSCI->getOpcode() != Instruction::ZExt && - LHSCI->getOpcode() != Instruction::SExt) +static Instruction *foldICmpWithZextOrSext(ICmpInst &ICmp, + InstCombiner::BuilderTy &Builder) { + assert(isa<CastInst>(ICmp.getOperand(0)) && "Expected cast for operand 0"); + auto *CastOp0 = cast<CastInst>(ICmp.getOperand(0)); + Value *X; + if (!match(CastOp0, m_ZExtOrSExt(m_Value(X)))) return nullptr; - bool isSignedExt = LHSCI->getOpcode() == Instruction::SExt; - bool isSignedCmp = ICmp.isSigned(); - - if (auto *CI = dyn_cast<CastInst>(ICmp.getOperand(1))) { - // Not an extension from the same type? - Value *RHSCIOp = CI->getOperand(0); - if (RHSCIOp->getType() != LHSCIOp->getType()) - return nullptr; - + bool IsSignedExt = CastOp0->getOpcode() == Instruction::SExt; + bool IsSignedCmp = ICmp.isSigned(); + if (auto *CastOp1 = dyn_cast<CastInst>(ICmp.getOperand(1))) { // If the signedness of the two casts doesn't agree (i.e. one is a sext // and the other is a zext), then we can't handle this. - if (CI->getOpcode() != LHSCI->getOpcode()) + // TODO: This is too strict. We can handle some predicates (equality?). + if (CastOp0->getOpcode() != CastOp1->getOpcode()) return nullptr; - // Deal with equality cases early. + // Not an extension from the same type? + Value *Y = CastOp1->getOperand(0); + Type *XTy = X->getType(), *YTy = Y->getType(); + if (XTy != YTy) { + // One of the casts must have one use because we are creating a new cast. + if (!CastOp0->hasOneUse() && !CastOp1->hasOneUse()) + return nullptr; + // Extend the narrower operand to the type of the wider operand. 
+ if (XTy->getScalarSizeInBits() < YTy->getScalarSizeInBits()) + X = Builder.CreateCast(CastOp0->getOpcode(), X, YTy); + else if (YTy->getScalarSizeInBits() < XTy->getScalarSizeInBits()) + Y = Builder.CreateCast(CastOp0->getOpcode(), Y, XTy); + else + return nullptr; + } + + // (zext X) == (zext Y) --> X == Y + // (sext X) == (sext Y) --> X == Y if (ICmp.isEquality()) - return new ICmpInst(ICmp.getPredicate(), LHSCIOp, RHSCIOp); + return new ICmpInst(ICmp.getPredicate(), X, Y); // A signed comparison of sign extended values simplifies into a // signed comparison. - if (isSignedCmp && isSignedExt) - return new ICmpInst(ICmp.getPredicate(), LHSCIOp, RHSCIOp); + if (IsSignedCmp && IsSignedExt) + return new ICmpInst(ICmp.getPredicate(), X, Y); // The other three cases all fold into an unsigned comparison. - return new ICmpInst(ICmp.getUnsignedPredicate(), LHSCIOp, RHSCIOp); + return new ICmpInst(ICmp.getUnsignedPredicate(), X, Y); } - // If we aren't dealing with a constant on the RHS, exit early. + // Below here, we are only folding a compare with constant. auto *C = dyn_cast<Constant>(ICmp.getOperand(1)); if (!C) return nullptr; // Compute the constant that would happen if we truncated to SrcTy then // re-extended to DestTy. + Type *SrcTy = CastOp0->getSrcTy(); + Type *DestTy = CastOp0->getDestTy(); Constant *Res1 = ConstantExpr::getTrunc(C, SrcTy); - Constant *Res2 = ConstantExpr::getCast(LHSCI->getOpcode(), Res1, DestTy); + Constant *Res2 = ConstantExpr::getCast(CastOp0->getOpcode(), Res1, DestTy); // If the re-extended constant didn't change... if (Res2 == C) { - // Deal with equality cases early. if (ICmp.isEquality()) - return new ICmpInst(ICmp.getPredicate(), LHSCIOp, Res1); + return new ICmpInst(ICmp.getPredicate(), X, Res1); // A signed comparison of sign extended values simplifies into a // signed comparison. - if (isSignedExt && isSignedCmp) - return new ICmpInst(ICmp.getPredicate(), LHSCIOp, Res1); + if (IsSignedExt && IsSignedCmp) + return new ICmpInst(ICmp.getPredicate(), X, Res1); // The other three cases all fold into an unsigned comparison. - return new ICmpInst(ICmp.getUnsignedPredicate(), LHSCIOp, Res1); + return new ICmpInst(ICmp.getUnsignedPredicate(), X, Res1); } // The re-extended constant changed, partly changed (in the case of a vector), // or could not be determined to be equal (in the case of a constant // expression), so the constant cannot be represented in the shorter type. - // Consequently, we cannot emit a simple comparison. // All the cases that fold to true or false will have already been handled // by SimplifyICmpInst, so only deal with the tricky case. + if (IsSignedCmp || !IsSignedExt || !isa<ConstantInt>(C)) + return nullptr; - if (isSignedCmp || !isSignedExt || !isa<ConstantInt>(C)) + // Is source op positive? + // icmp ult (sext X), C --> icmp sgt X, -1 + if (ICmp.getPredicate() == ICmpInst::ICMP_ULT) + return new ICmpInst(CmpInst::ICMP_SGT, X, Constant::getAllOnesValue(SrcTy)); + + // Is source op negative? + // icmp ugt (sext X), C --> icmp slt X, 0 + assert(ICmp.getPredicate() == ICmpInst::ICMP_UGT && "ICmp should be folded!"); + return new ICmpInst(CmpInst::ICMP_SLT, X, Constant::getNullValue(SrcTy)); +} + +/// Handle icmp (cast x), (cast or constant). 
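A minimal C++ sketch (hypothetical helper names, illustrative only) of the extension-compare folds handled above: same-kind extensions compare like the narrow values, and an unsigned compare of a sign-extended value against a constant that does not fit the narrow type degenerates into a sign test:

#include <cstdint>

bool eqWidened(uint16_t x, uint16_t y) {
  return (uint32_t)x == (uint32_t)y;   // expected to fold to: x == y
}

bool isNegative(int16_t x) {
  return (uint32_t)x > 100000u;        // expected to fold to: x < 0
}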
+Instruction *InstCombiner::foldICmpWithCastOp(ICmpInst &ICmp) { + auto *CastOp0 = dyn_cast<CastInst>(ICmp.getOperand(0)); + if (!CastOp0) + return nullptr; + if (!isa<Constant>(ICmp.getOperand(1)) && !isa<CastInst>(ICmp.getOperand(1))) return nullptr; - // Evaluate the comparison for LT (we invert for GT below). LE and GE cases - // should have been folded away previously and not enter in here. + Value *Op0Src = CastOp0->getOperand(0); + Type *SrcTy = CastOp0->getSrcTy(); + Type *DestTy = CastOp0->getDestTy(); - // We're performing an unsigned comp with a sign extended value. - // This is true if the input is >= 0. [aka >s -1] - Constant *NegOne = Constant::getAllOnesValue(SrcTy); - Value *Result = Builder.CreateICmpSGT(LHSCIOp, NegOne, ICmp.getName()); + // Turn icmp (ptrtoint x), (ptrtoint/c) into a compare of the input if the + // integer type is the same size as the pointer type. + auto CompatibleSizes = [&](Type *SrcTy, Type *DestTy) { + if (isa<VectorType>(SrcTy)) { + SrcTy = cast<VectorType>(SrcTy)->getElementType(); + DestTy = cast<VectorType>(DestTy)->getElementType(); + } + return DL.getPointerTypeSizeInBits(SrcTy) == DestTy->getIntegerBitWidth(); + }; + if (CastOp0->getOpcode() == Instruction::PtrToInt && + CompatibleSizes(SrcTy, DestTy)) { + Value *NewOp1 = nullptr; + if (auto *PtrToIntOp1 = dyn_cast<PtrToIntOperator>(ICmp.getOperand(1))) { + Value *PtrSrc = PtrToIntOp1->getOperand(0); + if (PtrSrc->getType()->getPointerAddressSpace() == + Op0Src->getType()->getPointerAddressSpace()) { + NewOp1 = PtrToIntOp1->getOperand(0); + // If the pointer types don't match, insert a bitcast. + if (Op0Src->getType() != NewOp1->getType()) + NewOp1 = Builder.CreateBitCast(NewOp1, Op0Src->getType()); + } + } else if (auto *RHSC = dyn_cast<Constant>(ICmp.getOperand(1))) { + NewOp1 = ConstantExpr::getIntToPtr(RHSC, SrcTy); + } - // Finally, return the value computed. - if (ICmp.getPredicate() == ICmpInst::ICMP_ULT) - return replaceInstUsesWith(ICmp, Result); + if (NewOp1) + return new ICmpInst(ICmp.getPredicate(), Op0Src, NewOp1); + } - assert(ICmp.getPredicate() == ICmpInst::ICMP_UGT && "ICmp should be folded!"); - return BinaryOperator::CreateNot(Result); + return foldICmpWithZextOrSext(ICmp, Builder); } static bool isNeutralValue(Instruction::BinaryOps BinaryOp, Value *RHS) { @@ -4595,7 +4947,7 @@ Instruction *InstCombiner::foldICmpUsingKnownBits(ICmpInst &I) { // Get scalar or pointer size. unsigned BitWidth = Ty->isIntOrIntVectorTy() ? Ty->getScalarSizeInBits() - : DL.getIndexTypeSizeInBits(Ty->getScalarType()); + : DL.getPointerTypeSizeInBits(Ty->getScalarType()); if (!BitWidth) return nullptr; @@ -4813,41 +5165,36 @@ Instruction *InstCombiner::foldICmpUsingKnownBits(ICmpInst &I) { return nullptr; } -/// If we have an icmp le or icmp ge instruction with a constant operand, turn -/// it into the appropriate icmp lt or icmp gt instruction. This transform -/// allows them to be folded in visitICmpInst. 
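A short, hypothetical C++ example of the ptrtoint case kept by the rewritten foldICmpWithCastOp above, assuming uintptr_t has the same width as a pointer on the target:

#include <cstdint>

bool samePlace(const int *p, const int *q) {
  return (uintptr_t)p == (uintptr_t)q;   // expected to fold to: p == q
}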
-static ICmpInst *canonicalizeCmpWithConstant(ICmpInst &I) { - ICmpInst::Predicate Pred = I.getPredicate(); - if (Pred != ICmpInst::ICMP_SLE && Pred != ICmpInst::ICMP_SGE && - Pred != ICmpInst::ICMP_ULE && Pred != ICmpInst::ICMP_UGE) - return nullptr; +llvm::Optional<std::pair<CmpInst::Predicate, Constant *>> +llvm::getFlippedStrictnessPredicateAndConstant(CmpInst::Predicate Pred, + Constant *C) { + assert(ICmpInst::isRelational(Pred) && ICmpInst::isIntPredicate(Pred) && + "Only for relational integer predicates."); - Value *Op0 = I.getOperand(0); - Value *Op1 = I.getOperand(1); - auto *Op1C = dyn_cast<Constant>(Op1); - if (!Op1C) - return nullptr; + Type *Type = C->getType(); + bool IsSigned = ICmpInst::isSigned(Pred); + + CmpInst::Predicate UnsignedPred = ICmpInst::getUnsignedPredicate(Pred); + bool WillIncrement = + UnsignedPred == ICmpInst::ICMP_ULE || UnsignedPred == ICmpInst::ICMP_UGT; - // Check if the constant operand can be safely incremented/decremented without - // overflowing/underflowing. For scalars, SimplifyICmpInst has already handled - // the edge cases for us, so we just assert on them. For vectors, we must - // handle the edge cases. - Type *Op1Type = Op1->getType(); - bool IsSigned = I.isSigned(); - bool IsLE = (Pred == ICmpInst::ICMP_SLE || Pred == ICmpInst::ICMP_ULE); - auto *CI = dyn_cast<ConstantInt>(Op1C); - if (CI) { - // A <= MAX -> TRUE ; A >= MIN -> TRUE - assert(IsLE ? !CI->isMaxValue(IsSigned) : !CI->isMinValue(IsSigned)); - } else if (Op1Type->isVectorTy()) { - // TODO? If the edge cases for vectors were guaranteed to be handled as they - // are for scalar, we could remove the min/max checks. However, to do that, - // we would have to use insertelement/shufflevector to replace edge values. - unsigned NumElts = Op1Type->getVectorNumElements(); + // Check if the constant operand can be safely incremented/decremented + // without overflowing/underflowing. + auto ConstantIsOk = [WillIncrement, IsSigned](ConstantInt *C) { + return WillIncrement ? !C->isMaxValue(IsSigned) : !C->isMinValue(IsSigned); + }; + + Constant *SafeReplacementConstant = nullptr; + if (auto *CI = dyn_cast<ConstantInt>(C)) { + // Bail out if the constant can't be safely incremented/decremented. + if (!ConstantIsOk(CI)) + return llvm::None; + } else if (Type->isVectorTy()) { + unsigned NumElts = Type->getVectorNumElements(); for (unsigned i = 0; i != NumElts; ++i) { - Constant *Elt = Op1C->getAggregateElement(i); + Constant *Elt = C->getAggregateElement(i); if (!Elt) - return nullptr; + return llvm::None; if (isa<UndefValue>(Elt)) continue; @@ -4855,20 +5202,54 @@ static ICmpInst *canonicalizeCmpWithConstant(ICmpInst &I) { // Bail out if we can't determine if this constant is min/max or if we // know that this constant is min/max. auto *CI = dyn_cast<ConstantInt>(Elt); - if (!CI || (IsLE ? CI->isMaxValue(IsSigned) : CI->isMinValue(IsSigned))) - return nullptr; + if (!CI || !ConstantIsOk(CI)) + return llvm::None; + + if (!SafeReplacementConstant) + SafeReplacementConstant = CI; } } else { // ConstantExpr? - return nullptr; + return llvm::None; } - // Increment or decrement the constant and set the new comparison predicate: - // ULE -> ULT ; UGE -> UGT ; SLE -> SLT ; SGE -> SGT - Constant *OneOrNegOne = ConstantInt::get(Op1Type, IsLE ? 1 : -1, true); - CmpInst::Predicate NewPred = IsLE ? ICmpInst::ICMP_ULT: ICmpInst::ICMP_UGT; - NewPred = IsSigned ? 
ICmpInst::getSignedPredicate(NewPred) : NewPred; - return new ICmpInst(NewPred, Op0, ConstantExpr::getAdd(Op1C, OneOrNegOne)); + // It may not be safe to change a compare predicate in the presence of + // undefined elements, so replace those elements with the first safe constant + // that we found. + if (C->containsUndefElement()) { + assert(SafeReplacementConstant && "Replacement constant not set"); + C = Constant::replaceUndefsWith(C, SafeReplacementConstant); + } + + CmpInst::Predicate NewPred = CmpInst::getFlippedStrictnessPredicate(Pred); + + // Increment or decrement the constant. + Constant *OneOrNegOne = ConstantInt::get(Type, WillIncrement ? 1 : -1, true); + Constant *NewC = ConstantExpr::getAdd(C, OneOrNegOne); + + return std::make_pair(NewPred, NewC); +} + +/// If we have an icmp le or icmp ge instruction with a constant operand, turn +/// it into the appropriate icmp lt or icmp gt instruction. This transform +/// allows them to be folded in visitICmpInst. +static ICmpInst *canonicalizeCmpWithConstant(ICmpInst &I) { + ICmpInst::Predicate Pred = I.getPredicate(); + if (ICmpInst::isEquality(Pred) || !ICmpInst::isIntPredicate(Pred) || + isCanonicalPredicate(Pred)) + return nullptr; + + Value *Op0 = I.getOperand(0); + Value *Op1 = I.getOperand(1); + auto *Op1C = dyn_cast<Constant>(Op1); + if (!Op1C) + return nullptr; + + auto FlippedStrictness = getFlippedStrictnessPredicateAndConstant(Pred, Op1C); + if (!FlippedStrictness) + return nullptr; + + return new ICmpInst(FlippedStrictness->first, Op0, FlippedStrictness->second); } /// Integer compare with boolean values can always be turned into bitwise ops. @@ -5022,8 +5403,39 @@ static Instruction *foldVectorCmp(CmpInst &Cmp, return nullptr; } +// extract(uadd.with.overflow(A, B), 0) ult A +// -> extract(uadd.with.overflow(A, B), 1) +static Instruction *foldICmpOfUAddOv(ICmpInst &I) { + CmpInst::Predicate Pred = I.getPredicate(); + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + + Value *UAddOv; + Value *A, *B; + auto UAddOvResultPat = m_ExtractValue<0>( + m_Intrinsic<Intrinsic::uadd_with_overflow>(m_Value(A), m_Value(B))); + if (match(Op0, UAddOvResultPat) && + ((Pred == ICmpInst::ICMP_ULT && (Op1 == A || Op1 == B)) || + (Pred == ICmpInst::ICMP_EQ && match(Op1, m_ZeroInt()) && + (match(A, m_One()) || match(B, m_One()))) || + (Pred == ICmpInst::ICMP_NE && match(Op1, m_AllOnes()) && + (match(A, m_AllOnes()) || match(B, m_AllOnes()))))) + // extract(uadd.with.overflow(A, B), 0) < A + // extract(uadd.with.overflow(A, 1), 0) == 0 + // extract(uadd.with.overflow(A, -1), 0) != -1 + UAddOv = cast<ExtractValueInst>(Op0)->getAggregateOperand(); + else if (match(Op1, UAddOvResultPat) && + Pred == ICmpInst::ICMP_UGT && (Op0 == A || Op0 == B)) + // A > extract(uadd.with.overflow(A, B), 0) + UAddOv = cast<ExtractValueInst>(Op1)->getAggregateOperand(); + else + return nullptr; + + return ExtractValueInst::Create(UAddOv, 1); +} + Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { bool Changed = false; + const SimplifyQuery Q = SQ.getWithInstruction(&I); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); unsigned Op0Cplxity = getComplexity(Op0); unsigned Op1Cplxity = getComplexity(Op1); @@ -5038,8 +5450,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { Changed = true; } - if (Value *V = SimplifyICmpInst(I.getPredicate(), Op0, Op1, - SQ.getWithInstruction(&I))) + if (Value *V = SimplifyICmpInst(I.getPredicate(), Op0, Op1, Q)) return replaceInstUsesWith(I, V); // Comparing -val or val with non-zero is the same as just comparing 
val @@ -5072,6 +5483,9 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { if (Instruction *Res = foldICmpWithDominatingICmp(I)) return Res; + if (Instruction *Res = foldICmpBinOp(I, Q)) + return Res; + if (Instruction *Res = foldICmpUsingKnownBits(I)) return Res; @@ -5120,6 +5534,11 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { if (Instruction *Res = foldICmpInstWithConstant(I)) return Res; + // Try to match comparison as a sign bit test. Intentionally do this after + // foldICmpInstWithConstant() to potentially let other folds to happen first. + if (Instruction *New = foldSignBitTest(I)) + return New; + if (Instruction *Res = foldICmpInstWithConstantNotInt(I)) return Res; @@ -5146,20 +5565,8 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { if (Instruction *Res = foldICmpBitCast(I, Builder)) return Res; - if (isa<CastInst>(Op0)) { - // Handle the special case of: icmp (cast bool to X), <cst> - // This comes up when you have code like - // int X = A < B; - // if (X) ... - // For generality, we handle any zero-extension of any operand comparison - // with a constant or another cast from the same type. - if (isa<Constant>(Op1) || isa<CastInst>(Op1)) - if (Instruction *R = foldICmpWithCastAndCast(I)) - return R; - } - - if (Instruction *Res = foldICmpBinOp(I)) - return Res; + if (Instruction *R = foldICmpWithCastOp(I)) + return R; if (Instruction *Res = foldICmpWithMinMax(I)) return Res; @@ -5214,6 +5621,9 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { if (Instruction *Res = foldICmpEquality(I)) return Res; + if (Instruction *Res = foldICmpOfUAddOv(I)) + return Res; + // The 'cmpxchg' instruction returns an aggregate containing the old value and // an i1 which indicates whether or not we successfully did the swap. // diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 434b0d591215..1a746cb87abb 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -113,6 +113,48 @@ static inline bool isCanonicalPredicate(CmpInst::Predicate Pred) { } } +/// Given an exploded icmp instruction, return true if the comparison only +/// checks the sign bit. If it only checks the sign bit, set TrueIfSigned if the +/// result of the comparison is true when the input value is signed. 
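A sketch of the shape foldICmpOfUAddOv targets, assuming a Clang/GCC-style __builtin_add_overflow that lowers to llvm.uadd.with.overflow; the function name is illustrative only:

#include <cstdint>

bool addOverflows(uint32_t a, uint32_t b) {
  uint32_t sum;
  (void)__builtin_add_overflow(a, b, &sum);  // produces the {sum, overflow} pair
  return sum < a;                            // folds to the overflow flag itself
}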
+inline bool isSignBitCheck(ICmpInst::Predicate Pred, const APInt &RHS, + bool &TrueIfSigned) { + switch (Pred) { + case ICmpInst::ICMP_SLT: // True if LHS s< 0 + TrueIfSigned = true; + return RHS.isNullValue(); + case ICmpInst::ICMP_SLE: // True if LHS s<= -1 + TrueIfSigned = true; + return RHS.isAllOnesValue(); + case ICmpInst::ICMP_SGT: // True if LHS s> -1 + TrueIfSigned = false; + return RHS.isAllOnesValue(); + case ICmpInst::ICMP_SGE: // True if LHS s>= 0 + TrueIfSigned = false; + return RHS.isNullValue(); + case ICmpInst::ICMP_UGT: + // True if LHS u> RHS and RHS == sign-bit-mask - 1 + TrueIfSigned = true; + return RHS.isMaxSignedValue(); + case ICmpInst::ICMP_UGE: + // True if LHS u>= RHS and RHS == sign-bit-mask (2^7, 2^15, 2^31, etc) + TrueIfSigned = true; + return RHS.isMinSignedValue(); + case ICmpInst::ICMP_ULT: + // True if LHS u< RHS and RHS == sign-bit-mask (2^7, 2^15, 2^31, etc) + TrueIfSigned = false; + return RHS.isMinSignedValue(); + case ICmpInst::ICMP_ULE: + // True if LHS u<= RHS and RHS == sign-bit-mask - 1 + TrueIfSigned = false; + return RHS.isMaxSignedValue(); + default: + return false; + } +} + +llvm::Optional<std::pair<CmpInst::Predicate, Constant *>> +getFlippedStrictnessPredicateAndConstant(CmpInst::Predicate Pred, Constant *C); + /// Return the source operand of a potentially bitcasted value while optionally /// checking if it has one use. If there is no bitcast or the one use check is /// not met, return the input value itself. @@ -139,31 +181,16 @@ static inline Constant *SubOne(Constant *C) { /// This happens in cases where the ~ can be eliminated. If WillInvertAllUses /// is true, work under the assumption that the caller intends to remove all /// uses of V and only keep uses of ~V. -static inline bool IsFreeToInvert(Value *V, bool WillInvertAllUses) { +/// +/// See also: canFreelyInvertAllUsersOf() +static inline bool isFreeToInvert(Value *V, bool WillInvertAllUses) { // ~(~(X)) -> X. if (match(V, m_Not(m_Value()))) return true; // Constants can be considered to be not'ed values. - if (isa<ConstantInt>(V)) - return true; - - // A vector of constant integers can be inverted easily. - if (V->getType()->isVectorTy() && isa<Constant>(V)) { - unsigned NumElts = V->getType()->getVectorNumElements(); - for (unsigned i = 0; i != NumElts; ++i) { - Constant *Elt = cast<Constant>(V)->getAggregateElement(i); - if (!Elt) - return false; - - if (isa<UndefValue>(Elt)) - continue; - - if (!isa<ConstantInt>(Elt)) - return false; - } + if (match(V, m_AnyIntegralConstant())) return true; - } // Compares can be inverted if all of their uses are being modified to use the // ~V. @@ -185,6 +212,32 @@ static inline bool IsFreeToInvert(Value *V, bool WillInvertAllUses) { return false; } +/// Given i1 V, can every user of V be freely adapted if V is changed to !V ? +/// +/// See also: isFreeToInvert() +static inline bool canFreelyInvertAllUsersOf(Value *V, Value *IgnoredUser) { + // Look at every user of V. + for (User *U : V->users()) { + if (U == IgnoredUser) + continue; // Don't consider this user. + + auto *I = cast<Instruction>(U); + switch (I->getOpcode()) { + case Instruction::Select: + case Instruction::Br: + break; // Free to invert by swapping true/false values/destinations. + case Instruction::Xor: // Can invert 'xor' if it's a 'not', by ignoring it. + if (!match(I, m_Not(m_Value()))) + return false; // Not a 'not'. + break; + default: + return false; // Don't know, likely not freely invertible. + } + // So far all users were free to invert... 
+ } + return true; // Can freely invert all users! +} + /// Some binary operators require special handling to avoid poison and undefined /// behavior. If a constant vector has undef elements, replace those undefs with /// identity constants if possible because those are always safe to execute. @@ -316,7 +369,8 @@ public: Instruction *visitFNeg(UnaryOperator &I); Instruction *visitAdd(BinaryOperator &I); Instruction *visitFAdd(BinaryOperator &I); - Value *OptimizePointerDifference(Value *LHS, Value *RHS, Type *Ty); + Value *OptimizePointerDifference( + Value *LHS, Value *RHS, Type *Ty, bool isNUW); Instruction *visitSub(BinaryOperator &I); Instruction *visitFSub(BinaryOperator &I); Instruction *visitMul(BinaryOperator &I); @@ -337,6 +391,13 @@ public: Instruction *visitOr(BinaryOperator &I); Instruction *visitXor(BinaryOperator &I); Instruction *visitShl(BinaryOperator &I); + Value *reassociateShiftAmtsOfTwoSameDirectionShifts( + BinaryOperator *Sh0, const SimplifyQuery &SQ, + bool AnalyzeForSignBitExtraction = false); + Instruction *canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract( + BinaryOperator &I); + Instruction *foldVariableSignZeroExtensionOfVariableHighBitExtract( + BinaryOperator &OldAShr); Instruction *visitAShr(BinaryOperator &I); Instruction *visitLShr(BinaryOperator &I); Instruction *commonShiftTransforms(BinaryOperator &I); @@ -386,6 +447,7 @@ public: Instruction *visitLandingPadInst(LandingPadInst &LI); Instruction *visitVAStartInst(VAStartInst &I); Instruction *visitVACopyInst(VACopyInst &I); + Instruction *visitFreeze(FreezeInst &I); /// Specify what to return for unhandled instructions. Instruction *visitInstruction(Instruction &I) { return nullptr; } @@ -405,6 +467,9 @@ public: /// \return true if successful. bool replacePointer(Instruction &I, Value *V); + LoadInst *combineLoadToNewType(LoadInst &LI, Type *NewTy, + const Twine &Suffix = ""); + private: bool shouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const; bool shouldChangeType(Type *From, Type *To) const; @@ -541,6 +606,7 @@ private: Instruction *narrowMathIfNoOverflow(BinaryOperator &I); Instruction *narrowRotate(TruncInst &Trunc); Instruction *optimizeBitCastFromPhi(CastInst &CI, PHINode *PN); + Instruction *matchSAddSubSat(SelectInst &MinMax1); /// Determine if a pair of casts can be replaced by a single cast. /// @@ -557,7 +623,7 @@ private: Value *foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction &CxtI); Value *foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction &CxtI); - Value *foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS); + Value *foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS, BinaryOperator &I); /// Optimize (fcmp)&(fcmp) or (fcmp)|(fcmp). /// NOTE: Unlike most of instcombine, this returns a Value which should @@ -644,7 +710,7 @@ public: Instruction *eraseInstFromFunction(Instruction &I) { LLVM_DEBUG(dbgs() << "IC: ERASE " << I << '\n'); assert(I.use_empty() && "Cannot erase instruction that is used!"); - salvageDebugInfo(I); + salvageDebugInfoOrMarkUndef(I); // Make sure that we reprocess all operands now that we reduced their // use counts. @@ -725,7 +791,7 @@ public: Value *LHS, Value *RHS, Instruction *CxtI) const; /// Maximum size of array considered when transforming. 
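Loosely illustrating canFreelyInvertAllUsersOf() above in source-level terms (hypothetical function): a boolean whose only consumers are selects or branches can be replaced by its negation at no cost, because those users simply swap their arms or destinations:

int pickLarger(int a, int b, int x, int y) {
  bool notLess = !(a < b);
  return notLess ? x : y;   // same as (a < b) ? y : x, so the 'not' is free
}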
- uint64_t MaxArraySizeForCombine; + uint64_t MaxArraySizeForCombine = 0; private: /// Performs a few simplifications for operators which are associative @@ -798,7 +864,8 @@ private: int DmaskIdx = -1); Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, - APInt &UndefElts, unsigned Depth = 0); + APInt &UndefElts, unsigned Depth = 0, + bool AllowMultipleUsers = false); /// Canonicalize the position of binops relative to shufflevector. Instruction *foldVectorBinop(BinaryOperator &Inst); @@ -847,17 +914,21 @@ private: Constant *RHSC); Instruction *foldICmpAddOpConst(Value *X, const APInt &C, ICmpInst::Predicate Pred); - Instruction *foldICmpWithCastAndCast(ICmpInst &ICI); + Instruction *foldICmpWithCastOp(ICmpInst &ICI); Instruction *foldICmpUsingKnownBits(ICmpInst &Cmp); Instruction *foldICmpWithDominatingICmp(ICmpInst &Cmp); Instruction *foldICmpWithConstant(ICmpInst &Cmp); Instruction *foldICmpInstWithConstant(ICmpInst &Cmp); Instruction *foldICmpInstWithConstantNotInt(ICmpInst &Cmp); - Instruction *foldICmpBinOp(ICmpInst &Cmp); + Instruction *foldICmpBinOp(ICmpInst &Cmp, const SimplifyQuery &SQ); Instruction *foldICmpEquality(ICmpInst &Cmp); + Instruction *foldIRemByPowerOfTwoToBitTest(ICmpInst &I); + Instruction *foldSignBitTest(ICmpInst &I); Instruction *foldICmpWithZero(ICmpInst &Cmp); + Value *foldUnsignedMultiplicationOverflowCheck(ICmpInst &Cmp); + Instruction *foldICmpSelectConstant(ICmpInst &Cmp, SelectInst *Select, ConstantInt *C); Instruction *foldICmpTruncConstant(ICmpInst &Cmp, TruncInst *Trunc, @@ -874,6 +945,8 @@ private: const APInt &C); Instruction *foldICmpShrConstant(ICmpInst &Cmp, BinaryOperator *Shr, const APInt &C); + Instruction *foldICmpSRemConstant(ICmpInst &Cmp, BinaryOperator *UDiv, + const APInt &C); Instruction *foldICmpUDivConstant(ICmpInst &Cmp, BinaryOperator *UDiv, const APInt &C); Instruction *foldICmpDivConstant(ICmpInst &Cmp, BinaryOperator *Div, diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 054fb7da09a2..74654f7ef51d 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -175,7 +175,7 @@ static bool isDereferenceableForAllocaSize(const Value *V, const AllocaInst *AI, uint64_t AllocaSize = DL.getTypeStoreSize(AI->getAllocatedType()); if (!AllocaSize) return false; - return isDereferenceableAndAlignedPointer(V, AI->getAlignment(), + return isDereferenceableAndAlignedPointer(V, Align(AI->getAlignment()), APInt(64, AllocaSize), DL); } @@ -197,7 +197,7 @@ static Instruction *simplifyAllocaArraySize(InstCombiner &IC, AllocaInst &AI) { if (C->getValue().getActiveBits() <= 64) { Type *NewTy = ArrayType::get(AI.getAllocatedType(), C->getZExtValue()); AllocaInst *New = IC.Builder.CreateAlloca(NewTy, nullptr, AI.getName()); - New->setAlignment(AI.getAlignment()); + New->setAlignment(MaybeAlign(AI.getAlignment())); // Scan to the end of the allocation instructions, to skip over a block of // allocas if possible...also skip interleaved debug info @@ -345,7 +345,8 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { if (AI.getAllocatedType()->isSized()) { // If the alignment is 0 (unspecified), assign it the preferred alignment. 
if (AI.getAlignment() == 0) - AI.setAlignment(DL.getPrefTypeAlignment(AI.getAllocatedType())); + AI.setAlignment( + MaybeAlign(DL.getPrefTypeAlignment(AI.getAllocatedType()))); // Move all alloca's of zero byte objects to the entry block and merge them // together. Note that we only do this for alloca's, because malloc should @@ -377,12 +378,12 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { // assign it the preferred alignment. if (EntryAI->getAlignment() == 0) EntryAI->setAlignment( - DL.getPrefTypeAlignment(EntryAI->getAllocatedType())); + MaybeAlign(DL.getPrefTypeAlignment(EntryAI->getAllocatedType()))); // Replace this zero-sized alloca with the one at the start of the entry // block after ensuring that the address will be aligned enough for both // types. - unsigned MaxAlign = std::max(EntryAI->getAlignment(), - AI.getAlignment()); + const MaybeAlign MaxAlign( + std::max(EntryAI->getAlignment(), AI.getAlignment())); EntryAI->setAlignment(MaxAlign); if (AI.getType() != EntryAI->getType()) return new BitCastInst(EntryAI, AI.getType()); @@ -448,67 +449,30 @@ static bool isSupportedAtomicType(Type *Ty) { /// /// Note that this will create all of the instructions with whatever insert /// point the \c InstCombiner currently is using. -static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewTy, - const Twine &Suffix = "") { +LoadInst *InstCombiner::combineLoadToNewType(LoadInst &LI, Type *NewTy, + const Twine &Suffix) { assert((!LI.isAtomic() || isSupportedAtomicType(NewTy)) && "can't fold an atomic load to requested type"); Value *Ptr = LI.getPointerOperand(); unsigned AS = LI.getPointerAddressSpace(); - SmallVector<std::pair<unsigned, MDNode *>, 8> MD; - LI.getAllMetadata(MD); - Value *NewPtr = nullptr; if (!(match(Ptr, m_BitCast(m_Value(NewPtr))) && NewPtr->getType()->getPointerElementType() == NewTy && NewPtr->getType()->getPointerAddressSpace() == AS)) - NewPtr = IC.Builder.CreateBitCast(Ptr, NewTy->getPointerTo(AS)); + NewPtr = Builder.CreateBitCast(Ptr, NewTy->getPointerTo(AS)); - LoadInst *NewLoad = IC.Builder.CreateAlignedLoad( - NewTy, NewPtr, LI.getAlignment(), LI.isVolatile(), LI.getName() + Suffix); - NewLoad->setAtomic(LI.getOrdering(), LI.getSyncScopeID()); - MDBuilder MDB(NewLoad->getContext()); - for (const auto &MDPair : MD) { - unsigned ID = MDPair.first; - MDNode *N = MDPair.second; - // Note, essentially every kind of metadata should be preserved here! This - // routine is supposed to clone a load instruction changing *only its type*. - // The only metadata it makes sense to drop is metadata which is invalidated - // when the pointer type changes. This should essentially never be the case - // in LLVM, but we explicitly switch over only known metadata to be - // conservatively correct. If you are adding metadata to LLVM which pertains - // to loads, you almost certainly want to add it here. - switch (ID) { - case LLVMContext::MD_dbg: - case LLVMContext::MD_tbaa: - case LLVMContext::MD_prof: - case LLVMContext::MD_fpmath: - case LLVMContext::MD_tbaa_struct: - case LLVMContext::MD_invariant_load: - case LLVMContext::MD_alias_scope: - case LLVMContext::MD_noalias: - case LLVMContext::MD_nontemporal: - case LLVMContext::MD_mem_parallel_loop_access: - case LLVMContext::MD_access_group: - // All of these directly apply. - NewLoad->setMetadata(ID, N); - break; + unsigned Align = LI.getAlignment(); + if (!Align) + // If old load did not have an explicit alignment specified, + // manually preserve the implied (ABI) alignment of the load. 
+ // Else we may inadvertently incorrectly over-promise alignment. + Align = getDataLayout().getABITypeAlignment(LI.getType()); - case LLVMContext::MD_nonnull: - copyNonnullMetadata(LI, N, *NewLoad); - break; - case LLVMContext::MD_align: - case LLVMContext::MD_dereferenceable: - case LLVMContext::MD_dereferenceable_or_null: - // These only directly apply if the new type is also a pointer. - if (NewTy->isPointerTy()) - NewLoad->setMetadata(ID, N); - break; - case LLVMContext::MD_range: - copyRangeMetadata(IC.getDataLayout(), LI, N, *NewLoad); - break; - } - } + LoadInst *NewLoad = Builder.CreateAlignedLoad( + NewTy, NewPtr, Align, LI.isVolatile(), LI.getName() + Suffix); + NewLoad->setAtomic(LI.getOrdering(), LI.getSyncScopeID()); + copyMetadataForLoad(*NewLoad, LI); return NewLoad; } @@ -569,7 +533,7 @@ static StoreInst *combineStoreToNewValue(InstCombiner &IC, StoreInst &SI, Value /// Returns true if instruction represent minmax pattern like: /// select ((cmp load V1, load V2), V1, V2). -static bool isMinMaxWithLoads(Value *V) { +static bool isMinMaxWithLoads(Value *V, Type *&LoadTy) { assert(V->getType()->isPointerTy() && "Expected pointer type."); // Ignore possible ty* to ixx* bitcast. V = peekThroughBitcast(V); @@ -583,6 +547,7 @@ static bool isMinMaxWithLoads(Value *V) { if (!match(V, m_Select(m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2)), m_Value(LHS), m_Value(RHS)))) return false; + LoadTy = L1->getType(); return (match(L1, m_Load(m_Specific(LHS))) && match(L2, m_Load(m_Specific(RHS)))) || (match(L1, m_Load(m_Specific(RHS))) && @@ -628,20 +593,22 @@ static Instruction *combineLoadToOperationType(InstCombiner &IC, LoadInst &LI) { // size is a legal integer type. // Do not perform canonicalization if minmax pattern is found (to avoid // infinite loop). + Type *Dummy; if (!Ty->isIntegerTy() && Ty->isSized() && + !(Ty->isVectorTy() && Ty->getVectorIsScalable()) && DL.isLegalInteger(DL.getTypeStoreSizeInBits(Ty)) && DL.typeSizeEqualsStoreSize(Ty) && !DL.isNonIntegralPointerType(Ty) && !isMinMaxWithLoads( - peekThroughBitcast(LI.getPointerOperand(), /*OneUseOnly=*/true))) { + peekThroughBitcast(LI.getPointerOperand(), /*OneUseOnly=*/true), + Dummy)) { if (all_of(LI.users(), [&LI](User *U) { auto *SI = dyn_cast<StoreInst>(U); return SI && SI->getPointerOperand() != &LI && !SI->getPointerOperand()->isSwiftError(); })) { - LoadInst *NewLoad = combineLoadToNewType( - IC, LI, - Type::getIntNTy(LI.getContext(), DL.getTypeStoreSizeInBits(Ty))); + LoadInst *NewLoad = IC.combineLoadToNewType( + LI, Type::getIntNTy(LI.getContext(), DL.getTypeStoreSizeInBits(Ty))); // Replace all the stores with stores of the newly loaded value. for (auto UI = LI.user_begin(), UE = LI.user_end(); UI != UE;) { auto *SI = cast<StoreInst>(*UI++); @@ -663,7 +630,7 @@ static Instruction *combineLoadToOperationType(InstCombiner &IC, LoadInst &LI) { if (auto* CI = dyn_cast<CastInst>(LI.user_back())) if (CI->isNoopCast(DL)) if (!LI.isAtomic() || isSupportedAtomicType(CI->getDestTy())) { - LoadInst *NewLoad = combineLoadToNewType(IC, LI, CI->getDestTy()); + LoadInst *NewLoad = IC.combineLoadToNewType(LI, CI->getDestTy()); CI->replaceAllUsesWith(NewLoad); IC.eraseInstFromFunction(*CI); return &LI; @@ -691,8 +658,8 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) { // If the struct only have one element, we unpack. 
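A hypothetical C++ illustration of why combineLoadToOperationType() above may retype a load whose only users are stores: the copy below moves 32 bits regardless of whether they are viewed as float or integer, so the load can be rebuilt via combineLoadToNewType() at a legal integer type:

void copyBits(float *dst, const float *src) {
  *dst = *src;   // candidate for canonicalization to a 32-bit integer load/store
}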
auto NumElements = ST->getNumElements(); if (NumElements == 1) { - LoadInst *NewLoad = combineLoadToNewType(IC, LI, ST->getTypeAtIndex(0U), - ".unpack"); + LoadInst *NewLoad = IC.combineLoadToNewType(LI, ST->getTypeAtIndex(0U), + ".unpack"); AAMDNodes AAMD; LI.getAAMetadata(AAMD); NewLoad->setAAMetadata(AAMD); @@ -741,7 +708,7 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) { auto *ET = AT->getElementType(); auto NumElements = AT->getNumElements(); if (NumElements == 1) { - LoadInst *NewLoad = combineLoadToNewType(IC, LI, ET, ".unpack"); + LoadInst *NewLoad = IC.combineLoadToNewType(LI, ET, ".unpack"); AAMDNodes AAMD; LI.getAAMetadata(AAMD); NewLoad->setAAMetadata(AAMD); @@ -1004,9 +971,9 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { LoadAlign != 0 ? LoadAlign : DL.getABITypeAlignment(LI.getType()); if (KnownAlign > EffectiveLoadAlign) - LI.setAlignment(KnownAlign); + LI.setAlignment(MaybeAlign(KnownAlign)); else if (LoadAlign == 0) - LI.setAlignment(EffectiveLoadAlign); + LI.setAlignment(MaybeAlign(EffectiveLoadAlign)); // Replace GEP indices if possible. if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Op, LI)) { @@ -1063,11 +1030,11 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { // if (SelectInst *SI = dyn_cast<SelectInst>(Op)) { // load (select (Cond, &V1, &V2)) --> select(Cond, load &V1, load &V2). - unsigned Align = LI.getAlignment(); - if (isSafeToLoadUnconditionally(SI->getOperand(1), LI.getType(), Align, - DL, SI) && - isSafeToLoadUnconditionally(SI->getOperand(2), LI.getType(), Align, - DL, SI)) { + const MaybeAlign Alignment(LI.getAlignment()); + if (isSafeToLoadUnconditionally(SI->getOperand(1), LI.getType(), + Alignment, DL, SI) && + isSafeToLoadUnconditionally(SI->getOperand(2), LI.getType(), + Alignment, DL, SI)) { LoadInst *V1 = Builder.CreateLoad(LI.getType(), SI->getOperand(1), SI->getOperand(1)->getName() + ".val"); @@ -1075,9 +1042,9 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { Builder.CreateLoad(LI.getType(), SI->getOperand(2), SI->getOperand(2)->getName() + ".val"); assert(LI.isUnordered() && "implied by above"); - V1->setAlignment(Align); + V1->setAlignment(Alignment); V1->setAtomic(LI.getOrdering(), LI.getSyncScopeID()); - V2->setAlignment(Align); + V2->setAlignment(Alignment); V2->setAtomic(LI.getOrdering(), LI.getSyncScopeID()); return SelectInst::Create(SI->getCondition(), V1, V2); } @@ -1365,7 +1332,19 @@ static bool removeBitcastsFromLoadStoreOnMinMax(InstCombiner &IC, auto *LI = cast<LoadInst>(SI.getValueOperand()); if (!LI->getType()->isIntegerTy()) return false; - if (!isMinMaxWithLoads(LoadAddr)) + Type *CmpLoadTy; + if (!isMinMaxWithLoads(LoadAddr, CmpLoadTy)) + return false; + + // Make sure the type would actually change. + // This condition can be hit with chains of bitcasts. + if (LI->getType() == CmpLoadTy) + return false; + + // Make sure we're not changing the size of the load/store. + const auto &DL = IC.getDataLayout(); + if (DL.getTypeStoreSizeInBits(LI->getType()) != + DL.getTypeStoreSizeInBits(CmpLoadTy)) return false; if (!all_of(LI->users(), [LI, LoadAddr](User *U) { @@ -1377,8 +1356,7 @@ static bool removeBitcastsFromLoadStoreOnMinMax(InstCombiner &IC, return false; IC.Builder.SetInsertPoint(LI); - LoadInst *NewLI = combineLoadToNewType( - IC, *LI, LoadAddr->getType()->getPointerElementType()); + LoadInst *NewLI = IC.combineLoadToNewType(*LI, CmpLoadTy); // Replace all the stores with stores of the newly loaded value. 
for (auto *UI : LI->users()) { auto *USI = cast<StoreInst>(UI); @@ -1399,15 +1377,15 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { return eraseInstFromFunction(SI); // Attempt to improve the alignment. - unsigned KnownAlign = getOrEnforceKnownAlignment( - Ptr, DL.getPrefTypeAlignment(Val->getType()), DL, &SI, &AC, &DT); - unsigned StoreAlign = SI.getAlignment(); - unsigned EffectiveStoreAlign = - StoreAlign != 0 ? StoreAlign : DL.getABITypeAlignment(Val->getType()); + const Align KnownAlign = Align(getOrEnforceKnownAlignment( + Ptr, DL.getPrefTypeAlignment(Val->getType()), DL, &SI, &AC, &DT)); + const MaybeAlign StoreAlign = MaybeAlign(SI.getAlignment()); + const Align EffectiveStoreAlign = + StoreAlign ? *StoreAlign : Align(DL.getABITypeAlignment(Val->getType())); if (KnownAlign > EffectiveStoreAlign) SI.setAlignment(KnownAlign); - else if (StoreAlign == 0) + else if (!StoreAlign) SI.setAlignment(EffectiveStoreAlign); // Try to canonicalize the stored type. @@ -1466,9 +1444,12 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { if (PrevSI->isUnordered() && equivalentAddressValues(PrevSI->getOperand(1), SI.getOperand(1))) { ++NumDeadStore; - ++BBI; + // Manually add back the original store to the worklist now, so it will + // be processed after the operands of the removed store, as this may + // expose additional DSE opportunities. + Worklist.Add(&SI); eraseInstFromFunction(*PrevSI); - continue; + return nullptr; } break; } @@ -1622,8 +1603,8 @@ bool InstCombiner::mergeStoreIntoSuccessor(StoreInst &SI) { // Advance to a place where it is safe to insert the new store and insert it. BBI = DestBB->getFirstInsertionPt(); - StoreInst *NewSI = new StoreInst(MergedVal, SI.getOperand(1), - SI.isVolatile(), SI.getAlignment(), + StoreInst *NewSI = new StoreInst(MergedVal, SI.getOperand(1), SI.isVolatile(), + MaybeAlign(SI.getAlignment()), SI.getOrdering(), SI.getSyncScopeID()); InsertNewInstBefore(NewSI, *BBI); NewSI->setDebugLoc(MergedLoc); diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index cc753ce05313..2774e46151fa 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -124,6 +124,50 @@ static Constant *getLogBase2(Type *Ty, Constant *C) { return ConstantVector::get(Elts); } +// TODO: This is a specific form of a much more general pattern. +// We could detect a select with any binop identity constant, or we +// could use SimplifyBinOp to see if either arm of the select reduces. +// But that needs to be done carefully and/or while removing potential +// reverse canonicalizations as in InstCombiner::foldSelectIntoOp(). 
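A small, hypothetical C++ example of the store sequence affected by the visitStoreInst change above: the first store is dead, and re-queuing the surviving store lets further dead stores surface once its operands are revisited:

void overwrite(int *p, int a, int b) {
  *p = a;   // dead: unconditionally overwritten below with no intervening read
  *p = b;
}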
+static Value *foldMulSelectToNegate(BinaryOperator &I, + InstCombiner::BuilderTy &Builder) { + Value *Cond, *OtherOp; + + // mul (select Cond, 1, -1), OtherOp --> select Cond, OtherOp, -OtherOp + // mul OtherOp, (select Cond, 1, -1) --> select Cond, OtherOp, -OtherOp + if (match(&I, m_c_Mul(m_OneUse(m_Select(m_Value(Cond), m_One(), m_AllOnes())), + m_Value(OtherOp)))) + return Builder.CreateSelect(Cond, OtherOp, Builder.CreateNeg(OtherOp)); + + // mul (select Cond, -1, 1), OtherOp --> select Cond, -OtherOp, OtherOp + // mul OtherOp, (select Cond, -1, 1) --> select Cond, -OtherOp, OtherOp + if (match(&I, m_c_Mul(m_OneUse(m_Select(m_Value(Cond), m_AllOnes(), m_One())), + m_Value(OtherOp)))) + return Builder.CreateSelect(Cond, Builder.CreateNeg(OtherOp), OtherOp); + + // fmul (select Cond, 1.0, -1.0), OtherOp --> select Cond, OtherOp, -OtherOp + // fmul OtherOp, (select Cond, 1.0, -1.0) --> select Cond, OtherOp, -OtherOp + if (match(&I, m_c_FMul(m_OneUse(m_Select(m_Value(Cond), m_SpecificFP(1.0), + m_SpecificFP(-1.0))), + m_Value(OtherOp)))) { + IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); + Builder.setFastMathFlags(I.getFastMathFlags()); + return Builder.CreateSelect(Cond, OtherOp, Builder.CreateFNeg(OtherOp)); + } + + // fmul (select Cond, -1.0, 1.0), OtherOp --> select Cond, -OtherOp, OtherOp + // fmul OtherOp, (select Cond, -1.0, 1.0) --> select Cond, -OtherOp, OtherOp + if (match(&I, m_c_FMul(m_OneUse(m_Select(m_Value(Cond), m_SpecificFP(-1.0), + m_SpecificFP(1.0))), + m_Value(OtherOp)))) { + IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); + Builder.setFastMathFlags(I.getFastMathFlags()); + return Builder.CreateSelect(Cond, Builder.CreateFNeg(OtherOp), OtherOp); + } + + return nullptr; +} + Instruction *InstCombiner::visitMul(BinaryOperator &I) { if (Value *V = SimplifyMulInst(I.getOperand(0), I.getOperand(1), SQ.getWithInstruction(&I))) @@ -213,6 +257,9 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { if (Instruction *FoldedMul = foldBinOpIntoSelectOrPhi(I)) return FoldedMul; + if (Value *FoldedMul = foldMulSelectToNegate(I, Builder)) + return replaceInstUsesWith(I, FoldedMul); + // Simplify mul instructions with a constant RHS. if (isa<Constant>(Op1)) { // Canonicalize (X+C1)*CI -> X*CI+C1*CI. @@ -358,6 +405,9 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { if (Instruction *FoldedMul = foldBinOpIntoSelectOrPhi(I)) return FoldedMul; + if (Value *FoldedMul = foldMulSelectToNegate(I, Builder)) + return replaceInstUsesWith(I, FoldedMul); + // X * -1.0 --> -X Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); if (match(Op1, m_SpecificFP(-1.0))) @@ -373,16 +423,6 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_Constant(C))) return BinaryOperator::CreateFMulFMF(X, ConstantExpr::getFNeg(C), &I); - // Sink negation: -X * Y --> -(X * Y) - // But don't transform constant expressions because there's an inverse fold. - if (match(Op0, m_OneUse(m_FNeg(m_Value(X)))) && !isa<ConstantExpr>(Op0)) - return BinaryOperator::CreateFNegFMF(Builder.CreateFMulFMF(X, Op1, &I), &I); - - // Sink negation: Y * -X --> -(X * Y) - // But don't transform constant expressions because there's an inverse fold. 
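A compact C++ sketch of the pattern foldMulSelectToNegate() above recognizes (function name hypothetical): multiplying by a select of +1/-1 is just a conditional negation, so no multiply is needed:

int applySign(bool negate, int v) {
  int s = negate ? -1 : 1;
  return s * v;            // expected to fold to: negate ? -v : v
}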
- if (match(Op1, m_OneUse(m_FNeg(m_Value(X)))) && !isa<ConstantExpr>(Op1)) - return BinaryOperator::CreateFNegFMF(Builder.CreateFMulFMF(X, Op0, &I), &I); - // fabs(X) * fabs(X) -> X * X if (Op0 == Op1 && match(Op0, m_Intrinsic<Intrinsic::fabs>(m_Value(X)))) return BinaryOperator::CreateFMulFMF(X, X, &I); @@ -1199,6 +1239,14 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { Value *YZ = Builder.CreateFMulFMF(Y, Op0, &I); return BinaryOperator::CreateFDivFMF(YZ, X, &I); } + // Z / (1.0 / Y) => (Y * Z) + // + // This is a special case of Z / (X / Y) => (Y * Z) / X, with X = 1.0. The + // m_OneUse check is avoided because even in the case of the multiple uses + // for 1.0/Y, the number of instructions remain the same and a division is + // replaced by a multiplication. + if (match(Op1, m_FDiv(m_SpecificFP(1.0), m_Value(Y)))) + return BinaryOperator::CreateFMulFMF(Y, Op0, &I); } if (I.hasAllowReassoc() && Op0->hasOneUse() && Op1->hasOneUse()) { @@ -1211,8 +1259,8 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { !IsTan && match(Op0, m_Intrinsic<Intrinsic::cos>(m_Value(X))) && match(Op1, m_Intrinsic<Intrinsic::sin>(m_Specific(X))); - if ((IsTan || IsCot) && hasUnaryFloatFn(&TLI, I.getType(), LibFunc_tan, - LibFunc_tanf, LibFunc_tanl)) { + if ((IsTan || IsCot) && + hasFloatFn(&TLI, I.getType(), LibFunc_tan, LibFunc_tanf, LibFunc_tanl)) { IRBuilder<> B(&I); IRBuilder<>::FastMathFlagGuard FMFGuard(B); B.setFastMathFlags(I.getFastMathFlags()); @@ -1244,6 +1292,17 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { return &I; } + // X / fabs(X) -> copysign(1.0, X) + // fabs(X) / X -> copysign(1.0, X) + if (I.hasNoNaNs() && I.hasNoInfs() && + (match(&I, + m_FDiv(m_Value(X), m_Intrinsic<Intrinsic::fabs>(m_Deferred(X)))) || + match(&I, m_FDiv(m_Intrinsic<Intrinsic::fabs>(m_Value(X)), + m_Deferred(X))))) { + Value *V = Builder.CreateBinaryIntrinsic( + Intrinsic::copysign, ConstantFP::get(I.getType(), 1.0), X, &I); + return replaceInstUsesWith(I, V); + } return nullptr; } @@ -1309,14 +1368,18 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); Type *Ty = I.getType(); if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, &I)) { + // This may increase instruction count, we don't enforce that Y is a + // constant. Constant *N1 = Constant::getAllOnesValue(Ty); Value *Add = Builder.CreateAdd(Op1, N1); return BinaryOperator::CreateAnd(Op0, Add); } // 1 urem X -> zext(X != 1) - if (match(Op0, m_One())) - return CastInst::CreateZExtOrBitCast(Builder.CreateICmpNE(Op1, Op0), Ty); + if (match(Op0, m_One())) { + Value *Cmp = Builder.CreateICmpNE(Op1, ConstantInt::get(Ty, 1)); + return CastInst::CreateZExtOrBitCast(Cmp, Ty); + } // X urem C -> X < C ? X : X - C, where C >= signbit. 
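An illustrative C++ example of the visitURem() fold above, assuming the divisor can be proven to be a power of two; names are hypothetical:

#include <cstdint>

uint32_t modPow2(uint32_t x, uint32_t shift) {
  uint32_t p = 1u << shift;   // a power of two for shift < 32
  return x % p;               // expected to fold to: x & (p - 1)
}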
if (match(Op1, m_Negative())) { diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index 5820ab726637..74e015a4f1d4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -14,9 +14,10 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; using namespace llvm::PatternMatch; @@ -180,13 +181,14 @@ Instruction *InstCombiner::FoldIntegerTypedPHI(PHINode &PN) { "Not enough available ptr typed incoming values"); PHINode *MatchingPtrPHI = nullptr; unsigned NumPhis = 0; - for (auto II = BB->begin(), EI = BasicBlock::iterator(BB->getFirstNonPHI()); - II != EI; II++, NumPhis++) { + for (auto II = BB->begin(); II != BB->end(); II++, NumPhis++) { // FIXME: consider handling this in AggressiveInstCombine + PHINode *PtrPHI = dyn_cast<PHINode>(II); + if (!PtrPHI) + break; if (NumPhis > MaxNumPhis) return nullptr; - PHINode *PtrPHI = dyn_cast<PHINode>(II); - if (!PtrPHI || PtrPHI == &PN || PtrPHI->getType() != IntToPtr->getType()) + if (PtrPHI == &PN || PtrPHI->getType() != IntToPtr->getType()) continue; MatchingPtrPHI = PtrPHI; for (unsigned i = 0; i != PtrPHI->getNumIncomingValues(); ++i) { @@ -542,7 +544,7 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { // visitLoadInst will propagate an alignment onto the load when TD is around, // and if TD isn't around, we can't handle the mixed case. bool isVolatile = FirstLI->isVolatile(); - unsigned LoadAlignment = FirstLI->getAlignment(); + MaybeAlign LoadAlignment(FirstLI->getAlignment()); unsigned LoadAddrSpace = FirstLI->getPointerAddressSpace(); // We can't sink the load if the loaded value could be modified between the @@ -574,10 +576,10 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { // If some of the loads have an alignment specified but not all of them, // we can't do the transformation. - if ((LoadAlignment != 0) != (LI->getAlignment() != 0)) + if ((LoadAlignment.hasValue()) != (LI->getAlignment() != 0)) return nullptr; - LoadAlignment = std::min(LoadAlignment, LI->getAlignment()); + LoadAlignment = std::min(LoadAlignment, MaybeAlign(LI->getAlignment())); // If the PHI is of volatile loads and the load block has multiple // successors, sinking it would remove a load of the volatile value from diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index aefaf5af1750..49645e9460cd 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -704,16 +704,24 @@ static Value *canonicalizeSaturatedSubtract(const ICmpInst *ICI, assert((Pred == ICmpInst::ICMP_UGE || Pred == ICmpInst::ICMP_UGT) && "Unexpected isUnsigned predicate!"); - // Account for swapped form of subtraction: ((a > b) ? b - a : 0). + // Ensure the sub is of the form: + // (a > b) ? a - b : 0 -> usub.sat(a, b) + // (a > b) ? b - a : 0 -> -usub.sat(a, b) + // Checking for both a-b and a+(-b) as a constant. 
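A minimal C++ sketch of the saturated-subtract shape canonicalizeSaturatedSubtract() matches (function name hypothetical):

#include <cstdint>

uint32_t subSat(uint32_t a, uint32_t b) {
  return a > b ? a - b : 0;   // expected to become a saturating subtraction
}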
bool IsNegative = false; - if (match(TrueVal, m_Sub(m_Specific(B), m_Specific(A)))) + const APInt *C; + if (match(TrueVal, m_Sub(m_Specific(B), m_Specific(A))) || + (match(A, m_APInt(C)) && + match(TrueVal, m_Add(m_Specific(B), m_SpecificInt(-*C))))) IsNegative = true; - else if (!match(TrueVal, m_Sub(m_Specific(A), m_Specific(B)))) + else if (!match(TrueVal, m_Sub(m_Specific(A), m_Specific(B))) && + !(match(B, m_APInt(C)) && + match(TrueVal, m_Add(m_Specific(A), m_SpecificInt(-*C))))) return nullptr; - // If sub is used anywhere else, we wouldn't be able to eliminate it - // afterwards. - if (!TrueVal->hasOneUse()) + // If we are adding a negate and the sub and icmp are used anywhere else, we + // would end up with more instructions. + if (IsNegative && !TrueVal->hasOneUse() && !ICI->hasOneUse()) return nullptr; // (a > b) ? a - b : 0 -> usub.sat(a, b) @@ -781,10 +789,52 @@ static Value *canonicalizeSaturatedAdd(ICmpInst *Cmp, Value *TVal, Value *FVal, return Builder.CreateBinaryIntrinsic( Intrinsic::uadd_sat, BO->getOperand(0), BO->getOperand(1)); } + // The overflow may be detected via the add wrapping round. + if (match(Cmp0, m_c_Add(m_Specific(Cmp1), m_Value(Y))) && + match(FVal, m_c_Add(m_Specific(Cmp1), m_Specific(Y)))) { + // ((X + Y) u< X) ? -1 : (X + Y) --> uadd.sat(X, Y) + // ((X + Y) u< Y) ? -1 : (X + Y) --> uadd.sat(X, Y) + return Builder.CreateBinaryIntrinsic(Intrinsic::uadd_sat, Cmp1, Y); + } return nullptr; } +/// Fold the following code sequence: +/// \code +/// int a = ctlz(x & -x); +// x ? 31 - a : a; +/// \code +/// +/// into: +/// cttz(x) +static Instruction *foldSelectCtlzToCttz(ICmpInst *ICI, Value *TrueVal, + Value *FalseVal, + InstCombiner::BuilderTy &Builder) { + unsigned BitWidth = TrueVal->getType()->getScalarSizeInBits(); + if (!ICI->isEquality() || !match(ICI->getOperand(1), m_Zero())) + return nullptr; + + if (ICI->getPredicate() == ICmpInst::ICMP_NE) + std::swap(TrueVal, FalseVal); + + if (!match(FalseVal, + m_Xor(m_Deferred(TrueVal), m_SpecificInt(BitWidth - 1)))) + return nullptr; + + if (!match(TrueVal, m_Intrinsic<Intrinsic::ctlz>())) + return nullptr; + + Value *X = ICI->getOperand(0); + auto *II = cast<IntrinsicInst>(TrueVal); + if (!match(II->getOperand(0), m_c_And(m_Specific(X), m_Neg(m_Specific(X))))) + return nullptr; + + Function *F = Intrinsic::getDeclaration(II->getModule(), Intrinsic::cttz, + II->getType()); + return CallInst::Create(F, {X, II->getArgOperand(1)}); +} + /// Attempt to fold a cttz/ctlz followed by a icmp plus select into a single /// call to cttz/ctlz with flag 'is_zero_undef' cleared. /// @@ -963,6 +1013,12 @@ canonicalizeMinMaxWithConstant(SelectInst &Sel, ICmpInst &Cmp, Cmp.getPredicate() == CanonicalPred) return nullptr; + // Bail out on unsimplified X-0 operand (due to some worklist management bug), + // as this may cause an infinite combine loop. Let the sub be folded first. + if (match(LHS, m_Sub(m_Value(), m_Zero())) || + match(RHS, m_Sub(m_Value(), m_Zero()))) + return nullptr; + // Create the canonical compare and plug it into the select. Sel.setCondition(Builder.CreateICmp(CanonicalPred, LHS, RHS)); @@ -973,8 +1029,7 @@ canonicalizeMinMaxWithConstant(SelectInst &Sel, ICmpInst &Cmp, // If we are swapping the select operands, swap the metadata too. 
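And the corresponding sketch for the new wrap-around case in canonicalizeSaturatedAdd() above (hypothetical function name): clamping an unsigned sum at the maximum when it wraps is a saturating add:

#include <cstdint>

uint32_t addSat(uint32_t x, uint32_t y) {
  uint32_t s = x + y;
  return s < x ? UINT32_MAX : s;   // expected to become a saturating add
}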
assert(Sel.getTrueValue() == RHS && Sel.getFalseValue() == LHS && "Unexpected results from matchSelectPattern"); - Sel.setTrueValue(LHS); - Sel.setFalseValue(RHS); + Sel.swapValues(); Sel.swapProfMetadata(); return &Sel; } @@ -1056,17 +1111,293 @@ static Instruction *canonicalizeAbsNabs(SelectInst &Sel, ICmpInst &Cmp, } // We are swapping the select operands, so swap the metadata too. - Sel.setTrueValue(FVal); - Sel.setFalseValue(TVal); + Sel.swapValues(); Sel.swapProfMetadata(); return &Sel; } +static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *ReplaceOp, + const SimplifyQuery &Q) { + // If this is a binary operator, try to simplify it with the replaced op + // because we know Op and ReplaceOp are equivalant. + // For example: V = X + 1, Op = X, ReplaceOp = 42 + // Simplifies as: add(42, 1) --> 43 + if (auto *BO = dyn_cast<BinaryOperator>(V)) { + if (BO->getOperand(0) == Op) + return SimplifyBinOp(BO->getOpcode(), ReplaceOp, BO->getOperand(1), Q); + if (BO->getOperand(1) == Op) + return SimplifyBinOp(BO->getOpcode(), BO->getOperand(0), ReplaceOp, Q); + } + + return nullptr; +} + +/// If we have a select with an equality comparison, then we know the value in +/// one of the arms of the select. See if substituting this value into an arm +/// and simplifying the result yields the same value as the other arm. +/// +/// To make this transform safe, we must drop poison-generating flags +/// (nsw, etc) if we simplified to a binop because the select may be guarding +/// that poison from propagating. If the existing binop already had no +/// poison-generating flags, then this transform can be done by instsimplify. +/// +/// Consider: +/// %cmp = icmp eq i32 %x, 2147483647 +/// %add = add nsw i32 %x, 1 +/// %sel = select i1 %cmp, i32 -2147483648, i32 %add +/// +/// We can't replace %sel with %add unless we strip away the flags. +/// TODO: Wrapping flags could be preserved in some cases with better analysis. +static Value *foldSelectValueEquivalence(SelectInst &Sel, ICmpInst &Cmp, + const SimplifyQuery &Q) { + if (!Cmp.isEquality()) + return nullptr; + + // Canonicalize the pattern to ICMP_EQ by swapping the select operands. + Value *TrueVal = Sel.getTrueValue(), *FalseVal = Sel.getFalseValue(); + if (Cmp.getPredicate() == ICmpInst::ICMP_NE) + std::swap(TrueVal, FalseVal); + + // Try each equivalence substitution possibility. + // We have an 'EQ' comparison, so the select's false value will propagate. + // Example: + // (X == 42) ? 43 : (X + 1) --> (X == 42) ? (X + 1) : (X + 1) --> X + 1 + // (X == 42) ? (X + 1) : 43 --> (X == 42) ? 
(42 + 1) : 43 --> 43 + Value *CmpLHS = Cmp.getOperand(0), *CmpRHS = Cmp.getOperand(1); + if (simplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, Q) == TrueVal || + simplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, Q) == TrueVal || + simplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, Q) == FalseVal || + simplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, Q) == FalseVal) { + if (auto *FalseInst = dyn_cast<Instruction>(FalseVal)) + FalseInst->dropPoisonGeneratingFlags(); + return FalseVal; + } + return nullptr; +} + +// See if this is a pattern like: +// %old_cmp1 = icmp slt i32 %x, C2 +// %old_replacement = select i1 %old_cmp1, i32 %target_low, i32 %target_high +// %old_x_offseted = add i32 %x, C1 +// %old_cmp0 = icmp ult i32 %old_x_offseted, C0 +// %r = select i1 %old_cmp0, i32 %x, i32 %old_replacement +// This can be rewritten as the more canonical pattern: +// %new_cmp1 = icmp slt i32 %x, -C1 +// %new_cmp2 = icmp sge i32 %x, C0-C1 +// %new_clamped_low = select i1 %new_cmp1, i32 %target_low, i32 %x +// %r = select i1 %new_cmp2, i32 %target_high, i32 %new_clamped_low +// Iff -C1 s<= C2 s<= C0-C1 +// The ULT predicate can also be UGT iff C0 != -1 (+invert result) +// The SLT predicate can also be SGT iff C2 != INT_MAX (+invert res.) +static Instruction *canonicalizeClampLike(SelectInst &Sel0, ICmpInst &Cmp0, + InstCombiner::BuilderTy &Builder) { + Value *X = Sel0.getTrueValue(); + Value *Sel1 = Sel0.getFalseValue(); + + // First match the condition of the outermost select. + // Said condition must be one-use. + if (!Cmp0.hasOneUse()) + return nullptr; + Value *Cmp00 = Cmp0.getOperand(0); + Constant *C0; + if (!match(Cmp0.getOperand(1), + m_CombineAnd(m_AnyIntegralConstant(), m_Constant(C0)))) + return nullptr; + // Canonicalize Cmp0 into the form we expect. + // FIXME: we shouldn't care about lanes that are 'undef' in the end? + switch (Cmp0.getPredicate()) { + case ICmpInst::Predicate::ICMP_ULT: + break; // Great! + case ICmpInst::Predicate::ICMP_ULE: + // We'd have to increment C0 by one, and for that it must not have an + // all-ones element, but then it would have been canonicalized to 'ult' + // before we get here. So we can't do anything useful with 'ule'. + return nullptr; + case ICmpInst::Predicate::ICMP_UGT: + // We want to canonicalize it to 'ult', so we'll need to increment C0, + // which again means it must not have any all-ones elements. + if (!match(C0, + m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_NE, + APInt::getAllOnesValue( + C0->getType()->getScalarSizeInBits())))) + return nullptr; // Can't do, have all-ones element[s]. + C0 = AddOne(C0); + std::swap(X, Sel1); + break; + case ICmpInst::Predicate::ICMP_UGE: + // The only way we'd get this predicate is if this `icmp` has extra uses, + // but then we won't be able to do this fold. + return nullptr; + default: + return nullptr; // Unknown predicate. + } + + // Now that we've canonicalized the ICmp, we know the X we expect; + // the select, on the other hand, should be one-use. + if (!Sel1->hasOneUse()) + return nullptr; + + // We can now finish matching the condition of the outermost select: + // it should either be the X itself, or an addition of some constant to X.
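To make the clamp-like rewrite described in the comment above concrete, here is a C++ sketch with illustrative constants C1 = 32, C0 = 64, C2 = 0, which satisfy the precondition -C1 s<= C2 s<= C0-C1 (the signed-overflow caveat on x + 32 exists only at the C++ level; the IR add simply wraps):

    int clamp_like(int x, int target_low, int target_high) {
      int repl = x < 0 ? target_low : target_high; // %old_cmp1 / %old_replacement
      return (unsigned)(x + 32) < 64u ? x : repl;  // %old_cmp0 / %r
    }

    // ...which canonicalizeClampLike rewrites into the two-select form:
    int clamp_canonical(int x, int target_low, int target_high) {
      int low = x < -32 ? target_low : x;          // %new_cmp1 / %new_clamped_low
      return x >= 32 ? target_high : low;          // %new_cmp2 / %r
    }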
+ Constant *C1; + if (Cmp00 == X) + C1 = ConstantInt::getNullValue(Sel0.getType()); + else if (!match(Cmp00, + m_Add(m_Specific(X), + m_CombineAnd(m_AnyIntegralConstant(), m_Constant(C1))))) + return nullptr; + + Value *Cmp1; + ICmpInst::Predicate Pred1; + Constant *C2; + Value *ReplacementLow, *ReplacementHigh; + if (!match(Sel1, m_Select(m_Value(Cmp1), m_Value(ReplacementLow), + m_Value(ReplacementHigh))) || + !match(Cmp1, + m_ICmp(Pred1, m_Specific(X), + m_CombineAnd(m_AnyIntegralConstant(), m_Constant(C2))))) + return nullptr; + + if (!Cmp1->hasOneUse() && (Cmp00 == X || !Cmp00->hasOneUse())) + return nullptr; // Not enough one-use instructions for the fold. + // FIXME: this restriction could be relaxed if Cmp1 can be reused as one of + // two comparisons we'll need to build. + + // Canonicalize Cmp1 into the form we expect. + // FIXME: we shouldn't care about lanes that are 'undef' in the end? + switch (Pred1) { + case ICmpInst::Predicate::ICMP_SLT: + break; + case ICmpInst::Predicate::ICMP_SLE: + // We'd have to increment C2 by one, and for that it must not have signed + // max element, but then it would have been canonicalized to 'slt' before + // we get here. So we can't do anything useful with 'sle'. + return nullptr; + case ICmpInst::Predicate::ICMP_SGT: + // We want to canonicalize it to 'slt', so we'll need to increment C2, + // which again means it must not have any signed max elements. + if (!match(C2, + m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_NE, + APInt::getSignedMaxValue( + C2->getType()->getScalarSizeInBits())))) + return nullptr; // Can't do, have signed max element[s]. + C2 = AddOne(C2); + LLVM_FALLTHROUGH; + case ICmpInst::Predicate::ICMP_SGE: + // Also non-canonical, but here we don't need to change C2, + // so we don't have any restrictions on C2, so we can just handle it. + std::swap(ReplacementLow, ReplacementHigh); + break; + default: + return nullptr; // Unknown predicate. + } + + // The thresholds of this clamp-like pattern. + auto *ThresholdLowIncl = ConstantExpr::getNeg(C1); + auto *ThresholdHighExcl = ConstantExpr::getSub(C0, C1); + + // The fold has a precondition 1: C2 s>= ThresholdLow + auto *Precond1 = ConstantExpr::getICmp(ICmpInst::Predicate::ICMP_SGE, C2, + ThresholdLowIncl); + if (!match(Precond1, m_One())) + return nullptr; + // The fold has a precondition 2: C2 s<= ThresholdHigh + auto *Precond2 = ConstantExpr::getICmp(ICmpInst::Predicate::ICMP_SLE, C2, + ThresholdHighExcl); + if (!match(Precond2, m_One())) + return nullptr; + + // All good, finally emit the new pattern. + Value *ShouldReplaceLow = Builder.CreateICmpSLT(X, ThresholdLowIncl); + Value *ShouldReplaceHigh = Builder.CreateICmpSGE(X, ThresholdHighExcl); + Value *MaybeReplacedLow = + Builder.CreateSelect(ShouldReplaceLow, ReplacementLow, X); + Instruction *MaybeReplacedHigh = + SelectInst::Create(ShouldReplaceHigh, ReplacementHigh, MaybeReplacedLow); + + return MaybeReplacedHigh; +} + +// If we have +// %cmp = icmp [canonical predicate] i32 %x, C0 +// %r = select i1 %cmp, i32 %y, i32 C1 +// Where C0 != C1 and %x may be different from %y, see if the constant that we +// will have if we flip the strictness of the predicate (i.e. without changing +// the result) is identical to the C1 in select. If it matches we can change +// original comparison to one with swapped predicate, reuse the constant, +// and swap the hands of select. 
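A concrete instance of the rewrite described just above, sketched in C++ with an illustrative constant C0 = 4:

    unsigned pick_before(unsigned x, unsigned y) { return x < 4u ? y : 3u; }
    // Flipping the strictness of (ult, 4) yields (ule, 3); the constant 3
    // already appears in the select, so the comparison is rebuilt with the
    // swapped predicate and the select arms are exchanged:
    unsigned pick_after(unsigned x, unsigned y) { return x > 3u ? 3u : y; }

Both functions compute the same value; the rewritten form reuses the select constant in the comparison, so the original constant C0 disappears.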
+static Instruction * +tryToReuseConstantFromSelectInComparison(SelectInst &Sel, ICmpInst &Cmp, + InstCombiner::BuilderTy &Builder) { + ICmpInst::Predicate Pred; + Value *X; + Constant *C0; + if (!match(&Cmp, m_OneUse(m_ICmp( + Pred, m_Value(X), + m_CombineAnd(m_AnyIntegralConstant(), m_Constant(C0)))))) + return nullptr; + + // If the comparison predicate is non-relational, we won't be able to do anything. + if (ICmpInst::isEquality(Pred)) + return nullptr; + + // If the comparison predicate is non-canonical, then we certainly won't be able + // to make it canonical; canonicalizeCmpWithConstant() already tried. + if (!isCanonicalPredicate(Pred)) + return nullptr; + + // If the [input] type of comparison and select type are different, let's abort + // for now. We could try to compare constants with trunc/[zs]ext though. + if (C0->getType() != Sel.getType()) + return nullptr; + + // FIXME: are there any magic icmp predicate+constant pairs we must not touch? + + Value *SelVal0, *SelVal1; // We do not care which one is from where. + match(&Sel, m_Select(m_Value(), m_Value(SelVal0), m_Value(SelVal1))); + // At least one of these values we are selecting between must be a constant, + // else we'll never succeed. + if (!match(SelVal0, m_AnyIntegralConstant()) && + !match(SelVal1, m_AnyIntegralConstant())) + return nullptr; + + // Does this constant C match any of the `select` values? + auto MatchesSelectValue = [SelVal0, SelVal1](Constant *C) { + return C->isElementWiseEqual(SelVal0) || C->isElementWiseEqual(SelVal1); + }; + + // If C0 *already* matches the true/false value of the select, we are done. + if (MatchesSelectValue(C0)) + return nullptr; + + // Check the constant we'd have with flipped-strictness predicate. + auto FlippedStrictness = getFlippedStrictnessPredicateAndConstant(Pred, C0); + if (!FlippedStrictness) + return nullptr; + + // If said constant doesn't match either, then there is no hope. + if (!MatchesSelectValue(FlippedStrictness->second)) + return nullptr; + + // It matched! Let's insert the new comparison just before the select. + InstCombiner::BuilderTy::InsertPointGuard Guard(Builder); + Builder.SetInsertPoint(&Sel); + + Pred = ICmpInst::getSwappedPredicate(Pred); // Yes, swapped. + Value *NewCmp = Builder.CreateICmp(Pred, X, FlippedStrictness->second, + Cmp.getName() + ".inv"); + Sel.setCondition(NewCmp); + Sel.swapValues(); + Sel.swapProfMetadata(); + + return &Sel; +} + /// Visit a SelectInst that has an ICmpInst as its first operand.
Instruction *InstCombiner::foldSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI) { - Value *TrueVal = SI.getTrueValue(); - Value *FalseVal = SI.getFalseValue(); + if (Value *V = foldSelectValueEquivalence(SI, *ICI, SQ)) + return replaceInstUsesWith(SI, V); if (Instruction *NewSel = canonicalizeMinMaxWithConstant(SI, *ICI, Builder)) return NewSel; @@ -1074,12 +1405,21 @@ Instruction *InstCombiner::foldSelectInstWithICmp(SelectInst &SI, if (Instruction *NewAbs = canonicalizeAbsNabs(SI, *ICI, Builder)) return NewAbs; + if (Instruction *NewAbs = canonicalizeClampLike(SI, *ICI, Builder)) + return NewAbs; + + if (Instruction *NewSel = + tryToReuseConstantFromSelectInComparison(SI, *ICI, Builder)) + return NewSel; + bool Changed = adjustMinMax(SI, *ICI); if (Value *V = foldSelectICmpAnd(SI, ICI, Builder)) return replaceInstUsesWith(SI, V); // NOTE: if we wanted to, this is where to detect integer MIN/MAX + Value *TrueVal = SI.getTrueValue(); + Value *FalseVal = SI.getFalseValue(); ICmpInst::Predicate Pred = ICI->getPredicate(); Value *CmpLHS = ICI->getOperand(0); Value *CmpRHS = ICI->getOperand(1); @@ -1149,6 +1489,9 @@ Instruction *InstCombiner::foldSelectInstWithICmp(SelectInst &SI, foldSelectICmpAndAnd(SI.getType(), ICI, TrueVal, FalseVal, Builder)) return V; + if (Instruction *V = foldSelectCtlzToCttz(ICI, TrueVal, FalseVal, Builder)) + return V; + if (Value *V = foldSelectICmpAndOr(ICI, TrueVal, FalseVal, Builder)) return replaceInstUsesWith(SI, V); @@ -1253,6 +1596,16 @@ Instruction *InstCombiner::foldSPFofSPF(Instruction *Inner, } } + // max(max(A, B), min(A, B)) --> max(A, B) + // min(min(A, B), max(A, B)) --> min(A, B) + // TODO: This could be done in instsimplify. + if (SPF1 == SPF2 && + ((SPF1 == SPF_UMIN && match(C, m_c_UMax(m_Specific(A), m_Specific(B)))) || + (SPF1 == SPF_SMIN && match(C, m_c_SMax(m_Specific(A), m_Specific(B)))) || + (SPF1 == SPF_UMAX && match(C, m_c_UMin(m_Specific(A), m_Specific(B)))) || + (SPF1 == SPF_SMAX && match(C, m_c_SMin(m_Specific(A), m_Specific(B)))))) + return replaceInstUsesWith(Outer, Inner); + // ABS(ABS(X)) -> ABS(X) // NABS(NABS(X)) -> NABS(X) // TODO: This could be done in instsimplify. @@ -1280,7 +1633,7 @@ Instruction *InstCombiner::foldSPFofSPF(Instruction *Inner, return true; } - if (IsFreeToInvert(V, !V->hasNUsesOrMore(3))) { + if (isFreeToInvert(V, !V->hasNUsesOrMore(3))) { NotV = nullptr; return true; } @@ -1393,6 +1746,128 @@ static Instruction *foldAddSubSelect(SelectInst &SI, return nullptr; } +/// Turn X + Y overflows ? -1 : X + Y -> uadd_sat X, Y +/// And X - Y overflows ? 0 : X - Y -> usub_sat X, Y +/// Along with a number of patterns similar to: +/// X + Y overflows ? (X < 0 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y +/// X - Y overflows ? (X > 0 ? 
INTMAX : INTMIN) : X - Y --> ssub_sat X, Y +static Instruction * +foldOverflowingAddSubSelect(SelectInst &SI, InstCombiner::BuilderTy &Builder) { + Value *CondVal = SI.getCondition(); + Value *TrueVal = SI.getTrueValue(); + Value *FalseVal = SI.getFalseValue(); + + WithOverflowInst *II; + if (!match(CondVal, m_ExtractValue<1>(m_WithOverflowInst(II))) || + !match(FalseVal, m_ExtractValue<0>(m_Specific(II)))) + return nullptr; + + Value *X = II->getLHS(); + Value *Y = II->getRHS(); + + auto IsSignedSaturateLimit = [&](Value *Limit, bool IsAdd) { + Type *Ty = Limit->getType(); + + ICmpInst::Predicate Pred; + Value *TrueVal, *FalseVal, *Op; + const APInt *C; + if (!match(Limit, m_Select(m_ICmp(Pred, m_Value(Op), m_APInt(C)), + m_Value(TrueVal), m_Value(FalseVal)))) + return false; + + auto IsZeroOrOne = [](const APInt &C) { + return C.isNullValue() || C.isOneValue(); + }; + auto IsMinMax = [&](Value *Min, Value *Max) { + APInt MinVal = APInt::getSignedMinValue(Ty->getScalarSizeInBits()); + APInt MaxVal = APInt::getSignedMaxValue(Ty->getScalarSizeInBits()); + return match(Min, m_SpecificInt(MinVal)) && + match(Max, m_SpecificInt(MaxVal)); + }; + + if (Op != X && Op != Y) + return false; + + if (IsAdd) { + // X + Y overflows ? (X <s 0 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (X <s 1 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (Y <s 0 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (Y <s 1 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y + if (Pred == ICmpInst::ICMP_SLT && IsZeroOrOne(*C) && + IsMinMax(TrueVal, FalseVal)) + return true; + // X + Y overflows ? (X >s 0 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (X >s -1 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (Y >s 0 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (Y >s -1 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y + if (Pred == ICmpInst::ICMP_SGT && IsZeroOrOne(*C + 1) && + IsMinMax(FalseVal, TrueVal)) + return true; + } else { + // X - Y overflows ? (X <s 0 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y + // X - Y overflows ? (X <s -1 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y + if (Op == X && Pred == ICmpInst::ICMP_SLT && IsZeroOrOne(*C + 1) && + IsMinMax(TrueVal, FalseVal)) + return true; + // X - Y overflows ? (X >s -1 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y + // X - Y overflows ? (X >s -2 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y + if (Op == X && Pred == ICmpInst::ICMP_SGT && IsZeroOrOne(*C + 2) && + IsMinMax(FalseVal, TrueVal)) + return true; + // X - Y overflows ? (Y <s 0 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y + // X - Y overflows ? (Y <s 1 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y + if (Op == Y && Pred == ICmpInst::ICMP_SLT && IsZeroOrOne(*C) && + IsMinMax(FalseVal, TrueVal)) + return true; + // X - Y overflows ? (Y >s 0 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y + // X - Y overflows ? (Y >s -1 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y + if (Op == Y && Pred == ICmpInst::ICMP_SGT && IsZeroOrOne(*C + 1) && + IsMinMax(TrueVal, FalseVal)) + return true; + } + + return false; + }; + + Intrinsic::ID NewIntrinsicID; + if (II->getIntrinsicID() == Intrinsic::uadd_with_overflow && + match(TrueVal, m_AllOnes())) + // X + Y overflows ? -1 : X + Y -> uadd_sat X, Y + NewIntrinsicID = Intrinsic::uadd_sat; + else if (II->getIntrinsicID() == Intrinsic::usub_with_overflow && + match(TrueVal, m_Zero())) + // X - Y overflows ? 
0 : X - Y -> usub_sat X, Y + NewIntrinsicID = Intrinsic::usub_sat; + else if (II->getIntrinsicID() == Intrinsic::sadd_with_overflow && + IsSignedSaturateLimit(TrueVal, /*IsAdd=*/true)) + // X + Y overflows ? (X <s 0 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (X <s 1 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (X >s 0 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (X >s -1 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (Y <s 0 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (Y <s 1 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (Y >s 0 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (Y >s -1 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y + NewIntrinsicID = Intrinsic::sadd_sat; + else if (II->getIntrinsicID() == Intrinsic::ssub_with_overflow && + IsSignedSaturateLimit(TrueVal, /*IsAdd=*/false)) + // X - Y overflows ? (X <s 0 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y + // X - Y overflows ? (X <s -1 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y + // X - Y overflows ? (X >s -1 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y + // X - Y overflows ? (X >s -2 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y + // X - Y overflows ? (Y <s 0 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y + // X - Y overflows ? (Y <s 1 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y + // X - Y overflows ? (Y >s 0 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y + // X - Y overflows ? (Y >s -1 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y + NewIntrinsicID = Intrinsic::ssub_sat; + else + return nullptr; + + Function *F = + Intrinsic::getDeclaration(SI.getModule(), NewIntrinsicID, SI.getType()); + return CallInst::Create(F, {X, Y}); +} + Instruction *InstCombiner::foldSelectExtConst(SelectInst &Sel) { Constant *C; if (!match(Sel.getTrueValue(), m_Constant(C)) && @@ -1492,6 +1967,30 @@ static Instruction *canonicalizeSelectToShuffle(SelectInst &SI) { ConstantVector::get(Mask)); } +/// If we have a select of vectors with a scalar condition, try to convert that +/// to a vector select by splatting the condition. A splat may get folded with +/// other operations in IR and having all operands of a select be vector types +/// is likely better for vector codegen. +static Instruction *canonicalizeScalarSelectOfVecs( + SelectInst &Sel, InstCombiner::BuilderTy &Builder) { + Type *Ty = Sel.getType(); + if (!Ty->isVectorTy()) + return nullptr; + + // We can replace a single-use extract with constant index. + Value *Cond = Sel.getCondition(); + if (!match(Cond, m_OneUse(m_ExtractElement(m_Value(), m_ConstantInt())))) + return nullptr; + + // select (extelt V, Index), T, F --> select (splat V, Index), T, F + // Splatting the extracted condition reduces code (we could directly create a + // splat shuffle of the source vector to eliminate the intermediate step). + unsigned NumElts = Ty->getVectorNumElements(); + Value *SplatCond = Builder.CreateVectorSplat(NumElts, Cond); + Sel.setCondition(SplatCond); + return &Sel; +} + /// Reuse bitcasted operands between a compare and select: /// select (cmp (bitcast C), (bitcast D)), (bitcast' C), (bitcast' D) --> /// bitcast (select (cmp (bitcast C), (bitcast D)), (bitcast C), (bitcast D)) @@ -1648,6 +2147,71 @@ static Instruction *moveAddAfterMinMax(SelectPatternFlavor SPF, Value *X, return nullptr; } +/// Match a sadd_sat or ssub_sat which is using min/max to clamp the value. 
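The doc comment above compresses a fairly specific tree; a C++ rendering of the 8-bit-in-32-bit case it targets may help (all names and widths illustrative):

    #include <cstdint>

    int32_t sadd_sat_i8_widened(int8_t a, int8_t b) {
      int32_t s = (int32_t)a + (int32_t)b; // add(sext(A), sext(B))
      s = s > 127 ? 127 : s;               // smin(..., INT8_MAX)
      s = s < -128 ? -128 : s;             // smax(..., INT8_MIN)
      return s;
    }

Here MaxValue + 1 = 128 is a power of two and -MinValue == MaxValue + 1, so NewBitWidth works out to 8 and the whole tree can become sext(llvm.sadd.sat.i8(a, b)) back to i32.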
+Instruction *InstCombiner::matchSAddSubSat(SelectInst &MinMax1) { + Type *Ty = MinMax1.getType(); + + // We are looking for a tree of: + // max(INT_MIN, min(INT_MAX, add(sext(A), sext(B)))) + // Where the min and max could be reversed + Instruction *MinMax2; + BinaryOperator *AddSub; + const APInt *MinValue, *MaxValue; + if (match(&MinMax1, m_SMin(m_Instruction(MinMax2), m_APInt(MaxValue)))) { + if (!match(MinMax2, m_SMax(m_BinOp(AddSub), m_APInt(MinValue)))) + return nullptr; + } else if (match(&MinMax1, + m_SMax(m_Instruction(MinMax2), m_APInt(MinValue)))) { + if (!match(MinMax2, m_SMin(m_BinOp(AddSub), m_APInt(MaxValue)))) + return nullptr; + } else + return nullptr; + + // Check that the constants clamp a saturate, and that the new type would be + // sensible to convert to. + if (!(*MaxValue + 1).isPowerOf2() || -*MinValue != *MaxValue + 1) + return nullptr; + // In what bitwidth can this be treated as saturating arithmetic? + unsigned NewBitWidth = (*MaxValue + 1).logBase2() + 1; + // FIXME: This isn't quite right for vectors, but using the scalar type is a + // good first approximation for what should be done there. + if (!shouldChangeType(Ty->getScalarType()->getIntegerBitWidth(), NewBitWidth)) + return nullptr; + + // Also make sure that the number of uses is as expected. The "3"s are for + // the two items of min/max (the compare and the select). + if (MinMax2->hasNUsesOrMore(3) || AddSub->hasNUsesOrMore(3)) + return nullptr; + + // Create the new type (which can be a vector type) + Type *NewTy = Ty->getWithNewBitWidth(NewBitWidth); + // Match the two extends from the add/sub + Value *A, *B; + if (!match(AddSub, m_BinOp(m_SExt(m_Value(A)), m_SExt(m_Value(B))))) + return nullptr; + // And check the incoming values are of a type smaller than or equal to the + // size of the saturation. Otherwise the higher bits can cause different + // results. + if (A->getType()->getScalarSizeInBits() > NewBitWidth || + B->getType()->getScalarSizeInBits() > NewBitWidth) + return nullptr; + + Intrinsic::ID IntrinsicID; + if (AddSub->getOpcode() == Instruction::Add) + IntrinsicID = Intrinsic::sadd_sat; + else if (AddSub->getOpcode() == Instruction::Sub) + IntrinsicID = Intrinsic::ssub_sat; + else + return nullptr; + + // Finally create and return the sat intrinsic, truncated to the new type + Function *F = Intrinsic::getDeclaration(MinMax1.getModule(), IntrinsicID, NewTy); + Value *AT = Builder.CreateSExt(A, NewTy); + Value *BT = Builder.CreateSExt(B, NewTy); + Value *Sat = Builder.CreateCall(F, {AT, BT}); + return CastInst::Create(Instruction::SExt, Sat, Ty); +} + /// Reduce a sequence of min/max with a common operand. static Instruction *factorizeMinMaxTree(SelectPatternFlavor SPF, Value *LHS, Value *RHS, @@ -1788,6 +2352,9 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { if (Instruction *I = canonicalizeSelectToShuffle(SI)) return I; + if (Instruction *I = canonicalizeScalarSelectOfVecs(SI, Builder)) + return I; + // Canonicalize a one-use integer compare with a non-canonical predicate by // inverting the predicate and swapping the select operands. This matches a // compare canonicalization for conditional branches. @@ -1872,7 +2439,9 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { // See if we are selecting two values based on a comparison of the two values.
if (FCmpInst *FCI = dyn_cast<FCmpInst>(CondVal)) { - if (FCI->getOperand(0) == TrueVal && FCI->getOperand(1) == FalseVal) { + Value *Cmp0 = FCI->getOperand(0), *Cmp1 = FCI->getOperand(1); + if ((Cmp0 == TrueVal && Cmp1 == FalseVal) || + (Cmp0 == FalseVal && Cmp1 == TrueVal)) { // Canonicalize to use ordered comparisons by swapping the select // operands. // @@ -1881,30 +2450,12 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { if (FCI->hasOneUse() && FCmpInst::isUnordered(FCI->getPredicate())) { FCmpInst::Predicate InvPred = FCI->getInversePredicate(); IRBuilder<>::FastMathFlagGuard FMFG(Builder); + // FIXME: The FMF should propagate from the select, not the fcmp. Builder.setFastMathFlags(FCI->getFastMathFlags()); - Value *NewCond = Builder.CreateFCmp(InvPred, TrueVal, FalseVal, + Value *NewCond = Builder.CreateFCmp(InvPred, Cmp0, Cmp1, FCI->getName() + ".inv"); - - return SelectInst::Create(NewCond, FalseVal, TrueVal, - SI.getName() + ".p"); - } - - // NOTE: if we wanted to, this is where to detect MIN/MAX - } else if (FCI->getOperand(0) == FalseVal && FCI->getOperand(1) == TrueVal){ - // Canonicalize to use ordered comparisons by swapping the select - // operands. - // - // e.g. - // (X ugt Y) ? X : Y -> (X ole Y) ? X : Y - if (FCI->hasOneUse() && FCmpInst::isUnordered(FCI->getPredicate())) { - FCmpInst::Predicate InvPred = FCI->getInversePredicate(); - IRBuilder<>::FastMathFlagGuard FMFG(Builder); - Builder.setFastMathFlags(FCI->getFastMathFlags()); - Value *NewCond = Builder.CreateFCmp(InvPred, FalseVal, TrueVal, - FCI->getName() + ".inv"); - - return SelectInst::Create(NewCond, FalseVal, TrueVal, - SI.getName() + ".p"); + Value *NewSel = Builder.CreateSelect(NewCond, FalseVal, TrueVal); + return replaceInstUsesWith(SI, NewSel); } // NOTE: if we wanted to, this is where to detect MIN/MAX @@ -1967,6 +2518,8 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { if (Instruction *Add = foldAddSubSelect(SI, Builder)) return Add; + if (Instruction *Add = foldOverflowingAddSubSelect(SI, Builder)) + return Add; // Turn (select C, (op X, Y), (op X, Z)) -> (op X, (select C, Y, Z)) auto *TI = dyn_cast<Instruction>(TrueVal); @@ -2013,16 +2566,17 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { (LHS->getType()->isFPOrFPVectorTy() && ((CmpLHS != LHS && CmpLHS != RHS) || (CmpRHS != LHS && CmpRHS != RHS)))) { - CmpInst::Predicate Pred = getMinMaxPred(SPF, SPR.Ordered); + CmpInst::Predicate MinMaxPred = getMinMaxPred(SPF, SPR.Ordered); Value *Cmp; - if (CmpInst::isIntPredicate(Pred)) { - Cmp = Builder.CreateICmp(Pred, LHS, RHS); + if (CmpInst::isIntPredicate(MinMaxPred)) { + Cmp = Builder.CreateICmp(MinMaxPred, LHS, RHS); } else { IRBuilder<>::FastMathFlagGuard FMFG(Builder); - auto FMF = cast<FPMathOperator>(SI.getCondition())->getFastMathFlags(); + auto FMF = + cast<FPMathOperator>(SI.getCondition())->getFastMathFlags(); Builder.setFastMathFlags(FMF); - Cmp = Builder.CreateFCmp(Pred, LHS, RHS); + Cmp = Builder.CreateFCmp(MinMaxPred, LHS, RHS); } Value *NewSI = Builder.CreateSelect(Cmp, LHS, RHS, SI.getName(), &SI); @@ -2040,9 +2594,9 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { auto moveNotAfterMinMax = [&](Value *X, Value *Y) -> Instruction * { Value *A; if (match(X, m_Not(m_Value(A))) && !X->hasNUsesOrMore(3) && - !IsFreeToInvert(A, A->hasOneUse()) && + !isFreeToInvert(A, A->hasOneUse()) && // Passing false to only consider m_Not and constants. 
- IsFreeToInvert(Y, false)) { + isFreeToInvert(Y, false)) { Value *B = Builder.CreateNot(Y); Value *NewMinMax = createMinMax(Builder, getInverseMinMaxFlavor(SPF), A, B); @@ -2070,6 +2624,8 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { if (Instruction *I = factorizeMinMaxTree(SPF, LHS, RHS, Builder)) return I; + if (Instruction *I = matchSAddSubSat(SI)) + return I; } } diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp index c821292400cd..fbff5dd4a8cd 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -25,50 +25,332 @@ using namespace PatternMatch; // we should rewrite it as // x shiftopcode (Q+K) iff (Q+K) u< bitwidth(x) // This is valid for any shift, but they must be identical. -static Instruction * -reassociateShiftAmtsOfTwoSameDirectionShifts(BinaryOperator *Sh0, - const SimplifyQuery &SQ) { - // Look for: (x shiftopcode ShAmt0) shiftopcode ShAmt1 - Value *X, *ShAmt1, *ShAmt0; +// +// AnalyzeForSignBitExtraction indicates that we will only analyze whether this +// pattern has any 2 right-shifts that sum to 1 less than the original bit width. +Value *InstCombiner::reassociateShiftAmtsOfTwoSameDirectionShifts( + BinaryOperator *Sh0, const SimplifyQuery &SQ, + bool AnalyzeForSignBitExtraction) { + // Look for a shift of some instruction; ignore zext of shift amount if any. + Instruction *Sh0Op0; + Value *ShAmt0; + if (!match(Sh0, + m_Shift(m_Instruction(Sh0Op0), m_ZExtOrSelf(m_Value(ShAmt0))))) + return nullptr; + + // If there is a truncation between the two shifts, we must make note of it + // and look through it. The truncation imposes additional constraints on the + // transform. Instruction *Sh1; + Value *Trunc = nullptr; + match(Sh0Op0, + m_CombineOr(m_CombineAnd(m_Trunc(m_Instruction(Sh1)), m_Value(Trunc)), + m_Instruction(Sh1))); + + // Inner shift: (x shiftopcode ShAmt1) + // As with the other shift, ignore zext of shift amount if any. + Value *X, *ShAmt1; + if (!match(Sh1, m_Shift(m_Value(X), m_ZExtOrSelf(m_Value(ShAmt1))))) + return nullptr; + + // We have two shift amounts from two different shifts. The types of those + // shift amounts may not match. If that's the case, let's bail out now. + if (ShAmt0->getType() != ShAmt1->getType()) + return nullptr; + + // We are only looking for sign-bit extraction if we have two right shifts. + bool HadTwoRightShifts = match(Sh0, m_Shr(m_Value(), m_Value())) && + match(Sh1, m_Shr(m_Value(), m_Value())); + // ... and if it's not two right-shifts, we know the answer already. + if (AnalyzeForSignBitExtraction && !HadTwoRightShifts) return nullptr; - // The shift opcodes must be identical. + // The shift opcodes must be identical, unless we are just checking whether + // this pattern can be interpreted as a sign-bit-extraction. Instruction::BinaryOps ShiftOpcode = Sh0->getOpcode(); - if (ShiftOpcode != Sh1->getOpcode()) + bool IdenticalShOpcodes = Sh0->getOpcode() == Sh1->getOpcode(); + if (!IdenticalShOpcodes && !AnalyzeForSignBitExtraction) + return nullptr; + + // If we saw truncation, we'll need to produce an extra instruction, + // and for that one of the operands of the shift must be one-use, + // unless of course we don't actually plan to produce any instructions here.
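In C++ terms, the basic reassociation, and the sign-bit special case that the new AnalyzeForSignBitExtraction mode looks for, amount to the following sketch (assuming an arithmetic right shift for signed int, which is what ashr guarantees at the IR level):

    unsigned reassoc(unsigned x) {
      return (x >> 3) >> 5;   // 3 + 5 u< 32, so this is just x >> 8
    }

    int sign_bit(int x) {
      return (x >> 16) >> 15; // 16 + 15 == 31 == bitwidth - 1: x >> 31
    }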
+ if (Trunc && !AnalyzeForSignBitExtraction && + !match(Sh0, m_c_BinOp(m_OneUse(m_Value()), m_Value()))) return nullptr; + // Can we fold (ShAmt0+ShAmt1) ? - Value *NewShAmt = SimplifyBinOp(Instruction::BinaryOps::Add, ShAmt0, ShAmt1, - SQ.getWithInstruction(Sh0)); + auto *NewShAmt = dyn_cast_or_null<Constant>( + SimplifyAddInst(ShAmt0, ShAmt1, /*isNSW=*/false, /*isNUW=*/false, + SQ.getWithInstruction(Sh0))); if (!NewShAmt) return nullptr; // Did not simplify. - // Is the new shift amount smaller than the bit width? - // FIXME: could also rely on ConstantRange. - unsigned BitWidth = X->getType()->getScalarSizeInBits(); + unsigned NewShAmtBitWidth = NewShAmt->getType()->getScalarSizeInBits(); + unsigned XBitWidth = X->getType()->getScalarSizeInBits(); + // Is the new shift amount smaller than the bit width of inner/new shift? if (!match(NewShAmt, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT, - APInt(BitWidth, BitWidth)))) - return nullptr; + APInt(NewShAmtBitWidth, XBitWidth)))) + return nullptr; // FIXME: could perform constant-folding. + + // If there was a truncation, and we have a right-shift, we can only fold if + // we are left with the original sign bit. Likewise, if we were just checking + // that this is a sign-bit extraction, this is the place to check it. + // FIXME: zero shift amount is also legal here, but we can't *easily* check + // more than one predicate so it's not really worth it. + if (HadTwoRightShifts && (Trunc || AnalyzeForSignBitExtraction)) { + // If it's not a sign bit extraction, then we're done. + if (!match(NewShAmt, + m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, + APInt(NewShAmtBitWidth, XBitWidth - 1)))) + return nullptr; + // If it is, and that was the question, return the base value. + if (AnalyzeForSignBitExtraction) + return X; + } + + assert(IdenticalShOpcodes && "Should not get here with different shifts."); + // All good, we can do this fold. + NewShAmt = ConstantExpr::getZExtOrBitCast(NewShAmt, X->getType()); + BinaryOperator *NewShift = BinaryOperator::Create(ShiftOpcode, X, NewShAmt); - // If both of the original shifts had the same flag set, preserve the flag. - if (ShiftOpcode == Instruction::BinaryOps::Shl) { - NewShift->setHasNoUnsignedWrap(Sh0->hasNoUnsignedWrap() && - Sh1->hasNoUnsignedWrap()); - NewShift->setHasNoSignedWrap(Sh0->hasNoSignedWrap() && - Sh1->hasNoSignedWrap()); - } else { - NewShift->setIsExact(Sh0->isExact() && Sh1->isExact()); + + // The flags can only be propagated if there wasn't a trunc. + if (!Trunc) { + // If the pattern did not involve trunc, and both of the original shifts + // had the same flag set, preserve the flag. + if (ShiftOpcode == Instruction::BinaryOps::Shl) { + NewShift->setHasNoUnsignedWrap(Sh0->hasNoUnsignedWrap() && + Sh1->hasNoUnsignedWrap()); + NewShift->setHasNoSignedWrap(Sh0->hasNoSignedWrap() && + Sh1->hasNoSignedWrap()); + } else { + NewShift->setIsExact(Sh0->isExact() && Sh1->isExact()); + } } - return NewShift; + + Instruction *Ret = NewShift; + if (Trunc) { + Builder.Insert(NewShift); + Ret = CastInst::Create(Instruction::Trunc, NewShift, Sh0->getType()); + } + + return Ret; +} + +// If we have some pattern that leaves only some low bits set and then performs +// a left-shift of those bits, and if none of the bits that are left after the +// final shift are modified by the mask, we can omit the mask.
+// +// There are many variants to this pattern: +// a) (x & ((1 << MaskShAmt) - 1)) << ShiftShAmt +// b) (x & (~(-1 << MaskShAmt))) << ShiftShAmt +// c) (x & (-1 >> MaskShAmt)) << ShiftShAmt +// d) (x & ((-1 << MaskShAmt) >> MaskShAmt)) << ShiftShAmt +// e) ((x << MaskShAmt) l>> MaskShAmt) << ShiftShAmt +// f) ((x << MaskShAmt) a>> MaskShAmt) << ShiftShAmt +// All these patterns can be simplified to just: +// x << ShiftShAmt +// iff: +// a,b) (MaskShAmt+ShiftShAmt) u>= bitwidth(x) +// c,d,e,f) (ShiftShAmt-MaskShAmt) s>= 0 (i.e. ShiftShAmt u>= MaskShAmt) +static Instruction * +dropRedundantMaskingOfLeftShiftInput(BinaryOperator *OuterShift, + const SimplifyQuery &Q, + InstCombiner::BuilderTy &Builder) { + assert(OuterShift->getOpcode() == Instruction::BinaryOps::Shl && + "The input must be 'shl'!"); + + Value *Masked, *ShiftShAmt; + match(OuterShift, + m_Shift(m_Value(Masked), m_ZExtOrSelf(m_Value(ShiftShAmt)))); + + // *If* there is a truncation between an outer shift and a possible mask, + // then said truncation *must* be one-use, else we can't perform the fold. + Value *Trunc; + if (match(Masked, m_CombineAnd(m_Trunc(m_Value(Masked)), m_Value(Trunc))) && + !Trunc->hasOneUse()) + return nullptr; + + Type *NarrowestTy = OuterShift->getType(); + Type *WidestTy = Masked->getType(); + bool HadTrunc = WidestTy != NarrowestTy; + + // The mask must be computed in a type twice as wide to ensure + // that no bits are lost if the sum-of-shifts is wider than the base type. + Type *ExtendedTy = WidestTy->getExtendedType(); + + Value *MaskShAmt; + + // ((1 << MaskShAmt) - 1) + auto MaskA = m_Add(m_Shl(m_One(), m_Value(MaskShAmt)), m_AllOnes()); + // (~(-1 << maskNbits)) + auto MaskB = m_Xor(m_Shl(m_AllOnes(), m_Value(MaskShAmt)), m_AllOnes()); + // (-1 >> MaskShAmt) + auto MaskC = m_Shr(m_AllOnes(), m_Value(MaskShAmt)); + // ((-1 << MaskShAmt) >> MaskShAmt) + auto MaskD = + m_Shr(m_Shl(m_AllOnes(), m_Value(MaskShAmt)), m_Deferred(MaskShAmt)); + + Value *X; + Constant *NewMask; + + if (match(Masked, m_c_And(m_CombineOr(MaskA, MaskB), m_Value(X)))) { + // Peek through an optional zext of the shift amount. + match(MaskShAmt, m_ZExtOrSelf(m_Value(MaskShAmt))); + + // We have two shift amounts from two different shifts. The types of those + // shift amounts may not match. If that's the case, let's bail out now. + if (MaskShAmt->getType() != ShiftShAmt->getType()) + return nullptr; + + // Can we simplify (MaskShAmt+ShiftShAmt) ? + auto *SumOfShAmts = dyn_cast_or_null<Constant>(SimplifyAddInst( + MaskShAmt, ShiftShAmt, /*IsNSW=*/false, /*IsNUW=*/false, Q)); + if (!SumOfShAmts) + return nullptr; // Did not simplify. + // In this pattern SumOfShAmts correlates with the number of low bits + // that shall remain in the root value (OuterShift). + + // An extend of an undef value becomes zero because the high bits are never + // completely unknown. Replace the `undef` shift amounts with the final + // shift bitwidth to ensure that the value remains undef when creating the + // subsequent shift op.
+ SumOfShAmts = Constant::replaceUndefsWith( + SumOfShAmts, ConstantInt::get(SumOfShAmts->getType()->getScalarType(), + ExtendedTy->getScalarSizeInBits())); + auto *ExtendedSumOfShAmts = ConstantExpr::getZExt(SumOfShAmts, ExtendedTy); + // And compute the mask as usual: ~(-1 << (SumOfShAmts)) + auto *ExtendedAllOnes = ConstantExpr::getAllOnesValue(ExtendedTy); + auto *ExtendedInvertedMask = + ConstantExpr::getShl(ExtendedAllOnes, ExtendedSumOfShAmts); + NewMask = ConstantExpr::getNot(ExtendedInvertedMask); + } else if (match(Masked, m_c_And(m_CombineOr(MaskC, MaskD), m_Value(X))) || + match(Masked, m_Shr(m_Shl(m_Value(X), m_Value(MaskShAmt)), + m_Deferred(MaskShAmt)))) { + // Peek through an optional zext of the shift amount. + match(MaskShAmt, m_ZExtOrSelf(m_Value(MaskShAmt))); + + // We have two shift amounts from two different shifts. The types of those + // shift amounts may not match. If that's the case, let's bail out now. + if (MaskShAmt->getType() != ShiftShAmt->getType()) + return nullptr; + + // Can we simplify (ShiftShAmt-MaskShAmt) ? + auto *ShAmtsDiff = dyn_cast_or_null<Constant>(SimplifySubInst( + ShiftShAmt, MaskShAmt, /*IsNSW=*/false, /*IsNUW=*/false, Q)); + if (!ShAmtsDiff) + return nullptr; // Did not simplify. + // In this pattern ShAmtsDiff correlates with the number of high bits that + // shall be unset in the root value (OuterShift). + + // An extend of an undef value becomes zero because the high bits are never + // completely unknown. Replace the `undef` shift amounts with the negated + // bitwidth of the innermost shift to ensure that the value remains undef + // when creating the subsequent shift op. + unsigned WidestTyBitWidth = WidestTy->getScalarSizeInBits(); + ShAmtsDiff = Constant::replaceUndefsWith( + ShAmtsDiff, ConstantInt::get(ShAmtsDiff->getType()->getScalarType(), + -WidestTyBitWidth)); + auto *ExtendedNumHighBitsToClear = ConstantExpr::getZExt( + ConstantExpr::getSub(ConstantInt::get(ShAmtsDiff->getType(), + WidestTyBitWidth, + /*isSigned=*/false), + ShAmtsDiff), + ExtendedTy); + // And compute the mask as usual: (-1 l>> (NumHighBitsToClear)) + auto *ExtendedAllOnes = ConstantExpr::getAllOnesValue(ExtendedTy); + NewMask = + ConstantExpr::getLShr(ExtendedAllOnes, ExtendedNumHighBitsToClear); + } else + return nullptr; // Don't know anything about this pattern. + + NewMask = ConstantExpr::getTrunc(NewMask, NarrowestTy); + + // Does this mask have any unset bits? If not, then we can just not apply it. + bool NeedMask = !match(NewMask, m_AllOnes()); + + // If we need to apply a mask, there are several more restrictions to satisfy. + if (NeedMask) { + // The old masking instruction must go away. + if (!Masked->hasOneUse()) + return nullptr; + // The original "masking" instruction must not have been `ashr`. + if (match(Masked, m_AShr(m_Value(), m_Value()))) + return nullptr; + } + + // If we need to apply truncation, let's do it first, since we can. + // We have already ensured that the old truncation will go away. + if (HadTrunc) + X = Builder.CreateTrunc(X, NarrowestTy); + + // No 'NUW'/'NSW'! We no longer know that we won't shift-out non-0 bits. + // We didn't change the Type of this outermost shift, so we can just do it.
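Variant (a) of the pattern handled above, with illustrative shift amounts MaskShAmt = 24 and ShiftShAmt = 8 on a 32-bit value:

    unsigned mask_then_shift(unsigned x) {
      return (x & ((1u << 24) - 1)) << 8; // keep the low 24 bits, then shl 8
    }

Since 24 + 8 u>= 32, every bit cleared by the mask is shifted out anyway, so the whole expression simplifies to x << 8 and NeedMask ends up false.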
+ auto *NewShift = BinaryOperator::Create(OuterShift->getOpcode(), X, + OuterShift->getOperand(1)); + if (!NeedMask) + return NewShift; + + Builder.Insert(NewShift); + return BinaryOperator::Create(Instruction::And, NewShift, NewMask); +} + +/// If we have a shift-by-constant of a bitwise logic op that itself has a +/// shift-by-constant operand with identical opcode, we may be able to convert +/// that into 2 independent shifts followed by the logic op. This eliminates a +/// use of an intermediate value (and reduces the dependency chain). +static Instruction *foldShiftOfShiftedLogic(BinaryOperator &I, + InstCombiner::BuilderTy &Builder) { + assert(I.isShift() && "Expected a shift as input"); + auto *LogicInst = dyn_cast<BinaryOperator>(I.getOperand(0)); + if (!LogicInst || !LogicInst->isBitwiseLogicOp() || !LogicInst->hasOneUse()) + return nullptr; + + const APInt *C0, *C1; + if (!match(I.getOperand(1), m_APInt(C1))) + return nullptr; + + Instruction::BinaryOps ShiftOpcode = I.getOpcode(); + Type *Ty = I.getType(); + + // Find a matching one-use shift by constant. The fold is not valid if the sum + // of the shift values equals or exceeds the bitwidth. + // TODO: Remove the one-use check if the other logic operand (Y) is constant. + Value *X, *Y; + auto matchFirstShift = [&](Value *V) { + return !isa<ConstantExpr>(V) && + match(V, m_OneUse(m_Shift(m_Value(X), m_APInt(C0)))) && + cast<BinaryOperator>(V)->getOpcode() == ShiftOpcode && + (*C0 + *C1).ult(Ty->getScalarSizeInBits()); + }; + + // Logic ops are commutative, so check each operand for a match. + if (matchFirstShift(LogicInst->getOperand(0))) + Y = LogicInst->getOperand(1); + else if (matchFirstShift(LogicInst->getOperand(1))) + Y = LogicInst->getOperand(0); + else + return nullptr; + + // shift (logic (shift X, C0), Y), C1 -> logic (shift X, C0+C1), (shift Y, C1) + Constant *ShiftSumC = ConstantInt::get(Ty, *C0 + *C1); + Value *NewShift1 = Builder.CreateBinOp(ShiftOpcode, X, ShiftSumC); + Value *NewShift2 = Builder.CreateBinOp(ShiftOpcode, Y, I.getOperand(1)); + return BinaryOperator::Create(LogicInst->getOpcode(), NewShift1, NewShift2); } Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); assert(Op0->getType() == Op1->getType()); + // If the shift amount is a one-use `sext`, we can demote it to `zext`. + Value *Y; + if (match(Op1, m_OneUse(m_SExt(m_Value(Y))))) { + Value *NewExt = Builder.CreateZExt(Y, I.getType(), Op1->getName()); + return BinaryOperator::Create(I.getOpcode(), Op0, NewExt); + } + // See if we can fold away this shift.
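A C++ sketch of the foldShiftOfShiftedLogic rewrite above, legal here because C0 + C1 = 5 u< 32:

    unsigned shifted_logic(unsigned x, unsigned y) {
      return ((x << 3) ^ y) << 2;  // shift (logic (shift X, C0), Y), C1
    }
    // -> logic (shift X, C0+C1), (shift Y, C1):
    unsigned shifted_logic_folded(unsigned x, unsigned y) {
      return (x << 5) ^ (y << 2);
    }

The instruction count stays the same, but the two shifts in the folded form are independent, so the dependency chain through the intermediate shift is broken.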
if (SimplifyDemandedInstructionBits(I)) return &I; @@ -83,8 +365,8 @@ Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) { if (Instruction *Res = FoldShiftByConstant(Op0, CUI, I)) return Res; - if (Instruction *NewShift = - reassociateShiftAmtsOfTwoSameDirectionShifts(&I, SQ)) + if (auto *NewShift = cast_or_null<Instruction>( + reassociateShiftAmtsOfTwoSameDirectionShifts(&I, SQ))) return NewShift; // (C1 shift (A add C2)) -> (C1 shift C2) shift A) @@ -110,6 +392,9 @@ Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) { return &I; } + if (Instruction *Logic = foldShiftOfShiftedLogic(I, Builder)) + return Logic; + return nullptr; } @@ -618,9 +903,10 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1, } Instruction *InstCombiner::visitShl(BinaryOperator &I) { + const SimplifyQuery Q = SQ.getWithInstruction(&I); + if (Value *V = SimplifyShlInst(I.getOperand(0), I.getOperand(1), - I.hasNoSignedWrap(), I.hasNoUnsignedWrap(), - SQ.getWithInstruction(&I))) + I.hasNoSignedWrap(), I.hasNoUnsignedWrap(), Q)) return replaceInstUsesWith(I, V); if (Instruction *X = foldVectorBinop(I)) @@ -629,6 +915,9 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) { if (Instruction *V = commonShiftTransforms(I)) return V; + if (Instruction *V = dropRedundantMaskingOfLeftShiftInput(&I, Q, Builder)) + return V; + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); Type *Ty = I.getType(); unsigned BitWidth = Ty->getScalarSizeInBits(); @@ -636,12 +925,11 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) { const APInt *ShAmtAPInt; if (match(Op1, m_APInt(ShAmtAPInt))) { unsigned ShAmt = ShAmtAPInt->getZExtValue(); - unsigned BitWidth = Ty->getScalarSizeInBits(); // shl (zext X), ShAmt --> zext (shl X, ShAmt) // This is only valid if X would have zeros shifted out. Value *X; - if (match(Op0, m_ZExt(m_Value(X)))) { + if (match(Op0, m_OneUse(m_ZExt(m_Value(X))))) { unsigned SrcWidth = X->getType()->getScalarSizeInBits(); if (ShAmt < SrcWidth && MaskedValueIsZero(X, APInt::getHighBitsSet(SrcWidth, ShAmt), 0, &I)) @@ -719,6 +1007,12 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) { // (X * C2) << C1 --> X * (C2 << C1) if (match(Op0, m_Mul(m_Value(X), m_Constant(C2)))) return BinaryOperator::CreateMul(X, ConstantExpr::getShl(C2, C1)); + + // shl (zext i1 X), C1 --> select (X, 1 << C1, 0) + if (match(Op0, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) { + auto *NewC = ConstantExpr::getShl(ConstantInt::get(Ty, 1), C1); + return SelectInst::Create(X, NewC, ConstantInt::getNullValue(Ty)); + } } // (1 << (C - x)) -> ((1 << C) >> x) if C is bitwidth - 1 @@ -859,6 +1153,75 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) { return nullptr; } +Instruction * +InstCombiner::foldVariableSignZeroExtensionOfVariableHighBitExtract( + BinaryOperator &OldAShr) { + assert(OldAShr.getOpcode() == Instruction::AShr && + "Must be called with arithmetic right-shift instruction only."); + + // Check that constant C is a splat of the element-wise bitwidth of V. 
+ auto BitWidthSplat = [](Constant *C, Value *V) { + return match( + C, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, + APInt(C->getType()->getScalarSizeInBits(), + V->getType()->getScalarSizeInBits()))); + }; + + // It should look like variable-length sign-extension on the outside: + // (Val << (bitwidth(Val)-Nbits)) a>> (bitwidth(Val)-Nbits) + Value *NBits; + Instruction *MaybeTrunc; + Constant *C1, *C2; + if (!match(&OldAShr, + m_AShr(m_Shl(m_Instruction(MaybeTrunc), + m_ZExtOrSelf(m_Sub(m_Constant(C1), + m_ZExtOrSelf(m_Value(NBits))))), + m_ZExtOrSelf(m_Sub(m_Constant(C2), + m_ZExtOrSelf(m_Deferred(NBits)))))) || + !BitWidthSplat(C1, &OldAShr) || !BitWidthSplat(C2, &OldAShr)) + return nullptr; + + // There may or may not be a truncation after the outer two shifts. + Instruction *HighBitExtract; + match(MaybeTrunc, m_TruncOrSelf(m_Instruction(HighBitExtract))); + bool HadTrunc = MaybeTrunc != HighBitExtract; + + // And finally, the innermost part of the pattern must be a right-shift. + Value *X, *NumLowBitsToSkip; + if (!match(HighBitExtract, m_Shr(m_Value(X), m_Value(NumLowBitsToSkip)))) + return nullptr; + + // Said right-shift must extract high NBits bits - C0 must be its bitwidth. + Constant *C0; + if (!match(NumLowBitsToSkip, + m_ZExtOrSelf( + m_Sub(m_Constant(C0), m_ZExtOrSelf(m_Specific(NBits))))) || + !BitWidthSplat(C0, HighBitExtract)) + return nullptr; + + // Since the NBits is identical for all shifts, if the outermost and + // innermost shifts are identical, then the outermost shifts are redundant. + // If we had truncation, do keep it though. + if (HighBitExtract->getOpcode() == OldAShr.getOpcode()) + return replaceInstUsesWith(OldAShr, MaybeTrunc); + + // Else, if there was a truncation, then we need to ensure that one + // instruction will go away. + if (HadTrunc && !match(&OldAShr, m_c_BinOp(m_OneUse(m_Value()), m_Value()))) + return nullptr; + + // Finally, bypass the two innermost shifts, and perform the outermost shift + // on the operands of the innermost shift. + Instruction *NewAShr = + BinaryOperator::Create(OldAShr.getOpcode(), X, NumLowBitsToSkip); + NewAShr->copyIRFlags(HighBitExtract); // We can preserve 'exact'-ness. + if (!HadTrunc) + return NewAShr; + + Builder.Insert(NewAShr); + return TruncInst::CreateTruncOrBitCast(NewAShr, OldAShr.getType()); +} + Instruction *InstCombiner::visitAShr(BinaryOperator &I) { if (Value *V = SimplifyAShrInst(I.getOperand(0), I.getOperand(1), I.isExact(), SQ.getWithInstruction(&I))) @@ -933,6 +1296,9 @@ Instruction *InstCombiner::visitAShr(BinaryOperator &I) { } } + if (Instruction *R = foldVariableSignZeroExtensionOfVariableHighBitExtract(I)) + return R; + // See if we can turn a signed shr into an unsigned shr.
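The outer shape handled above, written as C++ for a 32-bit value (for 0 < n < 32, and assuming two's-complement wrap on the left shift and an arithmetic right shift, both of which hold for the IR pattern):

    int sext_low_nbits(int x, int n) {
      return (x << (32 - n)) >> (32 - n); // sign-extend the low n bits of x
    }

When x is itself a variable high-bit extract with the same n, i.e. x = src >> (32 - n), the shl/ashr pair is redundant: the fold bypasses it and performs the outer shift directly on the operands of the innermost shift, keeping a trunc if one was present.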
if (MaskedValueIsZero(Op0, APInt::getSignMask(BitWidth), 0, &I)) return BinaryOperator::CreateLShr(Op0, Op1); diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index e0d85c4b49ae..47ce83974c8d 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -14,6 +14,8 @@ #include "InstCombineInternal.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/IntrinsicsX86.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/KnownBits.h" @@ -348,8 +350,36 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?"); // If the operands are constants, see if we can simplify them. - if (ShrinkDemandedConstant(I, 1, DemandedMask) || - ShrinkDemandedConstant(I, 2, DemandedMask)) + // This is similar to ShrinkDemandedConstant, but for a select we want to + // try to keep the selected constants the same as icmp value constants, if + // we can. This helps not break apart (or helps put back together) + // canonical patterns like min and max. + auto CanonicalizeSelectConstant = [](Instruction *I, unsigned OpNo, + APInt DemandedMask) { + const APInt *SelC; + if (!match(I->getOperand(OpNo), m_APInt(SelC))) + return false; + + // Get the constant out of the ICmp, if there is one. + const APInt *CmpC; + ICmpInst::Predicate Pred; + if (!match(I->getOperand(0), m_c_ICmp(Pred, m_APInt(CmpC), m_Value())) || + CmpC->getBitWidth() != SelC->getBitWidth()) + return ShrinkDemandedConstant(I, OpNo, DemandedMask); + + // If the constant is already the same as the ICmp, leave it as-is. + if (*CmpC == *SelC) + return false; + // If the constants are not already the same, but can be with the demand + // mask, use the constant value from the ICmp. + if ((*CmpC & DemandedMask) == (*SelC & DemandedMask)) { + I->setOperand(OpNo, ConstantInt::get(I->getType(), *CmpC)); + return true; + } + return ShrinkDemandedConstant(I, OpNo, DemandedMask); + }; + if (CanonicalizeSelectConstant(I, 1, DemandedMask) || + CanonicalizeSelectConstant(I, 2, DemandedMask)) return I; // Only known if known in both the LHS and RHS. @@ -971,6 +1001,13 @@ InstCombiner::simplifyShrShlDemandedBits(Instruction *Shr, const APInt &ShrOp1, Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II, APInt DemandedElts, int DMaskIdx) { + + // FIXME: Allow v3i16/v3f16 in buffer intrinsics when the types are fully supported. + if (DMaskIdx < 0 && + II->getType()->getScalarSizeInBits() != 32 && + DemandedElts.getActiveBits() == 3) + return nullptr; + unsigned VWidth = II->getType()->getVectorNumElements(); if (VWidth == 1) return nullptr; @@ -1067,16 +1104,22 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II, } /// The specified value produces a vector with any number of elements. +/// This method analyzes which elements of the operand are undef and returns +/// that information in UndefElts. +/// /// DemandedElts contains the set of elements that are actually used by the -/// caller. This method analyzes which elements of the operand are undef and -/// returns that information in UndefElts. 
+/// caller, and by default (AllowMultipleUsers equals false) the value is +/// simplified only if it has a single caller. If AllowMultipleUsers is set +/// to true, DemandedElts refers to the union of sets of elements that are +/// used by all callers. /// /// If the information about demanded elements can be used to simplify the /// operation, the operation is simplified, then the resultant value is /// returned. This returns null if no change was made. Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, APInt &UndefElts, - unsigned Depth) { + unsigned Depth, + bool AllowMultipleUsers) { unsigned VWidth = V->getType()->getVectorNumElements(); APInt EltMask(APInt::getAllOnesValue(VWidth)); assert((DemandedElts & ~EltMask) == 0 && "Invalid DemandedElts!"); @@ -1130,19 +1173,21 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, if (Depth == 10) return nullptr; - // If multiple users are using the root value, proceed with - // simplification conservatively assuming that all elements - // are needed. - if (!V->hasOneUse()) { - // Quit if we find multiple users of a non-root value though. - // They'll be handled when it's their turn to be visited by - // the main instcombine process. - if (Depth != 0) - // TODO: Just compute the UndefElts information recursively. - return nullptr; + if (!AllowMultipleUsers) { + // If multiple users are using the root value, proceed with + // simplification conservatively assuming that all elements + // are needed. + if (!V->hasOneUse()) { + // Quit if we find multiple users of a non-root value though. + // They'll be handled when it's their turn to be visited by + // the main instcombine process. + if (Depth != 0) + // TODO: Just compute the UndefElts information recursively. + return nullptr; - // Conservatively assume that all elements are needed. - DemandedElts = EltMask; + // Conservatively assume that all elements are needed. + DemandedElts = EltMask; + } } Instruction *I = dyn_cast<Instruction>(V); @@ -1232,30 +1277,57 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, break; } case Instruction::ShuffleVector: { - ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(I); - unsigned LHSVWidth = - Shuffle->getOperand(0)->getType()->getVectorNumElements(); - APInt LeftDemanded(LHSVWidth, 0), RightDemanded(LHSVWidth, 0); + auto *Shuffle = cast<ShuffleVectorInst>(I); + assert(Shuffle->getOperand(0)->getType() == + Shuffle->getOperand(1)->getType() && + "Expected shuffle operands to have same type"); + unsigned OpWidth = + Shuffle->getOperand(0)->getType()->getVectorNumElements(); + APInt LeftDemanded(OpWidth, 0), RightDemanded(OpWidth, 0); for (unsigned i = 0; i < VWidth; i++) { if (DemandedElts[i]) { unsigned MaskVal = Shuffle->getMaskValue(i); if (MaskVal != -1u) { - assert(MaskVal < LHSVWidth * 2 && + assert(MaskVal < OpWidth * 2 && "shufflevector mask index out of range!"); - if (MaskVal < LHSVWidth) + if (MaskVal < OpWidth) LeftDemanded.setBit(MaskVal); else - RightDemanded.setBit(MaskVal - LHSVWidth); + RightDemanded.setBit(MaskVal - OpWidth); } } } - APInt LHSUndefElts(LHSVWidth, 0); + APInt LHSUndefElts(OpWidth, 0); simplifyAndSetOp(I, 0, LeftDemanded, LHSUndefElts); - APInt RHSUndefElts(LHSVWidth, 0); + APInt RHSUndefElts(OpWidth, 0); simplifyAndSetOp(I, 1, RightDemanded, RHSUndefElts); + // If this shuffle does not change the vector length and the elements + // demanded by this shuffle are an identity mask, then this shuffle is + // unnecessary. 
+ // + // We are assuming canonical form for the mask, so the source vector is + // operand 0 and operand 1 is not used. + // + // Note that if an element is demanded and this shuffle mask is undefined + // for that element, then the shuffle is not considered an identity + // operation. The shuffle prevents poison from the operand vector from + // leaking to the result by replacing poison with an undefined value. + if (VWidth == OpWidth) { + bool IsIdentityShuffle = true; + for (unsigned i = 0; i < VWidth; i++) { + unsigned MaskVal = Shuffle->getMaskValue(i); + if (DemandedElts[i] && i != MaskVal) { + IsIdentityShuffle = false; + break; + } + } + if (IsIdentityShuffle) + return Shuffle->getOperand(0); + } + bool NewUndefElts = false; unsigned LHSIdx = -1u, LHSValIdx = -1u; unsigned RHSIdx = -1u, RHSValIdx = -1u; @@ -1268,23 +1340,23 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, } else if (!DemandedElts[i]) { NewUndefElts = true; UndefElts.setBit(i); - } else if (MaskVal < LHSVWidth) { + } else if (MaskVal < OpWidth) { if (LHSUndefElts[MaskVal]) { NewUndefElts = true; UndefElts.setBit(i); } else { - LHSIdx = LHSIdx == -1u ? i : LHSVWidth; - LHSValIdx = LHSValIdx == -1u ? MaskVal : LHSVWidth; + LHSIdx = LHSIdx == -1u ? i : OpWidth; + LHSValIdx = LHSValIdx == -1u ? MaskVal : OpWidth; LHSUniform = LHSUniform && (MaskVal == i); } } else { - if (RHSUndefElts[MaskVal - LHSVWidth]) { + if (RHSUndefElts[MaskVal - OpWidth]) { NewUndefElts = true; UndefElts.setBit(i); } else { - RHSIdx = RHSIdx == -1u ? i : LHSVWidth; - RHSValIdx = RHSValIdx == -1u ? MaskVal - LHSVWidth : LHSVWidth; - RHSUniform = RHSUniform && (MaskVal - LHSVWidth == i); + RHSIdx = RHSIdx == -1u ? i : OpWidth; + RHSValIdx = RHSValIdx == -1u ? MaskVal - OpWidth : OpWidth; + RHSUniform = RHSUniform && (MaskVal - OpWidth == i); } } } @@ -1293,20 +1365,20 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, // this constant vector to single insertelement instruction. // shufflevector V, C, <v1, v2, .., ci, .., vm> -> // insertelement V, C[ci], ci-n - if (LHSVWidth == Shuffle->getType()->getNumElements()) { + if (OpWidth == Shuffle->getType()->getNumElements()) { Value *Op = nullptr; Constant *Value = nullptr; unsigned Idx = -1u; // Find constant vector with the single element in shuffle (LHS or RHS). 
- if (LHSIdx < LHSVWidth && RHSUniform) { + if (LHSIdx < OpWidth && RHSUniform) { if (auto *CV = dyn_cast<ConstantVector>(Shuffle->getOperand(0))) { Op = Shuffle->getOperand(1); Value = CV->getOperand(LHSValIdx); Idx = LHSIdx; } } - if (RHSIdx < LHSVWidth && LHSUniform) { + if (RHSIdx < OpWidth && LHSUniform) { if (auto *CV = dyn_cast<ConstantVector>(Shuffle->getOperand(1))) { Op = Shuffle->getOperand(0); Value = CV->getOperand(RHSValIdx); @@ -1674,8 +1746,11 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, case Intrinsic::amdgcn_buffer_load_format: case Intrinsic::amdgcn_raw_buffer_load: case Intrinsic::amdgcn_raw_buffer_load_format: + case Intrinsic::amdgcn_raw_tbuffer_load: case Intrinsic::amdgcn_struct_buffer_load: case Intrinsic::amdgcn_struct_buffer_load_format: + case Intrinsic::amdgcn_struct_tbuffer_load: + case Intrinsic::amdgcn_tbuffer_load: return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts); default: { if (getAMDGPUImageDMaskIntrinsic(II->getIntrinsicID())) diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index dc9abdd7f47a..f604c9dc32ca 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -253,6 +253,69 @@ static Instruction *foldBitcastExtElt(ExtractElementInst &Ext, return nullptr; } +/// Find elements of V demanded by UserInstr. +static APInt findDemandedEltsBySingleUser(Value *V, Instruction *UserInstr) { + unsigned VWidth = V->getType()->getVectorNumElements(); + + // Conservatively assume that all elements are needed. + APInt UsedElts(APInt::getAllOnesValue(VWidth)); + + switch (UserInstr->getOpcode()) { + case Instruction::ExtractElement: { + ExtractElementInst *EEI = cast<ExtractElementInst>(UserInstr); + assert(EEI->getVectorOperand() == V); + ConstantInt *EEIIndexC = dyn_cast<ConstantInt>(EEI->getIndexOperand()); + if (EEIIndexC && EEIIndexC->getValue().ult(VWidth)) { + UsedElts = APInt::getOneBitSet(VWidth, EEIIndexC->getZExtValue()); + } + break; + } + case Instruction::ShuffleVector: { + ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(UserInstr); + unsigned MaskNumElts = UserInstr->getType()->getVectorNumElements(); + + UsedElts = APInt(VWidth, 0); + for (unsigned i = 0; i < MaskNumElts; i++) { + unsigned MaskVal = Shuffle->getMaskValue(i); + if (MaskVal == -1u || MaskVal >= 2 * VWidth) + continue; + if (Shuffle->getOperand(0) == V && (MaskVal < VWidth)) + UsedElts.setBit(MaskVal); + if (Shuffle->getOperand(1) == V && + ((MaskVal >= VWidth) && (MaskVal < 2 * VWidth))) + UsedElts.setBit(MaskVal - VWidth); + } + break; + } + default: + break; + } + return UsedElts; +} + +/// Find union of elements of V demanded by all its users. +/// If it is known by querying findDemandedEltsBySingleUser that +/// no user demands an element of V, then the corresponding bit +/// remains unset in the returned value. 
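A bitmask sketch of the union that findDemandedEltsByAllUsers computes for, say, a <4 x float> source whose only users are two extractelements of lanes 0 and 2 (values illustrative):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint8_t ByExtract0 = 0b0001;             // extractelement %v, i32 0
      uint8_t ByExtract2 = 0b0100;             // extractelement %v, i32 2
      uint8_t Union = ByExtract0 | ByExtract2; // lanes 1 and 3 are never read
      assert(Union == 0b0101);
      return 0;
    }

With that union, SimplifyDemandedVectorElts can be called with AllowMultipleUsers = true and may rewrite the source vector even though it has several users.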
+static APInt findDemandedEltsByAllUsers(Value *V) { + unsigned VWidth = V->getType()->getVectorNumElements(); + + APInt UnionUsedElts(VWidth, 0); + for (const Use &U : V->uses()) { + if (Instruction *I = dyn_cast<Instruction>(U.getUser())) { + UnionUsedElts |= findDemandedEltsBySingleUser(V, I); + } else { + UnionUsedElts = APInt::getAllOnesValue(VWidth); + break; + } + + if (UnionUsedElts.isAllOnesValue()) + break; + } + + return UnionUsedElts; +} + Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { Value *SrcVec = EI.getVectorOperand(); Value *Index = EI.getIndexOperand(); @@ -271,19 +334,35 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { return nullptr; // This instruction only demands the single element from the input vector. - // If the input vector has a single use, simplify it based on this use - // property. - if (SrcVec->hasOneUse() && NumElts != 1) { - APInt UndefElts(NumElts, 0); - APInt DemandedElts(NumElts, 0); - DemandedElts.setBit(IndexC->getZExtValue()); - if (Value *V = SimplifyDemandedVectorElts(SrcVec, DemandedElts, - UndefElts)) { - EI.setOperand(0, V); - return &EI; + if (NumElts != 1) { + // If the input vector has a single use, simplify it based on this use + // property. + if (SrcVec->hasOneUse()) { + APInt UndefElts(NumElts, 0); + APInt DemandedElts(NumElts, 0); + DemandedElts.setBit(IndexC->getZExtValue()); + if (Value *V = + SimplifyDemandedVectorElts(SrcVec, DemandedElts, UndefElts)) { + EI.setOperand(0, V); + return &EI; + } + } else { + // If the input vector has multiple uses, simplify it based on a union + // of all elements used. + APInt DemandedElts = findDemandedEltsByAllUsers(SrcVec); + if (!DemandedElts.isAllOnesValue()) { + APInt UndefElts(NumElts, 0); + if (Value *V = SimplifyDemandedVectorElts( + SrcVec, DemandedElts, UndefElts, 0 /* Depth */, + true /* AllowMultipleUsers */)) { + if (V != SrcVec) { + SrcVec->replaceAllUsesWith(V); + return &EI; + } + } + } } } - if (Instruction *I = foldBitcastExtElt(EI, Builder, DL.isBigEndian())) return I; @@ -766,6 +845,55 @@ static Instruction *foldInsEltIntoSplat(InsertElementInst &InsElt) { return new ShuffleVectorInst(Op0, UndefValue::get(Op0->getType()), NewMask); } +/// Try to fold an extract+insert element into an existing identity shuffle by +/// changing the shuffle's mask to include the index of this insert element. +static Instruction *foldInsEltIntoIdentityShuffle(InsertElementInst &InsElt) { + // Check if the vector operand of this insert is an identity shuffle. + auto *Shuf = dyn_cast<ShuffleVectorInst>(InsElt.getOperand(0)); + if (!Shuf || !isa<UndefValue>(Shuf->getOperand(1)) || + !(Shuf->isIdentityWithExtract() || Shuf->isIdentityWithPadding())) + return nullptr; + + // Check for a constant insertion index. + uint64_t IdxC; + if (!match(InsElt.getOperand(2), m_ConstantInt(IdxC))) + return nullptr; + + // Check if this insert's scalar op is extracted from the identity shuffle's + // input vector. + Value *Scalar = InsElt.getOperand(1); + Value *X = Shuf->getOperand(0); + if (!match(Scalar, m_ExtractElement(m_Specific(X), m_SpecificInt(IdxC)))) + return nullptr; + + // Replace the shuffle mask element at the index of this extract+insert with + // that same index value. 
+ // For example: + // inselt (shuf X, IdMask), (extelt X, IdxC), IdxC --> shuf X, IdMask' + unsigned NumMaskElts = Shuf->getType()->getVectorNumElements(); + SmallVector<Constant *, 16> NewMaskVec(NumMaskElts); + Type *I32Ty = IntegerType::getInt32Ty(Shuf->getContext()); + Constant *NewMaskEltC = ConstantInt::get(I32Ty, IdxC); + Constant *OldMask = Shuf->getMask(); + for (unsigned i = 0; i != NumMaskElts; ++i) { + if (i != IdxC) { + // All mask elements besides the inserted element remain the same. + NewMaskVec[i] = OldMask->getAggregateElement(i); + } else if (OldMask->getAggregateElement(i) == NewMaskEltC) { + // If the mask element was already set, there's nothing to do + // (demanded elements analysis may unset it later). + return nullptr; + } else { + assert(isa<UndefValue>(OldMask->getAggregateElement(i)) && + "Unexpected shuffle mask element for identity shuffle"); + NewMaskVec[i] = NewMaskEltC; + } + } + + Constant *NewMask = ConstantVector::get(NewMaskVec); + return new ShuffleVectorInst(X, Shuf->getOperand(1), NewMask); +} + /// If we have an insertelement instruction feeding into another insertelement /// and the 2nd is inserting a constant into the vector, canonicalize that /// constant insertion before the insertion of a variable: @@ -987,6 +1115,9 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) { if (Instruction *Splat = foldInsEltIntoSplat(IE)) return Splat; + if (Instruction *IdentityShuf = foldInsEltIntoIdentityShuffle(IE)) + return IdentityShuf; + return nullptr; } @@ -1009,17 +1140,23 @@ static bool canEvaluateShuffled(Value *V, ArrayRef<int> Mask, if (Depth == 0) return false; switch (I->getOpcode()) { + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + // Propagating an undefined shuffle mask element to integer div/rem is not + // allowed because those opcodes can create immediate undefined behavior + // from an undefined element in an operand. + if (llvm::any_of(Mask, [](int M){ return M == -1; })) + return false; + LLVM_FALLTHROUGH; case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: case Instruction::FSub: case Instruction::Mul: case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: case Instruction::FRem: case Instruction::Shl: case Instruction::LShr: @@ -1040,9 +1177,7 @@ static bool canEvaluateShuffled(Value *V, ArrayRef<int> Mask, case Instruction::FPExt: case Instruction::GetElementPtr: { // Bail out if we would create longer vector ops. We could allow creating - // longer vector ops, but that may result in more expensive codegen. We - // would also need to limit the transform to avoid undefined behavior for - // integer div/rem. + // longer vector ops, but that may result in more expensive codegen. Type *ITy = I->getType(); if (ITy->isVectorTy() && Mask.size() > ITy->getVectorNumElements()) return false; @@ -1255,20 +1390,6 @@ static Value *evaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) { llvm_unreachable("failed to reorder elements of vector instruction!"); } -static void recognizeIdentityMask(const SmallVectorImpl<int> &Mask, - bool &isLHSID, bool &isRHSID) { - isLHSID = isRHSID = true; - - for (unsigned i = 0, e = Mask.size(); i != e; ++i) { - if (Mask[i] < 0) continue; // Ignore undef values. - // Is this an identity shuffle of the LHS value? - isLHSID &= (Mask[i] == (int)i); - - // Is this an identity shuffle of the RHS value? 
- isRHSID &= (Mask[i]-e == i); - } -} - // Returns true if the shuffle is extracting a contiguous range of values from // LHS, for example: // +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ @@ -1425,9 +1546,11 @@ static Instruction *foldSelectShuffle(ShuffleVectorInst &Shuf, if (!Shuf.isSelect()) return nullptr; - // Canonicalize to choose from operand 0 first. + // Canonicalize to choose from operand 0 first unless operand 1 is undefined. + // Commuting undef to operand 0 conflicts with another canonicalization. unsigned NumElts = Shuf.getType()->getVectorNumElements(); - if (Shuf.getMaskValue(0) >= (int)NumElts) { + if (!isa<UndefValue>(Shuf.getOperand(1)) && + Shuf.getMaskValue(0) >= (int)NumElts) { // TODO: Can we assert that both operands of a shuffle-select are not undef // (otherwise, it would have been folded by instsimplify? Shuf.commute(); @@ -1618,7 +1741,8 @@ static Instruction *foldIdentityExtractShuffle(ShuffleVectorInst &Shuf) { return new ShuffleVectorInst(X, Y, ConstantVector::get(NewMask)); } -/// Try to replace a shuffle with an insertelement. +/// Try to replace a shuffle with an insertelement or try to replace a shuffle +/// operand with the operand of an insertelement. static Instruction *foldShuffleWithInsert(ShuffleVectorInst &Shuf) { Value *V0 = Shuf.getOperand(0), *V1 = Shuf.getOperand(1); SmallVector<int, 16> Mask = Shuf.getShuffleMask(); @@ -1630,6 +1754,31 @@ static Instruction *foldShuffleWithInsert(ShuffleVectorInst &Shuf) { if (NumElts != (int)(V0->getType()->getVectorNumElements())) return nullptr; + // This is a specialization of a fold in SimplifyDemandedVectorElts. We may + // not be able to handle it there if the insertelement has >1 use. + // If the shuffle has an insertelement operand but does not choose the + // inserted scalar element from that value, then we can replace that shuffle + // operand with the source vector of the insertelement. + Value *X; + uint64_t IdxC; + if (match(V0, m_InsertElement(m_Value(X), m_Value(), m_ConstantInt(IdxC)))) { + // shuf (inselt X, ?, IdxC), ?, Mask --> shuf X, ?, Mask + if (none_of(Mask, [IdxC](int MaskElt) { return MaskElt == (int)IdxC; })) { + Shuf.setOperand(0, X); + return &Shuf; + } + } + if (match(V1, m_InsertElement(m_Value(X), m_Value(), m_ConstantInt(IdxC)))) { + // Offset the index constant by the vector width because we are checking for + // accesses to the 2nd vector input of the shuffle. + IdxC += NumElts; + // shuf ?, (inselt X, ?, IdxC), Mask --> shuf ?, X, Mask + if (none_of(Mask, [IdxC](int MaskElt) { return MaskElt == (int)IdxC; })) { + Shuf.setOperand(1, X); + return &Shuf; + } + } + // shuffle (insert ?, Scalar, IndexC), V1, Mask --> insert V1, Scalar, IndexC' auto isShufflingScalarIntoOp1 = [&](Value *&Scalar, ConstantInt *&IndexC) { // We need an insertelement with a constant index. @@ -1756,29 +1905,21 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { LHS, RHS, SVI.getMask(), SVI.getType(), SQ.getWithInstruction(&SVI))) return replaceInstUsesWith(SVI, V); - // Canonicalize shuffle(x ,x,mask) -> shuffle(x, undef,mask') - // Canonicalize shuffle(undef,x,mask) -> shuffle(x, undef,mask'). 
+ // shuffle x, x, mask --> shuffle x, undef, mask' unsigned VWidth = SVI.getType()->getVectorNumElements(); unsigned LHSWidth = LHS->getType()->getVectorNumElements(); SmallVector<int, 16> Mask = SVI.getShuffleMask(); Type *Int32Ty = Type::getInt32Ty(SVI.getContext()); - if (LHS == RHS || isa<UndefValue>(LHS)) { + if (LHS == RHS) { + assert(!isa<UndefValue>(RHS) && "Shuffle with 2 undef ops not simplified?"); // Remap any references to RHS to use LHS. SmallVector<Constant*, 16> Elts; - for (unsigned i = 0, e = LHSWidth; i != VWidth; ++i) { - if (Mask[i] < 0) { - Elts.push_back(UndefValue::get(Int32Ty)); - continue; - } - - if ((Mask[i] >= (int)e && isa<UndefValue>(RHS)) || - (Mask[i] < (int)e && isa<UndefValue>(LHS))) { - Mask[i] = -1; // Turn into undef. + for (unsigned i = 0; i != VWidth; ++i) { + // Propagate undef elements or force mask to LHS. + if (Mask[i] < 0) Elts.push_back(UndefValue::get(Int32Ty)); - } else { - Mask[i] = Mask[i] % e; // Force to LHS. - Elts.push_back(ConstantInt::get(Int32Ty, Mask[i])); - } + else + Elts.push_back(ConstantInt::get(Int32Ty, Mask[i] % LHSWidth)); } SVI.setOperand(0, SVI.getOperand(1)); SVI.setOperand(1, UndefValue::get(RHS->getType())); @@ -1786,6 +1927,12 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { return &SVI; } + // shuffle undef, x, mask --> shuffle x, undef, mask' + if (isa<UndefValue>(LHS)) { + SVI.commute(); + return &SVI; + } + if (Instruction *I = canonicalizeInsertSplat(SVI, Builder)) return I; @@ -1813,16 +1960,6 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { if (Instruction *I = foldIdentityPaddedShuffles(SVI)) return I; - if (VWidth == LHSWidth) { - // Analyze the shuffle, are the LHS or RHS and identity shuffles? - bool isLHSID, isRHSID; - recognizeIdentityMask(Mask, isLHSID, isRHSID); - - // Eliminate identity shuffles. - if (isLHSID) return replaceInstUsesWith(SVI, LHS); - if (isRHSID) return replaceInstUsesWith(SVI, RHS); - } - if (isa<UndefValue>(RHS) && canEvaluateShuffled(LHS, Mask)) { Value *V = evaluateInDifferentElementOrder(LHS, Mask); return replaceInstUsesWith(SVI, V); @@ -2100,12 +2237,5 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { return new ShuffleVectorInst(newLHS, newRHS, ConstantVector::get(Elts)); } - // If the result mask is an identity, replace uses of this instruction with - // corresponding argument. - bool isLHSID, isRHSID; - recognizeIdentityMask(newMask, isLHSID, isRHSID); - if (isLHSID && VWidth == LHSOp0Width) return replaceInstUsesWith(SVI, newLHS); - if (isRHSID && VWidth == RHSOp0Width) return replaceInstUsesWith(SVI, newRHS); - return MadeChange ? 
&SVI : nullptr; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 385f4926b845..bf32996d96e2 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -86,6 +86,7 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CBindingWrapping.h" #include "llvm/Support/Casting.h" @@ -121,6 +122,9 @@ STATISTIC(NumReassoc , "Number of reassociations"); DEBUG_COUNTER(VisitCounter, "instcombine-visit", "Controls which instructions are visited"); +static constexpr unsigned InstCombineDefaultMaxIterations = 1000; +static constexpr unsigned InstCombineDefaultInfiniteLoopThreshold = 1000; + static cl::opt<bool> EnableCodeSinking("instcombine-code-sinking", cl::desc("Enable code sinking"), cl::init(true)); @@ -129,6 +133,17 @@ static cl::opt<bool> EnableExpensiveCombines("expensive-combines", cl::desc("Enable expensive instruction combines")); +static cl::opt<unsigned> LimitMaxIterations( + "instcombine-max-iterations", + cl::desc("Limit the maximum number of instruction combining iterations"), + cl::init(InstCombineDefaultMaxIterations)); + +static cl::opt<unsigned> InfiniteLoopDetectionThreshold( + "instcombine-infinite-loop-threshold", + cl::desc("Number of instruction combining iterations considered an " + "infinite loop"), + cl::init(InstCombineDefaultInfiniteLoopThreshold), cl::Hidden); + static cl::opt<unsigned> MaxArraySize("instcombine-maxarray-size", cl::init(1024), cl::desc("Maximum array size considered when doing a combine")); @@ -200,8 +215,8 @@ bool InstCombiner::shouldChangeType(Type *From, Type *To) const { // where both B and C should be ConstantInts, results in a constant that does // not overflow. This function only handles the Add and Sub opcodes. For // all other opcodes, the function conservatively returns false. -static bool MaintainNoSignedWrap(BinaryOperator &I, Value *B, Value *C) { - OverflowingBinaryOperator *OBO = dyn_cast<OverflowingBinaryOperator>(&I); +static bool maintainNoSignedWrap(BinaryOperator &I, Value *B, Value *C) { + auto *OBO = dyn_cast<OverflowingBinaryOperator>(&I); if (!OBO || !OBO->hasNoSignedWrap()) return false; @@ -224,10 +239,15 @@ static bool MaintainNoSignedWrap(BinaryOperator &I, Value *B, Value *C) { } static bool hasNoUnsignedWrap(BinaryOperator &I) { - OverflowingBinaryOperator *OBO = dyn_cast<OverflowingBinaryOperator>(&I); + auto *OBO = dyn_cast<OverflowingBinaryOperator>(&I); return OBO && OBO->hasNoUnsignedWrap(); } +static bool hasNoSignedWrap(BinaryOperator &I) { + auto *OBO = dyn_cast<OverflowingBinaryOperator>(&I); + return OBO && OBO->hasNoSignedWrap(); +} + /// Conservatively clears subclassOptionalData after a reassociation or /// commutation. We preserve fast-math flags when applicable as they can be /// preserved. @@ -332,22 +352,21 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { // It simplifies to V. Form "A op V". I.setOperand(0, A); I.setOperand(1, V); - // Conservatively clear the optional flags, since they may not be - // preserved by the reassociation. 
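      // For example, reassociating "(X +nsw 1) +nsw 2" into "X +nsw 3" may keep
      // nsw only because 1 + 2 does not itself overflow and the inner add
      // already carried nsw; everything else is cleared conservatively below.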
bool IsNUW = hasNoUnsignedWrap(I) && hasNoUnsignedWrap(*Op0); - bool IsNSW = MaintainNoSignedWrap(I, B, C); + bool IsNSW = maintainNoSignedWrap(I, B, C) && hasNoSignedWrap(*Op0); + // Conservatively clear all optional flags since they may not be + // preserved by the reassociation. Reset nsw/nuw based on the above + // analysis. ClearSubclassDataAfterReassociation(I); + // Note: this is only valid because SimplifyBinOp doesn't look at + // the operands to Op0. if (IsNUW) I.setHasNoUnsignedWrap(true); - if (IsNSW && - (!Op0 || (isa<BinaryOperator>(Op0) && Op0->hasNoSignedWrap()))) { - // Note: this is only valid because SimplifyBinOp doesn't look at - // the operands to Op0. + if (IsNSW) I.setHasNoSignedWrap(true); - } Changed = true; ++NumReassoc; @@ -610,7 +629,6 @@ Value *InstCombiner::tryFactorization(BinaryOperator &I, HasNUW &= ROBO->hasNoUnsignedWrap(); } - const APInt *CInt; if (TopLevelOpcode == Instruction::Add && InnerOpcode == Instruction::Mul) { // We can propagate 'nsw' if we know that @@ -620,6 +638,7 @@ Value *InstCombiner::tryFactorization(BinaryOperator &I, // %Z = mul nsw i16 %X, C+1 // // iff C+1 isn't INT_MIN + const APInt *CInt; if (match(V, m_APInt(CInt))) { if (!CInt->isMinSignedValue()) BO->setHasNoSignedWrap(HasNSW); @@ -755,31 +774,52 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) { Value *InstCombiner::SimplifySelectsFeedingBinaryOp(BinaryOperator &I, Value *LHS, Value *RHS) { + Value *A, *B, *C, *D, *E, *F; + bool LHSIsSelect = match(LHS, m_Select(m_Value(A), m_Value(B), m_Value(C))); + bool RHSIsSelect = match(RHS, m_Select(m_Value(D), m_Value(E), m_Value(F))); + if (!LHSIsSelect && !RHSIsSelect) + return nullptr; + + FastMathFlags FMF; + BuilderTy::FastMathFlagGuard Guard(Builder); + if (isa<FPMathOperator>(&I)) { + FMF = I.getFastMathFlags(); + Builder.setFastMathFlags(FMF); + } + Instruction::BinaryOps Opcode = I.getOpcode(); - // (op (select (a, b, c)), (select (a, d, e))) -> (select (a, (op b, d), (op - // c, e))) - Value *A, *B, *C, *D, *E; - Value *SI = nullptr; - if (match(LHS, m_Select(m_Value(A), m_Value(B), m_Value(C))) && - match(RHS, m_Select(m_Specific(A), m_Value(D), m_Value(E)))) { - bool SelectsHaveOneUse = LHS->hasOneUse() && RHS->hasOneUse(); - BuilderTy::FastMathFlagGuard Guard(Builder); - if (isa<FPMathOperator>(&I)) - Builder.setFastMathFlags(I.getFastMathFlags()); - - Value *V1 = SimplifyBinOp(Opcode, C, E, SQ.getWithInstruction(&I)); - Value *V2 = SimplifyBinOp(Opcode, B, D, SQ.getWithInstruction(&I)); - if (V1 && V2) - SI = Builder.CreateSelect(A, V2, V1); - else if (V2 && SelectsHaveOneUse) - SI = Builder.CreateSelect(A, V2, Builder.CreateBinOp(Opcode, C, E)); - else if (V1 && SelectsHaveOneUse) - SI = Builder.CreateSelect(A, Builder.CreateBinOp(Opcode, B, D), V1); - - if (SI) - SI->takeName(&I); + SimplifyQuery Q = SQ.getWithInstruction(&I); + + Value *Cond, *True = nullptr, *False = nullptr; + if (LHSIsSelect && RHSIsSelect && A == D) { + // (A ? B : C) op (A ? E : F) -> A ? (B op E) : (C op F) + Cond = A; + True = SimplifyBinOp(Opcode, B, E, FMF, Q); + False = SimplifyBinOp(Opcode, C, F, FMF, Q); + + if (LHS->hasOneUse() && RHS->hasOneUse()) { + if (False && !True) + True = Builder.CreateBinOp(Opcode, B, E); + else if (True && !False) + False = Builder.CreateBinOp(Opcode, C, F); + } + } else if (LHSIsSelect && LHS->hasOneUse()) { + // (A ? B : C) op Y -> A ? 
(B op Y) : (C op Y) + Cond = A; + True = SimplifyBinOp(Opcode, B, RHS, FMF, Q); + False = SimplifyBinOp(Opcode, C, RHS, FMF, Q); + } else if (RHSIsSelect && RHS->hasOneUse()) { + // X op (D ? E : F) -> D ? (X op E) : (X op F) + Cond = D; + True = SimplifyBinOp(Opcode, LHS, E, FMF, Q); + False = SimplifyBinOp(Opcode, LHS, F, FMF, Q); } + if (!True || !False) + return nullptr; + + Value *SI = Builder.CreateSelect(Cond, True, False); + SI->takeName(&I); return SI; } @@ -1518,11 +1558,13 @@ Instruction *InstCombiner::foldVectorBinop(BinaryOperator &Inst) { // If this is a widening shuffle, we must be able to extend with undef // elements. If the original binop does not produce an undef in the high // lanes, then this transform is not safe. + // Similarly for undef lanes due to the shuffle mask, we can only + // transform binops that preserve undef. // TODO: We could shuffle those non-undef constant values into the // result by using a constant vector (rather than an undef vector) // as operand 1 of the new binop, but that might be too aggressive // for target-independent shuffle creation. - if (I >= SrcVecNumElts) { + if (I >= SrcVecNumElts || ShMask[I] < 0) { Constant *MaybeUndef = ConstOp1 ? ConstantExpr::get(Opcode, UndefScalar, CElt) : ConstantExpr::get(Opcode, CElt, UndefScalar); @@ -1607,6 +1649,15 @@ Instruction *InstCombiner::narrowMathIfNoOverflow(BinaryOperator &BO) { return CastInst::Create(CastOpc, NarrowBO, BO.getType()); } +static bool isMergedGEPInBounds(GEPOperator &GEP1, GEPOperator &GEP2) { + // At least one GEP must be inbounds. + if (!GEP1.isInBounds() && !GEP2.isInBounds()) + return false; + + return (GEP1.isInBounds() || GEP1.hasAllZeroIndices()) && + (GEP2.isInBounds() || GEP2.hasAllZeroIndices()); +} + Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { SmallVector<Value*, 8> Ops(GEP.op_begin(), GEP.op_end()); Type *GEPType = GEP.getType(); @@ -1659,7 +1710,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // to an index of zero, so replace it with zero if it is not zero already. Type *EltTy = GTI.getIndexedType(); if (EltTy->isSized() && DL.getTypeAllocSize(EltTy) == 0) - if (!isa<Constant>(*I) || !cast<Constant>(*I)->isNullValue()) { + if (!isa<Constant>(*I) || !match(I->get(), m_Zero())) { *I = Constant::getNullValue(NewIndexType); MadeChange = true; } @@ -1716,8 +1767,11 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // The first two arguments can vary for any GEP, the rest have to be // static for struct slots - if (J > 1 && CurTy->isStructTy()) - return nullptr; + if (J > 1) { + assert(CurTy && "No current type?"); + if (CurTy->isStructTy()) + return nullptr; + } DI = J; } else { @@ -1877,6 +1931,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // Update the GEP in place if possible. if (Src->getNumOperands() == 2) { + GEP.setIsInBounds(isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP))); GEP.setOperand(0, Src->getOperand(0)); GEP.setOperand(1, Sum); return &GEP; @@ -1893,7 +1948,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { } if (!Indices.empty()) - return GEP.isInBounds() && Src->isInBounds() + return isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP)) ? 
GetElementPtrInst::CreateInBounds( Src->getSourceElementType(), Src->getOperand(0), Indices, GEP.getName()) @@ -2146,15 +2201,17 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // of a bitcasted pointer to vector or array of the same dimensions: // gep (bitcast <c x ty>* X to [c x ty]*), Y, Z --> gep X, Y, Z // gep (bitcast [c x ty]* X to <c x ty>*), Y, Z --> gep X, Y, Z - auto areMatchingArrayAndVecTypes = [](Type *ArrTy, Type *VecTy) { + auto areMatchingArrayAndVecTypes = [](Type *ArrTy, Type *VecTy, + const DataLayout &DL) { return ArrTy->getArrayElementType() == VecTy->getVectorElementType() && - ArrTy->getArrayNumElements() == VecTy->getVectorNumElements(); + ArrTy->getArrayNumElements() == VecTy->getVectorNumElements() && + DL.getTypeAllocSize(ArrTy) == DL.getTypeAllocSize(VecTy); }; if (GEP.getNumOperands() == 3 && ((GEPEltType->isArrayTy() && SrcEltType->isVectorTy() && - areMatchingArrayAndVecTypes(GEPEltType, SrcEltType)) || + areMatchingArrayAndVecTypes(GEPEltType, SrcEltType, DL)) || (GEPEltType->isVectorTy() && SrcEltType->isArrayTy() && - areMatchingArrayAndVecTypes(SrcEltType, GEPEltType)))) { + areMatchingArrayAndVecTypes(SrcEltType, GEPEltType, DL)))) { // Create a new GEP here, as using `setOperand()` followed by // `setSourceElementType()` won't actually update the type of the @@ -2393,12 +2450,13 @@ Instruction *InstCombiner::visitAllocSite(Instruction &MI) { replaceInstUsesWith(*C, ConstantInt::get(Type::getInt1Ty(C->getContext()), C->isFalseWhenEqual())); - } else if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I) || - isa<AddrSpaceCastInst>(I)) { - replaceInstUsesWith(*I, UndefValue::get(I->getType())); } else if (auto *SI = dyn_cast<StoreInst>(I)) { for (auto *DII : DIIs) ConvertDebugDeclareToDebugValue(DII, SI, *DIB); + } else { + // Casts, GEP, or anything else: we're about to delete this instruction, + // so it can not have any valid uses. + replaceInstUsesWith(*I, UndefValue::get(I->getType())); } eraseInstFromFunction(*I); } @@ -2549,9 +2607,7 @@ Instruction *InstCombiner::visitReturnInst(ReturnInst &RI) { Instruction *InstCombiner::visitBranchInst(BranchInst &BI) { // Change br (not X), label True, label False to: br X, label False, True Value *X = nullptr; - BasicBlock *TrueDest; - BasicBlock *FalseDest; - if (match(&BI, m_Br(m_Not(m_Value(X)), TrueDest, FalseDest)) && + if (match(&BI, m_Br(m_Not(m_Value(X)), m_BasicBlock(), m_BasicBlock())) && !isa<Constant>(X)) { // Swap Destinations and condition... BI.setCondition(X); @@ -2569,8 +2625,8 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) { // Canonicalize, for example, icmp_ne -> icmp_eq or fcmp_one -> fcmp_oeq. CmpInst::Predicate Pred; - if (match(&BI, m_Br(m_OneUse(m_Cmp(Pred, m_Value(), m_Value())), TrueDest, - FalseDest)) && + if (match(&BI, m_Br(m_OneUse(m_Cmp(Pred, m_Value(), m_Value())), + m_BasicBlock(), m_BasicBlock())) && !isCanonicalPredicate(Pred)) { // Swap destinations and condition. 
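    // For instance, a one-use compare with a non-canonical predicate such as
    //   br (icmp ne %a, %b), label %T, label %F
    // becomes
    //   br (icmp eq %a, %b), label %F, label %T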
CmpInst *Cond = cast<CmpInst>(BI.getCondition()); @@ -3105,6 +3161,15 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) { return nullptr; } +Instruction *InstCombiner::visitFreeze(FreezeInst &I) { + Value *Op0 = I.getOperand(0); + + if (Value *V = SimplifyFreezeInst(Op0, SQ.getWithInstruction(&I))) + return replaceInstUsesWith(I, V); + + return nullptr; +} + /// Try to move the specified instruction from its current block into the /// beginning of DestBlock, which can only happen if it's safe to move the /// instruction past all of the instructions between it and the end of its @@ -3156,6 +3221,21 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) { findDbgUsers(DbgUsers, I); for (auto *DII : reverse(DbgUsers)) { if (DII->getParent() == SrcBlock) { + if (isa<DbgDeclareInst>(DII)) { + // A dbg.declare instruction should not be cloned, since there can only be + // one per variable fragment. It should be left in the original place since + // sunk instruction is not an alloca(otherwise we could not be here). + // But we need to update arguments of dbg.declare instruction, so that it + // would not point into sunk instruction. + if (!isa<CastInst>(I)) + continue; // dbg.declare points at something it shouldn't + + DII->setOperand( + 0, MetadataAsValue::get(I->getContext(), + ValueAsMetadata::get(I->getOperand(0)))); + continue; + } + // dbg.value is in the same basic block as the sunk inst, see if we can // salvage it. Clone a new copy of the instruction: on success we need // both salvaged and unsalvaged copies. @@ -3301,10 +3381,6 @@ bool InstCombiner::run() { // Move the name to the new instruction first. Result->takeName(I); - // Push the new instruction and any users onto the worklist. - Worklist.AddUsersToWorkList(*Result); - Worklist.Add(Result); - // Insert the new instruction into the basic block... BasicBlock *InstParent = I->getParent(); BasicBlock::iterator InsertPos = I->getIterator(); @@ -3316,6 +3392,10 @@ bool InstCombiner::run() { InstParent->getInstList().insert(InsertPos, Result); + // Push the new instruction and any users onto the worklist. + Worklist.AddUsersToWorkList(*Result); + Worklist.Add(Result); + eraseInstFromFunction(*I); } else { LLVM_DEBUG(dbgs() << "IC: Mod = " << OrigI << '\n' @@ -3371,8 +3451,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL, if (isInstructionTriviallyDead(Inst, TLI)) { ++NumDeadInst; LLVM_DEBUG(dbgs() << "IC: DCE: " << *Inst << '\n'); - if (!salvageDebugInfo(*Inst)) - replaceDbgUsesWithUndef(Inst); + salvageDebugInfoOrMarkUndef(*Inst); Inst->eraseFromParent(); MadeIRChange = true; continue; @@ -3486,10 +3565,12 @@ static bool combineInstructionsOverFunction( Function &F, InstCombineWorklist &Worklist, AliasAnalysis *AA, AssumptionCache &AC, TargetLibraryInfo &TLI, DominatorTree &DT, OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI, bool ExpensiveCombines = true, - LoopInfo *LI = nullptr) { + ProfileSummaryInfo *PSI, bool ExpensiveCombines, unsigned MaxIterations, + LoopInfo *LI) { auto &DL = F.getParent()->getDataLayout(); - ExpensiveCombines |= EnableExpensiveCombines; + if (EnableExpensiveCombines.getNumOccurrences()) + ExpensiveCombines = EnableExpensiveCombines; + MaxIterations = std::min(MaxIterations, LimitMaxIterations.getValue()); /// Builder - This is an IRBuilder that automatically inserts new /// instructions into the worklist when they are created. 
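The new iteration cap can also be requested programmatically through the two-argument overload added later in this patch. Below is a minimal sketch using the legacy pass manager; the include paths and the surrounding driver function are assumptions for illustration, not part of the change itself.

    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/InstCombine/InstCombine.h"

    // Hypothetical driver: run InstCombine but stop after at most two
    // iterations instead of waiting for a fixpoint.
    static void runBoundedInstCombine(llvm::Module &M) {
      llvm::legacy::PassManager PM;
      PM.add(llvm::createInstructionCombiningPass(/*ExpensiveCombines=*/true,
                                                  /*MaxIterations=*/2));
      PM.run(M);
    }

From the command line, the same limit can be imposed with the -instcombine-max-iterations flag defined above.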
@@ -3508,9 +3589,23 @@ static bool combineInstructionsOverFunction( MadeIRChange = LowerDbgDeclare(F); // Iterate while there is work to do. - int Iteration = 0; + unsigned Iteration = 0; while (true) { ++Iteration; + + if (Iteration > InfiniteLoopDetectionThreshold) { + report_fatal_error( + "Instruction Combining seems stuck in an infinite loop after " + + Twine(InfiniteLoopDetectionThreshold) + " iterations."); + } + + if (Iteration > MaxIterations) { + LLVM_DEBUG(dbgs() << "\n\n[IC] Iteration limit #" << MaxIterations + << " on " << F.getName() + << " reached; stopping before reaching a fixpoint\n"); + break; + } + LLVM_DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on " << F.getName() << "\n"); @@ -3522,11 +3617,19 @@ static bool combineInstructionsOverFunction( if (!IC.run()) break; + + MadeIRChange = true; } - return MadeIRChange || Iteration > 1; + return MadeIRChange; } +InstCombinePass::InstCombinePass(bool ExpensiveCombines) + : ExpensiveCombines(ExpensiveCombines), MaxIterations(LimitMaxIterations) {} + +InstCombinePass::InstCombinePass(bool ExpensiveCombines, unsigned MaxIterations) + : ExpensiveCombines(ExpensiveCombines), MaxIterations(MaxIterations) {} + PreservedAnalyses InstCombinePass::run(Function &F, FunctionAnalysisManager &AM) { auto &AC = AM.getResult<AssumptionAnalysis>(F); @@ -3544,8 +3647,9 @@ PreservedAnalyses InstCombinePass::run(Function &F, auto *BFI = (PSI && PSI->hasProfileSummary()) ? &AM.getResult<BlockFrequencyAnalysis>(F) : nullptr; - if (!combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE, - BFI, PSI, ExpensiveCombines, LI)) + if (!combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE, BFI, + PSI, ExpensiveCombines, MaxIterations, + LI)) // No changes, all analyses are preserved. return PreservedAnalyses::all(); @@ -3580,7 +3684,7 @@ bool InstructionCombiningPass::runOnFunction(Function &F) { // Required analyses. 
auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); @@ -3594,12 +3698,26 @@ bool InstructionCombiningPass::runOnFunction(Function &F) { &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() : nullptr; - return combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE, - BFI, PSI, ExpensiveCombines, LI); + return combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE, BFI, + PSI, ExpensiveCombines, MaxIterations, + LI); } char InstructionCombiningPass::ID = 0; +InstructionCombiningPass::InstructionCombiningPass(bool ExpensiveCombines) + : FunctionPass(ID), ExpensiveCombines(ExpensiveCombines), + MaxIterations(InstCombineDefaultMaxIterations) { + initializeInstructionCombiningPassPass(*PassRegistry::getPassRegistry()); +} + +InstructionCombiningPass::InstructionCombiningPass(bool ExpensiveCombines, + unsigned MaxIterations) + : FunctionPass(ID), ExpensiveCombines(ExpensiveCombines), + MaxIterations(MaxIterations) { + initializeInstructionCombiningPassPass(*PassRegistry::getPassRegistry()); +} + INITIALIZE_PASS_BEGIN(InstructionCombiningPass, "instcombine", "Combine redundant instructions", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) @@ -3626,6 +3744,11 @@ FunctionPass *llvm::createInstructionCombiningPass(bool ExpensiveCombines) { return new InstructionCombiningPass(ExpensiveCombines); } +FunctionPass *llvm::createInstructionCombiningPass(bool ExpensiveCombines, + unsigned MaxIterations) { + return new InstructionCombiningPass(ExpensiveCombines, MaxIterations); +} + void LLVMAddInstructionCombiningPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createInstructionCombiningPass()); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 89a90eab8978..79c119489a65 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -59,6 +59,7 @@ #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" @@ -129,6 +130,8 @@ static const uintptr_t kRetiredStackFrameMagic = 0x45E0360E; static const char *const kAsanModuleCtorName = "asan.module_ctor"; static const char *const kAsanModuleDtorName = "asan.module_dtor"; static const uint64_t kAsanCtorAndDtorPriority = 1; +// On Emscripten, the system needs more than one priorities for constructors. 
+static const uint64_t kAsanEmscriptenCtorAndDtorPriority = 50; static const char *const kAsanReportErrorTemplate = "__asan_report_"; static const char *const kAsanRegisterGlobalsName = "__asan_register_globals"; static const char *const kAsanUnregisterGlobalsName = @@ -191,6 +194,11 @@ static cl::opt<bool> ClRecover( cl::desc("Enable recovery mode (continue-after-error)."), cl::Hidden, cl::init(false)); +static cl::opt<bool> ClInsertVersionCheck( + "asan-guard-against-version-mismatch", + cl::desc("Guard against compiler/runtime version mismatch."), + cl::Hidden, cl::init(true)); + // This flag may need to be replaced with -f[no-]asan-reads. static cl::opt<bool> ClInstrumentReads("asan-instrument-reads", cl::desc("instrument read instructions"), @@ -530,6 +538,14 @@ static size_t RedzoneSizeForScale(int MappingScale) { return std::max(32U, 1U << MappingScale); } +static uint64_t GetCtorAndDtorPriority(Triple &TargetTriple) { + if (TargetTriple.isOSEmscripten()) { + return kAsanEmscriptenCtorAndDtorPriority; + } else { + return kAsanCtorAndDtorPriority; + } +} + namespace { /// Module analysis for getting various metadata about the module. @@ -565,10 +581,10 @@ char ASanGlobalsMetadataWrapperPass::ID = 0; /// AddressSanitizer: instrument the code in module to find memory bugs. struct AddressSanitizer { - AddressSanitizer(Module &M, GlobalsMetadata &GlobalsMD, + AddressSanitizer(Module &M, const GlobalsMetadata *GlobalsMD, bool CompileKernel = false, bool Recover = false, bool UseAfterScope = false) - : UseAfterScope(UseAfterScope || ClUseAfterScope), GlobalsMD(GlobalsMD) { + : UseAfterScope(UseAfterScope || ClUseAfterScope), GlobalsMD(*GlobalsMD) { this->Recover = ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover; this->CompileKernel = ClEnableKasan.getNumOccurrences() > 0 ? ClEnableKasan : CompileKernel; @@ -706,8 +722,8 @@ public: GlobalsMetadata &GlobalsMD = getAnalysis<ASanGlobalsMetadataWrapperPass>().getGlobalsMD(); const TargetLibraryInfo *TLI = - &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - AddressSanitizer ASan(*F.getParent(), GlobalsMD, CompileKernel, Recover, + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); + AddressSanitizer ASan(*F.getParent(), &GlobalsMD, CompileKernel, Recover, UseAfterScope); return ASan.instrumentFunction(F, TLI); } @@ -720,10 +736,10 @@ private: class ModuleAddressSanitizer { public: - ModuleAddressSanitizer(Module &M, GlobalsMetadata &GlobalsMD, + ModuleAddressSanitizer(Module &M, const GlobalsMetadata *GlobalsMD, bool CompileKernel = false, bool Recover = false, bool UseGlobalsGC = true, bool UseOdrIndicator = false) - : GlobalsMD(GlobalsMD), UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC), + : GlobalsMD(*GlobalsMD), UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC), // Enable aliases as they should have no downside with ODR indicators. UsePrivateAlias(UseOdrIndicator || ClUsePrivateAlias), UseOdrIndicator(UseOdrIndicator || ClUseOdrIndicator), @@ -830,7 +846,7 @@ public: bool runOnModule(Module &M) override { GlobalsMetadata &GlobalsMD = getAnalysis<ASanGlobalsMetadataWrapperPass>().getGlobalsMD(); - ModuleAddressSanitizer ASanModule(M, GlobalsMD, CompileKernel, Recover, + ModuleAddressSanitizer ASanModule(M, &GlobalsMD, CompileKernel, Recover, UseGlobalGC, UseOdrIndicator); return ASanModule.instrumentModule(M); } @@ -1033,7 +1049,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { if (!II.isLifetimeStartOrEnd()) return; // Found lifetime intrinsic, add ASan instrumentation if necessary. 
- ConstantInt *Size = dyn_cast<ConstantInt>(II.getArgOperand(0)); + auto *Size = cast<ConstantInt>(II.getArgOperand(0)); // If size argument is undefined, don't do anything. if (Size->isMinusOne()) return; // Check that size doesn't saturate uint64_t and can @@ -1156,7 +1172,7 @@ PreservedAnalyses AddressSanitizerPass::run(Function &F, Module &M = *F.getParent(); if (auto *R = MAM.getCachedResult<ASanGlobalsMetadataAnalysis>(M)) { const TargetLibraryInfo *TLI = &AM.getResult<TargetLibraryAnalysis>(F); - AddressSanitizer Sanitizer(M, *R, CompileKernel, Recover, UseAfterScope); + AddressSanitizer Sanitizer(M, R, CompileKernel, Recover, UseAfterScope); if (Sanitizer.instrumentFunction(F, TLI)) return PreservedAnalyses::none(); return PreservedAnalyses::all(); @@ -1178,7 +1194,7 @@ ModuleAddressSanitizerPass::ModuleAddressSanitizerPass(bool CompileKernel, PreservedAnalyses ModuleAddressSanitizerPass::run(Module &M, AnalysisManager<Module> &AM) { GlobalsMetadata &GlobalsMD = AM.getResult<ASanGlobalsMetadataAnalysis>(M); - ModuleAddressSanitizer Sanitizer(M, GlobalsMD, CompileKernel, Recover, + ModuleAddressSanitizer Sanitizer(M, &GlobalsMD, CompileKernel, Recover, UseGlobalGC, UseOdrIndicator); if (Sanitizer.instrumentModule(M)) return PreservedAnalyses::none(); @@ -1331,7 +1347,7 @@ Value *AddressSanitizer::isInterestingMemoryAccess(Instruction *I, unsigned *Alignment, Value **MaybeMask) { // Skip memory accesses inserted by another instrumentation. - if (I->getMetadata("nosanitize")) return nullptr; + if (I->hasMetadata("nosanitize")) return nullptr; // Do not instrument the load fetching the dynamic shadow address. if (LocalDynamicShadow == I) @@ -1775,9 +1791,10 @@ void ModuleAddressSanitizer::createInitializerPoisonCalls( // Must have a function or null ptr. if (Function *F = dyn_cast<Function>(CS->getOperand(1))) { if (F->getName() == kAsanModuleCtorName) continue; - ConstantInt *Priority = dyn_cast<ConstantInt>(CS->getOperand(0)); + auto *Priority = cast<ConstantInt>(CS->getOperand(0)); // Don't instrument CTORs that will run before asan.module_ctor. - if (Priority->getLimitedValue() <= kAsanCtorAndDtorPriority) continue; + if (Priority->getLimitedValue() <= GetCtorAndDtorPriority(TargetTriple)) + continue; poisonOneInitializer(*F, ModuleName); } } @@ -1792,6 +1809,8 @@ bool ModuleAddressSanitizer::ShouldInstrumentGlobal(GlobalVariable *G) { if (GlobalsMD.get(G).IsBlacklisted) return false; if (!Ty->isSized()) return false; if (!G->hasInitializer()) return false; + // Only instrument globals of default address spaces + if (G->getAddressSpace()) return false; if (GlobalWasGeneratedByCompiler(G)) return false; // Our own globals. // Two problems with thread-locals: // - The address of the main thread's copy can't be computed at link-time. 
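As a small illustration of the default-address-space check added above: a global living in a non-default address space, for example @g = addrspace(3) global i32 0, now makes ShouldInstrumentGlobal() return false, so it receives neither redzones nor an entry in the instrumented-globals metadata.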
@@ -1919,7 +1938,12 @@ StringRef ModuleAddressSanitizer::getGlobalMetadataSection() const { case Triple::COFF: return ".ASAN$GL"; case Triple::ELF: return "asan_globals"; case Triple::MachO: return "__DATA,__asan_globals,regular"; - default: break; + case Triple::Wasm: + case Triple::XCOFF: + report_fatal_error( + "ModuleAddressSanitizer not implemented for object file format."); + case Triple::UnknownObjectFormat: + break; } llvm_unreachable("unsupported object format"); } @@ -2033,7 +2057,7 @@ void ModuleAddressSanitizer::InstrumentGlobalsCOFF( unsigned SizeOfGlobalStruct = DL.getTypeAllocSize(Initializer->getType()); assert(isPowerOf2_32(SizeOfGlobalStruct) && "global metadata will not be padded appropriately"); - Metadata->setAlignment(SizeOfGlobalStruct); + Metadata->setAlignment(assumeAligned(SizeOfGlobalStruct)); SetComdatForGlobalMetadata(G, Metadata, ""); } @@ -2170,7 +2194,7 @@ void ModuleAddressSanitizer::InstrumentGlobalsWithMetadataArray( M, ArrayOfGlobalStructTy, false, GlobalVariable::InternalLinkage, ConstantArray::get(ArrayOfGlobalStructTy, MetadataInitializers), ""); if (Mapping.Scale > 3) - AllGlobals->setAlignment(1ULL << Mapping.Scale); + AllGlobals->setAlignment(Align(1ULL << Mapping.Scale)); IRB.CreateCall(AsanRegisterGlobals, {IRB.CreatePointerCast(AllGlobals, IntptrTy), @@ -2270,7 +2294,7 @@ bool ModuleAddressSanitizer::InstrumentGlobals(IRBuilder<> &IRB, Module &M, "", G, G->getThreadLocalMode()); NewGlobal->copyAttributesFrom(G); NewGlobal->setComdat(G->getComdat()); - NewGlobal->setAlignment(MinRZ); + NewGlobal->setAlignment(MaybeAlign(MinRZ)); // Don't fold globals with redzones. ODR violation detector and redzone // poisoning implicitly creates a dependence on the global's address, so it // is no longer valid for it to be marked unnamed_addr. @@ -2338,7 +2362,7 @@ bool ModuleAddressSanitizer::InstrumentGlobals(IRBuilder<> &IRB, Module &M, // Set meaningful attributes for indicator symbol. ODRIndicatorSym->setVisibility(NewGlobal->getVisibility()); ODRIndicatorSym->setDLLStorageClass(NewGlobal->getDLLStorageClass()); - ODRIndicatorSym->setAlignment(1); + ODRIndicatorSym->setAlignment(Align::None()); ODRIndicator = ODRIndicatorSym; } @@ -2410,39 +2434,39 @@ bool ModuleAddressSanitizer::instrumentModule(Module &M) { // Create a module constructor. A destructor is created lazily because not all // platforms, and not all modules need it. + std::string AsanVersion = std::to_string(GetAsanVersion(M)); std::string VersionCheckName = - kAsanVersionCheckNamePrefix + std::to_string(GetAsanVersion(M)); + ClInsertVersionCheck ? (kAsanVersionCheckNamePrefix + AsanVersion) : ""; std::tie(AsanCtorFunction, std::ignore) = createSanitizerCtorAndInitFunctions( M, kAsanModuleCtorName, kAsanInitName, /*InitArgTypes=*/{}, /*InitArgs=*/{}, VersionCheckName); bool CtorComdat = true; - bool Changed = false; // TODO(glider): temporarily disabled globals instrumentation for KASan. if (ClGlobals) { IRBuilder<> IRB(AsanCtorFunction->getEntryBlock().getTerminator()); - Changed |= InstrumentGlobals(IRB, M, &CtorComdat); + InstrumentGlobals(IRB, M, &CtorComdat); } + const uint64_t Priority = GetCtorAndDtorPriority(TargetTriple); + // Put the constructor and destructor in comdat if both // (1) global instrumentation is not TU-specific // (2) target is ELF. 
if (UseCtorComdat && TargetTriple.isOSBinFormatELF() && CtorComdat) { AsanCtorFunction->setComdat(M.getOrInsertComdat(kAsanModuleCtorName)); - appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndDtorPriority, - AsanCtorFunction); + appendToGlobalCtors(M, AsanCtorFunction, Priority, AsanCtorFunction); if (AsanDtorFunction) { AsanDtorFunction->setComdat(M.getOrInsertComdat(kAsanModuleDtorName)); - appendToGlobalDtors(M, AsanDtorFunction, kAsanCtorAndDtorPriority, - AsanDtorFunction); + appendToGlobalDtors(M, AsanDtorFunction, Priority, AsanDtorFunction); } } else { - appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndDtorPriority); + appendToGlobalCtors(M, AsanCtorFunction, Priority); if (AsanDtorFunction) - appendToGlobalDtors(M, AsanDtorFunction, kAsanCtorAndDtorPriority); + appendToGlobalDtors(M, AsanDtorFunction, Priority); } - return Changed; + return true; } void AddressSanitizer::initializeCallbacks(Module &M) { @@ -2664,7 +2688,7 @@ bool AddressSanitizer::instrumentFunction(Function &F, if (CS) { // A call inside BB. TempsToInstrument.clear(); - if (CS.doesNotReturn() && !CS->getMetadata("nosanitize")) + if (CS.doesNotReturn() && !CS->hasMetadata("nosanitize")) NoReturnCalls.push_back(CS.getInstruction()); } if (CallInst *CI = dyn_cast<CallInst>(&Inst)) @@ -2877,18 +2901,18 @@ void FunctionStackPoisoner::copyArgsPassedByValToAllocas() { for (Argument &Arg : F.args()) { if (Arg.hasByValAttr()) { Type *Ty = Arg.getType()->getPointerElementType(); - unsigned Align = Arg.getParamAlignment(); - if (Align == 0) Align = DL.getABITypeAlignment(Ty); + const Align Alignment = + DL.getValueOrABITypeAlignment(Arg.getParamAlign(), Ty); AllocaInst *AI = IRB.CreateAlloca( Ty, nullptr, (Arg.hasName() ? Arg.getName() : "Arg" + Twine(Arg.getArgNo())) + ".byval"); - AI->setAlignment(Align); + AI->setAlignment(Alignment); Arg.replaceAllUsesWith(AI); uint64_t AllocSize = DL.getTypeAllocSize(Ty); - IRB.CreateMemCpy(AI, Align, &Arg, Align, AllocSize); + IRB.CreateMemCpy(AI, Alignment, &Arg, Alignment, AllocSize); } } } @@ -2919,7 +2943,7 @@ Value *FunctionStackPoisoner::createAllocaForLayout( } assert((ClRealignStack & (ClRealignStack - 1)) == 0); size_t FrameAlignment = std::max(L.FrameAlignment, (size_t)ClRealignStack); - Alloca->setAlignment(FrameAlignment); + Alloca->setAlignment(MaybeAlign(FrameAlignment)); return IRB.CreatePointerCast(Alloca, IntptrTy); } @@ -2928,7 +2952,7 @@ void FunctionStackPoisoner::createDynamicAllocasInitStorage() { IRBuilder<> IRB(dyn_cast<Instruction>(FirstBB.begin())); DynamicAllocaLayout = IRB.CreateAlloca(IntptrTy, nullptr); IRB.CreateStore(Constant::getNullValue(IntptrTy), DynamicAllocaLayout); - DynamicAllocaLayout->setAlignment(32); + DynamicAllocaLayout->setAlignment(Align(32)); } void FunctionStackPoisoner::processDynamicAllocas() { @@ -2971,7 +2995,6 @@ void FunctionStackPoisoner::processStaticAllocas() { Instruction *InsBefore = AllocaVec[0]; IRBuilder<> IRB(InsBefore); - IRB.SetCurrentDebugLocation(EntryDebugLocation); // Make sure non-instrumented allocas stay in the entry block. 
Otherwise, // debug info is broken, because only entry-block allocas are treated as @@ -3066,14 +3089,12 @@ void FunctionStackPoisoner::processStaticAllocas() { Instruction *Term = SplitBlockAndInsertIfThen(UseAfterReturnIsEnabled, InsBefore, false); IRBuilder<> IRBIf(Term); - IRBIf.SetCurrentDebugLocation(EntryDebugLocation); StackMallocIdx = StackMallocSizeClass(LocalStackSize); assert(StackMallocIdx <= kMaxAsanStackMallocSizeClass); Value *FakeStackValue = IRBIf.CreateCall(AsanStackMallocFunc[StackMallocIdx], ConstantInt::get(IntptrTy, LocalStackSize)); IRB.SetInsertPoint(InsBefore); - IRB.SetCurrentDebugLocation(EntryDebugLocation); FakeStack = createPHI(IRB, UseAfterReturnIsEnabled, FakeStackValue, Term, ConstantInt::get(IntptrTy, 0)); @@ -3081,14 +3102,11 @@ void FunctionStackPoisoner::processStaticAllocas() { IRB.CreateICmpEQ(FakeStack, Constant::getNullValue(IntptrTy)); Term = SplitBlockAndInsertIfThen(NoFakeStack, InsBefore, false); IRBIf.SetInsertPoint(Term); - IRBIf.SetCurrentDebugLocation(EntryDebugLocation); Value *AllocaValue = DoDynamicAlloca ? createAllocaForLayout(IRBIf, L, true) : StaticAlloca; IRB.SetInsertPoint(InsBefore); - IRB.SetCurrentDebugLocation(EntryDebugLocation); LocalStackBase = createPHI(IRB, NoFakeStack, AllocaValue, Term, FakeStack); - IRB.SetCurrentDebugLocation(EntryDebugLocation); IRB.CreateStore(LocalStackBase, LocalStackBaseAlloca); DIExprFlags |= DIExpression::DerefBefore; } else { @@ -3275,7 +3293,7 @@ void FunctionStackPoisoner::handleDynamicAllocaCall(AllocaInst *AI) { // Insert new alloca with new NewSize and Align params. AllocaInst *NewAlloca = IRB.CreateAlloca(IRB.getInt8Ty(), NewSize); - NewAlloca->setAlignment(Align); + NewAlloca->setAlignment(MaybeAlign(Align)); // NewAddress = Address + Align Value *NewAddress = IRB.CreateAdd(IRB.CreatePtrToInt(NewAlloca, IntptrTy), diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp index 4dc9b611c156..9abb62ac788c 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp @@ -24,6 +24,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -224,7 +225,7 @@ struct BoundsCheckingLegacyPass : public FunctionPass { } bool runOnFunction(Function &F) override { - auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); return addBoundsChecking(F, TLI, SE); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/CFGMST.h b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/CFGMST.h index 971e00041762..8bb6f47c4846 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/CFGMST.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/CFGMST.h @@ -257,13 +257,13 @@ public: std::tie(Iter, Inserted) = BBInfos.insert(std::make_pair(Src, nullptr)); if (Inserted) { // Newly inserted, update the real info. 
- Iter->second = std::move(llvm::make_unique<BBInfo>(Index)); + Iter->second = std::move(std::make_unique<BBInfo>(Index)); Index++; } std::tie(Iter, Inserted) = BBInfos.insert(std::make_pair(Dest, nullptr)); if (Inserted) // Newly inserted, update the real info. - Iter->second = std::move(llvm::make_unique<BBInfo>(Index)); + Iter->second = std::move(std::make_unique<BBInfo>(Index)); AllEdges.emplace_back(new Edge(Src, Dest, W)); return *AllEdges.back(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp index 3f4f9bc7145d..d35abb92dd08 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp @@ -27,7 +27,9 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/BranchProbability.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -512,30 +514,38 @@ static bool isHoistable(Instruction *I, DominatorTree &DT) { // first-region entry block) or the (hoistable or unhoistable) base values that // are defined outside (including the first-region entry block) of the // scope. The returned set doesn't include constants. -static std::set<Value *> getBaseValues(Value *V, - DominatorTree &DT) { +static std::set<Value *> getBaseValues( + Value *V, DominatorTree &DT, + DenseMap<Value *, std::set<Value *>> &Visited) { + if (Visited.count(V)) { + return Visited[V]; + } std::set<Value *> Result; if (auto *I = dyn_cast<Instruction>(V)) { // We don't stop at a block that's not in the Scope because we would miss some // instructions that are based on the same base values if we stop there. if (!isHoistable(I, DT)) { Result.insert(I); + Visited.insert(std::make_pair(V, Result)); return Result; } // I is hoistable above the Scope. for (Value *Op : I->operands()) { - std::set<Value *> OpResult = getBaseValues(Op, DT); + std::set<Value *> OpResult = getBaseValues(Op, DT, Visited); Result.insert(OpResult.begin(), OpResult.end()); } + Visited.insert(std::make_pair(V, Result)); return Result; } if (isa<Argument>(V)) { Result.insert(V); + Visited.insert(std::make_pair(V, Result)); return Result; } // We don't include others like constants because those won't lead to any // chance of folding of conditions (eg two bit checks merged into one check) // after CHR. + Visited.insert(std::make_pair(V, Result)); return Result; // empty } @@ -615,6 +625,10 @@ static bool checkMDProf(MDNode *MD, BranchProbability &TrueProb, assert(SumWt >= TrueWt && SumWt >= FalseWt && "Overflow calculating branch probabilities."); + // Guard against 0-to-0 branch weights to avoid a division-by-zero crash. 
+ if (SumWt == 0) + return false; + TrueProb = BranchProbability::getBranchProbability(TrueWt, SumWt); FalseProb = BranchProbability::getBranchProbability(FalseWt, SumWt); return true; @@ -1053,6 +1067,7 @@ static bool shouldSplit(Instruction *InsertPoint, DenseSet<Value *> &ConditionValues, DominatorTree &DT, DenseSet<Instruction *> &Unhoistables) { + assert(InsertPoint && "Null InsertPoint"); CHR_DEBUG( dbgs() << "shouldSplit " << *InsertPoint << " PrevConditionValues "; for (Value *V : PrevConditionValues) { @@ -1063,7 +1078,6 @@ static bool shouldSplit(Instruction *InsertPoint, dbgs() << *V << ", "; } dbgs() << "\n"); - assert(InsertPoint && "Null InsertPoint"); // If any of Bases isn't hoistable to the hoist point, split. for (Value *V : ConditionValues) { DenseMap<Instruction *, bool> Visited; @@ -1078,12 +1092,13 @@ static bool shouldSplit(Instruction *InsertPoint, if (!PrevConditionValues.empty() && !ConditionValues.empty()) { // Use std::set as DenseSet doesn't work with set_intersection. std::set<Value *> PrevBases, Bases; + DenseMap<Value *, std::set<Value *>> Visited; for (Value *V : PrevConditionValues) { - std::set<Value *> BaseValues = getBaseValues(V, DT); + std::set<Value *> BaseValues = getBaseValues(V, DT, Visited); PrevBases.insert(BaseValues.begin(), BaseValues.end()); } for (Value *V : ConditionValues) { - std::set<Value *> BaseValues = getBaseValues(V, DT); + std::set<Value *> BaseValues = getBaseValues(V, DT, Visited); Bases.insert(BaseValues.begin(), BaseValues.end()); } CHR_DEBUG( @@ -1538,10 +1553,7 @@ static bool negateICmpIfUsedByBranchOrSelectOnly(ICmpInst *ICmp, } if (auto *SI = dyn_cast<SelectInst>(U)) { // Swap operands - Value *TrueValue = SI->getTrueValue(); - Value *FalseValue = SI->getFalseValue(); - SI->setTrueValue(FalseValue); - SI->setFalseValue(TrueValue); + SI->swapValues(); SI->swapProfMetadata(); if (Scope->TrueBiasedSelects.count(SI)) { assert(Scope->FalseBiasedSelects.count(SI) == 0 && @@ -2073,7 +2085,7 @@ bool ControlHeightReductionLegacyPass::runOnFunction(Function &F) { getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); RegionInfo &RI = getAnalysis<RegionInfoPass>().getRegionInfo(); std::unique_ptr<OptimizationRemarkEmitter> OwnedORE = - llvm::make_unique<OptimizationRemarkEmitter>(&F); + std::make_unique<OptimizationRemarkEmitter>(&F); return CHR(F, BFI, DT, PSI, RI, *OwnedORE.get()).run(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index 2279c1bcb6a8..cf9a6a321c7a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -55,7 +55,6 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" @@ -83,13 +82,16 @@ #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/SpecialCaseList.h" +#include "llvm/Support/VirtualFileSystem.h" #include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> 
#include <cassert> #include <cstddef> @@ -479,7 +481,9 @@ DataFlowSanitizer::DataFlowSanitizer( std::vector<std::string> AllABIListFiles(std::move(ABIListFiles)); AllABIListFiles.insert(AllABIListFiles.end(), ClABIListFiles.begin(), ClABIListFiles.end()); - ABIList.set(SpecialCaseList::createOrDie(AllABIListFiles)); + // FIXME: should we propagate vfs::FileSystem to this constructor? + ABIList.set( + SpecialCaseList::createOrDie(AllABIListFiles, *vfs::getRealFileSystem())); } FunctionType *DataFlowSanitizer::getArgsFunctionType(FunctionType *T) { @@ -1212,7 +1216,7 @@ Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align, return DFS.ZeroShadow; case 1: { LoadInst *LI = new LoadInst(DFS.ShadowTy, ShadowAddr, "", Pos); - LI->setAlignment(ShadowAlign); + LI->setAlignment(MaybeAlign(ShadowAlign)); return LI; } case 2: { diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp index 59950ffc4e9a..bf3e4ed3e31f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -30,6 +30,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -86,7 +87,9 @@ public: ReversedVersion[3] = Options.Version[0]; ReversedVersion[4] = '\0'; } - bool runOnModule(Module &M, const TargetLibraryInfo &TLI); + bool + runOnModule(Module &M, + std::function<const TargetLibraryInfo &(Function &F)> GetTLI); private: // Create the .gcno files for the Module based on DebugInfo. @@ -102,9 +105,9 @@ private: std::vector<Regex> &Regexes); // Get pointers to the functions in the runtime library. 
- FunctionCallee getStartFileFunc(); - FunctionCallee getEmitFunctionFunc(); - FunctionCallee getEmitArcsFunc(); + FunctionCallee getStartFileFunc(const TargetLibraryInfo *TLI); + FunctionCallee getEmitFunctionFunc(const TargetLibraryInfo *TLI); + FunctionCallee getEmitArcsFunc(const TargetLibraryInfo *TLI); FunctionCallee getSummaryInfoFunc(); FunctionCallee getEndFileFunc(); @@ -126,9 +129,9 @@ private: // Checksum, produced by hash of EdgeDestinations SmallVector<uint32_t, 4> FileChecksums; - Module *M; - const TargetLibraryInfo *TLI; - LLVMContext *Ctx; + Module *M = nullptr; + std::function<const TargetLibraryInfo &(Function &F)> GetTLI; + LLVMContext *Ctx = nullptr; SmallVector<std::unique_ptr<GCOVFunction>, 16> Funcs; std::vector<Regex> FilterRe; std::vector<Regex> ExcludeRe; @@ -147,8 +150,9 @@ public: StringRef getPassName() const override { return "GCOV Profiler"; } bool runOnModule(Module &M) override { - auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - return Profiler.runOnModule(M, TLI); + return Profiler.runOnModule(M, [this](Function &F) -> TargetLibraryInfo & { + return getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); + }); } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -381,7 +385,7 @@ namespace { return EdgeDestinations; } - uint32_t getFuncChecksum() { + uint32_t getFuncChecksum() const { return FuncChecksum; } @@ -555,9 +559,10 @@ std::string GCOVProfiler::mangleName(const DICompileUnit *CU, return CurPath.str(); } -bool GCOVProfiler::runOnModule(Module &M, const TargetLibraryInfo &TLI) { +bool GCOVProfiler::runOnModule( + Module &M, std::function<const TargetLibraryInfo &(Function &F)> GetTLI) { this->M = &M; - this->TLI = &TLI; + this->GetTLI = std::move(GetTLI); Ctx = &M.getContext(); AddFlushBeforeForkAndExec(); @@ -574,9 +579,12 @@ PreservedAnalyses GCOVProfilerPass::run(Module &M, ModuleAnalysisManager &AM) { GCOVProfiler Profiler(GCOVOpts); + FunctionAnalysisManager &FAM = + AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); - auto &TLI = AM.getResult<TargetLibraryAnalysis>(M); - if (!Profiler.runOnModule(M, TLI)) + if (!Profiler.runOnModule(M, [&](Function &F) -> TargetLibraryInfo & { + return FAM.getResult<TargetLibraryAnalysis>(F); + })) return PreservedAnalyses::all(); return PreservedAnalyses::none(); @@ -624,6 +632,7 @@ static bool shouldKeepInEntry(BasicBlock::iterator It) { void GCOVProfiler::AddFlushBeforeForkAndExec() { SmallVector<Instruction *, 2> ForkAndExecs; for (auto &F : M->functions()) { + auto *TLI = &GetTLI(F); for (auto &I : instructions(F)) { if (CallInst *CI = dyn_cast<CallInst>(&I)) { if (Function *Callee = CI->getCalledFunction()) { @@ -669,7 +678,8 @@ void GCOVProfiler::emitProfileNotes() { continue; std::error_code EC; - raw_fd_ostream out(mangleName(CU, GCovFileType::GCNO), EC, sys::fs::F_None); + raw_fd_ostream out(mangleName(CU, GCovFileType::GCNO), EC, + sys::fs::OF_None); if (EC) { Ctx->emitError(Twine("failed to open coverage notes file for writing: ") + EC.message()); @@ -695,7 +705,7 @@ void GCOVProfiler::emitProfileNotes() { ++It; EntryBlock.splitBasicBlock(It); - Funcs.push_back(make_unique<GCOVFunction>(SP, &F, &out, FunctionIdent++, + Funcs.push_back(std::make_unique<GCOVFunction>(SP, &F, &out, FunctionIdent++, Options.UseCfgChecksum, Options.ExitBlockBeforeBody)); GCOVFunction &Func = *Funcs.back(); @@ -704,7 +714,10 @@ void GCOVProfiler::emitProfileNotes() { // to have a counter for the function definition. 
uint32_t Line = SP->getLine(); auto Filename = getFilename(SP); - Func.getBlock(&EntryBlock).getFile(Filename).addLine(Line); + + // Artificial functions such as global initializers + if (!SP->isArtificial()) + Func.getBlock(&EntryBlock).getFile(Filename).addLine(Line); for (auto &BB : F) { GCOVBlock &Block = Func.getBlock(&BB); @@ -873,7 +886,7 @@ bool GCOVProfiler::emitProfileArcs() { return Result; } -FunctionCallee GCOVProfiler::getStartFileFunc() { +FunctionCallee GCOVProfiler::getStartFileFunc(const TargetLibraryInfo *TLI) { Type *Args[] = { Type::getInt8PtrTy(*Ctx), // const char *orig_filename Type::getInt8PtrTy(*Ctx), // const char version[4] @@ -887,7 +900,7 @@ FunctionCallee GCOVProfiler::getStartFileFunc() { return Res; } -FunctionCallee GCOVProfiler::getEmitFunctionFunc() { +FunctionCallee GCOVProfiler::getEmitFunctionFunc(const TargetLibraryInfo *TLI) { Type *Args[] = { Type::getInt32Ty(*Ctx), // uint32_t ident Type::getInt8PtrTy(*Ctx), // const char *function_name @@ -906,7 +919,7 @@ FunctionCallee GCOVProfiler::getEmitFunctionFunc() { return M->getOrInsertFunction("llvm_gcda_emit_function", FTy); } -FunctionCallee GCOVProfiler::getEmitArcsFunc() { +FunctionCallee GCOVProfiler::getEmitArcsFunc(const TargetLibraryInfo *TLI) { Type *Args[] = { Type::getInt32Ty(*Ctx), // uint32_t num_counters Type::getInt64PtrTy(*Ctx), // uint64_t *counters @@ -943,9 +956,11 @@ Function *GCOVProfiler::insertCounterWriteout( BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", WriteoutF); IRBuilder<> Builder(BB); - FunctionCallee StartFile = getStartFileFunc(); - FunctionCallee EmitFunction = getEmitFunctionFunc(); - FunctionCallee EmitArcs = getEmitArcsFunc(); + auto *TLI = &GetTLI(*WriteoutF); + + FunctionCallee StartFile = getStartFileFunc(TLI); + FunctionCallee EmitFunction = getEmitFunctionFunc(TLI); + FunctionCallee EmitArcs = getEmitArcsFunc(TLI); FunctionCallee SummaryInfo = getSummaryInfoFunc(); FunctionCallee EndFile = getEndFileFunc(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 90a9f4955a4b..7e8f8e27a97b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -12,10 +12,12 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" @@ -36,6 +38,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -52,7 +55,10 @@ using namespace llvm; #define DEBUG_TYPE "hwasan" static const char *const kHwasanModuleCtorName = "hwasan.module_ctor"; +static const char *const kHwasanNoteName = "hwasan.note"; static const char *const kHwasanInitName = "__hwasan_init"; +static const char *const kHwasanPersonalityThunkName = + "__hwasan_personality_thunk"; static const char *const kHwasanShadowMemoryDynamicAddress = "__hwasan_shadow_memory_dynamic_address"; @@ -112,6 +118,9 @@ static cl::opt<bool> 
ClGenerateTagsWithCalls( cl::desc("generate new tags with runtime library calls"), cl::Hidden, cl::init(false)); +static cl::opt<bool> ClGlobals("hwasan-globals", cl::desc("Instrument globals"), + cl::Hidden, cl::init(false)); + static cl::opt<int> ClMatchAllTag( "hwasan-match-all-tag", cl::desc("don't report bad accesses via pointers with this tag"), @@ -155,8 +164,18 @@ static cl::opt<bool> static cl::opt<bool> ClInstrumentLandingPads("hwasan-instrument-landing-pads", - cl::desc("instrument landing pads"), cl::Hidden, - cl::init(true)); + cl::desc("instrument landing pads"), cl::Hidden, + cl::init(false), cl::ZeroOrMore); + +static cl::opt<bool> ClUseShortGranules( + "hwasan-use-short-granules", + cl::desc("use short granules in allocas and outlined checks"), cl::Hidden, + cl::init(false), cl::ZeroOrMore); + +static cl::opt<bool> ClInstrumentPersonalityFunctions( + "hwasan-instrument-personality-functions", + cl::desc("instrument personality functions"), cl::Hidden, cl::init(false), + cl::ZeroOrMore); static cl::opt<bool> ClInlineAllChecks("hwasan-inline-all-checks", cl::desc("inline all checks"), @@ -169,16 +188,16 @@ namespace { class HWAddressSanitizer { public: explicit HWAddressSanitizer(Module &M, bool CompileKernel = false, - bool Recover = false) { + bool Recover = false) : M(M) { this->Recover = ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover; this->CompileKernel = ClEnableKhwasan.getNumOccurrences() > 0 ? ClEnableKhwasan : CompileKernel; - initializeModule(M); + initializeModule(); } bool sanitizeFunction(Function &F); - void initializeModule(Module &M); + void initializeModule(); void initializeCallbacks(Module &M); @@ -203,7 +222,7 @@ public: Value *untagPointer(IRBuilder<> &IRB, Value *PtrLong); bool instrumentStack( SmallVectorImpl<AllocaInst *> &Allocas, - DenseMap<AllocaInst *, std::vector<DbgDeclareInst *>> &AllocaDeclareMap, + DenseMap<AllocaInst *, std::vector<DbgVariableIntrinsic *>> &AllocaDbgMap, SmallVectorImpl<Instruction *> &RetVec, Value *StackTag); Value *readRegister(IRBuilder<> &IRB, StringRef Name); bool instrumentLandingPads(SmallVectorImpl<Instruction *> &RetVec); @@ -216,9 +235,14 @@ public: Value *getHwasanThreadSlotPtr(IRBuilder<> &IRB, Type *Ty); void emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord); + void instrumentGlobal(GlobalVariable *GV, uint8_t Tag); + void instrumentGlobals(); + + void instrumentPersonalityFunctions(); + private: LLVMContext *C; - std::string CurModuleUniqueId; + Module &M; Triple TargetTriple; FunctionCallee HWAsanMemmove, HWAsanMemcpy, HWAsanMemset; FunctionCallee HWAsanHandleVfork; @@ -238,17 +262,21 @@ private: bool InTls; void init(Triple &TargetTriple); - unsigned getAllocaAlignment() const { return 1U << Scale; } + unsigned getObjectAlignment() const { return 1U << Scale; } }; ShadowMapping Mapping; + Type *VoidTy = Type::getVoidTy(M.getContext()); Type *IntptrTy; Type *Int8PtrTy; Type *Int8Ty; Type *Int32Ty; + Type *Int64Ty = Type::getInt64Ty(M.getContext()); bool CompileKernel; bool Recover; + bool UseShortGranules; + bool InstrumentLandingPads; Function *HwasanCtorFunction; @@ -257,7 +285,6 @@ private: FunctionCallee HwasanTagMemoryFunc; FunctionCallee HwasanGenerateTagFunc; - FunctionCallee HwasanThreadEnterFunc; Constant *ShadowGlobal; @@ -278,7 +305,7 @@ public: StringRef getPassName() const override { return "HWAddressSanitizer"; } bool doInitialization(Module &M) override { - HWASan = llvm::make_unique<HWAddressSanitizer>(M, CompileKernel, Recover); + HWASan = std::make_unique<HWAddressSanitizer>(M, 
CompileKernel, Recover); return true; } @@ -333,7 +360,7 @@ PreservedAnalyses HWAddressSanitizerPass::run(Module &M, /// Module-level initialization. /// /// inserts a call to __hwasan_init to the module's constructor list. -void HWAddressSanitizer::initializeModule(Module &M) { +void HWAddressSanitizer::initializeModule() { LLVM_DEBUG(dbgs() << "Init " << M.getName() << "\n"); auto &DL = M.getDataLayout(); @@ -342,7 +369,6 @@ void HWAddressSanitizer::initializeModule(Module &M) { Mapping.init(TargetTriple); C = &(M.getContext()); - CurModuleUniqueId = getUniqueModuleId(&M); IRBuilder<> IRB(*C); IntptrTy = IRB.getIntPtrTy(DL); Int8PtrTy = IRB.getInt8PtrTy(); @@ -350,6 +376,21 @@ void HWAddressSanitizer::initializeModule(Module &M) { Int32Ty = IRB.getInt32Ty(); HwasanCtorFunction = nullptr; + + // Older versions of Android do not have the required runtime support for + // short granules, global or personality function instrumentation. On other + // platforms we currently require using the latest version of the runtime. + bool NewRuntime = + !TargetTriple.isAndroid() || !TargetTriple.isAndroidVersionLT(30); + + UseShortGranules = + ClUseShortGranules.getNumOccurrences() ? ClUseShortGranules : NewRuntime; + + // If we don't have personality function support, fall back to landing pads. + InstrumentLandingPads = ClInstrumentLandingPads.getNumOccurrences() + ? ClInstrumentLandingPads + : !NewRuntime; + if (!CompileKernel) { std::tie(HwasanCtorFunction, std::ignore) = getOrCreateSanitizerCtorAndInitFunctions( @@ -363,6 +404,18 @@ void HWAddressSanitizer::initializeModule(Module &M) { Ctor->setComdat(CtorComdat); appendToGlobalCtors(M, Ctor, 0, Ctor); }); + + bool InstrumentGlobals = + ClGlobals.getNumOccurrences() ? ClGlobals : NewRuntime; + if (InstrumentGlobals) + instrumentGlobals(); + + bool InstrumentPersonalityFunctions = + ClInstrumentPersonalityFunctions.getNumOccurrences() + ? ClInstrumentPersonalityFunctions + : NewRuntime; + if (InstrumentPersonalityFunctions) + instrumentPersonalityFunctions(); } if (!TargetTriple.isAndroid()) { @@ -420,9 +473,6 @@ void HWAddressSanitizer::initializeCallbacks(Module &M) { HWAsanHandleVfork = M.getOrInsertFunction("__hwasan_handle_vfork", IRB.getVoidTy(), IntptrTy); - - HwasanThreadEnterFunc = - M.getOrInsertFunction("__hwasan_thread_enter", IRB.getVoidTy()); } Value *HWAddressSanitizer::getDynamicShadowIfunc(IRBuilder<> &IRB) { @@ -456,7 +506,7 @@ Value *HWAddressSanitizer::isInterestingMemoryAccess(Instruction *I, unsigned *Alignment, Value **MaybeMask) { // Skip memory accesses inserted by another instrumentation. - if (I->getMetadata("nosanitize")) return nullptr; + if (I->hasMetadata("nosanitize")) return nullptr; // Do not instrument the load fetching the dynamic shadow address. if (LocalDynamicShadow == I) @@ -564,9 +614,11 @@ void HWAddressSanitizer::instrumentMemAccessInline(Value *Ptr, bool IsWrite, TargetTriple.isOSBinFormatELF() && !Recover) { Module *M = IRB.GetInsertBlock()->getParent()->getParent(); Ptr = IRB.CreateBitCast(Ptr, Int8PtrTy); - IRB.CreateCall( - Intrinsic::getDeclaration(M, Intrinsic::hwasan_check_memaccess), - {shadowBase(), Ptr, ConstantInt::get(Int32Ty, AccessInfo)}); + IRB.CreateCall(Intrinsic::getDeclaration( + M, UseShortGranules + ? 
Intrinsic::hwasan_check_memaccess_shortgranules + : Intrinsic::hwasan_check_memaccess), + {shadowBase(), Ptr, ConstantInt::get(Int32Ty, AccessInfo)}); return; } @@ -718,7 +770,9 @@ static uint64_t getAllocaSizeInBytes(const AllocaInst &AI) { bool HWAddressSanitizer::tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag, size_t Size) { - size_t AlignedSize = alignTo(Size, Mapping.getAllocaAlignment()); + size_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment()); + if (!UseShortGranules) + Size = AlignedSize; Value *JustTag = IRB.CreateTrunc(Tag, IRB.getInt8Ty()); if (ClInstrumentWithCalls) { @@ -735,10 +789,10 @@ bool HWAddressSanitizer::tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, // llvm.memset right here into either a sequence of stores, or a call to // hwasan_tag_memory. if (ShadowSize) - IRB.CreateMemSet(ShadowPtr, JustTag, ShadowSize, /*Align=*/1); + IRB.CreateMemSet(ShadowPtr, JustTag, ShadowSize, Align::None()); if (Size != AlignedSize) { IRB.CreateStore( - ConstantInt::get(Int8Ty, Size % Mapping.getAllocaAlignment()), + ConstantInt::get(Int8Ty, Size % Mapping.getObjectAlignment()), IRB.CreateConstGEP1_32(Int8Ty, ShadowPtr, ShadowSize)); IRB.CreateStore(JustTag, IRB.CreateConstGEP1_32( Int8Ty, IRB.CreateBitCast(AI, Int8PtrTy), @@ -778,8 +832,9 @@ Value *HWAddressSanitizer::getStackBaseTag(IRBuilder<> &IRB) { // FIXME: use addressofreturnaddress (but implement it in aarch64 backend // first). Module *M = IRB.GetInsertBlock()->getParent()->getParent(); - auto GetStackPointerFn = - Intrinsic::getDeclaration(M, Intrinsic::frameaddress); + auto GetStackPointerFn = Intrinsic::getDeclaration( + M, Intrinsic::frameaddress, + IRB.getInt8PtrTy(M->getDataLayout().getAllocaAddrSpace())); Value *StackPointer = IRB.CreateCall( GetStackPointerFn, {Constant::getNullValue(IRB.getInt32Ty())}); @@ -876,34 +931,13 @@ void HWAddressSanitizer::emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord) { Value *SlotPtr = getHwasanThreadSlotPtr(IRB, IntptrTy); assert(SlotPtr); - Instruction *ThreadLong = IRB.CreateLoad(IntptrTy, SlotPtr); - - Function *F = IRB.GetInsertBlock()->getParent(); - if (F->getFnAttribute("hwasan-abi").getValueAsString() == "interceptor") { - Value *ThreadLongEqZero = - IRB.CreateICmpEQ(ThreadLong, ConstantInt::get(IntptrTy, 0)); - auto *Br = cast<BranchInst>(SplitBlockAndInsertIfThen( - ThreadLongEqZero, cast<Instruction>(ThreadLongEqZero)->getNextNode(), - false, MDBuilder(*C).createBranchWeights(1, 100000))); - - IRB.SetInsertPoint(Br); - // FIXME: This should call a new runtime function with a custom calling - // convention to avoid needing to spill all arguments here. - IRB.CreateCall(HwasanThreadEnterFunc); - LoadInst *ReloadThreadLong = IRB.CreateLoad(IntptrTy, SlotPtr); - - IRB.SetInsertPoint(&*Br->getSuccessor(0)->begin()); - PHINode *ThreadLongPhi = IRB.CreatePHI(IntptrTy, 2); - ThreadLongPhi->addIncoming(ThreadLong, ThreadLong->getParent()); - ThreadLongPhi->addIncoming(ReloadThreadLong, ReloadThreadLong->getParent()); - ThreadLong = ThreadLongPhi; - } - + Value *ThreadLong = IRB.CreateLoad(IntptrTy, SlotPtr); // Extract the address field from ThreadLong. Unnecessary on AArch64 with TBI. Value *ThreadLongMaybeUntagged = TargetTriple.isAArch64() ? ThreadLong : untagPointer(IRB, ThreadLong); if (WithFrameRecord) { + Function *F = IRB.GetInsertBlock()->getParent(); StackBaseTag = IRB.CreateAShr(ThreadLong, 3); // Prepare ring buffer data. 
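The tagAlloca changes above implement HWASAN's short granules for stack objects: when an allocation is not a multiple of the granule size, the shadow byte covering the last, partially used granule receives Size % granule instead of the tag. A minimal standalone sketch of that size arithmetic, assuming the default 16-byte granule returned by Mapping.getObjectAlignment(); this is an illustration only, not part of the patch:

// Illustration only (not part of the patch): the size arithmetic behind the
// short-granule handling in tagAlloca above, assuming a 16-byte granule.
#include <cstdint>

struct GranuleInfo {
  uint64_t AlignedSize;   // size rounded up to the granule, as in alignTo()
  uint8_t LastGranuleUse; // Size % granule; 0 means the last granule is full
};

GranuleInfo shortGranuleInfo(uint64_t Size) {
  const uint64_t Granule = 16; // assumed Mapping.getObjectAlignment()
  GranuleInfo G;
  G.AlignedSize = (Size + Granule - 1) / Granule * Granule;
  G.LastGranuleUse = uint8_t(Size % Granule);
  return G;
}

// Example: a 20-byte alloca gives AlignedSize == 32 and LastGranuleUse == 4,
// so the shadow byte for the second granule is written as 4 rather than the
// pointer tag, matching the Size % Mapping.getObjectAlignment() store above.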
@@ -912,8 +946,10 @@ void HWAddressSanitizer::emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord) { PC = readRegister(IRB, "pc"); else PC = IRB.CreatePtrToInt(F, IntptrTy); - auto GetStackPointerFn = - Intrinsic::getDeclaration(F->getParent(), Intrinsic::frameaddress); + Module *M = F->getParent(); + auto GetStackPointerFn = Intrinsic::getDeclaration( + M, Intrinsic::frameaddress, + IRB.getInt8PtrTy(M->getDataLayout().getAllocaAddrSpace())); Value *SP = IRB.CreatePtrToInt( IRB.CreateCall(GetStackPointerFn, {Constant::getNullValue(IRB.getInt32Ty())}), @@ -980,7 +1016,7 @@ bool HWAddressSanitizer::instrumentLandingPads( bool HWAddressSanitizer::instrumentStack( SmallVectorImpl<AllocaInst *> &Allocas, - DenseMap<AllocaInst *, std::vector<DbgDeclareInst *>> &AllocaDeclareMap, + DenseMap<AllocaInst *, std::vector<DbgVariableIntrinsic *>> &AllocaDbgMap, SmallVectorImpl<Instruction *> &RetVec, Value *StackTag) { // Ideally, we want to calculate tagged stack base pointer, and rewrite all // alloca addresses using that. Unfortunately, offsets are not known yet @@ -999,17 +1035,18 @@ bool HWAddressSanitizer::instrumentStack( AI->hasName() ? AI->getName().str() : "alloca." + itostr(N); Replacement->setName(Name + ".hwasan"); - for (auto UI = AI->use_begin(), UE = AI->use_end(); UI != UE;) { - Use &U = *UI++; - if (U.getUser() != AILong) - U.set(Replacement); - } - - for (auto *DDI : AllocaDeclareMap.lookup(AI)) { - DIExpression *OldExpr = DDI->getExpression(); - DIExpression *NewExpr = DIExpression::append( - OldExpr, {dwarf::DW_OP_LLVM_tag_offset, RetagMask(N)}); - DDI->setArgOperand(2, MetadataAsValue::get(*C, NewExpr)); + AI->replaceUsesWithIf(Replacement, + [AILong](Use &U) { return U.getUser() != AILong; }); + + for (auto *DDI : AllocaDbgMap.lookup(AI)) { + // Prepend "tag_offset, N" to the dwarf expression. + // Tag offset logically applies to the alloca pointer, and it makes sense + // to put it at the beginning of the expression. + SmallVector<uint64_t, 8> NewOps = {dwarf::DW_OP_LLVM_tag_offset, + RetagMask(N)}; + DDI->setArgOperand( + 2, MetadataAsValue::get(*C, DIExpression::prependOpcodes( + DDI->getExpression(), NewOps))); } size_t Size = getAllocaSizeInBytes(*AI); @@ -1020,7 +1057,7 @@ bool HWAddressSanitizer::instrumentStack( // Re-tag alloca memory with the special UAR tag. 
Value *Tag = getUARTag(IRB, StackTag); - tagAlloca(IRB, AI, Tag, alignTo(Size, Mapping.getAllocaAlignment())); + tagAlloca(IRB, AI, Tag, alignTo(Size, Mapping.getObjectAlignment())); } } @@ -1056,7 +1093,7 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F) { SmallVector<AllocaInst*, 8> AllocasToInstrument; SmallVector<Instruction*, 8> RetVec; SmallVector<Instruction*, 8> LandingPadVec; - DenseMap<AllocaInst *, std::vector<DbgDeclareInst *>> AllocaDeclareMap; + DenseMap<AllocaInst *, std::vector<DbgVariableIntrinsic *>> AllocaDbgMap; for (auto &BB : F) { for (auto &Inst : BB) { if (ClInstrumentStack) @@ -1070,11 +1107,12 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F) { isa<CleanupReturnInst>(Inst)) RetVec.push_back(&Inst); - if (auto *DDI = dyn_cast<DbgDeclareInst>(&Inst)) - if (auto *Alloca = dyn_cast_or_null<AllocaInst>(DDI->getAddress())) - AllocaDeclareMap[Alloca].push_back(DDI); + if (auto *DDI = dyn_cast<DbgVariableIntrinsic>(&Inst)) + if (auto *Alloca = + dyn_cast_or_null<AllocaInst>(DDI->getVariableLocation())) + AllocaDbgMap[Alloca].push_back(DDI); - if (ClInstrumentLandingPads && isa<LandingPadInst>(Inst)) + if (InstrumentLandingPads && isa<LandingPadInst>(Inst)) LandingPadVec.push_back(&Inst); Value *MaybeMask = nullptr; @@ -1093,6 +1131,13 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F) { if (!LandingPadVec.empty()) instrumentLandingPads(LandingPadVec); + if (AllocasToInstrument.empty() && F.hasPersonalityFn() && + F.getPersonalityFn()->getName() == kHwasanPersonalityThunkName) { + // __hwasan_personality_thunk is a no-op for functions without an + // instrumented stack, so we can drop it. + F.setPersonalityFn(nullptr); + } + if (AllocasToInstrument.empty() && ToInstrument.empty()) return false; @@ -1108,7 +1153,7 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F) { if (!AllocasToInstrument.empty()) { Value *StackTag = ClGenerateTagsWithCalls ? 
nullptr : getStackBaseTag(EntryIRB); - Changed |= instrumentStack(AllocasToInstrument, AllocaDeclareMap, RetVec, + Changed |= instrumentStack(AllocasToInstrument, AllocaDbgMap, RetVec, StackTag); } @@ -1118,8 +1163,9 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F) { DenseMap<AllocaInst *, AllocaInst *> AllocaToPaddedAllocaMap; for (AllocaInst *AI : AllocasToInstrument) { uint64_t Size = getAllocaSizeInBytes(*AI); - uint64_t AlignedSize = alignTo(Size, Mapping.getAllocaAlignment()); - AI->setAlignment(std::max(AI->getAlignment(), 16u)); + uint64_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment()); + AI->setAlignment( + MaybeAlign(std::max(AI->getAlignment(), Mapping.getObjectAlignment()))); if (Size != AlignedSize) { Type *AllocatedType = AI->getAllocatedType(); if (AI->isArrayAllocation()) { @@ -1132,7 +1178,7 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F) { auto *NewAI = new AllocaInst( TypeWithPadding, AI->getType()->getAddressSpace(), nullptr, "", AI); NewAI->takeName(AI); - NewAI->setAlignment(AI->getAlignment()); + NewAI->setAlignment(MaybeAlign(AI->getAlignment())); NewAI->setUsedWithInAlloca(AI->isUsedWithInAlloca()); NewAI->setSwiftError(AI->isSwiftError()); NewAI->copyMetadata(*AI); @@ -1179,6 +1225,257 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F) { return Changed; } +void HWAddressSanitizer::instrumentGlobal(GlobalVariable *GV, uint8_t Tag) { + Constant *Initializer = GV->getInitializer(); + uint64_t SizeInBytes = + M.getDataLayout().getTypeAllocSize(Initializer->getType()); + uint64_t NewSize = alignTo(SizeInBytes, Mapping.getObjectAlignment()); + if (SizeInBytes != NewSize) { + // Pad the initializer out to the next multiple of 16 bytes and add the + // required short granule tag. + std::vector<uint8_t> Init(NewSize - SizeInBytes, 0); + Init.back() = Tag; + Constant *Padding = ConstantDataArray::get(*C, Init); + Initializer = ConstantStruct::getAnon({Initializer, Padding}); + } + + auto *NewGV = new GlobalVariable(M, Initializer->getType(), GV->isConstant(), + GlobalValue::ExternalLinkage, Initializer, + GV->getName() + ".hwasan"); + NewGV->copyAttributesFrom(GV); + NewGV->setLinkage(GlobalValue::PrivateLinkage); + NewGV->copyMetadata(GV, 0); + NewGV->setAlignment( + MaybeAlign(std::max(GV->getAlignment(), Mapping.getObjectAlignment()))); + + // It is invalid to ICF two globals that have different tags. In the case + // where the size of the global is a multiple of the tag granularity the + // contents of the globals may be the same but the tags (i.e. symbol values) + // may be different, and the symbols are not considered during ICF. In the + // case where the size is not a multiple of the granularity, the short granule + // tags would discriminate two globals with different tags, but there would + // otherwise be nothing stopping such a global from being incorrectly ICF'd + // with an uninstrumented (i.e. tag 0) global that happened to have the short + // granule tag in the last byte. 
+ NewGV->setUnnamedAddr(GlobalValue::UnnamedAddr::None); + + // Descriptor format (assuming little-endian): + // bytes 0-3: relative address of global + // bytes 4-6: size of global (16MB ought to be enough for anyone, but in case + // it isn't, we create multiple descriptors) + // byte 7: tag + auto *DescriptorTy = StructType::get(Int32Ty, Int32Ty); + const uint64_t MaxDescriptorSize = 0xfffff0; + for (uint64_t DescriptorPos = 0; DescriptorPos < SizeInBytes; + DescriptorPos += MaxDescriptorSize) { + auto *Descriptor = + new GlobalVariable(M, DescriptorTy, true, GlobalValue::PrivateLinkage, + nullptr, GV->getName() + ".hwasan.descriptor"); + auto *GVRelPtr = ConstantExpr::getTrunc( + ConstantExpr::getAdd( + ConstantExpr::getSub( + ConstantExpr::getPtrToInt(NewGV, Int64Ty), + ConstantExpr::getPtrToInt(Descriptor, Int64Ty)), + ConstantInt::get(Int64Ty, DescriptorPos)), + Int32Ty); + uint32_t Size = std::min(SizeInBytes - DescriptorPos, MaxDescriptorSize); + auto *SizeAndTag = ConstantInt::get(Int32Ty, Size | (uint32_t(Tag) << 24)); + Descriptor->setComdat(NewGV->getComdat()); + Descriptor->setInitializer(ConstantStruct::getAnon({GVRelPtr, SizeAndTag})); + Descriptor->setSection("hwasan_globals"); + Descriptor->setMetadata(LLVMContext::MD_associated, + MDNode::get(*C, ValueAsMetadata::get(NewGV))); + appendToCompilerUsed(M, Descriptor); + } + + Constant *Aliasee = ConstantExpr::getIntToPtr( + ConstantExpr::getAdd( + ConstantExpr::getPtrToInt(NewGV, Int64Ty), + ConstantInt::get(Int64Ty, uint64_t(Tag) << kPointerTagShift)), + GV->getType()); + auto *Alias = GlobalAlias::create(GV->getValueType(), GV->getAddressSpace(), + GV->getLinkage(), "", Aliasee, &M); + Alias->setVisibility(GV->getVisibility()); + Alias->takeName(GV); + GV->replaceAllUsesWith(Alias); + GV->eraseFromParent(); +} + +void HWAddressSanitizer::instrumentGlobals() { + // Start by creating a note that contains pointers to the list of global + // descriptors. Adding a note to the output file will cause the linker to + // create a PT_NOTE program header pointing to the note that we can use to + // find the descriptor list starting from the program headers. A function + // provided by the runtime initializes the shadow memory for the globals by + // accessing the descriptor list via the note. The dynamic loader needs to + // call this function whenever a library is loaded. + // + // The reason why we use a note for this instead of a more conventional + // approach of having a global constructor pass a descriptor list pointer to + // the runtime is because of an order of initialization problem. With + // constructors we can encounter the following problematic scenario: + // + // 1) library A depends on library B and also interposes one of B's symbols + // 2) B's constructors are called before A's (as required for correctness) + // 3) during construction, B accesses one of its "own" globals (actually + // interposed by A) and triggers a HWASAN failure due to the initialization + // for A not having happened yet + // + // Even without interposition it is possible to run into similar situations in + // cases where two libraries mutually depend on each other. + // + // We only need one note per binary, so put everything for the note in a + // comdat. 
+ Comdat *NoteComdat = M.getOrInsertComdat(kHwasanNoteName); + + Type *Int8Arr0Ty = ArrayType::get(Int8Ty, 0); + auto Start = + new GlobalVariable(M, Int8Arr0Ty, true, GlobalVariable::ExternalLinkage, + nullptr, "__start_hwasan_globals"); + Start->setVisibility(GlobalValue::HiddenVisibility); + Start->setDSOLocal(true); + auto Stop = + new GlobalVariable(M, Int8Arr0Ty, true, GlobalVariable::ExternalLinkage, + nullptr, "__stop_hwasan_globals"); + Stop->setVisibility(GlobalValue::HiddenVisibility); + Stop->setDSOLocal(true); + + // Null-terminated so actually 8 bytes, which are required in order to align + // the note properly. + auto *Name = ConstantDataArray::get(*C, "LLVM\0\0\0"); + + auto *NoteTy = StructType::get(Int32Ty, Int32Ty, Int32Ty, Name->getType(), + Int32Ty, Int32Ty); + auto *Note = + new GlobalVariable(M, NoteTy, /*isConstantGlobal=*/true, + GlobalValue::PrivateLinkage, nullptr, kHwasanNoteName); + Note->setSection(".note.hwasan.globals"); + Note->setComdat(NoteComdat); + Note->setAlignment(Align(4)); + Note->setDSOLocal(true); + + // The pointers in the note need to be relative so that the note ends up being + // placed in rodata, which is the standard location for notes. + auto CreateRelPtr = [&](Constant *Ptr) { + return ConstantExpr::getTrunc( + ConstantExpr::getSub(ConstantExpr::getPtrToInt(Ptr, Int64Ty), + ConstantExpr::getPtrToInt(Note, Int64Ty)), + Int32Ty); + }; + Note->setInitializer(ConstantStruct::getAnon( + {ConstantInt::get(Int32Ty, 8), // n_namesz + ConstantInt::get(Int32Ty, 8), // n_descsz + ConstantInt::get(Int32Ty, ELF::NT_LLVM_HWASAN_GLOBALS), // n_type + Name, CreateRelPtr(Start), CreateRelPtr(Stop)})); + appendToCompilerUsed(M, Note); + + // Create a zero-length global in hwasan_globals so that the linker will + // always create start and stop symbols. + auto Dummy = new GlobalVariable( + M, Int8Arr0Ty, /*isConstantGlobal*/ true, GlobalVariable::PrivateLinkage, + Constant::getNullValue(Int8Arr0Ty), "hwasan.dummy.global"); + Dummy->setSection("hwasan_globals"); + Dummy->setComdat(NoteComdat); + Dummy->setMetadata(LLVMContext::MD_associated, + MDNode::get(*C, ValueAsMetadata::get(Note))); + appendToCompilerUsed(M, Dummy); + + std::vector<GlobalVariable *> Globals; + for (GlobalVariable &GV : M.globals()) { + if (GV.isDeclarationForLinker() || GV.getName().startswith("llvm.") || + GV.isThreadLocal()) + continue; + + // Common symbols can't have aliases point to them, so they can't be tagged. + if (GV.hasCommonLinkage()) + continue; + + // Globals with custom sections may be used in __start_/__stop_ enumeration, + // which would be broken both by adding tags and potentially by the extra + // padding/alignment that we insert. + if (GV.hasSection()) + continue; + + Globals.push_back(&GV); + } + + MD5 Hasher; + Hasher.update(M.getSourceFileName()); + MD5::MD5Result Hash; + Hasher.final(Hash); + uint8_t Tag = Hash[0]; + + for (GlobalVariable *GV : Globals) { + // Skip tag 0 in order to avoid collisions with untagged memory. + if (Tag == 0) + Tag = 1; + instrumentGlobal(GV, Tag++); + } +} + +void HWAddressSanitizer::instrumentPersonalityFunctions() { + // We need to untag stack frames as we unwind past them. That is the job of + // the personality function wrapper, which either wraps an existing + // personality function or acts as a personality function on its own. Each + // function that has a personality function or that can be unwound past has + // its personality function changed to a thunk that calls the personality + // function wrapper in the runtime. 
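The global instrumentation above pads each global to the granule size, retags its address through an alias, and records every tagged global in a descriptor placed in the hwasan_globals section, packing the size and tag into one 32-bit word (Size | (Tag << 24)) next to a 32-bit descriptor-relative pointer. The following minimal sketch shows how such a word could be unpacked by a consumer walking the __start_hwasan_globals/__stop_hwasan_globals range; the struct and function names are illustrative, not part of the patch or of the HWASAN runtime ABI:

// Illustration only: unpacking one descriptor from the hwasan_globals
// section created above. Field layout follows the comment in
// instrumentGlobal: bytes 0-3 hold a descriptor-relative address,
// bytes 4-6 the size (up to 16 MB per descriptor), byte 7 the tag.
#include <cstdint>

struct HwasanGlobalDescriptor { // illustrative name, not the runtime's
  int32_t GVRelPtr;             // region address relative to the descriptor
  uint32_t SizeAndTag;          // Size | (uint32_t(Tag) << 24)
};

struct TaggedRegion {
  uintptr_t Address; // start of the chunk this descriptor covers
  uint32_t Size;
  uint8_t Tag;
};

TaggedRegion decodeDescriptor(const HwasanGlobalDescriptor *D) {
  TaggedRegion R;
  // GVRelPtr was emitted as (global + DescriptorPos) - descriptor, so adding
  // the descriptor's own address recovers the covered chunk's address.
  R.Address = reinterpret_cast<uintptr_t>(D) + D->GVRelPtr;
  R.Size = D->SizeAndTag & 0x00ffffffu;
  R.Tag = uint8_t(D->SizeAndTag >> 24);
  return R;
}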
+ MapVector<Constant *, std::vector<Function *>> PersonalityFns; + for (Function &F : M) { + if (F.isDeclaration() || !F.hasFnAttribute(Attribute::SanitizeHWAddress)) + continue; + + if (F.hasPersonalityFn()) { + PersonalityFns[F.getPersonalityFn()->stripPointerCasts()].push_back(&F); + } else if (!F.hasFnAttribute(Attribute::NoUnwind)) { + PersonalityFns[nullptr].push_back(&F); + } + } + + if (PersonalityFns.empty()) + return; + + FunctionCallee HwasanPersonalityWrapper = M.getOrInsertFunction( + "__hwasan_personality_wrapper", Int32Ty, Int32Ty, Int32Ty, Int64Ty, + Int8PtrTy, Int8PtrTy, Int8PtrTy, Int8PtrTy, Int8PtrTy); + FunctionCallee UnwindGetGR = M.getOrInsertFunction("_Unwind_GetGR", VoidTy); + FunctionCallee UnwindGetCFA = M.getOrInsertFunction("_Unwind_GetCFA", VoidTy); + + for (auto &P : PersonalityFns) { + std::string ThunkName = kHwasanPersonalityThunkName; + if (P.first) + ThunkName += ("." + P.first->getName()).str(); + FunctionType *ThunkFnTy = FunctionType::get( + Int32Ty, {Int32Ty, Int32Ty, Int64Ty, Int8PtrTy, Int8PtrTy}, false); + bool IsLocal = P.first && (!isa<GlobalValue>(P.first) || + cast<GlobalValue>(P.first)->hasLocalLinkage()); + auto *ThunkFn = Function::Create(ThunkFnTy, + IsLocal ? GlobalValue::InternalLinkage + : GlobalValue::LinkOnceODRLinkage, + ThunkName, &M); + if (!IsLocal) { + ThunkFn->setVisibility(GlobalValue::HiddenVisibility); + ThunkFn->setComdat(M.getOrInsertComdat(ThunkName)); + } + + auto *BB = BasicBlock::Create(*C, "entry", ThunkFn); + IRBuilder<> IRB(BB); + CallInst *WrapperCall = IRB.CreateCall( + HwasanPersonalityWrapper, + {ThunkFn->getArg(0), ThunkFn->getArg(1), ThunkFn->getArg(2), + ThunkFn->getArg(3), ThunkFn->getArg(4), + P.first ? IRB.CreateBitCast(P.first, Int8PtrTy) + : Constant::getNullValue(Int8PtrTy), + IRB.CreateBitCast(UnwindGetGR.getCallee(), Int8PtrTy), + IRB.CreateBitCast(UnwindGetCFA.getCallee(), Int8PtrTy)}); + WrapperCall->setTailCall(); + IRB.CreateRet(WrapperCall); + + for (Function *F : P.second) + F->setPersonalityFn(ThunkFn); + } +} + void HWAddressSanitizer::ShadowMapping::init(Triple &TargetTriple) { Scale = kDefaultShadowScale; if (ClMappingOffset.getNumOccurrences() > 0) { diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp index c7371f567ff3..d5787c8f62a1 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp @@ -36,6 +36,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/Casting.h" @@ -403,7 +404,7 @@ static bool promoteIndirectCalls(Module &M, ProfileSummaryInfo *PSI, AM->getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); } else { - OwnedORE = llvm::make_unique<OptimizationRemarkEmitter>(&F); + OwnedORE = std::make_unique<OptimizationRemarkEmitter>(&F); ORE = OwnedORE.get(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrOrderFile.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrOrderFile.cpp index a2c1ddfd279e..518b8895e836 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrOrderFile.cpp +++ 
b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrOrderFile.cpp @@ -9,6 +9,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Instrumentation/InstrOrderFile.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" @@ -19,6 +20,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/PassRegistry.h" #include "llvm/ProfileData/InstrProf.h" @@ -28,7 +30,6 @@ #include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Instrumentation.h" -#include "llvm/Transforms/Instrumentation/InstrOrderFile.h" #include <fstream> #include <map> #include <mutex> @@ -100,7 +101,8 @@ public: if (!ClOrderFileWriteMapping.empty()) { std::lock_guard<std::mutex> LogLock(MappingMutex); std::error_code EC; - llvm::raw_fd_ostream OS(ClOrderFileWriteMapping, EC, llvm::sys::fs::F_Append); + llvm::raw_fd_ostream OS(ClOrderFileWriteMapping, EC, + llvm::sys::fs::OF_Append); if (EC) { report_fatal_error(Twine("Failed to open ") + ClOrderFileWriteMapping + " to save mapping file for order file instrumentation\n"); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index 5d3a793dda19..04c7e856b5d4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -37,6 +37,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/Casting.h" @@ -157,7 +158,10 @@ public: } bool runOnModule(Module &M) override { - return InstrProf.run(M, getAnalysis<TargetLibraryInfoWrapperPass>().getTLI()); + auto GetTLI = [this](Function &F) -> TargetLibraryInfo & { + return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); + }; + return InstrProf.run(M, GetTLI); } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -370,8 +374,12 @@ private: } // end anonymous namespace PreservedAnalyses InstrProfiling::run(Module &M, ModuleAnalysisManager &AM) { - auto &TLI = AM.getResult<TargetLibraryAnalysis>(M); - if (!run(M, TLI)) + FunctionAnalysisManager &FAM = + AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & { + return FAM.getResult<TargetLibraryAnalysis>(F); + }; + if (!run(M, GetTLI)) return PreservedAnalyses::all(); return PreservedAnalyses::none(); @@ -441,7 +449,7 @@ void InstrProfiling::promoteCounterLoadStores(Function *F) { std::unique_ptr<BlockFrequencyInfo> BFI; if (Options.UseBFIInPromotion) { std::unique_ptr<BranchProbabilityInfo> BPI; - BPI.reset(new BranchProbabilityInfo(*F, LI, TLI)); + BPI.reset(new BranchProbabilityInfo(*F, LI, &GetTLI(*F))); BFI.reset(new BlockFrequencyInfo(*F, *BPI, LI)); } @@ -482,9 +490,10 @@ static bool containsProfilingIntrinsics(Module &M) { return false; } -bool InstrProfiling::run(Module &M, const TargetLibraryInfo &TLI) { +bool InstrProfiling::run( + Module &M, std::function<const TargetLibraryInfo &(Function &F)> GetTLI) { this->M = &M; - this->TLI = &TLI; + this->GetTLI = std::move(GetTLI); NamesVar = nullptr; NamesSize = 0; ProfileDataMap.clear(); @@ -601,6 +610,7 @@ void 
InstrProfiling::lowerValueProfileInst(InstrProfValueProfileInst *Ind) { bool IsRange = (Ind->getValueKind()->getZExtValue() == llvm::InstrProfValueKind::IPVK_MemOPSize); CallInst *Call = nullptr; + auto *TLI = &GetTLI(*Ind->getFunction()); if (!IsRange) { Value *Args[3] = {Ind->getTargetValue(), Builder.CreateBitCast(DataVar, Builder.getInt8PtrTy()), @@ -749,7 +759,6 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { // of the parent function, that will result in relocations against discarded // sections. bool NeedComdat = needsComdatForCounter(*Fn, *M); - Comdat *Cmdt = nullptr; // Comdat group. if (NeedComdat) { if (TT.isOSBinFormatCOFF()) { // For COFF, put the counters, data, and values each into their own @@ -758,14 +767,11 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { // with the same name marked IMAGE_COMDAT_SELECT_ASSOCIATIVE. Linkage = GlobalValue::LinkOnceODRLinkage; Visibility = GlobalValue::HiddenVisibility; - } else { - // Otherwise, create one comdat group for everything. - Cmdt = M->getOrInsertComdat(getVarName(Inc, getInstrProfComdatPrefix())); } } auto MaybeSetComdat = [=](GlobalVariable *GV) { if (NeedComdat) - GV->setComdat(Cmdt ? Cmdt : M->getOrInsertComdat(GV->getName())); + GV->setComdat(M->getOrInsertComdat(GV->getName())); }; uint64_t NumCounters = Inc->getNumCounters()->getZExtValue(); @@ -780,7 +786,7 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { CounterPtr->setVisibility(Visibility); CounterPtr->setSection( getInstrProfSectionName(IPSK_cnts, TT.getObjectFormat())); - CounterPtr->setAlignment(8); + CounterPtr->setAlignment(Align(8)); MaybeSetComdat(CounterPtr); CounterPtr->setLinkage(Linkage); @@ -802,7 +808,7 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { ValuesVar->setVisibility(Visibility); ValuesVar->setSection( getInstrProfSectionName(IPSK_vals, TT.getObjectFormat())); - ValuesVar->setAlignment(8); + ValuesVar->setAlignment(Align(8)); MaybeSetComdat(ValuesVar); ValuesPtrExpr = ConstantExpr::getBitCast(ValuesVar, Type::getInt8PtrTy(Ctx)); @@ -835,7 +841,7 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { getVarName(Inc, getInstrProfDataVarPrefix())); Data->setVisibility(Visibility); Data->setSection(getInstrProfSectionName(IPSK_data, TT.getObjectFormat())); - Data->setAlignment(INSTR_PROF_DATA_ALIGNMENT); + Data->setAlignment(Align(INSTR_PROF_DATA_ALIGNMENT)); MaybeSetComdat(Data); Data->setLinkage(Linkage); @@ -926,7 +932,7 @@ void InstrProfiling::emitNameData() { // On COFF, it's important to reduce the alignment down to 1 to prevent the // linker from inserting padding before the start of the names section or // between names entries. 
- NamesVar->setAlignment(1); + NamesVar->setAlignment(Align::None()); UsedVars.push_back(NamesVar); for (auto *NamePtr : ReferencedNames) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp index f56a1bd91b89..a6c2c9b464b6 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp @@ -68,7 +68,8 @@ GlobalVariable *llvm::createPrivateGlobalForString(Module &M, StringRef Str, GlobalValue::PrivateLinkage, StrConst, NamePrefix); if (AllowMerging) GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); - GV->setAlignment(1); // Strings may not be merged w/o setting align 1. + GV->setAlignment(Align::None()); // Strings may not be merged w/o setting + // alignment explicitly. return GV; } @@ -116,7 +117,7 @@ void llvm::initializeInstrumentation(PassRegistry &Registry) { initializeMemorySanitizerLegacyPassPass(Registry); initializeHWAddressSanitizerLegacyPassPass(Registry); initializeThreadSanitizerLegacyPassPass(Registry); - initializeSanitizerCoverageModulePass(Registry); + initializeModuleSanitizerCoverageLegacyPassPass(Registry); initializeDataFlowSanitizerPass(Registry); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index b25cbed1bb02..f581142df8f7 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -170,12 +170,14 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsX86.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueMap.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Casting.h" @@ -202,8 +204,8 @@ using namespace llvm; #define DEBUG_TYPE "msan" static const unsigned kOriginSize = 4; -static const unsigned kMinOriginAlignment = 4; -static const unsigned kShadowTLSAlignment = 8; +static const Align kMinOriginAlignment = Align(4); +static const Align kShadowTLSAlignment = Align(8); // These constants must be kept in sync with the ones in msan.h. static const unsigned kParamTLSSize = 800; @@ -462,16 +464,9 @@ namespace { /// the module. class MemorySanitizer { public: - MemorySanitizer(Module &M, MemorySanitizerOptions Options) { - this->CompileKernel = - ClEnableKmsan.getNumOccurrences() > 0 ? ClEnableKmsan : Options.Kernel; - if (ClTrackOrigins.getNumOccurrences() > 0) - this->TrackOrigins = ClTrackOrigins; - else - this->TrackOrigins = this->CompileKernel ? 2 : Options.TrackOrigins; - this->Recover = ClKeepGoing.getNumOccurrences() > 0 - ? ClKeepGoing - : (this->CompileKernel | Options.Recover); + MemorySanitizer(Module &M, MemorySanitizerOptions Options) + : CompileKernel(Options.Kernel), TrackOrigins(Options.TrackOrigins), + Recover(Options.Recover) { initializeModule(M); } @@ -594,10 +589,26 @@ private: /// An empty volatile inline asm that prevents callback merge. 
InlineAsm *EmptyAsm; - - Function *MsanCtorFunction; }; +void insertModuleCtor(Module &M) { + getOrCreateSanitizerCtorAndInitFunctions( + M, kMsanModuleCtorName, kMsanInitName, + /*InitArgTypes=*/{}, + /*InitArgs=*/{}, + // This callback is invoked when the functions are created the first + // time. Hook them into the global ctors list in that case: + [&](Function *Ctor, FunctionCallee) { + if (!ClWithComdat) { + appendToGlobalCtors(M, Ctor, 0); + return; + } + Comdat *MsanCtorComdat = M.getOrInsertComdat(kMsanModuleCtorName); + Ctor->setComdat(MsanCtorComdat); + appendToGlobalCtors(M, Ctor, 0, Ctor); + }); +} + /// A legacy function pass for msan instrumentation. /// /// Instruments functions to detect unitialized reads. @@ -615,7 +626,7 @@ struct MemorySanitizerLegacyPass : public FunctionPass { bool runOnFunction(Function &F) override { return MSan->sanitizeFunction( - F, getAnalysis<TargetLibraryInfoWrapperPass>().getTLI()); + F, getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F)); } bool doInitialization(Module &M) override; @@ -623,8 +634,17 @@ struct MemorySanitizerLegacyPass : public FunctionPass { MemorySanitizerOptions Options; }; +template <class T> T getOptOrDefault(const cl::opt<T> &Opt, T Default) { + return (Opt.getNumOccurrences() > 0) ? Opt : Default; +} + } // end anonymous namespace +MemorySanitizerOptions::MemorySanitizerOptions(int TO, bool R, bool K) + : Kernel(getOptOrDefault(ClEnableKmsan, K)), + TrackOrigins(getOptOrDefault(ClTrackOrigins, Kernel ? 2 : TO)), + Recover(getOptOrDefault(ClKeepGoing, Kernel || R)) {} + PreservedAnalyses MemorySanitizerPass::run(Function &F, FunctionAnalysisManager &FAM) { MemorySanitizer Msan(*F.getParent(), Options); @@ -633,6 +653,14 @@ PreservedAnalyses MemorySanitizerPass::run(Function &F, return PreservedAnalyses::all(); } +PreservedAnalyses MemorySanitizerPass::run(Module &M, + ModuleAnalysisManager &AM) { + if (Options.Kernel) + return PreservedAnalyses::all(); + insertModuleCtor(M); + return PreservedAnalyses::none(); +} + char MemorySanitizerLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(MemorySanitizerLegacyPass, "msan", @@ -918,23 +946,6 @@ void MemorySanitizer::initializeModule(Module &M) { OriginStoreWeights = MDBuilder(*C).createBranchWeights(1, 1000); if (!CompileKernel) { - std::tie(MsanCtorFunction, std::ignore) = - getOrCreateSanitizerCtorAndInitFunctions( - M, kMsanModuleCtorName, kMsanInitName, - /*InitArgTypes=*/{}, - /*InitArgs=*/{}, - // This callback is invoked when the functions are created the first - // time. Hook them into the global ctors list in that case: - [&](Function *Ctor, FunctionCallee) { - if (!ClWithComdat) { - appendToGlobalCtors(M, Ctor, 0); - return; - } - Comdat *MsanCtorComdat = M.getOrInsertComdat(kMsanModuleCtorName); - Ctor->setComdat(MsanCtorComdat); - appendToGlobalCtors(M, Ctor, 0, Ctor); - }); - if (TrackOrigins) M.getOrInsertGlobal("__msan_track_origins", IRB.getInt32Ty(), [&] { return new GlobalVariable( @@ -952,6 +963,8 @@ void MemorySanitizer::initializeModule(Module &M) { } bool MemorySanitizerLegacyPass::doInitialization(Module &M) { + if (!Options.Kernel) + insertModuleCtor(M); MSan.emplace(M, Options); return true; } @@ -1075,15 +1088,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// Fill memory range with the given origin value. 
void paintOrigin(IRBuilder<> &IRB, Value *Origin, Value *OriginPtr, - unsigned Size, unsigned Alignment) { + unsigned Size, Align Alignment) { const DataLayout &DL = F.getParent()->getDataLayout(); - unsigned IntptrAlignment = DL.getABITypeAlignment(MS.IntptrTy); + const Align IntptrAlignment = Align(DL.getABITypeAlignment(MS.IntptrTy)); unsigned IntptrSize = DL.getTypeStoreSize(MS.IntptrTy); assert(IntptrAlignment >= kMinOriginAlignment); assert(IntptrSize >= kOriginSize); unsigned Ofs = 0; - unsigned CurrentAlignment = Alignment; + Align CurrentAlignment = Alignment; if (Alignment >= IntptrAlignment && IntptrSize > kOriginSize) { Value *IntptrOrigin = originToIntptr(IRB, Origin); Value *IntptrOriginPtr = @@ -1091,7 +1104,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { for (unsigned i = 0; i < Size / IntptrSize; ++i) { Value *Ptr = i ? IRB.CreateConstGEP1_32(MS.IntptrTy, IntptrOriginPtr, i) : IntptrOriginPtr; - IRB.CreateAlignedStore(IntptrOrigin, Ptr, CurrentAlignment); + IRB.CreateAlignedStore(IntptrOrigin, Ptr, CurrentAlignment.value()); Ofs += IntptrSize / kOriginSize; CurrentAlignment = IntptrAlignment; } @@ -1100,23 +1113,22 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { for (unsigned i = Ofs; i < (Size + kOriginSize - 1) / kOriginSize; ++i) { Value *GEP = i ? IRB.CreateConstGEP1_32(MS.OriginTy, OriginPtr, i) : OriginPtr; - IRB.CreateAlignedStore(Origin, GEP, CurrentAlignment); + IRB.CreateAlignedStore(Origin, GEP, CurrentAlignment.value()); CurrentAlignment = kMinOriginAlignment; } } void storeOrigin(IRBuilder<> &IRB, Value *Addr, Value *Shadow, Value *Origin, - Value *OriginPtr, unsigned Alignment, bool AsCall) { + Value *OriginPtr, Align Alignment, bool AsCall) { const DataLayout &DL = F.getParent()->getDataLayout(); - unsigned OriginAlignment = std::max(kMinOriginAlignment, Alignment); + const Align OriginAlignment = std::max(kMinOriginAlignment, Alignment); unsigned StoreSize = DL.getTypeStoreSize(Shadow->getType()); if (Shadow->getType()->isAggregateType()) { paintOrigin(IRB, updateOrigin(Origin, IRB), OriginPtr, StoreSize, OriginAlignment); } else { Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB); - Constant *ConstantShadow = dyn_cast_or_null<Constant>(ConvertedShadow); - if (ConstantShadow) { + if (auto *ConstantShadow = dyn_cast<Constant>(ConvertedShadow)) { if (ClCheckConstantShadow && !ConstantShadow->isZeroValue()) paintOrigin(IRB, updateOrigin(Origin, IRB), OriginPtr, StoreSize, OriginAlignment); @@ -1153,12 +1165,13 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *Shadow = SI->isAtomic() ? 
getCleanShadow(Val) : getShadow(Val); Value *ShadowPtr, *OriginPtr; Type *ShadowTy = Shadow->getType(); - unsigned Alignment = SI->getAlignment(); - unsigned OriginAlignment = std::max(kMinOriginAlignment, Alignment); + const Align Alignment = assumeAligned(SI->getAlignment()); + const Align OriginAlignment = std::max(kMinOriginAlignment, Alignment); std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ true); - StoreInst *NewSI = IRB.CreateAlignedStore(Shadow, ShadowPtr, Alignment); + StoreInst *NewSI = + IRB.CreateAlignedStore(Shadow, ShadowPtr, Alignment.value()); LLVM_DEBUG(dbgs() << " STORE: " << *NewSI << "\n"); (void)NewSI; @@ -1196,8 +1209,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB); LLVM_DEBUG(dbgs() << " SHAD1 : " << *ConvertedShadow << "\n"); - Constant *ConstantShadow = dyn_cast_or_null<Constant>(ConvertedShadow); - if (ConstantShadow) { + if (auto *ConstantShadow = dyn_cast<Constant>(ConvertedShadow)) { if (ClCheckConstantShadow && !ConstantShadow->isZeroValue()) { insertWarningFn(IRB, Origin); } @@ -1392,10 +1404,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// /// Shadow = ShadowBase + Offset /// Origin = (OriginBase + Offset) & ~3ULL - std::pair<Value *, Value *> getShadowOriginPtrUserspace(Value *Addr, - IRBuilder<> &IRB, - Type *ShadowTy, - unsigned Alignment) { + std::pair<Value *, Value *> + getShadowOriginPtrUserspace(Value *Addr, IRBuilder<> &IRB, Type *ShadowTy, + MaybeAlign Alignment) { Value *ShadowOffset = getShadowPtrOffset(Addr, IRB); Value *ShadowLong = ShadowOffset; uint64_t ShadowBase = MS.MapParams->ShadowBase; @@ -1413,8 +1424,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { if (OriginBase != 0) OriginLong = IRB.CreateAdd(OriginLong, ConstantInt::get(MS.IntptrTy, OriginBase)); - if (Alignment < kMinOriginAlignment) { - uint64_t Mask = kMinOriginAlignment - 1; + if (!Alignment || *Alignment < kMinOriginAlignment) { + uint64_t Mask = kMinOriginAlignment.value() - 1; OriginLong = IRB.CreateAnd(OriginLong, ConstantInt::get(MS.IntptrTy, ~Mask)); } @@ -1424,9 +1435,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { return std::make_pair(ShadowPtr, OriginPtr); } - std::pair<Value *, Value *> - getShadowOriginPtrKernel(Value *Addr, IRBuilder<> &IRB, Type *ShadowTy, - unsigned Alignment, bool isStore) { + std::pair<Value *, Value *> getShadowOriginPtrKernel(Value *Addr, + IRBuilder<> &IRB, + Type *ShadowTy, + bool isStore) { Value *ShadowOriginPtrs; const DataLayout &DL = F.getParent()->getDataLayout(); int Size = DL.getTypeStoreSize(ShadowTy); @@ -1451,14 +1463,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { std::pair<Value *, Value *> getShadowOriginPtr(Value *Addr, IRBuilder<> &IRB, Type *ShadowTy, - unsigned Alignment, + MaybeAlign Alignment, bool isStore) { - std::pair<Value *, Value *> ret; if (MS.CompileKernel) - ret = getShadowOriginPtrKernel(Addr, IRB, ShadowTy, Alignment, isStore); - else - ret = getShadowOriginPtrUserspace(Addr, IRB, ShadowTy, Alignment); - return ret; + return getShadowOriginPtrKernel(Addr, IRB, ShadowTy, isStore); + return getShadowOriginPtrUserspace(Addr, IRB, ShadowTy, Alignment); } /// Compute the shadow address for a given function argument. 
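The getShadowOriginPtrUserspace changes above keep the mapping documented in its comment: the shadow address is ShadowBase plus a target-specific offset derived from the application address, and the origin address is the same offset rebased onto OriginBase and aligned down to the 4-byte origin cell. A minimal sketch of that arithmetic follows; the offset computation itself (getShadowPtrOffset) is target specific and left abstract here, so this is an illustration rather than the actual mapping code:

// Illustration only: the userspace shadow/origin mapping described in the
// comment above getShadowOriginPtrUserspace:
//   Shadow = ShadowBase + Offset
//   Origin = (OriginBase + Offset) & ~3ULL
#include <cstdint>

struct ShadowOrigin {
  uint64_t Shadow;
  uint64_t Origin;
};

// "Offset" stands in for the target-specific getShadowPtrOffset() result,
// which is not shown in this hunk.
ShadowOrigin mapUserspace(uint64_t Offset, uint64_t ShadowBase,
                          uint64_t OriginBase) {
  ShadowOrigin R;
  R.Shadow = ShadowBase + Offset;
  // Origin cells are kOriginSize (4) bytes wide, so align down to 4; the
  // patch skips this mask when the access is already known to be at least
  // 4-byte aligned.
  R.Origin = (OriginBase + Offset) & ~3ULL;
  return R;
}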
@@ -1608,11 +1617,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // ByVal pointer itself has clean shadow. We copy the actual // argument shadow to the underlying memory. // Figure out maximal valid memcpy alignment. - unsigned ArgAlign = FArg.getParamAlignment(); - if (ArgAlign == 0) { - Type *EltType = A->getType()->getPointerElementType(); - ArgAlign = DL.getABITypeAlignment(EltType); - } + const Align ArgAlign = DL.getValueOrABITypeAlignment( + MaybeAlign(FArg.getParamAlignment()), + A->getType()->getPointerElementType()); Value *CpShadowPtr = getShadowOriginPtr(V, EntryIRB, EntryIRB.getInt8Ty(), ArgAlign, /*isStore*/ true) @@ -1624,7 +1631,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { CpShadowPtr, Constant::getNullValue(EntryIRB.getInt8Ty()), Size, ArgAlign); } else { - unsigned CopyAlign = std::min(ArgAlign, kShadowTLSAlignment); + const Align CopyAlign = std::min(ArgAlign, kShadowTLSAlignment); Value *Cpy = EntryIRB.CreateMemCpy(CpShadowPtr, CopyAlign, Base, CopyAlign, Size); LLVM_DEBUG(dbgs() << " ByValCpy: " << *Cpy << "\n"); @@ -1636,8 +1643,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // ParamTLS overflow. *ShadowPtr = getCleanShadow(V); } else { - *ShadowPtr = EntryIRB.CreateAlignedLoad(getShadowTy(&FArg), Base, - kShadowTLSAlignment); + *ShadowPtr = EntryIRB.CreateAlignedLoad( + getShadowTy(&FArg), Base, kShadowTLSAlignment.value()); } } LLVM_DEBUG(dbgs() @@ -1771,13 +1778,13 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRBuilder<> IRB(I.getNextNode()); Type *ShadowTy = getShadowTy(&I); Value *Addr = I.getPointerOperand(); - Value *ShadowPtr, *OriginPtr; - unsigned Alignment = I.getAlignment(); + Value *ShadowPtr = nullptr, *OriginPtr = nullptr; + const Align Alignment = assumeAligned(I.getAlignment()); if (PropagateShadow) { std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ false); - setShadow(&I, - IRB.CreateAlignedLoad(ShadowTy, ShadowPtr, Alignment, "_msld")); + setShadow(&I, IRB.CreateAlignedLoad(ShadowTy, ShadowPtr, + Alignment.value(), "_msld")); } else { setShadow(&I, getCleanShadow(&I)); } @@ -1790,9 +1797,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { if (MS.TrackOrigins) { if (PropagateShadow) { - unsigned OriginAlignment = std::max(kMinOriginAlignment, Alignment); - setOrigin( - &I, IRB.CreateAlignedLoad(MS.OriginTy, OriginPtr, OriginAlignment)); + const Align OriginAlignment = std::max(kMinOriginAlignment, Alignment); + setOrigin(&I, IRB.CreateAlignedLoad(MS.OriginTy, OriginPtr, + OriginAlignment.value())); } else { setOrigin(&I, getCleanOrigin()); } @@ -1814,8 +1821,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRBuilder<> IRB(&I); Value *Addr = I.getOperand(0); - Value *ShadowPtr = getShadowOriginPtr(Addr, IRB, I.getType(), - /*Alignment*/ 1, /*isStore*/ true) + Value *ShadowPtr = getShadowOriginPtr(Addr, IRB, I.getType(), Align::None(), + /*isStore*/ true) .first; if (ClCheckAccessAddress) @@ -2447,7 +2454,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // We don't know the pointer alignment (could be unaligned SSE store!). // Have to assume to worst case. 
std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr( - Addr, IRB, Shadow->getType(), /*Alignment*/ 1, /*isStore*/ true); + Addr, IRB, Shadow->getType(), Align::None(), /*isStore*/ true); IRB.CreateAlignedStore(Shadow, ShadowPtr, 1); if (ClCheckAccessAddress) @@ -2467,15 +2474,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *Addr = I.getArgOperand(0); Type *ShadowTy = getShadowTy(&I); - Value *ShadowPtr, *OriginPtr; + Value *ShadowPtr = nullptr, *OriginPtr = nullptr; if (PropagateShadow) { // We don't know the pointer alignment (could be unaligned SSE load!). // Have to assume to worst case. - unsigned Alignment = 1; + const Align Alignment = Align::None(); std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ false); - setShadow(&I, - IRB.CreateAlignedLoad(ShadowTy, ShadowPtr, Alignment, "_msld")); + setShadow(&I, IRB.CreateAlignedLoad(ShadowTy, ShadowPtr, + Alignment.value(), "_msld")); } else { setShadow(&I, getCleanShadow(&I)); } @@ -2562,6 +2569,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { return false; } + void handleInvariantGroup(IntrinsicInst &I) { + setShadow(&I, getShadow(&I, 0)); + setOrigin(&I, getOrigin(&I, 0)); + } + void handleLifetimeStart(IntrinsicInst &I) { if (!PoisonStack) return; @@ -2857,7 +2869,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value* Addr = I.getArgOperand(0); Type *Ty = IRB.getInt32Ty(); Value *ShadowPtr = - getShadowOriginPtr(Addr, IRB, Ty, /*Alignment*/ 1, /*isStore*/ true) + getShadowOriginPtr(Addr, IRB, Ty, Align::None(), /*isStore*/ true) .first; IRB.CreateStore(getCleanShadow(Ty), @@ -2873,7 +2885,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRBuilder<> IRB(&I); Value *Addr = I.getArgOperand(0); Type *Ty = IRB.getInt32Ty(); - unsigned Alignment = 1; + const Align Alignment = Align::None(); Value *ShadowPtr, *OriginPtr; std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr(Addr, IRB, Ty, Alignment, /*isStore*/ false); @@ -2881,7 +2893,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { if (ClCheckAccessAddress) insertShadowCheck(Addr, &I); - Value *Shadow = IRB.CreateAlignedLoad(Ty, ShadowPtr, Alignment, "_ldmxcsr"); + Value *Shadow = + IRB.CreateAlignedLoad(Ty, ShadowPtr, Alignment.value(), "_ldmxcsr"); Value *Origin = MS.TrackOrigins ? IRB.CreateLoad(MS.OriginTy, OriginPtr) : getCleanOrigin(); insertShadowCheck(Shadow, Origin, &I); @@ -2891,14 +2904,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRBuilder<> IRB(&I); Value *V = I.getArgOperand(0); Value *Addr = I.getArgOperand(1); - unsigned Align = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue(); + const MaybeAlign Alignment( + cast<ConstantInt>(I.getArgOperand(2))->getZExtValue()); Value *Mask = I.getArgOperand(3); Value *Shadow = getShadow(V); Value *ShadowPtr; Value *OriginPtr; std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr( - Addr, IRB, Shadow->getType(), Align, /*isStore*/ true); + Addr, IRB, Shadow->getType(), Alignment, /*isStore*/ true); if (ClCheckAccessAddress) { insertShadowCheck(Addr, &I); @@ -2907,20 +2921,22 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { insertShadowCheck(Mask, &I); } - IRB.CreateMaskedStore(Shadow, ShadowPtr, Align, Mask); + IRB.CreateMaskedStore(Shadow, ShadowPtr, Alignment ? 
Alignment->value() : 0, + Mask); if (MS.TrackOrigins) { auto &DL = F.getParent()->getDataLayout(); paintOrigin(IRB, getOrigin(V), OriginPtr, DL.getTypeStoreSize(Shadow->getType()), - std::max(Align, kMinOriginAlignment)); + llvm::max(Alignment, kMinOriginAlignment)); } } bool handleMaskedLoad(IntrinsicInst &I) { IRBuilder<> IRB(&I); Value *Addr = I.getArgOperand(0); - unsigned Align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue(); + const MaybeAlign Alignment( + cast<ConstantInt>(I.getArgOperand(1))->getZExtValue()); Value *Mask = I.getArgOperand(2); Value *PassThru = I.getArgOperand(3); @@ -2928,9 +2944,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *ShadowPtr, *OriginPtr; if (PropagateShadow) { std::tie(ShadowPtr, OriginPtr) = - getShadowOriginPtr(Addr, IRB, ShadowTy, Align, /*isStore*/ false); - setShadow(&I, IRB.CreateMaskedLoad(ShadowPtr, Align, Mask, - getShadow(PassThru), "_msmaskedld")); + getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ false); + setShadow(&I, IRB.CreateMaskedLoad( + ShadowPtr, Alignment ? Alignment->value() : 0, Mask, + getShadow(PassThru), "_msmaskedld")); } else { setShadow(&I, getCleanShadow(&I)); } @@ -2988,11 +3005,52 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { setOriginForNaryOp(I); } + Constant *getPclmulMask(IRBuilder<> &IRB, unsigned Width, bool OddElements) { + SmallVector<Constant *, 8> Mask; + for (unsigned X = OddElements ? 1 : 0; X < Width; X += 2) { + Constant *C = ConstantInt::get(IRB.getInt32Ty(), X); + Mask.push_back(C); + Mask.push_back(C); + } + return ConstantVector::get(Mask); + } + + // Instrument pclmul intrinsics. + // These intrinsics operate either on odd or on even elements of the input + // vectors, depending on the constant in the 3rd argument, ignoring the rest. + // Replace the unused elements with copies of the used ones, ex: + // (0, 1, 2, 3) -> (0, 0, 2, 2) (even case) + // or + // (0, 1, 2, 3) -> (1, 1, 3, 3) (odd case) + // and then apply the usual shadow combining logic. + void handlePclmulIntrinsic(IntrinsicInst &I) { + IRBuilder<> IRB(&I); + Type *ShadowTy = getShadowTy(&I); + unsigned Width = I.getArgOperand(0)->getType()->getVectorNumElements(); + assert(isa<ConstantInt>(I.getArgOperand(2)) && + "pclmul 3rd operand must be a constant"); + unsigned Imm = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue(); + Value *Shuf0 = + IRB.CreateShuffleVector(getShadow(&I, 0), UndefValue::get(ShadowTy), + getPclmulMask(IRB, Width, Imm & 0x01)); + Value *Shuf1 = + IRB.CreateShuffleVector(getShadow(&I, 1), UndefValue::get(ShadowTy), + getPclmulMask(IRB, Width, Imm & 0x10)); + ShadowAndOriginCombiner SOC(this, IRB); + SOC.Add(Shuf0, getOrigin(&I, 0)); + SOC.Add(Shuf1, getOrigin(&I, 1)); + SOC.Done(&I); + } + void visitIntrinsicInst(IntrinsicInst &I) { switch (I.getIntrinsicID()) { case Intrinsic::lifetime_start: handleLifetimeStart(I); break; + case Intrinsic::launder_invariant_group: + case Intrinsic::strip_invariant_group: + handleInvariantGroup(I); + break; case Intrinsic::bswap: handleBswap(I); break; @@ -3217,6 +3275,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { handleBmiIntrinsic(I); break; + case Intrinsic::x86_pclmulqdq: + case Intrinsic::x86_pclmulqdq_256: + case Intrinsic::x86_pclmulqdq_512: + handlePclmulIntrinsic(I); + break; + case Intrinsic::is_constant: // The result of llvm.is.constant() is always defined. 
setShadow(&I, getCleanShadow(&I)); @@ -3258,7 +3322,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // Clear out readonly/readnone attributes. AttrBuilder B; B.addAttribute(Attribute::ReadOnly) - .addAttribute(Attribute::ReadNone); + .addAttribute(Attribute::ReadNone) + .addAttribute(Attribute::WriteOnly) + .addAttribute(Attribute::ArgMemOnly) + .addAttribute(Attribute::Speculatable); Func->removeAttributes(AttributeList::FunctionIndex, B); } @@ -3292,8 +3359,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { "ByVal argument is not a pointer!"); Size = DL.getTypeAllocSize(A->getType()->getPointerElementType()); if (ArgOffset + Size > kParamTLSSize) break; - unsigned ParamAlignment = CS.getParamAlignment(i); - unsigned Alignment = std::min(ParamAlignment, kShadowTLSAlignment); + const MaybeAlign ParamAlignment(CS.getParamAlignment(i)); + MaybeAlign Alignment = llvm::None; + if (ParamAlignment) + Alignment = std::min(*ParamAlignment, kShadowTLSAlignment); Value *AShadowPtr = getShadowOriginPtr(A, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ false) @@ -3306,7 +3375,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Size = DL.getTypeAllocSize(A->getType()); if (ArgOffset + Size > kParamTLSSize) break; Store = IRB.CreateAlignedStore(ArgShadow, ArgShadowBase, - kShadowTLSAlignment); + kShadowTLSAlignment.value()); Constant *Cst = dyn_cast<Constant>(ArgShadow); if (Cst && Cst->isNullValue()) ArgIsInitialized = true; } @@ -3332,7 +3401,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRBuilder<> IRBBefore(&I); // Until we have full dynamic coverage, make sure the retval shadow is 0. Value *Base = getShadowPtrForRetval(&I, IRBBefore); - IRBBefore.CreateAlignedStore(getCleanShadow(&I), Base, kShadowTLSAlignment); + IRBBefore.CreateAlignedStore(getCleanShadow(&I), Base, + kShadowTLSAlignment.value()); BasicBlock::iterator NextInsn; if (CS.isCall()) { NextInsn = ++I.getIterator(); @@ -3356,7 +3426,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRBuilder<> IRBAfter(&*NextInsn); Value *RetvalShadow = IRBAfter.CreateAlignedLoad( getShadowTy(&I), getShadowPtrForRetval(&I, IRBAfter), - kShadowTLSAlignment, "_msret"); + kShadowTLSAlignment.value(), "_msret"); setShadow(&I, RetvalShadow); if (MS.TrackOrigins) setOrigin(&I, IRBAfter.CreateLoad(MS.OriginTy, @@ -3383,10 +3453,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { if (CheckReturnValue) { insertShadowCheck(RetVal, &I); Value *Shadow = getCleanShadow(RetVal); - IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment); + IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment.value()); } else { Value *Shadow = getShadow(RetVal); - IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment); + IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment.value()); if (MS.TrackOrigins) IRB.CreateStore(getOrigin(RetVal), getOriginPtrForRetval(IRB)); } @@ -3427,11 +3497,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), Len}); } else { Value *ShadowBase, *OriginBase; - std::tie(ShadowBase, OriginBase) = - getShadowOriginPtr(&I, IRB, IRB.getInt8Ty(), 1, /*isStore*/ true); + std::tie(ShadowBase, OriginBase) = getShadowOriginPtr( + &I, IRB, IRB.getInt8Ty(), Align::None(), /*isStore*/ true); Value *PoisonValue = IRB.getInt8(PoisonStack ? 
ClPoisonStackPattern : 0); - IRB.CreateMemSet(ShadowBase, PoisonValue, Len, I.getAlignment()); + IRB.CreateMemSet(ShadowBase, PoisonValue, Len, + MaybeAlign(I.getAlignment())); } if (PoisonStack && MS.TrackOrigins) { @@ -3627,10 +3698,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { int getNumOutputArgs(InlineAsm *IA, CallBase *CB) { int NumRetOutputs = 0; int NumOutputs = 0; - Type *RetTy = dyn_cast<Value>(CB)->getType(); + Type *RetTy = cast<Value>(CB)->getType(); if (!RetTy->isVoidTy()) { // Register outputs are returned via the CallInst return value. - StructType *ST = dyn_cast_or_null<StructType>(RetTy); + auto *ST = dyn_cast<StructType>(RetTy); if (ST) NumRetOutputs = ST->getNumElements(); else @@ -3667,7 +3738,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // corresponding CallInst has nO+nI+1 operands (the last operand is the // function to be called). const DataLayout &DL = F.getParent()->getDataLayout(); - CallBase *CB = dyn_cast<CallBase>(&I); + CallBase *CB = cast<CallBase>(&I); IRBuilder<> IRB(&I); InlineAsm *IA = cast<InlineAsm>(CB->getCalledValue()); int OutputArgs = getNumOutputArgs(IA, CB); @@ -3843,7 +3914,7 @@ struct VarArgAMD64Helper : public VarArgHelper { if (!ShadowBase) continue; Value *Shadow = MSV.getShadow(A); - IRB.CreateAlignedStore(Shadow, ShadowBase, kShadowTLSAlignment); + IRB.CreateAlignedStore(Shadow, ShadowBase, kShadowTLSAlignment.value()); if (MS.TrackOrigins) { Value *Origin = MSV.getOrigin(A); unsigned StoreSize = DL.getTypeStoreSize(Shadow->getType()); @@ -3884,7 +3955,7 @@ struct VarArgAMD64Helper : public VarArgHelper { IRBuilder<> IRB(&I); Value *VAListTag = I.getArgOperand(0); Value *ShadowPtr, *OriginPtr; - unsigned Alignment = 8; + const Align Alignment = Align(8); std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true); @@ -3922,10 +3993,11 @@ struct VarArgAMD64Helper : public VarArgHelper { IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, AMD64FpEndOffset), VAArgOverflowSize); VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize); - IRB.CreateMemCpy(VAArgTLSCopy, 8, MS.VAArgTLS, 8, CopySize); + IRB.CreateMemCpy(VAArgTLSCopy, Align(8), MS.VAArgTLS, Align(8), CopySize); if (MS.TrackOrigins) { VAArgTLSOriginCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize); - IRB.CreateMemCpy(VAArgTLSOriginCopy, 8, MS.VAArgOriginTLS, 8, CopySize); + IRB.CreateMemCpy(VAArgTLSOriginCopy, Align(8), MS.VAArgOriginTLS, + Align(8), CopySize); } } @@ -3944,7 +4016,7 @@ struct VarArgAMD64Helper : public VarArgHelper { Value *RegSaveAreaPtr = IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr); Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr; - unsigned Alignment = 16; + const Align Alignment = Align(16); std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) = MSV.getShadowOriginPtr(RegSaveAreaPtr, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true); @@ -4012,7 +4084,8 @@ struct VarArgMIPS64Helper : public VarArgHelper { VAArgOffset = alignTo(VAArgOffset, 8); if (!Base) continue; - IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment); + IRB.CreateAlignedStore(MSV.getShadow(A), Base, + kShadowTLSAlignment.value()); } Constant *TotalVAArgSize = ConstantInt::get(IRB.getInt64Ty(), VAArgOffset); @@ -4038,7 +4111,7 @@ struct VarArgMIPS64Helper : public VarArgHelper { VAStartInstrumentationList.push_back(&I); Value *VAListTag = I.getArgOperand(0); Value *ShadowPtr, *OriginPtr; - unsigned Alignment = 8; + const Align 
Alignment = Align(8); std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr( VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true); IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()), @@ -4050,7 +4123,7 @@ struct VarArgMIPS64Helper : public VarArgHelper { VAStartInstrumentationList.push_back(&I); Value *VAListTag = I.getArgOperand(0); Value *ShadowPtr, *OriginPtr; - unsigned Alignment = 8; + const Align Alignment = Align(8); std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr( VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true); IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()), @@ -4069,7 +4142,7 @@ struct VarArgMIPS64Helper : public VarArgHelper { // If there is a va_start in this function, make a backup copy of // va_arg_tls somewhere in the function entry block. VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize); - IRB.CreateMemCpy(VAArgTLSCopy, 8, MS.VAArgTLS, 8, CopySize); + IRB.CreateMemCpy(VAArgTLSCopy, Align(8), MS.VAArgTLS, Align(8), CopySize); } // Instrument va_start. @@ -4085,7 +4158,7 @@ struct VarArgMIPS64Helper : public VarArgHelper { Value *RegSaveAreaPtr = IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr); Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr; - unsigned Alignment = 8; + const Align Alignment = Align(8); std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) = MSV.getShadowOriginPtr(RegSaveAreaPtr, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true); @@ -4183,7 +4256,8 @@ struct VarArgAArch64Helper : public VarArgHelper { continue; if (!Base) continue; - IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment); + IRB.CreateAlignedStore(MSV.getShadow(A), Base, + kShadowTLSAlignment.value()); } Constant *OverflowSize = ConstantInt::get(IRB.getInt64Ty(), OverflowOffset - AArch64VAEndOffset); @@ -4207,7 +4281,7 @@ struct VarArgAArch64Helper : public VarArgHelper { VAStartInstrumentationList.push_back(&I); Value *VAListTag = I.getArgOperand(0); Value *ShadowPtr, *OriginPtr; - unsigned Alignment = 8; + const Align Alignment = Align(8); std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr( VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true); IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()), @@ -4219,7 +4293,7 @@ struct VarArgAArch64Helper : public VarArgHelper { VAStartInstrumentationList.push_back(&I); Value *VAListTag = I.getArgOperand(0); Value *ShadowPtr, *OriginPtr; - unsigned Alignment = 8; + const Align Alignment = Align(8); std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr( VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true); IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()), @@ -4260,7 +4334,7 @@ struct VarArgAArch64Helper : public VarArgHelper { IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, AArch64VAEndOffset), VAArgOverflowSize); VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize); - IRB.CreateMemCpy(VAArgTLSCopy, 8, MS.VAArgTLS, 8, CopySize); + IRB.CreateMemCpy(VAArgTLSCopy, Align(8), MS.VAArgTLS, Align(8), CopySize); } Value *GrArgSize = ConstantInt::get(MS.IntptrTy, kAArch64GrArgSize); @@ -4311,14 +4385,15 @@ struct VarArgAArch64Helper : public VarArgHelper { Value *GrRegSaveAreaShadowPtr = MSV.getShadowOriginPtr(GrRegSaveAreaPtr, IRB, IRB.getInt8Ty(), - /*Alignment*/ 8, /*isStore*/ true) + Align(8), /*isStore*/ true) .first; Value *GrSrcPtr = IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy, GrRegSaveAreaShadowPtrOff); Value *GrCopySize = IRB.CreateSub(GrArgSize, 
GrRegSaveAreaShadowPtrOff); - IRB.CreateMemCpy(GrRegSaveAreaShadowPtr, 8, GrSrcPtr, 8, GrCopySize); + IRB.CreateMemCpy(GrRegSaveAreaShadowPtr, Align(8), GrSrcPtr, Align(8), + GrCopySize); // Again, but for FP/SIMD values. Value *VrRegSaveAreaShadowPtrOff = @@ -4326,7 +4401,7 @@ struct VarArgAArch64Helper : public VarArgHelper { Value *VrRegSaveAreaShadowPtr = MSV.getShadowOriginPtr(VrRegSaveAreaPtr, IRB, IRB.getInt8Ty(), - /*Alignment*/ 8, /*isStore*/ true) + Align(8), /*isStore*/ true) .first; Value *VrSrcPtr = IRB.CreateInBoundsGEP( @@ -4336,20 +4411,21 @@ struct VarArgAArch64Helper : public VarArgHelper { VrRegSaveAreaShadowPtrOff); Value *VrCopySize = IRB.CreateSub(VrArgSize, VrRegSaveAreaShadowPtrOff); - IRB.CreateMemCpy(VrRegSaveAreaShadowPtr, 8, VrSrcPtr, 8, VrCopySize); + IRB.CreateMemCpy(VrRegSaveAreaShadowPtr, Align(8), VrSrcPtr, Align(8), + VrCopySize); // And finally for remaining arguments. Value *StackSaveAreaShadowPtr = MSV.getShadowOriginPtr(StackSaveAreaPtr, IRB, IRB.getInt8Ty(), - /*Alignment*/ 16, /*isStore*/ true) + Align(16), /*isStore*/ true) .first; Value *StackSrcPtr = IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy, IRB.getInt32(AArch64VAEndOffset)); - IRB.CreateMemCpy(StackSaveAreaShadowPtr, 16, StackSrcPtr, 16, - VAArgOverflowSize); + IRB.CreateMemCpy(StackSaveAreaShadowPtr, Align(16), StackSrcPtr, + Align(16), VAArgOverflowSize); } } }; @@ -4441,7 +4517,8 @@ struct VarArgPowerPC64Helper : public VarArgHelper { Base = getShadowPtrForVAArgument(A->getType(), IRB, VAArgOffset - VAArgBase, ArgSize); if (Base) - IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment); + IRB.CreateAlignedStore(MSV.getShadow(A), Base, + kShadowTLSAlignment.value()); } VAArgOffset += ArgSize; VAArgOffset = alignTo(VAArgOffset, 8); @@ -4474,7 +4551,7 @@ struct VarArgPowerPC64Helper : public VarArgHelper { VAStartInstrumentationList.push_back(&I); Value *VAListTag = I.getArgOperand(0); Value *ShadowPtr, *OriginPtr; - unsigned Alignment = 8; + const Align Alignment = Align(8); std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr( VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true); IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()), @@ -4485,7 +4562,7 @@ struct VarArgPowerPC64Helper : public VarArgHelper { IRBuilder<> IRB(&I); Value *VAListTag = I.getArgOperand(0); Value *ShadowPtr, *OriginPtr; - unsigned Alignment = 8; + const Align Alignment = Align(8); std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr( VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true); // Unpoison the whole __va_list_tag. @@ -4506,7 +4583,7 @@ struct VarArgPowerPC64Helper : public VarArgHelper { // If there is a va_start in this function, make a backup copy of // va_arg_tls somewhere in the function entry block. VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize); - IRB.CreateMemCpy(VAArgTLSCopy, 8, MS.VAArgTLS, 8, CopySize); + IRB.CreateMemCpy(VAArgTLSCopy, Align(8), MS.VAArgTLS, Align(8), CopySize); } // Instrument va_start. 
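Nearly every hunk in these VarArg helpers is the same unsigned-to-Align migration seen earlier in the file. A short sketch, assuming the llvm/Support/Alignment.h API this patch targets, of the three idioms it keeps reusing: assumeAligned() for the legacy "0 means unknown" encoding, Align::None() for an explicit 1-byte alignment, and "A ? A->value() : 0" where an IRBuilder overload still takes a raw integer:

#include "llvm/Support/Alignment.h"
#include <algorithm>
#include <cstdint>
using namespace llvm;

// Sketch of the alignment idioms used throughout this patch (illustrative).
uint64_t alignmentIdioms(unsigned LegacyAlign) {
  const Align Known = assumeAligned(LegacyAlign); // 0 is promoted to Align(1)
  const Align One = Align::None();                // named constant for Align(1)
  const MaybeAlign Maybe(LegacyAlign);            // keeps "unknown" representable
  uint64_t Raw = Maybe ? Maybe->value() : 0;      // hand a raw value to older APIs
  return std::max(Known, One).value() + Raw;
}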
@@ -4522,7 +4599,7 @@ struct VarArgPowerPC64Helper : public VarArgHelper { Value *RegSaveAreaPtr = IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr); Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr; - unsigned Alignment = 8; + const Align Alignment = Align(8); std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) = MSV.getShadowOriginPtr(RegSaveAreaPtr, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true); @@ -4567,14 +4644,18 @@ static VarArgHelper *CreateVarArgHelper(Function &Func, MemorySanitizer &Msan, } bool MemorySanitizer::sanitizeFunction(Function &F, TargetLibraryInfo &TLI) { - if (!CompileKernel && (&F == MsanCtorFunction)) + if (!CompileKernel && F.getName() == kMsanModuleCtorName) return false; + MemorySanitizerVisitor Visitor(F, *this, TLI); // Clear out readonly/readnone attributes. AttrBuilder B; B.addAttribute(Attribute::ReadOnly) - .addAttribute(Attribute::ReadNone); + .addAttribute(Attribute::ReadNone) + .addAttribute(Attribute::WriteOnly) + .addAttribute(Attribute::ArgMemOnly) + .addAttribute(Attribute::Speculatable); F.removeAttributes(AttributeList::FunctionIndex, B); return Visitor.runOnFunction(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 6fec3c9c79ee..cc96bdd1d516 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -47,7 +47,9 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h" #include "CFGMST.h" +#include "ValueProfileCollector.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" @@ -61,7 +63,6 @@ #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/CFG.h" -#include "llvm/Analysis/IndirectCallVisitor.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ProfileSummaryInfo.h" @@ -92,10 +93,12 @@ #include "llvm/IR/ProfileSummary.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/ProfileData/InstrProfReader.h" #include "llvm/Support/BranchProbability.h" +#include "llvm/Support/CRC.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/DOTGraphTraits.h" @@ -103,11 +106,10 @@ #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GraphWriter.h" -#include "llvm/Support/JamCRC.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Instrumentation.h" -#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/MisExpect.h" #include <algorithm> #include <cassert> #include <cstdint> @@ -120,6 +122,7 @@ using namespace llvm; using ProfileCount = Function::ProfileCount; +using VPCandidateInfo = ValueProfileCollector::CandidateInfo; #define DEBUG_TYPE "pgo-instrumentation" @@ -286,6 +289,11 @@ static std::string getBranchCondString(Instruction *TI) { return result; } +static const char *ValueProfKindDescr[] = { +#define VALUE_PROF_KIND(Enumerator, Value, Descr) Descr, +#include "llvm/ProfileData/InstrProfData.inc" +}; + namespace { /// The select 
instruction visitor plays three roles specified @@ -348,50 +356,6 @@ struct SelectInstVisitor : public InstVisitor<SelectInstVisitor> { unsigned getNumOfSelectInsts() const { return NSIs; } }; -/// Instruction Visitor class to visit memory intrinsic calls. -struct MemIntrinsicVisitor : public InstVisitor<MemIntrinsicVisitor> { - Function &F; - unsigned NMemIs = 0; // Number of memIntrinsics instrumented. - VisitMode Mode = VM_counting; // Visiting mode. - unsigned CurCtrId = 0; // Current counter index. - unsigned TotalNumCtrs = 0; // Total number of counters - GlobalVariable *FuncNameVar = nullptr; - uint64_t FuncHash = 0; - PGOUseFunc *UseFunc = nullptr; - std::vector<Instruction *> Candidates; - - MemIntrinsicVisitor(Function &Func) : F(Func) {} - - void countMemIntrinsics(Function &Func) { - NMemIs = 0; - Mode = VM_counting; - visit(Func); - } - - void instrumentMemIntrinsics(Function &Func, unsigned TotalNC, - GlobalVariable *FNV, uint64_t FHash) { - Mode = VM_instrument; - TotalNumCtrs = TotalNC; - FuncHash = FHash; - FuncNameVar = FNV; - visit(Func); - } - - std::vector<Instruction *> findMemIntrinsics(Function &Func) { - Candidates.clear(); - Mode = VM_annotate; - visit(Func); - return Candidates; - } - - // Visit the IR stream and annotate all mem intrinsic call instructions. - void instrumentOneMemIntrinsic(MemIntrinsic &MI); - - // Visit \p MI instruction and perform tasks according to visit mode. - void visitMemIntrinsic(MemIntrinsic &SI); - - unsigned getNumOfMemIntrinsics() const { return NMemIs; } -}; class PGOInstrumentationGenLegacyPass : public ModulePass { public: @@ -563,13 +527,14 @@ private: // A map that stores the Comdat group in function F. std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers; + ValueProfileCollector VPC; + void computeCFGHash(); void renameComdatFunction(); public: - std::vector<std::vector<Instruction *>> ValueSites; + std::vector<std::vector<VPCandidateInfo>> ValueSites; SelectInstVisitor SIVisitor; - MemIntrinsicVisitor MIVisitor; std::string FuncName; GlobalVariable *FuncNameVar; @@ -604,23 +569,21 @@ public: std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers, bool CreateGlobalVar = false, BranchProbabilityInfo *BPI = nullptr, BlockFrequencyInfo *BFI = nullptr, bool IsCS = false) - : F(Func), IsCS(IsCS), ComdatMembers(ComdatMembers), - ValueSites(IPVK_Last + 1), SIVisitor(Func), MIVisitor(Func), - MST(F, BPI, BFI) { + : F(Func), IsCS(IsCS), ComdatMembers(ComdatMembers), VPC(Func), + ValueSites(IPVK_Last + 1), SIVisitor(Func), MST(F, BPI, BFI) { // This should be done before CFG hash computation. SIVisitor.countSelects(Func); - MIVisitor.countMemIntrinsics(Func); + ValueSites[IPVK_MemOPSize] = VPC.get(IPVK_MemOPSize); if (!IsCS) { NumOfPGOSelectInsts += SIVisitor.getNumOfSelectInsts(); - NumOfPGOMemIntrinsics += MIVisitor.getNumOfMemIntrinsics(); + NumOfPGOMemIntrinsics += ValueSites[IPVK_MemOPSize].size(); NumOfPGOBB += MST.BBInfos.size(); - ValueSites[IPVK_IndirectCallTarget] = findIndirectCalls(Func); + ValueSites[IPVK_IndirectCallTarget] = VPC.get(IPVK_IndirectCallTarget); } else { NumOfCSPGOSelectInsts += SIVisitor.getNumOfSelectInsts(); - NumOfCSPGOMemIntrinsics += MIVisitor.getNumOfMemIntrinsics(); + NumOfCSPGOMemIntrinsics += ValueSites[IPVK_MemOPSize].size(); NumOfCSPGOBB += MST.BBInfos.size(); } - ValueSites[IPVK_MemOPSize] = MIVisitor.findMemIntrinsics(Func); FuncName = getPGOFuncName(F); computeCFGHash(); @@ -647,7 +610,7 @@ public: // value of each BB in the CFG. 
The higher 32 bits record the number of edges. template <class Edge, class BBInfo> void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() { - std::vector<char> Indexes; + std::vector<uint8_t> Indexes; JamCRC JC; for (auto &BB : F) { const Instruction *TI = BB.getTerminator(); @@ -658,7 +621,7 @@ void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() { continue; uint32_t Index = BI->Index; for (int J = 0; J < 4; J++) - Indexes.push_back((char)(Index >> (J * 8))); + Indexes.push_back((uint8_t)(Index >> (J * 8))); } } JC.update(Indexes); @@ -874,28 +837,36 @@ static void instrumentOneFunc( if (DisableValueProfiling) return; - unsigned NumIndirectCalls = 0; - for (auto &I : FuncInfo.ValueSites[IPVK_IndirectCallTarget]) { - CallSite CS(I); - Value *Callee = CS.getCalledValue(); - LLVM_DEBUG(dbgs() << "Instrument one indirect call: CallSite Index = " - << NumIndirectCalls << "\n"); - IRBuilder<> Builder(I); - assert(Builder.GetInsertPoint() != I->getParent()->end() && - "Cannot get the Instrumentation point"); - Builder.CreateCall( - Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile), - {ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy), - Builder.getInt64(FuncInfo.FunctionHash), - Builder.CreatePtrToInt(Callee, Builder.getInt64Ty()), - Builder.getInt32(IPVK_IndirectCallTarget), - Builder.getInt32(NumIndirectCalls++)}); - } - NumOfPGOICall += NumIndirectCalls; + NumOfPGOICall += FuncInfo.ValueSites[IPVK_IndirectCallTarget].size(); + + // For each VP Kind, walk the VP candidates and instrument each one. + for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) { + unsigned SiteIndex = 0; + if (Kind == IPVK_MemOPSize && !PGOInstrMemOP) + continue; - // Now instrument memop intrinsic calls. - FuncInfo.MIVisitor.instrumentMemIntrinsics( - F, NumCounters, FuncInfo.FuncNameVar, FuncInfo.FunctionHash); + for (VPCandidateInfo Cand : FuncInfo.ValueSites[Kind]) { + LLVM_DEBUG(dbgs() << "Instrument one VP " << ValueProfKindDescr[Kind] + << " site: CallSite Index = " << SiteIndex << "\n"); + + IRBuilder<> Builder(Cand.InsertPt); + assert(Builder.GetInsertPoint() != Cand.InsertPt->getParent()->end() && + "Cannot get the Instrumentation point"); + + Value *ToProfile = nullptr; + if (Cand.V->getType()->isIntegerTy()) + ToProfile = Builder.CreateZExtOrTrunc(Cand.V, Builder.getInt64Ty()); + else if (Cand.V->getType()->isPointerTy()) + ToProfile = Builder.CreatePtrToInt(Cand.V, Builder.getInt64Ty()); + assert(ToProfile && "value profiling Value is of unexpected type"); + + Builder.CreateCall( + Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile), + {ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy), + Builder.getInt64(FuncInfo.FunctionHash), ToProfile, + Builder.getInt32(Kind), Builder.getInt32(SiteIndex++)}); + } + } // IPVK_First <= Kind <= IPVK_Last } namespace { @@ -984,9 +955,9 @@ class PGOUseFunc { public: PGOUseFunc(Function &Func, Module *Modu, std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers, - BranchProbabilityInfo *BPI = nullptr, - BlockFrequencyInfo *BFIin = nullptr, bool IsCS = false) - : F(Func), M(Modu), BFI(BFIin), + BranchProbabilityInfo *BPI, BlockFrequencyInfo *BFIin, + ProfileSummaryInfo *PSI, bool IsCS) + : F(Func), M(Modu), BFI(BFIin), PSI(PSI), FuncInfo(Func, ComdatMembers, false, BPI, BFIin, IsCS), FreqAttr(FFA_Normal), IsCS(IsCS) {} @@ -1041,6 +1012,7 @@ private: Function &F; Module *M; BlockFrequencyInfo *BFI; + ProfileSummaryInfo *PSI; // This member stores the shared information with class PGOGenFunc. 
FuncPGOInstrumentation<PGOUseEdge, UseBBInfo> FuncInfo; @@ -1078,15 +1050,9 @@ private: // FIXME: This function should be removed once the functionality in // the inliner is implemented. void markFunctionAttributes(uint64_t EntryCount, uint64_t MaxCount) { - if (ProgramMaxCount == 0) - return; - // Threshold of the hot functions. - const BranchProbability HotFunctionThreshold(1, 100); - // Threshold of the cold functions. - const BranchProbability ColdFunctionThreshold(2, 10000); - if (EntryCount >= HotFunctionThreshold.scale(ProgramMaxCount)) + if (PSI->isHotCount(EntryCount)) FreqAttr = FFA_Hot; - else if (MaxCount <= ColdFunctionThreshold.scale(ProgramMaxCount)) + else if (PSI->isColdCount(MaxCount)) FreqAttr = FFA_Cold; } }; @@ -1433,43 +1399,6 @@ void SelectInstVisitor::visitSelectInst(SelectInst &SI) { llvm_unreachable("Unknown visiting mode"); } -void MemIntrinsicVisitor::instrumentOneMemIntrinsic(MemIntrinsic &MI) { - Module *M = F.getParent(); - IRBuilder<> Builder(&MI); - Type *Int64Ty = Builder.getInt64Ty(); - Type *I8PtrTy = Builder.getInt8PtrTy(); - Value *Length = MI.getLength(); - assert(!isa<ConstantInt>(Length)); - Builder.CreateCall( - Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile), - {ConstantExpr::getBitCast(FuncNameVar, I8PtrTy), - Builder.getInt64(FuncHash), Builder.CreateZExtOrTrunc(Length, Int64Ty), - Builder.getInt32(IPVK_MemOPSize), Builder.getInt32(CurCtrId)}); - ++CurCtrId; -} - -void MemIntrinsicVisitor::visitMemIntrinsic(MemIntrinsic &MI) { - if (!PGOInstrMemOP) - return; - Value *Length = MI.getLength(); - // Not instrument constant length calls. - if (dyn_cast<ConstantInt>(Length)) - return; - - switch (Mode) { - case VM_counting: - NMemIs++; - return; - case VM_instrument: - instrumentOneMemIntrinsic(MI); - return; - case VM_annotate: - Candidates.push_back(&MI); - return; - } - llvm_unreachable("Unknown visiting mode"); -} - // Traverse all valuesites and annotate the instructions for all value kind. void PGOUseFunc::annotateValueSites() { if (DisableValueProfiling) @@ -1482,11 +1411,6 @@ void PGOUseFunc::annotateValueSites() { annotateValueSites(Kind); } -static const char *ValueProfKindDescr[] = { -#define VALUE_PROF_KIND(Enumerator, Value, Descr) Descr, -#include "llvm/ProfileData/InstrProfData.inc" -}; - // Annotate the instructions for a specific value kind. void PGOUseFunc::annotateValueSites(uint32_t Kind) { assert(Kind <= IPVK_Last); @@ -1505,11 +1429,11 @@ void PGOUseFunc::annotateValueSites(uint32_t Kind) { return; } - for (auto &I : ValueSites) { + for (VPCandidateInfo &I : ValueSites) { LLVM_DEBUG(dbgs() << "Read one value site profile (kind = " << Kind << "): Index = " << ValueSiteIndex << " out of " << NumValueSites << "\n"); - annotateValueSite(*M, *I, ProfileRecord, + annotateValueSite(*M, *I.AnnotatedInst, ProfileRecord, static_cast<InstrProfValueKind>(Kind), ValueSiteIndex, Kind == IPVK_MemOPSize ? MaxNumMemOPAnnotations : MaxNumAnnotations); @@ -1595,7 +1519,8 @@ PreservedAnalyses PGOInstrumentationGen::run(Module &M, static bool annotateAllFunctions( Module &M, StringRef ProfileFileName, StringRef ProfileRemappingFileName, function_ref<BranchProbabilityInfo *(Function &)> LookupBPI, - function_ref<BlockFrequencyInfo *(Function &)> LookupBFI, bool IsCS) { + function_ref<BlockFrequencyInfo *(Function &)> LookupBFI, + ProfileSummaryInfo *PSI, bool IsCS) { LLVM_DEBUG(dbgs() << "Read in profile counters: "); auto &Ctx = M.getContext(); // Read the counter array from file. 
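The computeCFGHash() hunk above only changes the scratch buffer from std::vector<char> to std::vector<uint8_t>, matching the byte-oriented JamCRC now declared in llvm/Support/CRC.h (see the include swap earlier in this diff); the encoding itself is untouched. A minimal sketch of that per-successor encoding, for illustration (the real code appends these bytes for every instrumented successor and then calls JC.update(Indexes)):

#include <cstdint>
#include <vector>

// Each successor's MST index is serialized as 4 little-endian bytes.
void appendIndexLE(std::vector<uint8_t> &Bytes, uint32_t Index) {
  for (int J = 0; J < 4; ++J)
    Bytes.push_back(static_cast<uint8_t>(Index >> (J * 8)));
}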
@@ -1626,6 +1551,13 @@ static bool annotateAllFunctions( return false; } + // Add the profile summary (read from the header of the indexed summary) here + // so that we can use it below when reading counters (which checks if the + // function should be marked with a cold or inlinehint attribute). + M.setProfileSummary(PGOReader->getSummary(IsCS).getMD(M.getContext()), + IsCS ? ProfileSummary::PSK_CSInstr + : ProfileSummary::PSK_Instr); + std::unordered_multimap<Comdat *, GlobalValue *> ComdatMembers; collectComdatMembers(M, ComdatMembers); std::vector<Function *> HotFunctions; @@ -1638,7 +1570,7 @@ static bool annotateAllFunctions( // Split indirectbr critical edges here before computing the MST rather than // later in getInstrBB() to avoid invalidating it. SplitIndirectBrCriticalEdges(F, BPI, BFI); - PGOUseFunc Func(F, &M, ComdatMembers, BPI, BFI, IsCS); + PGOUseFunc Func(F, &M, ComdatMembers, BPI, BFI, PSI, IsCS); bool AllZeros = false; if (!Func.readCounters(PGOReader.get(), AllZeros)) continue; @@ -1662,9 +1594,9 @@ static bool annotateAllFunctions( F.getName().equals(ViewBlockFreqFuncName))) { LoopInfo LI{DominatorTree(F)}; std::unique_ptr<BranchProbabilityInfo> NewBPI = - llvm::make_unique<BranchProbabilityInfo>(F, LI); + std::make_unique<BranchProbabilityInfo>(F, LI); std::unique_ptr<BlockFrequencyInfo> NewBFI = - llvm::make_unique<BlockFrequencyInfo>(F, *NewBPI, LI); + std::make_unique<BlockFrequencyInfo>(F, *NewBPI, LI); if (PGOViewCounts == PGOVCT_Graph) NewBFI->view(); else if (PGOViewCounts == PGOVCT_Text) { @@ -1686,9 +1618,6 @@ static bool annotateAllFunctions( } } } - M.setProfileSummary(PGOReader->getSummary(IsCS).getMD(M.getContext()), - IsCS ? ProfileSummary::PSK_CSInstr - : ProfileSummary::PSK_Instr); // Set function hotness attribute from the profile. 
// We have to apply these attributes at the end because their presence @@ -1730,8 +1659,10 @@ PreservedAnalyses PGOInstrumentationUse::run(Module &M, return &FAM.getResult<BlockFrequencyAnalysis>(F); }; + auto *PSI = &AM.getResult<ProfileSummaryAnalysis>(M); + if (!annotateAllFunctions(M, ProfileFileName, ProfileRemappingFileName, - LookupBPI, LookupBFI, IsCS)) + LookupBPI, LookupBFI, PSI, IsCS)) return PreservedAnalyses::all(); return PreservedAnalyses::none(); @@ -1748,7 +1679,8 @@ bool PGOInstrumentationUseLegacyPass::runOnModule(Module &M) { return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI(); }; - return annotateAllFunctions(M, ProfileFileName, "", LookupBPI, LookupBFI, + auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); + return annotateAllFunctions(M, ProfileFileName, "", LookupBPI, LookupBFI, PSI, IsCS); } @@ -1776,6 +1708,9 @@ void llvm::setProfMetadata(Module *M, Instruction *TI, : Weights) { dbgs() << W << " "; } dbgs() << "\n";); + + misexpect::verifyMisExpect(TI, Weights, TI->getContext()); + TI->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights)); if (EmitBranchProbability) { std::string BrCondStr = getBranchCondString(TI); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp index 188f95b4676b..d0afe2959b39 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp @@ -35,6 +35,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/PassRegistry.h" #include "llvm/PassSupport.h" @@ -138,7 +139,7 @@ public: OptimizationRemarkEmitter &ORE, DominatorTree *DT) : Func(Func), BFI(BFI), ORE(ORE), DT(DT), Changed(false) { ValueDataArray = - llvm::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2); + std::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2); // Get the MemOPSize range information from option MemOPSizeRange, getMemOPSizeRangeFromOption(MemOPSizeRange, PreciseRangeStart, PreciseRangeLast); @@ -374,8 +375,8 @@ bool MemOPSizeOpt::perform(MemIntrinsic *MI) { Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB); Instruction *NewInst = MI->clone(); // Fix the argument. 
- MemIntrinsic * MemI = dyn_cast<MemIntrinsic>(NewInst); - IntegerType *SizeType = dyn_cast<IntegerType>(MemI->getLength()->getType()); + auto *MemI = cast<MemIntrinsic>(NewInst); + auto *SizeType = dyn_cast<IntegerType>(MemI->getLength()->getType()); assert(SizeType && "Expected integer type size argument."); ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId); MemI->setLength(CaseSizeId); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp index 81d92e724c7d..71ecfd9a2642 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp @@ -65,10 +65,11 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index ca0cb4bdbe84..e6dc684c2e77 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -10,6 +10,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Instrumentation/SanitizerCoverage.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/EHPersonalities.h" @@ -31,6 +32,7 @@ #include "llvm/IR/Mangler.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -176,24 +178,21 @@ SanitizerCoverageOptions OverrideFromCL(SanitizerCoverageOptions Options) { return Options; } -class SanitizerCoverageModule : public ModulePass { +using DomTreeCallback = function_ref<const DominatorTree *(Function &F)>; +using PostDomTreeCallback = + function_ref<const PostDominatorTree *(Function &F)>; + +class ModuleSanitizerCoverage { public: - SanitizerCoverageModule( + ModuleSanitizerCoverage( const SanitizerCoverageOptions &Options = SanitizerCoverageOptions()) - : ModulePass(ID), Options(OverrideFromCL(Options)) { - initializeSanitizerCoverageModulePass(*PassRegistry::getPassRegistry()); - } - bool runOnModule(Module &M) override; - bool runOnFunction(Function &F); - static char ID; // Pass identification, replacement for typeid - StringRef getPassName() const override { return "SanitizerCoverageModule"; } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<PostDominatorTreeWrapperPass>(); - } + : Options(OverrideFromCL(Options)) {} + bool instrumentModule(Module &M, DomTreeCallback DTCallback, + PostDomTreeCallback PDTCallback); private: + void instrumentFunction(Function &F, DomTreeCallback DTCallback, + PostDomTreeCallback PDTCallback); void InjectCoverageForIndirectCalls(Function &F, ArrayRef<Instruction *> IndirCalls); void InjectTraceForCmp(Function &F, ArrayRef<Instruction *> CmpTraceTargets); @@ -252,10 +251,57 @@ private: SanitizerCoverageOptions 
Options; }; +class ModuleSanitizerCoverageLegacyPass : public ModulePass { +public: + ModuleSanitizerCoverageLegacyPass( + const SanitizerCoverageOptions &Options = SanitizerCoverageOptions()) + : ModulePass(ID), Options(Options) { + initializeModuleSanitizerCoverageLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + bool runOnModule(Module &M) override { + ModuleSanitizerCoverage ModuleSancov(Options); + auto DTCallback = [this](Function &F) -> const DominatorTree * { + return &this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree(); + }; + auto PDTCallback = [this](Function &F) -> const PostDominatorTree * { + return &this->getAnalysis<PostDominatorTreeWrapperPass>(F) + .getPostDomTree(); + }; + return ModuleSancov.instrumentModule(M, DTCallback, PDTCallback); + } + + static char ID; // Pass identification, replacement for typeid + StringRef getPassName() const override { return "ModuleSanitizerCoverage"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<PostDominatorTreeWrapperPass>(); + } + +private: + SanitizerCoverageOptions Options; +}; + } // namespace +PreservedAnalyses ModuleSanitizerCoveragePass::run(Module &M, + ModuleAnalysisManager &MAM) { + ModuleSanitizerCoverage ModuleSancov(Options); + auto &FAM = MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + auto DTCallback = [&FAM](Function &F) -> const DominatorTree * { + return &FAM.getResult<DominatorTreeAnalysis>(F); + }; + auto PDTCallback = [&FAM](Function &F) -> const PostDominatorTree * { + return &FAM.getResult<PostDominatorTreeAnalysis>(F); + }; + if (ModuleSancov.instrumentModule(M, DTCallback, PDTCallback)) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); +} + std::pair<Value *, Value *> -SanitizerCoverageModule::CreateSecStartEnd(Module &M, const char *Section, +ModuleSanitizerCoverage::CreateSecStartEnd(Module &M, const char *Section, Type *Ty) { GlobalVariable *SecStart = new GlobalVariable(M, Ty, false, GlobalVariable::ExternalLinkage, nullptr, @@ -278,7 +324,7 @@ SanitizerCoverageModule::CreateSecStartEnd(Module &M, const char *Section, return std::make_pair(IRB.CreatePointerCast(GEP, Ty), SecEndPtr); } -Function *SanitizerCoverageModule::CreateInitCallsForSections( +Function *ModuleSanitizerCoverage::CreateInitCallsForSections( Module &M, const char *CtorName, const char *InitFunctionName, Type *Ty, const char *Section) { auto SecStartEnd = CreateSecStartEnd(M, Section, Ty); @@ -310,7 +356,8 @@ Function *SanitizerCoverageModule::CreateInitCallsForSections( return CtorFunc; } -bool SanitizerCoverageModule::runOnModule(Module &M) { +bool ModuleSanitizerCoverage::instrumentModule( + Module &M, DomTreeCallback DTCallback, PostDomTreeCallback PDTCallback) { if (Options.CoverageType == SanitizerCoverageOptions::SCK_None) return false; C = &(M.getContext()); @@ -403,7 +450,7 @@ bool SanitizerCoverageModule::runOnModule(Module &M) { M.getOrInsertFunction(SanCovTracePCGuardName, VoidTy, Int32PtrTy); for (auto &F : M) - runOnFunction(F); + instrumentFunction(F, DTCallback, PDTCallback); Function *Ctor = nullptr; @@ -518,29 +565,30 @@ static bool IsInterestingCmp(ICmpInst *CMP, const DominatorTree *DT, return true; } -bool SanitizerCoverageModule::runOnFunction(Function &F) { +void ModuleSanitizerCoverage::instrumentFunction( + Function &F, DomTreeCallback DTCallback, PostDomTreeCallback PDTCallback) { if (F.empty()) - return false; + return; if (F.getName().find(".module_ctor") != 
std::string::npos) - return false; // Should not instrument sanitizer init functions. + return; // Should not instrument sanitizer init functions. if (F.getName().startswith("__sanitizer_")) - return false; // Don't instrument __sanitizer_* callbacks. + return; // Don't instrument __sanitizer_* callbacks. // Don't touch available_externally functions, their actual body is elewhere. if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) - return false; + return; // Don't instrument MSVC CRT configuration helpers. They may run before normal // initialization. if (F.getName() == "__local_stdio_printf_options" || F.getName() == "__local_stdio_scanf_options") - return false; + return; if (isa<UnreachableInst>(F.getEntryBlock().getTerminator())) - return false; + return; // Don't instrument functions using SEH for now. Splitting basic blocks like // we do for coverage breaks WinEHPrepare. // FIXME: Remove this when SEH no longer uses landingpad pattern matching. if (F.hasPersonalityFn() && isAsynchronousEHPersonality(classifyEHPersonality(F.getPersonalityFn()))) - return false; + return; if (Options.CoverageType >= SanitizerCoverageOptions::SCK_Edge) SplitAllCriticalEdges(F, CriticalEdgeSplittingOptions().setIgnoreUnreachableDests()); SmallVector<Instruction *, 8> IndirCalls; @@ -550,10 +598,8 @@ bool SanitizerCoverageModule::runOnFunction(Function &F) { SmallVector<BinaryOperator *, 8> DivTraceTargets; SmallVector<GetElementPtrInst *, 8> GepTraceTargets; - const DominatorTree *DT = - &getAnalysis<DominatorTreeWrapperPass>(F).getDomTree(); - const PostDominatorTree *PDT = - &getAnalysis<PostDominatorTreeWrapperPass>(F).getPostDomTree(); + const DominatorTree *DT = DTCallback(F); + const PostDominatorTree *PDT = PDTCallback(F); bool IsLeafFunc = true; for (auto &BB : F) { @@ -593,10 +639,9 @@ bool SanitizerCoverageModule::runOnFunction(Function &F) { InjectTraceForSwitch(F, SwitchTraceTargets); InjectTraceForDiv(F, DivTraceTargets); InjectTraceForGep(F, GepTraceTargets); - return true; } -GlobalVariable *SanitizerCoverageModule::CreateFunctionLocalArrayInSection( +GlobalVariable *ModuleSanitizerCoverage::CreateFunctionLocalArrayInSection( size_t NumElements, Function &F, Type *Ty, const char *Section) { ArrayType *ArrayTy = ArrayType::get(Ty, NumElements); auto Array = new GlobalVariable( @@ -608,8 +653,9 @@ GlobalVariable *SanitizerCoverageModule::CreateFunctionLocalArrayInSection( GetOrCreateFunctionComdat(F, TargetTriple, CurModuleUniqueId)) Array->setComdat(Comdat); Array->setSection(getSectionName(Section)); - Array->setAlignment(Ty->isPointerTy() ? DL->getPointerSize() - : Ty->getPrimitiveSizeInBits() / 8); + Array->setAlignment(Align(Ty->isPointerTy() + ? 
DL->getPointerSize() + : Ty->getPrimitiveSizeInBits() / 8)); GlobalsToAppendToUsed.push_back(Array); GlobalsToAppendToCompilerUsed.push_back(Array); MDNode *MD = MDNode::get(F.getContext(), ValueAsMetadata::get(&F)); @@ -619,7 +665,7 @@ GlobalVariable *SanitizerCoverageModule::CreateFunctionLocalArrayInSection( } GlobalVariable * -SanitizerCoverageModule::CreatePCArray(Function &F, +ModuleSanitizerCoverage::CreatePCArray(Function &F, ArrayRef<BasicBlock *> AllBlocks) { size_t N = AllBlocks.size(); assert(N); @@ -646,7 +692,7 @@ SanitizerCoverageModule::CreatePCArray(Function &F, return PCArray; } -void SanitizerCoverageModule::CreateFunctionLocalArrays( +void ModuleSanitizerCoverage::CreateFunctionLocalArrays( Function &F, ArrayRef<BasicBlock *> AllBlocks) { if (Options.TracePCGuard) FunctionGuardArray = CreateFunctionLocalArrayInSection( @@ -660,7 +706,7 @@ void SanitizerCoverageModule::CreateFunctionLocalArrays( FunctionPCsArray = CreatePCArray(F, AllBlocks); } -bool SanitizerCoverageModule::InjectCoverage(Function &F, +bool ModuleSanitizerCoverage::InjectCoverage(Function &F, ArrayRef<BasicBlock *> AllBlocks, bool IsLeafFunc) { if (AllBlocks.empty()) return false; @@ -677,7 +723,7 @@ bool SanitizerCoverageModule::InjectCoverage(Function &F, // The cache is used to speed up recording the caller-callee pairs. // The address of the caller is passed implicitly via caller PC. // CacheSize is encoded in the name of the run-time function. -void SanitizerCoverageModule::InjectCoverageForIndirectCalls( +void ModuleSanitizerCoverage::InjectCoverageForIndirectCalls( Function &F, ArrayRef<Instruction *> IndirCalls) { if (IndirCalls.empty()) return; @@ -696,7 +742,7 @@ void SanitizerCoverageModule::InjectCoverageForIndirectCalls( // __sanitizer_cov_trace_switch(CondValue, // {NumCases, ValueSizeInBits, Case0Value, Case1Value, Case2Value, ... }) -void SanitizerCoverageModule::InjectTraceForSwitch( +void ModuleSanitizerCoverage::InjectTraceForSwitch( Function &, ArrayRef<Instruction *> SwitchTraceTargets) { for (auto I : SwitchTraceTargets) { if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) { @@ -735,7 +781,7 @@ void SanitizerCoverageModule::InjectTraceForSwitch( } } -void SanitizerCoverageModule::InjectTraceForDiv( +void ModuleSanitizerCoverage::InjectTraceForDiv( Function &, ArrayRef<BinaryOperator *> DivTraceTargets) { for (auto BO : DivTraceTargets) { IRBuilder<> IRB(BO); @@ -753,7 +799,7 @@ void SanitizerCoverageModule::InjectTraceForDiv( } } -void SanitizerCoverageModule::InjectTraceForGep( +void ModuleSanitizerCoverage::InjectTraceForGep( Function &, ArrayRef<GetElementPtrInst *> GepTraceTargets) { for (auto GEP : GepTraceTargets) { IRBuilder<> IRB(GEP); @@ -764,7 +810,7 @@ void SanitizerCoverageModule::InjectTraceForGep( } } -void SanitizerCoverageModule::InjectTraceForCmp( +void ModuleSanitizerCoverage::InjectTraceForCmp( Function &, ArrayRef<Instruction *> CmpTraceTargets) { for (auto I : CmpTraceTargets) { if (ICmpInst *ICMP = dyn_cast<ICmpInst>(I)) { @@ -799,7 +845,7 @@ void SanitizerCoverageModule::InjectTraceForCmp( } } -void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, +void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB, size_t Idx, bool IsLeafFunc) { BasicBlock::iterator IP = BB.getFirstInsertionPt(); @@ -842,8 +888,10 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, } if (Options.StackDepth && IsEntryBB && !IsLeafFunc) { // Check stack depth. If it's the deepest so far, record it. 
- Function *GetFrameAddr = - Intrinsic::getDeclaration(F.getParent(), Intrinsic::frameaddress); + Module *M = F.getParent(); + Function *GetFrameAddr = Intrinsic::getDeclaration( + M, Intrinsic::frameaddress, + IRB.getInt8PtrTy(M->getDataLayout().getAllocaAddrSpace())); auto FrameAddrPtr = IRB.CreateCall(GetFrameAddr, {Constant::getNullValue(Int32Ty)}); auto FrameAddrInt = IRB.CreatePtrToInt(FrameAddrPtr, IntptrTy); @@ -858,7 +906,7 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, } std::string -SanitizerCoverageModule::getSectionName(const std::string &Section) const { +ModuleSanitizerCoverage::getSectionName(const std::string &Section) const { if (TargetTriple.isOSBinFormatCOFF()) { if (Section == SanCovCountersSectionName) return ".SCOV$CM"; @@ -872,32 +920,29 @@ SanitizerCoverageModule::getSectionName(const std::string &Section) const { } std::string -SanitizerCoverageModule::getSectionStart(const std::string &Section) const { +ModuleSanitizerCoverage::getSectionStart(const std::string &Section) const { if (TargetTriple.isOSBinFormatMachO()) return "\1section$start$__DATA$__" + Section; return "__start___" + Section; } std::string -SanitizerCoverageModule::getSectionEnd(const std::string &Section) const { +ModuleSanitizerCoverage::getSectionEnd(const std::string &Section) const { if (TargetTriple.isOSBinFormatMachO()) return "\1section$end$__DATA$__" + Section; return "__stop___" + Section; } - -char SanitizerCoverageModule::ID = 0; -INITIALIZE_PASS_BEGIN(SanitizerCoverageModule, "sancov", - "SanitizerCoverage: TODO." - "ModulePass", - false, false) +char ModuleSanitizerCoverageLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(ModuleSanitizerCoverageLegacyPass, "sancov", + "Pass for instrumenting coverage on functions", false, + false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) -INITIALIZE_PASS_END(SanitizerCoverageModule, "sancov", - "SanitizerCoverage: TODO." 
- "ModulePass", - false, false) -ModulePass *llvm::createSanitizerCoverageModulePass( +INITIALIZE_PASS_END(ModuleSanitizerCoverageLegacyPass, "sancov", + "Pass for instrumenting coverage on functions", false, + false) +ModulePass *llvm::createModuleSanitizerCoverageLegacyPassPass( const SanitizerCoverageOptions &Options) { - return new SanitizerCoverageModule(Options); + return new ModuleSanitizerCoverageLegacyPass(Options); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index 5be13fa745cb..9b7edad3444b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -26,7 +26,6 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" @@ -37,6 +36,7 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -45,6 +45,7 @@ #include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/EscapeEnumerator.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ModuleUtils.h" using namespace llvm; @@ -92,11 +93,10 @@ namespace { /// ensures the __tsan_init function is in the list of global constructors for /// the module. struct ThreadSanitizer { - ThreadSanitizer(Module &M); bool sanitizeFunction(Function &F, const TargetLibraryInfo &TLI); private: - void initializeCallbacks(Module &M); + void initialize(Module &M); bool instrumentLoadOrStore(Instruction *I, const DataLayout &DL); bool instrumentAtomic(Instruction *I, const DataLayout &DL); bool instrumentMemIntrinsic(Instruction *I); @@ -108,8 +108,6 @@ private: void InsertRuntimeIgnores(Function &F); Type *IntptrTy; - IntegerType *OrdTy; - // Callbacks to run-time library are computed in doInitialization. FunctionCallee TsanFuncEntry; FunctionCallee TsanFuncExit; FunctionCallee TsanIgnoreBegin; @@ -130,7 +128,6 @@ private: FunctionCallee TsanVptrUpdate; FunctionCallee TsanVptrLoad; FunctionCallee MemmoveFn, MemcpyFn, MemsetFn; - Function *TsanCtorFunction; }; struct ThreadSanitizerLegacyPass : FunctionPass { @@ -143,16 +140,32 @@ struct ThreadSanitizerLegacyPass : FunctionPass { private: Optional<ThreadSanitizer> TSan; }; + +void insertModuleCtor(Module &M) { + getOrCreateSanitizerCtorAndInitFunctions( + M, kTsanModuleCtorName, kTsanInitName, /*InitArgTypes=*/{}, + /*InitArgs=*/{}, + // This callback is invoked when the functions are created the first + // time. 
Hook them into the global ctors list in that case: + [&](Function *Ctor, FunctionCallee) { appendToGlobalCtors(M, Ctor, 0); }); +} + } // namespace PreservedAnalyses ThreadSanitizerPass::run(Function &F, FunctionAnalysisManager &FAM) { - ThreadSanitizer TSan(*F.getParent()); + ThreadSanitizer TSan; if (TSan.sanitizeFunction(F, FAM.getResult<TargetLibraryAnalysis>(F))) return PreservedAnalyses::none(); return PreservedAnalyses::all(); } +PreservedAnalyses ThreadSanitizerPass::run(Module &M, + ModuleAnalysisManager &MAM) { + insertModuleCtor(M); + return PreservedAnalyses::none(); +} + char ThreadSanitizerLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(ThreadSanitizerLegacyPass, "tsan", "ThreadSanitizer: detects data races.", false, false) @@ -169,12 +182,13 @@ void ThreadSanitizerLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const { } bool ThreadSanitizerLegacyPass::doInitialization(Module &M) { - TSan.emplace(M); + insertModuleCtor(M); + TSan.emplace(); return true; } bool ThreadSanitizerLegacyPass::runOnFunction(Function &F) { - auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); TSan->sanitizeFunction(F, TLI); return true; } @@ -183,7 +197,10 @@ FunctionPass *llvm::createThreadSanitizerLegacyPassPass() { return new ThreadSanitizerLegacyPass(); } -void ThreadSanitizer::initializeCallbacks(Module &M) { +void ThreadSanitizer::initialize(Module &M) { + const DataLayout &DL = M.getDataLayout(); + IntptrTy = DL.getIntPtrType(M.getContext()); + IRBuilder<> IRB(M.getContext()); AttributeList Attr; Attr = Attr.addAttribute(M.getContext(), AttributeList::FunctionIndex, @@ -197,7 +214,7 @@ void ThreadSanitizer::initializeCallbacks(Module &M) { IRB.getVoidTy()); TsanIgnoreEnd = M.getOrInsertFunction("__tsan_ignore_thread_end", Attr, IRB.getVoidTy()); - OrdTy = IRB.getInt32Ty(); + IntegerType *OrdTy = IRB.getInt32Ty(); for (size_t i = 0; i < kNumberOfAccessSizes; ++i) { const unsigned ByteSize = 1U << i; const unsigned BitSize = ByteSize * 8; @@ -280,20 +297,6 @@ void ThreadSanitizer::initializeCallbacks(Module &M) { IRB.getInt8PtrTy(), IRB.getInt32Ty(), IntptrTy); } -ThreadSanitizer::ThreadSanitizer(Module &M) { - const DataLayout &DL = M.getDataLayout(); - IntptrTy = DL.getIntPtrType(M.getContext()); - std::tie(TsanCtorFunction, std::ignore) = - getOrCreateSanitizerCtorAndInitFunctions( - M, kTsanModuleCtorName, kTsanInitName, /*InitArgTypes=*/{}, - /*InitArgs=*/{}, - // This callback is invoked when the functions are created the first - // time. Hook them into the global ctors list in that case: - [&](Function *Ctor, FunctionCallee) { - appendToGlobalCtors(M, Ctor, 0); - }); -} - static bool isVtableAccess(Instruction *I) { if (MDNode *Tag = I->getMetadata(LLVMContext::MD_tbaa)) return Tag->isTBAAVtableAccess(); @@ -436,9 +439,9 @@ bool ThreadSanitizer::sanitizeFunction(Function &F, const TargetLibraryInfo &TLI) { // This is required to prevent instrumenting call to __tsan_init from within // the module constructor. 
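+  // (The ctor is now recognized by name: its creation moved into
+  // insertModuleCtor, so there is no cached Function* left to compare
+  // against, and getOrCreateSanitizerCtorAndInitFunctions always creates the
+  // ctor under kTsanModuleCtorName.)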
- if (&F == TsanCtorFunction) + if (F.getName() == kTsanModuleCtorName) return false; - initializeCallbacks(*F.getParent()); + initialize(*F.getParent()); SmallVector<Instruction*, 8> AllLoadsAndStores; SmallVector<Instruction*, 8> LocalLoadsAndStores; SmallVector<Instruction*, 8> AtomicAccesses; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.cpp new file mode 100644 index 000000000000..604726d4f40f --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.cpp @@ -0,0 +1,78 @@ +//===- ValueProfileCollector.cpp - determine what to value profile --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// The implementation of the ValueProfileCollector via ValueProfileCollectorImpl +// +//===----------------------------------------------------------------------===// + +#include "ValueProfilePlugins.inc" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/InitializePasses.h" + +#include <cassert> + +using namespace llvm; + +namespace { + +/// A plugin-based class that takes an arbitrary number of Plugin types. +/// Each plugin type must satisfy the following API: +/// 1) the constructor must take a `Function &f`. Typically, the plugin would +/// scan the function looking for candidates. +/// 2) contain a member function with the following signature and name: +/// void run(std::vector<CandidateInfo> &Candidates); +/// such that the plugin would append its result into the vector parameter. +/// +/// Plugins are defined in ValueProfilePlugins.inc +template <class... Ts> class PluginChain; + +/// The type PluginChainFinal is the final chain of plugins that will be used by +/// ValueProfileCollectorImpl. +using PluginChainFinal = PluginChain<VP_PLUGIN_LIST>; + +template <> class PluginChain<> { +public: + PluginChain(Function &F) {} + void get(InstrProfValueKind K, std::vector<CandidateInfo> &Candidates) {} +}; + +template <class PluginT, class... Ts> +class PluginChain<PluginT, Ts...> : public PluginChain<Ts...> { + PluginT Plugin; + using Base = PluginChain<Ts...>; + +public: + PluginChain(Function &F) : PluginChain<Ts...>(F), Plugin(F) {} + + void get(InstrProfValueKind K, std::vector<CandidateInfo> &Candidates) { + if (K == PluginT::Kind) + Plugin.run(Candidates); + Base::get(K, Candidates); + } +}; + +} // end anonymous namespace + +/// ValueProfileCollectorImpl inherits the API of PluginChainFinal. 
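+/// (A minimal sketch of the dispatch, assuming hypothetical plugins A and B
+/// in VP_PLUGIN_LIST: PluginChain<A, B>::get(K, Cs) runs the A plugin's
+/// run(Cs) when K == A::Kind, then forwards to PluginChain<B>::get(K, Cs),
+/// bottoming out at the empty PluginChain<> specialization above.)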
+class ValueProfileCollector::ValueProfileCollectorImpl : public PluginChainFinal {
+public:
+  using PluginChainFinal::PluginChainFinal;
+};
+
+ValueProfileCollector::ValueProfileCollector(Function &F)
+    : PImpl(new ValueProfileCollectorImpl(F)) {}
+
+ValueProfileCollector::~ValueProfileCollector() = default;
+
+std::vector<CandidateInfo>
+ValueProfileCollector::get(InstrProfValueKind Kind) const {
+  std::vector<CandidateInfo> Result;
+  PImpl->get(Kind, Result);
+  return Result;
+}
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.h b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.h
new file mode 100644
index 000000000000..ff883c8d0c77
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.h
@@ -0,0 +1,79 @@
+//===- ValueProfileCollector.h - determine what to value profile ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a utility class, ValueProfileCollector, that is used to
+// determine what kind of llvm::Value's are worth value-profiling, at which
+// point in the program, and which instruction holds the Value Profile metadata.
+// Currently, the only users of this utility are the PGOInstrumentation[Gen|Use]
+// passes.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_PROFILE_GEN_ANALYSIS_H
+#define LLVM_ANALYSIS_PROFILE_GEN_ANALYSIS_H
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Pass.h"
+#include "llvm/ProfileData/InstrProf.h"
+
+namespace llvm {
+
+/// Utility analysis that determines what values are worth profiling.
+/// The actual logic is inside the ValueProfileCollectorImpl, whose job is to
+/// populate the Candidates vector.
+///
+/// Value profiling an expression means tracking the values that this
+/// expression takes at runtime and the frequency of each value.
+/// It is important to distinguish between two sets of value profiles for a
+/// particular expression:
+/// 1) The set of values at the point of evaluation.
+/// 2) The set of values at the point of use.
+/// In some cases, the two sets are identical, but it's not unusual for the two
+/// to differ.
+///
+/// To elaborate, consider this C code, and focus on the expression `nn`:
+///   void foo(int nn, bool b) {
+///     if (b) memcpy(x, y, nn);
+///   }
+/// The point of evaluation can be as early as the start of the function, and
+/// let's say the value profile for `nn` is:
+///   total=100; (value,freq) set = {(8,10), (32,50)}
+/// The point of use is right before we call memcpy, and since we execute the
+/// memcpy conditionally, the value profile of `nn` can be:
+///   total=15; (value,freq) set = {(8,10), (4,5)}
+///
+/// For this reason, a plugin is responsible for computing the insertion point
+/// for each value to be profiled. The `CandidateInfo` structure encapsulates
+/// all the information needed for each value profile site.
+class ValueProfileCollector {
+public:
+  struct CandidateInfo {
+    Value *V;                   // The value to profile.
+    Instruction *InsertPt;      // Insert the VP lib call before this instr.
+    Instruction *AnnotatedInst; // Where metadata is attached.
+  };
+
+  ValueProfileCollector(Function &Fn);
+  ValueProfileCollector(ValueProfileCollector &&) = delete;
+  ValueProfileCollector &operator=(ValueProfileCollector &&) = delete;
+
+  ValueProfileCollector(const ValueProfileCollector &) = delete;
+  ValueProfileCollector &operator=(const ValueProfileCollector &) = delete;
+  ~ValueProfileCollector();
+
+  /// Returns a list of value profiling candidates of the given kind.
+  std::vector<CandidateInfo> get(InstrProfValueKind Kind) const;
+
+private:
+  class ValueProfileCollectorImpl;
+  std::unique_ptr<ValueProfileCollectorImpl> PImpl;
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc
new file mode 100644
index 000000000000..4cc4c6c848c3
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc
@@ -0,0 +1,75 @@
+//=== ValueProfilePlugins.inc - set of plugins used by ValueProfileCollector =//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a set of plugin classes used in ValueProfileCollectorImpl.
+// Each plugin is responsible for collecting Value Profiling candidates for a
+// particular optimization.
+// Each plugin must satisfy the interface described in ValueProfileCollector.cpp.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ValueProfileCollector.h"
+#include "llvm/Analysis/IndirectCallVisitor.h"
+#include "llvm/IR/InstVisitor.h"
+
+using namespace llvm;
+using CandidateInfo = ValueProfileCollector::CandidateInfo;
+
+///--------------------------- MemIntrinsicPlugin ------------------------------
+class MemIntrinsicPlugin : public InstVisitor<MemIntrinsicPlugin> {
+  Function &F;
+  std::vector<CandidateInfo> *Candidates;
+
+public:
+  static constexpr InstrProfValueKind Kind = IPVK_MemOPSize;
+
+  MemIntrinsicPlugin(Function &Fn) : F(Fn), Candidates(nullptr) {}
+
+  void run(std::vector<CandidateInfo> &Cs) {
+    Candidates = &Cs;
+    visit(F);
+    Candidates = nullptr;
+  }
+  void visitMemIntrinsic(MemIntrinsic &MI) {
+    Value *Length = MI.getLength();
+    // Do not instrument constant-length calls.
+    if (isa<ConstantInt>(Length))
+      return;
+
+    Instruction *InsertPt = &MI;
+    Instruction *AnnotatedInst = &MI;
+    Candidates->emplace_back(CandidateInfo{Length, InsertPt, AnnotatedInst});
+  }
+};
+
+///------------------------ IndirectCallPromotionPlugin ------------------------
+class IndirectCallPromotionPlugin {
+  Function &F;
+
+public:
+  static constexpr InstrProfValueKind Kind = IPVK_IndirectCallTarget;
+
+  IndirectCallPromotionPlugin(Function &Fn) : F(Fn) {}
+
+  void run(std::vector<CandidateInfo> &Candidates) {
+    std::vector<Instruction *> Result = findIndirectCalls(F);
+    for (Instruction *I : Result) {
+      Value *Callee = CallSite(I).getCalledValue();
+      Instruction *InsertPt = I;
+      Instruction *AnnotatedInst = I;
+      Candidates.emplace_back(CandidateInfo{Callee, InsertPt, AnnotatedInst});
+    }
+  }
+};
+
+///----------------------- Registration of the plugins -------------------------
+/// For now, registering a plugin with the ValueProfileCollector is done by
+/// adding the plugin type to the VP_PLUGIN_LIST macro.
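+/// (Note: a hypothetical third plugin would simply be appended to the list
+/// below; per the interface described in ValueProfileCollector.cpp it only
+/// needs a static constexpr InstrProfValueKind Kind, a constructor taking
+/// Function &, and void run(std::vector<CandidateInfo> &).)
+///
+/// A pass then queries the candidates through the collector, e.g.:
+///   ValueProfileCollector VPC(F);
+///   std::vector<CandidateInfo> MemOpSites = VPC.get(IPVK_MemOPSize);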
+#define VP_PLUGIN_LIST \ + MemIntrinsicPlugin, \ + IndirectCallPromotionPlugin diff --git a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp index b341dd807508..7a01ec967fb5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp @@ -26,6 +26,7 @@ #include "ObjCARC.h" #include "llvm/ADT/STLExtras.h" #include "llvm/IR/Constants.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp index 36aa513ec554..ecf8220ae95d 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp @@ -34,6 +34,8 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Operator.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp index 04e98d8f5577..205d8ddf151d 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp @@ -28,6 +28,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/PassAnalysisSupport.h" #include "llvm/PassRegistry.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp index 6653ff0bb91a..b80c1675050b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp @@ -58,8 +58,10 @@ #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -506,6 +508,20 @@ namespace { ARCInstKind &Class); void OptimizeIndividualCalls(Function &F); + /// Optimize an individual call, optionally passing the + /// GetArgRCIdentityRoot if it has already been computed. + void OptimizeIndividualCallImpl( + Function &F, DenseMap<BasicBlock *, ColorVector> &BlockColors, + Instruction *Inst, ARCInstKind Class, const Value *Arg); + + /// Try to optimize an AutoreleaseRV with a RetainRV or ClaimRV. If the + /// optimization occurs, returns true to indicate that the caller should + /// assume the instructions are dead. 
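+  /// (Illustrative IR shape this targets, assuming the callee has been
+  /// inlined so the pair ends up in a single basic block:
+  ///   %v = call i8* @llvm.objc.autoreleaseReturnValue(i8* %x)
+  ///   %w = call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %v)
+  /// The pair cancels out and both calls are erased; for ClaimRV only the
+  /// retain half cancels, leaving a plain objc_release in its place.)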
+ bool OptimizeInlinedAutoreleaseRVCall( + Function &F, DenseMap<BasicBlock *, ColorVector> &BlockColors, + Instruction *Inst, const Value *&Arg, ARCInstKind Class, + Instruction *AutoreleaseRV, const Value *&AutoreleaseRVArg); + void CheckForCFGHazards(const BasicBlock *BB, DenseMap<const BasicBlock *, BBState> &BBStates, BBState &MyStates) const; @@ -589,8 +605,7 @@ void ObjCARCOpt::getAnalysisUsage(AnalysisUsage &AU) const { } /// Turn objc_retainAutoreleasedReturnValue into objc_retain if the operand is -/// not a return value. Or, if it can be paired with an -/// objc_autoreleaseReturnValue, delete the pair and return true. +/// not a return value. bool ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { // Check for the argument being from an immediately preceding call or invoke. @@ -616,39 +631,6 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { } } - // Track PHIs which are equivalent to our Arg. - SmallDenseSet<const Value*, 2> EquivalentArgs; - EquivalentArgs.insert(Arg); - - // Add PHIs that are equivalent to Arg to ArgUsers. - if (const PHINode *PN = dyn_cast<PHINode>(Arg)) { - SmallVector<const Value *, 2> ArgUsers; - getEquivalentPHIs(*PN, ArgUsers); - EquivalentArgs.insert(ArgUsers.begin(), ArgUsers.end()); - } - - // Check for being preceded by an objc_autoreleaseReturnValue on the same - // pointer. In this case, we can delete the pair. - BasicBlock::iterator I = RetainRV->getIterator(), - Begin = RetainRV->getParent()->begin(); - if (I != Begin) { - do - --I; - while (I != Begin && IsNoopInstruction(&*I)); - if (GetBasicARCInstKind(&*I) == ARCInstKind::AutoreleaseRV && - EquivalentArgs.count(GetArgRCIdentityRoot(&*I))) { - Changed = true; - ++NumPeeps; - - LLVM_DEBUG(dbgs() << "Erasing autoreleaseRV,retainRV pair: " << *I << "\n" - << "Erasing " << *RetainRV << "\n"); - - EraseInstruction(&*I); - EraseInstruction(RetainRV); - return true; - } - } - // Turn it to a plain objc_retain. Changed = true; ++NumPeeps; @@ -666,6 +648,62 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { return false; } +bool ObjCARCOpt::OptimizeInlinedAutoreleaseRVCall( + Function &F, DenseMap<BasicBlock *, ColorVector> &BlockColors, + Instruction *Inst, const Value *&Arg, ARCInstKind Class, + Instruction *AutoreleaseRV, const Value *&AutoreleaseRVArg) { + // Must be in the same basic block. + assert(Inst->getParent() == AutoreleaseRV->getParent()); + + // Must operate on the same root. + Arg = GetArgRCIdentityRoot(Inst); + AutoreleaseRVArg = GetArgRCIdentityRoot(AutoreleaseRV); + if (Arg != AutoreleaseRVArg) { + // If there isn't an exact match, check if we have equivalent PHIs. + const PHINode *PN = dyn_cast<PHINode>(Arg); + if (!PN) + return false; + + SmallVector<const Value *, 4> ArgUsers; + getEquivalentPHIs(*PN, ArgUsers); + if (llvm::find(ArgUsers, AutoreleaseRVArg) == ArgUsers.end()) + return false; + } + + // Okay, this is a match. Merge them. + ++NumPeeps; + LLVM_DEBUG(dbgs() << "Found inlined objc_autoreleaseReturnValue '" + << *AutoreleaseRV << "' paired with '" << *Inst << "'\n"); + + // Delete the RV pair, starting with the AutoreleaseRV. + AutoreleaseRV->replaceAllUsesWith( + cast<CallInst>(AutoreleaseRV)->getArgOperand(0)); + EraseInstruction(AutoreleaseRV); + if (Class == ARCInstKind::RetainRV) { + // AutoreleaseRV and RetainRV cancel out. Delete the RetainRV. 
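+    // (These intrinsics return their operand unchanged, so uses of the
+    // RetainRV result can be rewritten to the original argument directly.)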
+ Inst->replaceAllUsesWith(cast<CallInst>(Inst)->getArgOperand(0)); + EraseInstruction(Inst); + return true; + } + + // ClaimRV is a frontend peephole for RetainRV + Release. Since the + // AutoreleaseRV and RetainRV cancel out, replace the ClaimRV with a Release. + assert(Class == ARCInstKind::ClaimRV); + Value *CallArg = cast<CallInst>(Inst)->getArgOperand(0); + CallInst *Release = CallInst::Create( + EP.get(ARCRuntimeEntryPointKind::Release), CallArg, "", Inst); + assert(IsAlwaysTail(ARCInstKind::ClaimRV) && + "Expected ClaimRV to be safe to tail call"); + Release->setTailCall(); + Inst->replaceAllUsesWith(CallArg); + EraseInstruction(Inst); + + // Run the normal optimizations on Release. + OptimizeIndividualCallImpl(F, BlockColors, Release, ARCInstKind::Release, + Arg); + return true; +} + /// Turn objc_autoreleaseReturnValue into objc_autorelease if the result is not /// used as a return value. void ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, @@ -752,286 +790,370 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { isScopedEHPersonality(classifyEHPersonality(F.getPersonalityFn()))) BlockColors = colorEHFunclets(F); + // Store any delayed AutoreleaseRV intrinsics, so they can be easily paired + // with RetainRV and ClaimRV. + Instruction *DelayedAutoreleaseRV = nullptr; + const Value *DelayedAutoreleaseRVArg = nullptr; + auto setDelayedAutoreleaseRV = [&](Instruction *AutoreleaseRV) { + assert(!DelayedAutoreleaseRV || !AutoreleaseRV); + DelayedAutoreleaseRV = AutoreleaseRV; + DelayedAutoreleaseRVArg = nullptr; + }; + auto optimizeDelayedAutoreleaseRV = [&]() { + if (!DelayedAutoreleaseRV) + return; + OptimizeIndividualCallImpl(F, BlockColors, DelayedAutoreleaseRV, + ARCInstKind::AutoreleaseRV, + DelayedAutoreleaseRVArg); + setDelayedAutoreleaseRV(nullptr); + }; + auto shouldDelayAutoreleaseRV = [&](Instruction *NonARCInst) { + // Nothing to delay, but we may as well skip the logic below. + if (!DelayedAutoreleaseRV) + return true; + + // If we hit the end of the basic block we're not going to find an RV-pair. + // Stop delaying. + if (NonARCInst->isTerminator()) + return false; + + // Given the frontend rules for emitting AutoreleaseRV, RetainRV, and + // ClaimRV, it's probably safe to skip over even opaque function calls + // here since OptimizeInlinedAutoreleaseRVCall will confirm that they + // have the same RCIdentityRoot. However, what really matters is + // skipping instructions or intrinsics that the inliner could leave behind; + // be conservative for now and don't skip over opaque calls, which could + // potentially include other ARC calls. + auto *CB = dyn_cast<CallBase>(NonARCInst); + if (!CB) + return true; + return CB->getIntrinsicID() != Intrinsic::not_intrinsic; + }; + // Visit all objc_* calls in F. for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) { Instruction *Inst = &*I++; ARCInstKind Class = GetBasicARCInstKind(Inst); - LLVM_DEBUG(dbgs() << "Visiting: Class: " << Class << "; " << *Inst << "\n"); - - // Some of the ARC calls can be deleted if their arguments are global - // variables that are inert in ARC. - if (IsNoopOnGlobal(Class)) { - Value *Opnd = Inst->getOperand(0); - if (auto *GV = dyn_cast<GlobalVariable>(Opnd->stripPointerCasts())) - if (GV->hasAttribute("objc_arc_inert")) { - if (!Inst->getType()->isVoidTy()) - Inst->replaceAllUsesWith(Opnd); - Inst->eraseFromParent(); - continue; - } - } - + // Skip this loop if this instruction isn't itself an ARC intrinsic. 
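+    // (While an AutoreleaseRV is pending, shouldDelayAutoreleaseRV above keeps
+    // the delay alive across non-call instructions and intrinsic calls, e.g.
+    // inliner-inserted llvm.lifetime markers, but conservatively flushes it at
+    // opaque calls and at block terminators.)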
+ const Value *Arg = nullptr; switch (Class) { - default: break; - - // Delete no-op casts. These function calls have special semantics, but - // the semantics are entirely implemented via lowering in the front-end, - // so by the time they reach the optimizer, they are just no-op calls - // which return their argument. - // - // There are gray areas here, as the ability to cast reference-counted - // pointers to raw void* and back allows code to break ARC assumptions, - // however these are currently considered to be unimportant. - case ARCInstKind::NoopCast: - Changed = true; - ++NumNoops; - LLVM_DEBUG(dbgs() << "Erasing no-op cast: " << *Inst << "\n"); - EraseInstruction(Inst); - continue; - - // If the pointer-to-weak-pointer is null, it's undefined behavior. - case ARCInstKind::StoreWeak: - case ARCInstKind::LoadWeak: - case ARCInstKind::LoadWeakRetained: - case ARCInstKind::InitWeak: - case ARCInstKind::DestroyWeak: { - CallInst *CI = cast<CallInst>(Inst); - if (IsNullOrUndef(CI->getArgOperand(0))) { - Changed = true; - Type *Ty = CI->getArgOperand(0)->getType(); - new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), - Constant::getNullValue(Ty), - CI); - Value *NewValue = UndefValue::get(CI->getType()); - LLVM_DEBUG( - dbgs() << "A null pointer-to-weak-pointer is undefined behavior." - "\nOld = " - << *CI << "\nNew = " << *NewValue << "\n"); - CI->replaceAllUsesWith(NewValue); - CI->eraseFromParent(); - continue; - } - break; - } - case ARCInstKind::CopyWeak: - case ARCInstKind::MoveWeak: { - CallInst *CI = cast<CallInst>(Inst); - if (IsNullOrUndef(CI->getArgOperand(0)) || - IsNullOrUndef(CI->getArgOperand(1))) { - Changed = true; - Type *Ty = CI->getArgOperand(0)->getType(); - new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), - Constant::getNullValue(Ty), - CI); - - Value *NewValue = UndefValue::get(CI->getType()); - LLVM_DEBUG( - dbgs() << "A null pointer-to-weak-pointer is undefined behavior." - "\nOld = " - << *CI << "\nNew = " << *NewValue << "\n"); - - CI->replaceAllUsesWith(NewValue); - CI->eraseFromParent(); - continue; - } - break; - } - case ARCInstKind::RetainRV: - if (OptimizeRetainRVCall(F, Inst)) - continue; + default: + optimizeDelayedAutoreleaseRV(); break; + case ARCInstKind::CallOrUser: + case ARCInstKind::User: + case ARCInstKind::None: + // This is a non-ARC instruction. If we're delaying an AutoreleaseRV, + // check if it's safe to skip over it; if not, optimize the AutoreleaseRV + // now. + if (!shouldDelayAutoreleaseRV(Inst)) + optimizeDelayedAutoreleaseRV(); + continue; case ARCInstKind::AutoreleaseRV: - OptimizeAutoreleaseRVCall(F, Inst, Class); + optimizeDelayedAutoreleaseRV(); + setDelayedAutoreleaseRV(Inst); + continue; + case ARCInstKind::RetainRV: + case ARCInstKind::ClaimRV: + if (DelayedAutoreleaseRV) { + // We have a potential RV pair. Check if they cancel out. + if (OptimizeInlinedAutoreleaseRVCall(F, BlockColors, Inst, Arg, Class, + DelayedAutoreleaseRV, + DelayedAutoreleaseRVArg)) { + setDelayedAutoreleaseRV(nullptr); + continue; + } + optimizeDelayedAutoreleaseRV(); + } break; } - // objc_autorelease(x) -> objc_release(x) if x is otherwise unused. - if (IsAutorelease(Class) && Inst->use_empty()) { - CallInst *Call = cast<CallInst>(Inst); - const Value *Arg = Call->getArgOperand(0); - Arg = FindSingleUseIdentifiedObject(Arg); - if (Arg) { - Changed = true; - ++NumAutoreleases; - - // Create the declaration lazily. 
- LLVMContext &C = Inst->getContext(); - - Function *Decl = EP.get(ARCRuntimeEntryPointKind::Release); - CallInst *NewCall = CallInst::Create(Decl, Call->getArgOperand(0), "", - Call); - NewCall->setMetadata(MDKindCache.get(ARCMDKindID::ImpreciseRelease), - MDNode::get(C, None)); - - LLVM_DEBUG( - dbgs() << "Replacing autorelease{,RV}(x) with objc_release(x) " - "since x is otherwise unused.\nOld: " - << *Call << "\nNew: " << *NewCall << "\n"); - - EraseInstruction(Call); - Inst = NewCall; - Class = ARCInstKind::Release; + OptimizeIndividualCallImpl(F, BlockColors, Inst, Class, Arg); + } + + // Catch the final delayed AutoreleaseRV. + optimizeDelayedAutoreleaseRV(); +} + +void ObjCARCOpt::OptimizeIndividualCallImpl( + Function &F, DenseMap<BasicBlock *, ColorVector> &BlockColors, + Instruction *Inst, ARCInstKind Class, const Value *Arg) { + LLVM_DEBUG(dbgs() << "Visiting: Class: " << Class << "; " << *Inst << "\n"); + + // Some of the ARC calls can be deleted if their arguments are global + // variables that are inert in ARC. + if (IsNoopOnGlobal(Class)) { + Value *Opnd = Inst->getOperand(0); + if (auto *GV = dyn_cast<GlobalVariable>(Opnd->stripPointerCasts())) + if (GV->hasAttribute("objc_arc_inert")) { + if (!Inst->getType()->isVoidTy()) + Inst->replaceAllUsesWith(Opnd); + Inst->eraseFromParent(); + return; } - } + } - // For functions which can never be passed stack arguments, add - // a tail keyword. - if (IsAlwaysTail(Class) && !cast<CallInst>(Inst)->isNoTailCall()) { + switch (Class) { + default: + break; + + // Delete no-op casts. These function calls have special semantics, but + // the semantics are entirely implemented via lowering in the front-end, + // so by the time they reach the optimizer, they are just no-op calls + // which return their argument. + // + // There are gray areas here, as the ability to cast reference-counted + // pointers to raw void* and back allows code to break ARC assumptions, + // however these are currently considered to be unimportant. + case ARCInstKind::NoopCast: + Changed = true; + ++NumNoops; + LLVM_DEBUG(dbgs() << "Erasing no-op cast: " << *Inst << "\n"); + EraseInstruction(Inst); + return; + + // If the pointer-to-weak-pointer is null, it's undefined behavior. + case ARCInstKind::StoreWeak: + case ARCInstKind::LoadWeak: + case ARCInstKind::LoadWeakRetained: + case ARCInstKind::InitWeak: + case ARCInstKind::DestroyWeak: { + CallInst *CI = cast<CallInst>(Inst); + if (IsNullOrUndef(CI->getArgOperand(0))) { Changed = true; + Type *Ty = CI->getArgOperand(0)->getType(); + new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), + Constant::getNullValue(Ty), CI); + Value *NewValue = UndefValue::get(CI->getType()); LLVM_DEBUG( - dbgs() << "Adding tail keyword to function since it can never be " - "passed stack args: " - << *Inst << "\n"); - cast<CallInst>(Inst)->setTailCall(); + dbgs() << "A null pointer-to-weak-pointer is undefined behavior." + "\nOld = " + << *CI << "\nNew = " << *NewValue << "\n"); + CI->replaceAllUsesWith(NewValue); + CI->eraseFromParent(); + return; } - - // Ensure that functions that can never have a "tail" keyword due to the - // semantics of ARC truly do not do so. 
- if (IsNeverTail(Class)) { + break; + } + case ARCInstKind::CopyWeak: + case ARCInstKind::MoveWeak: { + CallInst *CI = cast<CallInst>(Inst); + if (IsNullOrUndef(CI->getArgOperand(0)) || + IsNullOrUndef(CI->getArgOperand(1))) { Changed = true; - LLVM_DEBUG(dbgs() << "Removing tail keyword from function: " << *Inst - << "\n"); - cast<CallInst>(Inst)->setTailCall(false); + Type *Ty = CI->getArgOperand(0)->getType(); + new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), + Constant::getNullValue(Ty), CI); + + Value *NewValue = UndefValue::get(CI->getType()); + LLVM_DEBUG( + dbgs() << "A null pointer-to-weak-pointer is undefined behavior." + "\nOld = " + << *CI << "\nNew = " << *NewValue << "\n"); + + CI->replaceAllUsesWith(NewValue); + CI->eraseFromParent(); + return; } + break; + } + case ARCInstKind::RetainRV: + if (OptimizeRetainRVCall(F, Inst)) + return; + break; + case ARCInstKind::AutoreleaseRV: + OptimizeAutoreleaseRVCall(F, Inst, Class); + break; + } - // Set nounwind as needed. - if (IsNoThrow(Class)) { + // objc_autorelease(x) -> objc_release(x) if x is otherwise unused. + if (IsAutorelease(Class) && Inst->use_empty()) { + CallInst *Call = cast<CallInst>(Inst); + const Value *Arg = Call->getArgOperand(0); + Arg = FindSingleUseIdentifiedObject(Arg); + if (Arg) { Changed = true; - LLVM_DEBUG(dbgs() << "Found no throw class. Setting nounwind on: " - << *Inst << "\n"); - cast<CallInst>(Inst)->setDoesNotThrow(); - } + ++NumAutoreleases; - if (!IsNoopOnNull(Class)) { - UsedInThisFunction |= 1 << unsigned(Class); - continue; - } + // Create the declaration lazily. + LLVMContext &C = Inst->getContext(); - const Value *Arg = GetArgRCIdentityRoot(Inst); + Function *Decl = EP.get(ARCRuntimeEntryPointKind::Release); + CallInst *NewCall = + CallInst::Create(Decl, Call->getArgOperand(0), "", Call); + NewCall->setMetadata(MDKindCache.get(ARCMDKindID::ImpreciseRelease), + MDNode::get(C, None)); - // ARC calls with null are no-ops. Delete them. - if (IsNullOrUndef(Arg)) { - Changed = true; - ++NumNoops; - LLVM_DEBUG(dbgs() << "ARC calls with null are no-ops. Erasing: " << *Inst - << "\n"); - EraseInstruction(Inst); - continue; + LLVM_DEBUG(dbgs() << "Replacing autorelease{,RV}(x) with objc_release(x) " + "since x is otherwise unused.\nOld: " + << *Call << "\nNew: " << *NewCall << "\n"); + + EraseInstruction(Call); + Inst = NewCall; + Class = ARCInstKind::Release; } + } + + // For functions which can never be passed stack arguments, add + // a tail keyword. + if (IsAlwaysTail(Class) && !cast<CallInst>(Inst)->isNoTailCall()) { + Changed = true; + LLVM_DEBUG( + dbgs() << "Adding tail keyword to function since it can never be " + "passed stack args: " + << *Inst << "\n"); + cast<CallInst>(Inst)->setTailCall(); + } + + // Ensure that functions that can never have a "tail" keyword due to the + // semantics of ARC truly do not do so. + if (IsNeverTail(Class)) { + Changed = true; + LLVM_DEBUG(dbgs() << "Removing tail keyword from function: " << *Inst + << "\n"); + cast<CallInst>(Inst)->setTailCall(false); + } + + // Set nounwind as needed. + if (IsNoThrow(Class)) { + Changed = true; + LLVM_DEBUG(dbgs() << "Found no throw class. Setting nounwind on: " << *Inst + << "\n"); + cast<CallInst>(Inst)->setDoesNotThrow(); + } - // Keep track of which of retain, release, autorelease, and retain_block - // are actually present in this function. + // Note: This catches instructions unrelated to ARC. 
+ if (!IsNoopOnNull(Class)) { UsedInThisFunction |= 1 << unsigned(Class); + return; + } + + // If we haven't already looked up the root, look it up now. + if (!Arg) + Arg = GetArgRCIdentityRoot(Inst); + + // ARC calls with null are no-ops. Delete them. + if (IsNullOrUndef(Arg)) { + Changed = true; + ++NumNoops; + LLVM_DEBUG(dbgs() << "ARC calls with null are no-ops. Erasing: " << *Inst + << "\n"); + EraseInstruction(Inst); + return; + } + + // Keep track of which of retain, release, autorelease, and retain_block + // are actually present in this function. + UsedInThisFunction |= 1 << unsigned(Class); + + // If Arg is a PHI, and one or more incoming values to the + // PHI are null, and the call is control-equivalent to the PHI, and there + // are no relevant side effects between the PHI and the call, and the call + // is not a release that doesn't have the clang.imprecise_release tag, the + // call could be pushed up to just those paths with non-null incoming + // values. For now, don't bother splitting critical edges for this. + if (Class == ARCInstKind::Release && + !Inst->getMetadata(MDKindCache.get(ARCMDKindID::ImpreciseRelease))) + return; + + SmallVector<std::pair<Instruction *, const Value *>, 4> Worklist; + Worklist.push_back(std::make_pair(Inst, Arg)); + do { + std::pair<Instruction *, const Value *> Pair = Worklist.pop_back_val(); + Inst = Pair.first; + Arg = Pair.second; - // If Arg is a PHI, and one or more incoming values to the - // PHI are null, and the call is control-equivalent to the PHI, and there - // are no relevant side effects between the PHI and the call, and the call - // is not a release that doesn't have the clang.imprecise_release tag, the - // call could be pushed up to just those paths with non-null incoming - // values. For now, don't bother splitting critical edges for this. - if (Class == ARCInstKind::Release && - !Inst->getMetadata(MDKindCache.get(ARCMDKindID::ImpreciseRelease))) + const PHINode *PN = dyn_cast<PHINode>(Arg); + if (!PN) continue; - SmallVector<std::pair<Instruction *, const Value *>, 4> Worklist; - Worklist.push_back(std::make_pair(Inst, Arg)); - do { - std::pair<Instruction *, const Value *> Pair = Worklist.pop_back_val(); - Inst = Pair.first; - Arg = Pair.second; - - const PHINode *PN = dyn_cast<PHINode>(Arg); - if (!PN) continue; - - // Determine if the PHI has any null operands, or any incoming - // critical edges. - bool HasNull = false; - bool HasCriticalEdges = false; - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { - Value *Incoming = - GetRCIdentityRoot(PN->getIncomingValue(i)); - if (IsNullOrUndef(Incoming)) - HasNull = true; - else if (PN->getIncomingBlock(i)->getTerminator()->getNumSuccessors() != - 1) { - HasCriticalEdges = true; - break; - } + // Determine if the PHI has any null operands, or any incoming + // critical edges. + bool HasNull = false; + bool HasCriticalEdges = false; + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + Value *Incoming = GetRCIdentityRoot(PN->getIncomingValue(i)); + if (IsNullOrUndef(Incoming)) + HasNull = true; + else if (PN->getIncomingBlock(i)->getTerminator()->getNumSuccessors() != + 1) { + HasCriticalEdges = true; + break; } - // If we have null operands and no critical edges, optimize. - if (!HasCriticalEdges && HasNull) { - SmallPtrSet<Instruction *, 4> DependingInstructions; - SmallPtrSet<const BasicBlock *, 4> Visited; - - // Check that there is nothing that cares about the reference - // count between the call and the phi. 
- switch (Class) { - case ARCInstKind::Retain: - case ARCInstKind::RetainBlock: - // These can always be moved up. - break; - case ARCInstKind::Release: - // These can't be moved across things that care about the retain - // count. - FindDependencies(NeedsPositiveRetainCount, Arg, - Inst->getParent(), Inst, - DependingInstructions, Visited, PA); - break; - case ARCInstKind::Autorelease: - // These can't be moved across autorelease pool scope boundaries. - FindDependencies(AutoreleasePoolBoundary, Arg, - Inst->getParent(), Inst, - DependingInstructions, Visited, PA); - break; - case ARCInstKind::ClaimRV: - case ARCInstKind::RetainRV: - case ARCInstKind::AutoreleaseRV: - // Don't move these; the RV optimization depends on the autoreleaseRV - // being tail called, and the retainRV being immediately after a call - // (which might still happen if we get lucky with codegen layout, but - // it's not worth taking the chance). - continue; - default: - llvm_unreachable("Invalid dependence flavor"); - } + } + // If we have null operands and no critical edges, optimize. + if (HasCriticalEdges) + continue; + if (!HasNull) + continue; - if (DependingInstructions.size() == 1 && - *DependingInstructions.begin() == PN) { - Changed = true; - ++NumPartialNoops; - // Clone the call into each predecessor that has a non-null value. - CallInst *CInst = cast<CallInst>(Inst); - Type *ParamTy = CInst->getArgOperand(0)->getType(); - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { - Value *Incoming = - GetRCIdentityRoot(PN->getIncomingValue(i)); - if (!IsNullOrUndef(Incoming)) { - Value *Op = PN->getIncomingValue(i); - Instruction *InsertPos = &PN->getIncomingBlock(i)->back(); - CallInst *Clone = cast<CallInst>(CloneCallInstForBB( - *CInst, *InsertPos->getParent(), BlockColors)); - if (Op->getType() != ParamTy) - Op = new BitCastInst(Op, ParamTy, "", InsertPos); - Clone->setArgOperand(0, Op); - Clone->insertBefore(InsertPos); - - LLVM_DEBUG(dbgs() << "Cloning " << *CInst - << "\n" - "And inserting clone at " - << *InsertPos << "\n"); - Worklist.push_back(std::make_pair(Clone, Incoming)); - } - } - // Erase the original call. - LLVM_DEBUG(dbgs() << "Erasing: " << *CInst << "\n"); - EraseInstruction(CInst); - continue; - } - } - } while (!Worklist.empty()); - } + SmallPtrSet<Instruction *, 4> DependingInstructions; + SmallPtrSet<const BasicBlock *, 4> Visited; + + // Check that there is nothing that cares about the reference + // count between the call and the phi. + switch (Class) { + case ARCInstKind::Retain: + case ARCInstKind::RetainBlock: + // These can always be moved up. + break; + case ARCInstKind::Release: + // These can't be moved across things that care about the retain + // count. + FindDependencies(NeedsPositiveRetainCount, Arg, Inst->getParent(), Inst, + DependingInstructions, Visited, PA); + break; + case ARCInstKind::Autorelease: + // These can't be moved across autorelease pool scope boundaries. + FindDependencies(AutoreleasePoolBoundary, Arg, Inst->getParent(), Inst, + DependingInstructions, Visited, PA); + break; + case ARCInstKind::ClaimRV: + case ARCInstKind::RetainRV: + case ARCInstKind::AutoreleaseRV: + // Don't move these; the RV optimization depends on the autoreleaseRV + // being tail called, and the retainRV being immediately after a call + // (which might still happen if we get lucky with codegen layout, but + // it's not worth taking the chance). 
+ continue; + default: + llvm_unreachable("Invalid dependence flavor"); + } + + if (DependingInstructions.size() != 1) + continue; + if (*DependingInstructions.begin() != PN) + continue; + + Changed = true; + ++NumPartialNoops; + // Clone the call into each predecessor that has a non-null value. + CallInst *CInst = cast<CallInst>(Inst); + Type *ParamTy = CInst->getArgOperand(0)->getType(); + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + Value *Incoming = GetRCIdentityRoot(PN->getIncomingValue(i)); + if (IsNullOrUndef(Incoming)) + continue; + Value *Op = PN->getIncomingValue(i); + Instruction *InsertPos = &PN->getIncomingBlock(i)->back(); + CallInst *Clone = cast<CallInst>( + CloneCallInstForBB(*CInst, *InsertPos->getParent(), BlockColors)); + if (Op->getType() != ParamTy) + Op = new BitCastInst(Op, ParamTy, "", InsertPos); + Clone->setArgOperand(0, Op); + Clone->insertBefore(InsertPos); + + LLVM_DEBUG(dbgs() << "Cloning " << *CInst << "\n" + "And inserting clone at " + << *InsertPos << "\n"); + Worklist.push_back(std::make_pair(Clone, Incoming)); + } + // Erase the original call. + LLVM_DEBUG(dbgs() << "Erasing: " << *CInst << "\n"); + EraseInstruction(CInst); + } while (!Worklist.empty()); } /// If we have a top down pointer in the S_Use state, make sure that there are diff --git a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp index b768f7973b87..99a2055aba94 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp @@ -13,6 +13,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/raw_ostream.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/PtrState.cpp b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/PtrState.cpp index 3243481dee0d..26dd416d6184 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/PtrState.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/PtrState.cpp @@ -275,6 +275,10 @@ void BottomUpPtrState::HandlePotentialUse(BasicBlock *BB, Instruction *Inst, } else { InsertAfter = std::next(Inst->getIterator()); } + + if (InsertAfter != BB->end()) + InsertAfter = skipDebugIntrinsics(InsertAfter); + InsertReverseInsertPt(&*InsertAfter); }; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ADCE.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ADCE.cpp index 7f7460c5746a..cc3d3bf7cdbf 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ADCE.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ADCE.cpp @@ -42,6 +42,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/Casting.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp index de9a62e88c27..06deaf3c4f9a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp @@ -15,6 +15,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/InitializePasses.h" 
#define AA_NAME "alignment-from-assumptions" #define DEBUG_TYPE AA_NAME #include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h" @@ -93,9 +94,7 @@ static unsigned getNewAlignmentDiff(const SCEV *DiffSCEV, const SCEV *AlignSCEV, ScalarEvolution *SE) { // DiffUnits = Diff % int64_t(Alignment) - const SCEV *DiffAlignDiv = SE->getUDivExpr(DiffSCEV, AlignSCEV); - const SCEV *DiffAlign = SE->getMulExpr(DiffAlignDiv, AlignSCEV); - const SCEV *DiffUnitsSCEV = SE->getMinusSCEV(DiffAlign, DiffSCEV); + const SCEV *DiffUnitsSCEV = SE->getURemExpr(DiffSCEV, AlignSCEV); LLVM_DEBUG(dbgs() << "\talignment relative to " << *AlignSCEV << " is " << *DiffUnitsSCEV << " (diff: " << *DiffSCEV << ")\n"); @@ -323,7 +322,7 @@ bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall) { LI->getPointerOperand(), SE); if (NewAlignment > LI->getAlignment()) { - LI->setAlignment(NewAlignment); + LI->setAlignment(MaybeAlign(NewAlignment)); ++NumLoadAlignChanged; } } else if (StoreInst *SI = dyn_cast<StoreInst>(J)) { @@ -331,7 +330,7 @@ bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall) { SI->getPointerOperand(), SE); if (NewAlignment > SI->getAlignment()) { - SI->setAlignment(NewAlignment); + SI->setAlignment(MaybeAlign(NewAlignment)); ++NumStoreAlignChanged; } } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(J)) { diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/BDCE.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/BDCE.cpp index 9bd387c33e80..0fa38fa80b17 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/BDCE.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/BDCE.cpp @@ -19,13 +19,14 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/DemandedBits.h" #include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; #define DEBUG_TYPE "bdce" @@ -101,7 +102,7 @@ static bool bitTrackingDCE(Function &F, DemandedBits &DB) { (I.getType()->isIntOrIntVectorTy() && DB.getDemandedBits(&I).isNullValue() && wouldInstructionBeTriviallyDead(&I))) { - salvageDebugInfo(I); + salvageDebugInfoOrMarkUndef(I); Worklist.push_back(&I); I.dropAllReferences(); Changed = true; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp index 3519b000a33f..e34c011b1c87 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp @@ -59,13 +59,15 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; using namespace PatternMatch; @@ -562,7 +564,7 @@ struct CallSiteSplittingLegacyPass : public FunctionPass { if (skipFunction(F)) return false; - auto &TLI = 
getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); return doCallSiteSplitting(F, TLI, TTI, DT); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp index 98243a23f1ef..5bfece010bec 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -14,7 +14,7 @@ // cost. If the constant can be folded into the instruction (the cost is // TCC_Free) or the cost is just a simple operation (TCC_BASIC), then we don't // consider it expensive and leave it alone. This is the default behavior and -// the default implementation of getIntImmCost will always return TCC_Free. +// the default implementation of getIntImmCostInst will always return TCC_Free. // // If the cost is more than TCC_BASIC, then the integer constant can't be folded // into the instruction and it might be beneficial to hoist the constant. @@ -43,7 +43,6 @@ #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfoMetadata.h" @@ -54,6 +53,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/BlockFrequency.h" #include "llvm/Support/Casting.h" @@ -61,6 +61,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SizeOpts.h" #include <algorithm> #include <cassert> @@ -204,7 +205,7 @@ Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst, /// set found in \p BBs. static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, BasicBlock *Entry, - SmallPtrSet<BasicBlock *, 8> &BBs) { + SetVector<BasicBlock *> &BBs) { assert(!BBs.count(Entry) && "Assume Entry is not in BBs"); // Nodes on the current path to the root. SmallPtrSet<BasicBlock *, 8> Path; @@ -257,7 +258,7 @@ static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, // Visit Orders in bottom-up order. using InsertPtsCostPair = - std::pair<SmallPtrSet<BasicBlock *, 16>, BlockFrequency>; + std::pair<SetVector<BasicBlock *>, BlockFrequency>; // InsertPtsMap is a map from a BB to the best insertion points for the // subtree of BB (subtree not including the BB itself). @@ -266,7 +267,7 @@ static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, for (auto RIt = Orders.rbegin(); RIt != Orders.rend(); RIt++) { BasicBlock *Node = *RIt; bool NodeInBBs = BBs.count(Node); - SmallPtrSet<BasicBlock *, 16> &InsertPts = InsertPtsMap[Node].first; + auto &InsertPts = InsertPtsMap[Node].first; BlockFrequency &InsertPtsFreq = InsertPtsMap[Node].second; // Return the optimal insert points in BBs. @@ -283,7 +284,7 @@ static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, BasicBlock *Parent = DT.getNode(Node)->getIDom()->getBlock(); // Initially, ParentInsertPts is empty and ParentPtsFreq is 0. 
Every child // will update its parent's ParentInsertPts and ParentPtsFreq. - SmallPtrSet<BasicBlock *, 16> &ParentInsertPts = InsertPtsMap[Parent].first; + auto &ParentInsertPts = InsertPtsMap[Parent].first; BlockFrequency &ParentPtsFreq = InsertPtsMap[Parent].second; // Choose to insert in Node or in subtree of Node. // Don't hoist to EHPad because we may not find a proper place to insert @@ -305,12 +306,12 @@ static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, } /// Find an insertion point that dominates all uses. -SmallPtrSet<Instruction *, 8> ConstantHoistingPass::findConstantInsertionPoint( +SetVector<Instruction *> ConstantHoistingPass::findConstantInsertionPoint( const ConstantInfo &ConstInfo) const { assert(!ConstInfo.RebasedConstants.empty() && "Invalid constant info entry."); // Collect all basic blocks. - SmallPtrSet<BasicBlock *, 8> BBs; - SmallPtrSet<Instruction *, 8> InsertPts; + SetVector<BasicBlock *> BBs; + SetVector<Instruction *> InsertPts; for (auto const &RCI : ConstInfo.RebasedConstants) for (auto const &U : RCI.Uses) BBs.insert(findMatInsertPt(U.Inst, U.OpndIdx)->getParent()); @@ -333,15 +334,13 @@ SmallPtrSet<Instruction *, 8> ConstantHoistingPass::findConstantInsertionPoint( while (BBs.size() >= 2) { BasicBlock *BB, *BB1, *BB2; - BB1 = *BBs.begin(); - BB2 = *std::next(BBs.begin()); + BB1 = BBs.pop_back_val(); + BB2 = BBs.pop_back_val(); BB = DT->findNearestCommonDominator(BB1, BB2); if (BB == Entry) { InsertPts.insert(&Entry->front()); return InsertPts; } - BBs.erase(BB1); - BBs.erase(BB2); BBs.insert(BB); } assert((BBs.size() == 1) && "Expected only one element."); @@ -363,11 +362,11 @@ void ConstantHoistingPass::collectConstantCandidates( // Ask the target about the cost of materializing the constant for the given // instruction and operand index. if (auto IntrInst = dyn_cast<IntrinsicInst>(Inst)) - Cost = TTI->getIntImmCost(IntrInst->getIntrinsicID(), Idx, - ConstInt->getValue(), ConstInt->getType()); + Cost = TTI->getIntImmCostIntrin(IntrInst->getIntrinsicID(), Idx, + ConstInt->getValue(), ConstInt->getType()); else - Cost = TTI->getIntImmCost(Inst->getOpcode(), Idx, ConstInt->getValue(), - ConstInt->getType()); + Cost = TTI->getIntImmCostInst(Inst->getOpcode(), Idx, ConstInt->getValue(), + ConstInt->getType()); // Ignore cheap integer constants. if (Cost > TargetTransformInfo::TCC_Basic) { @@ -403,7 +402,7 @@ void ConstantHoistingPass::collectConstantCandidates( return; // Get offset from the base GV. - PointerType *GVPtrTy = dyn_cast<PointerType>(BaseGV->getType()); + PointerType *GVPtrTy = cast<PointerType>(BaseGV->getType()); IntegerType *PtrIntTy = DL->getIntPtrType(*Ctx, GVPtrTy->getAddressSpace()); APInt Offset(DL->getTypeSizeInBits(PtrIntTy), /*val*/0, /*isSigned*/true); auto *GEPO = cast<GEPOperator>(ConstExpr); @@ -417,7 +416,7 @@ void ConstantHoistingPass::collectConstantCandidates( // usually lowered to a load from constant pool. Such operation is unlikely // to be cheaper than compute it by <Base + Offset>, which can be lowered to // an ADD instruction or folded into Load/Store instruction. - int Cost = TTI->getIntImmCost(Instruction::Add, 1, Offset, PtrIntTy); + int Cost = TTI->getIntImmCostInst(Instruction::Add, 1, Offset, PtrIntTy); ConstCandVecType &ExprCandVec = ConstGEPCandMap[BaseGV]; ConstCandMapType::iterator Itr; bool Inserted; @@ -488,9 +487,10 @@ void ConstantHoistingPass::collectConstantCandidates( // Scan all operands. 
for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; ++Idx) { // The cost of materializing the constants (defined in - // `TargetTransformInfo::getIntImmCost`) for instructions which only take - // constant variables is lower than `TargetTransformInfo::TCC_Basic`. So - // it's safe for us to collect constant candidates from all IntrinsicInsts. + // `TargetTransformInfo::getIntImmCostInst`) for instructions which only + // take constant variables is lower than `TargetTransformInfo::TCC_Basic`. + // So it's safe for us to collect constant candidates from all + // IntrinsicInsts. if (canReplaceOperandWithVariable(Inst, Idx) || isa<IntrinsicInst>(Inst)) { collectConstantCandidates(ConstCandMap, Inst, Idx); } @@ -501,9 +501,13 @@ void ConstantHoistingPass::collectConstantCandidates( /// into an instruction itself. void ConstantHoistingPass::collectConstantCandidates(Function &Fn) { ConstCandMapType ConstCandMap; - for (BasicBlock &BB : Fn) + for (BasicBlock &BB : Fn) { + // Ignore unreachable basic blocks. + if (!DT->isReachableFromEntry(&BB)) + continue; for (Instruction &Inst : BB) collectConstantCandidates(ConstCandMap, &Inst); + } } // This helper function is necessary to deal with values that have different @@ -554,7 +558,8 @@ ConstantHoistingPass::maximizeConstantsInRange(ConstCandVecType::iterator S, unsigned NumUses = 0; bool OptForSize = Entry->getParent()->hasOptSize() || - llvm::shouldOptimizeForSize(Entry->getParent(), PSI, BFI); + llvm::shouldOptimizeForSize(Entry->getParent(), PSI, BFI, + PGSOQueryType::IRPass); if (!OptForSize || std::distance(S,E) > 100) { for (auto ConstCand = S; ConstCand != E; ++ConstCand) { NumUses += ConstCand->Uses.size(); @@ -577,7 +582,7 @@ ConstantHoistingPass::maximizeConstantsInRange(ConstCandVecType::iterator S, for (auto User : ConstCand->Uses) { unsigned Opcode = User.Inst->getOpcode(); unsigned OpndIdx = User.OpndIdx; - Cost += TTI->getIntImmCost(Opcode, OpndIdx, Value, Ty); + Cost += TTI->getIntImmCostInst(Opcode, OpndIdx, Value, Ty); LLVM_DEBUG(dbgs() << "Cost: " << Cost << "\n"); for (auto C2 = S; C2 != E; ++C2) { @@ -830,7 +835,7 @@ bool ConstantHoistingPass::emitBaseConstants(GlobalVariable *BaseGV) { SmallVectorImpl<consthoist::ConstantInfo> &ConstInfoVec = BaseGV ? ConstGEPInfoMap[BaseGV] : ConstIntInfoVec; for (auto const &ConstInfo : ConstInfoVec) { - SmallPtrSet<Instruction *, 8> IPSet = findConstantInsertionPoint(ConstInfo); + SetVector<Instruction *> IPSet = findConstantInsertionPoint(ConstInfo); // We can have an empty set if the function contains unreachable blocks. 
if (IPSet.empty()) continue; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstantProp.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstantProp.cpp index 770321c740a0..73bf1d521b1d 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstantProp.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstantProp.cpp @@ -25,6 +25,7 @@ #include "llvm/IR/Constant.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/DebugCounter.h" #include "llvm/Transforms/Scalar.h" @@ -82,7 +83,7 @@ bool ConstantPropagation::runOnFunction(Function &F) { bool Changed = false; const DataLayout &DL = F.getParent()->getDataLayout(); TargetLibraryInfo *TLI = - &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); while (!WorkList.empty()) { SmallVector<Instruction*, 16> NewWorkListVec; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 89497177524f..3435bc7f5eaa 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -37,6 +37,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -62,6 +63,23 @@ STATISTIC(NumSDivs, "Number of sdiv converted to udiv"); STATISTIC(NumUDivs, "Number of udivs whose width was decreased"); STATISTIC(NumAShrs, "Number of ashr converted to lshr"); STATISTIC(NumSRems, "Number of srem converted to urem"); +STATISTIC(NumSExt, "Number of sext converted to zext"); +STATISTIC(NumAnd, "Number of ands removed"); +STATISTIC(NumNW, "Number of no-wrap deductions"); +STATISTIC(NumNSW, "Number of no-signed-wrap deductions"); +STATISTIC(NumNUW, "Number of no-unsigned-wrap deductions"); +STATISTIC(NumAddNW, "Number of no-wrap deductions for add"); +STATISTIC(NumAddNSW, "Number of no-signed-wrap deductions for add"); +STATISTIC(NumAddNUW, "Number of no-unsigned-wrap deductions for add"); +STATISTIC(NumSubNW, "Number of no-wrap deductions for sub"); +STATISTIC(NumSubNSW, "Number of no-signed-wrap deductions for sub"); +STATISTIC(NumSubNUW, "Number of no-unsigned-wrap deductions for sub"); +STATISTIC(NumMulNW, "Number of no-wrap deductions for mul"); +STATISTIC(NumMulNSW, "Number of no-signed-wrap deductions for mul"); +STATISTIC(NumMulNUW, "Number of no-unsigned-wrap deductions for mul"); +STATISTIC(NumShlNW, "Number of no-wrap deductions for shl"); +STATISTIC(NumShlNSW, "Number of no-signed-wrap deductions for shl"); +STATISTIC(NumShlNUW, "Number of no-unsigned-wrap deductions for shl"); STATISTIC(NumOverflows, "Number of overflow checks removed"); STATISTIC(NumSaturating, "Number of saturating arithmetics converted to normal arithmetics"); @@ -85,6 +103,7 @@ namespace { AU.addRequired<LazyValueInfoWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<LazyValueInfoWrapperPass>(); } }; @@ -176,7 +195,14 @@ static bool simplifyCommonValuePhi(PHINode *P, LazyValueInfo *LVI, } // All constant incoming values map to the same variable along the incoming - // edges of the phi. The phi is unnecessary. + // edges of the phi. 
The phi is unnecessary. However, we must drop all + // poison-generating flags to ensure that no poison is propagated to the phi + // location by performing this substitution. + // Warning: If the underlying analysis changes, this may not be enough to + // guarantee that poison is not propagated. + // TODO: We may be able to re-infer flags by re-analyzing the instruction. + if (auto *CommonInst = dyn_cast<Instruction>(CommonValue)) + CommonInst->dropPoisonGeneratingFlags(); P->replaceAllUsesWith(CommonValue); P->eraseFromParent(); ++NumPhiCommon; @@ -416,37 +442,96 @@ static bool willNotOverflow(BinaryOpIntrinsic *BO, LazyValueInfo *LVI) { return NWRegion.contains(LRange); } -static void processOverflowIntrinsic(WithOverflowInst *WO) { - IRBuilder<> B(WO); - Value *NewOp = B.CreateBinOp( - WO->getBinaryOp(), WO->getLHS(), WO->getRHS(), WO->getName()); - // Constant-folding could have happened. - if (auto *Inst = dyn_cast<Instruction>(NewOp)) { - if (WO->isSigned()) +static void setDeducedOverflowingFlags(Value *V, Instruction::BinaryOps Opcode, + bool NewNSW, bool NewNUW) { + Statistic *OpcNW, *OpcNSW, *OpcNUW; + switch (Opcode) { + case Instruction::Add: + OpcNW = &NumAddNW; + OpcNSW = &NumAddNSW; + OpcNUW = &NumAddNUW; + break; + case Instruction::Sub: + OpcNW = &NumSubNW; + OpcNSW = &NumSubNSW; + OpcNUW = &NumSubNUW; + break; + case Instruction::Mul: + OpcNW = &NumMulNW; + OpcNSW = &NumMulNSW; + OpcNUW = &NumMulNUW; + break; + case Instruction::Shl: + OpcNW = &NumShlNW; + OpcNSW = &NumShlNSW; + OpcNUW = &NumShlNUW; + break; + default: + llvm_unreachable("Will not be called with other binops"); + } + + auto *Inst = dyn_cast<Instruction>(V); + if (NewNSW) { + ++NumNW; + ++*OpcNW; + ++NumNSW; + ++*OpcNSW; + if (Inst) Inst->setHasNoSignedWrap(); - else + } + if (NewNUW) { + ++NumNW; + ++*OpcNW; + ++NumNUW; + ++*OpcNUW; + if (Inst) Inst->setHasNoUnsignedWrap(); } +} + +static bool processBinOp(BinaryOperator *BinOp, LazyValueInfo *LVI); - Value *NewI = B.CreateInsertValue(UndefValue::get(WO->getType()), NewOp, 0); - NewI = B.CreateInsertValue(NewI, ConstantInt::getFalse(WO->getContext()), 1); +// Rewrite this with.overflow intrinsic as non-overflowing. +static void processOverflowIntrinsic(WithOverflowInst *WO, LazyValueInfo *LVI) { + IRBuilder<> B(WO); + Instruction::BinaryOps Opcode = WO->getBinaryOp(); + bool NSW = WO->isSigned(); + bool NUW = !WO->isSigned(); + + Value *NewOp = + B.CreateBinOp(Opcode, WO->getLHS(), WO->getRHS(), WO->getName()); + setDeducedOverflowingFlags(NewOp, Opcode, NSW, NUW); + + StructType *ST = cast<StructType>(WO->getType()); + Constant *Struct = ConstantStruct::get(ST, + { UndefValue::get(ST->getElementType(0)), + ConstantInt::getFalse(ST->getElementType(1)) }); + Value *NewI = B.CreateInsertValue(Struct, NewOp, 0); WO->replaceAllUsesWith(NewI); WO->eraseFromParent(); ++NumOverflows; + + // See if we can infer the other no-wrap too. 
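// Concretely, for a signed case this rewrite turns (a minimal IR sketch,
// assuming LVI has already proven the add cannot overflow):
//   %res = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
//   %val = extractvalue { i32, i1 } %res, 0
//   %ovf = extractvalue { i32, i1 } %res, 1
// into:
//   %val1 = add nsw i32 %a, %b
//   %res1 = insertvalue { i32, i1 } { i32 undef, i1 false }, i32 %val1, 0
// so %ovf folds to false; the processBinOp call just below then asks LVI
// whether the complementary flag (nuw here) can be added as well.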
+ if (auto *BO = dyn_cast<BinaryOperator>(NewOp)) + processBinOp(BO, LVI); } -static void processSaturatingInst(SaturatingInst *SI) { +static void processSaturatingInst(SaturatingInst *SI, LazyValueInfo *LVI) { + Instruction::BinaryOps Opcode = SI->getBinaryOp(); + bool NSW = SI->isSigned(); + bool NUW = !SI->isSigned(); BinaryOperator *BinOp = BinaryOperator::Create( - SI->getBinaryOp(), SI->getLHS(), SI->getRHS(), SI->getName(), SI); + Opcode, SI->getLHS(), SI->getRHS(), SI->getName(), SI); BinOp->setDebugLoc(SI->getDebugLoc()); - if (SI->isSigned()) - BinOp->setHasNoSignedWrap(); - else - BinOp->setHasNoUnsignedWrap(); + setDeducedOverflowingFlags(BinOp, Opcode, NSW, NUW); SI->replaceAllUsesWith(BinOp); SI->eraseFromParent(); ++NumSaturating; + + // See if we can infer the other no-wrap too. + if (auto *BO = dyn_cast<BinaryOperator>(BinOp)) + processBinOp(BO, LVI); } /// Infer nonnull attributes for the arguments at the specified callsite. @@ -456,14 +541,14 @@ static bool processCallSite(CallSite CS, LazyValueInfo *LVI) { if (auto *WO = dyn_cast<WithOverflowInst>(CS.getInstruction())) { if (WO->getLHS()->getType()->isIntegerTy() && willNotOverflow(WO, LVI)) { - processOverflowIntrinsic(WO); + processOverflowIntrinsic(WO, LVI); return true; } } if (auto *SI = dyn_cast<SaturatingInst>(CS.getInstruction())) { if (SI->getType()->isIntegerTy() && willNotOverflow(SI, LVI)) { - processSaturatingInst(SI); + processSaturatingInst(SI, LVI); return true; } } @@ -632,6 +717,27 @@ static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) { return true; } +static bool processSExt(SExtInst *SDI, LazyValueInfo *LVI) { + if (SDI->getType()->isVectorTy()) + return false; + + Value *Base = SDI->getOperand(0); + + Constant *Zero = ConstantInt::get(Base->getType(), 0); + if (LVI->getPredicateAt(ICmpInst::ICMP_SGE, Base, Zero, SDI) != + LazyValueInfo::True) + return false; + + ++NumSExt; + auto *ZExt = + CastInst::CreateZExtOrBitCast(Base, SDI->getType(), SDI->getName(), SDI); + ZExt->setDebugLoc(SDI->getDebugLoc()); + SDI->replaceAllUsesWith(ZExt); + SDI->eraseFromParent(); + + return true; +} + static bool processBinOp(BinaryOperator *BinOp, LazyValueInfo *LVI) { using OBO = OverflowingBinaryOperator; @@ -648,6 +754,7 @@ static bool processBinOp(BinaryOperator *BinOp, LazyValueInfo *LVI) { BasicBlock *BB = BinOp->getParent(); + Instruction::BinaryOps Opcode = BinOp->getOpcode(); Value *LHS = BinOp->getOperand(0); Value *RHS = BinOp->getOperand(1); @@ -655,24 +762,48 @@ static bool processBinOp(BinaryOperator *BinOp, LazyValueInfo *LVI) { ConstantRange RRange = LVI->getConstantRange(RHS, BB, BinOp); bool Changed = false; + bool NewNUW = false, NewNSW = false; if (!NUW) { ConstantRange NUWRange = ConstantRange::makeGuaranteedNoWrapRegion( - BinOp->getOpcode(), RRange, OBO::NoUnsignedWrap); - bool NewNUW = NUWRange.contains(LRange); - BinOp->setHasNoUnsignedWrap(NewNUW); + Opcode, RRange, OBO::NoUnsignedWrap); + NewNUW = NUWRange.contains(LRange); Changed |= NewNUW; } if (!NSW) { ConstantRange NSWRange = ConstantRange::makeGuaranteedNoWrapRegion( - BinOp->getOpcode(), RRange, OBO::NoSignedWrap); - bool NewNSW = NSWRange.contains(LRange); - BinOp->setHasNoSignedWrap(NewNSW); + Opcode, RRange, OBO::NoSignedWrap); + NewNSW = NSWRange.contains(LRange); Changed |= NewNSW; } + setDeducedOverflowingFlags(BinOp, Opcode, NewNSW, NewNUW); + return Changed; } +static bool processAnd(BinaryOperator *BinOp, LazyValueInfo *LVI) { + if (BinOp->getType()->isVectorTy()) + return false; + + // Pattern match (and lhs, 
C) where C includes a superset of bits which might + // be set in lhs. This is a common truncation idiom created by instcombine. + BasicBlock *BB = BinOp->getParent(); + Value *LHS = BinOp->getOperand(0); + ConstantInt *RHS = dyn_cast<ConstantInt>(BinOp->getOperand(1)); + if (!RHS || !RHS->getValue().isMask()) + return false; + + ConstantRange LRange = LVI->getConstantRange(LHS, BB, BinOp); + if (!LRange.getUnsignedMax().ule(RHS->getValue())) + return false; + + BinOp->replaceAllUsesWith(LHS); + BinOp->eraseFromParent(); + NumAnd++; + return true; +} + + static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) { if (Constant *C = LVI->getConstant(V, At->getParent(), At)) return C; @@ -740,10 +871,18 @@ static bool runImpl(Function &F, LazyValueInfo *LVI, DominatorTree *DT, case Instruction::AShr: BBChanged |= processAShr(cast<BinaryOperator>(II), LVI); break; + case Instruction::SExt: + BBChanged |= processSExt(cast<SExtInst>(II), LVI); + break; case Instruction::Add: case Instruction::Sub: + case Instruction::Mul: + case Instruction::Shl: BBChanged |= processBinOp(cast<BinaryOperator>(II), LVI); break; + case Instruction::And: + BBChanged |= processAnd(cast<BinaryOperator>(II), LVI); + break; } } @@ -796,5 +935,6 @@ CorrelatedValuePropagationPass::run(Function &F, FunctionAnalysisManager &AM) { PreservedAnalyses PA; PA.preserve<GlobalsAA>(); PA.preserve<DominatorTreeAnalysis>(); + PA.preserve<LazyValueAnalysis>(); return PA; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DCE.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DCE.cpp index 479e0ed74074..a4b0c8df98f6 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DCE.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DCE.cpp @@ -19,12 +19,14 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/DebugCounter.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; #define DEBUG_TYPE "dce" @@ -38,17 +40,19 @@ namespace { //===--------------------------------------------------------------------===// // DeadInstElimination pass implementation // - struct DeadInstElimination : public BasicBlockPass { - static char ID; // Pass identification, replacement for typeid - DeadInstElimination() : BasicBlockPass(ID) { - initializeDeadInstEliminationPass(*PassRegistry::getPassRegistry()); - } - bool runOnBasicBlock(BasicBlock &BB) override { - if (skipBasicBlock(BB)) - return false; - auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); - TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr; - bool Changed = false; +struct DeadInstElimination : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + DeadInstElimination() : FunctionPass(ID) { + initializeDeadInstEliminationPass(*PassRegistry::getPassRegistry()); + } + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); + TargetLibraryInfo *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; + + bool Changed = false; + for (auto &BB : F) { for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) { Instruction *Inst = &*DI++; if (isInstructionTriviallyDead(Inst, TLI)) { @@ -60,13 +64,14 @@ namespace { ++DIEEliminated; } } - return Changed; } + return Changed; + } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); } - }; +}; } char DeadInstElimination::ID = 0; @@ -77,6 +82,43 @@ Pass *llvm::createDeadInstEliminationPass() { return new DeadInstElimination(); } +//===--------------------------------------------------------------------===// +// RedundantDbgInstElimination pass implementation +// + +namespace { +struct RedundantDbgInstElimination : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + RedundantDbgInstElimination() : FunctionPass(ID) { + initializeRedundantDbgInstEliminationPass(*PassRegistry::getPassRegistry()); + } + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + bool Changed = false; + for (auto &BB : F) + Changed |= RemoveRedundantDbgInstrs(&BB); + return Changed; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + } +}; +} + +char RedundantDbgInstElimination::ID = 0; +INITIALIZE_PASS(RedundantDbgInstElimination, "redundant-dbg-inst-elim", + "Redundant Dbg Instruction Elimination", false, false) + +Pass *llvm::createRedundantDbgInstEliminationPass() { + return new RedundantDbgInstElimination(); +} + +//===--------------------------------------------------------------------===// +// DeadCodeElimination pass implementation +// + static bool DCEInstruction(Instruction *I, SmallSetVector<Instruction *, 16> &WorkList, const TargetLibraryInfo *TLI) { @@ -154,7 +196,7 @@ struct DCELegacyPass : public FunctionPass { return false; auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); - TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr; + TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; return eliminateDeadCode(F, TLI); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index a81645745b48..1ba4aab999e1 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -17,6 +17,7 @@ #include "llvm/Transforms/Scalar/DeadStoreElimination.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" @@ -48,6 +49,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -99,6 +101,7 @@ static void deleteDeadInstruction(Instruction *I, BasicBlock::iterator *BBI, MemoryDependenceResults &MD, const TargetLibraryInfo &TLI, InstOverlapIntervalsTy &IOL, OrderedBasicBlock &OBB, + MapVector<Instruction *, bool> &ThrowableInst, SmallSetVector<const Value *, 16> *ValueSet = nullptr) { SmallVector<Instruction*, 32> NowDeadInsts; @@ -112,6 +115,10 @@ deleteDeadInstruction(Instruction *I, BasicBlock::iterator *BBI, // Before we touch this instruction, remove it from memdep! 
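// (Outline of the ThrowableInst bookkeeping threaded through this file, which
// replaces the old single LastThrowing pointer; a condensed sketch using the
// same MapVector<Instruction *, bool>:
//   ThrowableInst[I] = true;            // record each may-throw instruction
//   ThrowableInst[DeadInst] = false;    // flip to false when it is deleted
//   while (!ThrowableInst.empty() && !ThrowableInst.back().second)
//     ThrowableInst.pop_back();         // trim entries already deleted
//   Instruction *LastThrowing =
//       ThrowableInst.empty() ? nullptr : ThrowableInst.back().first;
// MapVector keeps insertion order, so back() is the most recent still-live
// throwing instruction.)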
do { Instruction *DeadInst = NowDeadInsts.pop_back_val(); + // Mark the DeadInst as dead in the list of throwable instructions. + auto It = ThrowableInst.find(DeadInst); + if (It != ThrowableInst.end()) + ThrowableInst[It->first] = false; ++NumFastOther; // Try to preserve debug information attached to the dead instruction. @@ -144,6 +151,9 @@ deleteDeadInstruction(Instruction *I, BasicBlock::iterator *BBI, DeadInst->eraseFromParent(); } while (!NowDeadInsts.empty()); *BBI = NewIter; + // Pop dead entries from back of ThrowableInst till we find an alive entry. + while (!ThrowableInst.empty() && !ThrowableInst.back().second) + ThrowableInst.pop_back(); } /// Does this instruction write some memory? This only returns true for things @@ -169,15 +179,18 @@ static bool hasAnalyzableMemoryWrite(Instruction *I, } if (auto CS = CallSite(I)) { if (Function *F = CS.getCalledFunction()) { - StringRef FnName = F->getName(); - if (TLI.has(LibFunc_strcpy) && FnName == TLI.getName(LibFunc_strcpy)) - return true; - if (TLI.has(LibFunc_strncpy) && FnName == TLI.getName(LibFunc_strncpy)) - return true; - if (TLI.has(LibFunc_strcat) && FnName == TLI.getName(LibFunc_strcat)) - return true; - if (TLI.has(LibFunc_strncat) && FnName == TLI.getName(LibFunc_strncat)) - return true; + LibFunc LF; + if (TLI.getLibFunc(*F, LF) && TLI.has(LF)) { + switch (LF) { + case LibFunc_strcpy: + case LibFunc_strncpy: + case LibFunc_strcat: + case LibFunc_strncat: + return true; + default: + return false; + } + } } } return false; @@ -656,7 +669,8 @@ static void findUnconditionalPreds(SmallVectorImpl<BasicBlock *> &Blocks, static bool handleFree(CallInst *F, AliasAnalysis *AA, MemoryDependenceResults *MD, DominatorTree *DT, const TargetLibraryInfo *TLI, - InstOverlapIntervalsTy &IOL, OrderedBasicBlock &OBB) { + InstOverlapIntervalsTy &IOL, OrderedBasicBlock &OBB, + MapVector<Instruction *, bool> &ThrowableInst) { bool MadeChange = false; MemoryLocation Loc = MemoryLocation(F->getOperand(0)); @@ -690,7 +704,8 @@ static bool handleFree(CallInst *F, AliasAnalysis *AA, // DCE instructions only used to calculate that store. BasicBlock::iterator BBI(Dependency); - deleteDeadInstruction(Dependency, &BBI, *MD, *TLI, IOL, OBB); + deleteDeadInstruction(Dependency, &BBI, *MD, *TLI, IOL, OBB, + ThrowableInst); ++NumFastStores; MadeChange = true; @@ -747,8 +762,8 @@ static void removeAccessedObjects(const MemoryLocation &LoadedLoc, static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA, MemoryDependenceResults *MD, const TargetLibraryInfo *TLI, - InstOverlapIntervalsTy &IOL, - OrderedBasicBlock &OBB) { + InstOverlapIntervalsTy &IOL, OrderedBasicBlock &OBB, + MapVector<Instruction *, bool> &ThrowableInst) { bool MadeChange = false; // Keep track of all of the stack objects that are dead at the end of the @@ -809,7 +824,7 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA, << '\n'); // DCE instructions only used to calculate that store. 
- deleteDeadInstruction(Dead, &BBI, *MD, *TLI, IOL, OBB, + deleteDeadInstruction(Dead, &BBI, *MD, *TLI, IOL, OBB, ThrowableInst, &DeadStackObjects); ++NumFastStores; MadeChange = true; @@ -821,7 +836,7 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA, if (isInstructionTriviallyDead(&*BBI, TLI)) { LLVM_DEBUG(dbgs() << "DSE: Removing trivially dead instruction:\n DEAD: " << *&*BBI << '\n'); - deleteDeadInstruction(&*BBI, &BBI, *MD, *TLI, IOL, OBB, + deleteDeadInstruction(&*BBI, &BBI, *MD, *TLI, IOL, OBB, ThrowableInst, &DeadStackObjects); ++NumFastOther; MadeChange = true; @@ -1028,7 +1043,8 @@ static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI, const DataLayout &DL, const TargetLibraryInfo *TLI, InstOverlapIntervalsTy &IOL, - OrderedBasicBlock &OBB) { + OrderedBasicBlock &OBB, + MapVector<Instruction *, bool> &ThrowableInst) { // Must be a store instruction. StoreInst *SI = dyn_cast<StoreInst>(Inst); if (!SI) @@ -1044,7 +1060,7 @@ static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI, dbgs() << "DSE: Remove Store Of Load from same pointer:\n LOAD: " << *DepLoad << "\n STORE: " << *SI << '\n'); - deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, OBB); + deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, OBB, ThrowableInst); ++NumRedundantStores; return true; } @@ -1062,7 +1078,7 @@ static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI, dbgs() << "DSE: Remove null store to the calloc'ed object:\n DEAD: " << *Inst << "\n OBJECT: " << *UnderlyingPointer << '\n'); - deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, OBB); + deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, OBB, ThrowableInst); ++NumRedundantStores; return true; } @@ -1077,7 +1093,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, bool MadeChange = false; OrderedBasicBlock OBB(&BB); - Instruction *LastThrowing = nullptr; + MapVector<Instruction *, bool> ThrowableInst; // A map of interval maps representing partially-overwritten value parts. InstOverlapIntervalsTy IOL; @@ -1086,7 +1102,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) { // Handle 'free' calls specially. if (CallInst *F = isFreeCall(&*BBI, TLI)) { - MadeChange |= handleFree(F, AA, MD, DT, TLI, IOL, OBB); + MadeChange |= handleFree(F, AA, MD, DT, TLI, IOL, OBB, ThrowableInst); // Increment BBI after handleFree has potentially deleted instructions. // This ensures we maintain a valid iterator. ++BBI; @@ -1096,7 +1112,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, Instruction *Inst = &*BBI++; if (Inst->mayThrow()) { - LastThrowing = Inst; + ThrowableInst[Inst] = true; continue; } @@ -1105,7 +1121,8 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, continue; // eliminateNoopStore will update in iterator, if necessary. - if (eliminateNoopStore(Inst, BBI, AA, MD, DL, TLI, IOL, OBB)) { + if (eliminateNoopStore(Inst, BBI, AA, MD, DL, TLI, IOL, OBB, + ThrowableInst)) { MadeChange = true; continue; } @@ -1148,6 +1165,12 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, if (!DepLoc.Ptr) break; + // Find the last throwable instruction not removed by call to + // deleteDeadInstruction. + Instruction *LastThrowing = nullptr; + if (!ThrowableInst.empty()) + LastThrowing = ThrowableInst.back().first; + // Make sure we don't look past a call which might throw. 
This is an // issue because MemoryDependenceAnalysis works in the wrong direction: // it finds instructions which dominate the current instruction, rather than @@ -1187,7 +1210,8 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, << "\n KILLER: " << *Inst << '\n'); // Delete the store and now-dead instructions that feed it. - deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL, OBB); + deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL, OBB, + ThrowableInst); ++NumFastStores; MadeChange = true; @@ -1254,8 +1278,9 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, auto *SI = new StoreInst( ConstantInt::get(Earlier->getValueOperand()->getType(), Merged), - Earlier->getPointerOperand(), false, Earlier->getAlignment(), - Earlier->getOrdering(), Earlier->getSyncScopeID(), DepWrite); + Earlier->getPointerOperand(), false, + MaybeAlign(Earlier->getAlignment()), Earlier->getOrdering(), + Earlier->getSyncScopeID(), DepWrite); unsigned MDToKeep[] = {LLVMContext::MD_dbg, LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, @@ -1268,8 +1293,10 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, OBB.replaceInstruction(DepWrite, SI); // Delete the old stores and now-dead instructions that feed them. - deleteDeadInstruction(Inst, &BBI, *MD, *TLI, IOL, OBB); - deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL, OBB); + deleteDeadInstruction(Inst, &BBI, *MD, *TLI, IOL, OBB, + ThrowableInst); + deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL, OBB, + ThrowableInst); MadeChange = true; // We erased DepWrite and Inst (Loc); start over. @@ -1304,7 +1331,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, // If this block ends in a return, unwind, or unreachable, all allocas are // dead at its end, which means stores to them are also dead. if (BB.getTerminator()->getNumSuccessors() == 0) - MadeChange |= handleEndBlock(BB, AA, MD, TLI, IOL, OBB); + MadeChange |= handleEndBlock(BB, AA, MD, TLI, IOL, OBB, ThrowableInst); return MadeChange; } @@ -1361,7 +1388,7 @@ public: MemoryDependenceResults *MD = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep(); const TargetLibraryInfo *TLI = - &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); return eliminateDeadStores(F, AA, MD, DT, TLI); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DivRemPairs.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DivRemPairs.cpp index e64651d97495..132dfc8f6da1 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DivRemPairs.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DivRemPairs.cpp @@ -1,4 +1,4 @@ -//===- DivRemPairs.cpp - Hoist/decompose division and remainder -*- C++ -*-===// +//===- DivRemPairs.cpp - Hoist/[dr]ecompose division and remainder --------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// This pass hoists and/or decomposes integer division and remainder +// This pass hoists and/or decomposes/recomposes integer division and remainder // instructions to enable CFG improvements and better codegen. 
// //===----------------------------------------------------------------------===// @@ -19,20 +19,58 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/DebugCounter.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BypassSlowDivision.h" using namespace llvm; +using namespace llvm::PatternMatch; #define DEBUG_TYPE "div-rem-pairs" STATISTIC(NumPairs, "Number of div/rem pairs"); +STATISTIC(NumRecomposed, "Number of instructions recomposed"); STATISTIC(NumHoisted, "Number of instructions hoisted"); STATISTIC(NumDecomposed, "Number of instructions decomposed"); DEBUG_COUNTER(DRPCounter, "div-rem-pairs-transform", "Controls transformations in div-rem-pairs pass"); +namespace { +struct ExpandedMatch { + DivRemMapKey Key; + Instruction *Value; +}; +} // namespace + +/// See if we can match: (which is the form we expand into) +/// X - ((X ?/ Y) * Y) +/// which is equivalent to: +/// X ?% Y +static llvm::Optional<ExpandedMatch> matchExpandedRem(Instruction &I) { + Value *Dividend, *XroundedDownToMultipleOfY; + if (!match(&I, m_Sub(m_Value(Dividend), m_Value(XroundedDownToMultipleOfY)))) + return llvm::None; + + Value *Divisor; + Instruction *Div; + // Look for ((X / Y) * Y) + if (!match( + XroundedDownToMultipleOfY, + m_c_Mul(m_CombineAnd(m_IDiv(m_Specific(Dividend), m_Value(Divisor)), + m_Instruction(Div)), + m_Deferred(Divisor)))) + return llvm::None; + + ExpandedMatch M; + M.Key.SignedOp = Div->getOpcode() == Instruction::SDiv; + M.Key.Dividend = Dividend; + M.Key.Divisor = Divisor; + M.Value = &I; + return M; +} + /// A thin wrapper to store two values that we matched as div-rem pair. /// We want this extra indirection to avoid dealing with RAUW'ing the map keys. struct DivRemPairWorklistEntry { @@ -62,6 +100,16 @@ struct DivRemPairWorklistEntry { /// In this pair, what are the divident and divisor? Value *getDividend() const { return DivInst->getOperand(0); } Value *getDivisor() const { return DivInst->getOperand(1); } + + bool isRemExpanded() const { + switch (RemInst->getOpcode()) { + case Instruction::SRem: + case Instruction::URem: + return false; // single 'rem' instruction - unexpanded form. + default: + return true; // anything else means we have remainder in expanded form. + } + } }; using DivRemWorklistTy = SmallVector<DivRemPairWorklistEntry, 4>; @@ -87,6 +135,8 @@ static DivRemWorklistTy getWorklist(Function &F) { RemMap[DivRemMapKey(true, I.getOperand(0), I.getOperand(1))] = &I; else if (I.getOpcode() == Instruction::URem) RemMap[DivRemMapKey(false, I.getOperand(0), I.getOperand(1))] = &I; + else if (auto Match = matchExpandedRem(I)) + RemMap[Match->Key] = Match->Value; } } @@ -137,11 +187,43 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI, // Process each entry in the worklist. for (DivRemPairWorklistEntry &E : Worklist) { + if (!DebugCounter::shouldExecute(DRPCounter)) + continue; + bool HasDivRemOp = TTI.hasDivRemOp(E.getType(), E.isSigned()); auto &DivInst = E.DivInst; auto &RemInst = E.RemInst; + const bool RemOriginallyWasInExpandedForm = E.isRemExpanded(); + (void)RemOriginallyWasInExpandedForm; // suppress unused variable warning + + if (HasDivRemOp && E.isRemExpanded()) { + // The target supports div+rem but the rem is expanded. + // We should recompose it first. 
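// In IR terms, the recomposition below rewrites the expanded form back into a
// single remainder (a minimal sketch, assuming hasDivRemOp() is true for this
// type):
//   %div = sdiv i32 %x, %y
//   %mul = mul i32 %div, %y
//   %rem = sub i32 %x, %mul
// becomes
//   %div = sdiv i32 %x, %y
//   %rem.recomposed = srem i32 %x, %y
// with the original sub erased and the now-unused ((X / Y) * Y) multiply left
// behind for later dead-code cleanup, as noted further down.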
+ Value *X = E.getDividend(); + Value *Y = E.getDivisor(); + Instruction *RealRem = E.isSigned() ? BinaryOperator::CreateSRem(X, Y) + : BinaryOperator::CreateURem(X, Y); + // Note that we place it right next to the original expanded instruction, + // and letting further handling to move it if needed. + RealRem->setName(RemInst->getName() + ".recomposed"); + RealRem->insertAfter(RemInst); + Instruction *OrigRemInst = RemInst; + // Update AssertingVH<> with new instruction so it doesn't assert. + RemInst = RealRem; + // And replace the original instruction with the new one. + OrigRemInst->replaceAllUsesWith(RealRem); + OrigRemInst->eraseFromParent(); + NumRecomposed++; + // Note that we have left ((X / Y) * Y) around. + // If it had other uses we could rewrite it as X - X % Y + } + + assert((!E.isRemExpanded() || !HasDivRemOp) && + "*If* the target supports div-rem, then by now the RemInst *is* " + "Instruction::[US]Rem."); + // If the target supports div+rem and the instructions are in the same block // already, there's nothing to do. The backend should handle this. If the // target does not support div+rem, then we will decompose the rem. @@ -149,10 +231,16 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI, continue; bool DivDominates = DT.dominates(DivInst, RemInst); - if (!DivDominates && !DT.dominates(RemInst, DivInst)) + if (!DivDominates && !DT.dominates(RemInst, DivInst)) { + // We have matching div-rem pair, but they are in two different blocks, + // neither of which dominates one another. + // FIXME: We could hoist both ops to the common predecessor block? continue; + } - if (!DebugCounter::shouldExecute(DRPCounter)) + // The target does not have a single div/rem operation, + // and the rem is already in expanded form. Nothing to do. + if (!HasDivRemOp && E.isRemExpanded()) continue; if (HasDivRemOp) { @@ -164,9 +252,15 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI, DivInst->moveAfter(RemInst); NumHoisted++; } else { - // The target does not have a single div/rem operation. Decompose the - // remainder calculation as: + // The target does not have a single div/rem operation, + // and the rem is *not* in a already-expanded form. + // Decompose the remainder calculation as: // X % Y --> X - ((X / Y) * Y). 
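// The identity relied on here holds for signed and unsigned division alike
// whenever Y != 0 (both forms are equally undefined for Y == 0); in plain C:
//   unsigned rem_direct(unsigned X, unsigned Y)   { return X % Y; }
//   unsigned rem_expanded(unsigned X, unsigned Y) { return X - ((X / Y) * Y); }
// return the same value for every Y != 0, which is what makes this expansion
// (and the reverse recomposition above) semantics-preserving.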
+ + assert(!RemOriginallyWasInExpandedForm && + "We should not be expanding if the rem was in expanded form to " + "begin with."); + Value *X = E.getDividend(); Value *Y = E.getDivisor(); Instruction *Mul = BinaryOperator::CreateMul(DivInst, Y); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index f1f075257020..40c1ba88354f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -27,7 +27,6 @@ #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" @@ -45,6 +44,7 @@ #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/AtomicOrdering.h" @@ -55,6 +55,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/GuardUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include <cassert> #include <deque> #include <memory> @@ -108,11 +109,12 @@ struct SimpleValue { // This can only handle non-void readnone functions. if (CallInst *CI = dyn_cast<CallInst>(Inst)) return CI->doesNotAccessMemory() && !CI->getType()->isVoidTy(); - return isa<CastInst>(Inst) || isa<BinaryOperator>(Inst) || - isa<GetElementPtrInst>(Inst) || isa<CmpInst>(Inst) || - isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) || - isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst) || - isa<ExtractValueInst>(Inst) || isa<InsertValueInst>(Inst); + return isa<CastInst>(Inst) || isa<UnaryOperator>(Inst) || + isa<BinaryOperator>(Inst) || isa<GetElementPtrInst>(Inst) || + isa<CmpInst>(Inst) || isa<SelectInst>(Inst) || + isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) || + isa<ShuffleVectorInst>(Inst) || isa<ExtractValueInst>(Inst) || + isa<InsertValueInst>(Inst); } }; @@ -240,7 +242,7 @@ static unsigned getHashValueImpl(SimpleValue Val) { assert((isa<CallInst>(Inst) || isa<GetElementPtrInst>(Inst) || isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) || - isa<ShuffleVectorInst>(Inst)) && + isa<ShuffleVectorInst>(Inst) || isa<UnaryOperator>(Inst)) && "Invalid/unknown instruction"); // Mix in the opcode. @@ -526,7 +528,7 @@ public: const TargetTransformInfo &TTI, DominatorTree &DT, AssumptionCache &AC, MemorySSA *MSSA) : TLI(TLI), TTI(TTI), DT(DT), AC(AC), SQ(DL, &TLI, &DT, &AC), MSSA(MSSA), - MSSAUpdater(llvm::make_unique<MemorySSAUpdater>(MSSA)) {} + MSSAUpdater(std::make_unique<MemorySSAUpdater>(MSSA)) {} bool run(); @@ -651,7 +653,7 @@ private: bool isInvariantLoad() const { if (auto *LI = dyn_cast<LoadInst>(Inst)) - return LI->getMetadata(LLVMContext::MD_invariant_load) != nullptr; + return LI->hasMetadata(LLVMContext::MD_invariant_load); return false; } @@ -790,7 +792,7 @@ bool EarlyCSE::isOperatingOnInvariantMemAt(Instruction *I, unsigned GenAt) { // A location loaded from with an invariant_load is assumed to *never* change // within the visible scope of the compilation. 
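// For reference, the metadata queried here is attached to loads like so (a
// minimal IR example; the node has no operands, only its presence matters):
//   %v = load i32, i32* %p, !invariant.load !0
//   !0 = !{}
// hasMetadata() is simply the boolean form of the getMetadata() != nullptr
// test it replaces.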
if (auto *LI = dyn_cast<LoadInst>(I)) - if (LI->getMetadata(LLVMContext::MD_invariant_load)) + if (LI->hasMetadata(LLVMContext::MD_invariant_load)) return true; auto MemLocOpt = MemoryLocation::getOrNone(I); @@ -905,8 +907,8 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n"); continue; } - if (!salvageDebugInfo(*Inst)) - replaceDbgUsesWithUndef(Inst); + + salvageDebugInfoOrMarkUndef(*Inst); removeMSSA(Inst); Inst->eraseFromParent(); Changed = true; @@ -1359,7 +1361,7 @@ public: if (skipFunction(F)) return false; - auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); @@ -1381,6 +1383,7 @@ public: AU.addPreserved<MemorySSAWrapperPass>(); } AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); AU.setPreservesCFG(); } }; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp index 31670b1464e4..72512430b366 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp @@ -11,10 +11,13 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/CFG.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" + using namespace llvm; #define DEBUG_TYPE "flattencfg" @@ -52,15 +55,23 @@ FunctionPass *llvm::createFlattenCFGPass() { return new FlattenCFGPass(); } static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) { bool Changed = false; bool LocalChange = true; + + // Use block handles instead of iterating over function blocks directly + // to avoid using iterators invalidated by erasing blocks. + std::vector<WeakVH> Blocks; + Blocks.reserve(F.size()); + for (auto &BB : F) + Blocks.push_back(&BB); + while (LocalChange) { LocalChange = false; - // Loop over all of the basic blocks and remove them if they are unneeded... - // - for (Function::iterator BBIt = F.begin(); BBIt != F.end();) { - if (FlattenCFG(&*BBIt++, AA)) { - LocalChange = true; - } + // Loop over all of the basic blocks and try to flatten them. + for (WeakVH &BlockHandle : Blocks) { + // Skip blocks erased by FlattenCFG. 
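// (WeakVH is a value handle that nulls itself when the value it tracks is
// deleted, so the snapshot of handles built above remains safe to walk even
// as FlattenCFG erases blocks: an erased block simply reads back as null and
// is skipped here.)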
+ if (auto *BB = cast_or_null<BasicBlock>(BlockHandle)) + if (FlattenCFG(BB, AA)) + LocalChange = true; } Changed |= LocalChange; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Float2Int.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Float2Int.cpp index 4f83e869b303..af223cc837f2 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Float2Int.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Float2Int.cpp @@ -11,6 +11,8 @@ // //===----------------------------------------------------------------------===// +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" #define DEBUG_TYPE "float2int" #include "llvm/Transforms/Scalar/Float2Int.h" @@ -60,11 +62,13 @@ namespace { if (skipFunction(F)) return false; - return Impl.runImpl(F); + const DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + return Impl.runImpl(F, DT); } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); + AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); } @@ -116,21 +120,29 @@ static Instruction::BinaryOps mapBinOpcode(unsigned Opcode) { // Find the roots - instructions that convert from the FP domain to // integer domain. -void Float2IntPass::findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots) { - for (auto &I : instructions(F)) { - if (isa<VectorType>(I.getType())) +void Float2IntPass::findRoots(Function &F, const DominatorTree &DT, + SmallPtrSet<Instruction*,8> &Roots) { + for (BasicBlock &BB : F) { + // Unreachable code can take on strange forms that we are not prepared to + // handle. For example, an instruction may have itself as an operand. + if (!DT.isReachableFromEntry(&BB)) continue; - switch (I.getOpcode()) { - default: break; - case Instruction::FPToUI: - case Instruction::FPToSI: - Roots.insert(&I); - break; - case Instruction::FCmp: - if (mapFCmpPred(cast<CmpInst>(&I)->getPredicate()) != - CmpInst::BAD_ICMP_PREDICATE) + + for (Instruction &I : BB) { + if (isa<VectorType>(I.getType())) + continue; + switch (I.getOpcode()) { + default: break; + case Instruction::FPToUI: + case Instruction::FPToSI: Roots.insert(&I); - break; + break; + case Instruction::FCmp: + if (mapFCmpPred(cast<CmpInst>(&I)->getPredicate()) != + CmpInst::BAD_ICMP_PREDICATE) + Roots.insert(&I); + break; + } } } } @@ -503,7 +515,7 @@ void Float2IntPass::cleanup() { I.first->eraseFromParent(); } -bool Float2IntPass::runImpl(Function &F) { +bool Float2IntPass::runImpl(Function &F, const DominatorTree &DT) { LLVM_DEBUG(dbgs() << "F2I: Looking at function " << F.getName() << "\n"); // Clear out all state. 
ECs = EquivalenceClasses<Instruction*>(); @@ -513,7 +525,7 @@ bool Float2IntPass::runImpl(Function &F) { Ctx = &F.getParent()->getContext(); - findRoots(F, Roots); + findRoots(F, DT, Roots); walkBackwards(Roots); walkForwards(); @@ -527,8 +539,9 @@ bool Float2IntPass::runImpl(Function &F) { namespace llvm { FunctionPass *createFloat2IntPass() { return new Float2IntLegacyPass(); } -PreservedAnalyses Float2IntPass::run(Function &F, FunctionAnalysisManager &) { - if (!runImpl(F)) +PreservedAnalyses Float2IntPass::run(Function &F, FunctionAnalysisManager &AM) { + const DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F); + if (!runImpl(F, DT)) return PreservedAnalyses::all(); PreservedAnalyses PA; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVN.cpp index 542d3b3f7814..1e6aab14e7b4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVN.cpp @@ -64,12 +64,14 @@ #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -111,7 +113,7 @@ static cl::opt<uint32_t> MaxNumDeps( struct llvm::GVN::Expression { uint32_t opcode; - Type *type; + Type *type = nullptr; bool commutative = false; SmallVector<uint32_t, 4> varargs; @@ -172,7 +174,7 @@ struct llvm::gvn::AvailableValue { PointerIntPair<Value *, 2, ValType> Val; /// Offset - The byte offset in Val that is interesting for the load query. - unsigned Offset; + unsigned Offset = 0; static AvailableValue get(Value *V, unsigned Offset = 0) { AvailableValue Res; @@ -236,7 +238,7 @@ struct llvm::gvn::AvailableValue { /// the associated BasicBlock. struct llvm::gvn::AvailableValueInBlock { /// BB - The basic block in question. - BasicBlock *BB; + BasicBlock *BB = nullptr; /// AV - The actual available value AvailableValue AV; @@ -363,6 +365,7 @@ GVN::ValueTable::ValueTable() = default; GVN::ValueTable::ValueTable(const ValueTable &) = default; GVN::ValueTable::ValueTable(ValueTable &&) = default; GVN::ValueTable::~ValueTable() = default; +GVN::ValueTable &GVN::ValueTable::operator=(const GVN::ValueTable &Arg) = default; /// add - Insert a value into the table with a specified value number. void GVN::ValueTable::add(Value *V, uint32_t num) { @@ -626,6 +629,8 @@ PreservedAnalyses GVN::run(Function &F, FunctionAnalysisManager &AM) { PA.preserve<DominatorTreeAnalysis>(); PA.preserve<GlobalsAA>(); PA.preserve<TargetLibraryAnalysis>(); + if (LI) + PA.preserve<LoopAnalysis>(); return PA; } @@ -1161,15 +1166,30 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // Do PHI translation to get its value in the predecessor if necessary. The // returned pointer (if non-null) is guaranteed to dominate UnavailablePred. + // We do the translation for each edge we skipped by going from LI's block + // to LoadBB, otherwise we might miss pieces needing translation. // If all preds have a single successor, then we know it is safe to insert // the load on the pred (?!?), so we can insert code to materialize the // pointer if it is not available. 
- PHITransAddr Address(LI->getPointerOperand(), DL, AC); - Value *LoadPtr = nullptr; - LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred, - *DT, NewInsts); + Value *LoadPtr = LI->getPointerOperand(); + BasicBlock *Cur = LI->getParent(); + while (Cur != LoadBB) { + PHITransAddr Address(LoadPtr, DL, AC); + LoadPtr = Address.PHITranslateWithInsertion( + Cur, Cur->getSinglePredecessor(), *DT, NewInsts); + if (!LoadPtr) { + CanDoPRE = false; + break; + } + Cur = Cur->getSinglePredecessor(); + } + if (LoadPtr) { + PHITransAddr Address(LoadPtr, DL, AC); + LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred, *DT, + NewInsts); + } // If we couldn't find or insert a computation of this phi translated value, // we fail PRE. if (!LoadPtr) { @@ -1184,8 +1204,12 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, if (!CanDoPRE) { while (!NewInsts.empty()) { - Instruction *I = NewInsts.pop_back_val(); - markInstructionForDeletion(I); + // Erase instructions generated by the failed PHI translation before + // trying to number them. PHI translation might insert instructions + // in basic blocks other than the current one, and we delete them + // directly, as markInstructionForDeletion only allows removing from the + // current basic block. + NewInsts.pop_back_val()->eraseFromParent(); } // HINT: Don't revert the edge-splitting as following transformation may // also need to split these critical edges. @@ -1219,10 +1243,10 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, BasicBlock *UnavailablePred = PredLoad.first; Value *LoadPtr = PredLoad.second; - auto *NewLoad = - new LoadInst(LI->getType(), LoadPtr, LI->getName() + ".pre", - LI->isVolatile(), LI->getAlignment(), LI->getOrdering(), - LI->getSyncScopeID(), UnavailablePred->getTerminator()); + auto *NewLoad = new LoadInst( + LI->getType(), LoadPtr, LI->getName() + ".pre", LI->isVolatile(), + MaybeAlign(LI->getAlignment()), LI->getOrdering(), LI->getSyncScopeID(), + UnavailablePred->getTerminator()); NewLoad->setDebugLoc(LI->getDebugLoc()); // Transfer the old load's AA tags to the new load. @@ -1365,6 +1389,67 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { return PerformLoadPRE(LI, ValuesPerBlock, UnavailableBlocks); } +static bool impliesEquivalanceIfTrue(CmpInst* Cmp) { + if (Cmp->getPredicate() == CmpInst::Predicate::ICMP_EQ) + return true; + + // Floating point comparisons can be equal, but not equivalent. Cases: + // NaNs for unordered operators + // +0.0 vs 0.0 for all operators + if (Cmp->getPredicate() == CmpInst::Predicate::FCMP_OEQ || + (Cmp->getPredicate() == CmpInst::Predicate::FCMP_UEQ && + Cmp->getFastMathFlags().noNaNs())) { + Value *LHS = Cmp->getOperand(0); + Value *RHS = Cmp->getOperand(1); + // If we can prove either side non-zero, then equality must imply + // equivalence. + // FIXME: We should do this optimization if 'no signed zeros' is + // applicable via an instruction-level fast-math-flag or some other + // indicator that relaxed FP semantics are being used. + if (isa<ConstantFP>(LHS) && !cast<ConstantFP>(LHS)->isZero()) + return true; + if (isa<ConstantFP>(RHS) && !cast<ConstantFP>(RHS)->isZero()) + return true;; + // TODO: Handle vector floating point constants + } + return false; +} + +static bool impliesEquivalanceIfFalse(CmpInst* Cmp) { + if (Cmp->getPredicate() == CmpInst::Predicate::ICMP_NE) + return true; + + // Floating point comparisons can be equal, but not equivelent. 
Cases: + // NaNs for unordered operators + // +0.0 vs 0.0 for all operators + if ((Cmp->getPredicate() == CmpInst::Predicate::FCMP_ONE && + Cmp->getFastMathFlags().noNaNs()) || + Cmp->getPredicate() == CmpInst::Predicate::FCMP_UNE) { + Value *LHS = Cmp->getOperand(0); + Value *RHS = Cmp->getOperand(1); + // If we can prove either side non-zero, then equality must imply + // equivalence. + // FIXME: We should do this optimization if 'no signed zeros' is + // applicable via an instruction-level fast-math-flag or some other + // indicator that relaxed FP semantics are being used. + if (isa<ConstantFP>(LHS) && !cast<ConstantFP>(LHS)->isZero()) + return true; + if (isa<ConstantFP>(RHS) && !cast<ConstantFP>(RHS)->isZero()) + return true;; + // TODO: Handle vector floating point constants + } + return false; +} + + +static bool hasUsersIn(Value *V, BasicBlock *BB) { + for (User *U : V->users()) + if (isa<Instruction>(U) && + cast<Instruction>(U)->getParent() == BB) + return true; + return false; +} + bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) { assert(IntrinsicI->getIntrinsicID() == Intrinsic::assume && "This function can only be called with llvm.assume intrinsic"); @@ -1403,26 +1488,65 @@ bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) { // We can replace assume value with true, which covers cases like this: // call void @llvm.assume(i1 %cmp) // br i1 %cmp, label %bb1, label %bb2 ; will change %cmp to true - ReplaceWithConstMap[V] = True; - - // If one of *cmp *eq operand is const, adding it to map will cover this: + ReplaceOperandsWithMap[V] = True; + + // If we find an equality fact, canonicalize all dominated uses in this block + // to one of the two values. We heuristically choice the "oldest" of the + // two where age is determined by value number. (Note that propagateEquality + // above handles the cross block case.) + // + // Key case to cover are: + // 1) // %cmp = fcmp oeq float 3.000000e+00, %0 ; const on lhs could happen // call void @llvm.assume(i1 %cmp) // ret float %0 ; will change it to ret float 3.000000e+00 + // 2) + // %load = load float, float* %addr + // %cmp = fcmp oeq float %load, %0 + // call void @llvm.assume(i1 %cmp) + // ret float %load ; will change it to ret float %0 if (auto *CmpI = dyn_cast<CmpInst>(V)) { - if (CmpI->getPredicate() == CmpInst::Predicate::ICMP_EQ || - CmpI->getPredicate() == CmpInst::Predicate::FCMP_OEQ || - (CmpI->getPredicate() == CmpInst::Predicate::FCMP_UEQ && - CmpI->getFastMathFlags().noNaNs())) { + if (impliesEquivalanceIfTrue(CmpI)) { Value *CmpLHS = CmpI->getOperand(0); Value *CmpRHS = CmpI->getOperand(1); - if (isa<Constant>(CmpLHS)) + // Heuristically pick the better replacement -- the choice of heuristic + // isn't terribly important here, but the fact we canonicalize on some + // replacement is for exposing other simplifications. + // TODO: pull this out as a helper function and reuse w/existing + // (slightly different) logic. + if (isa<Constant>(CmpLHS) && !isa<Constant>(CmpRHS)) std::swap(CmpLHS, CmpRHS); - auto *RHSConst = dyn_cast<Constant>(CmpRHS); + if (!isa<Instruction>(CmpLHS) && isa<Instruction>(CmpRHS)) + std::swap(CmpLHS, CmpRHS); + if ((isa<Argument>(CmpLHS) && isa<Argument>(CmpRHS)) || + (isa<Instruction>(CmpLHS) && isa<Instruction>(CmpRHS))) { + // Move the 'oldest' value to the right-hand side, using the value + // number as a proxy for age. 
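// (lookupOrAdd hands out value numbers in the order values are first
// encountered, so a smaller number roughly means "defined earlier"; as noted
// above, the exact choice matters less than settling on *some* canonical
// side.)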
+ uint32_t LVN = VN.lookupOrAdd(CmpLHS); + uint32_t RVN = VN.lookupOrAdd(CmpRHS); + if (LVN < RVN) + std::swap(CmpLHS, CmpRHS); + } - // If only one operand is constant. - if (RHSConst != nullptr && !isa<Constant>(CmpLHS)) - ReplaceWithConstMap[CmpLHS] = RHSConst; + // Handle degenerate case where we either haven't pruned a dead path or a + // removed a trivial assume yet. + if (isa<Constant>(CmpLHS) && isa<Constant>(CmpRHS)) + return Changed; + + LLVM_DEBUG(dbgs() << "Replacing dominated uses of " + << *CmpLHS << " with " + << *CmpRHS << " in block " + << IntrinsicI->getParent()->getName() << "\n"); + + + // Setup the replacement map - this handles uses within the same block + if (hasUsersIn(CmpLHS, IntrinsicI->getParent())) + ReplaceOperandsWithMap[CmpLHS] = CmpRHS; + + // NOTE: The non-block local cases are handled by the call to + // propagateEquality above; this block is just about handling the block + // local cases. TODO: There's a bunch of logic in propagateEqualiy which + // isn't duplicated for the block local case, can we share it somehow? } } return Changed; @@ -1675,16 +1799,12 @@ void GVN::assignBlockRPONumber(Function &F) { InvalidBlockRPONumbers = false; } -// Tries to replace instruction with const, using information from -// ReplaceWithConstMap. -bool GVN::replaceOperandsWithConsts(Instruction *Instr) const { +bool GVN::replaceOperandsForInBlockEquality(Instruction *Instr) const { bool Changed = false; for (unsigned OpNum = 0; OpNum < Instr->getNumOperands(); ++OpNum) { - Value *Operand = Instr->getOperand(OpNum); - auto it = ReplaceWithConstMap.find(Operand); - if (it != ReplaceWithConstMap.end()) { - assert(!isa<Constant>(Operand) && - "Replacing constants with constants is invalid"); + Value *Operand = Instr->getOperand(OpNum); + auto it = ReplaceOperandsWithMap.find(Operand); + if (it != ReplaceOperandsWithMap.end()) { LLVM_DEBUG(dbgs() << "GVN replacing: " << *Operand << " with " << *it->second << " in instruction " << *Instr << '\n'); Instr->setOperand(OpNum, it->second); @@ -1801,27 +1921,12 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root, Value *Op0 = Cmp->getOperand(0), *Op1 = Cmp->getOperand(1); // If "A == B" is known true, or "A != B" is known false, then replace - // A with B everywhere in the scope. - if ((isKnownTrue && Cmp->getPredicate() == CmpInst::ICMP_EQ) || - (isKnownFalse && Cmp->getPredicate() == CmpInst::ICMP_NE)) + // A with B everywhere in the scope. For floating point operations, we + // have to be careful since equality does not always imply equivalance. + if ((isKnownTrue && impliesEquivalanceIfTrue(Cmp)) || + (isKnownFalse && impliesEquivalanceIfFalse(Cmp))) Worklist.push_back(std::make_pair(Op0, Op1)); - // Handle the floating point versions of equality comparisons too. - if ((isKnownTrue && Cmp->getPredicate() == CmpInst::FCMP_OEQ) || - (isKnownFalse && Cmp->getPredicate() == CmpInst::FCMP_UNE)) { - - // Floating point -0.0 and 0.0 compare equal, so we can only - // propagate values if we know that we have a constant and that - // its value is non-zero. - - // FIXME: We should do this optimization if 'no signed zeros' is - // applicable via an instruction-level fast-math-flag or some other - // indicator that relaxed FP semantics are being used. - - if (isa<ConstantFP>(Op1) && !cast<ConstantFP>(Op1)->isZero()) - Worklist.push_back(std::make_pair(Op0, Op1)); - } - // If "A >= B" is known true, replace "A < B" with false everywhere. 
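// The equality-vs-equivalence distinction enforced by impliesEquivalanceIfTrue
// / impliesEquivalanceIfFalse above comes down to two IEEE-754 wrinkles:
//   * 0.0 == -0.0 is true, yet 1.0/0.0 is +inf while 1.0/-0.0 is -inf, so
//     substituting one zero for the other can change observable results;
//   * an unordered compare such as ueq reports "equal" whenever either operand
//     is NaN, which says nothing about the operands being interchangeable.
// That is why ueq/one only count when the no-NaNs flag is present, and why a
// value is only substituted when one side is a constant known not to be zero.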
CmpInst::Predicate NotPred = Cmp->getInversePredicate(); Constant *NotVal = ConstantInt::get(Cmp->getType(), isKnownFalse); @@ -2014,6 +2119,7 @@ bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT, MD = RunMD; ImplicitControlFlowTracking ImplicitCFT(DT); ICF = &ImplicitCFT; + this->LI = LI; VN.setMemDep(MD); ORE = RunORE; InvalidBlockRPONumbers = true; @@ -2075,13 +2181,13 @@ bool GVN::processBlock(BasicBlock *BB) { return false; // Clearing map before every BB because it can be used only for single BB. - ReplaceWithConstMap.clear(); + ReplaceOperandsWithMap.clear(); bool ChangedFunction = false; for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { - if (!ReplaceWithConstMap.empty()) - ChangedFunction |= replaceOperandsWithConsts(&*BI); + if (!ReplaceOperandsWithMap.empty()) + ChangedFunction |= replaceOperandsForInBlockEquality(&*BI); ChangedFunction |= processInstruction(&*BI); if (InstrsToErase.empty()) { @@ -2373,7 +2479,7 @@ bool GVN::performPRE(Function &F) { /// the block inserted to the critical edge. BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) { BasicBlock *BB = - SplitCriticalEdge(Pred, Succ, CriticalEdgeSplittingOptions(DT)); + SplitCriticalEdge(Pred, Succ, CriticalEdgeSplittingOptions(DT, LI)); if (MD) MD->invalidateCachedPredecessors(); InvalidBlockRPONumbers = true; @@ -2388,7 +2494,7 @@ bool GVN::splitCriticalEdges() { do { std::pair<Instruction *, unsigned> Edge = toSplit.pop_back_val(); SplitCriticalEdge(Edge.first, Edge.second, - CriticalEdgeSplittingOptions(DT)); + CriticalEdgeSplittingOptions(DT, LI)); } while (!toSplit.empty()); if (MD) MD->invalidateCachedPredecessors(); InvalidBlockRPONumbers = true; @@ -2494,18 +2600,26 @@ void GVN::addDeadBlock(BasicBlock *BB) { if (DeadBlocks.count(B)) continue; + // First, split the critical edges. This might also create additional blocks + // to preserve LoopSimplify form and adjust edges accordingly. SmallVector<BasicBlock *, 4> Preds(pred_begin(B), pred_end(B)); for (BasicBlock *P : Preds) { if (!DeadBlocks.count(P)) continue; - if (isCriticalEdge(P->getTerminator(), GetSuccessorNumber(P, B))) { + if (llvm::any_of(successors(P), + [B](BasicBlock *Succ) { return Succ == B; }) && + isCriticalEdge(P->getTerminator(), B)) { if (BasicBlock *S = splitCriticalEdges(P, B)) DeadBlocks.insert(P = S); } + } - for (BasicBlock::iterator II = B->begin(); isa<PHINode>(II); ++II) { - PHINode &Phi = cast<PHINode>(*II); + // Now undef the incoming values from the dead predecessors. + for (BasicBlock *P : predecessors(B)) { + if (!DeadBlocks.count(P)) + continue; + for (PHINode &Phi : B->phis()) { Phi.setIncomingValueForBlock(P, UndefValue::get(Phi.getType())); if (MD) MD->invalidateCachedPointerInfo(&Phi); @@ -2582,10 +2696,11 @@ public: return Impl.runImpl( F, getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F), getAnalysis<DominatorTreeWrapperPass>().getDomTree(), - getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(), + getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F), getAnalysis<AAResultsWrapperPass>().getAAResults(), - NoMemDepAnalysis ? nullptr - : &getAnalysis<MemoryDependenceWrapperPass>().getMemDep(), + NoMemDepAnalysis + ? nullptr + : &getAnalysis<MemoryDependenceWrapperPass>().getMemDep(), LIWP ? 
&LIWP->getLoopInfo() : nullptr, &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE()); } @@ -2594,6 +2709,7 @@ public: AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); if (!NoMemDepAnalysis) AU.addRequired<MemoryDependenceWrapperPass>(); AU.addRequired<AAResultsWrapperPass>(); @@ -2601,6 +2717,8 @@ public: AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); AU.addPreserved<TargetLibraryInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addPreservedID(LoopSimplifyID); AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNHoist.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNHoist.cpp index 7614599653c4..e1796f6bf05a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNHoist.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNHoist.cpp @@ -47,7 +47,6 @@ #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/PostDominators.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Argument.h" #include "llvm/IR/BasicBlock.h" @@ -65,6 +64,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -72,6 +72,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <cassert> #include <iterator> @@ -257,7 +258,7 @@ public: GVNHoist(DominatorTree *DT, PostDominatorTree *PDT, AliasAnalysis *AA, MemoryDependenceResults *MD, MemorySSA *MSSA) : DT(DT), PDT(PDT), AA(AA), MD(MD), MSSA(MSSA), - MSSAUpdater(llvm::make_unique<MemorySSAUpdater>(MSSA)) {} + MSSAUpdater(std::make_unique<MemorySSAUpdater>(MSSA)) {} bool run(Function &F) { NumFuncArgs = F.arg_size(); @@ -539,7 +540,7 @@ private: // Check for unsafe hoistings due to side effects. 
if (K == InsKind::Store) { - if (hasEHOrLoadsOnPath(NewPt, dyn_cast<MemoryDef>(U), NBBsOnAllPaths)) + if (hasEHOrLoadsOnPath(NewPt, cast<MemoryDef>(U), NBBsOnAllPaths)) return false; } else if (hasEHOnPath(NewBB, OldBB, NBBsOnAllPaths)) return false; @@ -889,19 +890,18 @@ private: void updateAlignment(Instruction *I, Instruction *Repl) { if (auto *ReplacementLoad = dyn_cast<LoadInst>(Repl)) { - ReplacementLoad->setAlignment( - std::min(ReplacementLoad->getAlignment(), - cast<LoadInst>(I)->getAlignment())); + ReplacementLoad->setAlignment(MaybeAlign(std::min( + ReplacementLoad->getAlignment(), cast<LoadInst>(I)->getAlignment()))); ++NumLoadsRemoved; } else if (auto *ReplacementStore = dyn_cast<StoreInst>(Repl)) { ReplacementStore->setAlignment( - std::min(ReplacementStore->getAlignment(), - cast<StoreInst>(I)->getAlignment())); + MaybeAlign(std::min(ReplacementStore->getAlignment(), + cast<StoreInst>(I)->getAlignment()))); ++NumStoresRemoved; } else if (auto *ReplacementAlloca = dyn_cast<AllocaInst>(Repl)) { ReplacementAlloca->setAlignment( - std::max(ReplacementAlloca->getAlignment(), - cast<AllocaInst>(I)->getAlignment())); + MaybeAlign(std::max(ReplacementAlloca->getAlignment(), + cast<AllocaInst>(I)->getAlignment()))); } else if (isa<CallInst>(Repl)) { ++NumCallsRemoved; } @@ -957,7 +957,8 @@ private: if (MoveAccess && NewMemAcc) { // The definition of this ld/st will not change: ld/st hoisting is // legal when the ld/st is not moved past its current definition. - MSSAUpdater->moveToPlace(NewMemAcc, DestBB, MemorySSA::End); + MSSAUpdater->moveToPlace(NewMemAcc, DestBB, + MemorySSA::BeforeTerminator); } // Replace all other instructions with Repl with memory access NewMemAcc. @@ -1068,6 +1069,9 @@ private: ++NI; } + if (MSSA && VerifyMemorySSA) + MSSA->verifyMemorySSA(); + NumHoisted += NL + NS + NC + NI; NumRemoved += NR; NumLoadsHoisted += NL; @@ -1169,6 +1173,7 @@ public: AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<MemorySSAWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); } }; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNSink.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNSink.cpp index 054025755c69..6d0a4975e266 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNSink.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNSink.cpp @@ -47,7 +47,6 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" @@ -59,6 +58,7 @@ #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/ArrayRecycler.h" @@ -71,6 +71,7 @@ #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Scalar/GVNExpression.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <cassert> #include <cstddef> diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GuardWidening.cpp index e14f44bb7069..a3eba27a4d90 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GuardWidening.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GuardWidening.cpp @@ -39,7 +39,6 @@ 
//===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/GuardWidening.h" -#include <functional> #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/Statistic.h" @@ -53,11 +52,15 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/KnownBits.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/GuardUtils.h" #include "llvm/Transforms/Utils/LoopUtils.h" +#include <functional> using namespace llvm; @@ -66,22 +69,6 @@ using namespace llvm; STATISTIC(GuardsEliminated, "Number of eliminated guards"); STATISTIC(CondBranchEliminated, "Number of eliminated conditional branches"); -static cl::opt<bool> WidenFrequentBranches( - "guard-widening-widen-frequent-branches", cl::Hidden, - cl::desc("Widen conditions of explicit branches into dominating guards in " - "case if their taken frequency exceeds threshold set by " - "guard-widening-frequent-branch-threshold option"), - cl::init(false)); - -static cl::opt<unsigned> FrequentBranchThreshold( - "guard-widening-frequent-branch-threshold", cl::Hidden, - cl::desc("When WidenFrequentBranches is set to true, this option is used " - "to determine which branches are frequently taken. The criteria " - "that a branch is taken more often than " - "((FrequentBranchThreshold - 1) / FrequentBranchThreshold), then " - "it is considered frequently taken"), - cl::init(1000)); - static cl::opt<bool> WidenBranchGuards("guard-widening-widen-branch-guards", cl::Hidden, cl::desc("Whether or not we should widen guards " @@ -97,15 +84,16 @@ static Value *getCondition(Instruction *I) { "Bad guard intrinsic?"); return GI->getArgOperand(0); } - if (isGuardAsWidenableBranch(I)) { - auto *Cond = cast<BranchInst>(I)->getCondition(); - return cast<BinaryOperator>(Cond)->getOperand(0); - } + Value *Cond, *WC; + BasicBlock *IfTrueBB, *IfFalseBB; + if (parseWidenableBranch(I, Cond, WC, IfTrueBB, IfFalseBB)) + return Cond; + return cast<BranchInst>(I)->getCondition(); } // Set the condition for \p I to \p NewCond. \p I can either be a guard or a -// conditional branch. +// conditional branch. static void setCondition(Instruction *I, Value *NewCond) { if (IntrinsicInst *GI = dyn_cast<IntrinsicInst>(I)) { assert(GI->getIntrinsicID() == Intrinsic::experimental_guard && @@ -126,7 +114,6 @@ class GuardWideningImpl { DominatorTree &DT; PostDominatorTree *PDT; LoopInfo &LI; - BranchProbabilityInfo *BPI; /// Together, these describe the region of interest. This might be all of /// the blocks within a function, or only a given loop's blocks and preheader. 
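For context on the getCondition() hunk above: parseWidenableBranch() comes from llvm/Transforms/Utils/GuardUtils.h (newly included by this patch) and centralizes the hand-written matching it replaces, keeping getCondition() and widenGuard() in agreement about what counts as a widenable branch. A rough sketch of the common shape it recognizes, in the PatternMatch style used elsewhere in this file (the helper name is illustrative, not the actual implementation, which also handles commuted and degenerate forms):

  // Sketch only: a guard expressed as a branch on (%cond & %wc), where %wc
  // is produced by @llvm.experimental.widenable.condition().
  static bool looksLikeWidenableBranch(Instruction *I) {
    using namespace PatternMatch;
    Value *Cond = nullptr, *WC = nullptr;
    BasicBlock *IfTrueBB = nullptr, *IfFalseBB = nullptr;
    return match(I, m_Br(m_And(m_Value(Cond), m_Value(WC)), IfTrueBB,
                         IfFalseBB)) &&
           match(WC, m_Intrinsic<Intrinsic::experimental_widenable_condition>());
  }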
@@ -271,26 +258,22 @@ class GuardWideningImpl { void widenGuard(Instruction *ToWiden, Value *NewCondition, bool InvertCondition) { Value *Result; + widenCondCommon(getCondition(ToWiden), NewCondition, ToWiden, Result, InvertCondition); - Value *WidenableCondition = nullptr; if (isGuardAsWidenableBranch(ToWiden)) { - auto *Cond = cast<BranchInst>(ToWiden)->getCondition(); - WidenableCondition = cast<BinaryOperator>(Cond)->getOperand(1); + setWidenableBranchCond(cast<BranchInst>(ToWiden), Result); + return; } - if (WidenableCondition) - Result = BinaryOperator::CreateAnd(Result, WidenableCondition, - "guard.chk", ToWiden); setCondition(ToWiden, Result); } public: explicit GuardWideningImpl(DominatorTree &DT, PostDominatorTree *PDT, - LoopInfo &LI, BranchProbabilityInfo *BPI, - DomTreeNode *Root, + LoopInfo &LI, DomTreeNode *Root, std::function<bool(BasicBlock*)> BlockFilter) - : DT(DT), PDT(PDT), LI(LI), BPI(BPI), Root(Root), BlockFilter(BlockFilter) + : DT(DT), PDT(PDT), LI(LI), Root(Root), BlockFilter(BlockFilter) {} /// The entry point for this pass. @@ -309,13 +292,6 @@ static bool isSupportedGuardInstruction(const Instruction *Insn) { bool GuardWideningImpl::run() { DenseMap<BasicBlock *, SmallVector<Instruction *, 8>> GuardsInBlock; bool Changed = false; - Optional<BranchProbability> LikelyTaken = None; - if (WidenFrequentBranches && BPI) { - unsigned Threshold = FrequentBranchThreshold; - assert(Threshold > 0 && "Zero threshold makes no sense!"); - LikelyTaken = BranchProbability(Threshold - 1, Threshold); - } - for (auto DFI = df_begin(Root), DFE = df_end(Root); DFI != DFE; ++DFI) { auto *BB = (*DFI)->getBlock(); @@ -330,17 +306,6 @@ bool GuardWideningImpl::run() { for (auto *II : CurrentList) Changed |= eliminateInstrViaWidening(II, DFI, GuardsInBlock); - if (WidenFrequentBranches && BPI) - if (auto *BI = dyn_cast<BranchInst>(BB->getTerminator())) - if (BI->isConditional()) { - // If one of branches of a conditional is likely taken, try to - // eliminate it. 
- if (BPI->getEdgeProbability(BB, 0U) >= *LikelyTaken) - Changed |= eliminateInstrViaWidening(BI, DFI, GuardsInBlock); - else if (BPI->getEdgeProbability(BB, 1U) >= *LikelyTaken) - Changed |= eliminateInstrViaWidening(BI, DFI, GuardsInBlock, - /*InvertCondition*/true); - } } assert(EliminatedGuardsAndBranches.empty() || Changed); @@ -591,7 +556,7 @@ bool GuardWideningImpl::widenCondCommon(Value *Cond0, Value *Cond1, else Result = RC.getCheckInst(); } - + assert(Result && "Failed to find result value"); Result->setName("wide.chk"); } return true; @@ -805,10 +770,7 @@ PreservedAnalyses GuardWideningPass::run(Function &F, auto &DT = AM.getResult<DominatorTreeAnalysis>(F); auto &LI = AM.getResult<LoopAnalysis>(F); auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F); - BranchProbabilityInfo *BPI = nullptr; - if (WidenFrequentBranches) - BPI = AM.getCachedResult<BranchProbabilityAnalysis>(F); - if (!GuardWideningImpl(DT, &PDT, LI, BPI, DT.getRootNode(), + if (!GuardWideningImpl(DT, &PDT, LI, DT.getRootNode(), [](BasicBlock*) { return true; } ).run()) return PreservedAnalyses::all(); @@ -820,22 +782,13 @@ PreservedAnalyses GuardWideningPass::run(Function &F, PreservedAnalyses GuardWideningPass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U) { - - const auto &FAM = - AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager(); - Function &F = *L.getHeader()->getParent(); - BranchProbabilityInfo *BPI = nullptr; - if (WidenFrequentBranches) - BPI = FAM.getCachedResult<BranchProbabilityAnalysis>(F); - BasicBlock *RootBB = L.getLoopPredecessor(); if (!RootBB) RootBB = L.getHeader(); auto BlockFilter = [&](BasicBlock *BB) { return BB == RootBB || L.contains(BB); }; - if (!GuardWideningImpl(AR.DT, nullptr, AR.LI, BPI, - AR.DT.getNode(RootBB), + if (!GuardWideningImpl(AR.DT, nullptr, AR.LI, AR.DT.getNode(RootBB), BlockFilter).run()) return PreservedAnalyses::all(); @@ -856,10 +809,7 @@ struct GuardWideningLegacyPass : public FunctionPass { auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(); - BranchProbabilityInfo *BPI = nullptr; - if (WidenFrequentBranches) - BPI = &getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI(); - return GuardWideningImpl(DT, &PDT, LI, BPI, DT.getRootNode(), + return GuardWideningImpl(DT, &PDT, LI, DT.getRootNode(), [](BasicBlock*) { return true; } ).run(); } @@ -868,8 +818,6 @@ struct GuardWideningLegacyPass : public FunctionPass { AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<PostDominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); - if (WidenFrequentBranches) - AU.addRequired<BranchProbabilityInfoWrapperPass>(); } }; @@ -895,16 +843,11 @@ struct LoopGuardWideningLegacyPass : public LoopPass { auto BlockFilter = [&](BasicBlock *BB) { return BB == RootBB || L->contains(BB); }; - BranchProbabilityInfo *BPI = nullptr; - if (WidenFrequentBranches) - BPI = &getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI(); - return GuardWideningImpl(DT, PDT, LI, BPI, + return GuardWideningImpl(DT, PDT, LI, DT.getNode(RootBB), BlockFilter).run(); } void getAnalysisUsage(AnalysisUsage &AU) const override { - if (WidenFrequentBranches) - AU.addRequired<BranchProbabilityInfoWrapperPass>(); AU.setPreservesCFG(); getLoopAnalysisUsage(AU); AU.addPreserved<PostDominatorTreeWrapperPass>(); @@ -920,8 +863,6 @@ INITIALIZE_PASS_BEGIN(GuardWideningLegacyPass, "guard-widening", 
"Widen guards", INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -if (WidenFrequentBranches) - INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass) INITIALIZE_PASS_END(GuardWideningLegacyPass, "guard-widening", "Widen guards", false, false) @@ -931,8 +872,6 @@ INITIALIZE_PASS_BEGIN(LoopGuardWideningLegacyPass, "loop-guard-widening", INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -if (WidenFrequentBranches) - INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass) INITIALIZE_PASS_END(LoopGuardWideningLegacyPass, "loop-guard-widening", "Widen guards (within a single loop, as a loop pass)", false, false) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index f9fc698a4a9b..d8d7acae5c9f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -31,8 +31,8 @@ #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/iterator_range.h" @@ -44,7 +44,6 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/ConstantRange.h" @@ -68,6 +67,7 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -79,6 +79,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" #include <cassert> @@ -124,6 +125,10 @@ static cl::opt<bool> DisableLFTR("disable-lftr", cl::Hidden, cl::init(false), cl::desc("Disable Linear Function Test Replace optimization")); +static cl::opt<bool> +LoopPredication("indvars-predicate-loops", cl::Hidden, cl::init(true), + cl::desc("Predicate conditions in read only loops")); + namespace { struct RewritePhi; @@ -144,7 +149,11 @@ class IndVarSimplify { bool rewriteNonIntegerIVs(Loop *L); bool simplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LoopInfo *LI); - bool optimizeLoopExits(Loop *L); + /// Try to eliminate loop exits based on analyzeable exit counts + bool optimizeLoopExits(Loop *L, SCEVExpander &Rewriter); + /// Try to form loop invariant tests for loop exits by changing how many + /// iterations of the loop run when that is unobservable. + bool predicateLoopExits(Loop *L, SCEVExpander &Rewriter); bool canLoopBeDeleted(Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet); bool rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter); @@ -628,12 +637,30 @@ bool IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { // Okay, this instruction has a user outside of the current loop // and varies predictably *inside* the loop. 
Evaluate the value it - contains when the loop exits, if possible. + // contains when the loop exits, if possible. We prefer to start with + // expressions which are true for all exits (so as to maximize + // expression reuse by the SCEVExpander), but resort to per-exit + // evaluation if that fails. const SCEV *ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop()); - if (!SE->isLoopInvariant(ExitValue, L) || - !isSafeToExpand(ExitValue, *SE)) - continue; - + if (isa<SCEVCouldNotCompute>(ExitValue) || + !SE->isLoopInvariant(ExitValue, L) || + !isSafeToExpand(ExitValue, *SE)) { + // TODO: This should probably be sunk into SCEV in some way; maybe a + // getSCEVForExit(SCEV*, L, ExitingBB)? It can be generalized for + // most SCEV expressions and other recurrence types (e.g. shift + // recurrences). Is there existing code we can reuse? + const SCEV *ExitCount = SE->getExitCount(L, PN->getIncomingBlock(i)); + if (isa<SCEVCouldNotCompute>(ExitCount)) + continue; + if (auto *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Inst))) + if (AddRec->getLoop() == L) + ExitValue = AddRec->evaluateAtIteration(ExitCount, *SE); + if (isa<SCEVCouldNotCompute>(ExitValue) || + !SE->isLoopInvariant(ExitValue, L) || + !isSafeToExpand(ExitValue, *SE)) + continue; + } + // Computing the value outside of the loop brings no benefit if it is // definitely used inside the loop in a way which can not be optimized // away. Avoid doing so unless we know we have a value which computes @@ -804,7 +831,7 @@ bool IndVarSimplify::canLoopBeDeleted( L->getExitingBlocks(ExitingBlocks); SmallVector<BasicBlock *, 8> ExitBlocks; L->getUniqueExitBlocks(ExitBlocks); - if (ExitBlocks.size() > 1 || ExitingBlocks.size() > 1) + if (ExitBlocks.size() != 1 || ExitingBlocks.size() != 1) return false; BasicBlock *ExitBlock = ExitBlocks[0]; @@ -1654,6 +1681,10 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { return nullptr; } + // If we reached this point, then we are going to replace + // DU.NarrowUse with WideUse, so reattach DbgValue uses now. + replaceAllDbgUsesWith(*DU.NarrowUse, *WideUse, *WideUse, *DT); + ExtendKindMap[DU.NarrowUse] = WideAddRec.second; // Returning WideUse pushes it on the worklist. return WideUse; @@ -1779,14 +1810,9 @@ PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) { DeadInsts.emplace_back(DU.NarrowDef); } - // Attach any debug information to the new PHI. Since OrigPhi and WidePHI - // evaluate the same recurrence, we can just copy the debug info over. - SmallVector<DbgValueInst *, 1> DbgValues; - llvm::findDbgValues(DbgValues, OrigPhi); - auto *MDPhi = MetadataAsValue::get(WidePhi->getContext(), - ValueAsMetadata::get(WidePhi)); - for (auto &DbgValue : DbgValues) - DbgValue->setOperand(0, MDPhi); + // Attach any debug information to the new PHI.
+ replaceAllDbgUsesWith(*OrigPhi, *WidePhi, *WidePhi, *DT); + return WidePhi; } @@ -1817,8 +1843,8 @@ void WidenIV::calculatePostIncRange(Instruction *NarrowDef, auto CmpRHSRange = SE->getSignedRange(SE->getSCEV(CmpRHS)); auto CmpConstrainedLHSRange = ConstantRange::makeAllowedICmpRegion(P, CmpRHSRange); - auto NarrowDefRange = - CmpConstrainedLHSRange.addWithNoSignedWrap(*NarrowDefRHS); + auto NarrowDefRange = CmpConstrainedLHSRange.addWithNoWrap( + *NarrowDefRHS, OverflowingBinaryOperator::NoSignedWrap); updatePostIncRangeInfo(NarrowDef, NarrowUser, NarrowDefRange); }; @@ -2242,8 +2268,8 @@ static PHINode *FindLoopCounter(Loop *L, BasicBlock *ExitingBB, if (BECount->getType()->isPointerTy() && !Phi->getType()->isPointerTy()) continue; - const auto *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Phi)); - + const auto *AR = cast<SCEVAddRecExpr>(SE->getSCEV(Phi)); + // AR may be a pointer type, while BECount is an integer type. // AR may be wider than BECount. With eq/ne tests overflow is immaterial. // AR may not be a narrower type, or we may never exit. @@ -2624,74 +2650,125 @@ bool IndVarSimplify::sinkUnusedInvariants(Loop *L) { return MadeAnyChanges; } -bool IndVarSimplify::optimizeLoopExits(Loop *L) { +/// Return a symbolic upper bound for the backedge taken count of the loop. +/// This is more general than getConstantMaxBackedgeTakenCount as it returns +/// an arbitrary expression as opposed to only constants. +/// TODO: Move into the ScalarEvolution class. +static const SCEV* getMaxBackedgeTakenCount(ScalarEvolution &SE, + DominatorTree &DT, Loop *L) { SmallVector<BasicBlock*, 16> ExitingBlocks; L->getExitingBlocks(ExitingBlocks); // Form an expression for the maximum exit count possible for this loop. We // merge the max and exact information to approximate a version of - // getMaxBackedgeTakenInfo which isn't restricted to just constants. - // TODO: factor this out as a version of getMaxBackedgeTakenCount which - // isn't guaranteed to return a constant. + // getConstantMaxBackedgeTakenCount which isn't restricted to just constants. SmallVector<const SCEV*, 4> ExitCounts; - const SCEV *MaxConstEC = SE->getMaxBackedgeTakenCount(L); - if (!isa<SCEVCouldNotCompute>(MaxConstEC)) - ExitCounts.push_back(MaxConstEC); for (BasicBlock *ExitingBB : ExitingBlocks) { - const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); + const SCEV *ExitCount = SE.getExitCount(L, ExitingBB); + if (isa<SCEVCouldNotCompute>(ExitCount)) + ExitCount = SE.getExitCount(L, ExitingBB, + ScalarEvolution::ConstantMaximum); if (!isa<SCEVCouldNotCompute>(ExitCount)) { - assert(DT->dominates(ExitingBB, L->getLoopLatch()) && + assert(DT.dominates(ExitingBB, L->getLoopLatch()) && "We should only have known counts for exiting blocks that " "dominate latch!"); ExitCounts.push_back(ExitCount); } } if (ExitCounts.empty()) - return false; - const SCEV *MaxExitCount = SE->getUMinFromMismatchedTypes(ExitCounts); + return SE.getCouldNotCompute(); + return SE.getUMinFromMismatchedTypes(ExitCounts); +} - bool Changed = false; - for (BasicBlock *ExitingBB : ExitingBlocks) { +bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) { + SmallVector<BasicBlock*, 16> ExitingBlocks; + L->getExitingBlocks(ExitingBlocks); + + // Remove all exits which aren't both rewritable and analyzable. + auto NewEnd = llvm::remove_if(ExitingBlocks, + [&](BasicBlock *ExitingBB) { // If our exiting block exits multiple loops, we can only rewrite the // innermost one.
Otherwise, we're changing how many times the innermost // loop runs before it exits. if (LI->getLoopFor(ExitingBB) != L) - continue; + return true; // Can't rewrite non-branch yet. BranchInst *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator()); if (!BI) - continue; + return true; // If already constant, nothing to do. if (isa<Constant>(BI->getCondition())) - continue; + return true; const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); if (isa<SCEVCouldNotCompute>(ExitCount)) - continue; + return true; + return false; + }); + ExitingBlocks.erase(NewEnd, ExitingBlocks.end()); + + if (ExitingBlocks.empty()) + return false; + + // Get a symbolic upper bound on the loop backedge taken count. + const SCEV *MaxExitCount = getMaxBackedgeTakenCount(*SE, *DT, L); + if (isa<SCEVCouldNotCompute>(MaxExitCount)) + return false; + // Visit our exit blocks in order of dominance. We know from the fact that + // all exits (left) are analyzable that there must be a total dominance order + // between them as each must dominate the latch. The visit order only + // matters for the provably equal case. + llvm::sort(ExitingBlocks, + [&](BasicBlock *A, BasicBlock *B) { + // std::sort sorts in ascending order, so we want the inverse of + // the normal dominance relation. + if (DT->properlyDominates(A, B)) return true; + if (DT->properlyDominates(B, A)) return false; + llvm_unreachable("expected total dominance order!"); + }); +#ifndef NDEBUG + for (unsigned i = 1; i < ExitingBlocks.size(); i++) { + assert(DT->dominates(ExitingBlocks[i-1], ExitingBlocks[i])); + } +#endif + + auto FoldExit = [&](BasicBlock *ExitingBB, bool IsTaken) { + BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator()); + bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB)); + auto *OldCond = BI->getCondition(); + auto *NewCond = ConstantInt::get(OldCond->getType(), + IsTaken ? ExitIfTrue : !ExitIfTrue); + BI->setCondition(NewCond); + if (OldCond->use_empty()) + DeadInsts.push_back(OldCond); + }; + + bool Changed = false; + SmallSet<const SCEV*, 8> DominatingExitCounts; + for (BasicBlock *ExitingBB : ExitingBlocks) { + const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); + assert(!isa<SCEVCouldNotCompute>(ExitCount) && "checked above"); + // If we know we'd exit on the first iteration, rewrite the exit to // reflect this. This does not imply the loop must exit through this // exit; there may be an earlier one taken on the first iteration. // TODO: Given we know the backedge can't be taken, we should go ahead // and break it. Or at least, kill all the header phis and simplify. if (ExitCount->isZero()) { - bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB)); - auto *OldCond = BI->getCondition(); - auto *NewCond = ExitIfTrue ? ConstantInt::getTrue(OldCond->getType()) : - ConstantInt::getFalse(OldCond->getType()); - BI->setCondition(NewCond); - if (OldCond->use_empty()) - DeadInsts.push_back(OldCond); + FoldExit(ExitingBB, true); Changed = true; continue; } - // If we end up with a pointer exit count, bail. + // If we end up with a pointer exit count, bail. Note that we can end up + // with a pointer exit count for one exiting block, and not for another in + // the same loop.
if (!ExitCount->getType()->isIntegerTy() || !MaxExitCount->getType()->isIntegerTy()) - return false; + continue; Type *WiderType = SE->getWiderType(MaxExitCount->getType(), ExitCount->getType()); @@ -2700,32 +2777,203 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L) { assert(MaxExitCount->getType() == ExitCount->getType()); // Can we prove that some other exit must be taken strictly before this - // one? TODO: handle cases where ule is known, and equality is covered - // by a dominating exit + // one? if (SE->isLoopEntryGuardedByCond(L, CmpInst::ICMP_ULT, MaxExitCount, ExitCount)) { - bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB)); - auto *OldCond = BI->getCondition(); - auto *NewCond = ExitIfTrue ? ConstantInt::getFalse(OldCond->getType()) : - ConstantInt::getTrue(OldCond->getType()); - BI->setCondition(NewCond); - if (OldCond->use_empty()) - DeadInsts.push_back(OldCond); + FoldExit(ExitingBB, false); Changed = true; continue; } - // TODO: If we can prove that the exiting iteration is equal to the exit - // count for this exit and that no previous exit oppurtunities exist within - // the loop, then we can discharge all other exits. (May fall out of - // previous TODO.) - - // TODO: If we can't prove any relation between our exit count and the - // loops exit count, but taking this exit doesn't require actually running - // the loop (i.e. no side effects, no computed values used in exit), then - // we can replace the exit test with a loop invariant test which exits on - // the first iteration. + // As we run, keep track of which exit counts we've encountered. If we + // find a duplicate, we've found an exit which would have exited on the + // exiting iteration, but (from the visit order) strictly follows another + // which does the same and is thus dead. + if (!DominatingExitCounts.insert(ExitCount).second) { + FoldExit(ExitingBB, false); + Changed = true; + continue; + } + + // TODO: There might be another opportunity to leverage SCEV's reasoning + // here. If we kept track of the min of dominating exits so far, we could + // discharge exits with EC >= MDEC. This is less powerful than the existing + // transform (since later exits aren't considered), but potentially more + // powerful for any case where SCEV can prove a >=u b, but neither a == b + // nor a >u b. Such a case is not currently known. + } + return Changed; +} + +bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) { + SmallVector<BasicBlock*, 16> ExitingBlocks; + L->getExitingBlocks(ExitingBlocks); + + bool Changed = false; + + // Finally, see if we can rewrite our exit conditions into a loop invariant + // form. If we have a read-only loop, and we can tell that we must exit down + // a path which does not need any of the values computed within the loop, we + // can rewrite the loop to exit on the first iteration. Note that this + // doesn't either a) tell us the loop exits on the first iteration (unless + // *all* exits are predicateable) or b) tell us *which* exit might be taken. + // This transformation looks a lot like a restricted form of dead loop + // elimination, but restricted to read-only loops and without necessarily + // needing to kill the loop entirely. + if (!LoopPredication) + return Changed; + + if (!SE->hasLoopInvariantBackedgeTakenCount(L)) + return Changed; + + // Note: ExactBTC is the exact backedge taken count *iff* the loop exits + // through *explicit* control flow. We have to eliminate the possibility of + // implicit exits (see below) before we know it's truly exact.
+ const SCEV *ExactBTC = SE->getBackedgeTakenCount(L); + if (isa<SCEVCouldNotCompute>(ExactBTC) || + !SE->isLoopInvariant(ExactBTC, L) || + !isSafeToExpand(ExactBTC, *SE)) + return Changed; + + // If we end up with a pointer exit count, bail. It may be unsized. + if (!ExactBTC->getType()->isIntegerTy()) + return Changed; + + auto BadExit = [&](BasicBlock *ExitingBB) { + // If our exiting block exits multiple loops, we can only rewrite the + // innermost one. Otherwise, we're changing how many times the innermost + // loop runs before it exits. + if (LI->getLoopFor(ExitingBB) != L) + return true; + + // Can't rewrite non-branch yet. + BranchInst *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator()); + if (!BI) + return true; + + // If already constant, nothing to do. + if (isa<Constant>(BI->getCondition())) + return true; + + // If the exit block has phis, we need to be able to compute the values + // within the loop which contains them. This assumes trivial LCSSA phis + // have already been removed; TODO: generalize + BasicBlock *ExitBlock = + BI->getSuccessor(L->contains(BI->getSuccessor(0)) ? 1 : 0); + if (!ExitBlock->phis().empty()) + return true; + + const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); + assert(!isa<SCEVCouldNotCompute>(ExactBTC) && "implied by having exact trip count"); + if (!SE->isLoopInvariant(ExitCount, L) || + !isSafeToExpand(ExitCount, *SE)) + return true; + + // If we end up with a pointer exit count, bail. It may be unsized. + if (!ExitCount->getType()->isIntegerTy()) + return true; + + return false; + }; + + // If we have any exits which can't be predicated themselves, then we can't + // predicate any exit which isn't guaranteed to execute before it. Consider + // two exits (a) and (b) which would both exit on the same iteration. If we + // can predicate (b), but not (a), and (a) precedes (b) along some path, then + // we could convert a loop from exiting through (a) to one exiting through + // (b). Note that this problem exists only for exits with the same exit + // count, and we could be more aggressive when exit counts are known unequal. + llvm::sort(ExitingBlocks, + [&](BasicBlock *A, BasicBlock *B) { + // std::sort sorts in ascending order, so we want the inverse of + // the normal dominance relation, plus a tie breaker for blocks + // unordered by dominance. + if (DT->properlyDominates(A, B)) return true; + if (DT->properlyDominates(B, A)) return false; + return A->getName() < B->getName(); + }); + // Check to see if our exit blocks are a total order (i.e. a linear chain of + // exits before the backedge). If they aren't, reasoning about reachability + // is complicated and we choose not to for now. + for (unsigned i = 1; i < ExitingBlocks.size(); i++) + if (!DT->dominates(ExitingBlocks[i-1], ExitingBlocks[i])) + return Changed; + + // Given our sorted total order, we know that exit[j] must be evaluated + // after all exit[i] such that j > i. + for (unsigned i = 0, e = ExitingBlocks.size(); i < e; i++) + if (BadExit(ExitingBlocks[i])) { + ExitingBlocks.resize(i); + break; + } + + if (ExitingBlocks.empty()) + return Changed; + + // We rely on not being able to reach an exiting block on a later iteration + // than its statically computed exit count. The implementation of + // getExitCount currently has this invariant, but assert it here so that + // breakage is obvious if this ever changes.
+ assert(llvm::all_of(ExitingBlocks, [&](BasicBlock *ExitingBB) { + return DT->dominates(ExitingBB, L->getLoopLatch()); + })); + + // At this point, ExitingBlocks consists of only those blocks which are + // predicatable. Given that, we know we have at least one exit we can + // predicate if the loop doesn't have side effects and doesn't have any + // implicit exits (because then our exact BTC isn't actually exact). + // @Reviewers - As structured, this is O(I^2) for loop nests. Any + // suggestions on how to improve this? I can obviously bail out for outer + // loops, but that seems less than ideal. MemorySSA can find memory writes, + // is that enough for *all* side effects? + for (BasicBlock *BB : L->blocks()) + for (auto &I : *BB) + // TODO: isGuaranteedToTransfer + if (I.mayHaveSideEffects() || I.mayThrow()) + return Changed; + + // Finally, do the actual predication for all predicatable blocks. A couple + // of notes here: + // 1) We don't bother to constant fold dominated exits with identical exit + // counts; that's simply a form of CSE/equality propagation and we leave + // it for dedicated passes. + // 2) We insert the comparison at the branch. Hoisting introduces additional + // legality constraints and we leave that to dedicated logic. We want to + // predicate even if we can't insert a loop invariant expression as + // peeling or unrolling will likely reduce the cost of the otherwise loop + // varying check. + Rewriter.setInsertPoint(L->getLoopPreheader()->getTerminator()); + IRBuilder<> B(L->getLoopPreheader()->getTerminator()); + Value *ExactBTCV = nullptr; // Lazily generated if needed. + for (BasicBlock *ExitingBB : ExitingBlocks) { + const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); + + auto *BI = cast<BranchInst>(ExitingBB->getTerminator()); + Value *NewCond; + if (ExitCount == ExactBTC) { + NewCond = L->contains(BI->getSuccessor(0)) ? + B.getFalse() : B.getTrue(); + } else { + Value *ECV = Rewriter.expandCodeFor(ExitCount); + if (!ExactBTCV) + ExactBTCV = Rewriter.expandCodeFor(ExactBTC); + Value *RHS = ExactBTCV; + if (ECV->getType() != RHS->getType()) { + Type *WiderTy = SE->getWiderType(ECV->getType(), RHS->getType()); + ECV = B.CreateZExt(ECV, WiderTy); + RHS = B.CreateZExt(RHS, WiderTy); + } + auto Pred = L->contains(BI->getSuccessor(0)) ? + ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ; + NewCond = B.CreateICmp(Pred, ECV, RHS); + } + Value *OldCond = BI->getCondition(); + BI->setCondition(NewCond); + if (OldCond->use_empty()) + DeadInsts.push_back(OldCond); + Changed = true; } + return Changed; } @@ -2751,12 +2999,15 @@ bool IndVarSimplify::run(Loop *L) { if (!L->isLoopSimplifyForm()) return false; +#ifndef NDEBUG + // Used below for a consistency check only + const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L); +#endif + // If there are any floating-point recurrences, attempt to // transform them to use integer recurrences. Changed |= rewriteNonIntegerIVs(L); - const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L); - // Create a rewriter object which we'll use to transform the code with. SCEVExpander Rewriter(*SE, DL, "indvars"); #ifndef NDEBUG @@ -2772,20 +3023,30 @@ bool IndVarSimplify::run(Loop *L) { Rewriter.disableCanonicalMode(); Changed |= simplifyAndExtend(L, Rewriter, LI); - // Check to see if this loop has a computable loop-invariant execution count.
- // If so, this means that we can compute the final value of any expressions + // Check to see if we can compute the final value of any expressions // that are recurrent in the loop, and substitute the exit values from the - // loop into any instructions outside of the loop that use the final values of - // the current expressions. - // - if (ReplaceExitValue != NeverRepl && - !isa<SCEVCouldNotCompute>(BackedgeTakenCount)) + // loop into any instructions outside of the loop that use the final values + // of the current expressions. + if (ReplaceExitValue != NeverRepl) Changed |= rewriteLoopExitValues(L, Rewriter); // Eliminate redundant IV cycles. NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts); - Changed |= optimizeLoopExits(L); + // Try to eliminate loop exits based on analyzable exit counts + if (optimizeLoopExits(L, Rewriter)) { + Changed = true; + // Given we've changed exit counts, notify SCEV + SE->forgetLoop(L); + } + + // Try to form loop invariant tests for loop exits by changing how many + // iterations of the loop run when that is unobservable. + if (predicateLoopExits(L, Rewriter)) { + Changed = true; + // Given we've changed exit counts, notify SCEV + SE->forgetLoop(L); + } // If we have a trip count expression, rewrite the loop's exit condition // using it. @@ -2825,7 +3086,7 @@ bool IndVarSimplify::run(Loop *L) { // that our definition of "high cost" is not exactly principled. if (Rewriter.isHighCostExpansion(ExitCount, L)) continue; - + // Check preconditions for proper SCEVExpander operation. SCEV does not // express SCEVExpander's dependencies, such as LoopSimplify. Instead // any pass that uses the SCEVExpander must do it. This does not work @@ -2873,7 +3134,8 @@ bool IndVarSimplify::run(Loop *L) { "Indvars did not preserve LCSSA!"); // Verify that LFTR, and any other change have not interfered with SCEV's - // ability to compute trip count. + // ability to compute trip count. We may have *changed* the exit count, but + // only by reducing it. #ifndef NDEBUG if (VerifyIndvars && !isa<SCEVCouldNotCompute>(BackedgeTakenCount)) { SE->forgetLoop(L); @@ -2885,7 +3147,8 @@ bool IndVarSimplify::run(Loop *L) { else BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, NewBECount->getType()); - assert(BackedgeTakenCount == NewBECount && "indvars must preserve SCEV"); + assert(!SE->isKnownPredicate(ICmpInst::ICMP_ULT, BackedgeTakenCount, + NewBECount) && "indvars must preserve SCEV"); } #endif @@ -2924,7 +3187,7 @@ struct IndVarSimplifyLegacyPass : public LoopPass { auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); - auto *TLI = TLIP ? &TLIP->getTLI() : nullptr; + auto *TLI = TLIP ? &TLIP->getTLI(*L->getHeader()->getParent()) : nullptr; auto *TTIP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>(); auto *TTI = TTIP ?
&TTIP->getTTI(*L->getHeader()->getParent()) : nullptr; const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp index 997d68838152..58469749600e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp @@ -74,6 +74,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/BranchProbability.h" #include "llvm/Support/Casting.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp index 5f0e2001c73d..dfb1b6bfb739 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp @@ -141,9 +141,11 @@ using ValueToAddrSpaceMapTy = DenseMap<const Value *, unsigned>; /// InferAddressSpaces class InferAddressSpaces : public FunctionPass { + const TargetTransformInfo *TTI = nullptr; + /// Target specific address space which uses of should be replaced if /// possible. - unsigned FlatAddrSpace; + unsigned FlatAddrSpace = 0; public: static char ID; @@ -264,17 +266,6 @@ bool InferAddressSpaces::rewriteIntrinsicOperands(IntrinsicInst *II, Module *M = II->getParent()->getParent()->getParent(); switch (II->getIntrinsicID()) { - case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: - case Intrinsic::amdgcn_ds_fadd: - case Intrinsic::amdgcn_ds_fmin: - case Intrinsic::amdgcn_ds_fmax: { - const ConstantInt *IsVolatile = dyn_cast<ConstantInt>(II->getArgOperand(4)); - if (!IsVolatile || !IsVolatile->isZero()) - return false; - - LLVM_FALLTHROUGH; - } case Intrinsic::objectsize: { Type *DestTy = II->getType(); Type *SrcTy = NewV->getType(); @@ -285,25 +276,27 @@ bool InferAddressSpaces::rewriteIntrinsicOperands(IntrinsicInst *II, return true; } default: - return false; + return TTI->rewriteIntrinsicWithAddressSpace(II, OldV, NewV); } } -// TODO: Move logic to TTI? 
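The default case above now defers to TTI::rewriteIntrinsicWithAddressSpace, and the function that follows likewise queries TTI::collectFlatAddressOperands, so the AMDGPU-specific intrinsic list deleted here can live with the target instead. A hedged sketch of the collect side of that contract (the class name is a placeholder and the intrinsic choices are illustrative; the in-tree AMDGPU implementation may differ in detail):

  // Report which operands of an intrinsic are flat-address pointers that
  // InferAddressSpaces may rewrite; returning true means OpIndexes is
  // meaningful and the pass should traverse those operands.
  bool MyTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                             Intrinsic::ID IID) const {
    switch (IID) {
    case Intrinsic::amdgcn_atomic_inc:
    case Intrinsic::amdgcn_atomic_dec:
      OpIndexes.push_back(0); // the pointer operand
      return true;
    default:
      return false;
    }
  }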
void InferAddressSpaces::collectRewritableIntrinsicOperands( IntrinsicInst *II, std::vector<std::pair<Value *, bool>> &PostorderStack, DenseSet<Value *> &Visited) const { - switch (II->getIntrinsicID()) { + auto IID = II->getIntrinsicID(); + switch (IID) { case Intrinsic::objectsize: - case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: - case Intrinsic::amdgcn_ds_fadd: - case Intrinsic::amdgcn_ds_fmin: - case Intrinsic::amdgcn_ds_fmax: appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(0), PostorderStack, Visited); break; default: + SmallVector<int, 2> OpIndexes; + if (TTI->collectFlatAddressOperands(OpIndexes, IID)) { + for (int Idx : OpIndexes) { + appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(Idx), + PostorderStack, Visited); + } + } break; } } @@ -631,11 +624,10 @@ bool InferAddressSpaces::runOnFunction(Function &F) { if (skipFunction(F)) return false; - const TargetTransformInfo &TTI = - getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); if (FlatAddrSpace == UninitializedAddressSpace) { - FlatAddrSpace = TTI.getFlatAddressSpace(); + FlatAddrSpace = TTI->getFlatAddressSpace(); if (FlatAddrSpace == UninitializedAddressSpace) return false; } @@ -650,7 +642,7 @@ bool InferAddressSpaces::runOnFunction(Function &F) { // Changes the address spaces of the flat address expressions who are inferred // to point to a specific address space. - return rewriteWithNewAddressSpaces(TTI, Postorder, InferredAddrSpace, &F); + return rewriteWithNewAddressSpaces(*TTI, Postorder, InferredAddrSpace, &F); } // Constants need to be tracked through RAUW to handle cases with nested @@ -799,8 +791,8 @@ static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV, MDNode *NoAliasMD = MI->getMetadata(LLVMContext::MD_noalias); if (auto *MSI = dyn_cast<MemSetInst>(MI)) { - B.CreateMemSet(NewV, MSI->getValue(), - MSI->getLength(), MSI->getDestAlignment(), + B.CreateMemSet(NewV, MSI->getValue(), MSI->getLength(), + MaybeAlign(MSI->getDestAlignment()), false, // isVolatile TBAA, ScopeMD, NoAliasMD); } else if (auto *MTI = dyn_cast<MemTransferInst>(MI)) { @@ -816,15 +808,13 @@ static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV, if (isa<MemCpyInst>(MTI)) { MDNode *TBAAStruct = MTI->getMetadata(LLVMContext::MD_tbaa_struct); - B.CreateMemCpy(Dest, MTI->getDestAlignment(), - Src, MTI->getSourceAlignment(), + B.CreateMemCpy(Dest, MTI->getDestAlign(), Src, MTI->getSourceAlign(), MTI->getLength(), false, // isVolatile TBAA, TBAAStruct, ScopeMD, NoAliasMD); } else { assert(isa<MemMoveInst>(MTI)); - B.CreateMemMove(Dest, MTI->getDestAlignment(), - Src, MTI->getSourceAlignment(), + B.CreateMemMove(Dest, MTI->getDestAlign(), Src, MTI->getSourceAlign(), MTI->getLength(), false, // isVolatile TBAA, ScopeMD, NoAliasMD); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp index 6616364ab203..e8bbf2936da6 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp @@ -18,6 +18,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/Local.h" @@ -33,37 +34,39 @@ static bool runImpl(Function &F, const SimplifyQuery &SQ, bool Changed = false; do 
{ - for (BasicBlock *BB : depth_first(&F.getEntryBlock())) { - // Here be subtlety: the iterator must be incremented before the loop - // body (not sure why), so a range-for loop won't work here. - for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { - Instruction *I = &*BI++; - // The first time through the loop ToSimplify is empty and we try to - // simplify all instructions. On later iterations ToSimplify is not + for (BasicBlock &BB : F) { + // Unreachable code can take on strange forms that we are not prepared to + // handle. For example, an instruction may have itself as an operand. + if (!SQ.DT->isReachableFromEntry(&BB)) + continue; + + SmallVector<Instruction *, 8> DeadInstsInBB; + for (Instruction &I : BB) { + // The first time through the loop, ToSimplify is empty and we try to + // simplify all instructions. On later iterations, ToSimplify is not // empty and we only bother simplifying instructions that are in it. - if (!ToSimplify->empty() && !ToSimplify->count(I)) + if (!ToSimplify->empty() && !ToSimplify->count(&I)) continue; - // Don't waste time simplifying unused instructions. - if (!I->use_empty()) { - if (Value *V = SimplifyInstruction(I, SQ, ORE)) { + // Don't waste time simplifying dead/unused instructions. + if (isInstructionTriviallyDead(&I)) { + DeadInstsInBB.push_back(&I); + Changed = true; + } else if (!I.use_empty()) { + if (Value *V = SimplifyInstruction(&I, SQ, ORE)) { // Mark all uses for resimplification next time round the loop. - for (User *U : I->users()) + for (User *U : I.users()) Next->insert(cast<Instruction>(U)); - I->replaceAllUsesWith(V); + I.replaceAllUsesWith(V); ++NumSimplified; Changed = true; + // A call can get simplified, but it may not be trivially dead. + if (isInstructionTriviallyDead(&I)) + DeadInstsInBB.push_back(&I); } } - if (RecursivelyDeleteTriviallyDeadInstructions(I, SQ.TLI)) { - // RecursivelyDeleteTriviallyDeadInstruction can remove more than one - // instruction, so simply incrementing the iterator does not work. - // When instructions get deleted re-iterate instead. - BI = BB->begin(); - BE = BB->end(); - Changed = true; - } } + RecursivelyDeleteTriviallyDeadInstructions(DeadInstsInBB, SQ.TLI); } // Place the list of instructions to simplify on the next loop iteration @@ -90,7 +93,7 @@ struct InstSimplifyLegacyPass : public FunctionPass { AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); } - /// runOnFunction - Remove instructions that simplify. + /// Remove instructions that simplify. 
bool runOnFunction(Function &F) override { if (skipFunction(F)) return false; @@ -98,7 +101,7 @@ struct InstSimplifyLegacyPass : public FunctionPass { const DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); const TargetLibraryInfo *TLI = - &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); AssumptionCache *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); OptimizationRemarkEmitter *ORE = diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp index b86bf2fefbe5..98c2fcb3dae0 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -55,6 +55,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/BlockFrequency.h" #include "llvm/Support/BranchProbability.h" @@ -224,13 +225,21 @@ static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) { BasicBlock *PhiBB) -> std::pair<BasicBlock *, BasicBlock *> { auto *PredBB = IncomingBB; auto *SuccBB = PhiBB; + SmallPtrSet<BasicBlock *, 16> Visited; while (true) { BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator()); if (PredBr && PredBr->isConditional()) return {PredBB, SuccBB}; + Visited.insert(PredBB); auto *SinglePredBB = PredBB->getSinglePredecessor(); if (!SinglePredBB) return {nullptr, nullptr}; + + // Stop searching when SinglePredBB has been visited. It means we see + // an unreachable loop. + if (Visited.count(SinglePredBB)) + return {nullptr, nullptr}; + SuccBB = PredBB; PredBB = SinglePredBB; } @@ -253,7 +262,9 @@ static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) { return; BasicBlock *PredBB = PredOutEdge.first; - BranchInst *PredBr = cast<BranchInst>(PredBB->getTerminator()); + BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator()); + if (!PredBr) + return; uint64_t PredTrueWeight, PredFalseWeight; // FIXME: We currently only set the profile data when it is missing. @@ -286,7 +297,7 @@ static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) { bool JumpThreading::runOnFunction(Function &F) { if (skipFunction(F)) return false; - auto TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); // Get DT analysis before LVI. When LVI is initialized it conditionally adds // DT if it's available. 
auto DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); @@ -295,14 +306,13 @@ bool JumpThreading::runOnFunction(Function &F) { DomTreeUpdater DTU(*DT, DomTreeUpdater::UpdateStrategy::Lazy); std::unique_ptr<BlockFrequencyInfo> BFI; std::unique_ptr<BranchProbabilityInfo> BPI; - bool HasProfileData = F.hasProfileData(); - if (HasProfileData) { + if (F.hasProfileData()) { LoopInfo LI{DominatorTree(F)}; BPI.reset(new BranchProbabilityInfo(F, LI, TLI)); BFI.reset(new BlockFrequencyInfo(F, *BPI, LI)); } - bool Changed = Impl.runImpl(F, TLI, LVI, AA, &DTU, HasProfileData, + bool Changed = Impl.runImpl(F, TLI, LVI, AA, &DTU, F.hasProfileData(), std::move(BFI), std::move(BPI)); if (PrintLVIAfterJumpThreading) { dbgs() << "LVI for function '" << F.getName() << "':\n"; @@ -329,7 +339,7 @@ PreservedAnalyses JumpThreadingPass::run(Function &F, BFI.reset(new BlockFrequencyInfo(F, *BPI, LI)); } - bool Changed = runImpl(F, &TLI, &LVI, &AA, &DTU, HasProfileData, + bool Changed = runImpl(F, &TLI, &LVI, &AA, &DTU, F.hasProfileData(), std::move(BFI), std::move(BPI)); if (!Changed) @@ -992,49 +1002,8 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { // successor, merge the blocks. This encourages recursive jump threading // because now the condition in this block can be threaded through // predecessors of our predecessor block. - if (BasicBlock *SinglePred = BB->getSinglePredecessor()) { - const Instruction *TI = SinglePred->getTerminator(); - if (!TI->isExceptionalTerminator() && TI->getNumSuccessors() == 1 && - SinglePred != BB && !hasAddressTakenAndUsed(BB)) { - // If SinglePred was a loop header, BB becomes one. - if (LoopHeaders.erase(SinglePred)) - LoopHeaders.insert(BB); - - LVI->eraseBlock(SinglePred); - MergeBasicBlockIntoOnlyPred(BB, DTU); - - // Now that BB is merged into SinglePred (i.e. SinglePred Code followed by - // BB code within one basic block `BB`), we need to invalidate the LVI - // information associated with BB, because the LVI information need not be - // true for all of BB after the merge. For example, - // Before the merge, LVI info and code is as follows: - // SinglePred: <LVI info1 for %p val> - // %y = use of %p - // call @exit() // need not transfer execution to successor. - // assume(%p) // from this point on %p is true - // br label %BB - // BB: <LVI info2 for %p val, i.e. %p is true> - // %x = use of %p - // br label exit - // - // Note that this LVI info for blocks BB and SinglPred is correct for %p - // (info2 and info1 respectively). After the merge and the deletion of the - // LVI info1 for SinglePred. We have the following code: - // BB: <LVI info2 for %p val> - // %y = use of %p - // call @exit() - // assume(%p) - // %x = use of %p <-- LVI info2 is correct from here onwards. - // br label exit - // LVI info2 for BB is incorrect at the beginning of BB. - - // Invalidate LVI information for BB if the LVI is not provably true for - // all of BB. 
- if (!isGuaranteedToTransferExecutionToSuccessor(BB)) - LVI->eraseBlock(BB); - return true; - } - } + if (MaybeMergeBasicBlockIntoOnlyPred(BB)) + return true; if (TryToUnfoldSelectInCurrBB(BB)) return true; @@ -1461,7 +1430,7 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LoadI) { "Can't handle critical edge here!"); LoadInst *NewVal = new LoadInst( LoadI->getType(), LoadedPtr->DoPHITranslation(LoadBB, UnavailablePred), - LoadI->getName() + ".pr", false, LoadI->getAlignment(), + LoadI->getName() + ".pr", false, MaybeAlign(LoadI->getAlignment()), LoadI->getOrdering(), LoadI->getSyncScopeID(), UnavailablePred->getTerminator()); NewVal->setDebugLoc(LoadI->getDebugLoc()); @@ -1748,7 +1717,7 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, getSuccessor(GetBestDestForJumpOnUndef(BB)); // Ok, try to thread it! - return ThreadEdge(BB, PredsToFactor, MostPopularDest); + return TryThreadEdge(BB, PredsToFactor, MostPopularDest); } /// ProcessBranchOnPHI - We have an otherwise unthreadable conditional branch on @@ -1910,12 +1879,146 @@ static void AddPHINodeEntriesForMappedBlock(BasicBlock *PHIBB, } } -/// ThreadEdge - We have decided that it is safe and profitable to factor the -/// blocks in PredBBs to one predecessor, then thread an edge from it to SuccBB -/// across BB. Transform the IR to reflect this change. -bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, - const SmallVectorImpl<BasicBlock *> &PredBBs, - BasicBlock *SuccBB) { +/// Merge basic block BB into its sole predecessor if possible. +bool JumpThreadingPass::MaybeMergeBasicBlockIntoOnlyPred(BasicBlock *BB) { + BasicBlock *SinglePred = BB->getSinglePredecessor(); + if (!SinglePred) + return false; + + const Instruction *TI = SinglePred->getTerminator(); + if (TI->isExceptionalTerminator() || TI->getNumSuccessors() != 1 || + SinglePred == BB || hasAddressTakenAndUsed(BB)) + return false; + + // If SinglePred was a loop header, BB becomes one. + if (LoopHeaders.erase(SinglePred)) + LoopHeaders.insert(BB); + + LVI->eraseBlock(SinglePred); + MergeBasicBlockIntoOnlyPred(BB, DTU); + + // Now that BB is merged into SinglePred (i.e. SinglePred code followed by + // BB code within one basic block `BB`), we need to invalidate the LVI + // information associated with BB, because the LVI information need not be + // true for all of BB after the merge. For example, + // Before the merge, LVI info and code is as follows: + // SinglePred: <LVI info1 for %p val> + // %y = use of %p + // call @exit() // need not transfer execution to successor. + // assume(%p) // from this point on %p is true + // br label %BB + // BB: <LVI info2 for %p val, i.e. %p is true> + // %x = use of %p + // br label exit + // + // Note that this LVI info for blocks BB and SinglePred is correct for %p + // (info2 and info1 respectively). After the merge and the deletion of the + // LVI info1 for SinglePred, we have the following code: + // BB: <LVI info2 for %p val> + // %y = use of %p + // call @exit() + // assume(%p) + // %x = use of %p <-- LVI info2 is correct from here onwards. + // br label exit + // LVI info2 for BB is incorrect at the beginning of BB. + + // Invalidate LVI information for BB if the LVI is not provably true for + // all of BB. + if (!isGuaranteedToTransferExecutionToSuccessor(BB)) + LVI->eraseBlock(BB); + return true; +} + +/// Update the SSA form. NewBB contains instructions that are copied from BB. +/// ValueMapping maps old values in BB to new ones in NewBB.
+void JumpThreadingPass::UpdateSSA( + BasicBlock *BB, BasicBlock *NewBB, + DenseMap<Instruction *, Value *> &ValueMapping) { + // If there were values defined in BB that are used outside the block, then we + // now have to update all uses of the value to use either the original value, + // the cloned value, or some PHI derived value. This can require arbitrary + // PHI insertion, which we are prepared to do; clean these up now. + SSAUpdater SSAUpdate; + SmallVector<Use *, 16> UsesToRename; + + for (Instruction &I : *BB) { + // Scan all uses of this instruction to see if it is used outside of its + // block, and if so, record them in UsesToRename. + for (Use &U : I.uses()) { + Instruction *User = cast<Instruction>(U.getUser()); + if (PHINode *UserPN = dyn_cast<PHINode>(User)) { + if (UserPN->getIncomingBlock(U) == BB) + continue; + } else if (User->getParent() == BB) + continue; + + UsesToRename.push_back(&U); + } + + // If there are no uses outside the block, we're done with this instruction. + if (UsesToRename.empty()) + continue; + LLVM_DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n"); + + // We found a use of I outside of BB. Rename all uses of I that are outside + // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks + // with the two values we know. + SSAUpdate.Initialize(I.getType(), I.getName()); + SSAUpdate.AddAvailableValue(BB, &I); + SSAUpdate.AddAvailableValue(NewBB, ValueMapping[&I]); + + while (!UsesToRename.empty()) + SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); + LLVM_DEBUG(dbgs() << "\n"); + } +} + +/// Clone instructions in range [BI, BE) to NewBB. For PHI nodes, we only clone +/// arguments that come from PredBB. Return the map from the variables in the +/// source basic block to the variables in the newly created basic block. +DenseMap<Instruction *, Value *> +JumpThreadingPass::CloneInstructions(BasicBlock::iterator BI, + BasicBlock::iterator BE, BasicBlock *NewBB, + BasicBlock *PredBB) { + // We are going to have to map operands from the source basic block to the new + // copy of the block 'NewBB'. If there are PHI nodes in the source basic + // block, evaluate them to account for entry from PredBB. + DenseMap<Instruction *, Value *> ValueMapping; + + // Clone the phi nodes of the source basic block into NewBB. The resulting + // phi nodes are trivial since NewBB only has one predecessor, but SSAUpdater + // might need to rewrite the operand of the cloned phi. + for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI) { + PHINode *NewPN = PHINode::Create(PN->getType(), 1, PN->getName(), NewBB); + NewPN->addIncoming(PN->getIncomingValueForBlock(PredBB), PredBB); + ValueMapping[PN] = NewPN; + } + + // Clone the non-phi instructions of the source basic block into NewBB, + // keeping track of the mapping and using it to remap operands in the cloned + // instructions. + for (; BI != BE; ++BI) { + Instruction *New = BI->clone(); + New->setName(BI->getName()); + NewBB->getInstList().push_back(New); + ValueMapping[&*BI] = New; + + // Remap operands to patch up intra-block references. + for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i) + if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) { + DenseMap<Instruction *, Value *>::iterator I = ValueMapping.find(Inst); + if (I != ValueMapping.end()) + New->setOperand(i, I->second); + } + } + + return ValueMapping; +} + +/// TryThreadEdge - Thread an edge if it's safe and profitable to do so.
+bool JumpThreadingPass::TryThreadEdge( + BasicBlock *BB, const SmallVectorImpl<BasicBlock *> &PredBBs, + BasicBlock *SuccBB) { // If threading to the same block as we come from, we would infinite loop. if (SuccBB == BB) { LLVM_DEBUG(dbgs() << " Not threading across BB '" << BB->getName() @@ -1945,6 +2048,21 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, return false; } + ThreadEdge(BB, PredBBs, SuccBB); + return true; +} + +/// ThreadEdge - We have decided that it is safe and profitable to factor the +/// blocks in PredBBs to one predecessor, then thread an edge from it to SuccBB +/// across BB. Transform the IR to reflect this change. +void JumpThreadingPass::ThreadEdge(BasicBlock *BB, + const SmallVectorImpl<BasicBlock *> &PredBBs, + BasicBlock *SuccBB) { + assert(SuccBB != BB && "Don't create an infinite loop"); + + assert(!LoopHeaders.count(BB) && !LoopHeaders.count(SuccBB) && + "Don't thread across loop headers"); + // And finally, do it! Start by factoring the predecessors if needed. BasicBlock *PredBB; if (PredBBs.size() == 1) @@ -1958,7 +2076,6 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, // And finally, do it! LLVM_DEBUG(dbgs() << " Threading edge from '" << PredBB->getName() << "' to '" << SuccBB->getName() - << "' with cost: " << JumpThreadCost << ", across block:\n " << *BB << "\n"); if (DTU->hasPendingDomTreeUpdates()) @@ -1967,11 +2084,6 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, LVI->enableDT(); LVI->threadEdge(PredBB, BB, SuccBB); - // We are going to have to map operands from the original BB block to the new - // copy of the block 'NewBB'. If there are PHI nodes in BB, evaluate them to - // account for entry from PredBB. - DenseMap<Instruction*, Value*> ValueMapping; - BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), BB->getName()+".thread", BB->getParent(), BB); @@ -1984,32 +2096,9 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency()); } - BasicBlock::iterator BI = BB->begin(); - // Clone the phi nodes of BB into NewBB. The resulting phi nodes are trivial, - // since NewBB only has one predecessor, but SSAUpdater might need to rewrite - // the operand of the cloned phi. - for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI) { - PHINode *NewPN = PHINode::Create(PN->getType(), 1, PN->getName(), NewBB); - NewPN->addIncoming(PN->getIncomingValueForBlock(PredBB), PredBB); - ValueMapping[PN] = NewPN; - } - - // Clone the non-phi instructions of BB into NewBB, keeping track of the - // mapping and using it to remap operands in the cloned instructions. - for (; !BI->isTerminator(); ++BI) { - Instruction *New = BI->clone(); - New->setName(BI->getName()); - NewBB->getInstList().push_back(New); - ValueMapping[&*BI] = New; - - // Remap operands to patch up intra-block references. - for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i) - if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) { - DenseMap<Instruction*, Value*>::iterator I = ValueMapping.find(Inst); - if (I != ValueMapping.end()) - New->setOperand(i, I->second); - } - } + // Copy all the instructions from BB to NewBB except the terminator. + DenseMap<Instruction *, Value *> ValueMapping = + CloneInstructions(BB->begin(), std::prev(BB->end()), NewBB, PredBB); // We didn't copy the terminator from BB over to NewBB, because there is now // an unconditional jump to SuccBB. Insert the unconditional jump. 
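The UpdateSSA() helper shown earlier, which ThreadEdge now reaches after CloneInstructions(), is the standard SSAUpdater idiom. For readers who have not used that utility, a self-contained sketch of the same idiom (names invented for illustration; the real helper also special-cases PHI uses via their incoming block):

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/IR/Instructions.h"
  #include "llvm/Transforms/Utils/SSAUpdater.h"
  using namespace llvm;

  // After cloning OrigI (in BB) as CloneI (in NewBB), rewrite every use of
  // OrigI outside BB so it sees whichever definition reaches it; SSAUpdater
  // inserts PHI nodes where the two definitions merge.
  static void rewriteOutsideUses(Instruction *OrigI, Instruction *CloneI,
                                 BasicBlock *BB, BasicBlock *NewBB) {
    SSAUpdater Updater;
    Updater.Initialize(OrigI->getType(), OrigI->getName());
    Updater.AddAvailableValue(BB, OrigI);
    Updater.AddAvailableValue(NewBB, CloneI);

    SmallVector<Use *, 16> Uses;
    for (Use &U : OrigI->uses())
      if (cast<Instruction>(U.getUser())->getParent() != BB)
        Uses.push_back(&U);

    while (!Uses.empty())
      Updater.RewriteUse(*Uses.pop_back_val());
  }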
@@ -2035,44 +2124,7 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, {DominatorTree::Insert, PredBB, NewBB}, {DominatorTree::Delete, PredBB, BB}}); - // If there were values defined in BB that are used outside the block, then we - // now have to update all uses of the value to use either the original value, - // the cloned value, or some PHI derived value. This can require arbitrary - // PHI insertion, of which we are prepared to do, clean these up now. - SSAUpdater SSAUpdate; - SmallVector<Use*, 16> UsesToRename; - - for (Instruction &I : *BB) { - // Scan all uses of this instruction to see if their uses are no longer - // dominated by the previous def and if so, record them in UsesToRename. - // Also, skip phi operands from PredBB - we'll remove them anyway. - for (Use &U : I.uses()) { - Instruction *User = cast<Instruction>(U.getUser()); - if (PHINode *UserPN = dyn_cast<PHINode>(User)) { - if (UserPN->getIncomingBlock(U) == BB) - continue; - } else if (User->getParent() == BB) - continue; - - UsesToRename.push_back(&U); - } - - // If there are no uses outside the block, we're done with this instruction. - if (UsesToRename.empty()) - continue; - LLVM_DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n"); - - // We found a use of I outside of BB. Rename all uses of I that are outside - // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks - // with the two values we know. - SSAUpdate.Initialize(I.getType(), I.getName()); - SSAUpdate.AddAvailableValue(BB, &I); - SSAUpdate.AddAvailableValue(NewBB, ValueMapping[&I]); - - while (!UsesToRename.empty()) - SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); - LLVM_DEBUG(dbgs() << "\n"); - } + UpdateSSA(BB, NewBB, ValueMapping); // At this point, the IR is fully up to date and consistent. Do a quick scan // over the new instructions and zap any that are constants or dead. This @@ -2084,7 +2136,6 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, // Threaded an edge! ++NumThreads; - return true; } /// Create a new basic block that will be the predecessor of BB and successor of @@ -2356,43 +2407,7 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred( AddPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(1), BB, PredBB, ValueMapping); - // If there were values defined in BB that are used outside the block, then we - // now have to update all uses of the value to use either the original value, - // the cloned value, or some PHI derived value. This can require arbitrary - // PHI insertion, of which we are prepared to do, clean these up now. - SSAUpdater SSAUpdate; - SmallVector<Use*, 16> UsesToRename; - for (Instruction &I : *BB) { - // Scan all uses of this instruction to see if it is used outside of its - // block, and if so, record them in UsesToRename. - for (Use &U : I.uses()) { - Instruction *User = cast<Instruction>(U.getUser()); - if (PHINode *UserPN = dyn_cast<PHINode>(User)) { - if (UserPN->getIncomingBlock(U) == BB) - continue; - } else if (User->getParent() == BB) - continue; - - UsesToRename.push_back(&U); - } - - // If there are no uses outside the block, we're done with this instruction. - if (UsesToRename.empty()) - continue; - - LLVM_DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n"); - - // We found a use of I outside of BB. Rename all uses of I that are outside - // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks - // with the two values we know. 
- SSAUpdate.Initialize(I.getType(), I.getName()); - SSAUpdate.AddAvailableValue(BB, &I); - SSAUpdate.AddAvailableValue(PredBB, ValueMapping[&I]); - - while (!UsesToRename.empty()) - SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); - LLVM_DEBUG(dbgs() << "\n"); - } + UpdateSSA(BB, PredBB, ValueMapping); // PredBB no longer jumps to BB, remove entries in the PHI node for the edge // that we nuked. @@ -2423,7 +2438,7 @@ void JumpThreadingPass::UnfoldSelectInstr(BasicBlock *Pred, BasicBlock *BB, // |----- // v // BB - BranchInst *PredTerm = dyn_cast<BranchInst>(Pred->getTerminator()); + BranchInst *PredTerm = cast<BranchInst>(Pred->getTerminator()); BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "select.unfold", BB->getParent(), BB); // Move the unconditional branch to NewBB. diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp index d9dda4cef2d2..8c33045c2380 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp @@ -63,6 +63,7 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/PredIteratorCache.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -137,7 +138,8 @@ static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop, TargetTransformInfo *TTI, bool &FreeInLoop); static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, BasicBlock *Dest, ICFLoopSafetyInfo *SafetyInfo, - MemorySSAUpdater *MSSAU, OptimizationRemarkEmitter *ORE); + MemorySSAUpdater *MSSAU, ScalarEvolution *SE, + OptimizationRemarkEmitter *ORE); static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, const Loop *CurLoop, ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU, OptimizationRemarkEmitter *ORE); @@ -162,7 +164,7 @@ static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, static void moveInstructionBefore(Instruction &I, Instruction &Dest, ICFLoopSafetyInfo &SafetyInfo, - MemorySSAUpdater *MSSAU); + MemorySSAUpdater *MSSAU, ScalarEvolution *SE); namespace { struct LoopInvariantCodeMotion { @@ -220,7 +222,8 @@ struct LegacyLICMPass : public LoopPass { &getAnalysis<AAResultsWrapperPass>().getAAResults(), &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(), &getAnalysis<DominatorTreeWrapperPass>().getDomTree(), - &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(), + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI( + *L->getHeader()->getParent()), &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( *L->getHeader()->getParent()), SE ? &SE->getSE() : nullptr, MSSA, &ORE, false); @@ -294,7 +297,7 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM, PA.preserve<DominatorTreeAnalysis>(); PA.preserve<LoopAnalysis>(); - if (EnableMSSALoopDependency) + if (AR.MSSA) PA.preserve<MemorySSAAnalysis>(); return PA; @@ -330,6 +333,12 @@ bool LoopInvariantCodeMotion::runOnLoop( assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form."); + // If this loop has metadata indicating that LICM is not to be performed then + // just exit. 
+ if (hasDisableLICMTransformsHint(L)) { + return false; + } + std::unique_ptr<AliasSetTracker> CurAST; std::unique_ptr<MemorySSAUpdater> MSSAU; bool NoOfMemAccTooLarge = false; @@ -340,7 +349,7 @@ bool LoopInvariantCodeMotion::runOnLoop( CurAST = collectAliasInfoForLoop(L, LI, AA); } else { LLVM_DEBUG(dbgs() << "LICM: Using MemorySSA.\n"); - MSSAU = make_unique<MemorySSAUpdater>(MSSA); + MSSAU = std::make_unique<MemorySSAUpdater>(MSSA); unsigned AccessCapCount = 0; for (auto *BB : L->getBlocks()) { @@ -383,8 +392,9 @@ bool LoopInvariantCodeMotion::runOnLoop( CurAST.get(), MSSAU.get(), &SafetyInfo, Flags, ORE); Flags.IsSink = false; if (Preheader) - Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L, - CurAST.get(), MSSAU.get(), &SafetyInfo, Flags, ORE); + Changed |= + hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L, + CurAST.get(), MSSAU.get(), SE, &SafetyInfo, Flags, ORE); // Now that all loop invariants have been removed from the loop, promote any // memory references to scalars that we can. @@ -780,6 +790,41 @@ public: }; } // namespace + +/// Return true if we know how to rewrite all uses of the given alloca after +/// hoisting it out of the loop. The main concerns are a) potential captures +/// and b) invariant.start markers which don't capture, but are no longer +/// valid w/o a corresponding invariant.end. +static bool canRewriteUsesOfAlloca(AllocaInst &AI) { + // TODO: This looks a lot like capture tracking, but we need to remove any + // invariant starts if we extend the lifetime of the alloca by hoisting it. + // We should probably refactor capture tracking into a form which allows us + // to reuse the relevant bits and remove the duplicated logic here. + + SmallVector<Use *, 16> Worklist; + for (Use &U : AI.uses()) + Worklist.push_back(&U); + + unsigned NumUsesExplored = 0; + while (!Worklist.empty()) { + Use *U = Worklist.pop_back_val(); + Instruction *I = cast<Instruction>(U->getUser()); + NumUsesExplored++; + if (NumUsesExplored > DefaultMaxUsesToExplore) + return false; + // Non capturing, terminating uses + if (isa<LoadInst>(I) || + (isa<StoreInst>(I) && U->getOperandNo() == 1)) + continue; + // Non capturing, non-terminating + if (!isa<BitCastInst>(I) && !isa<GetElementPtrInst>(I)) + return false; + for (Use &U : I->uses()) + Worklist.push_back(&U); + } + return true; +} + /// Walk the specified region of the CFG (defined by all blocks dominated by /// the specified block, and that are in the current loop) in depth first /// order w.r.t the DominatorTree. This allows us to visit definitions before @@ -788,7 +833,7 @@ public: bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop, AliasSetTracker *CurAST, MemorySSAUpdater *MSSAU, - ICFLoopSafetyInfo *SafetyInfo, + ScalarEvolution *SE, ICFLoopSafetyInfo *SafetyInfo, SinkAndHoistLICMFlags &Flags, OptimizationRemarkEmitter *ORE) { // Verify inputs. 
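Threading ScalarEvolution through hoistRegion (and, in the following hunks, through hoist and moveInstructionBefore) lets LICM drop cached SCEVs for instructions it moves via SE->forgetValue. A minimal sketch of the kind of loop-invariant computation this path moves into the preheader; names are illustrative, not from the patch:

// 'k * k + 1.0' has no loop-carried operands, so LICM hoists it out of the
// loop; forgetting the moved instruction in ScalarEvolution keeps any cached
// expressions for it from going stale.
void scale_all(double *a, int n, double k) {
  for (int i = 0; i < n; ++i)
    a[i] += k * k + 1.0;   // invariant subexpression hoisted to the preheader
}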
@@ -848,7 +893,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, I, DT, CurLoop, SafetyInfo, ORE, CurLoop->getLoopPreheader()->getTerminator())) { hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, - MSSAU, ORE); + MSSAU, SE, ORE); HoistedInstructions.push_back(&I); Changed = true; continue; @@ -875,7 +920,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, eraseInstruction(I, *SafetyInfo, CurAST, MSSAU); hoist(*ReciprocalDivisor, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), - SafetyInfo, MSSAU, ORE); + SafetyInfo, MSSAU, SE, ORE); HoistedInstructions.push_back(ReciprocalDivisor); Changed = true; continue; @@ -894,7 +939,17 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, CurLoop->hasLoopInvariantOperands(&I) && MustExecuteWithoutWritesBefore(I)) { hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, - MSSAU, ORE); + MSSAU, SE, ORE); + HoistedInstructions.push_back(&I); + Changed = true; + continue; + } + + if (isa<AllocaInst>(&I) && + SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop) && + canRewriteUsesOfAlloca(cast<AllocaInst>(I))) { + hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, + MSSAU, SE, ORE); HoistedInstructions.push_back(&I); Changed = true; continue; @@ -908,7 +963,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, PN->setIncomingBlock( i, CFH.getOrCreateHoistedBlock(PN->getIncomingBlock(i))); hoist(*PN, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, - MSSAU, ORE); + MSSAU, SE, ORE); assert(DT->dominates(PN, BB) && "Conditional PHIs not expected"); Changed = true; continue; @@ -945,7 +1000,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, LLVM_DEBUG(dbgs() << "LICM rehoisting to " << HoistPoint->getParent()->getName() << ": " << *I << "\n"); - moveInstructionBefore(*I, *HoistPoint, *SafetyInfo, MSSAU); + moveInstructionBefore(*I, *HoistPoint, *SafetyInfo, MSSAU, SE); HoistPoint = I; Changed = true; } @@ -956,7 +1011,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, // Now that we've finished hoisting make sure that LI and DT are still // valid. -#ifndef NDEBUG +#ifdef EXPENSIVE_CHECKS if (Changed) { assert(DT->verify(DominatorTree::VerificationLevel::Fast) && "Dominator tree verification failed"); @@ -1026,7 +1081,8 @@ namespace { bool isHoistableAndSinkableInst(Instruction &I) { // Only these instructions are hoistable/sinkable. return (isa<LoadInst>(I) || isa<StoreInst>(I) || isa<CallInst>(I) || - isa<FenceInst>(I) || isa<BinaryOperator>(I) || isa<CastInst>(I) || + isa<FenceInst>(I) || isa<CastInst>(I) || + isa<UnaryOperator>(I) || isa<BinaryOperator>(I) || isa<SelectInst>(I) || isa<GetElementPtrInst>(I) || isa<CmpInst>(I) || isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || isa<ShuffleVectorInst>(I) || isa<ExtractValueInst>(I) || @@ -1092,7 +1148,7 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, // in the same alias set as something that ends up being modified. 
if (AA->pointsToConstantMemory(LI->getOperand(0))) return true; - if (LI->getMetadata(LLVMContext::MD_invariant_load)) + if (LI->hasMetadata(LLVMContext::MD_invariant_load)) return true; if (LI->isAtomic() && !TargetExecutesOncePerLoop) @@ -1134,6 +1190,10 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, // Assumes don't actually alias anything or throw return true; + if (match(CI, m_Intrinsic<Intrinsic::experimental_widenable_condition>())) + // Widenable conditions don't actually alias anything or throw + return true; + // Handle simple cases by querying alias analysis. FunctionModRefBehavior Behavior = AA->getModRefBehavior(CI); if (Behavior == FMRB_DoesNotAccessMemory) @@ -1240,12 +1300,22 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, // FIXME: More precise: no Uses that alias SI. if (!Flags->IsSink && !MSSA->dominates(SIMD, MU)) return false; - } else if (const auto *MD = dyn_cast<MemoryDef>(&MA)) + } else if (const auto *MD = dyn_cast<MemoryDef>(&MA)) { if (auto *LI = dyn_cast<LoadInst>(MD->getMemoryInst())) { (void)LI; // Silence warning. assert(!LI->isUnordered() && "Expected unordered load"); return false; } + // Any call, while it may not be clobbering SI, it may be a use. + if (auto *CI = dyn_cast<CallInst>(MD->getMemoryInst())) { + // Check if the call may read from the memory locattion written + // to by SI. Check CI's attributes and arguments; the number of + // such checks performed is limited above by NoOfMemAccTooLarge. + ModRefInfo MRI = AA->getModRefInfo(CI, MemoryLocation::get(SI)); + if (isModOrRefSet(MRI)) + return false; + } + } } auto *Source = MSSA->getSkipSelfWalker()->getClobberingMemoryAccess(SI); @@ -1375,8 +1445,7 @@ static Instruction *CloneInstructionInExitBlock( if (!I.getName().empty()) New->setName(I.getName() + ".le"); - MemoryAccess *OldMemAcc; - if (MSSAU && (OldMemAcc = MSSAU->getMemorySSA()->getMemoryAccess(&I))) { + if (MSSAU && MSSAU->getMemorySSA()->getMemoryAccess(&I)) { // Create a new MemoryAccess and let MemorySSA set its defining access. 
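The new MemoryDef case above is about stores: a call inside the loop may not clobber the stored location, yet it can still read it, so getModRefInfo is now consulted before the store is allowed to move. A small source-level illustration; the declarations and names here are hypothetical:

void log_value(const int *p);      // opaque callee: may read *p

void count_up(int *counter, int n) {
  for (int i = 0; i < n; ++i) {
    *counter = i;                  // store examined through its MemoryDef
    log_value(counter);            // call that may read the stored location,
  }                                // so the store must not be moved past it
}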
MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB( New, nullptr, New->getParent(), MemorySSA::Beginning); @@ -1385,7 +1454,7 @@ static Instruction *CloneInstructionInExitBlock( MSSAU->insertDef(MemDef, /*RenameUses=*/true); else { auto *MemUse = cast<MemoryUse>(NewMemAcc); - MSSAU->insertUse(MemUse); + MSSAU->insertUse(MemUse, /*RenameUses=*/true); } } } @@ -1424,14 +1493,18 @@ static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, static void moveInstructionBefore(Instruction &I, Instruction &Dest, ICFLoopSafetyInfo &SafetyInfo, - MemorySSAUpdater *MSSAU) { + MemorySSAUpdater *MSSAU, + ScalarEvolution *SE) { SafetyInfo.removeInstruction(&I); SafetyInfo.insertInstructionTo(&I, Dest.getParent()); I.moveBefore(&Dest); if (MSSAU) if (MemoryUseOrDef *OldMemAcc = cast_or_null<MemoryUseOrDef>( MSSAU->getMemorySSA()->getMemoryAccess(&I))) - MSSAU->moveToPlace(OldMemAcc, Dest.getParent(), MemorySSA::End); + MSSAU->moveToPlace(OldMemAcc, Dest.getParent(), + MemorySSA::BeforeTerminator); + if (SE) + SE->forgetValue(&I); } static Instruction *sinkThroughTriviallyReplaceablePHI( @@ -1645,7 +1718,8 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, /// static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, BasicBlock *Dest, ICFLoopSafetyInfo *SafetyInfo, - MemorySSAUpdater *MSSAU, OptimizationRemarkEmitter *ORE) { + MemorySSAUpdater *MSSAU, ScalarEvolution *SE, + OptimizationRemarkEmitter *ORE) { LLVM_DEBUG(dbgs() << "LICM hoisting to " << Dest->getName() << ": " << I << "\n"); ORE->emit([&]() { @@ -1666,10 +1740,10 @@ static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, if (isa<PHINode>(I)) // Move the new node to the end of the phi list in the destination block. - moveInstructionBefore(I, *Dest->getFirstNonPHI(), *SafetyInfo, MSSAU); + moveInstructionBefore(I, *Dest->getFirstNonPHI(), *SafetyInfo, MSSAU, SE); else // Move the new node to the destination block, before its terminator. - moveInstructionBefore(I, *Dest->getTerminator(), *SafetyInfo, MSSAU); + moveInstructionBefore(I, *Dest->getTerminator(), *SafetyInfo, MSSAU, SE); // Apply line 0 debug locations when we are moving instructions to different // basic blocks because we want to avoid jumpy line tables. @@ -1783,7 +1857,7 @@ public: StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos); if (UnorderedAtomic) NewSI->setOrdering(AtomicOrdering::Unordered); - NewSI->setAlignment(Alignment); + NewSI->setAlignment(MaybeAlign(Alignment)); NewSI->setDebugLoc(DL); if (AATags) NewSI->setAAMetadata(AATags); @@ -2016,7 +2090,8 @@ bool llvm::promoteLoopAccessesToScalars( if (!DereferenceableInPH) { DereferenceableInPH = isDereferenceableAndAlignedPointer( Store->getPointerOperand(), Store->getValueOperand()->getType(), - Store->getAlignment(), MDL, Preheader->getTerminator(), DT); + MaybeAlign(Store->getAlignment()), MDL, + Preheader->getTerminator(), DT); } } else return false; // Not a load or store. 
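The surrounding LoopPromoter and promoteLoopAccessesToScalars changes (MaybeAlign on the rewritten store and on the dereferenceability query) sit on the register-promotion path, which turns a loop-carried memory location into a register with one load before the loop and one store after it. A minimal sketch of the pattern, assuming the usual no-alias and dereferenceability conditions hold; names are illustrative:

// Every iteration loads and stores *sum; after promotion the loop keeps the
// running total in a register, with a single preheader load and a single
// store on loop exit.
void accumulate(int *__restrict sum, const int *a, int n) {
  for (int i = 0; i < n; ++i)
    *sum += a[i];
}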
@@ -2101,20 +2176,21 @@ bool llvm::promoteLoopAccessesToScalars( SomePtr->getName() + ".promoted", Preheader->getTerminator()); if (SawUnorderedAtomic) PreheaderLoad->setOrdering(AtomicOrdering::Unordered); - PreheaderLoad->setAlignment(Alignment); + PreheaderLoad->setAlignment(MaybeAlign(Alignment)); PreheaderLoad->setDebugLoc(DL); if (AATags) PreheaderLoad->setAAMetadata(AATags); SSA.AddAvailableValue(Preheader, PreheaderLoad); - MemoryAccess *PreheaderLoadMemoryAccess; if (MSSAU) { - PreheaderLoadMemoryAccess = MSSAU->createMemoryAccessInBB( + MemoryAccess *PreheaderLoadMemoryAccess = MSSAU->createMemoryAccessInBB( PreheaderLoad, nullptr, PreheaderLoad->getParent(), MemorySSA::End); MemoryUse *NewMemUse = cast<MemoryUse>(PreheaderLoadMemoryAccess); - MSSAU->insertUse(NewMemUse); + MSSAU->insertUse(NewMemUse, /*RenameUses=*/true); } + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); // Rewrite all the loads in the loop and remember all the definitions from // stores in the loop. Promoter.run(LoopUses); @@ -2161,7 +2237,7 @@ LoopInvariantCodeMotion::collectAliasInfoForLoop(Loop *L, LoopInfo *LI, LoopToAliasSetMap.erase(MapI); } if (!CurAST) - CurAST = make_unique<AliasSetTracker>(*AA); + CurAST = std::make_unique<AliasSetTracker>(*AA); // Add everything from the sub loops that are no longer directly available. for (Loop *InnerL : RecomputeLoops) @@ -2180,7 +2256,7 @@ std::unique_ptr<AliasSetTracker> LoopInvariantCodeMotion::collectAliasInfoForLoopWithMSSA( Loop *L, AliasAnalysis *AA, MemorySSAUpdater *MSSAU) { auto *MSSA = MSSAU->getMemorySSA(); - auto CurAST = make_unique<AliasSetTracker>(*AA, MSSA, L); + auto CurAST = std::make_unique<AliasSetTracker>(*AA, MSSA, L); CurAST->addAllInstructionsInLoopUsingMSSA(); return CurAST; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp index 1fcf1315a177..ab65f56d088f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/LoopDataPrefetch.h" +#include "llvm/InitializePasses.h" #define DEBUG_TYPE "loop-data-prefetch" #include "llvm/ADT/DepthFirstIterator.h" @@ -312,8 +313,8 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { IRBuilder<> Builder(MemI); Module *M = BB->getParent()->getParent(); Type *I32 = Type::getInt32Ty(BB->getContext()); - Function *PrefetchFunc = - Intrinsic::getDeclaration(M, Intrinsic::prefetch); + Function *PrefetchFunc = Intrinsic::getDeclaration( + M, Intrinsic::prefetch, PrefPtrValue->getType()); Builder.CreateCall( PrefetchFunc, {PrefPtrValue, diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDeletion.cpp index 8371367e24e7..2451572d6171 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDeletion.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/LoopPass.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/LoopUtils.h" @@ -191,7 +192,7 @@ static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT, // Don't remove loops for which we 
can't solve the trip count. // They could be infinite, in which case we'd be changing program behavior. - const SCEV *S = SE.getMaxBackedgeTakenCount(L); + const SCEV *S = SE.getConstantMaxBackedgeTakenCount(L); if (isa<SCEVCouldNotCompute>(S)) { LLVM_DEBUG(dbgs() << "Could not compute SCEV MaxBackedgeTakenCount.\n"); return Changed ? LoopDeletionResult::Modified diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDistribute.cpp index f45e5fd0f50b..8e04e6e0ffe8 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDistribute.cpp @@ -55,6 +55,7 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFuse.cpp index 0bc2bcff2ae1..e1738f08eb23 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFuse.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -55,18 +55,21 @@ #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/IR/Function.h" #include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/CodeMoverUtils.h" using namespace llvm; #define DEBUG_TYPE "loop-fusion" -STATISTIC(FuseCounter, "Count number of loop fusions performed"); +STATISTIC(FuseCounter, "Loops fused"); STATISTIC(NumFusionCandidates, "Number of candidates for loop fusion"); STATISTIC(InvalidPreheader, "Loop has invalid preheader"); STATISTIC(InvalidHeader, "Loop has invalid header"); @@ -79,12 +82,16 @@ STATISTIC(MayThrowException, "Loop may throw an exception"); STATISTIC(ContainsVolatileAccess, "Loop contains a volatile access"); STATISTIC(NotSimplifiedForm, "Loop is not in simplified form"); STATISTIC(InvalidDependencies, "Dependencies prevent fusion"); -STATISTIC(InvalidTripCount, - "Loop does not have invariant backedge taken count"); +STATISTIC(UnknownTripCount, "Loop has unknown trip count"); STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop"); -STATISTIC(NonEqualTripCount, "Candidate trip counts are not the same"); -STATISTIC(NonAdjacent, "Candidates are not adjacent"); -STATISTIC(NonEmptyPreheader, "Candidate has a non-empty preheader"); +STATISTIC(NonEqualTripCount, "Loop trip counts are not the same"); +STATISTIC(NonAdjacent, "Loops are not adjacent"); +STATISTIC(NonEmptyPreheader, "Loop has a non-empty preheader"); +STATISTIC(FusionNotBeneficial, "Fusion is not beneficial"); +STATISTIC(NonIdenticalGuards, "Candidates have different guards"); +STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block"); +STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block"); +STATISTIC(NotRotated, "Candidate is not rotated"); enum FusionDependenceAnalysisChoice { FUSION_DEPENDENCE_ANALYSIS_SCEV, @@ -110,6 +117,7 @@ static cl::opt<bool> cl::Hidden, cl::init(false), cl::ZeroOrMore); #endif +namespace { /// This class is used to represent a candidate for loop fusion. 
When it is /// constructed, it checks the conditions for loop fusion to ensure that it /// represents a valid candidate. It caches several parts of a loop that are @@ -143,6 +151,8 @@ struct FusionCandidate { SmallVector<Instruction *, 16> MemWrites; /// Are all of the members of this fusion candidate still valid bool Valid; + /// Guard branch of the loop, if it exists + BranchInst *GuardBranch; /// Dominator and PostDominator trees are needed for the /// FusionCandidateCompare function, required by FusionCandidateSet to @@ -151,11 +161,14 @@ struct FusionCandidate { const DominatorTree *DT; const PostDominatorTree *PDT; + OptimizationRemarkEmitter &ORE; + FusionCandidate(Loop *L, const DominatorTree *DT, - const PostDominatorTree *PDT) + const PostDominatorTree *PDT, OptimizationRemarkEmitter &ORE) : Preheader(L->getLoopPreheader()), Header(L->getHeader()), ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()), - Latch(L->getLoopLatch()), L(L), Valid(true), DT(DT), PDT(PDT) { + Latch(L->getLoopLatch()), L(L), Valid(true), + GuardBranch(L->getLoopGuardBranch()), DT(DT), PDT(PDT), ORE(ORE) { // Walk over all blocks in the loop and check for conditions that may // prevent fusion. For each block, walk over all instructions and collect @@ -163,28 +176,28 @@ struct FusionCandidate { // found, invalidate this object and return. for (BasicBlock *BB : L->blocks()) { if (BB->hasAddressTaken()) { - AddressTakenBB++; invalidate(); + reportInvalidCandidate(AddressTakenBB); return; } for (Instruction &I : *BB) { if (I.mayThrow()) { - MayThrowException++; invalidate(); + reportInvalidCandidate(MayThrowException); return; } if (StoreInst *SI = dyn_cast<StoreInst>(&I)) { if (SI->isVolatile()) { - ContainsVolatileAccess++; invalidate(); + reportInvalidCandidate(ContainsVolatileAccess); return; } } if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { if (LI->isVolatile()) { - ContainsVolatileAccess++; invalidate(); + reportInvalidCandidate(ContainsVolatileAccess); return; } } @@ -214,19 +227,100 @@ struct FusionCandidate { assert(Latch == L->getLoopLatch() && "Latch is out of sync"); } + /// Get the entry block for this fusion candidate. + /// + /// If this fusion candidate represents a guarded loop, the entry block is the + /// loop guard block. If it represents an unguarded loop, the entry block is + /// the preheader of the loop. + BasicBlock *getEntryBlock() const { + if (GuardBranch) + return GuardBranch->getParent(); + else + return Preheader; + } + + /// Given a guarded loop, get the successor of the guard that is not in the + /// loop. + /// + /// This method returns the successor of the loop guard that is not located + /// within the loop (i.e., the successor of the guard that is not the + /// preheader). + /// This method is only valid for guarded loops. + BasicBlock *getNonLoopBlock() const { + assert(GuardBranch && "Only valid on guarded loops."); + assert(GuardBranch->isConditional() && + "Expecting guard to be a conditional branch."); + return (GuardBranch->getSuccessor(0) == Preheader) + ? GuardBranch->getSuccessor(1) + : GuardBranch->getSuccessor(0); + } + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void dump() const { - dbgs() << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr") + dbgs() << "\tGuardBranch: "; + if (GuardBranch) + dbgs() << *GuardBranch; + else + dbgs() << "nullptr"; + dbgs() << "\n" + << (GuardBranch ? GuardBranch->getName() : "nullptr") << "\n" + << "\tPreheader: " << (Preheader ? 
Preheader->getName() : "nullptr") << "\n" << "\tHeader: " << (Header ? Header->getName() : "nullptr") << "\n" << "\tExitingBB: " << (ExitingBlock ? ExitingBlock->getName() : "nullptr") << "\n" << "\tExitBB: " << (ExitBlock ? ExitBlock->getName() : "nullptr") << "\n" - << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n"; + << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n" + << "\tEntryBlock: " + << (getEntryBlock() ? getEntryBlock()->getName() : "nullptr") + << "\n"; } #endif + /// Determine if a fusion candidate (representing a loop) is eligible for + /// fusion. Note that this only checks whether a single loop can be fused - it + /// does not check whether it is *legal* to fuse two loops together. + bool isEligibleForFusion(ScalarEvolution &SE) const { + if (!isValid()) { + LLVM_DEBUG(dbgs() << "FC has invalid CFG requirements!\n"); + if (!Preheader) + ++InvalidPreheader; + if (!Header) + ++InvalidHeader; + if (!ExitingBlock) + ++InvalidExitingBlock; + if (!ExitBlock) + ++InvalidExitBlock; + if (!Latch) + ++InvalidLatch; + if (L->isInvalid()) + ++InvalidLoop; + + return false; + } + + // Require ScalarEvolution to be able to determine a trip count. + if (!SE.hasLoopInvariantBackedgeTakenCount(L)) { + LLVM_DEBUG(dbgs() << "Loop " << L->getName() + << " trip count not computable!\n"); + return reportInvalidCandidate(UnknownTripCount); + } + + if (!L->isLoopSimplifyForm()) { + LLVM_DEBUG(dbgs() << "Loop " << L->getName() + << " is not in simplified form!\n"); + return reportInvalidCandidate(NotSimplifiedForm); + } + + if (!L->isRotatedForm()) { + LLVM_DEBUG(dbgs() << "Loop " << L->getName() << " is not rotated!\n"); + return reportInvalidCandidate(NotRotated); + } + + return true; + } + private: // This is only used internally for now, to clear the MemWrites and MemReads // list and setting Valid to false. I can't envision other uses of this right @@ -239,17 +333,18 @@ private: MemReads.clear(); Valid = false; } -}; - -inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, - const FusionCandidate &FC) { - if (FC.isValid()) - OS << FC.Preheader->getName(); - else - OS << "<Invalid>"; - return OS; -} + bool reportInvalidCandidate(llvm::Statistic &Stat) const { + using namespace ore; + assert(L && Preheader && "Fusion candidate not initialized properly!"); + ++Stat; + ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, Stat.getName(), + L->getStartLoc(), Preheader) + << "[" << Preheader->getParent()->getName() << "]: " + << "Loop is not a candidate for fusion: " << Stat.getDesc()); + return false; + } +}; struct FusionCandidateCompare { /// Comparison functor to sort two Control Flow Equivalent fusion candidates @@ -260,21 +355,24 @@ struct FusionCandidateCompare { const FusionCandidate &RHS) const { const DominatorTree *DT = LHS.DT; + BasicBlock *LHSEntryBlock = LHS.getEntryBlock(); + BasicBlock *RHSEntryBlock = RHS.getEntryBlock(); + // Do not save PDT to local variable as it is only used in asserts and thus // will trigger an unused variable warning if building without asserts. assert(DT && LHS.PDT && "Expecting valid dominator tree"); // Do this compare first so if LHS == RHS, function returns false. 
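getEntryBlock and getNonLoopBlock above, together with the new rotation requirement in isEligibleForFusion, distinguish guarded from unguarded candidates: for a rotated loop whose zero-trip test survives as a guard, the candidate's entry is the guard block rather than the preheader. A source-level sketch of such a loop, for illustration only:

void scale(double *a, int n, double k) {
  if (n > 0) {        // guard branch: one successor enters the loop's
    int i = 0;        // preheader, the other is the non-loop block
    do {              // rotated, do-while style body
      a[i] *= k;
      ++i;
    } while (i < n);  // latch test at the bottom
  }
}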
- if (DT->dominates(RHS.Preheader, LHS.Preheader)) { + if (DT->dominates(RHSEntryBlock, LHSEntryBlock)) { // RHS dominates LHS // Verify LHS post-dominates RHS - assert(LHS.PDT->dominates(LHS.Preheader, RHS.Preheader)); + assert(LHS.PDT->dominates(LHSEntryBlock, RHSEntryBlock)); return false; } - if (DT->dominates(LHS.Preheader, RHS.Preheader)) { + if (DT->dominates(LHSEntryBlock, RHSEntryBlock)) { // Verify RHS Postdominates LHS - assert(LHS.PDT->dominates(RHS.Preheader, LHS.Preheader)); + assert(LHS.PDT->dominates(RHSEntryBlock, LHSEntryBlock)); return true; } @@ -286,7 +384,6 @@ struct FusionCandidateCompare { } }; -namespace { using LoopVector = SmallVector<Loop *, 4>; // Set of Control Flow Equivalent (CFE) Fusion Candidates, sorted in dominance @@ -301,17 +398,26 @@ using LoopVector = SmallVector<Loop *, 4>; // keeps the FusionCandidateSet sorted will also simplify the implementation. using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>; using FusionCandidateCollection = SmallVector<FusionCandidateSet, 4>; -} // namespace -inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, +#if !defined(NDEBUG) +static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, + const FusionCandidate &FC) { + if (FC.isValid()) + OS << FC.Preheader->getName(); + else + OS << "<Invalid>"; + + return OS; +} + +static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const FusionCandidateSet &CandSet) { - for (auto IT : CandSet) - OS << IT << "\n"; + for (const FusionCandidate &FC : CandSet) + OS << FC << '\n'; return OS; } -#if !defined(NDEBUG) static void printFusionCandidates(const FusionCandidateCollection &FusionCandidates) { dbgs() << "Fusion Candidates: \n"; @@ -391,16 +497,6 @@ static void printLoopVector(const LoopVector &LV) { } #endif -static void reportLoopFusion(const FusionCandidate &FC0, - const FusionCandidate &FC1, - OptimizationRemarkEmitter &ORE) { - using namespace ore; - ORE.emit( - OptimizationRemark(DEBUG_TYPE, "LoopFusion", FC0.Preheader->getParent()) - << "Fused " << NV("Cand1", StringRef(FC0.Preheader->getName())) - << " with " << NV("Cand2", StringRef(FC1.Preheader->getName()))); -} - struct LoopFuser { private: // Sets of control flow equivalent fusion candidates for a given nest level. @@ -497,53 +593,8 @@ private: const FusionCandidate &FC1) const { assert(FC0.Preheader && FC1.Preheader && "Expecting valid preheaders"); - if (DT.dominates(FC0.Preheader, FC1.Preheader)) - return PDT.dominates(FC1.Preheader, FC0.Preheader); - - if (DT.dominates(FC1.Preheader, FC0.Preheader)) - return PDT.dominates(FC0.Preheader, FC1.Preheader); - - return false; - } - - /// Determine if a fusion candidate (representing a loop) is eligible for - /// fusion. Note that this only checks whether a single loop can be fused - it - /// does not check whether it is *legal* to fuse two loops together. - bool eligibleForFusion(const FusionCandidate &FC) const { - if (!FC.isValid()) { - LLVM_DEBUG(dbgs() << "FC " << FC << " has invalid CFG requirements!\n"); - if (!FC.Preheader) - InvalidPreheader++; - if (!FC.Header) - InvalidHeader++; - if (!FC.ExitingBlock) - InvalidExitingBlock++; - if (!FC.ExitBlock) - InvalidExitBlock++; - if (!FC.Latch) - InvalidLatch++; - if (FC.L->isInvalid()) - InvalidLoop++; - - return false; - } - - // Require ScalarEvolution to be able to determine a trip count. 
- if (!SE.hasLoopInvariantBackedgeTakenCount(FC.L)) { - LLVM_DEBUG(dbgs() << "Loop " << FC.L->getName() - << " trip count not computable!\n"); - InvalidTripCount++; - return false; - } - - if (!FC.L->isLoopSimplifyForm()) { - LLVM_DEBUG(dbgs() << "Loop " << FC.L->getName() - << " is not in simplified form!\n"); - NotSimplifiedForm++; - return false; - } - - return true; + return ::isControlFlowEquivalent(*FC0.getEntryBlock(), *FC1.getEntryBlock(), + DT, PDT); } /// Iterate over all loops in the given loop set and identify the loops that @@ -551,8 +602,8 @@ private: /// Flow Equivalent sets, sorted by dominance. void collectFusionCandidates(const LoopVector &LV) { for (Loop *L : LV) { - FusionCandidate CurrCand(L, &DT, &PDT); - if (!eligibleForFusion(CurrCand)) + FusionCandidate CurrCand(L, &DT, &PDT, ORE); + if (!CurrCand.isEligibleForFusion(SE)) continue; // Go through each list in FusionCandidates and determine if L is control @@ -664,31 +715,64 @@ private: if (!identicalTripCounts(*FC0, *FC1)) { LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip " "counts. Not fusing.\n"); - NonEqualTripCount++; + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + NonEqualTripCount); continue; } if (!isAdjacent(*FC0, *FC1)) { LLVM_DEBUG(dbgs() << "Fusion candidates are not adjacent. Not fusing.\n"); - NonAdjacent++; + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, NonAdjacent); + continue; + } + + // Ensure that FC0 and FC1 have identical guards. + // If one (or both) are not guarded, this check is not necessary. + if (FC0->GuardBranch && FC1->GuardBranch && + !haveIdenticalGuards(*FC0, *FC1)) { + LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical " + "guards. Not Fusing.\n"); + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + NonIdenticalGuards); continue; } - // For now we skip fusing if the second candidate has any instructions - // in the preheader. This is done because we currently do not have the - // safety checks to determine if it is save to move the preheader of - // the second candidate past the body of the first candidate. Once - // these checks are added, this condition can be removed. + // The following three checks look for empty blocks in FC0 and FC1. If + // any of these blocks are non-empty, we do not fuse. This is done + // because we currently do not have the safety checks to determine if + // it is safe to move the blocks past other blocks in the loop. Once + // these checks are added, these conditions can be relaxed. if (!isEmptyPreheader(*FC1)) { LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty " "preheader. Not fusing.\n"); - NonEmptyPreheader++; + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + NonEmptyPreheader); + continue; + } + + if (FC0->GuardBranch && !isEmptyExitBlock(*FC0)) { + LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty exit " + "block. Not fusing.\n"); + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + NonEmptyExitBlock); + continue; + } + + if (FC1->GuardBranch && !isEmptyGuardBlock(*FC1)) { + LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty guard " + "block. Not fusing.\n"); + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + NonEmptyGuardBlock); continue; } + // Check the dependencies across the loops and do not fuse if it would + // violate them. 
if (!dependencesAllowFusion(*FC0, *FC1)) { LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n"); + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + InvalidDependencies); continue; } @@ -696,9 +780,11 @@ private: LLVM_DEBUG(dbgs() << "\tFusion appears to be " << (BeneficialToFuse ? "" : "un") << "profitable!\n"); - if (!BeneficialToFuse) + if (!BeneficialToFuse) { + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + FusionNotBeneficial); continue; - + } // All analysis has completed and has determined that fusion is legal // and profitable. At this point, start transforming the code and // perform fusion. @@ -710,15 +796,14 @@ private: // Note this needs to be done *before* performFusion because // performFusion will change the original loops, making it not // possible to identify them after fusion is complete. - reportLoopFusion(*FC0, *FC1, ORE); + reportLoopFusion<OptimizationRemark>(*FC0, *FC1, FuseCounter); - FusionCandidate FusedCand(performFusion(*FC0, *FC1), &DT, &PDT); + FusionCandidate FusedCand(performFusion(*FC0, *FC1), &DT, &PDT, ORE); FusedCand.verify(); - assert(eligibleForFusion(FusedCand) && + assert(FusedCand.isEligibleForFusion(SE) && "Fused candidate should be eligible for fusion!"); // Notify the loop-depth-tree that these loops are not valid objects - // anymore. LDT.removeLoop(FC1->L); CandidateSet.erase(FC0); @@ -889,7 +974,7 @@ private: LLVM_DEBUG(dbgs() << "Check if " << FC0 << " can be fused with " << FC1 << "\n"); assert(FC0.L->getLoopDepth() == FC1.L->getLoopDepth()); - assert(DT.dominates(FC0.Preheader, FC1.Preheader)); + assert(DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock())); for (Instruction *WriteL0 : FC0.MemWrites) { for (Instruction *WriteL1 : FC1.MemWrites) @@ -939,18 +1024,112 @@ private: return true; } - /// Determine if the exit block of \p FC0 is the preheader of \p FC1. In this - /// case, there is no code in between the two fusion candidates, thus making - /// them adjacent. + /// Determine if two fusion candidates are adjacent in the CFG. + /// + /// This method will determine if there are additional basic blocks in the CFG + /// between the exit of \p FC0 and the entry of \p FC1. + /// If the two candidates are guarded loops, then it checks whether the + /// non-loop successor of the \p FC0 guard branch is the entry block of \p + /// FC1. If not, then the loops are not adjacent. If the two candidates are + /// not guarded loops, then it checks whether the exit block of \p FC0 is the + /// preheader of \p FC1. bool isAdjacent(const FusionCandidate &FC0, const FusionCandidate &FC1) const { - return FC0.ExitBlock == FC1.Preheader; + // If the successor of the guard branch is FC1, then the loops are adjacent + if (FC0.GuardBranch) + return FC0.getNonLoopBlock() == FC1.getEntryBlock(); + else + return FC0.ExitBlock == FC1.getEntryBlock(); + } + + /// Determine if two fusion candidates have identical guards + /// + /// This method will determine if two fusion candidates have the same guards. + /// The guards are considered the same if: + /// 1. The instructions to compute the condition used in the compare are + /// identical. + /// 2. The successors of the guard have the same flow into/around the loop. + /// If the compare instructions are identical, then the first successor of the + /// guard must go to the same place (either the preheader of the loop or the + /// NonLoopBlock). 
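Putting the adjacency and guard checks together: two guarded candidates qualify when the non-loop successor of the first guard is the entry block of the second, the guard conditions are identical, and the trip counts match. A hedged sketch of a pair that can satisfy these checks after rotation, provided the intervening preheader, exit, and guard blocks stay empty; names are illustrative:

void two_passes(double *a, double *b, int n) {
  for (int i = 0; i < n; ++i)   // first candidate, guarded by n > 0
    a[i] += 1.0;
  for (int i = 0; i < n; ++i)   // second candidate with the identical guard,
    b[i] *= a[i];               // adjacent to the first in the CFG
}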
In other words, the the first successor of both loops must + /// both go into the loop (i.e., the preheader) or go around the loop (i.e., + /// the NonLoopBlock). The same must be true for the second successor. + bool haveIdenticalGuards(const FusionCandidate &FC0, + const FusionCandidate &FC1) const { + assert(FC0.GuardBranch && FC1.GuardBranch && + "Expecting FC0 and FC1 to be guarded loops."); + + if (auto FC0CmpInst = + dyn_cast<Instruction>(FC0.GuardBranch->getCondition())) + if (auto FC1CmpInst = + dyn_cast<Instruction>(FC1.GuardBranch->getCondition())) + if (!FC0CmpInst->isIdenticalTo(FC1CmpInst)) + return false; + + // The compare instructions are identical. + // Now make sure the successor of the guards have the same flow into/around + // the loop + if (FC0.GuardBranch->getSuccessor(0) == FC0.Preheader) + return (FC1.GuardBranch->getSuccessor(0) == FC1.Preheader); + else + return (FC1.GuardBranch->getSuccessor(1) == FC1.Preheader); + } + + /// Check that the guard for \p FC *only* contains the cmp/branch for the + /// guard. + /// Once we are able to handle intervening code, any code in the guard block + /// for FC1 will need to be treated as intervening code and checked whether + /// it can safely move around the loops. + bool isEmptyGuardBlock(const FusionCandidate &FC) const { + assert(FC.GuardBranch && "Expecting a fusion candidate with guard branch."); + if (auto *CmpInst = dyn_cast<Instruction>(FC.GuardBranch->getCondition())) { + auto *GuardBlock = FC.GuardBranch->getParent(); + // If the generation of the cmp value is in GuardBlock, then the size of + // the guard block should be 2 (cmp + branch). If the generation of the + // cmp value is in a different block, then the size of the guard block + // should only be 1. + if (CmpInst->getParent() == GuardBlock) + return GuardBlock->size() == 2; + else + return GuardBlock->size() == 1; + } + + return false; } bool isEmptyPreheader(const FusionCandidate &FC) const { + assert(FC.Preheader && "Expecting a valid preheader"); return FC.Preheader->size() == 1; } + bool isEmptyExitBlock(const FusionCandidate &FC) const { + assert(FC.ExitBlock && "Expecting a valid exit block"); + return FC.ExitBlock->size() == 1; + } + + /// Simplify the condition of the latch branch of \p FC to true, when both of + /// its successors are the same. + void simplifyLatchBranch(const FusionCandidate &FC) const { + BranchInst *FCLatchBranch = dyn_cast<BranchInst>(FC.Latch->getTerminator()); + if (FCLatchBranch) { + assert(FCLatchBranch->isConditional() && + FCLatchBranch->getSuccessor(0) == FCLatchBranch->getSuccessor(1) && + "Expecting the two successors of FCLatchBranch to be the same"); + FCLatchBranch->setCondition( + llvm::ConstantInt::getTrue(FCLatchBranch->getCondition()->getType())); + } + } + + /// Move instructions from FC0.Latch to FC1.Latch. If FC0.Latch has an unique + /// successor, then merge FC0.Latch with its unique successor. + void mergeLatch(const FusionCandidate &FC0, const FusionCandidate &FC1) { + moveInstsBottomUp(*FC0.Latch, *FC1.Latch, DT, PDT, DI); + if (BasicBlock *Succ = FC0.Latch->getUniqueSuccessor()) { + MergeBlockIntoPredecessor(Succ, &DTU, &LI); + DTU.flush(); + } + } + /// Fuse two fusion candidates, creating a new fused loop. 
/// /// This method contains the mechanics of fusing two loops, represented by \p @@ -987,6 +1166,12 @@ private: LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n"; FC0.dump(); dbgs() << "Fusion Candidate 1: \n"; FC1.dump();); + // Fusing guarded loops is handled slightly differently than non-guarded + // loops and has been broken out into a separate method instead of trying to + // intersperse the logic within a single method. + if (FC0.GuardBranch) + return fuseGuardedLoops(FC0, FC1); + assert(FC1.Preheader == FC0.ExitBlock); assert(FC1.Preheader->size() == 1 && FC1.Preheader->getSingleSuccessor() == FC1.Header); @@ -1078,6 +1263,10 @@ private: FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header); FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header); + // Change the condition of FC0 latch branch to true, as both successors of + // the branch are the same. + simplifyLatchBranch(FC0); + // If FC0.Latch and FC0.ExitingBlock are the same then we have already // performed the updates above. if (FC0.Latch != FC0.ExitingBlock) @@ -1100,9 +1289,15 @@ private: // Is there a way to keep SE up-to-date so we don't need to forget the loops // and rebuild the information in subsequent passes of fusion? + // Note: Need to forget the loops before merging the loop latches, as + // mergeLatch may remove the only block in FC1. SE.forgetLoop(FC1.L); SE.forgetLoop(FC0.L); + // Move instructions from FC0.Latch to FC1.Latch. + // Note: mergeLatch requires an updated DT. + mergeLatch(FC0, FC1); + // Merge the loops. SmallVector<BasicBlock *, 8> Blocks(FC1.L->block_begin(), FC1.L->block_end()); @@ -1131,7 +1326,268 @@ private: SE.verify(); #endif - FuseCounter++; + LLVM_DEBUG(dbgs() << "Fusion done:\n"); + + return FC0.L; + } + + /// Report details on loop fusion opportunities. + /// + /// This template function can be used to report both successful and missed + /// loop fusion opportunities, based on the RemarkKind. The RemarkKind should + /// be one of: + /// - OptimizationRemarkMissed to report when loop fusion is unsuccessful + /// given two valid fusion candidates. + /// - OptimizationRemark to report successful fusion of two fusion + /// candidates. + /// The remarks will be printed using the form: + /// <path/filename>:<line number>:<column number>: [<function name>]: + /// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description> + template <typename RemarkKind> + void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1, + llvm::Statistic &Stat) { + assert(FC0.Preheader && FC1.Preheader && + "Expecting valid fusion candidates"); + using namespace ore; + ++Stat; + ORE.emit(RemarkKind(DEBUG_TYPE, Stat.getName(), FC0.L->getStartLoc(), + FC0.Preheader) + << "[" << FC0.Preheader->getParent()->getName() + << "]: " << NV("Cand1", StringRef(FC0.Preheader->getName())) + << " and " << NV("Cand2", StringRef(FC1.Preheader->getName())) + << ": " << Stat.getDesc()); + } + + /// Fuse two guarded fusion candidates, creating a new fused loop. + /// + /// Fusing guarded loops is handled much the same way as fusing non-guarded + /// loops. The rewiring of the CFG is slightly different though, because of + /// the presence of the guards around the loops and the exit blocks after the + /// loop body. As such, the new loop is rewired as follows: + /// 1. Keep the guard branch from FC0 and use the non-loop block target + /// from the FC1 guard branch. + /// 2. Remove the exit block from FC0 (this exit block should be empty + /// right now). + /// 3. 
Remove the guard branch for FC1 + /// 4. Remove the preheader for FC1. + /// The exit block successor for the latch of FC0 is updated to be the header + /// of FC1 and the non-exit block successor of the latch of FC1 is updated to + /// be the header of FC0, thus creating the fused loop. + Loop *fuseGuardedLoops(const FusionCandidate &FC0, + const FusionCandidate &FC1) { + assert(FC0.GuardBranch && FC1.GuardBranch && "Expecting guarded loops"); + + BasicBlock *FC0GuardBlock = FC0.GuardBranch->getParent(); + BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent(); + BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock(); + BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock(); + + assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent"); + + SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; + + //////////////////////////////////////////////////////////////////////////// + // Update the Loop Guard + //////////////////////////////////////////////////////////////////////////// + // The guard for FC0 is updated to guard both FC0 and FC1. This is done by + // changing the NonLoopGuardBlock for FC0 to the NonLoopGuardBlock for FC1. + // Thus, one path from the guard goes to the preheader for FC0 (and thus + // executes the new fused loop) and the other path goes to the NonLoopBlock + // for FC1 (where FC1 guard would have gone if FC1 was not executed). + FC0.GuardBranch->replaceUsesOfWith(FC0NonLoopBlock, FC1NonLoopBlock); + FC0.ExitBlock->getTerminator()->replaceUsesOfWith(FC1GuardBlock, + FC1.Header); + + // The guard of FC1 is not necessary anymore. + FC1.GuardBranch->eraseFromParent(); + new UnreachableInst(FC1GuardBlock->getContext(), FC1GuardBlock); + + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Delete, FC1GuardBlock, FC1.Preheader)); + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Delete, FC1GuardBlock, FC1NonLoopBlock)); + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Delete, FC0GuardBlock, FC1GuardBlock)); + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock)); + + assert(pred_begin(FC1GuardBlock) == pred_end(FC1GuardBlock) && + "Expecting guard block to have no predecessors"); + assert(succ_begin(FC1GuardBlock) == succ_end(FC1GuardBlock) && + "Expecting guard block to have no successors"); + + // Remember the phi nodes originally in the header of FC0 in order to rewire + // them later. However, this is only necessary if the new loop carried + // values might not dominate the exiting branch. While we do not generally + // test if this is the case but simply insert intermediate phi nodes, we + // need to make sure these intermediate phi nodes have different + // predecessors. To this end, we filter the special case where the exiting + // block is the latch block of the first loop. Nothing needs to be done + // anyway as all loop carried values dominate the latch and thereby also the + // exiting branch. + // KB: This is no longer necessary because FC0.ExitingBlock == FC0.Latch + // (because the loops are rotated. Thus, nothing will ever be added to + // OriginalFC0PHIs. + SmallVector<PHINode *, 8> OriginalFC0PHIs; + if (FC0.ExitingBlock != FC0.Latch) + for (PHINode &PHI : FC0.Header->phis()) + OriginalFC0PHIs.push_back(&PHI); + + assert(OriginalFC0PHIs.empty() && "Expecting OriginalFC0PHIs to be empty!"); + + // Replace incoming blocks for header PHIs first. 
+ FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader);
+ FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch);
+
+ // The old exiting block of the first loop (FC0) has to jump to the header
+ // of the second as we need to execute the code in the second header block
+ // regardless of the trip count. That is, if the trip count is 0, so the
+ // back edge is never taken, we still have to execute both loop headers,
+ // especially (but not only!) if the second is a do-while style loop.
+ // However, doing so might invalidate the phi nodes of the first loop as
+ // the new values do only need to dominate their latch and not the exiting
+ // predicate. To remedy this potential problem we always introduce phi
+ // nodes in the header of the second loop later that select the loop carried
+ // value, if the second header was reached through an old latch of the
+ // first, or undef otherwise. This is sound as exiting the first implies the
+ // second will exit too, __without__ taking the back-edge (their
+ // trip-counts are equal after all).
+ FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock,
+ FC1.Header);
+
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock));
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
+
+ // Remove FC0 Exit Block
+ // The exit block for FC0 is no longer needed since control will flow
+ // directly to the header of FC1. Since it is an empty block, it can be
+ // removed at this point.
+ // TODO: In the future, we can handle non-empty exit blocks by merging any
+ // instructions from FC0 exit block into FC1 exit block prior to removing
+ // the block.
+ assert(pred_begin(FC0.ExitBlock) == pred_end(FC0.ExitBlock) &&
+ "Expecting exit block to be empty");
+ FC0.ExitBlock->getTerminator()->eraseFromParent();
+ new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock);
+
+ // Remove FC1 Preheader
+ // The pre-header of L1 is not necessary anymore.
+ assert(pred_begin(FC1.Preheader) == pred_end(FC1.Preheader));
+ FC1.Preheader->getTerminator()->eraseFromParent();
+ new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader);
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Delete, FC1.Preheader, FC1.Header));
+
+ // Moves the phi nodes from the second to the first loop's header block.
+ while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) {
+ if (SE.isSCEVable(PHI->getType()))
+ SE.forgetValue(PHI);
+ if (PHI->hasNUsesOrMore(1))
+ PHI->moveBefore(&*FC0.Header->getFirstInsertionPt());
+ else
+ PHI->eraseFromParent();
+ }
+
+ // Introduce new phi nodes in the second loop header to ensure
+ // exiting the first and jumping to the header of the second does not break
+ // the SSA property of the phis originally in the first loop. See also the
+ // comment above.
+ Instruction *L1HeaderIP = &FC1.Header->front(); + for (PHINode *LCPHI : OriginalFC0PHIs) { + int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch); + assert(L1LatchBBIdx >= 0 && + "Expected loop carried value to be rewired at this point!"); + + Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx); + + PHINode *L1HeaderPHI = PHINode::Create( + LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP); + L1HeaderPHI->addIncoming(LCV, FC0.Latch); + L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()), + FC0.ExitingBlock); + + LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI); + } + + // Update the latches + + // Replace latch terminator destinations. + FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header); + FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header); + + // Change the condition of FC0 latch branch to true, as both successors of + // the branch are the same. + simplifyLatchBranch(FC0); + + // If FC0.Latch and FC0.ExitingBlock are the same then we have already + // performed the updates above. + if (FC0.Latch != FC0.ExitingBlock) + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Insert, FC0.Latch, FC1.Header)); + + TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, + FC0.Latch, FC0.Header)); + TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert, + FC1.Latch, FC0.Header)); + TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, + FC1.Latch, FC1.Header)); + + // All done + // Apply the updates to the Dominator Tree and cleanup. + + assert(succ_begin(FC1GuardBlock) == succ_end(FC1GuardBlock) && + "FC1GuardBlock has successors!!"); + assert(pred_begin(FC1GuardBlock) == pred_end(FC1GuardBlock) && + "FC1GuardBlock has predecessors!!"); + + // Update DT/PDT + DTU.applyUpdates(TreeUpdates); + + LI.removeBlock(FC1.Preheader); + DTU.deleteBB(FC1.Preheader); + DTU.deleteBB(FC0.ExitBlock); + DTU.flush(); + + // Is there a way to keep SE up-to-date so we don't need to forget the loops + // and rebuild the information in subsequent passes of fusion? + // Note: Need to forget the loops before merging the loop latches, as + // mergeLatch may remove the only block in FC1. + SE.forgetLoop(FC1.L); + SE.forgetLoop(FC0.L); + + // Move instructions from FC0.Latch to FC1.Latch. + // Note: mergeLatch requires an updated DT. + mergeLatch(FC0, FC1); + + // Merge the loops. + SmallVector<BasicBlock *, 8> Blocks(FC1.L->block_begin(), + FC1.L->block_end()); + for (BasicBlock *BB : Blocks) { + FC0.L->addBlockEntry(BB); + FC1.L->removeBlockFromLoop(BB); + if (LI.getLoopFor(BB) != FC1.L) + continue; + LI.changeLoopFor(BB, FC0.L); + } + while (!FC1.L->empty()) { + const auto &ChildLoopIt = FC1.L->begin(); + Loop *ChildLoop = *ChildLoopIt; + FC1.L->removeChildLoop(ChildLoopIt); + FC0.L->addChildLoop(ChildLoop); + } + + // Delete the now empty loop L1. 
+ LI.erase(FC1.L); + +#ifndef NDEBUG + assert(!verifyFunction(*FC0.Header->getParent(), &errs())); + assert(DT.verify(DominatorTree::VerificationLevel::Fast)); + assert(PDT.verify()); + LI.verify(DT); + SE.verify(); +#endif LLVM_DEBUG(dbgs() << "Fusion done:\n"); @@ -1177,6 +1633,7 @@ struct LoopFuseLegacy : public FunctionPass { return LF.fuseLoops(F); } }; +} // namespace PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) { auto &LI = AM.getResult<LoopAnalysis>(F); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index e561494f19cf..b77843d7cd71 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -81,6 +81,7 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -171,7 +172,7 @@ private: bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount); bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize, - unsigned StoreAlignment, Value *StoredVal, + MaybeAlign StoreAlignment, Value *StoredVal, Instruction *TheStore, SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev, const SCEV *BECount, @@ -217,7 +218,8 @@ public: LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); TargetLibraryInfo *TLI = - &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI( + *L->getHeader()->getParent()); const TargetTransformInfo *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( *L->getHeader()->getParent()); @@ -729,7 +731,8 @@ bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL, bool NegStride = StoreSize == -Stride; - if (processLoopStridedStore(StorePtr, StoreSize, HeadStore->getAlignment(), + if (processLoopStridedStore(StorePtr, StoreSize, + MaybeAlign(HeadStore->getAlignment()), StoredVal, HeadStore, AdjacentStores, StoreEv, BECount, NegStride)) { TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end()); @@ -784,9 +787,9 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, SmallPtrSet<Instruction *, 1> MSIs; MSIs.insert(MSI); bool NegStride = SizeInBytes == -Stride; - return processLoopStridedStore(Pointer, (unsigned)SizeInBytes, - MSI->getDestAlignment(), SplatValue, MSI, MSIs, - Ev, BECount, NegStride, /*IsLoopMemset=*/true); + return processLoopStridedStore( + Pointer, (unsigned)SizeInBytes, MaybeAlign(MSI->getDestAlignment()), + SplatValue, MSI, MSIs, Ev, BECount, NegStride, /*IsLoopMemset=*/true); } /// mayLoopAccessLocation - Return true if the specified loop might access the @@ -876,7 +879,7 @@ static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr, /// processLoopStridedStore - We see a strided store of some value. If we can /// transform this into a memset or memset_pattern in the loop preheader, do so. 
bool LoopIdiomRecognize::processLoopStridedStore( - Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment, + Value *DestPtr, unsigned StoreSize, MaybeAlign StoreAlignment, Value *StoredVal, Instruction *TheStore, SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev, const SCEV *BECount, bool NegStride, bool IsLoopMemset) { @@ -898,12 +901,12 @@ bool LoopIdiomRecognize::processLoopStridedStore( SCEVExpander Expander(*SE, *DL, "loop-idiom"); Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS); - Type *IntPtr = Builder.getIntPtrTy(*DL, DestAS); + Type *IntIdxTy = DL->getIndexType(DestPtr->getType()); const SCEV *Start = Ev->getStart(); // Handle negative strided loops. if (NegStride) - Start = getStartForNegStride(Start, BECount, IntPtr, StoreSize, SE); + Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSize, SE); // TODO: ideally we should still be able to generate memset if SCEV expander // is taught to generate the dependencies at the latest point. @@ -931,7 +934,7 @@ bool LoopIdiomRecognize::processLoopStridedStore( // Okay, everything looks good, insert the memset. const SCEV *NumBytesS = - getNumBytes(BECount, IntPtr, StoreSize, CurLoop, DL, SE); + getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE); // TODO: ideally we should still be able to generate memset if SCEV expander // is taught to generate the dependencies at the latest point. @@ -939,12 +942,12 @@ bool LoopIdiomRecognize::processLoopStridedStore( return false; Value *NumBytes = - Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator()); + Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator()); CallInst *NewCall; if (SplatValue) { - NewCall = - Builder.CreateMemSet(BasePtr, SplatValue, NumBytes, StoreAlignment); + NewCall = Builder.CreateMemSet(BasePtr, SplatValue, NumBytes, + MaybeAlign(StoreAlignment)); } else { // Everything is emitted in default address space Type *Int8PtrTy = DestInt8PtrTy; @@ -952,7 +955,7 @@ bool LoopIdiomRecognize::processLoopStridedStore( Module *M = TheStore->getModule(); StringRef FuncName = "memset_pattern16"; FunctionCallee MSP = M->getOrInsertFunction(FuncName, Builder.getVoidTy(), - Int8PtrTy, Int8PtrTy, IntPtr); + Int8PtrTy, Int8PtrTy, IntIdxTy); inferLibFuncAttributes(M, FuncName, *TLI); // Otherwise we should form a memset_pattern16. PatternValue is known to be @@ -961,7 +964,7 @@ bool LoopIdiomRecognize::processLoopStridedStore( GlobalValue::PrivateLinkage, PatternValue, ".memset_pattern"); GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these. - GV->setAlignment(16); + GV->setAlignment(Align(16)); Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy); NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes}); } @@ -1019,11 +1022,11 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *StrStart = StoreEv->getStart(); unsigned StrAS = SI->getPointerAddressSpace(); - Type *IntPtrTy = Builder.getIntPtrTy(*DL, StrAS); + Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS)); // Handle negative strided loops. if (NegStride) - StrStart = getStartForNegStride(StrStart, BECount, IntPtrTy, StoreSize, SE); + StrStart = getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSize, SE); // Okay, we have a strided store "p[i]" of a loaded value. We can turn // this into a memcpy in the loop preheader now if we want. However, this @@ -1049,7 +1052,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, // Handle negative strided loops. 
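The alignment-related edits in this hunk follow the Align/MaybeAlign migration: a raw unsigned alignment of 0 becomes an unset MaybeAlign, any other power of two wraps an Align, and IRBuilder::CreateMemSet accepts the wrapper directly. A small sketch under those assumptions (emitMemSet and RawAlign are illustrative names, not part of the patch):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Alignment.h"

using namespace llvm;

// Emit a memset whose destination alignment may be unknown (RawAlign == 0).
static CallInst *emitMemSet(IRBuilder<> &Builder, Value *Dst, Value *Byte,
                            Value *NumBytes, unsigned RawAlign) {
  // MaybeAlign(0) means "no alignment known"; nonzero wraps Align(RawAlign).
  MaybeAlign DstAlign(RawAlign);
  return Builder.CreateMemSet(Dst, Byte, NumBytes, DstAlign);
}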
if (NegStride) - LdStart = getStartForNegStride(LdStart, BECount, IntPtrTy, StoreSize, SE); + LdStart = getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSize, SE); // For a memcpy, we have to make sure that the input array is not being // mutated by the loop. @@ -1071,18 +1074,18 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, // Okay, everything is safe, we can transform this! const SCEV *NumBytesS = - getNumBytes(BECount, IntPtrTy, StoreSize, CurLoop, DL, SE); + getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE); Value *NumBytes = - Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator()); + Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator()); CallInst *NewCall = nullptr; // Check whether to generate an unordered atomic memcpy: // If the load or store are atomic, then they must necessarily be unordered // by previous checks. if (!SI->isAtomic() && !LI->isAtomic()) - NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlignment(), - LoadBasePtr, LI->getAlignment(), NumBytes); + NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlign(), LoadBasePtr, + LI->getAlign(), NumBytes); else { // We cannot allow unaligned ops for unordered load/store, so reject // anything where the alignment isn't at least the element size. diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp index 31191b52895c..901204181a7c 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -33,6 +33,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/User.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Transforms/Scalar.h" @@ -192,7 +193,8 @@ public: getAnalysis<AssumptionCacheTracker>().getAssumptionCache( *L->getHeader()->getParent()); const TargetLibraryInfo &TLI = - getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + getAnalysis<TargetLibraryInfoWrapperPass>().getTLI( + *L->getHeader()->getParent()); MemorySSA *MSSA = nullptr; Optional<MemorySSAUpdater> MSSAU; if (EnableMSSALoopDependency) { @@ -225,7 +227,8 @@ PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM, Optional<MemorySSAUpdater> MSSAU; if (AR.MSSA) { MSSAU = MemorySSAUpdater(AR.MSSA); - AR.MSSA->verifyMemorySSA(); + if (VerifyMemorySSA) + AR.MSSA->verifyMemorySSA(); } if (!simplifyLoopInst(L, AR.DT, AR.LI, AR.AC, AR.TLI, MSSAU.hasValue() ? 
MSSAU.getPointer() : nullptr)) @@ -233,7 +236,7 @@ PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM, auto PA = getLoopPassPreservedAnalyses(); PA.preserveSet<CFGAnalyses>(); - if (EnableMSSALoopDependency) + if (AR.MSSA) PA.preserve<MemorySSAAnalysis>(); return PA; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 9a42365adc1b..6ce2d06058cf 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -33,6 +33,7 @@ #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -410,8 +411,6 @@ public: void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop); private: - void splitInnerLoopLatch(Instruction *); - void splitInnerLoopHeader(); bool adjustLoopLinks(); void adjustLoopPreheaders(); bool adjustLoopBranches(); @@ -718,22 +717,6 @@ bool LoopInterchangeLegality::findInductionAndReductions( return true; } -static bool containsSafePHI(BasicBlock *Block, bool isOuterLoopExitBlock) { - for (PHINode &PHI : Block->phis()) { - // Reduction lcssa phi will have only 1 incoming block that from loop latch. - if (PHI.getNumIncomingValues() > 1) - return false; - Instruction *Ins = dyn_cast<Instruction>(PHI.getIncomingValue(0)); - if (!Ins) - return false; - // Incoming value for lcssa phi's in outer loop exit can only be inner loop - // exits lcssa phi else it would not be tightly nested. - if (!isa<PHINode>(Ins) && isOuterLoopExitBlock) - return false; - } - return true; -} - // This function indicates the current limitations in the transform as a result // of which we do not proceed. bool LoopInterchangeLegality::currentLimitations() { @@ -832,21 +815,6 @@ bool LoopInterchangeLegality::currentLimitations() { return true; } - // TODO: We only handle LCSSA PHI's corresponding to reduction for now. - BasicBlock *InnerExit = InnerLoop->getExitBlock(); - if (!containsSafePHI(InnerExit, false)) { - LLVM_DEBUG( - dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n"); - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "NoLCSSAPHIOuterInner", - InnerLoop->getStartLoc(), - InnerLoop->getHeader()) - << "Only inner loops with LCSSA PHIs can be interchange " - "currently."; - }); - return true; - } - // TODO: Current limitation: Since we split the inner loop latch at the point // were induction variable is incremented (induction.next); We cannot have // more than 1 user of induction.next since it would result in broken code @@ -922,6 +890,28 @@ bool LoopInterchangeLegality::currentLimitations() { return false; } +// We currently only support LCSSA PHI nodes in the inner loop exit, if their +// users are either reduction PHIs or PHIs outside the outer loop (which means +// the we are only interested in the final value after the loop). +static bool +areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL, + SmallPtrSetImpl<PHINode *> &Reductions) { + BasicBlock *InnerExit = OuterL->getUniqueExitBlock(); + for (PHINode &PHI : InnerExit->phis()) { + // Reduction lcssa phi will have only 1 incoming block that from loop latch. 
+ if (PHI.getNumIncomingValues() > 1) + return false; + if (any_of(PHI.users(), [&Reductions, OuterL](User *U) { + PHINode *PN = dyn_cast<PHINode>(U); + return !PN || (Reductions.find(PN) == Reductions.end() && + OuterL->contains(PN->getParent())); + })) { + return false; + } + } + return true; +} + // We currently support LCSSA PHI nodes in the outer loop exit, if their // incoming values do not come from the outer loop latch or if the // outer loop latch has a single predecessor. In that case, the value will @@ -929,7 +919,7 @@ bool LoopInterchangeLegality::currentLimitations() { // will still be true after interchanging. If we have multiple predecessor, // that may not be the case, e.g. because the outer loop latch may be executed // if the inner loop is not executed. -static bool areLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) { +static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) { BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock(); for (PHINode &PHI : LoopNestExit->phis()) { // FIXME: We currently are not able to detect floating point reductions @@ -1014,7 +1004,19 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, return false; } - if (!areLoopExitPHIsSupported(OuterLoop, InnerLoop)) { + if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop, + OuterInnerReductions)) { + LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n"); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Found unsupported PHI node in loop exit."; + }); + return false; + } + + if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) { LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n"); ORE->emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI", @@ -1226,7 +1228,7 @@ bool LoopInterchangeTransform::transform() { if (InnerLoop->getSubLoops().empty()) { BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); - LLVM_DEBUG(dbgs() << "Calling Split Inner Loop\n"); + LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n"); PHINode *InductionPHI = getInductionVariable(InnerLoop, SE); if (!InductionPHI) { LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n"); @@ -1242,11 +1244,55 @@ bool LoopInterchangeTransform::transform() { if (&InductionPHI->getParent()->front() != InductionPHI) InductionPHI->moveBefore(&InductionPHI->getParent()->front()); - // Split at the place were the induction variable is - // incremented/decremented. - // TODO: This splitting logic may not work always. Fix this. - splitInnerLoopLatch(InnerIndexVar); - LLVM_DEBUG(dbgs() << "splitInnerLoopLatch done\n"); + // Create a new latch block for the inner loop. We split at the + // current latch's terminator and then move the condition and all + // operands that are not either loop-invariant or the induction PHI into the + // new latch block. + BasicBlock *NewLatch = + SplitBlock(InnerLoop->getLoopLatch(), + InnerLoop->getLoopLatch()->getTerminator(), DT, LI); + + SmallSetVector<Instruction *, 4> WorkList; + unsigned i = 0; + auto MoveInstructions = [&i, &WorkList, this, InductionPHI, NewLatch]() { + for (; i < WorkList.size(); i++) { + // Duplicate instruction and move it the new latch. Update uses that + // have been moved. 
+ Instruction *NewI = WorkList[i]->clone(); + NewI->insertBefore(NewLatch->getFirstNonPHI()); + assert(!NewI->mayHaveSideEffects() && + "Moving instructions with side-effects may change behavior of " + "the loop nest!"); + for (auto UI = WorkList[i]->use_begin(), UE = WorkList[i]->use_end(); + UI != UE;) { + Use &U = *UI++; + Instruction *UserI = cast<Instruction>(U.getUser()); + if (!InnerLoop->contains(UserI->getParent()) || + UserI->getParent() == NewLatch || UserI == InductionPHI) + U.set(NewI); + } + // Add operands of moved instruction to the worklist, except if they are + // outside the inner loop or are the induction PHI. + for (Value *Op : WorkList[i]->operands()) { + Instruction *OpI = dyn_cast<Instruction>(Op); + if (!OpI || + this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop || + OpI == InductionPHI) + continue; + WorkList.insert(OpI); + } + } + }; + + // FIXME: Should we interchange when we have a constant condition? + Instruction *CondI = dyn_cast<Instruction>( + cast<BranchInst>(InnerLoop->getLoopLatch()->getTerminator()) + ->getCondition()); + if (CondI) + WorkList.insert(CondI); + MoveInstructions(); + WorkList.insert(cast<Instruction>(InnerIndexVar)); + MoveInstructions(); // Splits the inner loops phi nodes out into a separate basic block. BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); @@ -1263,10 +1309,6 @@ bool LoopInterchangeTransform::transform() { return true; } -void LoopInterchangeTransform::splitInnerLoopLatch(Instruction *Inc) { - SplitBlock(InnerLoop->getLoopLatch(), Inc, DT, LI); -} - /// \brief Move all instructions except the terminator from FromBB right before /// InsertBefore static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) { @@ -1277,31 +1319,39 @@ static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) { FromBB->getTerminator()->getIterator()); } -/// Update BI to jump to NewBB instead of OldBB. Records updates to -/// the dominator tree in DTUpdates, if DT should be preserved. +// Update BI to jump to NewBB instead of OldBB. Records updates to the +// dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that +// \p OldBB is exactly once in BI's successor list. static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB, BasicBlock *NewBB, - std::vector<DominatorTree::UpdateType> &DTUpdates) { - assert(llvm::count_if(successors(BI), - [OldBB](BasicBlock *BB) { return BB == OldBB; }) < 2 && - "BI must jump to OldBB at most once."); - for (unsigned i = 0, e = BI->getNumSuccessors(); i < e; ++i) { - if (BI->getSuccessor(i) == OldBB) { - BI->setSuccessor(i, NewBB); - - DTUpdates.push_back( - {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB}); - DTUpdates.push_back( - {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB}); - break; + std::vector<DominatorTree::UpdateType> &DTUpdates, + bool MustUpdateOnce = true) { + assert((!MustUpdateOnce || + llvm::count_if(successors(BI), + [OldBB](BasicBlock *BB) { + return BB == OldBB; + }) == 1) && "BI must jump to OldBB exactly once."); + bool Changed = false; + for (Use &Op : BI->operands()) + if (Op == OldBB) { + Op.set(NewBB); + Changed = true; } + + if (Changed) { + DTUpdates.push_back( + {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB}); + DTUpdates.push_back( + {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB}); } + assert(Changed && "Expected a successor to be updated"); } // Move Lcssa PHIs to the right place. 
static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader, BasicBlock *InnerLatch, BasicBlock *OuterHeader, - BasicBlock *OuterLatch, BasicBlock *OuterExit) { + BasicBlock *OuterLatch, BasicBlock *OuterExit, + Loop *InnerLoop, LoopInfo *LI) { // Deal with LCSSA PHI nodes in the exit block of the inner loop, that are // defined either in the header or latch. Those blocks will become header and @@ -1356,19 +1406,17 @@ static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader, P->moveBefore(InnerExit->getFirstNonPHI()); // Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have - // incoming values from the outer latch or header, we have to add a new PHI + // incoming values defined in the outer loop, we have to add a new PHI // in the inner loop latch, which became the exit block of the outer loop, // after interchanging. if (OuterExit) { for (PHINode &P : OuterExit->phis()) { if (P.getNumIncomingValues() != 1) continue; - // Skip Phis with incoming values not defined in the outer loop's header - // and latch. Also skip incoming phis defined in the latch. Those should + // Skip Phis with incoming values defined in the inner loop. Those should // already have been updated. auto I = dyn_cast<Instruction>(P.getIncomingValue(0)); - if (!I || ((I->getParent() != OuterLatch || isa<PHINode>(I)) && - I->getParent() != OuterHeader)) + if (!I || LI->getLoopFor(I->getParent()) == InnerLoop) continue; PHINode *NewPhi = dyn_cast<PHINode>(P.clone()); @@ -1443,12 +1491,21 @@ bool LoopInterchangeTransform::adjustLoopBranches() { if (!InnerLoopHeaderSuccessor) return false; - // Adjust Loop Preheader and headers + // Adjust Loop Preheader and headers. + // The branches in the outer loop predecessor and the outer loop header can + // be unconditional branches or conditional branches with duplicates. Consider + // this when updating the successors. updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader, - InnerLoopPreHeader, DTUpdates); - updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, LoopExit, DTUpdates); + InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false); + // The outer loop header might or might not branch to the outer latch. + // We are guaranteed to branch to the inner loop preheader. + if (std::find(succ_begin(OuterLoopHeaderBI), succ_end(OuterLoopHeaderBI), + OuterLoopLatch) != succ_end(OuterLoopHeaderBI)) + updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, LoopExit, DTUpdates, + /*MustUpdateOnce=*/false); updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader, - InnerLoopHeaderSuccessor, DTUpdates); + InnerLoopHeaderSuccessor, DTUpdates, + /*MustUpdateOnce=*/false); // Adjust reduction PHI's now that the incoming block has changed. InnerLoopHeaderSuccessor->replacePhiUsesWith(InnerLoopHeader, @@ -1482,7 +1539,8 @@ bool LoopInterchangeTransform::adjustLoopBranches() { OuterLoopPreHeader); moveLCSSAPhis(InnerLoopLatchSuccessor, InnerLoopHeader, InnerLoopLatch, - OuterLoopHeader, OuterLoopLatch, InnerLoop->getExitBlock()); + OuterLoopHeader, OuterLoopLatch, InnerLoop->getExitBlock(), + InnerLoop, LI); // For PHIs in the exit block of the outer loop, outer's latch has been // replaced by Inners'. 
OuterLoopLatchSuccessor->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp index 2b3d5e0ce9b7..4e1b4e87ebc9 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -49,6 +49,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -435,7 +436,8 @@ public: PH->getTerminator()); Value *Initial = new LoadInst( Cand.Load->getType(), InitialPtr, "load_initial", - /* isVolatile */ false, Cand.Load->getAlignment(), PH->getTerminator()); + /* isVolatile */ false, MaybeAlign(Cand.Load->getAlignment()), + PH->getTerminator()); PHINode *PHI = PHINode::Create(Initial->getType(), 2, "store_forwarded", &L->getHeader()->front()); @@ -487,7 +489,7 @@ public: // Filter the candidates further. SmallVector<StoreToLoadForwardingCandidate, 4> Candidates; unsigned NumForwarding = 0; - for (const StoreToLoadForwardingCandidate Cand : StoreToLoadDependences) { + for (const StoreToLoadForwardingCandidate &Cand : StoreToLoadDependences) { LLVM_DEBUG(dbgs() << "Candidate " << Cand); // Make sure that the stored values is available everywhere in the loop in @@ -543,7 +545,8 @@ public: auto *HeaderBB = L->getHeader(); auto *F = HeaderBB->getParent(); bool OptForSize = F->hasOptSize() || - llvm::shouldOptimizeForSize(HeaderBB, PSI, BFI); + llvm::shouldOptimizeForSize(HeaderBB, PSI, BFI, + PGSOQueryType::IRPass); if (OptForSize) { LLVM_DEBUG( dbgs() << "Versioning is needed but not allowed when optimizing " diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPredication.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPredication.cpp index 507a1e251ca6..1a42f6b23443 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPredication.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPredication.cpp @@ -191,9 +191,12 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/GuardUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" @@ -248,7 +251,9 @@ struct LoopICmp { class LoopPredication { AliasAnalysis *AA; + DominatorTree *DT; ScalarEvolution *SE; + LoopInfo *LI; BranchProbabilityInfo *BPI; Loop *L; @@ -300,10 +305,13 @@ class LoopPredication { // within the loop. We identify such unprofitable loops through BPI. 
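The new LoopPredication code further down is phrased in terms of widenable branches: a conditional branch whose condition is the AND of an actual check and a call to the llvm.experimental.widenable.condition intrinsic, which later transforms are allowed to strengthen. A rough sketch of building such a branch with IRBuilder (block and value names are illustrative assumptions):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Build: br i1 (and RealCond, widenable_condition()), GuardedBB, DeoptBB
static BranchInst *emitWidenableBranch(IRBuilder<> &Builder, Module *M,
                                       Value *RealCond, BasicBlock *GuardedBB,
                                       BasicBlock *DeoptBB) {
  Function *WC = Intrinsic::getDeclaration(
      M, Intrinsic::experimental_widenable_condition);
  Value *WCCall = Builder.CreateCall(WC, {}, "widenable_cond");
  Value *Cond = Builder.CreateAnd(RealCond, WCCall, "guard_cond");
  return Builder.CreateCondBr(Cond, GuardedBB, DeoptBB);
}

Because spuriously taking the false (deoptimizing) successor is always allowed, a pass that ANDs extra conditions into the branch only has to keep the deopt path sound, which is what the predicateLoopExits transform below relies on.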
bool isLoopProfitableToPredicate(); + bool predicateLoopExits(Loop *L, SCEVExpander &Rewriter); + public: - LoopPredication(AliasAnalysis *AA, ScalarEvolution *SE, + LoopPredication(AliasAnalysis *AA, DominatorTree *DT, + ScalarEvolution *SE, LoopInfo *LI, BranchProbabilityInfo *BPI) - : AA(AA), SE(SE), BPI(BPI){}; + : AA(AA), DT(DT), SE(SE), LI(LI), BPI(BPI) {}; bool runOnLoop(Loop *L); }; @@ -323,10 +331,12 @@ public: if (skipLoop(L)) return false; auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); BranchProbabilityInfo &BPI = getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI(); auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); - LoopPredication LP(AA, SE, &BPI); + LoopPredication LP(AA, DT, SE, LI, &BPI); return LP.runOnLoop(L); } }; @@ -352,7 +362,7 @@ PreservedAnalyses LoopPredicationPass::run(Loop &L, LoopAnalysisManager &AM, AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager(); Function *F = L.getHeader()->getParent(); auto *BPI = FAM.getCachedResult<BranchProbabilityAnalysis>(*F); - LoopPredication LP(&AR.AA, &AR.SE, BPI); + LoopPredication LP(&AR.AA, &AR.DT, &AR.SE, &AR.LI, BPI); if (!LP.runOnLoop(&L)) return PreservedAnalyses::all(); @@ -543,7 +553,7 @@ bool LoopPredication::isLoopInvariantValue(const SCEV* S) { if (const auto *LI = dyn_cast<LoadInst>(U->getValue())) if (LI->isUnordered() && L->hasLoopInvariantOperands(LI)) if (AA->pointsToConstantMemory(LI->getOperand(0)) || - LI->getMetadata(LLVMContext::MD_invariant_load) != nullptr) + LI->hasMetadata(LLVMContext::MD_invariant_load)) return true; return false; } @@ -823,9 +833,9 @@ bool LoopPredication::widenWidenableBranchGuardConditions( Value *AllChecks = Builder.CreateAnd(Checks); auto *OldCond = BI->getCondition(); BI->setCondition(AllChecks); + RecursivelyDeleteTriviallyDeadInstructions(OldCond); assert(isGuardAsWidenableBranch(BI) && "Stopped being a guard after transform?"); - RecursivelyDeleteTriviallyDeadInstructions(OldCond); LLVM_DEBUG(dbgs() << "Widened checks = " << NumWidened << "\n"); return true; @@ -953,6 +963,233 @@ bool LoopPredication::isLoopProfitableToPredicate() { return true; } +/// If we can (cheaply) find a widenable branch which controls entry into the +/// loop, return it. +static BranchInst *FindWidenableTerminatorAboveLoop(Loop *L, LoopInfo &LI) { + // Walk back through any unconditional executed blocks and see if we can find + // a widenable condition which seems to control execution of this loop. Note + // that we predict that maythrow calls are likely untaken and thus that it's + // profitable to widen a branch before a maythrow call with a condition + // afterwards even though that may cause the slow path to run in a case where + // it wouldn't have otherwise. + BasicBlock *BB = L->getLoopPreheader(); + if (!BB) + return nullptr; + do { + if (BasicBlock *Pred = BB->getSinglePredecessor()) + if (BB == Pred->getSingleSuccessor()) { + BB = Pred; + continue; + } + break; + } while (true); + + if (BasicBlock *Pred = BB->getSinglePredecessor()) { + auto *Term = Pred->getTerminator(); + + Value *Cond, *WC; + BasicBlock *IfTrueBB, *IfFalseBB; + if (parseWidenableBranch(Term, Cond, WC, IfTrueBB, IfFalseBB) && + IfTrueBB == BB) + return cast<BranchInst>(Term); + } + return nullptr; +} + +/// Return the minimum of all analyzeable exit counts. This is an upper bound +/// on the actual exit count. 
If there are not at least two analyzeable exits, +/// returns SCEVCouldNotCompute. +static const SCEV *getMinAnalyzeableBackedgeTakenCount(ScalarEvolution &SE, + DominatorTree &DT, + Loop *L) { + SmallVector<BasicBlock *, 16> ExitingBlocks; + L->getExitingBlocks(ExitingBlocks); + + SmallVector<const SCEV *, 4> ExitCounts; + for (BasicBlock *ExitingBB : ExitingBlocks) { + const SCEV *ExitCount = SE.getExitCount(L, ExitingBB); + if (isa<SCEVCouldNotCompute>(ExitCount)) + continue; + assert(DT.dominates(ExitingBB, L->getLoopLatch()) && + "We should only have known counts for exiting blocks that " + "dominate latch!"); + ExitCounts.push_back(ExitCount); + } + if (ExitCounts.size() < 2) + return SE.getCouldNotCompute(); + return SE.getUMinFromMismatchedTypes(ExitCounts); +} + +/// Return true if we can be fairly sure that executing block BB will probably +/// lead to executing an __llvm_deoptimize. This is a profitability heuristic, +/// not a legality constraint. +static bool isVeryLikelyToDeopt(BasicBlock *BB) { + while (BB->getUniqueSuccessor()) + // Will skip side effects, that's okay + BB = BB->getUniqueSuccessor(); + + return BB->getTerminatingDeoptimizeCall(); +} + +/// This implements an analogous, but entirely distinct transform from the main +/// loop predication transform. This one is phrased in terms of using a +/// widenable branch *outside* the loop to allow us to simplify loop exits in a +/// following loop. This is close in spirit to the IndVarSimplify transform +/// of the same name, but is materially different widening loosens legality +/// sharply. +bool LoopPredication::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) { + // The transformation performed here aims to widen a widenable condition + // above the loop such that all analyzeable exit leading to deopt are dead. + // It assumes that the latch is the dominant exit for profitability and that + // exits branching to deoptimizing blocks are rarely taken. It relies on the + // semantics of widenable expressions for legality. (i.e. being able to fall + // down the widenable path spuriously allows us to ignore exit order, + // unanalyzeable exits, side effects, exceptional exits, and other challenges + // which restrict the applicability of the non-WC based version of this + // transform in IndVarSimplify.) + // + // NOTE ON POISON/UNDEF - We're hoisting an expression above guards which may + // imply flags on the expression being hoisted and inserting new uses (flags + // are only correct for current uses). The result is that we may be + // inserting a branch on the value which can be either poison or undef. In + // this case, the branch can legally go either way; we just need to avoid + // introducing UB. This is achieved through the use of the freeze + // instruction. + + SmallVector<BasicBlock *, 16> ExitingBlocks; + L->getExitingBlocks(ExitingBlocks); + + if (ExitingBlocks.empty()) + return false; // Nothing to do. + + auto *Latch = L->getLoopLatch(); + if (!Latch) + return false; + + auto *WidenableBR = FindWidenableTerminatorAboveLoop(L, *LI); + if (!WidenableBR) + return false; + + const SCEV *LatchEC = SE->getExitCount(L, Latch); + if (isa<SCEVCouldNotCompute>(LatchEC)) + return false; // profitability - want hot exit in analyzeable set + + // At this point, we have found an analyzeable latch, and a widenable + // condition above the loop. 
If we have a widenable exit within the loop + // (for which we can't compute exit counts), drop the ability to further + // widen so that we gain ability to analyze it's exit count and perform this + // transform. TODO: It'd be nice to know for sure the exit became + // analyzeable after dropping widenability. + { + bool Invalidate = false; + + for (auto *ExitingBB : ExitingBlocks) { + if (LI->getLoopFor(ExitingBB) != L) + continue; + + auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator()); + if (!BI) + continue; + + Use *Cond, *WC; + BasicBlock *IfTrueBB, *IfFalseBB; + if (parseWidenableBranch(BI, Cond, WC, IfTrueBB, IfFalseBB) && + L->contains(IfTrueBB)) { + WC->set(ConstantInt::getTrue(IfTrueBB->getContext())); + Invalidate = true; + } + } + if (Invalidate) + SE->forgetLoop(L); + } + + // The use of umin(all analyzeable exits) instead of latch is subtle, but + // important for profitability. We may have a loop which hasn't been fully + // canonicalized just yet. If the exit we chose to widen is provably never + // taken, we want the widened form to *also* be provably never taken. We + // can't guarantee this as a current unanalyzeable exit may later become + // analyzeable, but we can at least avoid the obvious cases. + const SCEV *MinEC = getMinAnalyzeableBackedgeTakenCount(*SE, *DT, L); + if (isa<SCEVCouldNotCompute>(MinEC) || MinEC->getType()->isPointerTy() || + !SE->isLoopInvariant(MinEC, L) || + !isSafeToExpandAt(MinEC, WidenableBR, *SE)) + return false; + + // Subtlety: We need to avoid inserting additional uses of the WC. We know + // that it can only have one transitive use at the moment, and thus moving + // that use to just before the branch and inserting code before it and then + // modifying the operand is legal. + auto *IP = cast<Instruction>(WidenableBR->getCondition()); + IP->moveBefore(WidenableBR); + Rewriter.setInsertPoint(IP); + IRBuilder<> B(IP); + + bool Changed = false; + Value *MinECV = nullptr; // lazily generated if needed + for (BasicBlock *ExitingBB : ExitingBlocks) { + // If our exiting block exits multiple loops, we can only rewrite the + // innermost one. Otherwise, we're changing how many times the innermost + // loop runs before it exits. + if (LI->getLoopFor(ExitingBB) != L) + continue; + + // Can't rewrite non-branch yet. + auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator()); + if (!BI) + continue; + + // If already constant, nothing to do. + if (isa<Constant>(BI->getCondition())) + continue; + + const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); + if (isa<SCEVCouldNotCompute>(ExitCount) || + ExitCount->getType()->isPointerTy() || + !isSafeToExpandAt(ExitCount, WidenableBR, *SE)) + continue; + + const bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB)); + BasicBlock *ExitBB = BI->getSuccessor(ExitIfTrue ? 
0 : 1); + if (!isVeryLikelyToDeopt(ExitBB)) + // Profitability: indicator of rarely/never taken exit + continue; + + // If we found a widenable exit condition, do two things: + // 1) fold the widened exit test into the widenable condition + // 2) fold the branch to untaken - avoids infinite looping + + Value *ECV = Rewriter.expandCodeFor(ExitCount); + if (!MinECV) + MinECV = Rewriter.expandCodeFor(MinEC); + Value *RHS = MinECV; + if (ECV->getType() != RHS->getType()) { + Type *WiderTy = SE->getWiderType(ECV->getType(), RHS->getType()); + ECV = B.CreateZExt(ECV, WiderTy); + RHS = B.CreateZExt(RHS, WiderTy); + } + assert(!Latch || DT->dominates(ExitingBB, Latch)); + Value *NewCond = B.CreateICmp(ICmpInst::ICMP_UGT, ECV, RHS); + // Freeze poison or undef to an arbitrary bit pattern to ensure we can + // branch without introducing UB. See NOTE ON POISON/UNDEF above for + // context. + NewCond = B.CreateFreeze(NewCond); + + widenWidenableBranch(WidenableBR, NewCond); + + Value *OldCond = BI->getCondition(); + BI->setCondition(ConstantInt::get(OldCond->getType(), !ExitIfTrue)); + Changed = true; + } + + if (Changed) + // We just mutated a bunch of loop exits changing there exit counts + // widely. We need to force recomputation of the exit counts given these + // changes. Note that all of the inserted exits are never taken, and + // should be removed next time the CFG is modified. + SE->forgetLoop(L); + return Changed; +} + bool LoopPredication::runOnLoop(Loop *Loop) { L = Loop; @@ -1004,16 +1241,12 @@ bool LoopPredication::runOnLoop(Loop *Loop) { cast<BranchInst>(BB->getTerminator())); } - if (Guards.empty() && GuardsAsWidenableBranches.empty()) - return false; - SCEVExpander Expander(*SE, *DL, "loop-predication"); - bool Changed = false; for (auto *Guard : Guards) Changed |= widenGuardConditions(Guard, Expander); for (auto *Guard : GuardsAsWidenableBranches) Changed |= widenWidenableBranchGuardConditions(Guard, Expander); - + Changed |= predicateLoopExits(L, Expander); return Changed; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp index 166b57f20b43..da13a342ae12 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp @@ -27,7 +27,6 @@ #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" @@ -45,6 +44,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -53,6 +53,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include <cassert> #include <cstddef> @@ -1644,7 +1645,8 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) { AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); - TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI( + 
*L->getHeader()->getParent()); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopRotation.cpp index e009947690af..0868e742f4ee 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopRotation.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopRotation.cpp @@ -18,6 +18,8 @@ #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" @@ -55,7 +57,7 @@ PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM, AR.MSSA->verifyMemorySSA(); auto PA = getLoopPassPreservedAnalyses(); - if (EnableMSSALoopDependency) + if (AR.MSSA) PA.preserve<MemorySSAAnalysis>(); return PA; } @@ -94,17 +96,15 @@ public: auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); const auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; - auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); - auto *SE = SEWP ? &SEWP->getSE() : nullptr; + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); const SimplifyQuery SQ = getBestSimplifyQuery(*this, F); Optional<MemorySSAUpdater> MSSAU; if (EnableMSSALoopDependency) { MemorySSA *MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA(); MSSAU = MemorySSAUpdater(MSSA); } - return LoopRotation(L, LI, TTI, AC, DT, SE, + return LoopRotation(L, LI, TTI, AC, &DT, &SE, MSSAU.hasValue() ? MSSAU.getPointer() : nullptr, SQ, false, MaxHeaderSize, false); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp index 046f4c8af492..b27e65e0adb7 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp @@ -30,6 +30,8 @@ #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils.h" @@ -660,6 +662,9 @@ static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT, // Merge Succ into Pred and delete it. 
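A bit further on, the LoopSink hunk swaps a hand-written use-list walk for Value::replaceUsesWithIf, which rewrites only the uses accepted by a predicate. A minimal sketch of that helper (replaceUsesInBlock and its parameter names are illustrative):

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"

using namespace llvm;

// Redirect to NewV only those uses of OldV whose user sits in block BB.
static void replaceUsesInBlock(Value *OldV, Value *NewV, BasicBlock *BB) {
  OldV->replaceUsesWithIf(NewV, [BB](Use &U) {
    return cast<Instruction>(U.getUser())->getParent() == BB;
  });
}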
MergeBlockIntoPredecessor(Succ, &DTU, &LI, MSSAU); + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); + Changed = true; } @@ -690,7 +695,7 @@ PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &LPMU) { Optional<MemorySSAUpdater> MSSAU; - if (EnableMSSALoopDependency && AR.MSSA) + if (AR.MSSA) MSSAU = MemorySSAUpdater(AR.MSSA); bool DeleteCurrentLoop = false; if (!simplifyLoopCFG(L, AR.DT, AR.LI, AR.SE, @@ -702,7 +707,7 @@ PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM, LPMU.markLoopAsDeleted(L, "loop-simplifycfg"); auto PA = getLoopPassPreservedAnalyses(); - if (EnableMSSALoopDependency) + if (AR.MSSA) PA.preserve<MemorySSAAnalysis>(); return PA; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSink.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSink.cpp index 975452e13f09..1c03a4bf6c02 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSink.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSink.cpp @@ -41,14 +41,15 @@ #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" using namespace llvm; @@ -230,12 +231,9 @@ static bool sinkInstruction(Loop &L, Instruction &I, IC->setName(I.getName()); IC->insertBefore(&*N->getFirstInsertionPt()); // Replaces uses of I with IC in N - for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE;) { - Use &U = *UI++; - auto *I = cast<Instruction>(U.getUser()); - if (I->getParent() == N) - U.set(IC); - } + I.replaceUsesWithIf(IC, [N](Use &U) { + return cast<Instruction>(U.getUser())->getParent() == N; + }); // Replaces uses of I with IC in blocks dominated by N replaceDominatedUsesWith(&I, IC, DT, N); LLVM_DEBUG(dbgs() << "Sinking a clone of " << I << " To: " << N->getName() diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 59a387a186b8..e9f368628a08 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -74,7 +74,6 @@ #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ScalarEvolutionNormalization.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" @@ -97,6 +96,7 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -108,6 +108,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <cassert> #include <cstddef> @@ -115,8 +116,8 @@ #include <cstdlib> #include <iterator> #include <limits> 
-#include <numeric> #include <map> +#include <numeric> #include <utility> using namespace llvm; @@ -1386,7 +1387,9 @@ void Cost::RateFormula(const Formula &F, // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as // additional instruction (at least fill). - unsigned TTIRegNum = TTI->getNumberOfRegisters(false) - 1; + // TODO: Need distinguish register class? + unsigned TTIRegNum = TTI->getNumberOfRegisters( + TTI->getRegisterClassForType(false, F.getType())) - 1; if (C.NumRegs > TTIRegNum) { // Cost already exceeded TTIRegNum, then only newly added register can add // new instructions. @@ -3165,6 +3168,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n"); return; } + assert(IVSrc && "Failed to find IV chain source"); LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n"); Type *IVTy = IVSrc->getType(); @@ -3265,12 +3269,12 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { // requirements for both N and i at the same time. Limiting this code to // equality icmps is not a problem because all interesting loops use // equality icmps, thanks to IndVarSimplify. - if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst)) + if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst)) { + // If CI can be saved in some target, like replaced inside hardware loop + // in PowerPC, no need to generate initial formulae for it. + if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition())) + continue; if (CI->isEquality()) { - // If CI can be saved in some target, like replaced inside hardware loop - // in PowerPC, no need to generate initial formulae for it. - if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition())) - continue; // Swap the operands if needed to put the OperandValToReplace on the // left, for consistency. Value *NV = CI->getOperand(1); @@ -3298,6 +3302,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { Factors.insert(-(uint64_t)Factors[i]); Factors.insert(-1); } + } // Get or create an LSRUse. 
std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy); @@ -4834,6 +4839,7 @@ void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() { } } } + assert(Best && "Failed to find best LSRUse candidate"); LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best << " will yield profitable reuse.\n"); @@ -5740,7 +5746,8 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) { *L->getHeader()->getParent()); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache( *L->getHeader()->getParent()); - auto &LibInfo = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto &LibInfo = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI( + *L->getHeader()->getParent()); return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, LibInfo); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp index 86891eb451bb..92ad8dafa5ab 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp @@ -35,6 +35,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/PassManager.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -166,7 +167,7 @@ static bool computeUnrollAndJamCount( bool UseUpperBound = false; bool ExplicitUnroll = computeUnrollCount( L, TTI, DT, LI, SE, EphValues, ORE, OuterTripCount, MaxTripCount, - OuterTripMultiple, OuterLoopSize, UP, UseUpperBound); + /*MaxOrZero*/ false, OuterTripMultiple, OuterLoopSize, UP, UseUpperBound); if (ExplicitUnroll || UseUpperBound) { // If the user explicitly set the loop as unrolled, dont UnJ it. Leave it // for the unroller instead. @@ -293,9 +294,9 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, if (Latch != Exit || SubLoopLatch != SubLoopExit) return LoopUnrollResult::Unmodified; - TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences( - L, SE, TTI, nullptr, nullptr, OptLevel, - None, None, None, None, None, None); + TargetTransformInfo::UnrollingPreferences UP = + gatherUnrollingPreferences(L, SE, TTI, nullptr, nullptr, OptLevel, None, + None, None, None, None, None, None, None); if (AllowUnrollAndJam.getNumOccurrences() > 0) UP.UnrollAndJam = AllowUnrollAndJam; if (UnrollAndJamThreshold.getNumOccurrences() > 0) @@ -426,51 +427,76 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, return UnrollResult; } +static bool tryToUnrollAndJamLoop(Function &F, DominatorTree &DT, LoopInfo &LI, + ScalarEvolution &SE, + const TargetTransformInfo &TTI, + AssumptionCache &AC, DependenceInfo &DI, + OptimizationRemarkEmitter &ORE, + int OptLevel) { + bool DidSomething = false; + + // The loop unroll and jam pass requires loops to be in simplified form, and also needs LCSSA. + // Since simplification may add new inner loops, it has to run before the + // legality and profitability checks. This means running the loop unroll and jam pass + // will simplify all loops, regardless of whether anything end up being + // unroll and jammed. 
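The function-level driver being added here follows a common shape for running a loop transform from a function pass: canonicalize every loop first (loop-simplify form plus LCSSA), then pull loops off a priority worklist and transform them one by one. A sketch of that shape, where the Transform callback is a placeholder assumption:

#include "llvm/ADT/PriorityWorklist.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"

using namespace llvm;

// Canonicalize all loops, then visit them from a worklist, outer loops first.
static bool runOnAllLoops(LoopInfo &LI, DominatorTree &DT, ScalarEvolution &SE,
                          AssumptionCache &AC,
                          function_ref<bool(Loop *)> Transform) {
  bool Changed = false;
  for (Loop *L : LI) {
    Changed |= simplifyLoop(L, &DT, &LI, &SE, &AC, /*MSSAU=*/nullptr,
                            /*PreserveLCSSA=*/false);
    Changed |= formLCSSARecursively(*L, DT, &LI, &SE);
  }

  SmallPriorityWorklist<Loop *, 4> Worklist;
  internal::appendLoopsToWorklist(reverse(LI), Worklist);
  while (!Worklist.empty())
    Changed |= Transform(Worklist.pop_back_val());
  return Changed;
}

Simplification runs before the worklist is populated because it may create new inner loops that the legality checks need to see, which is the same ordering the patch spells out in its comment.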
+ for (auto &L : LI) { + DidSomething |= + simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */); + DidSomething |= formLCSSARecursively(*L, DT, &LI, &SE); + } + + SmallPriorityWorklist<Loop *, 4> Worklist; + internal::appendLoopsToWorklist(reverse(LI), Worklist); + while (!Worklist.empty()) { + Loop *L = Worklist.pop_back_val(); + formLCSSA(*L, DT, &LI, &SE); + LoopUnrollResult Result = + tryToUnrollAndJamLoop(L, DT, &LI, SE, TTI, AC, DI, ORE, OptLevel); + if (Result != LoopUnrollResult::Unmodified) + DidSomething = true; + } + + return DidSomething; +} + namespace { -class LoopUnrollAndJam : public LoopPass { +class LoopUnrollAndJam : public FunctionPass { public: static char ID; // Pass ID, replacement for typeid unsigned OptLevel; - LoopUnrollAndJam(int OptLevel = 2) : LoopPass(ID), OptLevel(OptLevel) { + LoopUnrollAndJam(int OptLevel = 2) : FunctionPass(ID), OptLevel(OptLevel) { initializeLoopUnrollAndJamPass(*PassRegistry::getPassRegistry()); } - bool runOnLoop(Loop *L, LPPassManager &LPM) override { - if (skipLoop(L)) + bool runOnFunction(Function &F) override { + if (skipFunction(F)) return false; - Function &F = *L->getHeader()->getParent(); - auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); auto &DI = getAnalysis<DependenceAnalysisWrapperPass>().getDI(); - // For the old PM, we can't use OptimizationRemarkEmitter as an analysis - // pass. Function analyses need to be preserved across loop transformations - // but ORE cannot be preserved (see comment before the pass definition). - OptimizationRemarkEmitter ORE(&F); - - LoopUnrollResult Result = - tryToUnrollAndJamLoop(L, DT, LI, SE, TTI, AC, DI, ORE, OptLevel); + auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); - if (Result == LoopUnrollResult::FullyUnrolled) - LPM.markLoopAsDeleted(*L); - - return Result != LoopUnrollResult::Unmodified; + return tryToUnrollAndJamLoop(F, DT, LI, SE, TTI, AC, DI, ORE, OptLevel); } /// This transformation requires natural loop information & requires that /// loop preheaders be inserted into the CFG... 
void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DependenceAnalysisWrapperPass>(); - getLoopAnalysisUsage(AU); + AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); } }; @@ -480,10 +506,13 @@ char LoopUnrollAndJam::ID = 0; INITIALIZE_PASS_BEGIN(LoopUnrollAndJam, "loop-unroll-and-jam", "Unroll and Jam loops", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(LoopPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass) +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) INITIALIZE_PASS_END(LoopUnrollAndJam, "loop-unroll-and-jam", "Unroll and Jam loops", false, false) @@ -491,26 +520,18 @@ Pass *llvm::createLoopUnrollAndJamPass(int OptLevel) { return new LoopUnrollAndJam(OptLevel); } -PreservedAnalyses LoopUnrollAndJamPass::run(Loop &L, LoopAnalysisManager &AM, - LoopStandardAnalysisResults &AR, - LPMUpdater &) { - const auto &FAM = - AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager(); - Function *F = L.getHeader()->getParent(); - - auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F); - // FIXME: This should probably be optional rather than required. - if (!ORE) - report_fatal_error( - "LoopUnrollAndJamPass: OptimizationRemarkEmitterAnalysis not cached at " - "a higher level"); - - DependenceInfo DI(F, &AR.AA, &AR.SE, &AR.LI); - - LoopUnrollResult Result = tryToUnrollAndJamLoop( - &L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, DI, *ORE, OptLevel); - - if (Result == LoopUnrollResult::Unmodified) +PreservedAnalyses LoopUnrollAndJamPass::run(Function &F, + FunctionAnalysisManager &AM) { + ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F); + LoopInfo &LI = AM.getResult<LoopAnalysis>(F); + TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F); + AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F); + DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F); + DependenceInfo &DI = AM.getResult<DependenceAnalysis>(F); + OptimizationRemarkEmitter &ORE = + AM.getResult<OptimizationRemarkEmitterAnalysis>(F); + + if (!tryToUnrollAndJamLoop(F, DT, LI, SE, TTI, AC, DI, ORE, OptLevel)) return PreservedAnalyses::all(); return getLoopPassPreservedAnalyses(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 2fa7436213dd..4c2b079c6bb5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -46,6 +46,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/PassManager.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -178,7 +179,9 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, int OptLevel, Optional<unsigned> UserThreshold, 
Optional<unsigned> UserCount, Optional<bool> UserAllowPartial, Optional<bool> UserRuntime, - Optional<bool> UserUpperBound, Optional<bool> UserAllowPeeling) { + Optional<bool> UserUpperBound, Optional<bool> UserAllowPeeling, + Optional<bool> UserAllowProfileBasedPeeling, + Optional<unsigned> UserFullUnrollMaxCount) { TargetTransformInfo::UnrollingPreferences UP; // Set up the defaults @@ -202,6 +205,7 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( UP.UpperBound = false; UP.AllowPeeling = true; UP.UnrollAndJam = false; + UP.PeelProfiledIterations = true; UP.UnrollAndJamInnerLoopThreshold = 60; // Override with any target specific settings @@ -209,7 +213,8 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( // Apply size attributes bool OptForSize = L->getHeader()->getParent()->hasOptSize() || - llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI); + llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, + PGSOQueryType::IRPass); if (OptForSize) { UP.Threshold = UP.OptSizeThreshold; UP.PartialThreshold = UP.PartialOptSizeThreshold; @@ -257,6 +262,10 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( UP.UpperBound = *UserUpperBound; if (UserAllowPeeling.hasValue()) UP.AllowPeeling = *UserAllowPeeling; + if (UserAllowProfileBasedPeeling.hasValue()) + UP.PeelProfiledIterations = *UserAllowProfileBasedPeeling; + if (UserFullUnrollMaxCount.hasValue()) + UP.FullUnrollMaxCount = *UserFullUnrollMaxCount; return UP; } @@ -730,7 +739,7 @@ bool llvm::computeUnrollCount( Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues, OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount, - unsigned &TripMultiple, unsigned LoopSize, + bool MaxOrZero, unsigned &TripMultiple, unsigned LoopSize, TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) { // Check for explicit Count. @@ -781,18 +790,34 @@ bool llvm::computeUnrollCount( // Also we need to check if we exceed FullUnrollMaxCount. // If using the upper bound to unroll, TripMultiple should be set to 1 because // we do not know when loop may exit. - // MaxTripCount and ExactTripCount cannot both be non zero since we only + + // We can unroll by the upper bound amount if it's generally allowed or if + // we know that the loop is executed either the upper bound or zero times. + // (MaxOrZero unrolling keeps only the first loop test, so the number of + // loop tests remains the same compared to the non-unrolled version, whereas + // the generic upper bound unrolling keeps all but the last loop test so the + // number of loop tests goes up which may end up being worse on targets with + // constrained branch predictor resources so is controlled by an option.) + // In addition we only unroll small upper bounds. + unsigned FullUnrollMaxTripCount = MaxTripCount; + if (!(UP.UpperBound || MaxOrZero) || + FullUnrollMaxTripCount > UnrollMaxUpperBound) + FullUnrollMaxTripCount = 0; + + // UnrollByMaxCount and ExactTripCount cannot both be non zero since we only // compute the former when the latter is zero. unsigned ExactTripCount = TripCount; - assert((ExactTripCount == 0 || MaxTripCount == 0) && - "ExtractTripCount and MaxTripCount cannot both be non zero."); - unsigned FullUnrollTripCount = ExactTripCount ? 
ExactTripCount : MaxTripCount;
+ assert((ExactTripCount == 0 || FullUnrollMaxTripCount == 0) &&
+ "ExactTripCount and UnrollByMaxCount cannot both be non zero.");
+
+ unsigned FullUnrollTripCount =
+ ExactTripCount ? ExactTripCount : FullUnrollMaxTripCount;
UP.Count = FullUnrollTripCount;
if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) {
// When computing the unrolled size, note that BEInsns are not replicated
// like the rest of the loop body.
if (getUnrolledLoopSize(LoopSize, UP) < UP.Threshold) {
- UseUpperBound = (MaxTripCount == FullUnrollTripCount);
+ UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
TripCount = FullUnrollTripCount;
TripMultiple = UP.UpperBound ? 1 : TripMultiple;
return ExplicitUnroll;
@@ -806,7 +831,7 @@ bool llvm::computeUnrollCount(
unsigned Boost =
getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
if (Cost->UnrolledCost < UP.Threshold * Boost / 100) {
- UseUpperBound = (MaxTripCount == FullUnrollTripCount);
+ UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
TripCount = FullUnrollTripCount;
TripMultiple = UP.UpperBound ? 1 : TripMultiple;
return ExplicitUnroll;
@@ -882,6 +907,8 @@ bool llvm::computeUnrollCount(
"because "
"unrolled size is too large.";
});
+ LLVM_DEBUG(dbgs() << " partially unrolling with count: " << UP.Count
+ << "\n");
return ExplicitUnroll;
}
assert(TripCount == 0 &&
@@ -903,6 +930,12 @@ bool llvm::computeUnrollCount(
return false;
}
+ // Don't unroll a small upper bound loop unless user or TTI asked to do so.
+ if (MaxTripCount && !UP.Force && MaxTripCount < UnrollMaxUpperBound) {
+ UP.Count = 0;
+ return false;
+ }
+
// Check if the runtime trip count is too small when profile is available.
if (L->getHeader()->getParent()->hasProfileData()) {
if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {
@@ -966,7 +999,11 @@ bool llvm::computeUnrollCount(
if (UP.Count > UP.MaxCount)
UP.Count = UP.MaxCount;
- LLVM_DEBUG(dbgs() << " partially unrolling with count: " << UP.Count
+
+ if (MaxTripCount && UP.Count > MaxTripCount)
+ UP.Count = MaxTripCount;
+
+ LLVM_DEBUG(dbgs() << " runtime unrolling with count: " << UP.Count
<< "\n");
if (UP.Count < 2)
UP.Count = 0;
@@ -976,13 +1013,14 @@ static LoopUnrollResult tryToUnrollLoop(
Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
const TargetTransformInfo &TTI, AssumptionCache &AC,
- OptimizationRemarkEmitter &ORE,
- BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
- bool PreserveLCSSA, int OptLevel,
+ OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI,
+ ProfileSummaryInfo *PSI, bool PreserveLCSSA, int OptLevel,
bool OnlyWhenForced, bool ForgetAllSCEV, Optional<unsigned> ProvidedCount,
Optional<unsigned> ProvidedThreshold, Optional<bool> ProvidedAllowPartial,
Optional<bool> ProvidedRuntime, Optional<bool> ProvidedUpperBound,
- Optional<bool> ProvidedAllowPeeling) {
+ Optional<bool> ProvidedAllowPeeling,
+ Optional<bool> ProvidedAllowProfileBasedPeeling,
+ Optional<unsigned> ProvidedFullUnrollMaxCount) {
LLVM_DEBUG(dbgs() << "Loop Unroll: F["
<< L->getHeader()->getParent()->getName() << "] Loop %"
<< L->getHeader()->getName() << "\n");
@@ -1007,7 +1045,8 @@ static LoopUnrollResult tryToUnrollLoop(
TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
L, SE, TTI, BFI, PSI, OptLevel, ProvidedThreshold, ProvidedCount,
ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
- ProvidedAllowPeeling);
+ ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling,
+
ProvidedFullUnrollMaxCount); // Exit early if unrolling is disabled. For OptForSize, we pick the loop size // as threshold later on. @@ -1028,10 +1067,10 @@ static LoopUnrollResult tryToUnrollLoop( return LoopUnrollResult::Unmodified; } - // When optimizing for size, use LoopSize as threshold, to (fully) unroll - // loops, if it does not increase code size. + // When optimizing for size, use LoopSize + 1 as threshold (we use < Threshold + // later), to (fully) unroll loops, if it does not increase code size. if (OptForSize) - UP.Threshold = std::max(UP.Threshold, LoopSize); + UP.Threshold = std::max(UP.Threshold, LoopSize + 1); if (NumInlineCandidates != 0) { LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n"); @@ -1040,7 +1079,6 @@ static LoopUnrollResult tryToUnrollLoop( // Find trip count and trip multiple if count is not available unsigned TripCount = 0; - unsigned MaxTripCount = 0; unsigned TripMultiple = 1; // If there are multiple exiting blocks but one of them is the latch, use the // latch for the trip count estimation. Otherwise insist on a single exiting @@ -1070,28 +1108,18 @@ static LoopUnrollResult tryToUnrollLoop( // Try to find the trip count upper bound if we cannot find the exact trip // count. + unsigned MaxTripCount = 0; bool MaxOrZero = false; if (!TripCount) { MaxTripCount = SE.getSmallConstantMaxTripCount(L); MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L); - // We can unroll by the upper bound amount if it's generally allowed or if - // we know that the loop is executed either the upper bound or zero times. - // (MaxOrZero unrolling keeps only the first loop test, so the number of - // loop tests remains the same compared to the non-unrolled version, whereas - // the generic upper bound unrolling keeps all but the last loop test so the - // number of loop tests goes up which may end up being worse on targets with - // constrained branch predictor resources so is controlled by an option.) - // In addition we only unroll small upper bounds. - if (!(UP.UpperBound || MaxOrZero) || MaxTripCount > UnrollMaxUpperBound) { - MaxTripCount = 0; - } } // computeUnrollCount() decides whether it is beneficial to use upper bound to // fully unroll the loop. bool UseUpperBound = false; bool IsCountSetExplicitly = computeUnrollCount( - L, TTI, DT, LI, SE, EphValues, &ORE, TripCount, MaxTripCount, + L, TTI, DT, LI, SE, EphValues, &ORE, TripCount, MaxTripCount, MaxOrZero, TripMultiple, LoopSize, UP, UseUpperBound); if (!UP.Count) return LoopUnrollResult::Unmodified; @@ -1139,7 +1167,7 @@ static LoopUnrollResult tryToUnrollLoop( // If the loop was peeled, we already "used up" the profile information // we had, so we don't want to unroll or peel again. 
if (UnrollResult != LoopUnrollResult::FullyUnrolled && - (IsCountSetExplicitly || UP.PeelCount)) + (IsCountSetExplicitly || (UP.PeelProfiledIterations && UP.PeelCount))) L->setLoopAlreadyUnrolled(); return UnrollResult; @@ -1169,18 +1197,24 @@ public: Optional<bool> ProvidedRuntime; Optional<bool> ProvidedUpperBound; Optional<bool> ProvidedAllowPeeling; + Optional<bool> ProvidedAllowProfileBasedPeeling; + Optional<unsigned> ProvidedFullUnrollMaxCount; LoopUnroll(int OptLevel = 2, bool OnlyWhenForced = false, bool ForgetAllSCEV = false, Optional<unsigned> Threshold = None, Optional<unsigned> Count = None, Optional<bool> AllowPartial = None, Optional<bool> Runtime = None, Optional<bool> UpperBound = None, - Optional<bool> AllowPeeling = None) + Optional<bool> AllowPeeling = None, + Optional<bool> AllowProfileBasedPeeling = None, + Optional<unsigned> ProvidedFullUnrollMaxCount = None) : LoopPass(ID), OptLevel(OptLevel), OnlyWhenForced(OnlyWhenForced), ForgetAllSCEV(ForgetAllSCEV), ProvidedCount(std::move(Count)), ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial), ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound), - ProvidedAllowPeeling(AllowPeeling) { + ProvidedAllowPeeling(AllowPeeling), + ProvidedAllowProfileBasedPeeling(AllowProfileBasedPeeling), + ProvidedFullUnrollMaxCount(ProvidedFullUnrollMaxCount) { initializeLoopUnrollPass(*PassRegistry::getPassRegistry()); } @@ -1203,10 +1237,11 @@ public: bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); LoopUnrollResult Result = tryToUnrollLoop( - L, DT, LI, SE, TTI, AC, ORE, nullptr, nullptr, - PreserveLCSSA, OptLevel, OnlyWhenForced, - ForgetAllSCEV, ProvidedCount, ProvidedThreshold, ProvidedAllowPartial, - ProvidedRuntime, ProvidedUpperBound, ProvidedAllowPeeling); + L, DT, LI, SE, TTI, AC, ORE, nullptr, nullptr, PreserveLCSSA, OptLevel, + OnlyWhenForced, ForgetAllSCEV, ProvidedCount, ProvidedThreshold, + ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound, + ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling, + ProvidedFullUnrollMaxCount); if (Result == LoopUnrollResult::FullyUnrolled) LPM.markLoopAsDeleted(*L); @@ -1283,14 +1318,16 @@ PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM, std::string LoopName = L.getName(); - bool Changed = - tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE, - /*BFI*/ nullptr, /*PSI*/ nullptr, - /*PreserveLCSSA*/ true, OptLevel, OnlyWhenForced, - ForgetSCEV, /*Count*/ None, - /*Threshold*/ None, /*AllowPartial*/ false, - /*Runtime*/ false, /*UpperBound*/ false, - /*AllowPeeling*/ false) != LoopUnrollResult::Unmodified; + bool Changed = tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE, + /*BFI*/ nullptr, /*PSI*/ nullptr, + /*PreserveLCSSA*/ true, OptLevel, + OnlyWhenForced, ForgetSCEV, /*Count*/ None, + /*Threshold*/ None, /*AllowPartial*/ false, + /*Runtime*/ false, /*UpperBound*/ false, + /*AllowPeeling*/ false, + /*AllowProfileBasedPeeling*/ false, + /*FullUnrollMaxCount*/ None) != + LoopUnrollResult::Unmodified; if (!Changed) return PreservedAnalyses::all(); @@ -1430,7 +1467,8 @@ PreservedAnalyses LoopUnrollPass::run(Function &F, /*PreserveLCSSA*/ true, UnrollOpts.OptLevel, UnrollOpts.OnlyWhenForced, UnrollOpts.ForgetSCEV, /*Count*/ None, /*Threshold*/ None, UnrollOpts.AllowPartial, UnrollOpts.AllowRuntime, - UnrollOpts.AllowUpperBound, LocalAllowPeeling); + UnrollOpts.AllowUpperBound, LocalAllowPeeling, + UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount); Changed |= Result != LoopUnrollResult::Unmodified; 
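A minimal standalone model (plain C++ with illustrative names, not the LLVM API) of the gating the two new options introduce above: a loop is marked as already unrolled only when the count was explicit, or when profile-based peeling is allowed and actually peeled iterations.

#include <cstdio>

struct UnrollPrefs {
  bool PeelProfiledIterations; // mirrors UP.PeelProfiledIterations
  unsigned PeelCount;          // mirrors UP.PeelCount
};

// Mirrors: UnrollResult != FullyUnrolled &&
//          (IsCountSetExplicitly || (UP.PeelProfiledIterations && UP.PeelCount))
static bool shouldPinLoop(bool FullyUnrolled, bool CountSetExplicitly,
                          const UnrollPrefs &UP) {
  return !FullyUnrolled &&
         (CountSetExplicitly ||
          (UP.PeelProfiledIterations && UP.PeelCount != 0));
}

int main() {
  UnrollPrefs ProfilePeel{true, 3}, PeelDisallowed{false, 3};
  std::printf("%d %d\n", shouldPinLoop(false, false, ProfilePeel),
              shouldPinLoop(false, false, PeelDisallowed)); // prints: 1 0
}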
// The parent must not be damaged by unrolling! diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp index b5b8e720069c..915e053704b2 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -59,6 +59,7 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -420,7 +421,8 @@ enum OperatorChain { /// cost of creating an entirely new loop. static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed, OperatorChain &ParentChain, - DenseMap<Value *, Value *> &Cache) { + DenseMap<Value *, Value *> &Cache, + MemorySSAUpdater *MSSAU) { auto CacheIt = Cache.find(Cond); if (CacheIt != Cache.end()) return CacheIt->second; @@ -438,7 +440,7 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed, // TODO: Handle: br (VARIANT|INVARIANT). // Hoist simple values out. - if (L->makeLoopInvariant(Cond, Changed)) { + if (L->makeLoopInvariant(Cond, Changed, nullptr, MSSAU)) { Cache[Cond] = Cond; return Cond; } @@ -478,7 +480,7 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed, // which will cause the branch to go away in one loop and the condition to // simplify in the other one. if (Value *LHS = FindLIVLoopCondition(BO->getOperand(0), L, Changed, - ParentChain, Cache)) { + ParentChain, Cache, MSSAU)) { Cache[Cond] = LHS; return LHS; } @@ -486,7 +488,7 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed, // operand(1). ParentChain = NewChain; if (Value *RHS = FindLIVLoopCondition(BO->getOperand(1), L, Changed, - ParentChain, Cache)) { + ParentChain, Cache, MSSAU)) { Cache[Cond] = RHS; return RHS; } @@ -500,12 +502,12 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed, /// Cond is a condition that occurs in L. If it is invariant in the loop, or has /// an invariant piece, return the invariant along with the operator chain type. /// Otherwise, return null. -static std::pair<Value *, OperatorChain> FindLIVLoopCondition(Value *Cond, - Loop *L, - bool &Changed) { +static std::pair<Value *, OperatorChain> +FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed, + MemorySSAUpdater *MSSAU) { DenseMap<Value *, Value *> Cache; OperatorChain OpChain = OC_OpChainNone; - Value *FCond = FindLIVLoopCondition(Cond, L, Changed, OpChain, Cache); + Value *FCond = FindLIVLoopCondition(Cond, L, Changed, OpChain, Cache, MSSAU); // In case we do find a LIV, it can not be obtained by walking up a mixed // operator chain. 
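Since the cache-and-chain logic above is spread over two overloads, a compact standalone model may help (plain C++ with illustrative types, not the LLVM API): a memoized walk up an and/or chain that declines to return an invariant found through a mixed chain, matching the mixed-operator-chain check described above.

#include <unordered_map>

// Illustrative stand-ins, not LLVM types.
struct Expr {
  enum Kind { Leaf, And, Or } K = Leaf;
  Expr *L = nullptr, *R = nullptr;
  bool Invariant = false; // models "loop invariant"
};

enum class Chain { None, And, Or, Mixed };

// Memoized search for an invariant sub-expression; gives up rather than
// return a value reached through a mixed and/or chain.
Expr *findInvariant(Expr *E, Chain &C,
                    std::unordered_map<Expr *, Expr *> &Cache) {
  auto It = Cache.find(E);
  if (It != Cache.end())
    return It->second;
  if (E->Invariant)
    return Cache[E] = E;
  if (E->K == Expr::Leaf)
    return Cache[E] = nullptr;

  Chain This = (E->K == Expr::And) ? Chain::And : Chain::Or;
  if (C != Chain::None && C != This)
    return Cache[E] = nullptr; // mixed chain: a hit here would be unusable
  C = This;
  if (Expr *Hit = findInvariant(E->L, C, Cache))
    return Cache[E] = Hit;
  if (Expr *Hit = findInvariant(E->R, C, Cache))
    return Cache[E] = Hit;
  return Cache[E] = nullptr;
}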
@@ -525,7 +527,7 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); if (EnableMSSALoopDependency) { MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA(); - MSSAU = make_unique<MemorySSAUpdater>(MSSA); + MSSAU = std::make_unique<MemorySSAUpdater>(MSSA); assert(DT && "Cannot update MemorySSA without a valid DomTree."); } currentLoop = L; @@ -682,7 +684,7 @@ bool LoopUnswitch::processCurrentLoop() { for (auto &I : *BB) { auto CS = CallSite(&I); if (!CS) continue; - if (CS.hasFnAttr(Attribute::Convergent)) + if (CS.isConvergent()) return false; if (auto *II = dyn_cast<InvokeInst>(&I)) if (!II->getUnwindDest()->canSplitPredecessors()) @@ -694,8 +696,9 @@ bool LoopUnswitch::processCurrentLoop() { } for (IntrinsicInst *Guard : Guards) { - Value *LoopCond = - FindLIVLoopCondition(Guard->getOperand(0), currentLoop, Changed).first; + Value *LoopCond = FindLIVLoopCondition(Guard->getOperand(0), currentLoop, + Changed, MSSAU.get()) + .first; if (LoopCond && UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) { // NB! Unswitching (if successful) could have erased some of the @@ -735,8 +738,9 @@ bool LoopUnswitch::processCurrentLoop() { if (BI->isConditional()) { // See if this, or some part of it, is loop invariant. If so, we can // unswitch on it if we desire. - Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), - currentLoop, Changed).first; + Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), currentLoop, + Changed, MSSAU.get()) + .first; if (LoopCond && !EqualityPropUnSafe(*LoopCond) && UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context), TI)) { ++NumBranches; @@ -748,7 +752,7 @@ bool LoopUnswitch::processCurrentLoop() { Value *LoopCond; OperatorChain OpChain; std::tie(LoopCond, OpChain) = - FindLIVLoopCondition(SC, currentLoop, Changed); + FindLIVLoopCondition(SC, currentLoop, Changed, MSSAU.get()); unsigned NumCases = SI->getNumCases(); if (LoopCond && NumCases) { @@ -808,8 +812,9 @@ bool LoopUnswitch::processCurrentLoop() { for (BasicBlock::iterator BBI = (*I)->begin(), E = (*I)->end(); BBI != E; ++BBI) if (SelectInst *SI = dyn_cast<SelectInst>(BBI)) { - Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), - currentLoop, Changed).first; + Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), currentLoop, + Changed, MSSAU.get()) + .first; if (LoopCond && UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) { ++NumSelects; @@ -1123,8 +1128,9 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) { if (!BI->isConditional()) return false; - Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), - currentLoop, Changed).first; + Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), currentLoop, + Changed, MSSAU.get()) + .first; // Unswitch only if the trivial condition itself is an LIV (not // partial LIV which could occur in and/or) @@ -1157,8 +1163,9 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) { return true; } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) { // If this isn't switching on an invariant condition, we can't unswitch it. 
- Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), - currentLoop, Changed).first; + Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), currentLoop, + Changed, MSSAU.get()) + .first; // Unswitch only if the trivial condition itself is an LIV (not // partial LIV which could occur in and/or) @@ -1240,6 +1247,9 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, LoopBlocks.clear(); NewBlocks.clear(); + if (MSSAU && VerifyMemorySSA) + MSSA->verifyMemorySSA(); + // First step, split the preheader and exit blocks, and add these blocks to // the LoopBlocks list. BasicBlock *NewPreheader = @@ -1607,36 +1617,30 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { // If BI's parent is the only pred of the successor, fold the two blocks // together. BasicBlock *Pred = BI->getParent(); + (void)Pred; BasicBlock *Succ = BI->getSuccessor(0); BasicBlock *SinglePred = Succ->getSinglePredecessor(); if (!SinglePred) continue; // Nothing to do. assert(SinglePred == Pred && "CFG broken"); - LLVM_DEBUG(dbgs() << "Merging blocks: " << Pred->getName() << " <- " - << Succ->getName() << "\n"); - - // Resolve any single entry PHI nodes in Succ. - while (PHINode *PN = dyn_cast<PHINode>(Succ->begin())) - ReplaceUsesOfWith(PN, PN->getIncomingValue(0), Worklist, L, LPM, - MSSAU.get()); - - // If Succ has any successors with PHI nodes, update them to have - // entries coming from Pred instead of Succ. - Succ->replaceAllUsesWith(Pred); - - // Move all of the successor contents from Succ to Pred. - Pred->getInstList().splice(BI->getIterator(), Succ->getInstList(), - Succ->begin(), Succ->end()); - if (MSSAU) - MSSAU->moveAllAfterMergeBlocks(Succ, Pred, BI); + // Make the LPM and Worklist updates specific to LoopUnswitch. LPM->deleteSimpleAnalysisValue(BI, L); RemoveFromWorklist(BI, Worklist); - BI->eraseFromParent(); - - // Remove Succ from the loop tree. - LI->removeBlock(Succ); LPM->deleteSimpleAnalysisValue(Succ, L); - Succ->eraseFromParent(); + auto SuccIt = Succ->begin(); + while (PHINode *PN = dyn_cast<PHINode>(SuccIt++)) { + for (unsigned It = 0, E = PN->getNumOperands(); It != E; ++It) + if (Instruction *Use = dyn_cast<Instruction>(PN->getOperand(It))) + Worklist.push_back(Use); + for (User *U : PN->users()) + Worklist.push_back(cast<Instruction>(U)); + LPM->deleteSimpleAnalysisValue(PN, L); + RemoveFromWorklist(PN, Worklist); + ++NumSimplify; + } + // Merge the block and make the remaining analyses updates. + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + MergeBlockIntoPredecessor(Succ, &DTU, LI, MSSAU.get()); ++NumSimplify; continue; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp index 896dd8bcb922..7b9af527d444 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp @@ -79,6 +79,7 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -112,37 +113,6 @@ static cl::opt<unsigned> LVLoopDepthThreshold( "LoopVersioningLICM's threshold for maximum allowed loop nest/depth"), cl::init(2), cl::Hidden); -/// Create MDNode for input string. 
-static MDNode *createStringMetadata(Loop *TheLoop, StringRef Name, unsigned V) {
- LLVMContext &Context = TheLoop->getHeader()->getContext();
- Metadata *MDs[] = {
- MDString::get(Context, Name),
- ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Context), V))};
- return MDNode::get(Context, MDs);
-}
-
-/// Set input string into loop metadata by keeping other values intact.
-void llvm::addStringMetadataToLoop(Loop *TheLoop, const char *MDString,
- unsigned V) {
- SmallVector<Metadata *, 4> MDs(1);
- // If the loop already has metadata, retain it.
- MDNode *LoopID = TheLoop->getLoopID();
- if (LoopID) {
- for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
- MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
- MDs.push_back(Node);
- }
- }
- // Add new metadata.
- MDs.push_back(createStringMetadata(TheLoop, MDString, V));
- // Replace current metadata node with new one.
- LLVMContext &Context = TheLoop->getHeader()->getContext();
- MDNode *NewLoopID = MDNode::get(Context, MDs);
- // Set operand 0 to refer to the loop id itself.
- NewLoopID->replaceOperandWith(0, NewLoopID);
- TheLoop->setLoopID(NewLoopID);
-}
-
namespace {
struct LoopVersioningLICM : public LoopPass {
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerAtomic.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerAtomic.cpp
index e076424d9042..ab7b85e89e7b 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerAtomic.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerAtomic.cpp
@@ -14,6 +14,7 @@
#include "llvm/Transforms/Scalar/LowerAtomic.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Scalar.h"
using namespace llvm;
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
new file mode 100644
index 000000000000..21c6c32e8e02
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
@@ -0,0 +1,171 @@
+//===- LowerConstantIntrinsics.cpp - Lower constant intrinsic calls -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers all remaining 'objectsize' and 'is.constant' intrinsic calls
+// and provides constant propagation and basic CFG cleanup on the result.
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" + +using namespace llvm; +using namespace llvm::PatternMatch; + +#define DEBUG_TYPE "lower-is-constant-intrinsic" + +STATISTIC(IsConstantIntrinsicsHandled, + "Number of 'is.constant' intrinsic calls handled"); +STATISTIC(ObjectSizeIntrinsicsHandled, + "Number of 'objectsize' intrinsic calls handled"); + +static Value *lowerIsConstantIntrinsic(IntrinsicInst *II) { + Value *Op = II->getOperand(0); + + return isa<Constant>(Op) ? ConstantInt::getTrue(II->getType()) + : ConstantInt::getFalse(II->getType()); +} + +static bool replaceConditionalBranchesOnConstant(Instruction *II, + Value *NewValue) { + bool HasDeadBlocks = false; + SmallSetVector<Instruction *, 8> Worklist; + replaceAndRecursivelySimplify(II, NewValue, nullptr, nullptr, nullptr, + &Worklist); + for (auto I : Worklist) { + BranchInst *BI = dyn_cast<BranchInst>(I); + if (!BI) + continue; + if (BI->isUnconditional()) + continue; + + BasicBlock *Target, *Other; + if (match(BI->getOperand(0), m_Zero())) { + Target = BI->getSuccessor(1); + Other = BI->getSuccessor(0); + } else if (match(BI->getOperand(0), m_One())) { + Target = BI->getSuccessor(0); + Other = BI->getSuccessor(1); + } else { + Target = nullptr; + Other = nullptr; + } + if (Target && Target != Other) { + BasicBlock *Source = BI->getParent(); + Other->removePredecessor(Source); + BI->eraseFromParent(); + BranchInst::Create(Target, Source); + if (pred_begin(Other) == pred_end(Other)) + HasDeadBlocks = true; + } + } + return HasDeadBlocks; +} + +static bool lowerConstantIntrinsics(Function &F, const TargetLibraryInfo *TLI) { + bool HasDeadBlocks = false; + const auto &DL = F.getParent()->getDataLayout(); + SmallVector<WeakTrackingVH, 8> Worklist; + + ReversePostOrderTraversal<Function *> RPOT(&F); + for (BasicBlock *BB : RPOT) { + for (Instruction &I: *BB) { + IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I); + if (!II) + continue; + switch (II->getIntrinsicID()) { + default: + break; + case Intrinsic::is_constant: + case Intrinsic::objectsize: + Worklist.push_back(WeakTrackingVH(&I)); + break; + } + } + } + for (WeakTrackingVH &VH: Worklist) { + // Items on the worklist can be mutated by earlier recursive replaces. + // This can remove the intrinsic as dead (VH == null), but also replace + // the intrinsic in place. 
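+ // (WeakTrackingVH goes null when the tracked instruction is deleted and
+ // follows replaceAllUsesWith, so both a folded-away intrinsic and one
+ // replaced by a non-intrinsic value are skipped by the checks below.)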
+ if (!VH)
+ continue;
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(&*VH);
+ if (!II)
+ continue;
+ Value *NewValue;
+ switch (II->getIntrinsicID()) {
+ default:
+ continue;
+ case Intrinsic::is_constant:
+ NewValue = lowerIsConstantIntrinsic(II);
+ IsConstantIntrinsicsHandled++;
+ break;
+ case Intrinsic::objectsize:
+ NewValue = lowerObjectSizeCall(II, DL, TLI, true);
+ ObjectSizeIntrinsicsHandled++;
+ break;
+ }
+ HasDeadBlocks |= replaceConditionalBranchesOnConstant(II, NewValue);
+ }
+ if (HasDeadBlocks)
+ removeUnreachableBlocks(F);
+ return !Worklist.empty();
+}
+
+PreservedAnalyses
+LowerConstantIntrinsicsPass::run(Function &F, FunctionAnalysisManager &AM) {
+ if (lowerConstantIntrinsics(F, AM.getCachedResult<TargetLibraryAnalysis>(F)))
+ return PreservedAnalyses::none();
+
+ return PreservedAnalyses::all();
+}
+
+namespace {
+/// Legacy pass for lowering is.constant intrinsics out of the IR.
+///
+/// When this pass is run over a function it converts is.constant intrinsics
+/// into 'true' or 'false'. This complements the normal constant folding
+/// to 'true' as part of Instruction Simplify passes.
+class LowerConstantIntrinsics : public FunctionPass {
+public:
+ static char ID;
+ LowerConstantIntrinsics() : FunctionPass(ID) {
+ initializeLowerConstantIntrinsicsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ const TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
+ return lowerConstantIntrinsics(F, TLI);
+ }
+};
+} // namespace
+
+char LowerConstantIntrinsics::ID = 0;
+INITIALIZE_PASS(LowerConstantIntrinsics, "lower-constant-intrinsics",
+ "Lower constant intrinsics", false, false)
+
+FunctionPass *llvm::createLowerConstantIntrinsicsPass() {
+ return new LowerConstantIntrinsics();
+}
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
index 0d67c0d740ec..53671c7bc3d1 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
@@ -22,10 +22,12 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
+#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/MisExpect.h"

using namespace llvm;

@@ -71,15 +73,20 @@ static bool handleSwitchExpect(SwitchInst &SI) {
unsigned n = SI.getNumCases(); // +1 for default case.
SmallVector<uint32_t, 16> Weights(n + 1, UnlikelyBranchWeight);
- if (Case == *SI.case_default())
- Weights[0] = LikelyBranchWeight;
- else
- Weights[Case.getCaseIndex() + 1] = LikelyBranchWeight;
+ uint64_t Index = (Case == *SI.case_default()) ?
0 : Case.getCaseIndex() + 1; + Weights[Index] = LikelyBranchWeight; + + SI.setMetadata( + LLVMContext::MD_misexpect, + MDBuilder(CI->getContext()) + .createMisExpect(Index, LikelyBranchWeight, UnlikelyBranchWeight)); + + SI.setCondition(ArgValue); + misexpect::checkFrontendInstrumentation(SI); SI.setMetadata(LLVMContext::MD_prof, MDBuilder(CI->getContext()).createBranchWeights(Weights)); - SI.setCondition(ArgValue); return true; } @@ -155,7 +162,7 @@ static void handlePhiDef(CallInst *Expect) { return Result; }; - auto *PhiDef = dyn_cast<PHINode>(V); + auto *PhiDef = cast<PHINode>(V); // Get the first dominating conditional branch of the operand // i's incoming block. @@ -280,19 +287,28 @@ template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) { MDBuilder MDB(CI->getContext()); MDNode *Node; + MDNode *ExpNode; if ((ExpectedValue->getZExtValue() == ValueComparedTo) == - (Predicate == CmpInst::ICMP_EQ)) + (Predicate == CmpInst::ICMP_EQ)) { Node = MDB.createBranchWeights(LikelyBranchWeight, UnlikelyBranchWeight); - else + ExpNode = MDB.createMisExpect(0, LikelyBranchWeight, UnlikelyBranchWeight); + } else { Node = MDB.createBranchWeights(UnlikelyBranchWeight, LikelyBranchWeight); + ExpNode = MDB.createMisExpect(1, LikelyBranchWeight, UnlikelyBranchWeight); + } - BSI.setMetadata(LLVMContext::MD_prof, Node); + BSI.setMetadata(LLVMContext::MD_misexpect, ExpNode); if (CmpI) CmpI->setOperand(0, ArgValue); else BSI.setCondition(ArgValue); + + misexpect::checkFrontendInstrumentation(BSI); + + BSI.setMetadata(LLVMContext::MD_prof, Node); + return true; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp index 9489e01774d6..45f5929e3b90 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp @@ -21,6 +21,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/GuardUtils.h" @@ -60,7 +61,7 @@ static bool lowerGuardIntrinsic(Function &F) { DeoptIntrinsic->setCallingConv(GuardDecl->getCallingConv()); for (auto *CI : ToLower) { - makeGuardControlFlowExplicit(DeoptIntrinsic, CI); + makeGuardControlFlowExplicit(DeoptIntrinsic, CI, false); CI->eraseFromParent(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp new file mode 100644 index 000000000000..0ff6ee8bcfcc --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -0,0 +1,894 @@ +//===- LowerMatrixIntrinsics.cpp - Lower matrix intrinsics -----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Lower matrix intrinsics to vector operations. +// +// TODO: +// * Implement multiply & add fusion +// * Add remark, summarizing the available matrix optimization opportunities. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h"
+#include "llvm/ADT/GraphTraits.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "lower-matrix-intrinsics"
+
+static cl::opt<bool> EnableShapePropagation("matrix-propagate-shape",
+ cl::init(true));
+
+static cl::opt<bool> AllowContractEnabled(
+ "matrix-allow-contract", cl::init(false), cl::Hidden,
+ cl::desc("Allow the use of FMAs if available and profitable. This may "
+ "result in different results, due to less rounding error."));
+
+namespace {
+
+// Given an element pointer \p BasePtr to the start of a (sub) matrix, compute
+// the start address of column \p Col with type (\p EltType x \p NumRows)
+// assuming \p Stride elements between the starts of two consecutive columns.
+// \p Stride must be >= \p NumRows.
+//
+// Consider a 4x4 matrix like below
+//
+// 0 1 2 3
+// 0 v_0_0 v_0_1 v_0_2 v_0_3
+// 1 v_1_0 v_1_1 v_1_2 v_1_3
+// 2 v_2_0 v_2_1 v_2_2 v_2_3
+// 3 v_3_0 v_3_1 v_3_2 v_3_3

+// To compute the column addresses for a 2x3 sub-matrix at row 1 and column 1,
+// we need a pointer to the first element of the submatrix as base pointer.
+// Then we can use computeColumnAddr to compute the addresses for the columns
+// of the sub-matrix.
+//
+// Column 0: computeColumnAddr(Base, 0 (column), 4 (stride), 2 (num rows), ..)
+// -> just returns Base
+// Column 1: computeColumnAddr(Base, 1 (column), 4 (stride), 2 (num rows), ..)
+// -> returns Base + (1 * 4)
+// Column 2: computeColumnAddr(Base, 2 (column), 4 (stride), 2 (num rows), ..)
+// -> returns Base + (2 * 4)
+//
+// The graphic below illustrates the number of elements in a column (marked
+// with |) and the number of skipped elements (marked with {).
+//
+// v_0_0 v_0_1 {v_0_2 {v_0_3
+// Base Col 1 Col 2
+// | | |
+// v_1_0 |v_1_1 |v_1_2 |v_1_3
+// v_2_0 |v_2_1 |v_2_2 |v_2_3
+// v_3_0 {v_3_1 {v_3_2 v_3_3
+//
+Value *computeColumnAddr(Value *BasePtr, Value *Col, Value *Stride,
+ unsigned NumRows, Type *EltType,
+ IRBuilder<> &Builder) {
+
+ assert((!isa<ConstantInt>(Stride) ||
+ cast<ConstantInt>(Stride)->getZExtValue() >= NumRows) &&
+ "Stride must be >= the number of rows.");
+ unsigned AS = cast<PointerType>(BasePtr->getType())->getAddressSpace();
+
+ // Compute the start of the column with index Col as Col * Stride.
+ Value *ColumnStart = Builder.CreateMul(Col, Stride, "col.start");
+
+ // Get pointer to the start of the selected column. Skip GEP creation,
+ // if we select column 0.
+ if (isa<ConstantInt>(ColumnStart) && cast<ConstantInt>(ColumnStart)->isZero())
+ ColumnStart = BasePtr;
+ else
+ ColumnStart = Builder.CreateGEP(EltType, BasePtr, ColumnStart, "col.gep");
+
+ // Cast elementwise column start pointer to a pointer to a column
+ // (EltType x NumRows)*.
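+ // For example, with EltType == double and NumRows == 4 the column pointer
+ // has type <4 x double>*, in the same address space AS as BasePtr.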
+ Type *ColumnType = VectorType::get(EltType, NumRows);
+ Type *ColumnPtrType = PointerType::get(ColumnType, AS);
+ return Builder.CreatePointerCast(ColumnStart, ColumnPtrType, "col.cast");
+}
+
+/// LowerMatrixIntrinsics contains the methods used to lower matrix intrinsics.
+///
+/// Currently, the lowering for each matrix intrinsic is done as follows:
+/// 1. Propagate the shape information from intrinsics to connected
+/// instructions.
+/// 2. Lower instructions with shape information.
+/// 2.1. Get column vectors for each argument. If we already lowered the
+/// definition of an argument, use the produced column vectors directly.
+/// If not, split the operand vector containing an embedded matrix into
+/// a set of column vectors,
+/// 2.2. Lower the instruction in terms of columnwise operations, which yields
+/// a set of column vectors containing the result matrix. Note that we lower
+/// all instructions that have shape information. Besides the intrinsics,
+/// this includes stores for example.
+/// 2.3. Update uses of the lowered instruction. If we have shape information
+/// for a user, there is nothing to do, as we will look up the result
+/// column matrix when lowering the user. For other uses, we embed the
+/// result matrix in a flat vector and update the use.
+/// 2.4. Cache the result column matrix for the instruction we lowered.
+/// 3. After we lowered all instructions in a function, remove the now
+/// obsolete instructions.
+///
+class LowerMatrixIntrinsics {
+ Function &Func;
+ const DataLayout &DL;
+ const TargetTransformInfo &TTI;
+
+ /// Wrapper class representing a matrix as a set of column vectors.
+ /// All column vectors must have the same vector type.
+ class ColumnMatrixTy {
+ SmallVector<Value *, 16> Columns;
+
+ public:
+ ColumnMatrixTy() : Columns() {}
+ ColumnMatrixTy(ArrayRef<Value *> Cols)
+ : Columns(Cols.begin(), Cols.end()) {}
+
+ Value *getColumn(unsigned i) const { return Columns[i]; }
+
+ void setColumn(unsigned i, Value *V) { Columns[i] = V; }
+
+ size_t getNumColumns() const { return Columns.size(); }
+ size_t getNumRows() const {
+ assert(Columns.size() > 0 && "Cannot call getNumRows without columns");
+ return cast<VectorType>(Columns[0]->getType())->getNumElements();
+ }
+
+ const SmallVectorImpl<Value *> &getColumnVectors() const { return Columns; }
+
+ SmallVectorImpl<Value *> &getColumnVectors() { return Columns; }
+
+ void addColumn(Value *V) { Columns.push_back(V); }
+
+ iterator_range<SmallVector<Value *, 8>::iterator> columns() {
+ return make_range(Columns.begin(), Columns.end());
+ }
+
+ /// Embed the columns of the matrix into a flat vector by concatenating
+ /// them.
+ Value *embedInVector(IRBuilder<> &Builder) const {
+ return Columns.size() == 1 ? Columns[0]
+ : concatenateVectors(Builder, Columns);
+ }
+ };
+
+ struct ShapeInfo {
+ unsigned NumRows;
+ unsigned NumColumns;
+
+ ShapeInfo(unsigned NumRows = 0, unsigned NumColumns = 0)
+ : NumRows(NumRows), NumColumns(NumColumns) {}
+
+ ShapeInfo(Value *NumRows, Value *NumColumns)
+ : NumRows(cast<ConstantInt>(NumRows)->getZExtValue()),
+ NumColumns(cast<ConstantInt>(NumColumns)->getZExtValue()) {}
+
+ bool operator==(const ShapeInfo &other) {
+ return NumRows == other.NumRows && NumColumns == other.NumColumns;
+ }
+ bool operator!=(const ShapeInfo &other) { return !(*this == other); }
+
+ /// Returns true if shape-information is defined, meaning both dimensions
+ /// are != 0.
+ operator bool() const {
+ assert(NumRows == 0 || NumColumns != 0);
+ return NumRows != 0;
+ }
+ };
+
+ /// Maps instructions to their shape information. The shape information
+ /// describes the shape to be used while lowering. This matches the shape of
+ /// the result value of the instruction, with the only exceptions being store
+ /// instructions and the matrix_columnwise_store intrinsics. For those, the
+ /// shape information indicates that those instructions should be lowered
+ /// using shape information as well.
+ DenseMap<Value *, ShapeInfo> ShapeMap;
+
+ /// List of instructions to remove. While lowering, we are not replacing all
+ /// users of a lowered instruction, if shape information is available, and
+ /// those need to be removed after we finished lowering.
+ SmallVector<Instruction *, 16> ToRemove;
+
+ /// Map from instructions to their produced column matrix.
+ DenseMap<Value *, ColumnMatrixTy> Inst2ColumnMatrix;
+
+public:
+ LowerMatrixIntrinsics(Function &F, TargetTransformInfo &TTI)
+ : Func(F), DL(F.getParent()->getDataLayout()), TTI(TTI) {}
+
+ /// Return the set of column vectors that a matrix value is lowered to.
+ ///
+ /// If we lowered \p MatrixVal, just return the cached result column matrix.
+ /// Otherwise split the flat vector \p MatrixVal containing a matrix with
+ /// shape \p SI into column vectors.
+ ColumnMatrixTy getMatrix(Value *MatrixVal, const ShapeInfo &SI,
+ IRBuilder<> Builder) {
+ VectorType *VType = dyn_cast<VectorType>(MatrixVal->getType());
+ assert(VType && "MatrixVal must be a vector type");
+ assert(VType->getNumElements() == SI.NumRows * SI.NumColumns &&
+ "The vector size must match the number of matrix elements");
+
+ // Check if we lowered MatrixVal using shape information. In that case,
+ // return the existing column matrix, if it matches the requested shape
+ // information. If there is a mis-match, embed the result in a flat
+ // vector and split it later.
+ auto Found = Inst2ColumnMatrix.find(MatrixVal);
+ if (Found != Inst2ColumnMatrix.end()) {
+ ColumnMatrixTy &M = Found->second;
+ // Return the found matrix, if its shape matches the requested shape
+ // information
+ if (SI.NumRows == M.getNumRows() && SI.NumColumns == M.getNumColumns())
+ return M;
+
+ MatrixVal = M.embedInVector(Builder);
+ }
+
+ // Otherwise split MatrixVal.
+ SmallVector<Value *, 16> SplitVecs;
+ Value *Undef = UndefValue::get(VType);
+ for (unsigned MaskStart = 0; MaskStart < VType->getNumElements();
+ MaskStart += SI.NumRows) {
+ Constant *Mask = createSequentialMask(Builder, MaskStart, SI.NumRows, 0);
+ Value *V = Builder.CreateShuffleVector(MatrixVal, Undef, Mask, "split");
+ SplitVecs.push_back(V);
+ }
+
+ return {SplitVecs};
+ }
+
+ /// If \p V already has a known shape, return false. Otherwise set the shape
+ /// for instructions that support it.
+ bool setShapeInfo(Value *V, ShapeInfo Shape) { + assert(Shape && "Shape not set"); + if (isa<UndefValue>(V) || !supportsShapeInfo(V)) + return false; + + auto SIter = ShapeMap.find(V); + if (SIter != ShapeMap.end()) { + LLVM_DEBUG(dbgs() << " not overriding existing shape: " + << SIter->second.NumRows << " " + << SIter->second.NumColumns << " for " << *V << "\n"); + return false; + } + + ShapeMap.insert({V, Shape}); + LLVM_DEBUG(dbgs() << " " << Shape.NumRows << " x " << Shape.NumColumns + << " for " << *V << "\n"); + return true; + } + + bool isUniformShape(Value *V) { + Instruction *I = dyn_cast<Instruction>(V); + if (!I) + return true; + + switch (I->getOpcode()) { + case Instruction::FAdd: + case Instruction::FSub: + case Instruction::FMul: // Scalar multiply. + case Instruction::Add: + case Instruction::Mul: + case Instruction::Sub: + return true; + default: + return false; + } + } + + /// Returns true if shape information can be used for \p V. The supported + /// instructions must match the instructions that can be lowered by this pass. + bool supportsShapeInfo(Value *V) { + Instruction *Inst = dyn_cast<Instruction>(V); + if (!Inst) + return false; + + IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst); + if (II) + switch (II->getIntrinsicID()) { + case Intrinsic::matrix_multiply: + case Intrinsic::matrix_transpose: + case Intrinsic::matrix_columnwise_load: + case Intrinsic::matrix_columnwise_store: + return true; + default: + return false; + } + return isUniformShape(V) || isa<StoreInst>(V) || isa<LoadInst>(V); + } + + /// Propagate the shape information of instructions to their users. + /// The work list contains instructions for which we can compute the shape, + /// either based on the information provided by matrix intrinsics or known + /// shapes of operands. + SmallVector<Instruction *, 32> + propagateShapeForward(SmallVectorImpl<Instruction *> &WorkList) { + SmallVector<Instruction *, 32> NewWorkList; + // Pop an element for which we guaranteed to have at least one of the + // operand shapes. Add the shape for this and then add users to the work + // list. + LLVM_DEBUG(dbgs() << "Forward-propagate shapes:\n"); + while (!WorkList.empty()) { + Instruction *Inst = WorkList.back(); + WorkList.pop_back(); + + // New entry, set the value and insert operands + bool Propagate = false; + + Value *MatrixA; + Value *MatrixB; + Value *M; + Value *N; + Value *K; + if (match(Inst, m_Intrinsic<Intrinsic::matrix_multiply>( + m_Value(MatrixA), m_Value(MatrixB), m_Value(M), + m_Value(N), m_Value(K)))) { + Propagate = setShapeInfo(Inst, {M, K}); + } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_transpose>( + m_Value(MatrixA), m_Value(M), m_Value(N)))) { + // Flip dimensions. + Propagate = setShapeInfo(Inst, {N, M}); + } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_columnwise_store>( + m_Value(MatrixA), m_Value(), m_Value(), + m_Value(M), m_Value(N)))) { + Propagate = setShapeInfo(Inst, {N, M}); + } else if (match(Inst, + m_Intrinsic<Intrinsic::matrix_columnwise_load>( + m_Value(), m_Value(), m_Value(M), m_Value(N)))) { + Propagate = setShapeInfo(Inst, {M, N}); + } else if (match(Inst, m_Store(m_Value(MatrixA), m_Value()))) { + auto OpShape = ShapeMap.find(MatrixA); + if (OpShape != ShapeMap.end()) + setShapeInfo(Inst, OpShape->second); + continue; + } else if (isUniformShape(Inst)) { + // Find the first operand that has a known shape and use that. 
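+ // (For these element-wise opcodes the result and all operands share one
+ // shape, so any single known operand shape determines the rest.)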
+ for (auto &Op : Inst->operands()) { + auto OpShape = ShapeMap.find(Op.get()); + if (OpShape != ShapeMap.end()) { + Propagate |= setShapeInfo(Inst, OpShape->second); + break; + } + } + } + + if (Propagate) { + NewWorkList.push_back(Inst); + for (auto *User : Inst->users()) + if (ShapeMap.count(User) == 0) + WorkList.push_back(cast<Instruction>(User)); + } + } + + return NewWorkList; + } + + /// Propagate the shape to operands of instructions with shape information. + /// \p Worklist contains the instruction for which we already know the shape. + SmallVector<Instruction *, 32> + propagateShapeBackward(SmallVectorImpl<Instruction *> &WorkList) { + SmallVector<Instruction *, 32> NewWorkList; + + auto pushInstruction = [](Value *V, + SmallVectorImpl<Instruction *> &WorkList) { + Instruction *I = dyn_cast<Instruction>(V); + if (I) + WorkList.push_back(I); + }; + // Pop an element with known shape. Traverse the operands, if their shape + // derives from the result shape and is unknown, add it and add them to the + // worklist. + LLVM_DEBUG(dbgs() << "Backward-propagate shapes:\n"); + while (!WorkList.empty()) { + Value *V = WorkList.back(); + WorkList.pop_back(); + + size_t BeforeProcessingV = WorkList.size(); + if (!isa<Instruction>(V)) + continue; + + Value *MatrixA; + Value *MatrixB; + Value *M; + Value *N; + Value *K; + if (match(V, m_Intrinsic<Intrinsic::matrix_multiply>( + m_Value(MatrixA), m_Value(MatrixB), m_Value(M), + m_Value(N), m_Value(K)))) { + if (setShapeInfo(MatrixA, {M, N})) + pushInstruction(MatrixA, WorkList); + + if (setShapeInfo(MatrixB, {N, K})) + pushInstruction(MatrixB, WorkList); + + } else if (match(V, m_Intrinsic<Intrinsic::matrix_transpose>( + m_Value(MatrixA), m_Value(M), m_Value(N)))) { + // Flip dimensions. + if (setShapeInfo(MatrixA, {M, N})) + pushInstruction(MatrixA, WorkList); + } else if (match(V, m_Intrinsic<Intrinsic::matrix_columnwise_store>( + m_Value(MatrixA), m_Value(), m_Value(), + m_Value(M), m_Value(N)))) { + if (setShapeInfo(MatrixA, {M, N})) { + pushInstruction(MatrixA, WorkList); + } + } else if (isa<LoadInst>(V) || + match(V, m_Intrinsic<Intrinsic::matrix_columnwise_load>())) { + // Nothing to do, no matrix input. + } else if (isa<StoreInst>(V)) { + // Nothing to do. We forward-propagated to this so we would just + // backward propagate to an instruction with an already known shape. + } else if (isUniformShape(V)) { + // Propagate to all operands. + ShapeInfo Shape = ShapeMap[V]; + for (Use &U : cast<Instruction>(V)->operands()) { + if (setShapeInfo(U.get(), Shape)) + pushInstruction(U.get(), WorkList); + } + } + // After we discovered new shape info for new instructions in the + // worklist, we use their users as seeds for the next round of forward + // propagation. + for (size_t I = BeforeProcessingV; I != WorkList.size(); I++) + for (User *U : WorkList[I]->users()) + if (isa<Instruction>(U) && V != U) + NewWorkList.push_back(cast<Instruction>(U)); + } + return NewWorkList; + } + + bool Visit() { + if (EnableShapePropagation) { + SmallVector<Instruction *, 32> WorkList; + + // Initially only the shape of matrix intrinsics is known. + // Initialize the work list with ops carrying shape information. 
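+ // Only the matrix intrinsics carry their dimensions as explicit constant
+ // arguments; every other instruction learns its shape through the
+ // forward/backward propagation below.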
+ for (BasicBlock &BB : Func) + for (Instruction &Inst : BB) { + IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Inst); + if (!II) + continue; + + switch (II->getIntrinsicID()) { + case Intrinsic::matrix_multiply: + case Intrinsic::matrix_transpose: + case Intrinsic::matrix_columnwise_load: + case Intrinsic::matrix_columnwise_store: + WorkList.push_back(&Inst); + break; + default: + break; + } + } + // Propagate shapes until nothing changes any longer. + while (!WorkList.empty()) { + WorkList = propagateShapeForward(WorkList); + WorkList = propagateShapeBackward(WorkList); + } + } + + ReversePostOrderTraversal<Function *> RPOT(&Func); + bool Changed = false; + for (auto *BB : RPOT) { + for (Instruction &Inst : make_early_inc_range(*BB)) { + IRBuilder<> Builder(&Inst); + + if (CallInst *CInst = dyn_cast<CallInst>(&Inst)) + Changed |= VisitCallInst(CInst); + + Value *Op1; + Value *Op2; + if (auto *BinOp = dyn_cast<BinaryOperator>(&Inst)) + Changed |= VisitBinaryOperator(BinOp); + if (match(&Inst, m_Load(m_Value(Op1)))) + Changed |= VisitLoad(&Inst, Op1, Builder); + else if (match(&Inst, m_Store(m_Value(Op1), m_Value(Op2)))) + Changed |= VisitStore(&Inst, Op1, Op2, Builder); + } + } + + for (Instruction *Inst : reverse(ToRemove)) + Inst->eraseFromParent(); + + return Changed; + } + + LoadInst *createColumnLoad(Value *ColumnPtr, Type *EltType, + IRBuilder<> Builder) { + unsigned Align = DL.getABITypeAlignment(EltType); + return Builder.CreateAlignedLoad(ColumnPtr, Align, "col.load"); + } + + StoreInst *createColumnStore(Value *ColumnValue, Value *ColumnPtr, + Type *EltType, IRBuilder<> Builder) { + unsigned Align = DL.getABITypeAlignment(EltType); + return Builder.CreateAlignedStore(ColumnValue, ColumnPtr, Align); + } + + + /// Turns \p BasePtr into an elementwise pointer to \p EltType. + Value *createElementPtr(Value *BasePtr, Type *EltType, IRBuilder<> &Builder) { + unsigned AS = cast<PointerType>(BasePtr->getType())->getAddressSpace(); + Type *EltPtrType = PointerType::get(EltType, AS); + return Builder.CreatePointerCast(BasePtr, EltPtrType); + } + + /// Replace intrinsic calls + bool VisitCallInst(CallInst *Inst) { + if (!Inst->getCalledFunction() || !Inst->getCalledFunction()->isIntrinsic()) + return false; + + switch (Inst->getCalledFunction()->getIntrinsicID()) { + case Intrinsic::matrix_multiply: + LowerMultiply(Inst); + break; + case Intrinsic::matrix_transpose: + LowerTranspose(Inst); + break; + case Intrinsic::matrix_columnwise_load: + LowerColumnwiseLoad(Inst); + break; + case Intrinsic::matrix_columnwise_store: + LowerColumnwiseStore(Inst); + break; + default: + return false; + } + return true; + } + + void LowerLoad(Instruction *Inst, Value *Ptr, Value *Stride, + ShapeInfo Shape) { + IRBuilder<> Builder(Inst); + auto VType = cast<VectorType>(Inst->getType()); + Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder); + ColumnMatrixTy Result; + // Distance between start of one column and the start of the next + for (unsigned C = 0, E = Shape.NumColumns; C < E; ++C) { + Value *GEP = + computeColumnAddr(EltPtr, Builder.getInt32(C), Stride, Shape.NumRows, + VType->getElementType(), Builder); + Value *Column = createColumnLoad(GEP, VType->getElementType(), Builder); + Result.addColumn(Column); + } + + finalizeLowering(Inst, Result, Builder); + } + + /// Lowers llvm.matrix.columnwise.load. + /// + /// The intrinsic loads a matrix from memory using a stride between columns. 
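+ /// Its operands are (Ptr, Stride, Rows, Columns); each of the Columns
+ /// columns becomes one vector load of Rows elements starting at
+ /// Ptr + Col * Stride (see LowerLoad and computeColumnAddr).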
+ void LowerColumnwiseLoad(CallInst *Inst) {
+ Value *Ptr = Inst->getArgOperand(0);
+ Value *Stride = Inst->getArgOperand(1);
+ LowerLoad(Inst, Ptr, Stride,
+ {Inst->getArgOperand(2), Inst->getArgOperand(3)});
+ }
+
+ void LowerStore(Instruction *Inst, Value *Matrix, Value *Ptr, Value *Stride,
+ ShapeInfo Shape) {
+ IRBuilder<> Builder(Inst);
+ auto VType = cast<VectorType>(Matrix->getType());
+ Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder);
+ auto LM = getMatrix(Matrix, Shape, Builder);
+ for (auto C : enumerate(LM.columns())) {
+ Value *GEP =
+ computeColumnAddr(EltPtr, Builder.getInt32(C.index()), Stride,
+ Shape.NumRows, VType->getElementType(), Builder);
+ createColumnStore(C.value(), GEP, VType->getElementType(), Builder);
+ }
+
+ ToRemove.push_back(Inst);
+ }
+
+ /// Lowers llvm.matrix.columnwise.store.
+ ///
+ /// The intrinsic stores a matrix back to memory using a stride between columns.
+ void LowerColumnwiseStore(CallInst *Inst) {
+ Value *Matrix = Inst->getArgOperand(0);
+ Value *Ptr = Inst->getArgOperand(1);
+ Value *Stride = Inst->getArgOperand(2);
+ LowerStore(Inst, Matrix, Ptr, Stride,
+ {Inst->getArgOperand(3), Inst->getArgOperand(4)});
+ }
+
+ /// Extract a column vector of \p NumElts starting at index (\p I, \p J) from
+ /// the matrix \p LM represented as a vector of column vectors.
+ Value *extractVector(const ColumnMatrixTy &LM, unsigned I, unsigned J,
+ unsigned NumElts, IRBuilder<> Builder) {
+ Value *Col = LM.getColumn(J);
+ Value *Undef = UndefValue::get(Col->getType());
+ Constant *Mask = createSequentialMask(Builder, I, NumElts, 0);
+ return Builder.CreateShuffleVector(Col, Undef, Mask, "block");
+ }
+
+ // Set elements I..I+NumElts-1 to Block
+ Value *insertVector(Value *Col, unsigned I, Value *Block,
+ IRBuilder<> Builder) {
+
+ // First, bring Block to the same size as Col
+ unsigned BlockNumElts =
+ cast<VectorType>(Block->getType())->getNumElements();
+ unsigned NumElts = cast<VectorType>(Col->getType())->getNumElements();
+ assert(NumElts >= BlockNumElts && "Too few elements for current block");
+
+ Value *ExtendMask =
+ createSequentialMask(Builder, 0, BlockNumElts, NumElts - BlockNumElts);
+ Value *Undef = UndefValue::get(Block->getType());
+ Block = Builder.CreateShuffleVector(Block, Undef, ExtendMask);
+
+ // If Col is 7 long and I is 2 and BlockNumElts is 2 the mask is: 0, 1, 7,
+ // 8, 4, 5, 6
+ SmallVector<Constant *, 16> Mask;
+ unsigned i;
+ for (i = 0; i < I; i++)
+ Mask.push_back(Builder.getInt32(i));
+
+ unsigned VecNumElts = cast<VectorType>(Col->getType())->getNumElements();
+ for (; i < I + BlockNumElts; i++)
+ Mask.push_back(Builder.getInt32(i - I + VecNumElts));
+
+ for (; i < VecNumElts; i++)
+ Mask.push_back(Builder.getInt32(i));
+
+ Value *MaskVal = ConstantVector::get(Mask);
+
+ return Builder.CreateShuffleVector(Col, Block, MaskVal);
+ }
+
+ Value *createMulAdd(Value *Sum, Value *A, Value *B, bool UseFPOp,
+ IRBuilder<> &Builder, bool AllowContraction) {
+
+ if (!Sum)
+ return UseFPOp ? Builder.CreateFMul(A, B) : Builder.CreateMul(A, B);
+
+ if (UseFPOp) {
+ if (AllowContraction) {
+ // Use fmuladd for floating point operations and let the backend decide
+ // if that's profitable.
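+ // (llvm.fmuladd permits but does not require fusion: the target may
+ // lower it to a single fused multiply-add or to separate fmul/fadd.)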
+ Value *FMulAdd = Intrinsic::getDeclaration(
+ Func.getParent(), Intrinsic::fmuladd, A->getType());
+ return Builder.CreateCall(FMulAdd, {A, B, Sum});
+ }
+ Value *Mul = Builder.CreateFMul(A, B);
+ return Builder.CreateFAdd(Sum, Mul);
+ }
+
+ Value *Mul = Builder.CreateMul(A, B);
+ return Builder.CreateAdd(Sum, Mul);
+ }
+
+ /// Cache \p Matrix as result of \p Inst and update the uses of \p Inst. For
+ /// users with shape information, there's nothing to do: they will use the
+ /// cached value when they are lowered. For other users, \p Matrix is
+ /// flattened and the uses are updated to use it. Also marks \p Inst for
+ /// deletion.
+ void finalizeLowering(Instruction *Inst, ColumnMatrixTy Matrix,
+ IRBuilder<> &Builder) {
+ Inst2ColumnMatrix.insert(std::make_pair(Inst, Matrix));
+
+ ToRemove.push_back(Inst);
+ Value *Flattened = nullptr;
+ for (auto I = Inst->use_begin(), E = Inst->use_end(); I != E;) {
+ Use &U = *I++;
+ if (ShapeMap.find(U.getUser()) == ShapeMap.end()) {
+ if (!Flattened)
+ Flattened = Matrix.embedInVector(Builder);
+ U.set(Flattened);
+ }
+ }
+ }
+
+ /// Lowers llvm.matrix.multiply.
+ void LowerMultiply(CallInst *MatMul) {
+ IRBuilder<> Builder(MatMul);
+ auto *EltType = cast<VectorType>(MatMul->getType())->getElementType();
+ ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
+ ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
+
+ const ColumnMatrixTy &Lhs =
+ getMatrix(MatMul->getArgOperand(0), LShape, Builder);
+ const ColumnMatrixTy &Rhs =
+ getMatrix(MatMul->getArgOperand(1), RShape, Builder);
+
+ const unsigned R = LShape.NumRows;
+ const unsigned M = LShape.NumColumns;
+ const unsigned C = RShape.NumColumns;
+ assert(M == RShape.NumRows);
+
+ // Initialize the output
+ ColumnMatrixTy Result;
+ for (unsigned J = 0; J < C; ++J)
+ Result.addColumn(UndefValue::get(VectorType::get(EltType, R)));
+
+ const unsigned VF = std::max(TTI.getRegisterBitWidth(true) /
+ EltType->getPrimitiveSizeInBits(),
+ uint64_t(1));
+
+ bool AllowContract = AllowContractEnabled || (isa<FPMathOperator>(MatMul) &&
+ MatMul->hasAllowContract());
+ // Multiply columns from the first operand with scalars from the second
+ // operand. Then move along the K axes and accumulate the columns. With
+ // this the adds can be vectorized without reassociation.
+ for (unsigned J = 0; J < C; ++J) {
+ unsigned BlockSize = VF;
+ for (unsigned I = 0; I < R; I += BlockSize) {
+ // Gradually lower the vectorization factor to cover the remainder.
+ while (I + BlockSize > R)
+ BlockSize /= 2;
+
+ Value *Sum = nullptr;
+ for (unsigned K = 0; K < M; ++K) {
+ Value *L = extractVector(Lhs, I, K, BlockSize, Builder);
+ Value *RH = Builder.CreateExtractElement(Rhs.getColumn(J), K);
+ Value *Splat = Builder.CreateVectorSplat(BlockSize, RH, "splat");
+ Sum = createMulAdd(Sum, L, Splat, EltType->isFloatingPointTy(),
+ Builder, AllowContract);
+ }
+ Result.setColumn(J, insertVector(Result.getColumn(J), I, Sum, Builder));
+ }
+ }
+ finalizeLowering(MatMul, Result, Builder);
+ }
+
+ /// Lowers llvm.matrix.transpose.
+ void LowerTranspose(CallInst *Inst) {
+ ColumnMatrixTy Result;
+ IRBuilder<> Builder(Inst);
+ Value *InputVal = Inst->getArgOperand(0);
+ VectorType *VectorTy = cast<VectorType>(InputVal->getType());
+ ShapeInfo ArgShape(Inst->getArgOperand(1), Inst->getArgOperand(2));
+ ColumnMatrixTy InputMatrix = getMatrix(InputVal, ArgShape, Builder);
+
+ for (unsigned Row = 0; Row < ArgShape.NumRows; ++Row) {
+ // Build a single column vector for this row. First initialize it.
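+ // ResultColumn becomes column 'Row' of the transposed matrix: it gathers
+ // the element at row 'Row' from every input column.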
+ Value *ResultColumn = UndefValue::get(
+ VectorType::get(VectorTy->getElementType(), ArgShape.NumColumns));
+
+ // Go through the elements of this row and insert them into the resulting
+ // column vector.
+ for (auto C : enumerate(InputMatrix.columns())) {
+ Value *Elt = Builder.CreateExtractElement(C.value(), Row);
+ // We insert at index Column since that is the row index after the
+ // transpose.
+ ResultColumn =
+ Builder.CreateInsertElement(ResultColumn, Elt, C.index());
+ }
+ Result.addColumn(ResultColumn);
+ }
+
+ finalizeLowering(Inst, Result, Builder);
+ }
+
+ /// Lower load instructions, if shape information is available.
+ bool VisitLoad(Instruction *Inst, Value *Ptr, IRBuilder<> &Builder) {
+ auto I = ShapeMap.find(Inst);
+ if (I == ShapeMap.end())
+ return false;
+
+ LowerLoad(Inst, Ptr, Builder.getInt32(I->second.NumRows), I->second);
+ return true;
+ }
+
+ bool VisitStore(Instruction *Inst, Value *StoredVal, Value *Ptr,
+ IRBuilder<> &Builder) {
+ auto I = ShapeMap.find(StoredVal);
+ if (I == ShapeMap.end())
+ return false;
+
+ LowerStore(Inst, StoredVal, Ptr, Builder.getInt32(I->second.NumRows), I->second);
+ return true;
+ }
+
+ /// Lower binary operators, if shape information is available.
+ bool VisitBinaryOperator(BinaryOperator *Inst) {
+ auto I = ShapeMap.find(Inst);
+ if (I == ShapeMap.end())
+ return false;
+
+ Value *Lhs = Inst->getOperand(0);
+ Value *Rhs = Inst->getOperand(1);
+
+ IRBuilder<> Builder(Inst);
+ ShapeInfo &Shape = I->second;
+
+ ColumnMatrixTy LoweredLhs = getMatrix(Lhs, Shape, Builder);
+ ColumnMatrixTy LoweredRhs = getMatrix(Rhs, Shape, Builder);
+
+ // Apply the operation to each pair of columns and store the result back
+ // into the mapping.
+ ColumnMatrixTy Result;
+ auto BuildColumnOp = [&Builder, Inst](Value *LHS, Value *RHS) {
+ switch (Inst->getOpcode()) {
+ case Instruction::Add:
+ return Builder.CreateAdd(LHS, RHS);
+ case Instruction::Mul:
+ return Builder.CreateMul(LHS, RHS);
+ case Instruction::Sub:
+ return Builder.CreateSub(LHS, RHS);
+ case Instruction::FAdd:
+ return Builder.CreateFAdd(LHS, RHS);
+ case Instruction::FMul:
+ return Builder.CreateFMul(LHS, RHS);
+ case Instruction::FSub:
+ return Builder.CreateFSub(LHS, RHS);
+ default:
+ llvm_unreachable("Unsupported binary operator for matrix");
+ }
+ };
+ for (unsigned C = 0; C < Shape.NumColumns; ++C)
+ Result.addColumn(
+ BuildColumnOp(LoweredLhs.getColumn(C), LoweredRhs.getColumn(C)));
+
+ finalizeLowering(Inst, Result, Builder);
+ return true;
+ }
+};
+} // namespace
+
+PreservedAnalyses LowerMatrixIntrinsicsPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ LowerMatrixIntrinsics LMT(F, TTI);
+ if (LMT.Visit()) {
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+ }
+ return PreservedAnalyses::all();
+}
+
+namespace {
+
+class LowerMatrixIntrinsicsLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ LowerMatrixIntrinsicsLegacyPass() : FunctionPass(ID) {
+ initializeLowerMatrixIntrinsicsLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ LowerMatrixIntrinsics LMT(F, *TTI);
+ bool C = LMT.Visit();
+ return C;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.setPreservesCFG();
+ }
+};
+} // namespace
+
+static const char pass_name[] = "Lower the matrix intrinsics";
+char LowerMatrixIntrinsicsLegacyPass::ID = 0;
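+// Illustrative recap of the LowerMultiply scheme above (assumed 2x2 shapes,
+// not code from this patch): with LHS columns L0, L1 and RHS columns R0, R1,
+// each result column J is accumulated over the inner dimension K as
+//   Result.col(J) = L0 * splat(RJ[0]) + L1 * splat(RJ[1])
+// so every add combines whole column vectors and vectorizes without
+// reassociation; with contraction enabled each step is an fmuladd.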
+INITIALIZE_PASS_BEGIN(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name, + false, false) +INITIALIZE_PASS_END(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name, + false, false) + +Pass *llvm::createLowerMatrixIntrinsicsPass() { + return new LowerMatrixIntrinsicsLegacyPass(); +} diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp index 5342f2ddcb6b..73b2cd06fa23 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp @@ -21,6 +21,7 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/GuardUtils.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp index 789232e0f5ce..5ffae128f5f0 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp @@ -33,10 +33,11 @@ #include "llvm/Transforms/Scalar/MakeGuardsExplicit.h" #include "llvm/Analysis/GuardUtils.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" -#include "llvm/IR/IRBuilder.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/GuardUtils.h" @@ -56,23 +57,11 @@ struct MakeGuardsExplicitLegacyPass : public FunctionPass { static void turnToExplicitForm(CallInst *Guard, Function *DeoptIntrinsic) { // Replace the guard with an explicit branch (just like in GuardWidening). - BasicBlock *BB = Guard->getParent(); - makeGuardControlFlowExplicit(DeoptIntrinsic, Guard); - BranchInst *ExplicitGuard = cast<BranchInst>(BB->getTerminator()); - assert(ExplicitGuard->isConditional() && "Must be!"); + BasicBlock *OriginalBB = Guard->getParent(); + (void)OriginalBB; + makeGuardControlFlowExplicit(DeoptIntrinsic, Guard, true); + assert(isWidenableBranch(OriginalBB->getTerminator()) && "should hold"); - // We want the guard to be expressed as explicit control flow, but still be - // widenable. For that, we add Widenable Condition intrinsic call to the - // guard's condition. 
- IRBuilder<> B(ExplicitGuard); - auto *WidenableCondition = - B.CreateIntrinsic(Intrinsic::experimental_widenable_condition, - {}, {}, nullptr, "widenable_cond"); - WidenableCondition->setCallingConv(Guard->getCallingConv()); - auto *NewCond = - B.CreateAnd(ExplicitGuard->getCondition(), WidenableCondition); - NewCond->setName("exiplicit_guard_cond"); - ExplicitGuard->setCondition(NewCond); Guard->eraseFromParent(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index bb5ec253cbf2..c24fa40860eb 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -24,7 +24,6 @@ #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Argument.h" #include "llvm/IR/BasicBlock.h" @@ -49,12 +48,14 @@ #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <cassert> #include <cstdint> @@ -69,90 +70,6 @@ STATISTIC(NumMemSetInfer, "Number of memsets inferred"); STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy"); STATISTIC(NumCpyToSet, "Number of memcpys converted to memset"); -static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx, - bool &VariableIdxFound, - const DataLayout &DL) { - // Skip over the first indices. - gep_type_iterator GTI = gep_type_begin(GEP); - for (unsigned i = 1; i != Idx; ++i, ++GTI) - /*skip along*/; - - // Compute the offset implied by the rest of the indices. - int64_t Offset = 0; - for (unsigned i = Idx, e = GEP->getNumOperands(); i != e; ++i, ++GTI) { - ConstantInt *OpC = dyn_cast<ConstantInt>(GEP->getOperand(i)); - if (!OpC) - return VariableIdxFound = true; - if (OpC->isZero()) continue; // No offset. - - // Handle struct indices, which add their field offset to the pointer. - if (StructType *STy = GTI.getStructTypeOrNull()) { - Offset += DL.getStructLayout(STy)->getElementOffset(OpC->getZExtValue()); - continue; - } - - // Otherwise, we have a sequential type like an array or vector. Multiply - // the index by the ElementSize. - uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType()); - Offset += Size*OpC->getSExtValue(); - } - - return Offset; -} - -/// Return true if Ptr1 is provably equal to Ptr2 plus a constant offset, and -/// return that constant offset. For example, Ptr1 might be &A[42], and Ptr2 -/// might be &A[40]. In this case offset would be -8. -static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, - const DataLayout &DL) { - Ptr1 = Ptr1->stripPointerCasts(); - Ptr2 = Ptr2->stripPointerCasts(); - - // Handle the trivial case first. - if (Ptr1 == Ptr2) { - Offset = 0; - return true; - } - - GEPOperator *GEP1 = dyn_cast<GEPOperator>(Ptr1); - GEPOperator *GEP2 = dyn_cast<GEPOperator>(Ptr2); - - bool VariableIdxFound = false; - - // If one pointer is a GEP and the other isn't, then see if the GEP is a - // constant offset from the base, as in "P" and "gep P, 1". 
- if (GEP1 && !GEP2 && GEP1->getOperand(0)->stripPointerCasts() == Ptr2) { - Offset = -GetOffsetFromIndex(GEP1, 1, VariableIdxFound, DL); - return !VariableIdxFound; - } - - if (GEP2 && !GEP1 && GEP2->getOperand(0)->stripPointerCasts() == Ptr1) { - Offset = GetOffsetFromIndex(GEP2, 1, VariableIdxFound, DL); - return !VariableIdxFound; - } - - // Right now we handle the case when Ptr1/Ptr2 are both GEPs with an identical - // base. After that base, they may have some number of common (and - // potentially variable) indices. After that they handle some constant - // offset, which determines their offset from each other. At this point, we - // handle no other case. - if (!GEP1 || !GEP2 || GEP1->getOperand(0) != GEP2->getOperand(0)) - return false; - - // Skip any common indices and track the GEP types. - unsigned Idx = 1; - for (; Idx != GEP1->getNumOperands() && Idx != GEP2->getNumOperands(); ++Idx) - if (GEP1->getOperand(Idx) != GEP2->getOperand(Idx)) - break; - - int64_t Offset1 = GetOffsetFromIndex(GEP1, Idx, VariableIdxFound, DL); - int64_t Offset2 = GetOffsetFromIndex(GEP2, Idx, VariableIdxFound, DL); - if (VariableIdxFound) return false; - - Offset = Offset2-Offset1; - return true; -} - namespace { /// Represents a range of memset'd bytes with the ByteVal value. @@ -419,12 +336,12 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, break; // Check to see if this store is to a constant offset from the start ptr. - int64_t Offset; - if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), Offset, - DL)) + Optional<int64_t> Offset = + isPointerOffset(StartPtr, NextStore->getPointerOperand(), DL); + if (!Offset) break; - Ranges.addStore(Offset, NextStore); + Ranges.addStore(*Offset, NextStore); } else { MemSetInst *MSI = cast<MemSetInst>(BI); @@ -433,11 +350,11 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, break; // Check to see if this store is to a constant offset from the start ptr. 
- int64_t Offset; - if (!IsPointerOffset(StartPtr, MSI->getDest(), Offset, DL)) + Optional<int64_t> Offset = isPointerOffset(StartPtr, MSI->getDest(), DL); + if (!Offset) break; - Ranges.addMemSet(Offset, MSI); + Ranges.addMemSet(*Offset, MSI); } } @@ -471,16 +388,12 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, StartPtr = Range.StartPtr; // Determine alignment - unsigned Alignment = Range.Alignment; - if (Alignment == 0) { - Type *EltType = - cast<PointerType>(StartPtr->getType())->getElementType(); - Alignment = DL.getABITypeAlignment(EltType); - } - - AMemSet = - Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment); + const Align Alignment = DL.getValueOrABITypeAlignment( + MaybeAlign(Range.Alignment), + cast<PointerType>(StartPtr->getType())->getElementType()); + AMemSet = Builder.CreateMemSet(StartPtr, ByteVal, Range.End - Range.Start, + Alignment); LLVM_DEBUG(dbgs() << "Replace stores:\n"; for (Instruction *SI : Range.TheStores) dbgs() << *SI << '\n'; @@ -500,25 +413,21 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, return AMemSet; } -static unsigned findStoreAlignment(const DataLayout &DL, const StoreInst *SI) { - unsigned StoreAlign = SI->getAlignment(); - if (!StoreAlign) - StoreAlign = DL.getABITypeAlignment(SI->getOperand(0)->getType()); - return StoreAlign; +static Align findStoreAlignment(const DataLayout &DL, const StoreInst *SI) { + return DL.getValueOrABITypeAlignment(MaybeAlign(SI->getAlignment()), + SI->getOperand(0)->getType()); } -static unsigned findLoadAlignment(const DataLayout &DL, const LoadInst *LI) { - unsigned LoadAlign = LI->getAlignment(); - if (!LoadAlign) - LoadAlign = DL.getABITypeAlignment(LI->getType()); - return LoadAlign; +static Align findLoadAlignment(const DataLayout &DL, const LoadInst *LI) { + return DL.getValueOrABITypeAlignment(MaybeAlign(LI->getAlignment()), + LI->getType()); } -static unsigned findCommonAlignment(const DataLayout &DL, const StoreInst *SI, - const LoadInst *LI) { - unsigned StoreAlign = findStoreAlignment(DL, SI); - unsigned LoadAlign = findLoadAlignment(DL, LI); - return MinAlign(StoreAlign, LoadAlign); +static Align findCommonAlignment(const DataLayout &DL, const StoreInst *SI, + const LoadInst *LI) { + Align StoreAlign = findStoreAlignment(DL, SI); + Align LoadAlign = findLoadAlignment(DL, LI); + return commonAlignment(StoreAlign, LoadAlign); } // This method try to lift a store instruction before position P. 
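As a reading aid for the Align-based helpers above, here is a minimal sketch of their building blocks, assuming only llvm/Support/Alignment.h; the values are made up and the snippet is not part of this patch:

#include "llvm/Support/Alignment.h"
#include <cassert>

using namespace llvm;

int main() {
  // commonAlignment is the conservative meet of two known alignments, i.e.
  // the smaller one: a location aligned to both 16 and 4 can only be relied
  // on to be 4-byte aligned.
  assert(commonAlignment(Align(16), Align(4)) == Align(4));
  // MaybeAlign models a possibly-unspecified alignment; an unset value is
  // falsy, and DataLayout::getValueOrABITypeAlignment substitutes the ABI
  // type alignment for it, as findStoreAlignment/findLoadAlignment now do.
  assert(!MaybeAlign());
  assert(*MaybeAlign(8) == Align(8));
  return 0;
}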
@@ -733,7 +642,7 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { LI, SI->getPointerOperand()->stripPointerCasts(), LI->getPointerOperand()->stripPointerCasts(), DL.getTypeStoreSize(SI->getOperand(0)->getType()), - findCommonAlignment(DL, SI, LI), C); + findCommonAlignment(DL, SI, LI).value(), C); if (changed) { MD->removeInstruction(SI); SI->eraseFromParent(); @@ -766,12 +675,11 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { auto *T = V->getType(); if (T->isAggregateType()) { uint64_t Size = DL.getTypeStoreSize(T); - unsigned Align = SI->getAlignment(); - if (!Align) - Align = DL.getABITypeAlignment(T); + const Align MA = + DL.getValueOrABITypeAlignment(MaybeAlign(SI->getAlignment()), T); IRBuilder<> Builder(SI); auto *M = - Builder.CreateMemSet(SI->getPointerOperand(), ByteVal, Size, Align); + Builder.CreateMemSet(SI->getPointerOperand(), ByteVal, Size, MA); LLVM_DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n"); @@ -983,7 +891,7 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest, // If the destination wasn't sufficiently aligned then increase its alignment. if (!isDestSufficientlyAligned) { assert(isa<AllocaInst>(cpyDest) && "Can only increase alloca alignment!"); - cast<AllocaInst>(cpyDest)->setAlignment(srcAlign); + cast<AllocaInst>(cpyDest)->setAlignment(MaybeAlign(srcAlign)); } // Drop any cached information about the call, because we may have changed @@ -1066,12 +974,12 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, // example we could be moving from movaps -> movq on x86. IRBuilder<> Builder(M); if (UseMemMove) - Builder.CreateMemMove(M->getRawDest(), M->getDestAlignment(), - MDep->getRawSource(), MDep->getSourceAlignment(), + Builder.CreateMemMove(M->getRawDest(), M->getDestAlign(), + MDep->getRawSource(), MDep->getSourceAlign(), M->getLength(), M->isVolatile()); else - Builder.CreateMemCpy(M->getRawDest(), M->getDestAlignment(), - MDep->getRawSource(), MDep->getSourceAlignment(), + Builder.CreateMemCpy(M->getRawDest(), M->getDestAlign(), + MDep->getRawSource(), MDep->getSourceAlign(), M->getLength(), M->isVolatile()); // Remove the instruction we're replacing. 
@@ -1141,7 +1049,7 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
Builder.CreateMemSet(
Builder.CreateGEP(Dest->getType()->getPointerElementType(), Dest,
SrcSize),
- MemSet->getOperand(1), MemsetLen, Align);
+ MemSet->getOperand(1), MemsetLen, MaybeAlign(Align));

MD->removeInstruction(MemSet);
MemSet->eraseFromParent();
@@ -1209,8 +1117,8 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
}

IRBuilder<> Builder(MemCpy);
- Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1),
- CopySize, MemCpy->getDestAlignment());
+ Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1), CopySize,
+ MaybeAlign(MemCpy->getDestAlignment()));

return true;
}

@@ -1237,7 +1145,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M) {
M->getModule()->getDataLayout())) {
IRBuilder<> Builder(M);
Builder.CreateMemSet(M->getRawDest(), ByteVal, M->getLength(),
- M->getDestAlignment(), false);
+ MaybeAlign(M->getDestAlignment()), false);
MD->removeInstruction(M);
M->eraseFromParent();
++NumCpyToSet;
@@ -1520,7 +1428,7 @@ bool MemCpyOptLegacyPass::runOnFunction(Function &F) {
return false;

auto *MD = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
- auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);

auto LookupAliasAnalysis = [this]() -> AliasAnalysis & {
return getAnalysis<AAResultsWrapperPass>().getAAResults();
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergeICmps.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergeICmps.cpp
index 3d047a193267..ce1e142101b8 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergeICmps.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergeICmps.cpp
@@ -50,6 +50,7 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -897,7 +898,7 @@ public:
bool runOnFunction(Function &F) override {
if (skipFunction(F))
return false;
- const auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ const auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
// MergeICmps does not need the DominatorTree, but we update it if it's
// already available.
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
index 30645f4400e3..6b0d0202d9bb 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
@@ -14,9 +14,11 @@
// diamond (hammock) and merges them into a single load in the header. Similar
// it sinks and merges two stores to the tail block (footer). The algorithm
// iterates over the instructions of one side of the diamond and attempts to
-// find a matching load/store on the other side. It hoists / sinks when it
-// thinks it safe to do so. This optimization helps with eg. hiding load
-// latencies, triggering if-conversion, and reducing static code size.
+// find a matching load/store on the other side. A new tail/footer block may be
+// inserted if the tail/footer block has more predecessors (not only the two
+// predecessors that form the diamond).
It hoists / sinks when it thinks
+// it is safe to do so. This optimization helps with e.g. hiding load latencies,
+// triggering if-conversion, and reducing static code size.
//
// NOTE: This code no longer performs load hoisting, it is subsumed by GVNHoist.
//
@@ -81,6 +83,7 @@
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Metadata.h"
+#include "llvm/InitializePasses.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
@@ -103,7 +106,9 @@ class MergedLoadStoreMotion {
// Control is enforced by the check Size0 * Size1 < MagicCompileTimeControl.
const int MagicCompileTimeControl = 250;

+ const bool SplitFooterBB;
public:
+ MergedLoadStoreMotion(bool SplitFooterBB) : SplitFooterBB(SplitFooterBB) {}
bool run(Function &F, AliasAnalysis &AA);

private:
@@ -114,7 +119,9 @@ private:
PHINode *getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1);
bool isStoreSinkBarrierInRange(const Instruction &Start,
const Instruction &End, MemoryLocation Loc);
- bool sinkStore(BasicBlock *BB, StoreInst *SinkCand, StoreInst *ElseInst);
+ bool canSinkStoresAndGEPs(StoreInst *S0, StoreInst *S1) const;
+ void sinkStoresAndGEPs(BasicBlock *BB, StoreInst *SinkCand,
+ StoreInst *ElseInst);
bool mergeStores(BasicBlock *BB);
};
} // end anonymous namespace
@@ -217,74 +224,82 @@ PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0,
}

///
+/// Check if 2 stores can be sunk together with corresponding GEPs.
+///
+bool MergedLoadStoreMotion::canSinkStoresAndGEPs(StoreInst *S0,
+ StoreInst *S1) const {
+ auto *A0 = dyn_cast<Instruction>(S0->getPointerOperand());
+ auto *A1 = dyn_cast<Instruction>(S1->getPointerOperand());
+ return A0 && A1 && A0->isIdenticalTo(A1) && A0->hasOneUse() &&
+ (A0->getParent() == S0->getParent()) && A1->hasOneUse() &&
+ (A1->getParent() == S1->getParent()) && isa<GetElementPtrInst>(A0);
+}
+
+///
/// Merge two stores to same address and sink into \p BB
///
/// Also sinks GEP instruction computing the store address
///
-bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0,
- StoreInst *S1) {
+void MergedLoadStoreMotion::sinkStoresAndGEPs(BasicBlock *BB, StoreInst *S0,
+ StoreInst *S1) {
// Only one definition?
auto *A0 = dyn_cast<Instruction>(S0->getPointerOperand());
auto *A1 = dyn_cast<Instruction>(S1->getPointerOperand());
- if (A0 && A1 && A0->isIdenticalTo(A1) && A0->hasOneUse() &&
- (A0->getParent() == S0->getParent()) && A1->hasOneUse() &&
- (A1->getParent() == S1->getParent()) && isa<GetElementPtrInst>(A0)) {
- LLVM_DEBUG(dbgs() << "Sink Instruction into BB \n"; BB->dump();
- dbgs() << "Instruction Left\n"; S0->dump(); dbgs() << "\n";
- dbgs() << "Instruction Right\n"; S1->dump(); dbgs() << "\n");
- // Hoist the instruction.
- BasicBlock::iterator InsertPt = BB->getFirstInsertionPt();
- // Intersect optional metadata.
- S0->andIRFlags(S1);
- S0->dropUnknownNonDebugMetadata();
-
- // Create the new store to be inserted at the join point.
- StoreInst *SNew = cast<StoreInst>(S0->clone());
- Instruction *ANew = A0->clone();
- SNew->insertBefore(&*InsertPt);
- ANew->insertBefore(SNew);
-
- assert(S0->getParent() == A0->getParent());
- assert(S1->getParent() == A1->getParent());
-
- // New PHI operand? Use it.
- if (PHINode *NewPN = getPHIOperand(BB, S0, S1)) - SNew->setOperand(0, NewPN); - S0->eraseFromParent(); - S1->eraseFromParent(); - A0->replaceAllUsesWith(ANew); - A0->eraseFromParent(); - A1->replaceAllUsesWith(ANew); - A1->eraseFromParent(); - return true; - } - return false; + LLVM_DEBUG(dbgs() << "Sink Instruction into BB \n"; BB->dump(); + dbgs() << "Instruction Left\n"; S0->dump(); dbgs() << "\n"; + dbgs() << "Instruction Right\n"; S1->dump(); dbgs() << "\n"); + // Hoist the instruction. + BasicBlock::iterator InsertPt = BB->getFirstInsertionPt(); + // Intersect optional metadata. + S0->andIRFlags(S1); + S0->dropUnknownNonDebugMetadata(); + + // Create the new store to be inserted at the join point. + StoreInst *SNew = cast<StoreInst>(S0->clone()); + Instruction *ANew = A0->clone(); + SNew->insertBefore(&*InsertPt); + ANew->insertBefore(SNew); + + assert(S0->getParent() == A0->getParent()); + assert(S1->getParent() == A1->getParent()); + + // New PHI operand? Use it. + if (PHINode *NewPN = getPHIOperand(BB, S0, S1)) + SNew->setOperand(0, NewPN); + S0->eraseFromParent(); + S1->eraseFromParent(); + A0->replaceAllUsesWith(ANew); + A0->eraseFromParent(); + A1->replaceAllUsesWith(ANew); + A1->eraseFromParent(); } /// /// True when two stores are equivalent and can sink into the footer /// -/// Starting from a diamond tail block, iterate over the instructions in one -/// predecessor block and try to match a store in the second predecessor. +/// Starting from a diamond head block, iterate over the instructions in one +/// successor block and try to match a store in the second successor. /// -bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) { +bool MergedLoadStoreMotion::mergeStores(BasicBlock *HeadBB) { bool MergedStores = false; - assert(T && "Footer of a diamond cannot be empty"); - - pred_iterator PI = pred_begin(T), E = pred_end(T); - assert(PI != E); - BasicBlock *Pred0 = *PI; - ++PI; - BasicBlock *Pred1 = *PI; - ++PI; + BasicBlock *TailBB = getDiamondTail(HeadBB); + BasicBlock *SinkBB = TailBB; + assert(SinkBB && "Footer of a diamond cannot be empty"); + + succ_iterator SI = succ_begin(HeadBB); + assert(SI != succ_end(HeadBB) && "Diamond head cannot have zero successors"); + BasicBlock *Pred0 = *SI; + ++SI; + assert(SI != succ_end(HeadBB) && "Diamond head cannot have single successor"); + BasicBlock *Pred1 = *SI; // tail block of a diamond/hammock? if (Pred0 == Pred1) return false; // No. - if (PI != E) - return false; // No. More than 2 predecessors. - - // #Instructions in Succ1 for Compile Time Control + // bail out early if we can not merge into the footer BB + if (!SplitFooterBB && TailBB->hasNPredecessorsOrMore(3)) + return false; + // #Instructions in Pred1 for Compile Time Control auto InstsNoDbg = Pred1->instructionsWithoutDebug(); int Size1 = std::distance(InstsNoDbg.begin(), InstsNoDbg.end()); int NStores = 0; @@ -304,14 +319,23 @@ bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) { if (NStores * Size1 >= MagicCompileTimeControl) break; if (StoreInst *S1 = canSinkFromBlock(Pred1, S0)) { - bool Res = sinkStore(T, S0, S1); - MergedStores |= Res; - // Don't attempt to sink below stores that had to stick around - // But after removal of a store and some of its feeding - // instruction search again from the beginning since the iterator - // is likely stale at this point. 
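+      // Illustrative shape for the footer-splitting path below (not from this
+      // patch): if preds(tail) = { then, else, other }, the two diamond arms
+      // are first rerouted through a fresh block that only they reach:
+      //
+      //   then --> tail.sink.split <-- else        other
+      //                  |                           |
+      //                  +----------> tail <---------+
+      //
+      // and the merged store plus its GEP are sunk into tail.sink.split.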
- if (!Res) + if (!canSinkStoresAndGEPs(S0, S1)) + // Don't attempt to sink below stores that had to stick around + // But after removal of a store and some of its feeding + // instruction search again from the beginning since the iterator + // is likely stale at this point. break; + + if (SinkBB == TailBB && TailBB->hasNPredecessorsOrMore(3)) { + // We have more than 2 predecessors. Insert a new block + // postdominating 2 predecessors we're going to sink from. + SinkBB = SplitBlockPredecessors(TailBB, {Pred0, Pred1}, ".sink.split"); + if (!SinkBB) + break; + } + + MergedStores = true; + sinkStoresAndGEPs(SinkBB, S0, S1); RBI = Pred0->rbegin(); RBE = Pred0->rend(); LLVM_DEBUG(dbgs() << "Search again\n"; Instruction *I = &*RBI; I->dump()); @@ -328,13 +352,15 @@ bool MergedLoadStoreMotion::run(Function &F, AliasAnalysis &AA) { // Merge unconditional branches, allowing PRE to catch more // optimization opportunities. + // This loop doesn't care about newly inserted/split blocks + // since they never will be diamond heads. for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE;) { BasicBlock *BB = &*FI++; // Hoist equivalent loads and sink stores // outside diamonds when possible if (isDiamondHead(BB)) { - Changed |= mergeStores(getDiamondTail(BB)); + Changed |= mergeStores(BB); } } return Changed; @@ -342,9 +368,11 @@ bool MergedLoadStoreMotion::run(Function &F, AliasAnalysis &AA) { namespace { class MergedLoadStoreMotionLegacyPass : public FunctionPass { + const bool SplitFooterBB; public: static char ID; // Pass identification, replacement for typeid - MergedLoadStoreMotionLegacyPass() : FunctionPass(ID) { + MergedLoadStoreMotionLegacyPass(bool SplitFooterBB = false) + : FunctionPass(ID), SplitFooterBB(SplitFooterBB) { initializeMergedLoadStoreMotionLegacyPassPass( *PassRegistry::getPassRegistry()); } @@ -355,13 +383,14 @@ public: bool runOnFunction(Function &F) override { if (skipFunction(F)) return false; - MergedLoadStoreMotion Impl; + MergedLoadStoreMotion Impl(SplitFooterBB); return Impl.run(F, getAnalysis<AAResultsWrapperPass>().getAAResults()); } private: void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); + if (!SplitFooterBB) + AU.setPreservesCFG(); AU.addRequired<AAResultsWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); } @@ -373,8 +402,8 @@ char MergedLoadStoreMotionLegacyPass::ID = 0; /// /// createMergedLoadStoreMotionPass - The public interface to this file. 
/// -FunctionPass *llvm::createMergedLoadStoreMotionPass() { - return new MergedLoadStoreMotionLegacyPass(); +FunctionPass *llvm::createMergedLoadStoreMotionPass(bool SplitFooterBB) { + return new MergedLoadStoreMotionLegacyPass(SplitFooterBB); } INITIALIZE_PASS_BEGIN(MergedLoadStoreMotionLegacyPass, "mldst-motion", @@ -385,13 +414,14 @@ INITIALIZE_PASS_END(MergedLoadStoreMotionLegacyPass, "mldst-motion", PreservedAnalyses MergedLoadStoreMotionPass::run(Function &F, FunctionAnalysisManager &AM) { - MergedLoadStoreMotion Impl; + MergedLoadStoreMotion Impl(Options.SplitFooterBB); auto &AA = AM.getResult<AAManager>(F); if (!Impl.run(F, AA)) return PreservedAnalyses::all(); PreservedAnalyses PA; - PA.preserveSet<CFGAnalyses>(); + if (!Options.SplitFooterBB) + PA.preserveSet<CFGAnalyses>(); PA.preserve<GlobalsAA>(); return PA; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/NaryReassociate.cpp index 94436b55752a..bba9082e31b2 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/NaryReassociate.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/NaryReassociate.cpp @@ -82,7 +82,6 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" @@ -101,10 +100,12 @@ #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include <cassert> #include <cstdint> @@ -170,7 +171,7 @@ bool NaryReassociateLegacyPass::runOnFunction(Function &F) { auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); - auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); return Impl.runImpl(F, AC, DT, SE, TLI, TTI); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/NewGVN.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/NewGVN.cpp index 08ac2b666fce..6a643480f312 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/NewGVN.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/NewGVN.cpp @@ -76,7 +76,6 @@ #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/Argument.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" @@ -89,10 +88,12 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/ArrayRecycler.h" @@ -105,6 +106,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVNExpression.h" +#include "llvm/Transforms/Utils/Local.h" #include 
"llvm/Transforms/Utils/PredicateInfo.h" #include "llvm/Transforms/Utils/VNCoercion.h" #include <algorithm> @@ -122,6 +124,7 @@ using namespace llvm; using namespace llvm::GVNExpression; using namespace llvm::VNCoercion; +using namespace llvm::PatternMatch; #define DEBUG_TYPE "newgvn" @@ -487,11 +490,11 @@ namespace { class NewGVN { Function &F; - DominatorTree *DT; - const TargetLibraryInfo *TLI; - AliasAnalysis *AA; - MemorySSA *MSSA; - MemorySSAWalker *MSSAWalker; + DominatorTree *DT = nullptr; + const TargetLibraryInfo *TLI = nullptr; + AliasAnalysis *AA = nullptr; + MemorySSA *MSSA = nullptr; + MemorySSAWalker *MSSAWalker = nullptr; const DataLayout &DL; std::unique_ptr<PredicateInfo> PredInfo; @@ -503,7 +506,7 @@ class NewGVN { const SimplifyQuery SQ; // Number of function arguments, used by ranking - unsigned int NumFuncArgs; + unsigned int NumFuncArgs = 0; // RPOOrdering of basic blocks DenseMap<const DomTreeNode *, unsigned> RPOOrdering; @@ -514,9 +517,9 @@ class NewGVN { // startsout in, and represents any value. Being an optimistic analysis, // anything in the TOP class has the value TOP, which is indeterminate and // equivalent to everything. - CongruenceClass *TOPClass; + CongruenceClass *TOPClass = nullptr; std::vector<CongruenceClass *> CongruenceClasses; - unsigned NextCongruenceNum; + unsigned NextCongruenceNum = 0; // Value Mappings. DenseMap<Value *, CongruenceClass *> ValueToClass; @@ -656,7 +659,7 @@ public: TargetLibraryInfo *TLI, AliasAnalysis *AA, MemorySSA *MSSA, const DataLayout &DL) : F(F), DT(DT), TLI(TLI), AA(AA), MSSA(MSSA), DL(DL), - PredInfo(make_unique<PredicateInfo>(F, *DT, *AC)), + PredInfo(std::make_unique<PredicateInfo>(F, *DT, *AC)), SQ(DL, TLI, DT, AC, /*CtxI=*/nullptr, /*UseInstrInfo=*/false) {} bool runGVN(); @@ -860,7 +863,7 @@ private: // Debug counter info. When verifying, we have to reset the value numbering // debug counter to the same state it started in to get the same results. - int64_t StartingVNCounter; + int64_t StartingVNCounter = 0; }; } // end anonymous namespace @@ -1332,7 +1335,7 @@ LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp, E->setOpcode(0); E->op_push_back(PointerOp); if (LI) - E->setAlignment(LI->getAlignment()); + E->setAlignment(MaybeAlign(LI->getAlignment())); // TODO: Value number heap versions. We may be able to discover // things alias analysis can't on it's own (IE that a store and a @@ -1637,8 +1640,11 @@ const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) const { if (AA->doesNotAccessMemory(CI)) { return createCallExpression(CI, TOPClass->getMemoryLeader()); } else if (AA->onlyReadsMemory(CI)) { - MemoryAccess *DefiningAccess = MSSAWalker->getClobberingMemoryAccess(CI); - return createCallExpression(CI, DefiningAccess); + if (auto *MA = MSSA->getMemoryAccess(CI)) { + auto *DefiningAccess = MSSAWalker->getClobberingMemoryAccess(MA); + return createCallExpression(CI, DefiningAccess); + } else // MSSA determined that CI does not access memory. + return createCallExpression(CI, TOPClass->getMemoryLeader()); } return nullptr; } @@ -1754,7 +1760,7 @@ NewGVN::performSymbolicPHIEvaluation(ArrayRef<ValPair> PHIOps, return true; }); // If we are left with no operands, it's dead. - if (empty(Filtered)) { + if (Filtered.empty()) { // If it has undef at this point, it means there are no-non-undef arguments, // and thus, the value of the phi node must be undef. 
if (HasUndef) { @@ -2464,9 +2470,9 @@ Value *NewGVN::findConditionEquivalence(Value *Cond) const { // Process the outgoing edges of a block for reachability. void NewGVN::processOutgoingEdges(Instruction *TI, BasicBlock *B) { // Evaluate reachability of terminator instruction. - BranchInst *BR; - if ((BR = dyn_cast<BranchInst>(TI)) && BR->isConditional()) { - Value *Cond = BR->getCondition(); + Value *Cond; + BasicBlock *TrueSucc, *FalseSucc; + if (match(TI, m_Br(m_Value(Cond), TrueSucc, FalseSucc))) { Value *CondEvaluated = findConditionEquivalence(Cond); if (!CondEvaluated) { if (auto *I = dyn_cast<Instruction>(Cond)) { @@ -2479,8 +2485,6 @@ void NewGVN::processOutgoingEdges(Instruction *TI, BasicBlock *B) { } } ConstantInt *CI; - BasicBlock *TrueSucc = BR->getSuccessor(0); - BasicBlock *FalseSucc = BR->getSuccessor(1); if (CondEvaluated && (CI = dyn_cast<ConstantInt>(CondEvaluated))) { if (CI->isOne()) { LLVM_DEBUG(dbgs() << "Condition for Terminator " << *TI @@ -4196,7 +4200,7 @@ bool NewGVNLegacyPass::runOnFunction(Function &F) { return false; return NewGVN(F, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(), &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F), - &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(), + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F), &getAnalysis<AAResultsWrapperPass>().getAAResults(), &getAnalysis<MemorySSAWrapperPass>().getMSSA(), F.getParent()->getDataLayout()) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp index 039123218544..58763ec72ece 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp @@ -16,6 +16,7 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/DebugCounter.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -161,7 +162,7 @@ public: return false; TargetLibraryInfo *TLI = - &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); const TargetTransformInfo *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); return runPartiallyInlineLibCalls(F, TLI, TTI); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp index b544f0a39ea8..5c4a89977c38 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp @@ -47,6 +47,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/ADT/SetVector.h" @@ -131,7 +132,7 @@ struct PlaceBackedgeSafepointsImpl : public FunctionPass { SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); for (Loop *I : *LI) { runOnLoopAndSubLoops(I); } @@ -240,7 +241,7 @@ static bool containsUnconditionalCallSafepoint(Loop *L, BasicBlock *Header, static bool mustBeFiniteCountedLoop(Loop *L, 
ScalarEvolution *SE, BasicBlock *Pred) { // A conservative bound on the loop as a whole. - const SCEV *MaxTrips = SE->getMaxBackedgeTakenCount(L); + const SCEV *MaxTrips = SE->getConstantMaxBackedgeTakenCount(L); if (MaxTrips != SE->getCouldNotCompute() && SE->getUnsignedRange(MaxTrips).getUnsignedMax().isIntN( CountedLoopTripWidth)) @@ -478,7 +479,7 @@ bool PlaceSafepoints::runOnFunction(Function &F) { return false; const TargetLibraryInfo &TLI = - getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); bool Modified = false; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp index fa8c9e2a5fe4..41940e980faa 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -30,7 +30,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Argument.h" #include "llvm/IR/BasicBlock.h" @@ -50,12 +49,14 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <cassert> #include <utility> @@ -173,7 +174,7 @@ void ReassociatePass::BuildRankMap(Function &F, << "\n"); } - // Traverse basic blocks in ReversePostOrder + // Traverse basic blocks in ReversePostOrder. for (BasicBlock *BB : RPOT) { unsigned BBRank = RankMap[BB] = ++Rank << 16; @@ -861,7 +862,7 @@ static Value *NegateValue(Value *V, Instruction *BI, // this use. We do this by moving it to the entry block (if it is a // non-instruction value) or right after the definition. These negates will // be zapped by reassociate later, so we don't need much finesse here. - BinaryOperator *TheNeg = cast<BinaryOperator>(U); + Instruction *TheNeg = cast<Instruction>(U); // Verify that the negate is in this function, V might be a constant expr. if (TheNeg->getParent()->getParent() != BI->getParent()->getParent()) @@ -1898,6 +1899,7 @@ void ReassociatePass::RecursivelyEraseDeadInsts(Instruction *I, ValueRankMap.erase(I); Insts.remove(I); RedoInsts.remove(I); + llvm::salvageDebugInfoOrMarkUndef(*I); I->eraseFromParent(); for (auto Op : Ops) if (Instruction *OpInst = dyn_cast<Instruction>(Op)) @@ -1914,6 +1916,7 @@ void ReassociatePass::EraseInst(Instruction *I) { // Erase the dead instruction. ValueRankMap.erase(I); RedoInsts.remove(I); + llvm::salvageDebugInfoOrMarkUndef(*I); I->eraseFromParent(); // Optimize its operands. SmallPtrSet<Instruction *, 8> Visited; // Detect self-referential nodes. @@ -1938,88 +1941,132 @@ void ReassociatePass::EraseInst(Instruction *I) { MadeChange = true; } -// Canonicalize expressions of the following form: -// x + (-Constant * y) -> x - (Constant * y) -// x - (-Constant * y) -> x + (Constant * y) -Instruction *ReassociatePass::canonicalizeNegConstExpr(Instruction *I) { - if (!I->hasOneUse() || I->getType()->isVectorTy()) - return nullptr; - - // Must be a fmul or fdiv instruction. 
- unsigned Opcode = I->getOpcode(); - if (Opcode != Instruction::FMul && Opcode != Instruction::FDiv) - return nullptr; - - auto *C0 = dyn_cast<ConstantFP>(I->getOperand(0)); - auto *C1 = dyn_cast<ConstantFP>(I->getOperand(1)); - - // Both operands are constant, let it get constant folded away. - if (C0 && C1) - return nullptr; - - ConstantFP *CF = C0 ? C0 : C1; - - // Must have one constant operand. - if (!CF) - return nullptr; +/// Recursively analyze an expression to build a list of instructions that have +/// negative floating-point constant operands. The caller can then transform +/// the list to create positive constants for better reassociation and CSE. +static void getNegatibleInsts(Value *V, + SmallVectorImpl<Instruction *> &Candidates) { + // Handle only one-use instructions. Combining negations does not justify + // replicating instructions. + Instruction *I; + if (!match(V, m_OneUse(m_Instruction(I)))) + return; - // Must be a negative ConstantFP. - if (!CF->isNegative()) - return nullptr; + // Handle expressions of multiplications and divisions. + // TODO: This could look through floating-point casts. + const APFloat *C; + switch (I->getOpcode()) { + case Instruction::FMul: + // Not expecting non-canonical code here. Bail out and wait. + if (match(I->getOperand(0), m_Constant())) + break; - // User must be a binary operator with one or more uses. - Instruction *User = I->user_back(); - if (!isa<BinaryOperator>(User) || User->use_empty()) - return nullptr; + if (match(I->getOperand(1), m_APFloat(C)) && C->isNegative()) { + Candidates.push_back(I); + LLVM_DEBUG(dbgs() << "FMul with negative constant: " << *I << '\n'); + } + getNegatibleInsts(I->getOperand(0), Candidates); + getNegatibleInsts(I->getOperand(1), Candidates); + break; + case Instruction::FDiv: + // Not expecting non-canonical code here. Bail out and wait. + if (match(I->getOperand(0), m_Constant()) && + match(I->getOperand(1), m_Constant())) + break; - unsigned UserOpcode = User->getOpcode(); - if (UserOpcode != Instruction::FAdd && UserOpcode != Instruction::FSub) - return nullptr; + if ((match(I->getOperand(0), m_APFloat(C)) && C->isNegative()) || + (match(I->getOperand(1), m_APFloat(C)) && C->isNegative())) { + Candidates.push_back(I); + LLVM_DEBUG(dbgs() << "FDiv with negative constant: " << *I << '\n'); + } + getNegatibleInsts(I->getOperand(0), Candidates); + getNegatibleInsts(I->getOperand(1), Candidates); + break; + default: + break; + } +} - // Subtraction is not commutative. Explicitly, the following transform is - // not valid: (-Constant * y) - x -> x + (Constant * y) - if (!User->isCommutative() && User->getOperand(1) != I) +/// Given an fadd/fsub with an operand that is a one-use instruction +/// (the fadd/fsub), try to change negative floating-point constants into +/// positive constants to increase potential for reassociation and CSE. +Instruction *ReassociatePass::canonicalizeNegFPConstantsForOp(Instruction *I, + Instruction *Op, + Value *OtherOp) { + assert((I->getOpcode() == Instruction::FAdd || + I->getOpcode() == Instruction::FSub) && "Expected fadd/fsub"); + + // Collect instructions with negative FP constants from the subtree that ends + // in Op. + SmallVector<Instruction *, 4> Candidates; + getNegatibleInsts(Op, Candidates); + if (Candidates.empty()) return nullptr; // Don't canonicalize x + (-Constant * y) -> x - (Constant * y), if the // resulting subtract will be broken up later. This can get us into an // infinite loop during reassociation. 
- if (UserOpcode == Instruction::FAdd && ShouldBreakUpSubtract(User)) + bool IsFSub = I->getOpcode() == Instruction::FSub; + bool NeedsSubtract = !IsFSub && Candidates.size() % 2 == 1; + if (NeedsSubtract && ShouldBreakUpSubtract(I)) return nullptr; - // Change the sign of the constant. - APFloat Val = CF->getValueAPF(); - Val.changeSign(); - I->setOperand(C0 ? 0 : 1, ConstantFP::get(CF->getContext(), Val)); - - // Canonicalize I to RHS to simplify the next bit of logic. E.g., - // ((-Const*y) + x) -> (x + (-Const*y)). - if (User->getOperand(0) == I && User->isCommutative()) - cast<BinaryOperator>(User)->swapOperands(); - - Value *Op0 = User->getOperand(0); - Value *Op1 = User->getOperand(1); - BinaryOperator *NI; - switch (UserOpcode) { - default: - llvm_unreachable("Unexpected Opcode!"); - case Instruction::FAdd: - NI = BinaryOperator::CreateFSub(Op0, Op1); - NI->setFastMathFlags(cast<FPMathOperator>(User)->getFastMathFlags()); - break; - case Instruction::FSub: - NI = BinaryOperator::CreateFAdd(Op0, Op1); - NI->setFastMathFlags(cast<FPMathOperator>(User)->getFastMathFlags()); - break; + for (Instruction *Negatible : Candidates) { + const APFloat *C; + if (match(Negatible->getOperand(0), m_APFloat(C))) { + assert(!match(Negatible->getOperand(1), m_Constant()) && + "Expecting only 1 constant operand"); + assert(C->isNegative() && "Expected negative FP constant"); + Negatible->setOperand(0, ConstantFP::get(Negatible->getType(), abs(*C))); + MadeChange = true; + } + if (match(Negatible->getOperand(1), m_APFloat(C))) { + assert(!match(Negatible->getOperand(0), m_Constant()) && + "Expecting only 1 constant operand"); + assert(C->isNegative() && "Expected negative FP constant"); + Negatible->setOperand(1, ConstantFP::get(Negatible->getType(), abs(*C))); + MadeChange = true; + } } + assert(MadeChange == true && "Negative constant candidate was not changed"); - NI->insertBefore(User); - NI->setName(User->getName()); - User->replaceAllUsesWith(NI); - NI->setDebugLoc(I->getDebugLoc()); + // Negations cancelled out. + if (Candidates.size() % 2 == 0) + return I; + + // Negate the final operand in the expression by flipping the opcode of this + // fadd/fsub. + assert(Candidates.size() % 2 == 1 && "Expected odd number"); + IRBuilder<> Builder(I); + Value *NewInst = IsFSub ? Builder.CreateFAddFMF(OtherOp, Op, I) + : Builder.CreateFSubFMF(OtherOp, Op, I); + I->replaceAllUsesWith(NewInst); RedoInsts.insert(I); - MadeChange = true; - return NI; + return dyn_cast<Instruction>(NewInst); +} + +/// Canonicalize expressions that contain a negative floating-point constant +/// of the following form: +/// OtherOp + (subtree) -> OtherOp {+/-} (canonical subtree) +/// (subtree) + OtherOp -> OtherOp {+/-} (canonical subtree) +/// OtherOp - (subtree) -> OtherOp {+/-} (canonical subtree) +/// +/// The fadd/fsub opcode may be switched to allow folding a negation into the +/// input instruction. 
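+/// For example (illustrative, not code from this patch):
+///   %m = fmul float %y, -4.0
+///   %r = fadd float %x, %m
+/// carries one negative constant in the subtree, so it canonicalizes to
+///   %m = fmul float %y, 4.0
+///   %r = fsub float %x, %m
+/// where flipping fadd to fsub absorbs the leftover negation.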
+Instruction *ReassociatePass::canonicalizeNegFPConstants(Instruction *I) { + LLVM_DEBUG(dbgs() << "Combine negations for: " << *I << '\n'); + Value *X; + Instruction *Op; + if (match(I, m_FAdd(m_Value(X), m_OneUse(m_Instruction(Op))))) + if (Instruction *R = canonicalizeNegFPConstantsForOp(I, Op, X)) + I = R; + if (match(I, m_FAdd(m_OneUse(m_Instruction(Op)), m_Value(X)))) + if (Instruction *R = canonicalizeNegFPConstantsForOp(I, Op, X)) + I = R; + if (match(I, m_FSub(m_Value(X), m_OneUse(m_Instruction(Op))))) + if (Instruction *R = canonicalizeNegFPConstantsForOp(I, Op, X)) + I = R; + return I; } /// Inspect and optimize the given instruction. Note that erasing @@ -2042,16 +2089,16 @@ void ReassociatePass::OptimizeInst(Instruction *I) { I = NI; } - // Canonicalize negative constants out of expressions. - if (Instruction *Res = canonicalizeNegConstExpr(I)) - I = Res; - // Commute binary operators, to canonicalize the order of their operands. // This can potentially expose more CSE opportunities, and makes writing other // transformations simpler. if (I->isCommutative()) canonicalizeOperands(I); + // Canonicalize negative constants out of expressions. + if (Instruction *Res = canonicalizeNegFPConstants(I)) + I = Res; + // Don't optimize floating-point instructions unless they are 'fast'. if (I->getType()->isFPOrFPVectorTy() && !I->isFast()) return; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reg2Mem.cpp index 3296322e00d5..0716c1320982 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reg2Mem.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reg2Mem.cpp @@ -16,16 +16,17 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/Statistic.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/Local.h" #include <list> using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index c358258d24cf..b242f100faff 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -54,6 +54,7 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -172,8 +173,6 @@ public: bool runOnModule(Module &M) override { bool Changed = false; - const TargetLibraryInfo &TLI = - getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); for (Function &F : M) { // Nothing to do for declarations. 
if (F.isDeclaration() || F.empty()) @@ -186,6 +185,8 @@ public: TargetTransformInfo &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + const TargetLibraryInfo &TLI = + getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); auto &DT = getAnalysis<DominatorTreeWrapperPass>(F).getDomTree(); Changed |= Impl.runOnFunction(F, DT, TTI, TLI); @@ -2530,7 +2531,7 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT, // statepoints surviving this pass. This makes testing easier and the // resulting IR less confusing to human readers. DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); - bool MadeChange = removeUnreachableBlocks(F, nullptr, &DTU); + bool MadeChange = removeUnreachableBlocks(F, &DTU); // Flush the Dominator Tree. DTU.getDomTree(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SCCP.cpp index 4093e50ce899..e696ea83a300 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -29,7 +29,6 @@ #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueLattice.h" #include "llvm/Analysis/ValueLatticeUtils.h" #include "llvm/IR/BasicBlock.h" @@ -49,12 +48,14 @@ #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PredicateInfo.h" #include <cassert> #include <utility> @@ -191,7 +192,7 @@ public: /// class SCCPSolver : public InstVisitor<SCCPSolver> { const DataLayout &DL; - const TargetLibraryInfo *TLI; + std::function<const TargetLibraryInfo &(Function &)> GetTLI; SmallPtrSet<BasicBlock *, 8> BBExecutable; // The BBs that are executable. DenseMap<Value *, LatticeVal> ValueState; // The state each value is in. // The state each parameter is in. @@ -268,8 +269,9 @@ public: return {A->second.DT, A->second.PDT, DomTreeUpdater::UpdateStrategy::Lazy}; } - SCCPSolver(const DataLayout &DL, const TargetLibraryInfo *tli) - : DL(DL), TLI(tli) {} + SCCPSolver(const DataLayout &DL, + std::function<const TargetLibraryInfo &(Function &)> GetTLI) + : DL(DL), GetTLI(std::move(GetTLI)) {} /// MarkBlockExecutable - This method can be used by clients to mark all of /// the blocks that are known to be intrinsically live in the processed unit. @@ -1290,7 +1292,7 @@ CallOverdefined: // If we can constant fold this, mark the result of the call as a // constant. if (Constant *C = ConstantFoldCall(cast<CallBase>(CS.getInstruction()), F, - Operands, TLI)) { + Operands, &GetTLI(*F))) { // call -> undef. if (isa<UndefValue>(C)) return; @@ -1465,7 +1467,24 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { } LatticeVal &LV = getValueState(&I); - if (!LV.isUnknown()) continue; + if (!LV.isUnknown()) + continue; + + // There are two reasons a call can have an undef result + // 1. It could be tracked. + // 2. It could be constant-foldable. + // Because of the way we solve return values, tracked calls must + // never be marked overdefined in ResolvedUndefsIn. 
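+      // (Illustrative example, not from the patch: a result such as
+      // %r = call i32 @foo() is left alone here when @foo's return value is
+      // tracked, while a constant-foldable call whose lattice value is still
+      // unknown is conservatively marked overdefined below.)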
+ if (CallSite CS = CallSite(&I)) {
+ if (Function *F = CS.getCalledFunction())
+ if (TrackedRetVals.count(F))
+ continue;
+
+ // If the call is constant-foldable, we mark it overdefined because
+ // we do not know what return values are valid.
+ markOverdefined(&I);
+ return true;
+ }

// extractvalue is safe; check here because the argument is a struct.
if (isa<ExtractValueInst>(I))
@@ -1638,19 +1657,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
case Instruction::Call:
case Instruction::Invoke:
case Instruction::CallBr:
- // There are two reasons a call can have an undef result
- // 1. It could be tracked.
- // 2. It could be constant-foldable.
- // Because of the way we solve return values, tracked calls must
- // never be marked overdefined in ResolvedUndefsIn.
- if (Function *F = CallSite(&I).getCalledFunction())
- if (TrackedRetVals.count(F))
- break;
-
- // If the call is constant-foldable, we mark it overdefined because
- // we do not know what return values are valid.
- markOverdefined(&I);
- return true;
+ llvm_unreachable("Call-like instructions should have been handled early");
default:
// If we don't know what should happen here, conservatively mark it
// overdefined.
@@ -1751,7 +1758,7 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
[](const LatticeVal &LV) { return LV.isOverdefined(); }))
return false;
std::vector<Constant *> ConstVals;
- auto *ST = dyn_cast<StructType>(V->getType());
+ auto *ST = cast<StructType>(V->getType());
for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
LatticeVal V = IVs[i];
ConstVals.push_back(V.isConstant()
@@ -1796,7 +1803,8 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
static bool runSCCP(Function &F, const DataLayout &DL,
const TargetLibraryInfo *TLI) {
LLVM_DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n");
- SCCPSolver Solver(DL, TLI);
+ SCCPSolver Solver(
+ DL, [TLI](Function &F) -> const TargetLibraryInfo & { return *TLI; });

// Mark the first block of the function as being executable.
Solver.MarkBlockExecutable(&F.front());
@@ -1891,7 +1899,7 @@ public:
return false;
const DataLayout &DL = F.getParent()->getDataLayout();
const TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
return runSCCP(F, DL, TLI);
}
};
@@ -1924,6 +1932,27 @@ static void findReturnsToZap(Function &F,
return;
}

+ assert(
+ all_of(F.users(),
+ [&Solver](User *U) {
+ if (isa<Instruction>(U) &&
+ !Solver.isBlockExecutable(cast<Instruction>(U)->getParent()))
+ return true;
+ // Non-callsite uses are not impacted by zapping. Also, constant
+ // uses (like blockaddresses) could stick around, without being
+ // used in the underlying IR, meaning we do not have lattice
+ // values for them.
+ if (!CallSite(U)) + return true; + if (U->getType()->isStructTy()) { + return all_of( + Solver.getStructLatticeValueFor(U), + [](const LatticeVal &LV) { return !LV.isOverdefined(); }); + } + return !Solver.getLatticeValueFor(U).isOverdefined(); + }) && + "We can only zap functions where all live users have a concrete value"); + for (BasicBlock &BB : F) { if (CallInst *CI = BB.getTerminatingMustTailCall()) { LLVM_DEBUG(dbgs() << "Can't zap return of the block due to present " @@ -1974,9 +2003,10 @@ static void forceIndeterminateEdge(Instruction* I, SCCPSolver &Solver) { } bool llvm::runIPSCCP( - Module &M, const DataLayout &DL, const TargetLibraryInfo *TLI, + Module &M, const DataLayout &DL, + std::function<const TargetLibraryInfo &(Function &)> GetTLI, function_ref<AnalysisResultsForFn(Function &)> getAnalysis) { - SCCPSolver Solver(DL, TLI); + SCCPSolver Solver(DL, GetTLI); // Loop over all functions, marking arguments to those with their addresses // taken or that are external as overdefined. @@ -2167,7 +2197,7 @@ bool llvm::runIPSCCP( findReturnsToZap(*F, ReturnsToZap, Solver); } - for (const auto &F : Solver.getMRVFunctionsTracked()) { + for (auto F : Solver.getMRVFunctionsTracked()) { assert(F->getReturnType()->isStructTy() && "The return type should be a struct"); StructType *STy = cast<StructType>(F->getReturnType()); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp index bd4c21d65abc..89916e43fce2 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp @@ -41,7 +41,6 @@ #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/PtrUseVisitor.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" @@ -71,6 +70,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -80,6 +80,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" #include <algorithm> #include <cassert> @@ -361,7 +362,7 @@ private: /// The beginning and ending offsets of the alloca for this /// partition. - uint64_t BeginOffset, EndOffset; + uint64_t BeginOffset = 0, EndOffset = 0; /// The start and end iterators of this partition. iterator SI, SJ; @@ -959,14 +960,16 @@ private: std::tie(UsedI, I) = Uses.pop_back_val(); if (LoadInst *LI = dyn_cast<LoadInst>(I)) { - Size = std::max(Size, DL.getTypeStoreSize(LI->getType())); + Size = std::max(Size, + DL.getTypeStoreSize(LI->getType()).getFixedSize()); continue; } if (StoreInst *SI = dyn_cast<StoreInst>(I)) { Value *Op = SI->getOperand(0); if (Op == UsedI) return SI; - Size = std::max(Size, DL.getTypeStoreSize(Op->getType())); + Size = std::max(Size, + DL.getTypeStoreSize(Op->getType()).getFixedSize()); continue; } @@ -1197,7 +1200,7 @@ static bool isSafePHIToSpeculate(PHINode &PN) { // TODO: Allow recursive phi users. // TODO: Allow stores. 
BasicBlock *BB = PN.getParent(); - unsigned MaxAlign = 0; + MaybeAlign MaxAlign; uint64_t APWidth = DL.getIndexTypeSizeInBits(PN.getType()); APInt MaxSize(APWidth, 0); bool HaveLoad = false; @@ -1218,8 +1221,8 @@ static bool isSafePHIToSpeculate(PHINode &PN) { if (BBI->mayWriteToMemory()) return false; - uint64_t Size = DL.getTypeStoreSizeInBits(LI->getType()); - MaxAlign = std::max(MaxAlign, LI->getAlignment()); + uint64_t Size = DL.getTypeStoreSize(LI->getType()); + MaxAlign = std::max(MaxAlign, MaybeAlign(LI->getAlignment())); MaxSize = MaxSize.ult(Size) ? APInt(APWidth, Size) : MaxSize; HaveLoad = true; } @@ -1266,11 +1269,11 @@ static void speculatePHINodeLoads(PHINode &PN) { PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(), PN.getName() + ".sroa.speculated"); - // Get the AA tags and alignment to use from one of the loads. It doesn't + // Get the AA tags and alignment to use from one of the loads. It does not // matter which one we get and if any differ. AAMDNodes AATags; SomeLoad->getAAMetadata(AATags); - unsigned Align = SomeLoad->getAlignment(); + const MaybeAlign Align = MaybeAlign(SomeLoad->getAlignment()); // Rewrite all loads of the PN to use the new PHI. while (!PN.use_empty()) { @@ -1338,11 +1341,11 @@ static bool isSafeSelectToSpeculate(SelectInst &SI) { // Both operands to the select need to be dereferenceable, either // absolutely (e.g. allocas) or at this point because we can see other // accesses to it. - if (!isSafeToLoadUnconditionally(TValue, LI->getType(), LI->getAlignment(), - DL, LI)) + if (!isSafeToLoadUnconditionally(TValue, LI->getType(), + MaybeAlign(LI->getAlignment()), DL, LI)) return false; - if (!isSafeToLoadUnconditionally(FValue, LI->getType(), LI->getAlignment(), - DL, LI)) + if (!isSafeToLoadUnconditionally(FValue, LI->getType(), + MaybeAlign(LI->getAlignment()), DL, LI)) return false; } @@ -1368,8 +1371,8 @@ static void speculateSelectInstLoads(SelectInst &SI) { NumLoadsSpeculated += 2; // Transfer alignment and AA info if present. - TL->setAlignment(LI->getAlignment()); - FL->setAlignment(LI->getAlignment()); + TL->setAlignment(MaybeAlign(LI->getAlignment())); + FL->setAlignment(MaybeAlign(LI->getAlignment())); AAMDNodes Tags; LI->getAAMetadata(Tags); @@ -1678,24 +1681,20 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, } /// Compute the adjusted alignment for a load or store from an offset. -static unsigned getAdjustedAlignment(Instruction *I, uint64_t Offset, - const DataLayout &DL) { - unsigned Alignment; +static Align getAdjustedAlignment(Instruction *I, uint64_t Offset, + const DataLayout &DL) { + MaybeAlign Alignment; Type *Ty; if (auto *LI = dyn_cast<LoadInst>(I)) { - Alignment = LI->getAlignment(); + Alignment = MaybeAlign(LI->getAlignment()); Ty = LI->getType(); } else if (auto *SI = dyn_cast<StoreInst>(I)) { - Alignment = SI->getAlignment(); + Alignment = MaybeAlign(SI->getAlignment()); Ty = SI->getValueOperand()->getType(); } else { llvm_unreachable("Only loads and stores are allowed!"); } - - if (!Alignment) - Alignment = DL.getABITypeAlignment(Ty); - - return MinAlign(Alignment, Offset); + return commonAlignment(DL.getValueOrABITypeAlignment(Alignment, Ty), Offset); } /// Test whether we can convert a value from the old to the new type. @@ -2298,9 +2297,9 @@ class llvm::sroa::AllocaSliceRewriter // The new offsets of the slice currently being rewritten relative to the // original alloca. 
- uint64_t NewBeginOffset, NewEndOffset; + uint64_t NewBeginOffset = 0, NewEndOffset = 0; - uint64_t SliceSize; + uint64_t SliceSize = 0; bool IsSplittable = false; bool IsSplit = false; Use *OldUse = nullptr; @@ -2430,13 +2429,14 @@ private: /// /// You can optionally pass a type to this routine and if that type's ABI /// alignment is itself suitable, this will return zero. - unsigned getSliceAlign(Type *Ty = nullptr) { - unsigned NewAIAlign = NewAI.getAlignment(); - if (!NewAIAlign) - NewAIAlign = DL.getABITypeAlignment(NewAI.getAllocatedType()); - unsigned Align = - MinAlign(NewAIAlign, NewBeginOffset - NewAllocaBeginOffset); - return (Ty && Align == DL.getABITypeAlignment(Ty)) ? 0 : Align; + MaybeAlign getSliceAlign(Type *Ty = nullptr) { + const MaybeAlign NewAIAlign = DL.getValueOrABITypeAlignment( + MaybeAlign(NewAI.getAlignment()), NewAI.getAllocatedType()); + const MaybeAlign Align = + commonAlignment(NewAIAlign, NewBeginOffset - NewAllocaBeginOffset); + return (Ty && Align && Align->value() == DL.getABITypeAlignment(Ty)) + ? None + : Align; } unsigned getIndex(uint64_t Offset) { @@ -2798,7 +2798,7 @@ private: Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset); CallInst *New = IRB.CreateMemSet( getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size, - getSliceAlign(), II.isVolatile()); + MaybeAlign(getSliceAlign()), II.isVolatile()); if (AATags) New->setAAMetadata(AATags); LLVM_DEBUG(dbgs() << " to: " << *New << "\n"); @@ -2884,7 +2884,7 @@ private: assert((IsDest && II.getRawDest() == OldPtr) || (!IsDest && II.getRawSource() == OldPtr)); - unsigned SliceAlign = getSliceAlign(); + MaybeAlign SliceAlign = getSliceAlign(); // For unsplit intrinsics, we simply modify the source and destination // pointers in place. This isn't just an optimization, it is a matter of @@ -2954,10 +2954,10 @@ private: // Compute the relative offset for the other pointer within the transfer. unsigned OffsetWidth = DL.getIndexSizeInBits(OtherAS); APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset); - unsigned OtherAlign = - IsDest ? II.getSourceAlignment() : II.getDestAlignment(); - OtherAlign = MinAlign(OtherAlign ? OtherAlign : 1, - OtherOffset.zextOrTrunc(64).getZExtValue()); + Align OtherAlign = + assumeAligned(IsDest ? 
II.getSourceAlignment() : II.getDestAlignment()); + OtherAlign = + commonAlignment(OtherAlign, OtherOffset.zextOrTrunc(64).getZExtValue()); if (EmitMemCpy) { // Compute the other pointer, folding as much as possible to produce @@ -2970,7 +2970,7 @@ private: Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset); Value *DestPtr, *SrcPtr; - unsigned DestAlign, SrcAlign; + MaybeAlign DestAlign, SrcAlign; // Note: IsDest is true iff we're copying into the new alloca slice if (IsDest) { DestPtr = OurPtr; @@ -3017,9 +3017,9 @@ private: Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy, OtherPtr->getName() + "."); - unsigned SrcAlign = OtherAlign; + MaybeAlign SrcAlign = OtherAlign; Value *DstPtr = &NewAI; - unsigned DstAlign = SliceAlign; + MaybeAlign DstAlign = SliceAlign; if (!IsDest) { std::swap(SrcPtr, DstPtr); std::swap(SrcAlign, DstAlign); @@ -3115,20 +3115,17 @@ private: Instruction *I = Uses.pop_back_val(); if (LoadInst *LI = dyn_cast<LoadInst>(I)) { - unsigned LoadAlign = LI->getAlignment(); - if (!LoadAlign) - LoadAlign = DL.getABITypeAlignment(LI->getType()); + MaybeAlign LoadAlign = DL.getValueOrABITypeAlignment( + MaybeAlign(LI->getAlignment()), LI->getType()); LI->setAlignment(std::min(LoadAlign, getSliceAlign())); continue; } if (StoreInst *SI = dyn_cast<StoreInst>(I)) { - unsigned StoreAlign = SI->getAlignment(); - if (!StoreAlign) { Value *Op = SI->getOperand(0); - StoreAlign = DL.getABITypeAlignment(Op->getType()); - } - SI->setAlignment(std::min(StoreAlign, getSliceAlign())); - continue; + MaybeAlign StoreAlign = DL.getValueOrABITypeAlignment( + MaybeAlign(SI->getAlignment()), Op->getType()); + SI->setAlignment(std::min(StoreAlign, getSliceAlign())); + continue; } assert(isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I) || @@ -3220,7 +3217,7 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> { /// The current pointer use being rewritten. This is used to dig up the used /// value (as opposed to the user). - Use *U; + Use *U = nullptr; /// Used to calculate offsets, and hence alignment, of subobjects. const DataLayout &DL; @@ -3275,7 +3272,7 @@ private: Type *BaseTy; /// Known alignment of the base pointer. - unsigned BaseAlign; + Align BaseAlign; /// To calculate offset of each component so we can correctly deduce /// alignments. @@ -3284,7 +3281,7 @@ private: /// Initialize the splitter with an insertion point, Ptr and start with a /// single zero GEP index. OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy, - unsigned BaseAlign, const DataLayout &DL) + Align BaseAlign, const DataLayout &DL) : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr), BaseTy(BaseTy), BaseAlign(BaseAlign), DL(DL) {} @@ -3306,7 +3303,7 @@ private: if (Ty->isSingleValueType()) { unsigned Offset = DL.getIndexedOffsetInType(BaseTy, GEPIndices); return static_cast<Derived *>(this)->emitFunc( - Ty, Agg, MinAlign(BaseAlign, Offset), Name); + Ty, Agg, commonAlignment(BaseAlign, Offset), Name); } if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { @@ -3347,18 +3344,20 @@ private: AAMDNodes AATags; LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy, - AAMDNodes AATags, unsigned BaseAlign, const DataLayout &DL) + AAMDNodes AATags, Align BaseAlign, const DataLayout &DL) : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, - DL), AATags(AATags) {} + DL), + AATags(AATags) {} /// Emit a leaf load of a single value. 
This is called at the leaves of the /// recursive emission to actually load values. - void emitFunc(Type *Ty, Value *&Agg, unsigned Align, const Twine &Name) { + void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) { assert(Ty->isSingleValueType()); // Load the single value and insert it using the indices. Value *GEP = IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep"); - LoadInst *Load = IRB.CreateAlignedLoad(Ty, GEP, Align, Name + ".load"); + LoadInst *Load = + IRB.CreateAlignedLoad(Ty, GEP, Alignment.value(), Name + ".load"); if (AATags) Load->setAAMetadata(AATags); Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert"); @@ -3386,14 +3385,14 @@ private: struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> { StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy, - AAMDNodes AATags, unsigned BaseAlign, const DataLayout &DL) + AAMDNodes AATags, Align BaseAlign, const DataLayout &DL) : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, DL), AATags(AATags) {} AAMDNodes AATags; /// Emit a leaf store of a single value. This is called at the leaves of the /// recursive emission to actually produce stores. - void emitFunc(Type *Ty, Value *&Agg, unsigned Align, const Twine &Name) { + void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) { assert(Ty->isSingleValueType()); // Extract the single value and store it using the indices. // @@ -3404,7 +3403,7 @@ private: Value *InBoundsGEP = IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep"); StoreInst *Store = - IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Align); + IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Alignment.value()); if (AATags) Store->setAAMetadata(AATags); LLVM_DEBUG(dbgs() << " to: " << *Store << "\n"); @@ -3861,8 +3860,8 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { getAdjustedPtr(IRB, DL, BasePtr, APInt(DL.getIndexSizeInBits(AS), PartOffset), PartPtrTy, BasePtr->getName() + "."), - getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false, - LI->getName()); + getAdjustedAlignment(LI, PartOffset, DL).value(), + /*IsVolatile*/ false, LI->getName()); PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access, LLVMContext::MD_access_group}); @@ -3919,7 +3918,8 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { getAdjustedPtr(IRB, DL, StoreBasePtr, APInt(DL.getIndexSizeInBits(AS), PartOffset), PartPtrTy, StoreBasePtr->getName() + "."), - getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false); + getAdjustedAlignment(SI, PartOffset, DL).value(), + /*IsVolatile*/ false); PStore->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access, LLVMContext::MD_access_group}); LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n"); @@ -4003,8 +4003,8 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { getAdjustedPtr(IRB, DL, LoadBasePtr, APInt(DL.getIndexSizeInBits(AS), PartOffset), LoadPartPtrTy, LoadBasePtr->getName() + "."), - getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false, - LI->getName()); + getAdjustedAlignment(LI, PartOffset, DL).value(), + /*IsVolatile*/ false, LI->getName()); } // And store this partition. 
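As an aside, the SROA hunks above and below all follow the same pattern when migrating from raw unsigned alignments to the Align/MaybeAlign types: an access with no explicit alignment falls back to the ABI alignment of its type, and an access split off at a byte offset may only keep the alignment that offset still guarantees. A minimal sketch of that pattern, using only the helpers that appear in these hunks (the function name splitAccessAlignment is illustrative and is not part of the diff):

// Sketch only, not part of the committed diff: derive the alignment of an
// access that is split off `Offset` bytes into an original load or store,
// in the Align/MaybeAlign style used by the surrounding hunks.
#include "llvm/IR/DataLayout.h"
#include "llvm/Support/Alignment.h"

static llvm::Align splitAccessAlignment(llvm::MaybeAlign ExplicitAlign,
                                        llvm::Type *AccessTy, uint64_t Offset,
                                        const llvm::DataLayout &DL) {
  // A missing (None) alignment means "whatever the ABI guarantees for the
  // accessed type".
  llvm::Align Base = DL.getValueOrABITypeAlignment(ExplicitAlign, AccessTy);
  // The split access can only rely on the common alignment of the base
  // alignment and its byte offset (e.g. align 16 at offset 4 yields align 4).
  return llvm::commonAlignment(Base, Offset);
}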
@@ -4015,7 +4015,8 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { getAdjustedPtr(IRB, DL, StoreBasePtr, APInt(DL.getIndexSizeInBits(AS), PartOffset), StorePartPtrTy, StoreBasePtr->getName() + "."), - getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false); + getAdjustedAlignment(SI, PartOffset, DL).value(), + /*IsVolatile*/ false); // Now build a new slice for the alloca. NewSlices.push_back( @@ -4150,20 +4151,19 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, // FIXME: We might want to defer PHI speculation until after here. // FIXME: return nullptr; } else { - unsigned Alignment = AI.getAlignment(); - if (!Alignment) { - // The minimum alignment which users can rely on when the explicit - // alignment is omitted or zero is that required by the ABI for this - // type. - Alignment = DL.getABITypeAlignment(AI.getAllocatedType()); - } - Alignment = MinAlign(Alignment, P.beginOffset()); + // If alignment is unspecified we fallback on the one required by the ABI + // for this type. We also make sure the alignment is compatible with + // P.beginOffset(). + const Align Alignment = commonAlignment( + DL.getValueOrABITypeAlignment(MaybeAlign(AI.getAlignment()), + AI.getAllocatedType()), + P.beginOffset()); // If we will get at least this much alignment from the type alone, leave // the alloca's alignment unconstrained. - if (Alignment <= DL.getABITypeAlignment(SliceTy)) - Alignment = 0; + const bool IsUnconstrained = Alignment <= DL.getABITypeAlignment(SliceTy); NewAI = new AllocaInst( - SliceTy, AI.getType()->getAddressSpace(), nullptr, Alignment, + SliceTy, AI.getType()->getAddressSpace(), nullptr, + IsUnconstrained ? MaybeAlign() : Alignment, AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI); // Copy the old AI debug location over to the new one. 
NewAI->setDebugLoc(AI.getDebugLoc()); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalar.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalar.cpp index 869cf00e0a89..9d088547b436 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -79,8 +79,10 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeLoopVersioningLICMPass(Registry); initializeLoopIdiomRecognizeLegacyPassPass(Registry); initializeLowerAtomicLegacyPassPass(Registry); + initializeLowerConstantIntrinsicsPass(Registry); initializeLowerExpectIntrinsicPass(Registry); initializeLowerGuardIntrinsicLegacyPassPass(Registry); + initializeLowerMatrixIntrinsicsLegacyPassPass(Registry); initializeLowerWidenableConditionLegacyPassPass(Registry); initializeMemCpyOptLegacyPassPass(Registry); initializeMergeICmpsLegacyPassPass(Registry); @@ -88,6 +90,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeNaryReassociateLegacyPassPass(Registry); initializePartiallyInlineLibCallsLegacyPassPass(Registry); initializeReassociateLegacyPassPass(Registry); + initializeRedundantDbgInstEliminationPass(Registry); initializeRegToMemPass(Registry); initializeRewriteStatepointsForGCLegacyPassPass(Registry); initializeSCCPLegacyPassPass(Registry); @@ -123,6 +126,10 @@ void LLVMAddAggressiveDCEPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createAggressiveDCEPass()); } +void LLVMAddDCEPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createDeadCodeEliminationPass()); +} + void LLVMAddBitTrackingDCEPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createBitTrackingDCEPass()); } @@ -280,6 +287,10 @@ void LLVMAddBasicAliasAnalysisPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createBasicAAWrapperPass()); } +void LLVMAddLowerConstantIntrinsicsPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createLowerConstantIntrinsicsPass()); +} + void LLVMAddLowerExpectIntrinsicPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createLowerExpectIntrinsicPass()); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalarizer.cpp index 2ee1a3a95f2a..c25c6c632b8f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalarizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalarizer.cpp @@ -13,6 +13,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Scalar/Scalarizer.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" @@ -21,6 +22,7 @@ #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" @@ -33,12 +35,12 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/MathExtras.h" -#include "llvm/Support/Options.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Scalar/Scalarizer.h" #include <cassert> #include <cstdint> #include <iterator> @@ -173,8 +175,8 @@ struct VectorLayout { class ScalarizerVisitor : public InstVisitor<ScalarizerVisitor, bool> { public: - ScalarizerVisitor(unsigned ParallelLoopAccessMDKind) - : ParallelLoopAccessMDKind(ParallelLoopAccessMDKind) { + 
ScalarizerVisitor(unsigned ParallelLoopAccessMDKind, DominatorTree *DT) + : ParallelLoopAccessMDKind(ParallelLoopAccessMDKind), DT(DT) { } bool visit(Function &F); @@ -214,6 +216,8 @@ private: GatherList Gathered; unsigned ParallelLoopAccessMDKind; + + DominatorTree *DT; }; class ScalarizerLegacyPass : public FunctionPass { @@ -225,6 +229,11 @@ public: } bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage& AU) const override { + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + } }; } // end anonymous namespace @@ -232,6 +241,7 @@ public: char ScalarizerLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(ScalarizerLegacyPass, "scalarizer", "Scalarize vector operations", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(ScalarizerLegacyPass, "scalarizer", "Scalarize vector operations", false, false) @@ -303,7 +313,8 @@ bool ScalarizerLegacyPass::runOnFunction(Function &F) { Module &M = *F.getParent(); unsigned ParallelLoopAccessMDKind = M.getContext().getMDKindID("llvm.mem.parallel_loop_access"); - ScalarizerVisitor Impl(ParallelLoopAccessMDKind); + DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + ScalarizerVisitor Impl(ParallelLoopAccessMDKind, DT); return Impl.visit(F); } @@ -340,6 +351,15 @@ Scatterer ScalarizerVisitor::scatter(Instruction *Point, Value *V) { return Scatterer(BB, BB->begin(), V, &Scattered[V]); } if (Instruction *VOp = dyn_cast<Instruction>(V)) { + // When scalarizing PHI nodes we might try to examine/rewrite InsertElement + // nodes in predecessors. If those predecessors are unreachable from entry, + // then the IR in those blocks could have unexpected properties resulting in + // infinite loops in Scatterer::operator[]. By simply treating values + // originating from instructions in unreachable blocks as undef we do not + // need to analyse them further. + if (!DT->isReachableFromEntry(VOp->getParent())) + return Scatterer(Point->getParent(), Point->getIterator(), + UndefValue::get(V->getType())); // Put the scattered form of an instruction directly after the // instruction. BasicBlock *BB = VOp->getParent(); @@ -856,7 +876,10 @@ PreservedAnalyses ScalarizerPass::run(Function &F, FunctionAnalysisManager &AM) Module &M = *F.getParent(); unsigned ParallelLoopAccessMDKind = M.getContext().getMDKindID("llvm.mem.parallel_loop_access"); - ScalarizerVisitor Impl(ParallelLoopAccessMDKind); + DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F); + ScalarizerVisitor Impl(ParallelLoopAccessMDKind, DT); bool Changed = Impl.visit(F); - return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); + PreservedAnalyses PA; + PA.preserve<DominatorTreeAnalysis>(); + return Changed ? 
PA : PreservedAnalyses::all(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index f6a12fb13142..2a1a040bf83e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -164,7 +164,6 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" @@ -182,6 +181,7 @@ #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -189,6 +189,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include <cassert> #include <cstdint> #include <string> @@ -1121,7 +1122,7 @@ bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); bool Changed = false; for (BasicBlock &B : F) { for (BasicBlock::iterator I = B.begin(), IE = B.end(); I != IE;) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index aeac6f548b32..d7a34acb4318 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -38,8 +38,10 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GenericDomTree.h" @@ -263,7 +265,7 @@ static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB, /// to an entirely separate nest. static void hoistLoopToNewParent(Loop &L, BasicBlock &Preheader, DominatorTree &DT, LoopInfo &LI, - MemorySSAUpdater *MSSAU) { + MemorySSAUpdater *MSSAU, ScalarEvolution *SE) { // If the loop is already at the top level, we can't hoist it anywhere. Loop *OldParentL = L.getParentLoop(); if (!OldParentL) @@ -317,7 +319,7 @@ static void hoistLoopToNewParent(Loop &L, BasicBlock &Preheader, // Because we just hoisted a loop out of this one, we have essentially // created new exit paths from it. That means we need to form LCSSA PHI // nodes for values used in the no-longer-nested loop. - formLCSSA(*OldContainingL, DT, &LI, nullptr); + formLCSSA(*OldContainingL, DT, &LI, SE); // We shouldn't need to form dedicated exits because the exit introduced // here is the (just split by unswitching) preheader. 
However, after trivial @@ -329,6 +331,20 @@ static void hoistLoopToNewParent(Loop &L, BasicBlock &Preheader, } } +// Return the top-most loop containing ExitBB and having ExitBB as exiting block +// or the loop containing ExitBB, if there is no parent loop containing ExitBB +// as exiting block. +static Loop *getTopMostExitingLoop(BasicBlock *ExitBB, LoopInfo &LI) { + Loop *TopMost = LI.getLoopFor(ExitBB); + Loop *Current = TopMost; + while (Current) { + if (Current->isLoopExiting(ExitBB)) + TopMost = Current; + Current = Current->getParentLoop(); + } + return TopMost; +} + /// Unswitch a trivial branch if the condition is loop invariant. /// /// This routine should only be called when loop code leading to the branch has @@ -413,9 +429,10 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, }); // If we have scalar evolutions, we need to invalidate them including this - // loop and the loop containing the exit block. + // loop, the loop containing the exit block and the topmost parent loop + // exiting via LoopExitBB. if (SE) { - if (Loop *ExitL = LI.getLoopFor(LoopExitBB)) + if (Loop *ExitL = getTopMostExitingLoop(LoopExitBB, LI)) SE->forgetLoop(ExitL); else // Forget the entire nest as this exits the entire nest. @@ -532,7 +549,7 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, // If this was full unswitching, we may have changed the nesting relationship // for this loop so hoist it to its correct parent if needed. if (FullUnswitch) - hoistLoopToNewParent(L, *NewPH, DT, LI, MSSAU); + hoistLoopToNewParent(L, *NewPH, DT, LI, MSSAU, SE); if (MSSAU && VerifyMemorySSA) MSSAU->getMemorySSA()->verifyMemorySSA(); @@ -825,7 +842,7 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, // We may have changed the nesting relationship for this loop so hoist it to // its correct parent if needed. - hoistLoopToNewParent(L, *NewPH, DT, LI, MSSAU); + hoistLoopToNewParent(L, *NewPH, DT, LI, MSSAU, SE); if (MSSAU && VerifyMemorySSA) MSSAU->getMemorySSA()->verifyMemorySSA(); @@ -1909,7 +1926,7 @@ static void unswitchNontrivialInvariants( // We can only unswitch switches, conditional branches with an invariant // condition, or combining invariant conditions with an instruction. - assert((SI || BI->isConditional()) && + assert((SI || (BI && BI->isConditional())) && "Can only unswitch switches and conditional branch!"); bool FullUnswitch = SI || BI->getCondition() == Invariants[0]; if (FullUnswitch) @@ -2141,17 +2158,21 @@ static void unswitchNontrivialInvariants( buildPartialUnswitchConditionalBranch(*SplitBB, Invariants, Direction, *ClonedPH, *LoopPH); DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH}); + + if (MSSAU) { + DT.applyUpdates(DTUpdates); + DTUpdates.clear(); + + // Perform MSSA cloning updates. + for (auto &VMap : VMaps) + MSSAU->updateForClonedLoop(LBRPO, ExitBlocks, *VMap, + /*IgnoreIncomingWithNoClones=*/true); + MSSAU->updateExitBlocksForClonedLoop(ExitBlocks, VMaps, DT); + } } // Apply the updates accumulated above to get an up-to-date dominator tree. DT.applyUpdates(DTUpdates); - if (!FullUnswitch && MSSAU) { - // Update MSSA for partial unswitch, after DT update. - SmallVector<CFGUpdate, 1> Updates; - Updates.push_back( - {cfg::UpdateKind::Insert, SplitBB, ClonedPHs.begin()->second}); - MSSAU->applyInsertUpdates(Updates, DT); - } // Now that we have an accurate dominator tree, first delete the dead cloned // blocks so that we can accurately build any cloned loops. 
It is important to @@ -2256,7 +2277,7 @@ static void unswitchNontrivialInvariants( // First build LCSSA for this loop so that we can preserve it when // forming dedicated exits. We don't want to perturb some other loop's // LCSSA while doing that CFG edit. - formLCSSA(UpdateL, DT, &LI, nullptr); + formLCSSA(UpdateL, DT, &LI, SE); // For loops reached by this loop's original exit blocks we may // introduced new, non-dedicated exits. At least try to re-form dedicated @@ -2422,7 +2443,7 @@ turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, if (MSSAU) { MemoryDef *MD = cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(GI)); - MSSAU->moveToPlace(MD, DeoptBlock, MemorySSA::End); + MSSAU->moveToPlace(MD, DeoptBlock, MemorySSA::BeforeTerminator); if (VerifyMemorySSA) MSSAU->getMemorySSA()->verifyMemorySSA(); } @@ -2720,7 +2741,7 @@ unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI, return Cost * (SuccessorsCount - 1); }; Instruction *BestUnswitchTI = nullptr; - int BestUnswitchCost; + int BestUnswitchCost = 0; ArrayRef<Value *> BestUnswitchInvariants; for (auto &TerminatorAndInvariants : UnswitchCandidates) { Instruction &TI = *TerminatorAndInvariants.first; @@ -2752,6 +2773,7 @@ unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI, BestUnswitchInvariants = Invariants; } } + assert(BestUnswitchTI && "Failed to find loop unswitch candidate"); if (BestUnswitchCost >= UnswitchThreshold) { LLVM_DEBUG(dbgs() << "Cannot unswitch, lowest cost found: " @@ -2880,7 +2902,7 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM, assert(AR.DT.verify(DominatorTree::VerificationLevel::Fast)); auto PA = getLoopPassPreservedAnalyses(); - if (EnableMSSALoopDependency) + if (AR.MSSA) PA.preserve<MemorySSAAnalysis>(); return PA; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 4544975a4887..623a8b711ed8 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -27,7 +27,6 @@ #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" @@ -35,10 +34,12 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/SimplifyCFG.h" +#include "llvm/Transforms/Utils/Local.h" #include <utility> using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Sink.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Sink.cpp index 90f3a2aa46e1..677d86f8c7b4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Sink.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Sink.cpp @@ -21,6 +21,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" @@ -78,7 +79,7 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis &AA, if (auto *Call = dyn_cast<CallBase>(Inst)) { // Convergent operations cannot be made control-dependent on additional // values. 
- if (Call->hasFnAttr(Attribute::Convergent)) + if (Call->isConvergent()) return false; for (Instruction *S : Stores) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp index e6db11f47ead..cd7bfb2f20dc 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp @@ -283,12 +283,12 @@ static bool isSafeAndProfitableToSpeculateAroundPHI( int MatCost = IncomingConstantAndCostsAndCount.second.MatCost; int &FoldedCost = IncomingConstantAndCostsAndCount.second.FoldedCost; if (IID) - FoldedCost += TTI.getIntImmCost(IID, Idx, IncomingC->getValue(), - IncomingC->getType()); + FoldedCost += TTI.getIntImmCostIntrin(IID, Idx, IncomingC->getValue(), + IncomingC->getType()); else FoldedCost += - TTI.getIntImmCost(UserI->getOpcode(), Idx, IncomingC->getValue(), - IncomingC->getType()); + TTI.getIntImmCostInst(UserI->getOpcode(), Idx, + IncomingC->getValue(), IncomingC->getType()); // If we accumulate more folded cost for this incoming constant than // materialized cost, then we'll regress any edge with this constant so diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp index f9d027eb4a3b..c8d899bb4871 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp @@ -67,6 +67,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp index a58c32cc5894..9f82b1263ebd 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp @@ -60,7 +60,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -76,10 +75,12 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include <cassert> #include <cstdint> #include <limits> diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index e5400676c7e8..4ce4ce46f67a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -34,8 +34,10 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -65,7 +67,7 @@ static 
cl::opt<bool> ForceSkipUniformRegions( static cl::opt<bool> RelaxedUniformRegions("structurizecfg-relaxed-uniform-regions", cl::Hidden, cl::desc("Allow relaxed uniform region checks"), - cl::init(false)); + cl::init(true)); // Definition of the complex types used in this pass. diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp index f0b79079d817..9f0ab9103d42 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -76,6 +76,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -341,7 +342,7 @@ static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) { const DataLayout &DL = L->getModule()->getDataLayout(); if (isModSet(AA->getModRefInfo(CI, MemoryLocation::get(L))) || !isSafeToLoadUnconditionally(L->getPointerOperand(), L->getType(), - L->getAlignment(), DL, L)) + MaybeAlign(L->getAlignment()), DL, L)) return false; } } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp index 707adf46d1f4..c8461fdc1608 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp @@ -12,6 +12,7 @@ #include "llvm/Transforms/Scalar/WarnMissedTransforms.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/InitializePasses.h" #include "llvm/Transforms/Utils/LoopUtils.h" using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/AddDiscriminators.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/AddDiscriminators.cpp index ee0973002c47..0908b361a4d4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/AddDiscriminators.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/AddDiscriminators.cpp @@ -63,6 +63,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PassManager.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -232,7 +233,7 @@ static bool addDiscriminators(Function &F) { LocationSet CallLocations; for (auto &I : B.getInstList()) { // We bypass intrinsic calls for the following two reasons: - // 1) We want to avoid a non-deterministic assigment of + // 1) We want to avoid a non-deterministic assignment of // discriminators. // 2) We want to minimize the number of base discriminators used. 
if (!isa<InvokeInst>(I) && (!isa<CallInst>(I) || isa<IntrinsicInst>(I))) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp index 5fa371377c85..c9eb4abfa21a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -170,7 +170,8 @@ bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI) { bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU, LoopInfo *LI, MemorySSAUpdater *MSSAU, - MemoryDependenceResults *MemDep) { + MemoryDependenceResults *MemDep, + bool PredecessorWithTwoSuccessors) { if (BB->hasAddressTaken()) return false; @@ -185,9 +186,24 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU, return false; // Can't merge if there are multiple distinct successors. - if (PredBB->getUniqueSuccessor() != BB) + if (!PredecessorWithTwoSuccessors && PredBB->getUniqueSuccessor() != BB) return false; + // Currently only allow PredBB to have two predecessors, one being BB. + // Update BI to branch to BB's only successor instead of BB. + BranchInst *PredBB_BI; + BasicBlock *NewSucc = nullptr; + unsigned FallThruPath; + if (PredecessorWithTwoSuccessors) { + if (!(PredBB_BI = dyn_cast<BranchInst>(PredBB->getTerminator()))) + return false; + BranchInst *BB_JmpI = dyn_cast<BranchInst>(BB->getTerminator()); + if (!BB_JmpI || !BB_JmpI->isUnconditional()) + return false; + NewSucc = BB_JmpI->getSuccessor(0); + FallThruPath = PredBB_BI->getSuccessor(0) == BB ? 0 : 1; + } + // Can't merge if there is PHI loop. for (PHINode &PN : BB->phis()) for (Value *IncValue : PN.incoming_values()) @@ -227,34 +243,51 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU, Updates.push_back({DominatorTree::Delete, PredBB, BB}); } - if (MSSAU) - MSSAU->moveAllAfterMergeBlocks(BB, PredBB, &*(BB->begin())); + Instruction *PTI = PredBB->getTerminator(); + Instruction *STI = BB->getTerminator(); + Instruction *Start = &*BB->begin(); + // If there's nothing to move, mark the starting instruction as the last + // instruction in the block. Terminator instruction is handled separately. + if (Start == STI) + Start = PTI; - // Delete the unconditional branch from the predecessor... - PredBB->getInstList().pop_back(); + // Move all definitions in the successor to the predecessor... + PredBB->getInstList().splice(PTI->getIterator(), BB->getInstList(), + BB->begin(), STI->getIterator()); + + if (MSSAU) + MSSAU->moveAllAfterMergeBlocks(BB, PredBB, Start); // Make all PHI nodes that referred to BB now refer to Pred as their // source... BB->replaceAllUsesWith(PredBB); - // Move all definitions in the successor to the predecessor... - PredBB->getInstList().splice(PredBB->end(), BB->getInstList()); - new UnreachableInst(BB->getContext(), BB); + if (PredecessorWithTwoSuccessors) { + // Delete the unconditional branch from BB. + BB->getInstList().pop_back(); - // Eliminate duplicate dbg.values describing the entry PHI node post-splice. 
- for (auto Incoming : IncomingValues) { - if (isa<Instruction>(*Incoming)) { - SmallVector<DbgValueInst *, 2> DbgValues; - SmallDenseSet<std::pair<DILocalVariable *, DIExpression *>, 2> - DbgValueSet; - llvm::findDbgValues(DbgValues, Incoming); - for (auto &DVI : DbgValues) { - auto R = DbgValueSet.insert({DVI->getVariable(), DVI->getExpression()}); - if (!R.second) - DVI->eraseFromParent(); - } - } + // Update branch in the predecessor. + PredBB_BI->setSuccessor(FallThruPath, NewSucc); + } else { + // Delete the unconditional branch from the predecessor. + PredBB->getInstList().pop_back(); + + // Move terminator instruction. + PredBB->getInstList().splice(PredBB->end(), BB->getInstList()); + + // Terminator may be a memory accessing instruction too. + if (MSSAU) + if (MemoryUseOrDef *MUD = cast_or_null<MemoryUseOrDef>( + MSSAU->getMemorySSA()->getMemoryAccess(PredBB->getTerminator()))) + MSSAU->moveToPlace(MUD, PredBB, MemorySSA::End); } + // Add unreachable to now empty BB. + new UnreachableInst(BB->getContext(), BB); + + // Eliminate duplicate/redundant dbg.values. This seems to be a good place to + // do that since we might end up with redundant dbg.values describing the + // entry PHI node post-splice. + RemoveRedundantDbgInstrs(PredBB); // Inherit predecessors name if it exists. if (!PredBB->hasName()) @@ -274,14 +307,131 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU, "applying corresponding DTU updates."); DTU->applyUpdatesPermissive(Updates); DTU->deleteBB(BB); - } - - else { + } else { BB->eraseFromParent(); // Nuke BB if DTU is nullptr. } + return true; } +/// Remove redundant instructions within sequences of consecutive dbg.value +/// instructions. This is done using a backward scan to keep the last dbg.value +/// describing a specific variable/fragment. +/// +/// BackwardScan strategy: +/// ---------------------- +/// Given a sequence of consecutive DbgValueInst like this +/// +/// dbg.value ..., "x", FragmentX1 (*) +/// dbg.value ..., "y", FragmentY1 +/// dbg.value ..., "x", FragmentX2 +/// dbg.value ..., "x", FragmentX1 (**) +/// +/// then the instruction marked with (*) can be removed (it is guaranteed to be +/// obsoleted by the instruction marked with (**) as the latter instruction is +/// describing the same variable using the same fragment info). +/// +/// Possible improvements: +/// - Check fully overlapping fragments and not only identical fragments. +/// - Support dbg.addr, dbg.declare, dbg.label, and possibly other meta +/// instructions being part of the sequence of consecutive instructions. +static bool removeRedundantDbgInstrsUsingBackwardScan(BasicBlock *BB) { + SmallVector<DbgValueInst *, 8> ToBeRemoved; + SmallDenseSet<DebugVariable> VariableSet; + for (auto &I : reverse(*BB)) { + if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(&I)) { + DebugVariable Key(DVI->getVariable(), + DVI->getExpression(), + DVI->getDebugLoc()->getInlinedAt()); + auto R = VariableSet.insert(Key); + // If the same variable fragment is described more than once it is enough + // to keep the last one (i.e. the first found since we iterate in + // reverse). + if (!R.second) + ToBeRemoved.push_back(DVI); + continue; + } + // Sequence with consecutive dbg.value instrs ended. Clear the set to + // restart identifying redundant instructions in case we find another + // dbg.value sequence.
+ VariableSet.clear(); + } + + for (auto &Instr : ToBeRemoved) + Instr->eraseFromParent(); + + return !ToBeRemoved.empty(); +} + +/// Remove redundant dbg.value instructions using a forward scan. This can +/// remove a dbg.value instruction that is redundant due to indicating that a +/// variable has the same value as already being indicated by an earlier +/// dbg.value. +/// +/// ForwardScan strategy: +/// --------------------- +/// Given two identical dbg.value instructions, separated by a block of +/// instructions that isn't describing the same variable, like this +/// +/// dbg.value X1, "x", FragmentX1 (**) +/// <block of instructions, none being "dbg.value ..., "x", ..."> +/// dbg.value X1, "x", FragmentX1 (*) +/// +/// then the instruction marked with (*) can be removed. Variable "x" is already +/// described as being mapped to the SSA value X1. +/// +/// Possible improvements: +/// - Keep track of non-overlapping fragments. +static bool removeRedundantDbgInstrsUsingForwardScan(BasicBlock *BB) { + SmallVector<DbgValueInst *, 8> ToBeRemoved; + DenseMap<DebugVariable, std::pair<Value *, DIExpression *> > VariableMap; + for (auto &I : *BB) { + if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(&I)) { + DebugVariable Key(DVI->getVariable(), + NoneType(), + DVI->getDebugLoc()->getInlinedAt()); + auto VMI = VariableMap.find(Key); + // Update the map if we found a new value/expression describing the + // variable, or if the variable wasn't mapped already. + if (VMI == VariableMap.end() || + VMI->second.first != DVI->getValue() || + VMI->second.second != DVI->getExpression()) { + VariableMap[Key] = { DVI->getValue(), DVI->getExpression() }; + continue; + } + // Found an identical mapping. Remember the instruction for later removal. + ToBeRemoved.push_back(DVI); + } + } + + for (auto &Instr : ToBeRemoved) + Instr->eraseFromParent(); + + return !ToBeRemoved.empty(); +} + +bool llvm::RemoveRedundantDbgInstrs(BasicBlock *BB) { + bool MadeChanges = false; + // By using the "backward scan" strategy before the "forward scan" strategy we + // can remove both dbg.value (2) and (3) in a situation like this: + // + // (1) dbg.value V1, "x", DIExpression() + // ... + // (2) dbg.value V2, "x", DIExpression() + // (3) dbg.value V1, "x", DIExpression() + // + // The backward scan will remove (2), as it is made obsolete by (3). After + // getting (2) out of the way, the forward scan will remove (3) since "x" + // already is described as having the value V1 at (1). + MadeChanges |= removeRedundantDbgInstrsUsingBackwardScan(BB); + MadeChanges |= removeRedundantDbgInstrsUsingForwardScan(BB); + + if (MadeChanges) + LLVM_DEBUG(dbgs() << "Removed redundant dbg instrs from: " + << BB->getName() << "\n"); + return MadeChanges; +} + void llvm::ReplaceInstWithValue(BasicBlock::InstListType &BIL, BasicBlock::iterator &BI, Value *V) { Instruction &I = *BI; @@ -365,11 +515,13 @@ llvm::SplitAllCriticalEdges(Function &F, BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt, DominatorTree *DT, LoopInfo *LI, - MemorySSAUpdater *MSSAU) { + MemorySSAUpdater *MSSAU, const Twine &BBName) { BasicBlock::iterator SplitIt = SplitPt->getIterator(); while (isa<PHINode>(SplitIt) || SplitIt->isEHPad()) ++SplitIt; - BasicBlock *New = Old->splitBasicBlock(SplitIt, Old->getName()+".split"); + std::string Name = BBName.str(); + BasicBlock *New = Old->splitBasicBlock( + SplitIt, Name.empty() ? Old->getName() + ".split" : Name); // The new block lives in whichever loop the old one did.
This preserves // LCSSA as well, because we force the split point to be after any PHI nodes. diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp index f5e4b53f6d97..008cea333e6b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -28,6 +28,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/BuildLibCalls.cpp index 27f110e24f9c..71316ce8f758 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/BuildLibCalls.cpp @@ -88,6 +88,14 @@ static bool setDoesNotCapture(Function &F, unsigned ArgNo) { return true; } +static bool setDoesNotAlias(Function &F, unsigned ArgNo) { + if (F.hasParamAttribute(ArgNo, Attribute::NoAlias)) + return false; + F.addParamAttr(ArgNo, Attribute::NoAlias); + ++NumNoAlias; + return true; +} + static bool setOnlyReadsMemory(Function &F, unsigned ArgNo) { if (F.hasParamAttribute(ArgNo, Attribute::ReadOnly)) return false; @@ -175,6 +183,9 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { return Changed; case LibFunc_strcpy: case LibFunc_strncpy: + Changed |= setDoesNotAlias(F, 0); + Changed |= setDoesNotAlias(F, 1); + LLVM_FALLTHROUGH; case LibFunc_strcat: case LibFunc_strncat: Changed |= setReturnedArg(F, 0); @@ -249,12 +260,14 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { case LibFunc_sprintf: Changed |= setDoesNotThrow(F); Changed |= setDoesNotCapture(F, 0); + Changed |= setDoesNotAlias(F, 0); Changed |= setDoesNotCapture(F, 1); Changed |= setOnlyReadsMemory(F, 1); return Changed; case LibFunc_snprintf: Changed |= setDoesNotThrow(F); Changed |= setDoesNotCapture(F, 0); + Changed |= setDoesNotAlias(F, 0); Changed |= setDoesNotCapture(F, 2); Changed |= setOnlyReadsMemory(F, 2); return Changed; @@ -291,11 +304,23 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setDoesNotCapture(F, 1); return Changed; case LibFunc_memcpy: + Changed |= setDoesNotAlias(F, 0); + Changed |= setDoesNotAlias(F, 1); + Changed |= setReturnedArg(F, 0); + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; case LibFunc_memmove: Changed |= setReturnedArg(F, 0); - LLVM_FALLTHROUGH; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; case LibFunc_mempcpy: case LibFunc_memccpy: + Changed |= setDoesNotAlias(F, 0); + Changed |= setDoesNotAlias(F, 1); Changed |= setDoesNotThrow(F); Changed |= setDoesNotCapture(F, 1); Changed |= setOnlyReadsMemory(F, 1); @@ -760,9 +785,8 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { } } -bool llvm::hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty, - LibFunc DoubleFn, LibFunc FloatFn, - LibFunc LongDoubleFn) { +bool llvm::hasFloatFn(const TargetLibraryInfo *TLI, Type *Ty, + LibFunc DoubleFn, LibFunc FloatFn, LibFunc LongDoubleFn) { switch (Ty->getTypeID()) { case 
Type::HalfTyID: return false; @@ -775,10 +799,10 @@ bool llvm::hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty, } } -StringRef llvm::getUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty, - LibFunc DoubleFn, LibFunc FloatFn, - LibFunc LongDoubleFn) { - assert(hasUnaryFloatFn(TLI, Ty, DoubleFn, FloatFn, LongDoubleFn) && +StringRef llvm::getFloatFnName(const TargetLibraryInfo *TLI, Type *Ty, + LibFunc DoubleFn, LibFunc FloatFn, + LibFunc LongDoubleFn) { + assert(hasFloatFn(TLI, Ty, DoubleFn, FloatFn, LongDoubleFn) && "Cannot get name for unavailable function!"); switch (Ty->getTypeID()) { @@ -827,6 +851,12 @@ Value *llvm::emitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout &DL, B.getInt8PtrTy(), castToCStr(Ptr, B), B, TLI); } +Value *llvm::emitStrDup(Value *Ptr, IRBuilder<> &B, + const TargetLibraryInfo *TLI) { + return emitLibCall(LibFunc_strdup, B.getInt8PtrTy(), B.getInt8PtrTy(), + castToCStr(Ptr, B), B, TLI); +} + Value *llvm::emitStrChr(Value *Ptr, char C, IRBuilder<> &B, const TargetLibraryInfo *TLI) { Type *I8Ptr = B.getInt8PtrTy(); @@ -1045,24 +1075,28 @@ Value *llvm::emitUnaryFloatFnCall(Value *Op, const TargetLibraryInfo *TLI, LibFunc LongDoubleFn, IRBuilder<> &B, const AttributeList &Attrs) { // Get the name of the function according to TLI. - StringRef Name = getUnaryFloatFn(TLI, Op->getType(), - DoubleFn, FloatFn, LongDoubleFn); + StringRef Name = getFloatFnName(TLI, Op->getType(), + DoubleFn, FloatFn, LongDoubleFn); return emitUnaryFloatFnCallHelper(Op, Name, B, Attrs); } -Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name, - IRBuilder<> &B, const AttributeList &Attrs) { +static Value *emitBinaryFloatFnCallHelper(Value *Op1, Value *Op2, + StringRef Name, IRBuilder<> &B, + const AttributeList &Attrs) { assert((Name != "") && "Must specify Name to emitBinaryFloatFnCall"); - SmallString<20> NameBuffer; - appendTypeSuffix(Op1, Name, NameBuffer); - Module *M = B.GetInsertBlock()->getModule(); - FunctionCallee Callee = M->getOrInsertFunction( - Name, Op1->getType(), Op1->getType(), Op2->getType()); - CallInst *CI = B.CreateCall(Callee, {Op1, Op2}, Name); - CI->setAttributes(Attrs); + FunctionCallee Callee = M->getOrInsertFunction(Name, Op1->getType(), + Op1->getType(), Op2->getType()); + CallInst *CI = B.CreateCall(Callee, { Op1, Op2 }, Name); + + // The incoming attribute set may have come from a speculatable intrinsic, but + // is being replaced with a library call which is not allowed to be + // speculatable. + CI->setAttributes(Attrs.removeAttribute(B.getContext(), + AttributeList::FunctionIndex, + Attribute::Speculatable)); if (const Function *F = dyn_cast<Function>(Callee.getCallee()->stripPointerCasts())) CI->setCallingConv(F->getCallingConv()); @@ -1070,6 +1104,28 @@ Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name, return CI; } +Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name, + IRBuilder<> &B, const AttributeList &Attrs) { + assert((Name != "") && "Must specify Name to emitBinaryFloatFnCall"); + + SmallString<20> NameBuffer; + appendTypeSuffix(Op1, Name, NameBuffer); + + return emitBinaryFloatFnCallHelper(Op1, Op2, Name, B, Attrs); +} + +Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, + const TargetLibraryInfo *TLI, + LibFunc DoubleFn, LibFunc FloatFn, + LibFunc LongDoubleFn, IRBuilder<> &B, + const AttributeList &Attrs) { + // Get the name of the function according to TLI. 
+ StringRef Name = getFloatFnName(TLI, Op1->getType(), + DoubleFn, FloatFn, LongDoubleFn); + + return emitBinaryFloatFnCallHelper(Op1, Op2, Name, B, Attrs); +} + Value *llvm::emitPutChar(Value *Char, IRBuilder<> &B, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc_putchar)) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp index df299f673f65..9a6761040bd8 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp @@ -448,13 +448,17 @@ bool llvm::bypassSlowDivision(BasicBlock *BB, DivCacheTy PerBBDivCache; bool MadeChange = false; - Instruction* Next = &*BB->begin(); + Instruction *Next = &*BB->begin(); while (Next != nullptr) { // We may add instructions immediately after I, but we want to skip over // them. - Instruction* I = Next; + Instruction *I = Next; Next = Next->getNextNode(); + // Ignore dead code to save time and avoid bugs. + if (I->hasNUses(0)) + continue; + FastDivInsertionTask Task(I, BypassWidths); if (Value *Replacement = Task.getReplacement(PerBBDivCache)) { I->replaceAllUsesWith(Replacement); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/CanonicalizeAliases.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/CanonicalizeAliases.cpp index 455fcbb1cf98..6b01c0c71d00 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/CanonicalizeAliases.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/CanonicalizeAliases.cpp @@ -30,9 +30,10 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/CanonicalizeAliases.h" - #include "llvm/IR/Operator.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneFunction.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneFunction.cpp index 1026c9d37038..75e8963303c2 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -210,6 +210,21 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, RemapInstruction(&II, VMap, ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, TypeMapper, Materializer); + + // Register all DICompileUnits of the old parent module in the new parent module + auto* OldModule = OldFunc->getParent(); + auto* NewModule = NewFunc->getParent(); + if (OldModule && NewModule && OldModule != NewModule && DIFinder.compile_unit_count()) { + auto* NMD = NewModule->getOrInsertNamedMetadata("llvm.dbg.cu"); + // Avoid multiple insertions of the same DICompileUnit to NMD. 
+ SmallPtrSet<const void*, 8> Visited; + for (auto* Operand : NMD->operands()) + Visited.insert(Operand); + for (auto* Unit : DIFinder.compile_units()) + // VMap.MD()[Unit] == Unit + if (Visited.insert(Unit).second) + NMD->addOperand(Unit); + } } /// Return a copy of the specified function and add it to that function's diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneModule.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneModule.cpp index 7ddf59becba9..2c8c3abb2922 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneModule.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneModule.cpp @@ -48,7 +48,7 @@ std::unique_ptr<Module> llvm::CloneModule( function_ref<bool(const GlobalValue *)> ShouldCloneDefinition) { // First off, we need to create the new module. std::unique_ptr<Module> New = - llvm::make_unique<Module>(M.getModuleIdentifier(), M.getContext()); + std::make_unique<Module>(M.getModuleIdentifier(), M.getContext()); New->setSourceFileName(M.getSourceFileName()); New->setDataLayout(M.getDataLayout()); New->setTargetTriple(M.getTargetTriple()); @@ -181,13 +181,25 @@ std::unique_ptr<Module> llvm::CloneModule( } // And named metadata.... + const auto* LLVM_DBG_CU = M.getNamedMetadata("llvm.dbg.cu"); for (Module::const_named_metadata_iterator I = M.named_metadata_begin(), E = M.named_metadata_end(); I != E; ++I) { const NamedMDNode &NMD = *I; NamedMDNode *NewNMD = New->getOrInsertNamedMetadata(NMD.getName()); - for (unsigned i = 0, e = NMD.getNumOperands(); i != e; ++i) - NewNMD->addOperand(MapMetadata(NMD.getOperand(i), VMap)); + if (&NMD == LLVM_DBG_CU) { + // Do not insert duplicate operands. + SmallPtrSet<const void*, 8> Visited; + for (const auto* Operand : NewNMD->operands()) + Visited.insert(Operand); + for (const auto* Operand : NMD.operands()) { + auto* MappedOperand = MapMetadata(Operand, VMap); + if (Visited.insert(MappedOperand).second) + NewNMD->addOperand(MappedOperand); + } + } else + for (unsigned i = 0, e = NMD.getNumOperands(); i != e; ++i) + NewNMD->addOperand(MapMetadata(NMD.getOperand(i), VMap)); } return New; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/CodeExtractor.cpp index fa6d3f8ae873..682af4a88d3e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -293,10 +293,8 @@ static BasicBlock *getCommonExitBlock(const SetVector<BasicBlock *> &Blocks) { CommonExitBlock = Succ; continue; } - if (CommonExitBlock == Succ) - continue; - - return true; + if (CommonExitBlock != Succ) + return true; } return false; }; @@ -307,52 +305,79 @@ static BasicBlock *getCommonExitBlock(const SetVector<BasicBlock *> &Blocks) { return CommonExitBlock; } -bool CodeExtractor::isLegalToShrinkwrapLifetimeMarkers( - Instruction *Addr) const { - AllocaInst *AI = cast<AllocaInst>(Addr->stripInBoundsConstantOffsets()); - Function *Func = (*Blocks.begin())->getParent(); - for (BasicBlock &BB : *Func) { - if (Blocks.count(&BB)) - continue; - for (Instruction &II : BB) { - if (isa<DbgInfoIntrinsic>(II)) - continue; +CodeExtractorAnalysisCache::CodeExtractorAnalysisCache(Function &F) { + for (BasicBlock &BB : F) { + for (Instruction &II : BB.instructionsWithoutDebug()) + if (auto *AI = dyn_cast<AllocaInst>(&II)) + Allocas.push_back(AI); - unsigned Opcode = II.getOpcode(); - Value *MemAddr = nullptr; - switch (Opcode) { - case Instruction::Store: - case Instruction::Load: { - 
if (Opcode == Instruction::Store) { - StoreInst *SI = cast<StoreInst>(&II); - MemAddr = SI->getPointerOperand(); - } else { - LoadInst *LI = cast<LoadInst>(&II); - MemAddr = LI->getPointerOperand(); - } - // Global variable can not be aliased with locals. - if (dyn_cast<Constant>(MemAddr)) - break; - Value *Base = MemAddr->stripInBoundsConstantOffsets(); - if (!isa<AllocaInst>(Base) || Base == AI) - return false; + findSideEffectInfoForBlock(BB); + } +} + +void CodeExtractorAnalysisCache::findSideEffectInfoForBlock(BasicBlock &BB) { + for (Instruction &II : BB.instructionsWithoutDebug()) { + unsigned Opcode = II.getOpcode(); + Value *MemAddr = nullptr; + switch (Opcode) { + case Instruction::Store: + case Instruction::Load: { + if (Opcode == Instruction::Store) { + StoreInst *SI = cast<StoreInst>(&II); + MemAddr = SI->getPointerOperand(); + } else { + LoadInst *LI = cast<LoadInst>(&II); + MemAddr = LI->getPointerOperand(); + } + // Global variable can not be aliased with locals. + if (dyn_cast<Constant>(MemAddr)) break; + Value *Base = MemAddr->stripInBoundsConstantOffsets(); + if (!isa<AllocaInst>(Base)) { + SideEffectingBlocks.insert(&BB); + return; } - default: { - IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(&II); - if (IntrInst) { - if (IntrInst->isLifetimeStartOrEnd()) - break; - return false; - } - // Treat all the other cases conservatively if it has side effects. - if (II.mayHaveSideEffects()) - return false; + BaseMemAddrs[&BB].insert(Base); + break; + } + default: { + IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(&II); + if (IntrInst) { + if (IntrInst->isLifetimeStartOrEnd()) + break; + SideEffectingBlocks.insert(&BB); + return; } + // Treat all the other cases conservatively if it has side effects. + if (II.mayHaveSideEffects()) { + SideEffectingBlocks.insert(&BB); + return; } } + } } +} +bool CodeExtractorAnalysisCache::doesBlockContainClobberOfAddr( + BasicBlock &BB, AllocaInst *Addr) const { + if (SideEffectingBlocks.count(&BB)) + return true; + auto It = BaseMemAddrs.find(&BB); + if (It != BaseMemAddrs.end()) + return It->second.count(Addr); + return false; +} + +bool CodeExtractor::isLegalToShrinkwrapLifetimeMarkers( + const CodeExtractorAnalysisCache &CEAC, Instruction *Addr) const { + AllocaInst *AI = cast<AllocaInst>(Addr->stripInBoundsConstantOffsets()); + Function *Func = (*Blocks.begin())->getParent(); + for (BasicBlock &BB : *Func) { + if (Blocks.count(&BB)) + continue; + if (CEAC.doesBlockContainClobberOfAddr(BB, AI)) + return false; + } return true; } @@ -415,7 +440,8 @@ CodeExtractor::findOrCreateBlockForHoisting(BasicBlock *CommonExitBlock) { // outline region. If there are not other untracked uses of the address, return // the pair of markers if found; otherwise return a pair of nullptr. CodeExtractor::LifetimeMarkerInfo -CodeExtractor::getLifetimeMarkers(Instruction *Addr, +CodeExtractor::getLifetimeMarkers(const CodeExtractorAnalysisCache &CEAC, + Instruction *Addr, BasicBlock *ExitBlock) const { LifetimeMarkerInfo Info; @@ -447,7 +473,7 @@ CodeExtractor::getLifetimeMarkers(Instruction *Addr, Info.HoistLifeEnd = !definedInRegion(Blocks, Info.LifeEnd); // Do legality check. if ((Info.SinkLifeStart || Info.HoistLifeEnd) && - !isLegalToShrinkwrapLifetimeMarkers(Addr)) + !isLegalToShrinkwrapLifetimeMarkers(CEAC, Addr)) return {}; // Check to see if we have a place to do hoisting, if not, bail. 
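The net effect of this refactor is that the expensive per-function scan for allocas and side-effecting blocks now lives in CodeExtractorAnalysisCache and is computed once, while extractCodeRegion() and the lifetime-marker legality checks only consult the cache. A minimal usage sketch, assuming the usual block-list CodeExtractor constructor (only the cache-related calls are taken from this patch; the helper names are illustrative):

    // Scan OldFunc once, then reuse the cache for every region outlined from it.
    CodeExtractorAnalysisCache CEAC(*OldFunc);
    for (ArrayRef<BasicBlock *> Region : RegionsToOutline) {
      CodeExtractor CE(Region, /*DT=*/&DT);
      if (Function *Outlined = CE.extractCodeRegion(CEAC))
        ++NumOutlined;
    }
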
@@ -457,7 +483,8 @@ CodeExtractor::getLifetimeMarkers(Instruction *Addr, return Info; } -void CodeExtractor::findAllocas(ValueSet &SinkCands, ValueSet &HoistCands, +void CodeExtractor::findAllocas(const CodeExtractorAnalysisCache &CEAC, + ValueSet &SinkCands, ValueSet &HoistCands, BasicBlock *&ExitBlock) const { Function *Func = (*Blocks.begin())->getParent(); ExitBlock = getCommonExitBlock(Blocks); @@ -478,74 +505,104 @@ void CodeExtractor::findAllocas(ValueSet &SinkCands, ValueSet &HoistCands, return true; }; - for (BasicBlock &BB : *Func) { - if (Blocks.count(&BB)) + // Look up allocas in the original function in CodeExtractorAnalysisCache, as + // this is much faster than walking all the instructions. + for (AllocaInst *AI : CEAC.getAllocas()) { + BasicBlock *BB = AI->getParent(); + if (Blocks.count(BB)) continue; - for (Instruction &II : BB) { - auto *AI = dyn_cast<AllocaInst>(&II); - if (!AI) - continue; - LifetimeMarkerInfo MarkerInfo = getLifetimeMarkers(AI, ExitBlock); - bool Moved = moveOrIgnoreLifetimeMarkers(MarkerInfo); - if (Moved) { - LLVM_DEBUG(dbgs() << "Sinking alloca: " << *AI << "\n"); - SinkCands.insert(AI); - continue; - } + // As a prior call to extractCodeRegion() may have shrinkwrapped the alloca, + // check whether it is actually still in the original function. + Function *AIFunc = BB->getParent(); + if (AIFunc != Func) + continue; - // Follow any bitcasts. - SmallVector<Instruction *, 2> Bitcasts; - SmallVector<LifetimeMarkerInfo, 2> BitcastLifetimeInfo; - for (User *U : AI->users()) { - if (U->stripInBoundsConstantOffsets() == AI) { - Instruction *Bitcast = cast<Instruction>(U); - LifetimeMarkerInfo LMI = getLifetimeMarkers(Bitcast, ExitBlock); - if (LMI.LifeStart) { - Bitcasts.push_back(Bitcast); - BitcastLifetimeInfo.push_back(LMI); - continue; - } - } + LifetimeMarkerInfo MarkerInfo = getLifetimeMarkers(CEAC, AI, ExitBlock); + bool Moved = moveOrIgnoreLifetimeMarkers(MarkerInfo); + if (Moved) { + LLVM_DEBUG(dbgs() << "Sinking alloca: " << *AI << "\n"); + SinkCands.insert(AI); + continue; + } - // Found unknown use of AI. - if (!definedInRegion(Blocks, U)) { - Bitcasts.clear(); - break; + // Follow any bitcasts. + SmallVector<Instruction *, 2> Bitcasts; + SmallVector<LifetimeMarkerInfo, 2> BitcastLifetimeInfo; + for (User *U : AI->users()) { + if (U->stripInBoundsConstantOffsets() == AI) { + Instruction *Bitcast = cast<Instruction>(U); + LifetimeMarkerInfo LMI = getLifetimeMarkers(CEAC, Bitcast, ExitBlock); + if (LMI.LifeStart) { + Bitcasts.push_back(Bitcast); + BitcastLifetimeInfo.push_back(LMI); + continue; } } - // Either no bitcasts reference the alloca or there are unknown uses. - if (Bitcasts.empty()) - continue; + // Found unknown use of AI. + if (!definedInRegion(Blocks, U)) { + Bitcasts.clear(); + break; + } + } - LLVM_DEBUG(dbgs() << "Sinking alloca (via bitcast): " << *AI << "\n"); - SinkCands.insert(AI); - for (unsigned I = 0, E = Bitcasts.size(); I != E; ++I) { - Instruction *BitcastAddr = Bitcasts[I]; - const LifetimeMarkerInfo &LMI = BitcastLifetimeInfo[I]; - assert(LMI.LifeStart && - "Unsafe to sink bitcast without lifetime markers"); - moveOrIgnoreLifetimeMarkers(LMI); - if (!definedInRegion(Blocks, BitcastAddr)) { - LLVM_DEBUG(dbgs() << "Sinking bitcast-of-alloca: " << *BitcastAddr - << "\n"); - SinkCands.insert(BitcastAddr); - } + // Either no bitcasts reference the alloca or there are unknown uses. 
+ if (Bitcasts.empty()) + continue; + + LLVM_DEBUG(dbgs() << "Sinking alloca (via bitcast): " << *AI << "\n"); + SinkCands.insert(AI); + for (unsigned I = 0, E = Bitcasts.size(); I != E; ++I) { + Instruction *BitcastAddr = Bitcasts[I]; + const LifetimeMarkerInfo &LMI = BitcastLifetimeInfo[I]; + assert(LMI.LifeStart && + "Unsafe to sink bitcast without lifetime markers"); + moveOrIgnoreLifetimeMarkers(LMI); + if (!definedInRegion(Blocks, BitcastAddr)) { + LLVM_DEBUG(dbgs() << "Sinking bitcast-of-alloca: " << *BitcastAddr + << "\n"); + SinkCands.insert(BitcastAddr); } } } } +bool CodeExtractor::isEligible() const { + if (Blocks.empty()) + return false; + BasicBlock *Header = *Blocks.begin(); + Function *F = Header->getParent(); + + // For functions with varargs, check that varargs handling is only done in the + // outlined function, i.e vastart and vaend are only used in outlined blocks. + if (AllowVarArgs && F->getFunctionType()->isVarArg()) { + auto containsVarArgIntrinsic = [](const Instruction &I) { + if (const CallInst *CI = dyn_cast<CallInst>(&I)) + if (const Function *Callee = CI->getCalledFunction()) + return Callee->getIntrinsicID() == Intrinsic::vastart || + Callee->getIntrinsicID() == Intrinsic::vaend; + return false; + }; + + for (auto &BB : *F) { + if (Blocks.count(&BB)) + continue; + if (llvm::any_of(BB, containsVarArgIntrinsic)) + return false; + } + } + return true; +} + void CodeExtractor::findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs, const ValueSet &SinkCands) const { for (BasicBlock *BB : Blocks) { // If a used value is defined outside the region, it's an input. If an // instruction is used outside the region, it's an output. for (Instruction &II : *BB) { - for (User::op_iterator OI = II.op_begin(), OE = II.op_end(); OI != OE; - ++OI) { - Value *V = *OI; + for (auto &OI : II.operands()) { + Value *V = OI; if (!SinkCands.count(V) && definedInCaller(Blocks, V)) Inputs.insert(V); } @@ -748,7 +805,7 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, dbgs() << ")\n"; }); - StructType *StructTy; + StructType *StructTy = nullptr; if (AggregateArgs && (inputs.size() + outputs.size() > 0)) { StructTy = StructType::get(M->getContext(), paramTy); paramTy.clear(); @@ -904,12 +961,12 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, // within the new function. This must be done before we lose track of which // blocks were originally in the code region. std::vector<User *> Users(header->user_begin(), header->user_end()); - for (unsigned i = 0, e = Users.size(); i != e; ++i) + for (auto &U : Users) // The BasicBlock which contains the branch is not in the region // modify the branch target to a new block - if (Instruction *I = dyn_cast<Instruction>(Users[i])) - if (I->isTerminator() && !Blocks.count(I->getParent()) && - I->getParent()->getParent() == oldFunction) + if (Instruction *I = dyn_cast<Instruction>(U)) + if (I->isTerminator() && I->getFunction() == oldFunction && + !Blocks.count(I->getParent())) I->replaceUsesOfWith(header, newHeader); return newFunction; @@ -1277,13 +1334,6 @@ void CodeExtractor::moveCodeToFunction(Function *newFunction) { // Insert this basic block into the new function newBlocks.push_back(Block); - - // Remove @llvm.assume calls that were moved to the new function from the - // old function's assumption cache. 
- if (AC) - for (auto &I : *Block) - if (match(&I, m_Intrinsic<Intrinsic::assume>())) - AC->unregisterAssumption(cast<CallInst>(&I)); } } @@ -1332,7 +1382,8 @@ void CodeExtractor::calculateNewCallTerminatorWeights( MDBuilder(TI->getContext()).createBranchWeights(BranchWeights)); } -Function *CodeExtractor::extractCodeRegion() { +Function * +CodeExtractor::extractCodeRegion(const CodeExtractorAnalysisCache &CEAC) { if (!isEligible()) return nullptr; @@ -1341,27 +1392,6 @@ Function *CodeExtractor::extractCodeRegion() { BasicBlock *header = *Blocks.begin(); Function *oldFunction = header->getParent(); - // For functions with varargs, check that varargs handling is only done in the - // outlined function, i.e vastart and vaend are only used in outlined blocks. - if (AllowVarArgs && oldFunction->getFunctionType()->isVarArg()) { - auto containsVarArgIntrinsic = [](Instruction &I) { - if (const CallInst *CI = dyn_cast<CallInst>(&I)) - if (const Function *F = CI->getCalledFunction()) - return F->getIntrinsicID() == Intrinsic::vastart || - F->getIntrinsicID() == Intrinsic::vaend; - return false; - }; - - for (auto &BB : *oldFunction) { - if (Blocks.count(&BB)) - continue; - if (llvm::any_of(BB, containsVarArgIntrinsic)) - return nullptr; - } - } - ValueSet inputs, outputs, SinkingCands, HoistingCands; - BasicBlock *CommonExit = nullptr; - // Calculate the entry frequency of the new function before we change the root // block. BlockFrequency EntryFreq; @@ -1375,6 +1405,15 @@ Function *CodeExtractor::extractCodeRegion() { } } + if (AC) { + // Remove @llvm.assume calls that were moved to the new function from the + // old function's assumption cache. + for (BasicBlock *Block : Blocks) + for (auto &I : *Block) + if (match(&I, m_Intrinsic<Intrinsic::assume>())) + AC->unregisterAssumption(cast<CallInst>(&I)); + } + // If we have any return instructions in the region, split those blocks so // that the return is not in the region. splitReturnBlocks(); @@ -1428,7 +1467,9 @@ Function *CodeExtractor::extractCodeRegion() { } newFuncRoot->getInstList().push_back(BranchI); - findAllocas(SinkingCands, HoistingCands, CommonExit); + ValueSet inputs, outputs, SinkingCands, HoistingCands; + BasicBlock *CommonExit = nullptr; + findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit); assert(HoistingCands.empty() || CommonExit); // Find inputs to, outputs from the code region. @@ -1563,5 +1604,17 @@ Function *CodeExtractor::extractCodeRegion() { }); LLVM_DEBUG(if (verifyFunction(*oldFunction)) report_fatal_error("verification of oldFunction failed!")); + LLVM_DEBUG(if (AC && verifyAssumptionCache(*oldFunction, AC)) + report_fatal_error("Stale Asumption cache for old Function!")); return newFunction; } + +bool CodeExtractor::verifyAssumptionCache(const Function& F, + AssumptionCache *AC) { + for (auto AssumeVH : AC->assumptions()) { + CallInst *I = cast<CallInst>(AssumeVH); + if (I->getFunction() != &F) + return true; + } + return false; +} diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/CodeMoverUtils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/CodeMoverUtils.cpp new file mode 100644 index 000000000000..93395ac761ab --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/CodeMoverUtils.cpp @@ -0,0 +1,189 @@ +//===- CodeMoverUtils.cpp - CodeMover Utilities ----------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This family of functions perform movements on basic blocks, and instructions +// contained within a function. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/CodeMoverUtils.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/DependenceAnalysis.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Dominators.h" + +using namespace llvm; + +#define DEBUG_TYPE "codemover-utils" + +STATISTIC(HasDependences, + "Cannot move across instructions that has memory dependences"); +STATISTIC(MayThrowException, "Cannot move across instructions that may throw"); +STATISTIC(NotControlFlowEquivalent, + "Instructions are not control flow equivalent"); +STATISTIC(NotMovedPHINode, "Movement of PHINodes are not supported"); +STATISTIC(NotMovedTerminator, "Movement of Terminator are not supported"); + +bool llvm::isControlFlowEquivalent(const Instruction &I0, const Instruction &I1, + const DominatorTree &DT, + const PostDominatorTree &PDT) { + return isControlFlowEquivalent(*I0.getParent(), *I1.getParent(), DT, PDT); +} + +bool llvm::isControlFlowEquivalent(const BasicBlock &BB0, const BasicBlock &BB1, + const DominatorTree &DT, + const PostDominatorTree &PDT) { + if (&BB0 == &BB1) + return true; + + return ((DT.dominates(&BB0, &BB1) && PDT.dominates(&BB1, &BB0)) || + (PDT.dominates(&BB0, &BB1) && DT.dominates(&BB1, &BB0))); +} + +static bool reportInvalidCandidate(const Instruction &I, + llvm::Statistic &Stat) { + ++Stat; + LLVM_DEBUG(dbgs() << "Unable to move instruction: " << I << ". " + << Stat.getDesc()); + return false; +} + +/// Collect all instructions in between \p StartInst and \p EndInst, and store +/// them in \p InBetweenInsts. +static void +collectInstructionsInBetween(Instruction &StartInst, const Instruction &EndInst, + SmallPtrSetImpl<Instruction *> &InBetweenInsts) { + assert(InBetweenInsts.empty() && "Expecting InBetweenInsts to be empty"); + + /// Get the next instructions of \p I, and push them to \p WorkList. + auto getNextInsts = [](Instruction &I, + SmallPtrSetImpl<Instruction *> &WorkList) { + if (Instruction *NextInst = I.getNextNode()) + WorkList.insert(NextInst); + else { + assert(I.isTerminator() && "Expecting a terminator instruction"); + for (BasicBlock *Succ : successors(&I)) + WorkList.insert(&Succ->front()); + } + }; + + SmallPtrSet<Instruction *, 10> WorkList; + getNextInsts(StartInst, WorkList); + while (!WorkList.empty()) { + Instruction *CurInst = *WorkList.begin(); + WorkList.erase(CurInst); + + if (CurInst == &EndInst) + continue; + + if (!InBetweenInsts.insert(CurInst).second) + continue; + + getNextInsts(*CurInst, WorkList); + } +} + +bool llvm::isSafeToMoveBefore(Instruction &I, Instruction &InsertPoint, + const DominatorTree &DT, + const PostDominatorTree &PDT, + DependenceInfo &DI) { + // Cannot move itself before itself. + if (&I == &InsertPoint) + return false; + + // Not moved. + if (I.getNextNode() == &InsertPoint) + return true; + + if (isa<PHINode>(I) || isa<PHINode>(InsertPoint)) + return reportInvalidCandidate(I, NotMovedPHINode); + + if (I.isTerminator()) + return reportInvalidCandidate(I, NotMovedTerminator); + + // TODO remove this limitation. 
+ if (!isControlFlowEquivalent(I, InsertPoint, DT, PDT)) + return reportInvalidCandidate(I, NotControlFlowEquivalent); + + // As I and InsertPoint are control flow equivalent, if I dominates + // InsertPoint, then I comes before InsertPoint. + const bool MoveForward = DT.dominates(&I, &InsertPoint); + if (MoveForward) { + // When I is being moved forward, we need to make sure the InsertPoint + // dominates every users. Or else, a user may be using an undefined I. + for (const Use &U : I.uses()) + if (auto *UserInst = dyn_cast<Instruction>(U.getUser())) + if (UserInst != &InsertPoint && !DT.dominates(&InsertPoint, U)) + return false; + } else { + // When I is being moved backward, we need to make sure all its opernads + // dominates the InsertPoint. Or else, an operand may be undefined for I. + for (const Value *Op : I.operands()) + if (auto *OpInst = dyn_cast<Instruction>(Op)) + if (&InsertPoint == OpInst || !DT.dominates(OpInst, &InsertPoint)) + return false; + } + + Instruction &StartInst = (MoveForward ? I : InsertPoint); + Instruction &EndInst = (MoveForward ? InsertPoint : I); + SmallPtrSet<Instruction *, 10> InstsToCheck; + collectInstructionsInBetween(StartInst, EndInst, InstsToCheck); + if (!MoveForward) + InstsToCheck.insert(&InsertPoint); + + // Check if there exists instructions which may throw, may synchonize, or may + // never return, from I to InsertPoint. + if (!isSafeToSpeculativelyExecute(&I)) + if (std::any_of(InstsToCheck.begin(), InstsToCheck.end(), + [](Instruction *I) { + if (I->mayThrow()) + return true; + + const CallBase *CB = dyn_cast<CallBase>(I); + if (!CB) + return false; + if (!CB->hasFnAttr(Attribute::WillReturn)) + return true; + if (!CB->hasFnAttr(Attribute::NoSync)) + return true; + + return false; + })) { + return reportInvalidCandidate(I, MayThrowException); + } + + // Check if I has any output/flow/anti dependences with instructions from \p + // StartInst to \p EndInst. + if (std::any_of(InstsToCheck.begin(), InstsToCheck.end(), + [&DI, &I](Instruction *CurInst) { + auto DepResult = DI.depends(&I, CurInst, true); + if (DepResult && + (DepResult->isOutput() || DepResult->isFlow() || + DepResult->isAnti())) + return true; + return false; + })) + return reportInvalidCandidate(I, HasDependences); + + return true; +} + +void llvm::moveInstsBottomUp(BasicBlock &FromBB, BasicBlock &ToBB, + const DominatorTree &DT, + const PostDominatorTree &PDT, DependenceInfo &DI) { + for (auto It = ++FromBB.rbegin(); It != FromBB.rend();) { + Instruction *MovePos = ToBB.getFirstNonPHIOrDbg(); + Instruction &I = *It; + // Increment the iterator before modifying FromBB. + ++It; + + if (isSafeToMoveBefore(I, *MovePos, DT, PDT, DI)) + I.moveBefore(MovePos); + } +} diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/Debugify.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/Debugify.cpp new file mode 100644 index 000000000000..b7b4bfa3734d --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/Debugify.cpp @@ -0,0 +1,435 @@ +//===- Debugify.cpp - Attach synthetic debug info to everything -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file This pass attaches synthetic debug info to everything. It can be used +/// to create targeted tests for debug info preservation. 
+/// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/Debugify.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/IR/DIBuilder.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; + +namespace { + +cl::opt<bool> Quiet("debugify-quiet", + cl::desc("Suppress verbose debugify output")); + +raw_ostream &dbg() { return Quiet ? nulls() : errs(); } + +uint64_t getAllocSizeInBits(Module &M, Type *Ty) { + return Ty->isSized() ? M.getDataLayout().getTypeAllocSizeInBits(Ty) : 0; +} + +bool isFunctionSkipped(Function &F) { + return F.isDeclaration() || !F.hasExactDefinition(); +} + +/// Find the basic block's terminating instruction. +/// +/// Special care is needed to handle musttail and deopt calls, as these behave +/// like (but are in fact not) terminators. +Instruction *findTerminatingInstruction(BasicBlock &BB) { + if (auto *I = BB.getTerminatingMustTailCall()) + return I; + if (auto *I = BB.getTerminatingDeoptimizeCall()) + return I; + return BB.getTerminator(); +} + +bool applyDebugifyMetadata(Module &M, + iterator_range<Module::iterator> Functions, + StringRef Banner) { + // Skip modules with debug info. + if (M.getNamedMetadata("llvm.dbg.cu")) { + dbg() << Banner << "Skipping module with debug info\n"; + return false; + } + + DIBuilder DIB(M); + LLVMContext &Ctx = M.getContext(); + + // Get a DIType which corresponds to Ty. + DenseMap<uint64_t, DIType *> TypeCache; + auto getCachedDIType = [&](Type *Ty) -> DIType * { + uint64_t Size = getAllocSizeInBits(M, Ty); + DIType *&DTy = TypeCache[Size]; + if (!DTy) { + std::string Name = "ty" + utostr(Size); + DTy = DIB.createBasicType(Name, Size, dwarf::DW_ATE_unsigned); + } + return DTy; + }; + + unsigned NextLine = 1; + unsigned NextVar = 1; + auto File = DIB.createFile(M.getName(), "/"); + auto CU = DIB.createCompileUnit(dwarf::DW_LANG_C, File, "debugify", + /*isOptimized=*/true, "", 0); + + // Visit each instruction. + for (Function &F : Functions) { + if (isFunctionSkipped(F)) + continue; + + auto SPType = DIB.createSubroutineType(DIB.getOrCreateTypeArray(None)); + DISubprogram::DISPFlags SPFlags = + DISubprogram::SPFlagDefinition | DISubprogram::SPFlagOptimized; + if (F.hasPrivateLinkage() || F.hasInternalLinkage()) + SPFlags |= DISubprogram::SPFlagLocalToUnit; + auto SP = DIB.createFunction(CU, F.getName(), F.getName(), File, NextLine, + SPType, NextLine, DINode::FlagZero, SPFlags); + F.setSubprogram(SP); + for (BasicBlock &BB : F) { + // Attach debug locations. + for (Instruction &I : BB) + I.setDebugLoc(DILocation::get(Ctx, NextLine++, 1, SP)); + + // Inserting debug values into EH pads can break IR invariants. + if (BB.isEHPad()) + continue; + + // Find the terminating instruction, after which no debug values are + // attached. + Instruction *LastInst = findTerminatingInstruction(BB); + assert(LastInst && "Expected basic block with a terminator"); + + // Maintain an insertion point which can't be invalidated when updates + // are made. + BasicBlock::iterator InsertPt = BB.getFirstInsertionPt(); + assert(InsertPt != BB.end() && "Expected to find an insertion point"); + Instruction *InsertBefore = &*InsertPt; + + // Attach debug values. 
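+ // Illustrative note: for each value-producing instruction %x this emits a
+ // synthetic variable plus, roughly, the following call (names are placeholders):
+ //   call void @llvm.dbg.value(metadata i32 %x, metadata !var,
+ //                             metadata !DIExpression()), !dbg !loc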
+ for (Instruction *I = &*BB.begin(); I != LastInst; I = I->getNextNode()) { + // Skip void-valued instructions. + if (I->getType()->isVoidTy()) + continue; + + // Phis and EH pads must be grouped at the beginning of the block. + // Only advance the insertion point when we finish visiting these. + if (!isa<PHINode>(I) && !I->isEHPad()) + InsertBefore = I->getNextNode(); + + std::string Name = utostr(NextVar++); + const DILocation *Loc = I->getDebugLoc().get(); + auto LocalVar = DIB.createAutoVariable(SP, Name, File, Loc->getLine(), + getCachedDIType(I->getType()), + /*AlwaysPreserve=*/true); + DIB.insertDbgValueIntrinsic(I, LocalVar, DIB.createExpression(), Loc, + InsertBefore); + } + } + DIB.finalizeSubprogram(SP); + } + DIB.finalize(); + + // Track the number of distinct lines and variables. + NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.debugify"); + auto *IntTy = Type::getInt32Ty(Ctx); + auto addDebugifyOperand = [&](unsigned N) { + NMD->addOperand(MDNode::get( + Ctx, ValueAsMetadata::getConstant(ConstantInt::get(IntTy, N)))); + }; + addDebugifyOperand(NextLine - 1); // Original number of lines. + addDebugifyOperand(NextVar - 1); // Original number of variables. + assert(NMD->getNumOperands() == 2 && + "llvm.debugify should have exactly 2 operands!"); + + // Claim that this synthetic debug info is valid. + StringRef DIVersionKey = "Debug Info Version"; + if (!M.getModuleFlag(DIVersionKey)) + M.addModuleFlag(Module::Warning, DIVersionKey, DEBUG_METADATA_VERSION); + + return true; +} + +/// Return true if a mis-sized diagnostic is issued for \p DVI. +bool diagnoseMisSizedDbgValue(Module &M, DbgValueInst *DVI) { + // The size of a dbg.value's value operand should match the size of the + // variable it corresponds to. + // + // TODO: This, along with a check for non-null value operands, should be + // promoted to verifier failures. + Value *V = DVI->getValue(); + if (!V) + return false; + + // For now, don't try to interpret anything more complicated than an empty + // DIExpression. Eventually we should try to handle OP_deref and fragments. + if (DVI->getExpression()->getNumElements()) + return false; + + Type *Ty = V->getType(); + uint64_t ValueOperandSize = getAllocSizeInBits(M, Ty); + Optional<uint64_t> DbgVarSize = DVI->getFragmentSizeInBits(); + if (!ValueOperandSize || !DbgVarSize) + return false; + + bool HasBadSize = false; + if (Ty->isIntegerTy()) { + auto Signedness = DVI->getVariable()->getSignedness(); + if (Signedness && *Signedness == DIBasicType::Signedness::Signed) + HasBadSize = ValueOperandSize < *DbgVarSize; + } else { + HasBadSize = ValueOperandSize != *DbgVarSize; + } + + if (HasBadSize) { + dbg() << "ERROR: dbg.value operand has size " << ValueOperandSize + << ", but its variable has size " << *DbgVarSize << ": "; + DVI->print(dbg()); + dbg() << "\n"; + } + return HasBadSize; +} + +bool checkDebugifyMetadata(Module &M, + iterator_range<Module::iterator> Functions, + StringRef NameOfWrappedPass, StringRef Banner, + bool Strip, DebugifyStatsMap *StatsMap) { + // Skip modules without debugify metadata. 
+ NamedMDNode *NMD = M.getNamedMetadata("llvm.debugify"); + if (!NMD) { + dbg() << Banner << "Skipping module without debugify metadata\n"; + return false; + } + + auto getDebugifyOperand = [&](unsigned Idx) -> unsigned { + return mdconst::extract<ConstantInt>(NMD->getOperand(Idx)->getOperand(0)) + ->getZExtValue(); + }; + assert(NMD->getNumOperands() == 2 && + "llvm.debugify should have exactly 2 operands!"); + unsigned OriginalNumLines = getDebugifyOperand(0); + unsigned OriginalNumVars = getDebugifyOperand(1); + bool HasErrors = false; + + // Track debug info loss statistics if able. + DebugifyStatistics *Stats = nullptr; + if (StatsMap && !NameOfWrappedPass.empty()) + Stats = &StatsMap->operator[](NameOfWrappedPass); + + BitVector MissingLines{OriginalNumLines, true}; + BitVector MissingVars{OriginalNumVars, true}; + for (Function &F : Functions) { + if (isFunctionSkipped(F)) + continue; + + // Find missing lines. + for (Instruction &I : instructions(F)) { + if (isa<DbgValueInst>(&I)) + continue; + + auto DL = I.getDebugLoc(); + if (DL && DL.getLine() != 0) { + MissingLines.reset(DL.getLine() - 1); + continue; + } + + if (!DL) { + dbg() << "ERROR: Instruction with empty DebugLoc in function "; + dbg() << F.getName() << " --"; + I.print(dbg()); + dbg() << "\n"; + HasErrors = true; + } + } + + // Find missing variables and mis-sized debug values. + for (Instruction &I : instructions(F)) { + auto *DVI = dyn_cast<DbgValueInst>(&I); + if (!DVI) + continue; + + unsigned Var = ~0U; + (void)to_integer(DVI->getVariable()->getName(), Var, 10); + assert(Var <= OriginalNumVars && "Unexpected name for DILocalVariable"); + bool HasBadSize = diagnoseMisSizedDbgValue(M, DVI); + if (!HasBadSize) + MissingVars.reset(Var - 1); + HasErrors |= HasBadSize; + } + } + + // Print the results. + for (unsigned Idx : MissingLines.set_bits()) + dbg() << "WARNING: Missing line " << Idx + 1 << "\n"; + + for (unsigned Idx : MissingVars.set_bits()) + dbg() << "WARNING: Missing variable " << Idx + 1 << "\n"; + + // Update DI loss statistics. + if (Stats) { + Stats->NumDbgLocsExpected += OriginalNumLines; + Stats->NumDbgLocsMissing += MissingLines.count(); + Stats->NumDbgValuesExpected += OriginalNumVars; + Stats->NumDbgValuesMissing += MissingVars.count(); + } + + dbg() << Banner; + if (!NameOfWrappedPass.empty()) + dbg() << " [" << NameOfWrappedPass << "]"; + dbg() << ": " << (HasErrors ? "FAIL" : "PASS") << '\n'; + + // Strip the Debugify Metadata if required. + if (Strip) { + StripDebugInfo(M); + M.eraseNamedMetadata(NMD); + return true; + } + + return false; +} + +/// ModulePass for attaching synthetic debug info to everything, used with the +/// legacy module pass manager. +struct DebugifyModulePass : public ModulePass { + bool runOnModule(Module &M) override { + return applyDebugifyMetadata(M, M.functions(), "ModuleDebugify: "); + } + + DebugifyModulePass() : ModulePass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } + + static char ID; // Pass identification. +}; + +/// FunctionPass for attaching synthetic debug info to instructions within a +/// single function, used with the legacy module pass manager. 
+struct DebugifyFunctionPass : public FunctionPass { + bool runOnFunction(Function &F) override { + Module &M = *F.getParent(); + auto FuncIt = F.getIterator(); + return applyDebugifyMetadata(M, make_range(FuncIt, std::next(FuncIt)), + "FunctionDebugify: "); + } + + DebugifyFunctionPass() : FunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } + + static char ID; // Pass identification. +}; + +/// ModulePass for checking debug info inserted by -debugify, used with the +/// legacy module pass manager. +struct CheckDebugifyModulePass : public ModulePass { + bool runOnModule(Module &M) override { + return checkDebugifyMetadata(M, M.functions(), NameOfWrappedPass, + "CheckModuleDebugify", Strip, StatsMap); + } + + CheckDebugifyModulePass(bool Strip = false, StringRef NameOfWrappedPass = "", + DebugifyStatsMap *StatsMap = nullptr) + : ModulePass(ID), Strip(Strip), NameOfWrappedPass(NameOfWrappedPass), + StatsMap(StatsMap) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } + + static char ID; // Pass identification. + +private: + bool Strip; + StringRef NameOfWrappedPass; + DebugifyStatsMap *StatsMap; +}; + +/// FunctionPass for checking debug info inserted by -debugify-function, used +/// with the legacy module pass manager. +struct CheckDebugifyFunctionPass : public FunctionPass { + bool runOnFunction(Function &F) override { + Module &M = *F.getParent(); + auto FuncIt = F.getIterator(); + return checkDebugifyMetadata(M, make_range(FuncIt, std::next(FuncIt)), + NameOfWrappedPass, "CheckFunctionDebugify", + Strip, StatsMap); + } + + CheckDebugifyFunctionPass(bool Strip = false, + StringRef NameOfWrappedPass = "", + DebugifyStatsMap *StatsMap = nullptr) + : FunctionPass(ID), Strip(Strip), NameOfWrappedPass(NameOfWrappedPass), + StatsMap(StatsMap) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } + + static char ID; // Pass identification. 
+ +private: + bool Strip; + StringRef NameOfWrappedPass; + DebugifyStatsMap *StatsMap; +}; + +} // end anonymous namespace + +ModulePass *createDebugifyModulePass() { return new DebugifyModulePass(); } + +FunctionPass *createDebugifyFunctionPass() { + return new DebugifyFunctionPass(); +} + +PreservedAnalyses NewPMDebugifyPass::run(Module &M, ModuleAnalysisManager &) { + applyDebugifyMetadata(M, M.functions(), "ModuleDebugify: "); + return PreservedAnalyses::all(); +} + +ModulePass *createCheckDebugifyModulePass(bool Strip, + StringRef NameOfWrappedPass, + DebugifyStatsMap *StatsMap) { + return new CheckDebugifyModulePass(Strip, NameOfWrappedPass, StatsMap); +} + +FunctionPass *createCheckDebugifyFunctionPass(bool Strip, + StringRef NameOfWrappedPass, + DebugifyStatsMap *StatsMap) { + return new CheckDebugifyFunctionPass(Strip, NameOfWrappedPass, StatsMap); +} + +PreservedAnalyses NewPMCheckDebugifyPass::run(Module &M, + ModuleAnalysisManager &) { + checkDebugifyMetadata(M, M.functions(), "", "CheckModuleDebugify", false, + nullptr); + return PreservedAnalyses::all(); +} + +char DebugifyModulePass::ID = 0; +static RegisterPass<DebugifyModulePass> DM("debugify", + "Attach debug info to everything"); + +char CheckDebugifyModulePass::ID = 0; +static RegisterPass<CheckDebugifyModulePass> + CDM("check-debugify", "Check debug info from -debugify"); + +char DebugifyFunctionPass::ID = 0; +static RegisterPass<DebugifyFunctionPass> DF("debugify-function", + "Attach debug info to a function"); + +char CheckDebugifyFunctionPass::ID = 0; +static RegisterPass<CheckDebugifyFunctionPass> + CDF("check-debugify-function", "Check debug info from -debugify-function"); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp index 4aa40eeadda4..651f776a4915 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp @@ -13,6 +13,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Utils.h" using namespace llvm; @@ -24,7 +25,7 @@ static void insertCall(Function &CurFn, StringRef Func, if (Func == "mcount" || Func == ".mcount" || - Func == "\01__gnu_mcount_nc" || + Func == "llvm.arm.gnu.eabi.mcount" || Func == "\01_mcount" || Func == "\01mcount" || Func == "__mcount" || diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/Evaluator.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/Evaluator.cpp index 0e203f4e075d..ad36790b8c6a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/Evaluator.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/Evaluator.cpp @@ -469,7 +469,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, return false; // Cannot handle array allocs. 
} Type *Ty = AI->getAllocatedType(); - AllocaTmps.push_back(llvm::make_unique<GlobalVariable>( + AllocaTmps.push_back(std::make_unique<GlobalVariable>( Ty, false, GlobalValue::InternalLinkage, UndefValue::get(Ty), AI->getName(), /*TLMode=*/GlobalValue::NotThreadLocal, AI->getType()->getPointerAddressSpace())); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/FlattenCFG.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/FlattenCFG.cpp index 0c52e6f3703b..893f23eb6048 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/FlattenCFG.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/FlattenCFG.cpp @@ -67,7 +67,7 @@ public: /// Before: /// ...... /// %cmp10 = fcmp une float %tmp1, %tmp2 -/// br i1 %cmp1, label %if.then, label %lor.rhs +/// br i1 %cmp10, label %if.then, label %lor.rhs /// /// lor.rhs: /// ...... @@ -251,8 +251,8 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) { bool EverChanged = false; for (; CurrBlock != FirstCondBlock; CurrBlock = CurrBlock->getSinglePredecessor()) { - BranchInst *BI = dyn_cast<BranchInst>(CurrBlock->getTerminator()); - CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition()); + auto *BI = cast<BranchInst>(CurrBlock->getTerminator()); + auto *CI = dyn_cast<CmpInst>(BI->getCondition()); if (!CI) continue; @@ -278,7 +278,7 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) { // Do the transformation. BasicBlock *CB; - BranchInst *PBI = dyn_cast<BranchInst>(FirstCondBlock->getTerminator()); + BranchInst *PBI = cast<BranchInst>(FirstCondBlock->getTerminator()); bool Iteration = true; IRBuilder<>::InsertPointGuard Guard(Builder); Value *PC = PBI->getCondition(); @@ -444,7 +444,7 @@ bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) { FirstEntryBlock->getInstList().pop_back(); FirstEntryBlock->getInstList() .splice(FirstEntryBlock->end(), SecondEntryBlock->getInstList()); - BranchInst *PBI = dyn_cast<BranchInst>(FirstEntryBlock->getTerminator()); + BranchInst *PBI = cast<BranchInst>(FirstEntryBlock->getTerminator()); Value *CC = PBI->getCondition(); BasicBlock *SaveInsertBB = Builder.GetInsertBlock(); BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint(); @@ -453,6 +453,16 @@ bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) { PBI->replaceUsesOfWith(CC, NC); Builder.SetInsertPoint(SaveInsertBB, SaveInsertPt); + // Handle PHI node to replace its predecessors to FirstEntryBlock. + for (BasicBlock *Succ : successors(PBI)) { + for (PHINode &Phi : Succ->phis()) { + for (unsigned i = 0, e = Phi.getNumIncomingValues(); i != e; ++i) { + if (Phi.getIncomingBlock(i) == SecondEntryBlock) + Phi.setIncomingBlock(i, FirstEntryBlock); + } + } + } + // Remove IfTrue1 if (IfTrue1 != FirstEntryBlock) { IfTrue1->dropAllReferences(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp index c9cc0990f237..26d48ee0d23f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp @@ -12,13 +12,16 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/FunctionImportUtils.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/InstIterator.h" using namespace llvm; /// Checks if we should import SGV as a definition, otherwise import as a /// declaration. 
bool FunctionImportGlobalProcessing::doImportAsDefinition( - const GlobalValue *SGV, SetVector<GlobalValue *> *GlobalsToImport) { + const GlobalValue *SGV) { + if (!isPerformingImport()) + return false; // Only import the globals requested for importing. if (!GlobalsToImport->count(const_cast<GlobalValue *>(SGV))) @@ -31,16 +34,8 @@ bool FunctionImportGlobalProcessing::doImportAsDefinition( return true; } -bool FunctionImportGlobalProcessing::doImportAsDefinition( - const GlobalValue *SGV) { - if (!isPerformingImport()) - return false; - return FunctionImportGlobalProcessing::doImportAsDefinition(SGV, - GlobalsToImport); -} - bool FunctionImportGlobalProcessing::shouldPromoteLocalToGlobal( - const GlobalValue *SGV) { + const GlobalValue *SGV, ValueInfo VI) { assert(SGV->hasLocalLinkage()); // Both the imported references and the original local variable must // be promoted. @@ -65,7 +60,7 @@ bool FunctionImportGlobalProcessing::shouldPromoteLocalToGlobal( // (so the source file name and resulting GUID is the same). Find the one // in this module. auto Summary = ImportIndex.findSummaryInModule( - SGV->getGUID(), SGV->getParent()->getModuleIdentifier()); + VI, SGV->getParent()->getModuleIdentifier()); assert(Summary && "Missing summary for global value when exporting"); auto Linkage = Summary->linkage(); if (!GlobalValue::isLocalLinkage(Linkage)) { @@ -91,18 +86,15 @@ bool FunctionImportGlobalProcessing::isNonRenamableLocal( } #endif -std::string FunctionImportGlobalProcessing::getName(const GlobalValue *SGV, - bool DoPromote) { +std::string +FunctionImportGlobalProcessing::getPromotedName(const GlobalValue *SGV) { + assert(SGV->hasLocalLinkage()); // For locals that must be promoted to global scope, ensure that // the promoted name uniquely identifies the copy in the original module, - // using the ID assigned during combined index creation. When importing, - // we rename all locals (not just those that are promoted) in order to - // avoid naming conflicts between locals imported from different modules. - if (SGV->hasLocalLinkage() && (DoPromote || isPerformingImport())) - return ModuleSummaryIndex::getGlobalNameForLocal( - SGV->getName(), - ImportIndex.getModuleHash(SGV->getParent()->getModuleIdentifier())); - return SGV->getName(); + // using the ID assigned during combined index creation. + return ModuleSummaryIndex::getGlobalNameForLocal( + SGV->getName(), + ImportIndex.getModuleHash(SGV->getParent()->getModuleIdentifier())); } GlobalValue::LinkageTypes @@ -210,7 +202,7 @@ void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) { if (Function *F = dyn_cast<Function>(&GV)) { if (!F->isDeclaration()) { for (auto &S : VI.getSummaryList()) { - FunctionSummary *FS = dyn_cast<FunctionSummary>(S->getBaseObject()); + auto *FS = cast<FunctionSummary>(S->getBaseObject()); if (FS->modulePath() == M.getModuleIdentifier()) { F->setEntryCount(Function::ProfileCount(FS->entryCount(), Function::PCT_Synthetic)); @@ -229,6 +221,11 @@ void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) { } } + // We should always have a ValueInfo (i.e. GV in index) for definitions when + // we are exporting, and also when importing that value. + assert(VI || GV.isDeclaration() || + (isPerformingImport() && !doImportAsDefinition(&GV))); + // Mark read/write-only variables which can be imported with specific // attribute. 
We can't internalize them now because IRMover will fail // to link variable definitions to their external declarations during @@ -238,27 +235,42 @@ void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) { // If global value dead stripping is not enabled in summary then // propagateConstants hasn't been run. We can't internalize GV // in such case. - if (!GV.isDeclaration() && VI && ImportIndex.withGlobalValueDeadStripping()) { - const auto &SL = VI.getSummaryList(); - auto *GVS = SL.empty() ? nullptr : dyn_cast<GlobalVarSummary>(SL[0].get()); - // At this stage "maybe" is "definitely" - if (GVS && (GVS->maybeReadOnly() || GVS->maybeWriteOnly())) - cast<GlobalVariable>(&GV)->addAttribute("thinlto-internalize"); + if (!GV.isDeclaration() && VI && ImportIndex.withAttributePropagation()) { + if (GlobalVariable *V = dyn_cast<GlobalVariable>(&GV)) { + // We can have more than one local with the same GUID, in the case of + // same-named locals in different but same-named source files that were + // compiled in their respective directories (so the source file name + // and resulting GUID is the same). Find the one in this module. + // Handle the case where there is no summary found in this module. That + // can happen in the distributed ThinLTO backend, because the index only + // contains summaries from the source modules if they are being imported. + // We might have a non-null VI and get here even in that case if the name + // matches one in this module (e.g. weak or appending linkage). + auto *GVS = dyn_cast_or_null<GlobalVarSummary>( + ImportIndex.findSummaryInModule(VI, M.getModuleIdentifier())); + if (GVS && + (ImportIndex.isReadOnly(GVS) || ImportIndex.isWriteOnly(GVS))) { + V->addAttribute("thinlto-internalize"); + // Objects referenced by writeonly GV initializer should not be + // promoted, because there is no any kind of read access to them + // on behalf of this writeonly GV. To avoid promotion we convert + // GV initializer to 'zeroinitializer'. This effectively drops + // references in IR module (not in combined index), so we can + // ignore them when computing import. We do not export references + // of writeonly object. See computeImportForReferencedGlobals + if (ImportIndex.isWriteOnly(GVS)) + V->setInitializer(Constant::getNullValue(V->getValueType())); + } + } } - bool DoPromote = false; - if (GV.hasLocalLinkage() && - ((DoPromote = shouldPromoteLocalToGlobal(&GV)) || isPerformingImport())) { + if (GV.hasLocalLinkage() && shouldPromoteLocalToGlobal(&GV, VI)) { // Save the original name string before we rename GV below. auto Name = GV.getName().str(); - // Once we change the name or linkage it is difficult to determine - // again whether we should promote since shouldPromoteLocalToGlobal needs - // to locate the summary (based on GUID from name and linkage). Therefore, - // use DoPromote result saved above. - GV.setName(getName(&GV, DoPromote)); - GV.setLinkage(getLinkage(&GV, DoPromote)); - if (!GV.hasLocalLinkage()) - GV.setVisibility(GlobalValue::HiddenVisibility); + GV.setName(getPromotedName(&GV)); + GV.setLinkage(getLinkage(&GV, /* DoPromote */ true)); + assert(!GV.hasLocalLinkage()); + GV.setVisibility(GlobalValue::HiddenVisibility); // If we are renaming a COMDAT leader, ensure that we record the COMDAT // for later renaming as well. This is required for COFF. 
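At the IR level, promotion plus the new write-only handling behaves roughly as follows (the symbol name and hash suffix below are illustrative, not taken from the patch):

    ; before: a local in the source module
    @counter = internal global i32 0
    ; after promotion for ThinLTO: renamed with the module-hash suffix produced by
    ; getGlobalNameForLocal, given non-local linkage, and marked hidden
    @counter.llvm.123456789 = hidden global i32 0
    ; if the combined index proves the variable write-only, this patch additionally
    ; tags it "thinlto-internalize" and resets its initializer to zeroinitializer
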
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/GuardUtils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/GuardUtils.cpp index 34c32d9c0c98..4cfc9358499a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/GuardUtils.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/GuardUtils.cpp @@ -10,13 +10,17 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/GuardUtils.h" +#include "llvm/Analysis/GuardUtils.h" #include "llvm/IR/Function.h" -#include "llvm/IR/Instructions.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" using namespace llvm; +using namespace llvm::PatternMatch; static cl::opt<uint32_t> PredicatePassBranchWeight( "guards-predicate-pass-branch-weight", cl::Hidden, cl::init(1 << 20), @@ -24,7 +28,7 @@ static cl::opt<uint32_t> PredicatePassBranchWeight( "reciprocal of this value (default = 1 << 20)")); void llvm::makeGuardControlFlowExplicit(Function *DeoptIntrinsic, - CallInst *Guard) { + CallInst *Guard, bool UseWC) { OperandBundleDef DeoptOB(*Guard->getOperandBundle(LLVMContext::OB_deopt)); SmallVector<Value *, 4> Args(std::next(Guard->arg_begin()), Guard->arg_end()); @@ -60,4 +64,63 @@ void llvm::makeGuardControlFlowExplicit(Function *DeoptIntrinsic, DeoptCall->setCallingConv(Guard->getCallingConv()); DeoptBlockTerm->eraseFromParent(); + + if (UseWC) { + // We want the guard to be expressed as explicit control flow, but still be + // widenable. For that, we add Widenable Condition intrinsic call to the + // guard's condition. + IRBuilder<> B(CheckBI); + auto *WC = B.CreateIntrinsic(Intrinsic::experimental_widenable_condition, + {}, {}, nullptr, "widenable_cond"); + CheckBI->setCondition(B.CreateAnd(CheckBI->getCondition(), WC, + "exiplicit_guard_cond")); + assert(isWidenableBranch(CheckBI) && "sanity check"); + } +} + + +void llvm::widenWidenableBranch(BranchInst *WidenableBR, Value *NewCond) { + assert(isWidenableBranch(WidenableBR) && "precondition"); + + // The tempting trivially option is to produce something like this: + // br (and oldcond, newcond) where oldcond is assumed to contain a widenable + // condition, but that doesn't match the pattern parseWidenableBranch expects + // so we have to be more sophisticated. + + Use *C, *WC; + BasicBlock *IfTrueBB, *IfFalseBB; + parseWidenableBranch(WidenableBR, C, WC, IfTrueBB, IfFalseBB); + if (!C) { + // br (wc()), ... form + IRBuilder<> B(WidenableBR); + WidenableBR->setCondition(B.CreateAnd(NewCond, WC->get())); + } else { + // br (wc & C), ... form + IRBuilder<> B(WidenableBR); + C->set(B.CreateAnd(NewCond, C->get())); + Instruction *WCAnd = cast<Instruction>(WidenableBR->getCondition()); + // Condition is only guaranteed to dominate branch + WCAnd->moveBefore(WidenableBR); + } + assert(isWidenableBranch(WidenableBR) && "preserve widenabiliy"); +} + +void llvm::setWidenableBranchCond(BranchInst *WidenableBR, Value *NewCond) { + assert(isWidenableBranch(WidenableBR) && "precondition"); + + Use *C, *WC; + BasicBlock *IfTrueBB, *IfFalseBB; + parseWidenableBranch(WidenableBR, C, WC, IfTrueBB, IfFalseBB); + if (!C) { + // br (wc()), ... form + IRBuilder<> B(WidenableBR); + WidenableBR->setCondition(B.CreateAnd(NewCond, WC->get())); + } else { + // br (wc & C), ... 
form + Instruction *WCAnd = cast<Instruction>(WidenableBR->getCondition()); + // Condition is only guaranteed to dominate branch + WCAnd->moveBefore(WidenableBR); + C->set(NewCond); + } + assert(isWidenableBranch(WidenableBR) && "preserve widenabiliy"); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp index 8041e66e6c4c..ea93f99d69e3 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp @@ -25,8 +25,8 @@ ImportedFunctionsInliningStatistics::createInlineGraphNode(const Function &F) { auto &ValueLookup = NodesMap[F.getName()]; if (!ValueLookup) { - ValueLookup = llvm::make_unique<InlineGraphNode>(); - ValueLookup->Imported = F.getMetadata("thinlto_src_module") != nullptr; + ValueLookup = std::make_unique<InlineGraphNode>(); + ValueLookup->Imported = F.hasMetadata("thinlto_src_module"); } return *ValueLookup; } @@ -64,7 +64,7 @@ void ImportedFunctionsInliningStatistics::setModuleInfo(const Module &M) { if (F.isDeclaration()) continue; AllFunctions++; - ImportedFunctions += int(F.getMetadata("thinlto_src_module") != nullptr); + ImportedFunctions += int(F.hasMetadata("thinlto_src_module")); } } static std::string getStatString(const char *Msg, int32_t Fraction, int32_t All, diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp new file mode 100644 index 000000000000..9192e74b9ace --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp @@ -0,0 +1,186 @@ +//===- InjectTLIMAppings.cpp - TLI to VFABI attribute injection ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Populates the VFABI attribute with the scalar-to-vector mappings +// from the TargetLibraryInfo. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/InjectTLIMappings.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/VectorUtils.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "inject-tli-mappings" + +STATISTIC(NumCallInjected, + "Number of calls in which the mappings have been injected."); + +STATISTIC(NumVFDeclAdded, + "Number of function declarations that have been added."); +STATISTIC(NumCompUsedAdded, + "Number of `@llvm.compiler.used` operands that have been added."); + +/// Helper function to map the TLI name to a strings that holds +/// scalar-to-vector mapping. +/// +/// _ZGV<isa><mask><vlen><vparams>_<scalarname>(<vectorname>) +/// +/// where: +/// +/// <isa> = "_LLVM_" +/// <mask> = "N". Note: TLI does not support masked interfaces. +/// <vlen> = Number of concurrent lanes, stored in the `VectorizationFactor` +/// field of the `VecDesc` struct. +/// <vparams> = "v", as many as are the number of parameters of CI. +/// <scalarname> = the name of the scalar function called by CI. +/// <vectorname> = the name of the vector function mapped by the TLI. 
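+///
+/// For example, assuming a scalar call to `double sin(double)` that the TLI
+/// maps to the 2-lane routine `_ZGVbN2v_sin`, the mangled name produced for
+/// VF = 2 would be `_ZGV_LLVM_N2v_sin(_ZGVbN2v_sin)`.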
+static std::string mangleTLIName(StringRef VectorName, const CallInst &CI,
+                                 unsigned VF) {
+  SmallString<256> Buffer;
+  llvm::raw_svector_ostream Out(Buffer);
+  Out << "_ZGV" << VFABI::_LLVM_ << "N" << VF;
+  for (unsigned I = 0; I < CI.getNumArgOperands(); ++I)
+    Out << "v";
+  Out << "_" << CI.getCalledFunction()->getName() << "(" << VectorName << ")";
+  return Out.str();
+}
+
+/// A helper function for converting Scalar types to vector types.
+/// If the incoming type is void, we return void. If the VF is 1, we return
+/// the scalar type.
+static Type *ToVectorTy(Type *Scalar, unsigned VF, bool isScalable = false) {
+  if (Scalar->isVoidTy() || VF == 1)
+    return Scalar;
+  return VectorType::get(Scalar, {VF, isScalable});
+}
+
+/// A helper function that adds the vector function declaration that
+/// vectorizes the CallInst CI with a vectorization factor of VF
+/// lanes. The TLI assumes that all parameters and the return type of
+/// CI (other than void) need to be widened to a VectorType of VF
+/// lanes.
+static void addVariantDeclaration(CallInst &CI, const unsigned VF,
+                                  const StringRef VFName) {
+  Module *M = CI.getModule();
+
+  // Add function declaration.
+  Type *RetTy = ToVectorTy(CI.getType(), VF);
+  SmallVector<Type *, 4> Tys;
+  for (Value *ArgOperand : CI.arg_operands())
+    Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
+  assert(!CI.getFunctionType()->isVarArg() &&
+         "VarArg functions are not supported.");
+  FunctionType *FTy = FunctionType::get(RetTy, Tys, /*isVarArg=*/false);
+  Function *VectorF =
+      Function::Create(FTy, Function::ExternalLinkage, VFName, M);
+  VectorF->copyAttributesFrom(CI.getCalledFunction());
+  ++NumVFDeclAdded;
+  LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Added to the module: `" << VFName
+                    << "` of type " << *(VectorF->getType()) << "\n");
+
+  // Make function declaration (without a body) "sticky" in the IR by
+  // listing it in the @llvm.compiler.used intrinsic.
+  assert(!VectorF->size() && "VFABI attribute requires `@llvm.compiler.used` "
+                             "only on declarations.");
+  appendToCompilerUsed(*M, {VectorF});
+  LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Adding `" << VFName
+                    << "` to `@llvm.compiler.used`.\n");
+  ++NumCompUsedAdded;
+}
+
+static void addMappingsFromTLI(const TargetLibraryInfo &TLI, CallInst &CI) {
+  // This is needed to make sure we don't query the TLI for calls to
+  // bitcasts of function pointers, like `%call = call i32 (i32*, ...)
+  // bitcast (i32 (...)* @goo to i32 (i32*, ...)*)(i32* nonnull %i)`,
+  // as such calls make `isFunctionVectorizable` raise an
+  // exception.
+  if (CI.isNoBuiltin() || !CI.getCalledFunction())
+    return;
+
+  const std::string ScalarName = CI.getCalledFunction()->getName();
+  // Nothing to be done if the TLI thinks the function is not
+  // vectorizable.
+  if (!TLI.isFunctionVectorizable(ScalarName))
+    return;
+  SmallVector<std::string, 8> Mappings;
+  VFABI::getVectorVariantNames(CI, Mappings);
+  Module *M = CI.getModule();
+  const SetVector<StringRef> OriginalSetOfMappings(Mappings.begin(),
+                                                   Mappings.end());
+  // All VFs in the TLI are powers of 2.
+ for (unsigned VF = 2, WidestVF = TLI.getWidestVF(ScalarName); VF <= WidestVF; + VF *= 2) { + const std::string TLIName = TLI.getVectorizedFunction(ScalarName, VF); + if (!TLIName.empty()) { + std::string MangledName = mangleTLIName(TLIName, CI, VF); + if (!OriginalSetOfMappings.count(MangledName)) { + Mappings.push_back(MangledName); + ++NumCallInjected; + } + Function *VariantF = M->getFunction(TLIName); + if (!VariantF) + addVariantDeclaration(CI, VF, TLIName); + } + } + + VFABI::setVectorVariantNames(&CI, Mappings); +} + +static bool runImpl(const TargetLibraryInfo &TLI, Function &F) { + for (auto &I : instructions(F)) + if (auto CI = dyn_cast<CallInst>(&I)) + addMappingsFromTLI(TLI, *CI); + // Even if the pass adds IR attributes, the analyses are preserved. + return false; +} + +//////////////////////////////////////////////////////////////////////////////// +// New pass manager implementation. +//////////////////////////////////////////////////////////////////////////////// +PreservedAnalyses InjectTLIMappings::run(Function &F, + FunctionAnalysisManager &AM) { + const TargetLibraryInfo &TLI = AM.getResult<TargetLibraryAnalysis>(F); + runImpl(TLI, F); + // Even if the pass adds IR attributes, the analyses are preserved. + return PreservedAnalyses::all(); +} + +//////////////////////////////////////////////////////////////////////////////// +// Legacy PM Implementation. +//////////////////////////////////////////////////////////////////////////////// +bool InjectTLIMappingsLegacy::runOnFunction(Function &F) { + const TargetLibraryInfo &TLI = + getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); + return runImpl(TLI, F); +} + +void InjectTLIMappingsLegacy::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addPreserved<TargetLibraryInfoWrapperPass>(); +} + +//////////////////////////////////////////////////////////////////////////////// +// Legacy Pass manager initialization +//////////////////////////////////////////////////////////////////////////////// +char InjectTLIMappingsLegacy::ID = 0; + +INITIALIZE_PASS_BEGIN(InjectTLIMappingsLegacy, DEBUG_TYPE, + "Inject TLI Mappings", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(InjectTLIMappingsLegacy, DEBUG_TYPE, "Inject TLI Mappings", + false, false) + +FunctionPass *llvm::createInjectTLIMappingsLegacyPass() { + return new InjectTLIMappingsLegacy(); +} diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp index a7f0f7ac5d61..6da612eb4e65 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -1254,7 +1254,8 @@ static void HandleByValArgumentInit(Value *Dst, Value *Src, Module *M, // Always generate a memcpy of alignment 1 here because we don't know // the alignment of the src pointer. Other optimizations can infer // better alignment. - Builder.CreateMemCpy(Dst, /*DstAlign*/1, Src, /*SrcAlign*/1, Size); + Builder.CreateMemCpy(Dst, /*DstAlign*/ Align::None(), Src, + /*SrcAlign*/ Align::None(), Size); } /// When inlining a call site that has a byval argument, @@ -1293,16 +1294,16 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall, } // Create the alloca. If we have DataLayout, use nice alignment. 
-  unsigned Align = DL.getPrefTypeAlignment(AggTy);
+  Align Alignment(DL.getPrefTypeAlignment(AggTy));
 
   // If the byval had an alignment specified, we *must* use at least that
   // alignment, as it is required by the byval argument (and uses of the
   // pointer inside the callee).
-  Align = std::max(Align, ByValAlignment);
+  Alignment = max(Alignment, MaybeAlign(ByValAlignment));
 
-  Value *NewAlloca = new AllocaInst(AggTy, DL.getAllocaAddrSpace(),
-                                    nullptr, Align, Arg->getName(),
-                                    &*Caller->begin()->begin());
+  Value *NewAlloca =
+      new AllocaInst(AggTy, DL.getAllocaAddrSpace(), nullptr, Alignment,
+                     Arg->getName(), &*Caller->begin()->begin());
   IFI.StaticAllocas.push_back(cast<AllocaInst>(NewAlloca));
 
   // Uses of the argument in the function should use our new alloca
@@ -1405,6 +1406,10 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI,
   // other.
   DenseMap<const MDNode *, MDNode *> IANodes;
 
+  // Check if we are not generating inline line tables and want to use
+  // the call site location instead.
+  bool NoInlineLineTables = Fn->hasFnAttribute("no-inline-line-tables");
+
   for (; FI != Fn->end(); ++FI) {
     for (BasicBlock::iterator BI = FI->begin(), BE = FI->end();
          BI != BE; ++BI) {
@@ -1416,20 +1421,22 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI,
         BI->setMetadata(LLVMContext::MD_loop, NewLoopID);
       }
 
-      if (DebugLoc DL = BI->getDebugLoc()) {
-        DebugLoc IDL =
-            inlineDebugLoc(DL, InlinedAtNode, BI->getContext(), IANodes);
-        BI->setDebugLoc(IDL);
-        continue;
-      }
+      if (!NoInlineLineTables)
+        if (DebugLoc DL = BI->getDebugLoc()) {
+          DebugLoc IDL =
+              inlineDebugLoc(DL, InlinedAtNode, BI->getContext(), IANodes);
+          BI->setDebugLoc(IDL);
+          continue;
+        }
 
-      if (CalleeHasDebugInfo)
+      if (CalleeHasDebugInfo && !NoInlineLineTables)
         continue;
 
-      // If the inlined instruction has no line number, make it look as if it
-      // originates from the call location. This is important for
-      // ((__always_inline__, __nodebug__)) functions which must use caller
-      // location for all instructions in their function body.
+      // If the inlined instruction has no line number, or if inline info
+      // is not being generated, make it look as if it originates from the call
+      // location. This is important for ((__always_inline__, __nodebug__))
+      // functions which must use caller location for all instructions in their
+      // function body.
 
       // Don't update static allocas, as they may get moved later.
       if (auto *AI = dyn_cast<AllocaInst>(BI))
@@ -1438,6 +1445,19 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI,
 
       BI->setDebugLoc(TheCallDL);
     }
+
+    // Remove debug info intrinsics if we're not keeping inline info.
+    if (NoInlineLineTables) {
+      BasicBlock::iterator BI = FI->begin();
+      while (BI != FI->end()) {
+        if (isa<DbgInfoIntrinsic>(BI)) {
+          BI = BI->eraseFromParent();
+          continue;
+        }
+        ++BI;
+      }
+    }
   }
 }
 
@@ -1453,7 +1473,7 @@ static void updateCallerBFI(BasicBlock *CallSiteBlock,
                             BlockFrequencyInfo *CalleeBFI,
                             const BasicBlock &CalleeEntryBlock) {
   SmallPtrSet<BasicBlock *, 16> ClonedBBs;
-  for (auto const &Entry : VMap) {
+  for (auto Entry : VMap) {
     if (!isa<BasicBlock>(Entry.first) || !Entry.second)
       continue;
     auto *OrigBB = cast<BasicBlock>(Entry.first);
@@ -1508,22 +1528,25 @@ void llvm::updateProfileCallee(
   else
     newEntryCount = priorEntryCount + entryDelta;
 
-  Callee->setEntryCount(newEntryCount);
-
   // During inlining ?
if (VMap) { uint64_t cloneEntryCount = priorEntryCount - newEntryCount; - for (auto const &Entry : *VMap) + for (auto Entry : *VMap) if (isa<CallInst>(Entry.first)) if (auto *CI = dyn_cast_or_null<CallInst>(Entry.second)) CI->updateProfWeight(cloneEntryCount, priorEntryCount); } - for (BasicBlock &BB : *Callee) - // No need to update the callsite if it is pruned during inlining. - if (!VMap || VMap->count(&BB)) - for (Instruction &I : BB) - if (CallInst *CI = dyn_cast<CallInst>(&I)) - CI->updateProfWeight(newEntryCount, priorEntryCount); + + if (entryDelta) { + Callee->setEntryCount(newEntryCount); + + for (BasicBlock &BB : *Callee) + // No need to update the callsite if it is pruned during inlining. + if (!VMap || VMap->count(&BB)) + for (Instruction &I : BB) + if (CallInst *CI = dyn_cast<CallInst>(&I)) + CI->updateProfWeight(newEntryCount, priorEntryCount); + } } /// This function inlines the called function into the basic block of the @@ -1842,6 +1865,7 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Scan for the block of allocas that we can move over, and move them // all at once. while (isa<AllocaInst>(I) && + !cast<AllocaInst>(I)->use_empty() && allocaWouldBeStaticInEntry(cast<AllocaInst>(I))) { IFI.StaticAllocas.push_back(cast<AllocaInst>(I)); ++I; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/InstructionNamer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/InstructionNamer.cpp index 6c4fc1ceb991..aac0b55801c4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/InstructionNamer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/InstructionNamer.cpp @@ -15,6 +15,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Utils.h" using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LCSSA.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LCSSA.cpp index 29e7c5260f46..5746d69260d5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LCSSA.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LCSSA.cpp @@ -43,7 +43,9 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PredIteratorCache.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" @@ -74,7 +76,8 @@ static bool isExitBlock(BasicBlock *BB, /// that are outside the current loop. If so, insert LCSSA PHI nodes and /// rewrite the uses. bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist, - DominatorTree &DT, LoopInfo &LI) { + DominatorTree &DT, LoopInfo &LI, + ScalarEvolution *SE) { SmallVector<Use *, 16> UsesToRewrite; SmallSetVector<PHINode *, 16> PHIsToRemove; PredIteratorCache PredCache; @@ -134,6 +137,11 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist, SSAUpdater SSAUpdate(&InsertedPHIs); SSAUpdate.Initialize(I->getType(), I->getName()); + // Force re-computation of I, as some users now need to use the new PHI + // node. + if (SE) + SE->forgetValue(I); + // Insert the LCSSA phi's into all of the exit blocks dominated by the // value, and add them to the Phi's map. 
for (BasicBlock *ExitBB : ExitBlocks) { @@ -192,9 +200,6 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist, UserBB = PN->getIncomingBlock(*UseToRewrite); if (isa<PHINode>(UserBB->begin()) && isExitBlock(UserBB, ExitBlocks)) { - // Tell the VHs that the uses changed. This updates SCEV's caches. - if (UseToRewrite->get()->hasValueHandle()) - ValueHandleBase::ValueIsRAUWd(*UseToRewrite, &UserBB->front()); UseToRewrite->set(&UserBB->front()); continue; } @@ -202,10 +207,6 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist, // If we added a single PHI, it must dominate all uses and we can directly // rename it. if (AddedPHIs.size() == 1) { - // Tell the VHs that the uses changed. This updates SCEV's caches. - // We might call ValueIsRAUWd multiple times for the same value. - if (UseToRewrite->get()->hasValueHandle()) - ValueHandleBase::ValueIsRAUWd(*UseToRewrite, AddedPHIs[0]); UseToRewrite->set(AddedPHIs[0]); continue; } @@ -368,7 +369,7 @@ bool llvm::formLCSSA(Loop &L, DominatorTree &DT, LoopInfo *LI, Worklist.push_back(&I); } } - Changed = formLCSSAForInstructions(Worklist, DT, *LI); + Changed = formLCSSAForInstructions(Worklist, DT, *LI, SE); // If we modified the code, remove any caches about the loop from SCEV to // avoid dangling entries. diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp index 8c67d1dc6eb3..4c52fac6f7cb 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp @@ -39,6 +39,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" using namespace llvm; @@ -533,7 +534,7 @@ static bool runImpl(Function &F, const TargetLibraryInfo &TLI, } bool LibCallsShrinkWrapLegacyPass::runOnFunction(Function &F) { - auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; return runImpl(F, TLI, DT); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp index 39b6b889f91c..b2d511c7c9a9 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp @@ -324,8 +324,14 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, Value *Address = IBI->getAddress(); IBI->eraseFromParent(); if (DeleteDeadConditions) + // Delete pointer cast instructions. RecursivelyDeleteTriviallyDeadInstructions(Address, TLI); + // Also zap the blockaddress constant if there are no users remaining, + // otherwise the destination is still marked as having its address taken. + if (BA->use_empty()) + BA->destroyConstant(); + // If we didn't find our destination in the IBI successor list, then we // have undefined behavior. Replace the unconditional branch with an // 'unreachable' instruction. @@ -633,17 +639,6 @@ bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, // Control Flow Graph Restructuring. 
// -/// RemovePredecessorAndSimplify - Like BasicBlock::removePredecessor, this -/// method is called when we're about to delete Pred as a predecessor of BB. If -/// BB contains any PHI nodes, this drops the entries in the PHI nodes for Pred. -/// -/// Unlike the removePredecessor method, this attempts to simplify uses of PHI -/// nodes that collapse into identity values. For example, if we have: -/// x = phi(1, 0, 0, 0) -/// y = and x, z -/// -/// .. and delete the predecessor corresponding to the '1', this will attempt to -/// recursively fold the and to 0. void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred, DomTreeUpdater *DTU) { // This only adjusts blocks with PHI nodes. @@ -672,10 +667,6 @@ void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred, DTU->applyUpdatesPermissive({{DominatorTree::Delete, Pred, BB}}); } -/// MergeBasicBlockIntoOnlyPred - DestBB is a block with one predecessor and its -/// predecessor is known to have one successor (DestBB!). Eliminate the edge -/// between them, moving the instructions in the predecessor into DestBB and -/// deleting the predecessor block. void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, DomTreeUpdater *DTU) { @@ -755,15 +746,14 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, } } -/// CanMergeValues - Return true if we can choose one of these values to use -/// in place of the other. Note that we will always choose the non-undef -/// value to keep. +/// Return true if we can choose one of these values to use in place of the +/// other. Note that we will always choose the non-undef value to keep. static bool CanMergeValues(Value *First, Value *Second) { return First == Second || isa<UndefValue>(First) || isa<UndefValue>(Second); } -/// CanPropagatePredecessorsForPHIs - Return true if we can fold BB, an -/// almost-empty BB ending in an unconditional branch to Succ, into Succ. +/// Return true if we can fold BB, an almost-empty BB ending in an unconditional +/// branch to Succ, into Succ. /// /// Assumption: Succ is the single successor for BB. static bool CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) { @@ -956,11 +946,6 @@ static void redirectValuesFromPredecessorsToPhi(BasicBlock *BB, replaceUndefValuesInPhi(PN, IncomingValues); } -/// TryToSimplifyUncondBranchFromEmptyBlock - BB is known to contain an -/// unconditional branch, and contains no instructions other than PHI nodes, -/// potential side-effect free intrinsics and the branch. If possible, -/// eliminate BB by rewriting all the predecessors to branch to the successor -/// block and return true. If we can't transform, return false. bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB, DomTreeUpdater *DTU) { assert(BB != &BB->getParent()->getEntryBlock() && @@ -1088,10 +1073,6 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB, return true; } -/// EliminateDuplicatePHINodes - Check for and eliminate duplicate PHI -/// nodes in this block. This doesn't try to be clever about PHI nodes -/// which differ only in the order of the incoming values, but instcombine -/// orders them so it usually won't matter. bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) { // This implementation doesn't currently consider undef operands // specially. Theoretically, two phis which are identical except for @@ -1151,10 +1132,10 @@ bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) { /// often possible though. 
If alignment is important, a more reliable approach /// is to simply align all global variables and allocation instructions to /// their preferred alignment from the beginning. -static unsigned enforceKnownAlignment(Value *V, unsigned Align, +static unsigned enforceKnownAlignment(Value *V, unsigned Alignment, unsigned PrefAlign, const DataLayout &DL) { - assert(PrefAlign > Align); + assert(PrefAlign > Alignment); V = V->stripPointerCasts(); @@ -1165,36 +1146,36 @@ static unsigned enforceKnownAlignment(Value *V, unsigned Align, // stripPointerCasts recurses through infinite layers of bitcasts, // while computeKnownBits is not allowed to traverse more than 6 // levels. - Align = std::max(AI->getAlignment(), Align); - if (PrefAlign <= Align) - return Align; + Alignment = std::max(AI->getAlignment(), Alignment); + if (PrefAlign <= Alignment) + return Alignment; // If the preferred alignment is greater than the natural stack alignment // then don't round up. This avoids dynamic stack realignment. - if (DL.exceedsNaturalStackAlignment(PrefAlign)) - return Align; - AI->setAlignment(PrefAlign); + if (DL.exceedsNaturalStackAlignment(Align(PrefAlign))) + return Alignment; + AI->setAlignment(MaybeAlign(PrefAlign)); return PrefAlign; } if (auto *GO = dyn_cast<GlobalObject>(V)) { // TODO: as above, this shouldn't be necessary. - Align = std::max(GO->getAlignment(), Align); - if (PrefAlign <= Align) - return Align; + Alignment = std::max(GO->getAlignment(), Alignment); + if (PrefAlign <= Alignment) + return Alignment; // If there is a large requested alignment and we can, bump up the alignment // of the global. If the memory we set aside for the global may not be the // memory used by the final program then it is impossible for us to reliably // enforce the preferred alignment. if (!GO->canIncreaseAlignment()) - return Align; + return Alignment; - GO->setAlignment(PrefAlign); + GO->setAlignment(MaybeAlign(PrefAlign)); return PrefAlign; } - return Align; + return Alignment; } unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign, @@ -1397,7 +1378,12 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII, /// Determine whether this alloca is either a VLA or an array. static bool isArray(AllocaInst *AI) { return AI->isArrayAllocation() || - AI->getType()->getElementType()->isArrayTy(); + (AI->getAllocatedType() && AI->getAllocatedType()->isArrayTy()); +} + +/// Determine whether this alloca is a structure. +static bool isStructure(AllocaInst *AI) { + return AI->getAllocatedType() && AI->getAllocatedType()->isStructTy(); } /// LowerDbgDeclare - Lowers llvm.dbg.declare intrinsics into appropriate set @@ -1422,7 +1408,7 @@ bool llvm::LowerDbgDeclare(Function &F) { // stored on the stack, while the dbg.declare can only describe // the stack slot (and at a lexical-scope granularity). Later // passes will attempt to elide the stack slot. - if (!AI || isArray(AI)) + if (!AI || isArray(AI) || isStructure(AI)) continue; // A volatile load/store means that the alloca can't be elided anyway. 
@@ -1435,22 +1421,32 @@ bool llvm::LowerDbgDeclare(Function &F) { })) continue; - for (auto &AIUse : AI->uses()) { - User *U = AIUse.getUser(); - if (StoreInst *SI = dyn_cast<StoreInst>(U)) { - if (AIUse.getOperandNo() == 1) - ConvertDebugDeclareToDebugValue(DDI, SI, DIB); - } else if (LoadInst *LI = dyn_cast<LoadInst>(U)) { - ConvertDebugDeclareToDebugValue(DDI, LI, DIB); - } else if (CallInst *CI = dyn_cast<CallInst>(U)) { - // This is a call by-value or some other instruction that takes a - // pointer to the variable. Insert a *value* intrinsic that describes - // the variable by dereferencing the alloca. - DebugLoc NewLoc = getDebugValueLoc(DDI, nullptr); - auto *DerefExpr = - DIExpression::append(DDI->getExpression(), dwarf::DW_OP_deref); - DIB.insertDbgValueIntrinsic(AI, DDI->getVariable(), DerefExpr, NewLoc, - CI); + SmallVector<const Value *, 8> WorkList; + WorkList.push_back(AI); + while (!WorkList.empty()) { + const Value *V = WorkList.pop_back_val(); + for (auto &AIUse : V->uses()) { + User *U = AIUse.getUser(); + if (StoreInst *SI = dyn_cast<StoreInst>(U)) { + if (AIUse.getOperandNo() == 1) + ConvertDebugDeclareToDebugValue(DDI, SI, DIB); + } else if (LoadInst *LI = dyn_cast<LoadInst>(U)) { + ConvertDebugDeclareToDebugValue(DDI, LI, DIB); + } else if (CallInst *CI = dyn_cast<CallInst>(U)) { + // This is a call by-value or some other instruction that takes a + // pointer to the variable. Insert a *value* intrinsic that describes + // the variable by dereferencing the alloca. + if (!CI->isLifetimeStartOrEnd()) { + DebugLoc NewLoc = getDebugValueLoc(DDI, nullptr); + auto *DerefExpr = + DIExpression::append(DDI->getExpression(), dwarf::DW_OP_deref); + DIB.insertDbgValueIntrinsic(AI, DDI->getVariable(), DerefExpr, + NewLoc, CI); + } + } else if (BitCastInst *BI = dyn_cast<BitCastInst>(U)) { + if (BI->getType()->isPointerTy()) + WorkList.push_back(BI); + } } } DDI->eraseFromParent(); @@ -1591,15 +1587,10 @@ static void replaceOneDbgValueForAlloca(DbgValueInst *DVI, Value *NewAddress, DIExpr->getElement(0) != dwarf::DW_OP_deref) return; - // Insert the offset immediately after the first deref. + // Insert the offset before the first deref. // We could just change the offset argument of dbg.value, but it's unsigned... - if (Offset) { - SmallVector<uint64_t, 4> Ops; - Ops.push_back(dwarf::DW_OP_deref); - DIExpression::appendOffset(Ops, Offset); - Ops.append(DIExpr->elements_begin() + 1, DIExpr->elements_end()); - DIExpr = Builder.createExpression(Ops); - } + if (Offset) + DIExpr = DIExpression::prepend(DIExpr, 0, Offset); Builder.insertDbgValueIntrinsic(NewAddress, DIVar, DIExpr, Loc, DVI); DVI->eraseFromParent(); @@ -1630,6 +1621,11 @@ bool llvm::salvageDebugInfo(Instruction &I) { return salvageDebugInfoForDbgValues(I, DbgUsers); } +void llvm::salvageDebugInfoOrMarkUndef(Instruction &I) { + if (!salvageDebugInfo(I)) + replaceDbgUsesWithUndef(&I); +} + bool llvm::salvageDebugInfoForDbgValues( Instruction &I, ArrayRef<DbgVariableIntrinsic *> DbgUsers) { auto &Ctx = I.getContext(); @@ -1680,9 +1676,8 @@ DIExpression *llvm::salvageDebugInfoImpl(Instruction &I, }; // initializer-list helper for applying operators to the source DIExpression. 
- auto applyOps = - [&](std::initializer_list<uint64_t> Opcodes) -> DIExpression * { - SmallVector<uint64_t, 8> Ops(Opcodes); + auto applyOps = [&](ArrayRef<uint64_t> Opcodes) -> DIExpression * { + SmallVector<uint64_t, 8> Ops(Opcodes.begin(), Opcodes.end()); return doSalvage(Ops); }; @@ -1690,8 +1685,21 @@ DIExpression *llvm::salvageDebugInfoImpl(Instruction &I, // No-op casts and zexts are irrelevant for debug info. if (CI->isNoopCast(DL) || isa<ZExtInst>(&I)) return SrcDIExpr; - return nullptr; - } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { + + Type *Type = CI->getType(); + // Casts other than Trunc or SExt to scalar types cannot be salvaged. + if (Type->isVectorTy() || (!isa<TruncInst>(&I) && !isa<SExtInst>(&I))) + return nullptr; + + Value *FromValue = CI->getOperand(0); + unsigned FromTypeBitSize = FromValue->getType()->getScalarSizeInBits(); + unsigned ToTypeBitSize = Type->getScalarSizeInBits(); + + return applyOps(DIExpression::getExtOps(FromTypeBitSize, ToTypeBitSize, + isa<SExtInst>(&I))); + } + + if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { unsigned BitWidth = M.getDataLayout().getIndexSizeInBits(GEP->getPointerAddressSpace()); // Rewrite a constant GEP into a DIExpression. @@ -1746,7 +1754,7 @@ DIExpression *llvm::salvageDebugInfoImpl(Instruction &I, using DbgValReplacement = Optional<DIExpression *>; /// Point debug users of \p From to \p To using exprs given by \p RewriteExpr, -/// possibly moving/deleting users to prevent use-before-def. Returns true if +/// possibly moving/undefing users to prevent use-before-def. Returns true if /// changes are made. static bool rewriteDebugUsers( Instruction &From, Value &To, Instruction &DomPoint, DominatorTree &DT, @@ -1759,7 +1767,7 @@ static bool rewriteDebugUsers( // Prevent use-before-def of To. bool Changed = false; - SmallPtrSet<DbgVariableIntrinsic *, 1> DeleteOrSalvage; + SmallPtrSet<DbgVariableIntrinsic *, 1> UndefOrSalvage; if (isa<Instruction>(&To)) { bool DomPointAfterFrom = From.getNextNonDebugInstruction() == &DomPoint; @@ -1774,14 +1782,14 @@ static bool rewriteDebugUsers( // Users which otherwise aren't dominated by the replacement value must // be salvaged or deleted. } else if (!DT.dominates(&DomPoint, DII)) { - DeleteOrSalvage.insert(DII); + UndefOrSalvage.insert(DII); } } } // Update debug users without use-before-def risk. for (auto *DII : Users) { - if (DeleteOrSalvage.count(DII)) + if (UndefOrSalvage.count(DII)) continue; LLVMContext &Ctx = DII->getContext(); @@ -1795,18 +1803,10 @@ static bool rewriteDebugUsers( Changed = true; } - if (!DeleteOrSalvage.empty()) { + if (!UndefOrSalvage.empty()) { // Try to salvage the remaining debug users. - Changed |= salvageDebugInfo(From); - - // Delete the debug users which weren't salvaged. - for (auto *DII : DeleteOrSalvage) { - if (DII->getVariableLocation() == &From) { - LLVM_DEBUG(dbgs() << "Erased UseBeforeDef: " << *DII << '\n'); - DII->eraseFromParent(); - Changed = true; - } - } + salvageDebugInfoOrMarkUndef(From); + Changed = true; } return Changed; @@ -1881,10 +1881,8 @@ bool llvm::replaceAllDbgUsesWith(Instruction &From, Value &To, return None; bool Signed = *Signedness == DIBasicType::Signedness::Signed; - dwarf::TypeKind TK = Signed ? 
dwarf::DW_ATE_signed : dwarf::DW_ATE_unsigned; - SmallVector<uint64_t, 8> Ops({dwarf::DW_OP_LLVM_convert, ToBits, TK, - dwarf::DW_OP_LLVM_convert, FromBits, TK}); - return DIExpression::appendToStack(DII.getExpression(), Ops); + return DIExpression::appendExt(DII.getExpression(), ToBits, FromBits, + Signed); }; return rewriteDebugUsers(From, To, DomPoint, DT, SignOrZeroExt); } @@ -1957,18 +1955,24 @@ unsigned llvm::changeToUnreachable(Instruction *I, bool UseLLVMTrap, return NumInstrsRemoved; } -/// changeToCall - Convert the specified invoke into a normal call. -static void changeToCall(InvokeInst *II, DomTreeUpdater *DTU = nullptr) { - SmallVector<Value*, 8> Args(II->arg_begin(), II->arg_end()); +CallInst *llvm::createCallMatchingInvoke(InvokeInst *II) { + SmallVector<Value *, 8> Args(II->arg_begin(), II->arg_end()); SmallVector<OperandBundleDef, 1> OpBundles; II->getOperandBundlesAsDefs(OpBundles); - CallInst *NewCall = CallInst::Create( - II->getFunctionType(), II->getCalledValue(), Args, OpBundles, "", II); - NewCall->takeName(II); + CallInst *NewCall = CallInst::Create(II->getFunctionType(), + II->getCalledValue(), Args, OpBundles); NewCall->setCallingConv(II->getCallingConv()); NewCall->setAttributes(II->getAttributes()); NewCall->setDebugLoc(II->getDebugLoc()); NewCall->copyMetadata(*II); + return NewCall; +} + +/// changeToCall - Convert the specified invoke into a normal call. +void llvm::changeToCall(InvokeInst *II, DomTreeUpdater *DTU) { + CallInst *NewCall = createCallMatchingInvoke(II); + NewCall->takeName(II); + NewCall->insertBefore(II); II->replaceAllUsesWith(NewCall); // Follow the call by a branch to the normal destination. @@ -2223,12 +2227,10 @@ void llvm::removeUnwindEdge(BasicBlock *BB, DomTreeUpdater *DTU) { /// removeUnreachableBlocks - Remove blocks that are not reachable, even /// if they are in a dead cycle. Return true if a change was made, false -/// otherwise. If `LVI` is passed, this function preserves LazyValueInfo -/// after modifying the CFG. -bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI, - DomTreeUpdater *DTU, +/// otherwise. +bool llvm::removeUnreachableBlocks(Function &F, DomTreeUpdater *DTU, MemorySSAUpdater *MSSAU) { - SmallPtrSet<BasicBlock*, 16> Reachable; + SmallPtrSet<BasicBlock *, 16> Reachable; bool Changed = markAliveBlocks(F, Reachable, DTU); // If there are unreachable blocks in the CFG... @@ -2236,21 +2238,21 @@ bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI, return Changed; assert(Reachable.size() < F.size()); - NumRemoved += F.size()-Reachable.size(); + NumRemoved += F.size() - Reachable.size(); SmallSetVector<BasicBlock *, 8> DeadBlockSet; - for (Function::iterator I = ++F.begin(), E = F.end(); I != E; ++I) { - auto *BB = &*I; - if (Reachable.count(BB)) + for (BasicBlock &BB : F) { + // Skip reachable basic blocks + if (Reachable.find(&BB) != Reachable.end()) continue; - DeadBlockSet.insert(BB); + DeadBlockSet.insert(&BB); } if (MSSAU) MSSAU->removeBlocks(DeadBlockSet); // Loop over all of the basic blocks that are not reachable, dropping all of - // their internal references. Update DTU and LVI if available. + // their internal references. Update DTU if available. 
std::vector<DominatorTree::UpdateType> Updates; for (auto *BB : DeadBlockSet) { for (BasicBlock *Successor : successors(BB)) { @@ -2259,26 +2261,18 @@ bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI, if (DTU) Updates.push_back({DominatorTree::Delete, BB, Successor}); } - if (LVI) - LVI->eraseBlock(BB); BB->dropAllReferences(); - } - for (Function::iterator I = ++F.begin(); I != F.end();) { - auto *BB = &*I; - if (Reachable.count(BB)) { - ++I; - continue; - } if (DTU) { - // Remove the terminator of BB to clear the successor list of BB. - if (BB->getTerminator()) - BB->getInstList().pop_back(); + Instruction *TI = BB->getTerminator(); + assert(TI && "Basic block should have a terminator"); + // Terminators like invoke can have users. We have to replace their users, + // before removing them. + if (!TI->use_empty()) + TI->replaceAllUsesWith(UndefValue::get(TI->getType())); + TI->eraseFromParent(); new UnreachableInst(BB->getContext(), BB); assert(succ_empty(BB) && "The successor list of BB isn't empty before " "applying corresponding DTU updates."); - ++I; - } else { - I = F.getBasicBlockList().erase(I); } } @@ -2294,7 +2288,11 @@ bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI, } if (!Deleted) return false; + } else { + for (auto *BB : DeadBlockSet) + BB->eraseFromParent(); } + return true; } @@ -2363,6 +2361,9 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J, K->setMetadata(Kind, MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD)); break; + case LLVMContext::MD_preserve_access_index: + // Preserve !preserve.access.index in K. + break; } } // Set !invariant.group from J if J has it. If both instructions have it @@ -2385,10 +2386,61 @@ void llvm::combineMetadataForCSE(Instruction *K, const Instruction *J, LLVMContext::MD_invariant_group, LLVMContext::MD_align, LLVMContext::MD_dereferenceable, LLVMContext::MD_dereferenceable_or_null, - LLVMContext::MD_access_group}; + LLVMContext::MD_access_group, LLVMContext::MD_preserve_access_index}; combineMetadata(K, J, KnownIDs, KDominatesJ); } +void llvm::copyMetadataForLoad(LoadInst &Dest, const LoadInst &Source) { + SmallVector<std::pair<unsigned, MDNode *>, 8> MD; + Source.getAllMetadata(MD); + MDBuilder MDB(Dest.getContext()); + Type *NewType = Dest.getType(); + const DataLayout &DL = Source.getModule()->getDataLayout(); + for (const auto &MDPair : MD) { + unsigned ID = MDPair.first; + MDNode *N = MDPair.second; + // Note, essentially every kind of metadata should be preserved here! This + // routine is supposed to clone a load instruction changing *only its type*. + // The only metadata it makes sense to drop is metadata which is invalidated + // when the pointer type changes. This should essentially never be the case + // in LLVM, but we explicitly switch over only known metadata to be + // conservatively correct. If you are adding metadata to LLVM which pertains + // to loads, you almost certainly want to add it here. + switch (ID) { + case LLVMContext::MD_dbg: + case LLVMContext::MD_tbaa: + case LLVMContext::MD_prof: + case LLVMContext::MD_fpmath: + case LLVMContext::MD_tbaa_struct: + case LLVMContext::MD_invariant_load: + case LLVMContext::MD_alias_scope: + case LLVMContext::MD_noalias: + case LLVMContext::MD_nontemporal: + case LLVMContext::MD_mem_parallel_loop_access: + case LLVMContext::MD_access_group: + // All of these directly apply. 
+ Dest.setMetadata(ID, N); + break; + + case LLVMContext::MD_nonnull: + copyNonnullMetadata(Source, N, Dest); + break; + + case LLVMContext::MD_align: + case LLVMContext::MD_dereferenceable: + case LLVMContext::MD_dereferenceable_or_null: + // These only directly apply if the new type is also a pointer. + if (NewType->isPointerTy()) + Dest.setMetadata(ID, N); + break; + + case LLVMContext::MD_range: + copyRangeMetadata(DL, Source, N, Dest); + break; + } + } +} + void llvm::patchReplacementInstruction(Instruction *I, Value *Repl) { auto *ReplInst = dyn_cast<Instruction>(Repl); if (!ReplInst) @@ -2417,7 +2469,7 @@ void llvm::patchReplacementInstruction(Instruction *I, Value *Repl) { LLVMContext::MD_noalias, LLVMContext::MD_range, LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load, LLVMContext::MD_invariant_group, LLVMContext::MD_nonnull, - LLVMContext::MD_access_group}; + LLVMContext::MD_access_group, LLVMContext::MD_preserve_access_index}; combineMetadata(ReplInst, I, KnownIDs, false); } @@ -2539,7 +2591,7 @@ void llvm::copyRangeMetadata(const DataLayout &DL, const LoadInst &OldLI, if (!NewTy->isPointerTy()) return; - unsigned BitWidth = DL.getIndexTypeSizeInBits(NewTy); + unsigned BitWidth = DL.getPointerTypeSizeInBits(NewTy); if (!getConstantRangeFromMetadata(*N).contains(APInt(BitWidth, 0))) { MDNode *NN = MDNode::get(OldLI.getContext(), None); NewLI.setMetadata(LLVMContext::MD_nonnull, NN); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp index 37389a695b45..c065e0269c64 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp @@ -76,6 +76,13 @@ private: }; } // end anonymous namespace +/// Insert (K, V) pair into the ValueToValueMap, and verify the key did not +/// previously exist in the map, and the value was inserted. +static void InsertNewValueIntoMap(ValueToValueMapTy &VM, Value *K, Value *V) { + bool Inserted = VM.insert({K, V}).second; + assert(Inserted); + (void)Inserted; +} /// RewriteUsesOfClonedInstructions - We just cloned the instructions from the /// old header into the preheader. If there were uses of the values produced by /// these instruction that were outside of the loop, we have to insert PHI nodes @@ -300,7 +307,8 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // For PHI nodes, the value available in OldPreHeader is just the // incoming value from OldPreHeader. for (; PHINode *PN = dyn_cast<PHINode>(I); ++I) - ValueMap[PN] = PN->getIncomingValueForBlock(OrigPreheader); + InsertNewValueIntoMap(ValueMap, PN, + PN->getIncomingValueForBlock(OrigPreheader)); // For the rest of the instructions, either hoist to the OrigPreheader if // possible or create a clone in the OldPreHeader if not. @@ -358,13 +366,13 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { if (V && LI->replacementPreservesLCSSAForm(C, V)) { // If so, then delete the temporary instruction and stick the folded value // in the map. - ValueMap[Inst] = V; + InsertNewValueIntoMap(ValueMap, Inst, V); if (!C->mayHaveSideEffects()) { C->deleteValue(); C = nullptr; } } else { - ValueMap[Inst] = C; + InsertNewValueIntoMap(ValueMap, Inst, C); } if (C) { // Otherwise, stick the new instruction into the new block! 
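A plausible use of the copyMetadataForLoad helper introduced above is a transform that re-issues a load at a different type while keeping whatever metadata remains valid. The following sketch illustrates that; the cloneLoadWithNewType wrapper and its builder argument are invented for illustration and are not part of the patch:

#include "llvm/IR/IRBuilder.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;

// Hypothetical caller of copyMetadataForLoad: clone OldLoad at NewTy and
// carry over only the metadata that stays valid at the new type.
static LoadInst *cloneLoadWithNewType(LoadInst &OldLoad, Type *NewTy,
                                      IRBuilder<> &B) {
  Value *Ptr = B.CreateBitCast(OldLoad.getPointerOperand(),
                               NewTy->getPointerTo());
  LoadInst *NewLoad = B.CreateAlignedLoad(NewTy, Ptr, OldLoad.getAlignment());
  // Routes !nonnull and !range through their dedicated copy helpers and
  // drops pointer-only metadata when NewTy is not a pointer.
  copyMetadataForLoad(*NewLoad, OldLoad);
  return NewLoad;
}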
@@ -376,7 +384,8 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { AC->registerAssumption(II); // MemorySSA cares whether the cloned instruction was inserted or not, and // not whether it can be remapped to a simplified value. - ValueMapMSSA[Inst] = C; + if (MSSAU) + InsertNewValueIntoMap(ValueMapMSSA, Inst, C); } } @@ -396,7 +405,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // Update MemorySSA before the rewrite call below changes the 1:1 // instruction:cloned_instruction_or_value mapping. if (MSSAU) { - ValueMapMSSA[OrigHeader] = OrigPreheader; + InsertNewValueIntoMap(ValueMapMSSA, OrigHeader, OrigPreheader); MSSAU->updateForClonedBlockIntoPred(OrigHeader, OrigPreheader, ValueMapMSSA); } @@ -615,30 +624,9 @@ bool LoopRotate::simplifyLoopLatch(Loop *L) { LLVM_DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into " << LastExit->getName() << "\n"); - // Hoist the instructions from Latch into LastExit. - Instruction *FirstLatchInst = &*(Latch->begin()); - LastExit->getInstList().splice(BI->getIterator(), Latch->getInstList(), - Latch->begin(), Jmp->getIterator()); - - // Update MemorySSA - if (MSSAU) - MSSAU->moveAllAfterMergeBlocks(Latch, LastExit, FirstLatchInst); - - unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1; - BasicBlock *Header = Jmp->getSuccessor(0); - assert(Header == L->getHeader() && "expected a backward branch"); - - // Remove Latch from the CFG so that LastExit becomes the new Latch. - BI->setSuccessor(FallThruPath, Header); - Latch->replaceSuccessorsPhiUsesWith(LastExit); - Jmp->eraseFromParent(); - - // Nuke the Latch block. - assert(Latch->empty() && "unable to evacuate Latch"); - LI->removeBlock(Latch); - if (DT) - DT->eraseNode(Latch); - Latch->eraseFromParent(); + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + MergeBlockIntoPredecessor(Latch, &DTU, LI, MSSAU, nullptr, + /*PredecessorWithTwoSuccessors=*/true); if (MSSAU && VerifyMemorySSA) MSSAU->getMemorySSA()->verifyMemorySSA(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopSimplify.cpp index 7e6da02d5707..28f88f39a712 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopSimplify.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopSimplify.cpp @@ -67,6 +67,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils.h" @@ -808,7 +809,7 @@ bool LoopSimplify::runOnFunction(Function &F) { auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>(); if (MSSAAnalysis) { MSSA = &MSSAAnalysis->getMSSA(); - MSSAU = make_unique<MemorySSAUpdater>(MSSA); + MSSAU = std::make_unique<MemorySSAUpdater>(MSSA); } } @@ -835,12 +836,19 @@ PreservedAnalyses LoopSimplifyPass::run(Function &F, DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F); ScalarEvolution *SE = AM.getCachedResult<ScalarEvolutionAnalysis>(F); AssumptionCache *AC = &AM.getResult<AssumptionAnalysis>(F); + auto *MSSAAnalysis = AM.getCachedResult<MemorySSAAnalysis>(F); + std::unique_ptr<MemorySSAUpdater> MSSAU; + if (MSSAAnalysis) { + auto *MSSA = &MSSAAnalysis->getMSSA(); + MSSAU = std::make_unique<MemorySSAUpdater>(MSSA); + } + // Note that we don't preserve LCSSA in the new PM, if you need it run LCSSA - // after simplifying the loops. MemorySSA is not preserved either. + // after simplifying the loops. 
MemorySSA is preserved if it exists. for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) Changed |= - simplifyLoop(*I, DT, LI, SE, AC, nullptr, /*PreserveLCSSA*/ false); + simplifyLoop(*I, DT, LI, SE, AC, MSSAU.get(), /*PreserveLCSSA*/ false); if (!Changed) return PreservedAnalyses::all(); @@ -853,6 +861,8 @@ PreservedAnalyses LoopSimplifyPass::run(Function &F, PA.preserve<SCEVAA>(); PA.preserve<ScalarEvolutionAnalysis>(); PA.preserve<DependenceAnalysis>(); + if (MSSAAnalysis) + PA.preserve<MemorySSAAnalysis>(); // BPI maps conditional terminators to probabilities, LoopSimplify can insert // blocks, but it does so only by splitting existing blocks and edges. This // results in the interesting property that all new terminators inserted are diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnroll.cpp index 4a1edb3700c0..4b94b371e70a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -22,17 +22,18 @@ #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" @@ -870,7 +871,7 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, assert(!DT || !UnrollVerifyDomtree || DT->verify(DominatorTree::VerificationLevel::Fast)); - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); // Merge adjacent basic blocks, if possible. for (BasicBlock *Latch : Latches) { BranchInst *Term = dyn_cast<BranchInst>(Latch->getTerminator()); @@ -890,6 +891,8 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, } } } + // Apply updates to the DomTree. + DT = &DTU.getDomTree(); // At this point, the code is well formed. We now simplify the unrolled loop, // doing constant propagation and dead code elimination as we go. 
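The UnrollLoop hunk above switches the DomTreeUpdater from the Eager to the Lazy strategy, so the CFG edge updates produced while merging latch blocks are queued and applied in one batch; the final DT = &DTU.getDomTree() call is what flushes them. A minimal sketch of that idiom, with the wrapper function and its simplified merge condition invented for illustration:

#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
using namespace llvm;

// Sketch of the Lazy DomTreeUpdater pattern: queue updates during the
// transform, then apply them all at once when the tree is requested.
static void mergeLatchesSketch(SmallVectorImpl<BasicBlock *> &Latches,
                               DominatorTree *&DT) {
  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
  for (BasicBlock *Latch : Latches)
    if (auto *Term = dyn_cast<BranchInst>(Latch->getTerminator()))
      if (Term->isUnconditional())
        // Records its edge deletions/insertions in DTU instead of
        // recalculating the dominator tree after every merge.
        MergeBlockIntoPredecessor(Term->getSuccessor(0), &DTU);
  DT = &DTU.getDomTree(); // applies all pending updates, returns a valid tree
}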
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp index ff49d83f25c5..f1965934b2d7 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp @@ -21,7 +21,6 @@ #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/Utils/Local.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/DataLayout.h" @@ -177,6 +176,7 @@ LoopUnrollResult llvm::UnrollAndJamLoop( // When we enter here we should have already checked that it is safe BasicBlock *Header = L->getHeader(); + assert(Header && "No header."); assert(L->getSubLoops().size() == 1); Loop *SubLoop = *L->begin(); @@ -247,8 +247,9 @@ LoopUnrollResult llvm::UnrollAndJamLoop( BasicBlock *Preheader = L->getLoopPreheader(); BasicBlock *LatchBlock = L->getLoopLatch(); + assert(Preheader && "No preheader"); + assert(LatchBlock && "No latch block"); BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator()); - assert(Preheader && LatchBlock && Header); assert(BI && !BI->isUnconditional()); bool ContinueOnTrue = L->contains(BI->getSuccessor(0)); BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue); @@ -517,6 +518,7 @@ LoopUnrollResult llvm::UnrollAndJamLoop( movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]); } + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the // new ones required. if (Count != 1) { @@ -530,7 +532,7 @@ LoopUnrollResult llvm::UnrollAndJamLoop( ForeBlocksLast.back(), SubLoopBlocksFirst[0]); DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert, SubLoopBlocksLast.back(), AftBlocksFirst[0]); - DT->applyUpdates(DTUpdates); + DTU.applyUpdatesPermissive(DTUpdates); } // Merge adjacent basic blocks, if possible. @@ -538,7 +540,6 @@ LoopUnrollResult llvm::UnrollAndJamLoop( MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end()); MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end()); MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end()); - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); while (!MergeBlocks.empty()) { BasicBlock *BB = *MergeBlocks.begin(); BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator()); @@ -555,6 +556,8 @@ LoopUnrollResult llvm::UnrollAndJamLoop( } else MergeBlocks.erase(BB); } + // Apply updates to the DomTree. + DT = &DTU.getDomTree(); // At this point, the code is well formed. 
We now do a quick sweep over the
 // inserted code, doing constant propagation and dead code elimination as we
 // go.
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp
index 005306cf1898..7a168ff6f32b 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp
@@ -62,9 +62,11 @@ static cl::opt<unsigned> UnrollForcePeelCount(
     cl::desc("Force a peel count regardless of profiling information."));
 
 static cl::opt<bool> UnrollPeelMultiDeoptExit(
-    "unroll-peel-multi-deopt-exit", cl::init(false), cl::Hidden,
+    "unroll-peel-multi-deopt-exit", cl::init(true), cl::Hidden,
     cl::desc("Allow peeling of loops with multiple deopt exits."));
 
+static const char *PeeledCountMetaData = "llvm.loop.peeled.count";
+
 // Designates that a Phi is estimated to become invariant after an "infinite"
 // number of loop iterations (i.e. only may become an invariant if the loop is
 // fully unrolled).
@@ -210,14 +212,11 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
     const SCEVAddRecExpr *LeftAR = cast<SCEVAddRecExpr>(LeftSCEV);
 
     // Avoid huge SCEV computations in the loop below, make sure we only
-    // consider AddRecs of the loop we are trying to peel and avoid
-    // non-monotonic predicates, as we will not be able to simplify the loop
-    // body.
-    // FIXME: For the non-monotonic predicates ICMP_EQ and ICMP_NE we can
-    //        simplify the loop, if we peel 1 additional iteration, if there
-    //        is no wrapping.
+    // consider AddRecs of the loop we are trying to peel.
+    if (!LeftAR->isAffine() || LeftAR->getLoop() != &L)
+      continue;
     bool Increasing;
-    if (!LeftAR->isAffine() || LeftAR->getLoop() != &L ||
+    if (!(ICmpInst::isEquality(Pred) && LeftAR->hasNoSelfWrap()) &&
         !SE.isMonotonicPredicate(LeftAR, Pred, Increasing))
       continue;
     (void)Increasing;
@@ -236,18 +235,43 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
       Pred = ICmpInst::getInversePredicate(Pred);
 
     const SCEV *Step = LeftAR->getStepRecurrence(SE);
-    while (NewPeelCount < MaxPeelCount &&
-           SE.isKnownPredicate(Pred, IterVal, RightSCEV)) {
-      IterVal = SE.getAddExpr(IterVal, Step);
+    const SCEV *NextIterVal = SE.getAddExpr(IterVal, Step);
+    auto PeelOneMoreIteration = [&IterVal, &NextIterVal, &SE, Step,
+                                 &NewPeelCount]() {
+      IterVal = NextIterVal;
+      NextIterVal = SE.getAddExpr(IterVal, Step);
       NewPeelCount++;
+    };
+
+    auto CanPeelOneMoreIteration = [&NewPeelCount, &MaxPeelCount]() {
+      return NewPeelCount < MaxPeelCount;
+    };
+
+    while (CanPeelOneMoreIteration() &&
+           SE.isKnownPredicate(Pred, IterVal, RightSCEV))
+      PeelOneMoreIteration();
+
+    // With *that* peel count, does the predicate !Pred become known in the
+    // first iteration of the loop body after peeling?
+    if (!SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), IterVal,
+                             RightSCEV))
+      continue; // If not, give up.
+
+    // However, for equality comparisons, that isn't always sufficient to
+    // eliminate the comparison in the loop body; we may need to peel one more
+    // iteration. See if that makes !Pred become unknown again.
+    if (ICmpInst::isEquality(Pred) &&
+        !SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), NextIterVal,
+                             RightSCEV)) {
+      assert(!SE.isKnownPredicate(Pred, IterVal, RightSCEV) &&
+             SE.isKnownPredicate(Pred, NextIterVal, RightSCEV) &&
+             "Expected Pred to go from known to unknown.");
+      if (!CanPeelOneMoreIteration())
+        continue; // Need to peel one more iteration, but can't. Give up.
+      PeelOneMoreIteration(); // Great!
     }
 
-    // Only peel the loop if the monotonic predicate !Pred becomes known in the
-    // first iteration of the loop body after peeling.
-    if (NewPeelCount > DesiredPeelCount &&
-        SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), IterVal,
-                            RightSCEV))
-      DesiredPeelCount = NewPeelCount;
+    DesiredPeelCount = std::max(DesiredPeelCount, NewPeelCount);
   }
 
   return DesiredPeelCount;
@@ -275,6 +299,7 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
     LLVM_DEBUG(dbgs() << "Force-peeling first " << UnrollForcePeelCount
                       << " iterations.\n");
     UP.PeelCount = UnrollForcePeelCount;
+    UP.PeelProfiledIterations = true;
     return;
   }
 
@@ -282,6 +307,13 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
   if (!UP.AllowPeeling)
     return;
 
+  unsigned AlreadyPeeled = 0;
+  if (auto Peeled = getOptionalIntLoopAttribute(L, PeeledCountMetaData))
+    AlreadyPeeled = *Peeled;
+  // Stop if we already peeled off the maximum number of iterations.
+  if (AlreadyPeeled >= UnrollPeelMaxCount)
+    return;
+
   // Here we try to get rid of Phis which become invariants after 1, 2, ..., N
   // iterations of the loop. For this we compute the number of iterations after
   // which every Phi is guaranteed to become an invariant, and try to peel the
@@ -317,11 +349,14 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
       DesiredPeelCount = std::min(DesiredPeelCount, MaxPeelCount);
       // Consider max peel count limitation.
       assert(DesiredPeelCount > 0 && "Wrong loop size estimation?");
-      LLVM_DEBUG(dbgs() << "Peel " << DesiredPeelCount
-                        << " iteration(s) to turn"
-                        << " some Phis into invariants.\n");
-      UP.PeelCount = DesiredPeelCount;
-      return;
+      if (DesiredPeelCount + AlreadyPeeled <= UnrollPeelMaxCount) {
+        LLVM_DEBUG(dbgs() << "Peel " << DesiredPeelCount
+                          << " iteration(s) to turn"
+                          << " some Phis into invariants.\n");
+        UP.PeelCount = DesiredPeelCount;
+        UP.PeelProfiledIterations = false;
+        return;
+      }
     }
   }
 
@@ -330,6 +365,9 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
   if (TripCount)
     return;
 
+  // Do not apply profile-based peeling if it is disabled.
+  if (!UP.PeelProfiledIterations)
+    return;
   // If we don't know the trip count, but have reason to believe the average
   // trip count is low, peeling should be beneficial, since we will usually
   // hit the peeled section.
@@ -344,7 +382,7 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
                       << "\n");
 
       if (*PeelCount) {
-        if ((*PeelCount <= UnrollPeelMaxCount) &&
+        if ((*PeelCount + AlreadyPeeled <= UnrollPeelMaxCount) &&
             (LoopSize * (*PeelCount + 1) <= UP.Threshold)) {
           LLVM_DEBUG(dbgs() << "Peeling first " << *PeelCount
                             << " iterations.\n");
@@ -352,6 +390,7 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
           return;
         }
         LLVM_DEBUG(dbgs() << "Requested peel count: " << *PeelCount << "\n");
+        LLVM_DEBUG(dbgs() << "Already peeled count: " << AlreadyPeeled << "\n");
         LLVM_DEBUG(dbgs() << "Max peel count: " << UnrollPeelMaxCount << "\n");
         LLVM_DEBUG(dbgs() << "Peel cost: " << LoopSize * (*PeelCount + 1)
                           << "\n");
@@ -364,88 +403,77 @@
 /// iteration.
 /// This sets the branch weights for the latch of the recently peeled off loop
 /// iteration correctly.
-/// Our goal is to make sure that:
-/// a) The total weight of all the copies of the loop body is preserved.
-/// b) The total weight of the loop exit is preserved.
-/// c) The body weight is reasonably distributed between the peeled iterations.
+/// Let F be the weight of the edge from the latch to the header.
+/// Let E be the weight of the edge from the latch to the exit.
+/// F/(F+E) is the probability of taking the backedge and E/(F+E) is the
+/// probability of exiting.
+/// Then, the estimated TripCount = F / E.
+/// For the I-th (counting from 0) peeled-off iteration we set the weights for
+/// the peeled latch to (TC - I, 1). This gives a reasonable distribution:
+/// the probability of exiting, 1/(TC-I), increases, while at the same time
+/// the estimated trip count of the remaining loop decreases by I.
+/// To avoid dealing with division rounding we can simply multiply both parts
+/// of the weights by E and use the weights (F - I * E, E).
 ///
 /// \param Header The copy of the header block that belongs to the next
 /// iteration.
 /// \param LatchBR The copy of the latch branch that belongs to this iteration.
-/// \param IterNumber The serial number of the iteration that was just
-/// peeled off.
-/// \param AvgIters The average number of iterations we expect the loop to have.
-/// \param[in,out] PeeledHeaderWeight The total number of dynamic loop
-/// iterations that are unaccounted for. As an input, it represents the number
-/// of times we expect to enter the header of the iteration currently being
-/// peeled off. The output is the number of times we expect to enter the
-/// header of the next iteration.
+/// \param[in,out] FallThroughWeight The weight of the edge from latch to
+/// header before peeling (in) and after peeling off one iteration (out).
 static void updateBranchWeights(BasicBlock *Header, BranchInst *LatchBR,
-                                unsigned IterNumber, unsigned AvgIters,
-                                uint64_t &PeeledHeaderWeight) {
-  if (!PeeledHeaderWeight)
+                                uint64_t ExitWeight,
+                                uint64_t &FallThroughWeight) {
+  // A FallThroughWeight of 0 means that there are no branch weights on the
+  // original latch block or the estimated trip count is zero.
+  if (!FallThroughWeight)
     return;
-  // FIXME: Pick a more realistic distribution.
-  // Currently the proportion of weight we assign to the fall-through
-  // side of the branch drops linearly with the iteration number, and we use
-  // a 0.9 fudge factor to make the drop-off less sharp...
-  uint64_t FallThruWeight =
-      PeeledHeaderWeight * ((float)(AvgIters - IterNumber) / AvgIters * 0.9);
-  uint64_t ExitWeight = PeeledHeaderWeight - FallThruWeight;
-  PeeledHeaderWeight -= ExitWeight;
 
   unsigned HeaderIdx = (LatchBR->getSuccessor(0) == Header ? 0 : 1);
   MDBuilder MDB(LatchBR->getContext());
   MDNode *WeightNode =
-      HeaderIdx ? MDB.createBranchWeights(ExitWeight, FallThruWeight)
-                : MDB.createBranchWeights(FallThruWeight, ExitWeight);
+      HeaderIdx ? MDB.createBranchWeights(ExitWeight, FallThroughWeight)
                : MDB.createBranchWeights(FallThroughWeight, ExitWeight);
   LatchBR->setMetadata(LLVMContext::MD_prof, WeightNode);
+  FallThroughWeight =
+      FallThroughWeight > ExitWeight ? FallThroughWeight - ExitWeight : 1;
 }
 
 /// Initialize the weights.
 ///
 /// \param Header The header block.
 /// \param LatchBR The latch branch.
-/// \param AvgIters The average number of iterations we expect the loop to have.
-/// \param[out] ExitWeight The # of times the edge from Latch to Exit is taken.
-/// \param[out] CurHeaderWeight The # of times the header is executed.
+/// \param[out] ExitWeight The weight of the edge from Latch to Exit. +/// \param[out] FallThroughWeight The weight of the edge from Latch to Header. static void initBranchWeights(BasicBlock *Header, BranchInst *LatchBR, - unsigned AvgIters, uint64_t &ExitWeight, - uint64_t &CurHeaderWeight) { + uint64_t &ExitWeight, + uint64_t &FallThroughWeight) { uint64_t TrueWeight, FalseWeight; if (!LatchBR->extractProfMetadata(TrueWeight, FalseWeight)) return; unsigned HeaderIdx = LatchBR->getSuccessor(0) == Header ? 0 : 1; ExitWeight = HeaderIdx ? TrueWeight : FalseWeight; - // The # of times the loop body executes is the sum of the exit block - // is taken and the # of times the backedges are taken. - CurHeaderWeight = TrueWeight + FalseWeight; + FallThroughWeight = HeaderIdx ? FalseWeight : TrueWeight; } /// Update the weights of original Latch block after peeling off all iterations. /// /// \param Header The header block. /// \param LatchBR The latch branch. -/// \param ExitWeight The weight of the edge from Latch to Exit block. -/// \param CurHeaderWeight The # of time the header is executed. +/// \param ExitWeight The weight of the edge from Latch to Exit. +/// \param FallThroughWeight The weight of the edge from Latch to Header. static void fixupBranchWeights(BasicBlock *Header, BranchInst *LatchBR, - uint64_t ExitWeight, uint64_t CurHeaderWeight) { - // Adjust the branch weights on the loop exit. - if (!ExitWeight) + uint64_t ExitWeight, + uint64_t FallThroughWeight) { + // FallThroughWeight is 0 means that there is no branch weights on original + // latch block or estimated trip count is zero. + if (!FallThroughWeight) return; - // The backedge count is the difference of current header weight and - // current loop exit weight. If the current header weight is smaller than - // the current loop exit weight, we mark the loop backedge weight as 1. - uint64_t BackEdgeWeight = 0; - if (ExitWeight < CurHeaderWeight) - BackEdgeWeight = CurHeaderWeight - ExitWeight; - else - BackEdgeWeight = 1; + // Sets the branch weights on the loop exit. MDBuilder MDB(LatchBR->getContext()); unsigned HeaderIdx = LatchBR->getSuccessor(0) == Header ? 0 : 1; MDNode *WeightNode = - HeaderIdx ? MDB.createBranchWeights(ExitWeight, BackEdgeWeight) - : MDB.createBranchWeights(BackEdgeWeight, ExitWeight); + HeaderIdx ? MDB.createBranchWeights(ExitWeight, FallThroughWeight) + : MDB.createBranchWeights(FallThroughWeight, ExitWeight); LatchBR->setMetadata(LLVMContext::MD_prof, WeightNode); } @@ -556,7 +584,7 @@ static void cloneLoopBlocks( // LastValueMap is updated with the values for the current loop // which are used the next time this function is called. - for (const auto &KV : VMap) + for (auto KV : VMap) LVMap[KV.first] = KV.second; } @@ -586,11 +614,30 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, DenseMap<BasicBlock *, BasicBlock *> ExitIDom; if (DT) { + // We'd like to determine the idom of exit block after peeling one + // iteration. + // Let Exit is exit block. + // Let ExitingSet - is a set of predecessors of Exit block. They are exiting + // blocks. + // Let Latch' and ExitingSet' are copies after a peeling. + // We'd like to find an idom'(Exit) - idom of Exit after peeling. + // It is an evident that idom'(Exit) will be the nearest common dominator + // of ExitingSet and ExitingSet'. + // idom(Exit) is a nearest common dominator of ExitingSet. + // idom(Exit)' is a nearest common dominator of ExitingSet'. 
+ // Taking into account that we have a single Latch, Latch' will dominate + // Header and idom(Exit). + // So the idom'(Exit) is nearest common dominator of idom(Exit)' and Latch'. + // All these basic blocks are in the same loop, so what we find is + // (nearest common dominator of idom(Exit) and Latch)'. + // In the loop below we remember nearest common dominator of idom(Exit) and + // Latch to update idom of Exit later. assert(L->hasDedicatedExits() && "No dedicated exits?"); for (auto Edge : ExitEdges) { if (ExitIDom.count(Edge.second)) continue; - BasicBlock *BB = DT->getNode(Edge.second)->getIDom()->getBlock(); + BasicBlock *BB = DT->findNearestCommonDominator( + DT->getNode(Edge.second)->getIDom()->getBlock(), Latch); assert(L->contains(BB) && "IDom is not in a loop"); ExitIDom[Edge.second] = BB; } @@ -659,23 +706,14 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, // newly created branches. BranchInst *LatchBR = cast<BranchInst>(cast<BasicBlock>(Latch)->getTerminator()); - uint64_t ExitWeight = 0, CurHeaderWeight = 0; - initBranchWeights(Header, LatchBR, PeelCount, ExitWeight, CurHeaderWeight); + uint64_t ExitWeight = 0, FallThroughWeight = 0; + initBranchWeights(Header, LatchBR, ExitWeight, FallThroughWeight); // For each peeled-off iteration, make a copy of the loop. for (unsigned Iter = 0; Iter < PeelCount; ++Iter) { SmallVector<BasicBlock *, 8> NewBlocks; ValueToValueMapTy VMap; - // Subtract the exit weight from the current header weight -- the exit - // weight is exactly the weight of the previous iteration's header. - // FIXME: due to the way the distribution is constructed, we need a - // guard here to make sure we don't end up with non-positive weights. - if (ExitWeight < CurHeaderWeight) - CurHeaderWeight -= ExitWeight; - else - CurHeaderWeight = 1; - cloneLoopBlocks(L, Iter, InsertTop, InsertBot, ExitEdges, NewBlocks, LoopBlocks, VMap, LVMap, DT, LI); @@ -697,8 +735,7 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, } auto *LatchBRCopy = cast<BranchInst>(VMap[LatchBR]); - updateBranchWeights(InsertBot, LatchBRCopy, Iter, - PeelCount, ExitWeight); + updateBranchWeights(InsertBot, LatchBRCopy, ExitWeight, FallThroughWeight); // Remove Loop metadata from the latch branch instruction // because it is not the Loop's latch branch anymore. LatchBRCopy->setMetadata(LLVMContext::MD_loop, nullptr); @@ -724,7 +761,13 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, PHI->setIncomingValueForBlock(NewPreHeader, NewVal); } - fixupBranchWeights(Header, LatchBR, ExitWeight, CurHeaderWeight); + fixupBranchWeights(Header, LatchBR, ExitWeight, FallThroughWeight); + + // Update Metadata for count of peeled off iterations. 
+ unsigned AlreadyPeeled = 0; + if (auto Peeled = getOptionalIntLoopAttribute(L, PeeledCountMetaData)) + AlreadyPeeled = *Peeled; + addStringMetadataToLoop(L, PeeledCountMetaData, AlreadyPeeled + PeelCount); if (Loop *ParentLoop = L->getParentLoop()) L = ParentLoop; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index d22fdb4d52dc..ddb7479924bd 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -30,6 +30,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils.h" @@ -395,9 +396,9 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool CreateRemainderLoop, } } if (CreateRemainderLoop) { - Loop *NewLoop = NewLoops[L]; - MDNode *LoopID = NewLoop->getLoopID(); + Loop *NewLoop = NewLoops[L]; assert(NewLoop && "L should have been cloned"); + MDNode *LoopID = NewLoop->getLoopID(); // Only add loop metadata if the loop is not going to be completely // unrolled. diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUtils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUtils.cpp index ec226e65f650..c4c40189fda4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -19,11 +19,11 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/MustExecute.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" -#include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -34,6 +34,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/KnownBits.h" @@ -45,6 +46,7 @@ using namespace llvm::PatternMatch; #define DEBUG_TYPE "loop-utils" static const char *LLVMLoopDisableNonforced = "llvm.loop.disable_nonforced"; +static const char *LLVMLoopDisableLICM = "llvm.licm.disable"; bool llvm::formDedicatedExitBlocks(Loop *L, DominatorTree *DT, LoopInfo *LI, MemorySSAUpdater *MSSAU, @@ -169,6 +171,8 @@ void llvm::getLoopAnalysisUsage(AnalysisUsage &AU) { AU.addPreserved<SCEVAAWrapperPass>(); AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addPreserved<ScalarEvolutionWrapperPass>(); + // FIXME: When all loop passes preserve MemorySSA, it can be required and + // preserved here instead of the individual handling in each pass. } /// Manually defined generic "LoopPass" dependency initialization. This is used @@ -189,6 +193,54 @@ void llvm::initializeLoopPassPass(PassRegistry &Registry) { INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) + INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) +} + +/// Create MDNode for input string. 
+static MDNode *createStringMetadata(Loop *TheLoop, StringRef Name, unsigned V) { + LLVMContext &Context = TheLoop->getHeader()->getContext(); + Metadata *MDs[] = { + MDString::get(Context, Name), + ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Context), V))}; + return MDNode::get(Context, MDs); +} + +/// Set input string into loop metadata by keeping other values intact. +/// If the string is already in loop metadata update value if it is +/// different. +void llvm::addStringMetadataToLoop(Loop *TheLoop, const char *StringMD, + unsigned V) { + SmallVector<Metadata *, 4> MDs(1); + // If the loop already has metadata, retain it. + MDNode *LoopID = TheLoop->getLoopID(); + if (LoopID) { + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + MDNode *Node = cast<MDNode>(LoopID->getOperand(i)); + // If it is of form key = value, try to parse it. + if (Node->getNumOperands() == 2) { + MDString *S = dyn_cast<MDString>(Node->getOperand(0)); + if (S && S->getString().equals(StringMD)) { + ConstantInt *IntMD = + mdconst::extract_or_null<ConstantInt>(Node->getOperand(1)); + if (IntMD && IntMD->getSExtValue() == V) + // It is already in place. Do nothing. + return; + // We need to update the value, so just skip it here and it will + // be added after copying other existed nodes. + continue; + } + } + MDs.push_back(Node); + } + } + // Add new metadata. + MDs.push_back(createStringMetadata(TheLoop, StringMD, V)); + // Replace current metadata node with new one. + LLVMContext &Context = TheLoop->getHeader()->getContext(); + MDNode *NewLoopID = MDNode::get(Context, MDs); + // Set operand 0 to refer to the loop id itself. + NewLoopID->replaceOperandWith(0, NewLoopID); + TheLoop->setLoopID(NewLoopID); } /// Find string metadata for loop @@ -332,6 +384,10 @@ bool llvm::hasDisableAllTransformsHint(const Loop *L) { return getBooleanLoopAttribute(L, LLVMLoopDisableNonforced); } +bool llvm::hasDisableLICMTransformsHint(const Loop *L) { + return getBooleanLoopAttribute(L, LLVMLoopDisableLICM); +} + TransformationMode llvm::hasUnrollTransformation(Loop *L) { if (getBooleanLoopAttribute(L, "llvm.loop.unroll.disable")) return TM_SuppressedByUser; @@ -616,7 +672,19 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr, LI->removeBlock(BB); // The last step is to update LoopInfo now that we've eliminated this loop. - LI->erase(L); + // Note: LoopInfo::erase remove the given loop and relink its subloops with + // its parent. While removeLoop/removeChildLoop remove the given loop but + // not relink its subloops, which is what we want. + if (Loop *ParentLoop = L->getParentLoop()) { + Loop::iterator I = find(ParentLoop->begin(), ParentLoop->end(), L); + assert(I != ParentLoop->end() && "Couldn't find loop"); + ParentLoop->removeChildLoop(I); + } else { + Loop::iterator I = find(LI->begin(), LI->end(), L); + assert(I != LI->end() && "Couldn't find loop"); + LI->removeLoop(I); + } + LI->destroy(L); } } @@ -646,19 +714,19 @@ Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) { // To estimate the number of times the loop body was executed, we want to // know the number of times the backedge was taken, vs. the number of times // we exited the loop. 
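A simplified, standalone model of what addStringMetadataToLoop above maintains may help: loop metadata behaves like a list of (string key, i32 value) pairs that is kept up to date. The key name here is only an example, and the model glosses over the self-referencing operand 0 and the fact that a changed node is re-appended rather than rewritten in place.

#include <cstdio>
#include <string>
#include <utility>
#include <vector>

using LoopMD = std::vector<std::pair<std::string, unsigned>>;

// Set Key=V, keeping other entries intact; do nothing if the value already matches.
static void setKey(LoopMD &MD, const std::string &Key, unsigned V) {
  for (auto &KV : MD) {
    if (KV.first == Key) {
      if (KV.second != V)
        KV.second = V;
      return;
    }
  }
  MD.emplace_back(Key, V);
}

int main() {
  LoopMD MD;
  setKey(MD, "llvm.loop.peeled.count", 2); // appended on first use
  setKey(MD, "llvm.loop.peeled.count", 4); // value refreshed after peeling more
  for (const auto &KV : MD)
    std::printf("!{\"%s\", i32 %u}\n", KV.first.c_str(), KV.second);
  return 0;
}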
- uint64_t TrueVal, FalseVal; - if (!LatchBR->extractProfMetadata(TrueVal, FalseVal)) + uint64_t BackedgeTakenWeight, LatchExitWeight; + if (!LatchBR->extractProfMetadata(BackedgeTakenWeight, LatchExitWeight)) return None; - if (!TrueVal || !FalseVal) + if (LatchBR->getSuccessor(0) != L->getHeader()) + std::swap(BackedgeTakenWeight, LatchExitWeight); + + if (!BackedgeTakenWeight || !LatchExitWeight) return 0; // Divide the count of the backedge by the count of the edge exiting the loop, // rounding to nearest. - if (LatchBR->getSuccessor(0) == L->getHeader()) - return (TrueVal + (FalseVal / 2)) / FalseVal; - else - return (FalseVal + (TrueVal / 2)) / TrueVal; + return llvm::divideNearest(BackedgeTakenWeight, LatchExitWeight); } bool llvm::hasIterationCountInvariantInParent(Loop *InnerLoop, diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopVersioning.cpp index a9a480a4b7f9..50752bd78a65 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopVersioning.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopVersioning.cpp @@ -18,6 +18,8 @@ #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" @@ -92,8 +94,8 @@ void LoopVersioning::versionLoop( // Create empty preheader for the loop (and after cloning for the // non-versioned loop). BasicBlock *PH = - SplitBlock(RuntimeCheckBB, RuntimeCheckBB->getTerminator(), DT, LI); - PH->setName(VersionedLoop->getHeader()->getName() + ".ph"); + SplitBlock(RuntimeCheckBB, RuntimeCheckBB->getTerminator(), DT, LI, + nullptr, VersionedLoop->getHeader()->getName() + ".ph"); // Clone the loop including the preheader. 
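The reworked trip-count estimation in getLoopEstimatedTripCount above reduces to a rounded division of the backedge weight by the latch-exit weight; a minimal standalone model, with a locally defined divideNearest and made-up weights, looks like this.

#include <cstdint>
#include <cstdio>
#include <utility>

// Same rounding behaviour as llvm::divideNearest: divide, rounding to nearest.
static uint64_t divideNearest(uint64_t Num, uint64_t Den) {
  return (Num + Den / 2) / Den;
}

int main() {
  uint64_t TrueWeight = 7, FalseWeight = 2000;
  bool Succ0IsHeader = false; // here successor(0) is the exit edge
  uint64_t BackedgeTakenWeight = TrueWeight, LatchExitWeight = FalseWeight;
  if (!Succ0IsHeader) // normalise so the first weight is the backedge weight
    std::swap(BackedgeTakenWeight, LatchExitWeight);
  if (!BackedgeTakenWeight || !LatchExitWeight) {
    std::puts("estimated trip count: 0");
    return 0;
  }
  std::printf("estimated trip count: %llu\n",
              (unsigned long long)divideNearest(BackedgeTakenWeight,
                                                LatchExitWeight));
  return 0;
}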
// diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LowerInvoke.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LowerInvoke.cpp index fe67e191dc62..1af0ce3d86cc 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LowerInvoke.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LowerInvoke.cpp @@ -19,6 +19,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Utils.h" using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LowerSwitch.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LowerSwitch.cpp index 8256e3b5f5af..4b9d0dadfc17 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LowerSwitch.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LowerSwitch.cpp @@ -27,6 +27,7 @@ #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/Mem2Reg.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/Mem2Reg.cpp index cd2c81b6abc8..5ad7aeb463ec 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/Mem2Reg.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/Mem2Reg.cpp @@ -19,6 +19,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/PassManager.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Transforms/Utils.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/MetaRenamer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/MetaRenamer.cpp index c0b7edc547fd..7f961dbaf4b4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/MetaRenamer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/MetaRenamer.cpp @@ -27,6 +27,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/TypeFinder.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Utils.h" @@ -121,15 +122,14 @@ namespace { } // Rename all functions - const TargetLibraryInfo &TLI = - getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); for (auto &F : M) { StringRef Name = F.getName(); LibFunc Tmp; // Leave library functions alone because their presence or absence could // affect the behavior of other passes. if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1) || - TLI.getLibFunc(F, Tmp)) + getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F).getLibFunc( + F, Tmp)) continue; // Leave @main alone. The output of -metarenamer might be passed to diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/MisExpect.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/MisExpect.cpp new file mode 100644 index 000000000000..a16ca1fb8efa --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/MisExpect.cpp @@ -0,0 +1,178 @@ +//===--- MisExpect.cpp - Check the use of llvm.expect with PGO data -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This contains code to emit warnings for potentially incorrect usage of the +// llvm.expect intrinsic. 
This utility extracts the threshold values from
+// metadata associated with the instrumented Branch or Switch instruction. The
+// threshold values are then used to determine if a warning should be emitted.
+//
+// MisExpect metadata is generated when llvm.expect intrinsics are lowered; see
+// LowerExpectIntrinsic.cpp
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/MisExpect.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FormatVariadic.h"
+#include <cstdint>
+#include <functional>
+#include <numeric>
+
+#define DEBUG_TYPE "misexpect"
+
+using namespace llvm;
+using namespace misexpect;
+
+namespace llvm {
+
+// Command line option to enable/disable the warning when profile data suggests
+// a mismatch with the use of the llvm.expect intrinsic
+static cl::opt<bool> PGOWarnMisExpect(
+    "pgo-warn-misexpect", cl::init(false), cl::Hidden,
+    cl::desc("Use this option to turn on/off "
+             "warnings about incorrect usage of llvm.expect intrinsics."));
+
+} // namespace llvm
+
+namespace {
+
+Instruction *getOprndOrInst(Instruction *I) {
+  assert(I != nullptr && "MisExpect target Instruction cannot be nullptr");
+  Instruction *Ret = nullptr;
+  if (auto *B = dyn_cast<BranchInst>(I)) {
+    Ret = dyn_cast<Instruction>(B->getCondition());
+  }
+  // TODO: Find a way to resolve condition location for switches
+  // Using the condition of the switch seems to often resolve to an earlier
+  // point in the program, i.e. the calculation of the switch condition, rather
+  // than the switch's location in the source code. Thus, we should use the
+  // instruction to get source code locations rather than the condition to
+  // improve diagnostic output, such as the caret. If the same problem exists
+  // for branch instructions, then we should remove this function and directly
+  // use the instruction.
+  //
+  // else if (auto S = dyn_cast<SwitchInst>(I)) {
+  //   Ret = I;
+  //}
+  return Ret ? 
Ret : I; +} + +void emitMisexpectDiagnostic(Instruction *I, LLVMContext &Ctx, + uint64_t ProfCount, uint64_t TotalCount) { + double PercentageCorrect = (double)ProfCount / TotalCount; + auto PerString = + formatv("{0:P} ({1} / {2})", PercentageCorrect, ProfCount, TotalCount); + auto RemStr = formatv( + "Potential performance regression from use of the llvm.expect intrinsic: " + "Annotation was correct on {0} of profiled executions.", + PerString); + Twine Msg(PerString); + Instruction *Cond = getOprndOrInst(I); + if (PGOWarnMisExpect) + Ctx.diagnose(DiagnosticInfoMisExpect(Cond, Msg)); + OptimizationRemarkEmitter ORE(I->getParent()->getParent()); + ORE.emit(OptimizationRemark(DEBUG_TYPE, "misexpect", Cond) << RemStr.str()); +} + +} // namespace + +namespace llvm { +namespace misexpect { + +void verifyMisExpect(Instruction *I, const SmallVector<uint32_t, 4> &Weights, + LLVMContext &Ctx) { + if (auto *MisExpectData = I->getMetadata(LLVMContext::MD_misexpect)) { + auto *MisExpectDataName = dyn_cast<MDString>(MisExpectData->getOperand(0)); + if (MisExpectDataName && + MisExpectDataName->getString().equals("misexpect")) { + LLVM_DEBUG(llvm::dbgs() << "------------------\n"); + LLVM_DEBUG(llvm::dbgs() + << "Function: " << I->getFunction()->getName() << "\n"); + LLVM_DEBUG(llvm::dbgs() << "Instruction: " << *I << ":\n"); + LLVM_DEBUG(for (int Idx = 0, Size = Weights.size(); Idx < Size; ++Idx) { + llvm::dbgs() << "Weights[" << Idx << "] = " << Weights[Idx] << "\n"; + }); + + // extract values from misexpect metadata + const auto *IndexCint = + mdconst::dyn_extract<ConstantInt>(MisExpectData->getOperand(1)); + const auto *LikelyCInt = + mdconst::dyn_extract<ConstantInt>(MisExpectData->getOperand(2)); + const auto *UnlikelyCInt = + mdconst::dyn_extract<ConstantInt>(MisExpectData->getOperand(3)); + + if (!IndexCint || !LikelyCInt || !UnlikelyCInt) + return; + + const uint64_t Index = IndexCint->getZExtValue(); + const uint64_t LikelyBranchWeight = LikelyCInt->getZExtValue(); + const uint64_t UnlikelyBranchWeight = UnlikelyCInt->getZExtValue(); + const uint64_t ProfileCount = Weights[Index]; + const uint64_t CaseTotal = std::accumulate( + Weights.begin(), Weights.end(), (uint64_t)0, std::plus<uint64_t>()); + const uint64_t NumUnlikelyTargets = Weights.size() - 1; + + const uint64_t TotalBranchWeight = + LikelyBranchWeight + (UnlikelyBranchWeight * NumUnlikelyTargets); + + const llvm::BranchProbability LikelyThreshold(LikelyBranchWeight, + TotalBranchWeight); + uint64_t ScaledThreshold = LikelyThreshold.scale(CaseTotal); + + LLVM_DEBUG(llvm::dbgs() + << "Unlikely Targets: " << NumUnlikelyTargets << ":\n"); + LLVM_DEBUG(llvm::dbgs() << "Profile Count: " << ProfileCount << ":\n"); + LLVM_DEBUG(llvm::dbgs() + << "Scaled Threshold: " << ScaledThreshold << ":\n"); + LLVM_DEBUG(llvm::dbgs() << "------------------\n"); + if (ProfileCount < ScaledThreshold) + emitMisexpectDiagnostic(I, Ctx, ProfileCount, CaseTotal); + } + } +} + +void checkFrontendInstrumentation(Instruction &I) { + if (auto *MD = I.getMetadata(LLVMContext::MD_prof)) { + unsigned NOps = MD->getNumOperands(); + + // Only emit misexpect diagnostics if at least 2 branch weights are present. + // Less than 2 branch weights means that the profiling metadata is: + // 1) incorrect/corrupted + // 2) not branch weight metadata + // 3) completely deterministic + // In these cases we should not emit any diagnostic related to misexpect. 
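The threshold check performed by verifyMisExpect above can be sketched with plain integer arithmetic. This is only a model: the likely/unlikely weights of 2000 and 1 are assumed defaults for a lowered llvm.expect, and BranchProbability::scale is approximated by a single multiply-and-divide.

#include <cstdint>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  const uint64_t LikelyBranchWeight = 2000, UnlikelyBranchWeight = 1; // assumed defaults
  std::vector<uint64_t> Weights = {10, 990}; // profile weights; index 0 was annotated likely
  const uint64_t Index = 0;

  const uint64_t ProfileCount = Weights[Index];
  const uint64_t CaseTotal =
      std::accumulate(Weights.begin(), Weights.end(), uint64_t(0));
  const uint64_t NumUnlikelyTargets = Weights.size() - 1;
  const uint64_t TotalBranchWeight =
      LikelyBranchWeight + UnlikelyBranchWeight * NumUnlikelyTargets;
  // Approximation of BranchProbability(Likely, Total).scale(CaseTotal).
  const uint64_t ScaledThreshold =
      CaseTotal * LikelyBranchWeight / TotalBranchWeight;

  if (ProfileCount < ScaledThreshold)
    std::printf("misexpect: likely target hit %llu of %llu executions, "
                "threshold was %llu\n",
                (unsigned long long)ProfileCount,
                (unsigned long long)CaseTotal,
                (unsigned long long)ScaledThreshold);
  return 0;
}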
+ if (NOps < 3) + return; + + // Operand 0 is a string tag "branch_weights" + if (MDString *Tag = cast<MDString>(MD->getOperand(0))) { + if (Tag->getString().equals("branch_weights")) { + SmallVector<uint32_t, 4> RealWeights(NOps - 1); + for (unsigned i = 1; i < NOps; i++) { + ConstantInt *Value = + mdconst::dyn_extract<ConstantInt>(MD->getOperand(i)); + RealWeights[i - 1] = Value->getZExtValue(); + } + verifyMisExpect(&I, RealWeights, I.getContext()); + } + } + } +} + +} // namespace misexpect +} // namespace llvm +#undef DEBUG_TYPE diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/ModuleUtils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/ModuleUtils.cpp index c84beceee191..b94f57e4dc2c 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/ModuleUtils.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/ModuleUtils.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/ModuleUtils.h" +#include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" @@ -73,7 +74,7 @@ static void appendToUsedList(Module &M, StringRef Name, ArrayRef<GlobalValue *> SmallPtrSet<Constant *, 16> InitAsSet; SmallVector<Constant *, 16> Init; if (GV) { - ConstantArray *CA = dyn_cast<ConstantArray>(GV->getInitializer()); + auto *CA = cast<ConstantArray>(GV->getInitializer()); for (auto &Op : CA->operands()) { Constant *C = cast_or_null<Constant>(Op); if (InitAsSet.insert(C).second) @@ -280,3 +281,31 @@ std::string llvm::getUniqueModuleId(Module *M) { MD5::stringifyResult(R, Str); return ("$" + Str).str(); } + +void VFABI::setVectorVariantNames( + CallInst *CI, const SmallVector<std::string, 8> &VariantMappings) { + if (VariantMappings.empty()) + return; + + SmallString<256> Buffer; + llvm::raw_svector_ostream Out(Buffer); + for (const std::string &VariantMapping : VariantMappings) + Out << VariantMapping << ","; + // Get rid of the trailing ','. 
+ assert(!Buffer.str().empty() && "Must have at least one char."); + Buffer.pop_back(); + + Module *M = CI->getModule(); +#ifndef NDEBUG + for (const std::string &VariantMapping : VariantMappings) { + Optional<VFInfo> VI = VFABI::tryDemangleForVFABI(VariantMapping); + assert(VI.hasValue() && "Canno add an invalid VFABI name."); + assert(M->getNamedValue(VI.getValue().VectorName) && + "Cannot add variant to attribute: " + "vector function declaration is missing."); + } +#endif + CI->addAttribute( + AttributeList::FunctionIndex, + Attribute::get(M->getContext(), MappingsAttrName, Buffer.str())); +} diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/NameAnonGlobals.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/NameAnonGlobals.cpp index ac8991e9d475..1c5c41abc682 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/NameAnonGlobals.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/NameAnonGlobals.cpp @@ -12,9 +12,9 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/NameAnonGlobals.h" - #include "llvm/ADT/SmallString.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/MD5.h" #include "llvm/Transforms/Utils/ModuleUtils.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/PredicateInfo.cpp index bdf24d80bd17..dda2867f44b2 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/PredicateInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/PredicateInfo.cpp @@ -30,6 +30,7 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Support/DebugCounter.h" #include "llvm/Support/FormattedStream.h" @@ -125,8 +126,10 @@ static bool valueComesBefore(OrderedInstructions &OI, const Value *A, // necessary to compare uses/defs in the same block. Doing so allows us to walk // the minimum number of instructions necessary to compute our def/use ordering. struct ValueDFS_Compare { + DominatorTree &DT; OrderedInstructions &OI; - ValueDFS_Compare(OrderedInstructions &OI) : OI(OI) {} + ValueDFS_Compare(DominatorTree &DT, OrderedInstructions &OI) + : DT(DT), OI(OI) {} bool operator()(const ValueDFS &A, const ValueDFS &B) const { if (&A == &B) @@ -136,7 +139,9 @@ struct ValueDFS_Compare { // comesbefore to see what the real ordering is, because they are in the // same basic block. - bool SameBlock = std::tie(A.DFSIn, A.DFSOut) == std::tie(B.DFSIn, B.DFSOut); + assert((A.DFSIn != B.DFSIn || A.DFSOut == B.DFSOut) && + "Equal DFS-in numbers imply equal out numbers"); + bool SameBlock = A.DFSIn == B.DFSIn; // We want to put the def that will get used for a given set of phi uses, // before those phi uses. @@ -145,9 +150,11 @@ struct ValueDFS_Compare { if (SameBlock && A.LocalNum == LN_Last && B.LocalNum == LN_Last) return comparePHIRelated(A, B); + bool isADef = A.Def; + bool isBDef = B.Def; if (!SameBlock || A.LocalNum != LN_Middle || B.LocalNum != LN_Middle) - return std::tie(A.DFSIn, A.DFSOut, A.LocalNum, A.Def, A.U) < - std::tie(B.DFSIn, B.DFSOut, B.LocalNum, B.Def, B.U); + return std::tie(A.DFSIn, A.LocalNum, isADef) < + std::tie(B.DFSIn, B.LocalNum, isBDef); return localComesBefore(A, B); } @@ -164,10 +171,35 @@ struct ValueDFS_Compare { // For two phi related values, return the ordering. 
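For the VFABI change earlier in this hunk, the attribute value built by setVectorVariantNames is simply the variant mappings joined with commas; the sketch below reproduces that string building outside of LLVM, and both the attribute name shown in the comment and the mangled names are illustrative assumptions.

#include <cstdio>
#include <string>
#include <vector>

int main() {
  // Hypothetical VFABI mangled names mapping a scalar foo to vector variants.
  std::vector<std::string> VariantMappings = {"_ZGV_LLVM_N4v_foo(foo_vec4)",
                                              "_ZGV_LLVM_N8v_foo(foo_vec8)"};
  std::string Buffer;
  for (const std::string &M : VariantMappings)
    Buffer += M + ",";
  if (!Buffer.empty())
    Buffer.pop_back(); // get rid of the trailing ','
  // The call site would then carry something like:
  //   "vector-function-abi-variant"="<Buffer>"   (attribute name assumed)
  std::printf("%s\n", Buffer.c_str());
  return 0;
}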
bool comparePHIRelated(const ValueDFS &A, const ValueDFS &B) const { - auto &ABlockEdge = getBlockEdge(A); - auto &BBlockEdge = getBlockEdge(B); - // Now sort by block edge and then defs before uses. - return std::tie(ABlockEdge, A.Def, A.U) < std::tie(BBlockEdge, B.Def, B.U); + BasicBlock *ASrc, *ADest, *BSrc, *BDest; + std::tie(ASrc, ADest) = getBlockEdge(A); + std::tie(BSrc, BDest) = getBlockEdge(B); + +#ifndef NDEBUG + // This function should only be used for values in the same BB, check that. + DomTreeNode *DomASrc = DT.getNode(ASrc); + DomTreeNode *DomBSrc = DT.getNode(BSrc); + assert(DomASrc->getDFSNumIn() == (unsigned)A.DFSIn && + "DFS numbers for A should match the ones of the source block"); + assert(DomBSrc->getDFSNumIn() == (unsigned)B.DFSIn && + "DFS numbers for B should match the ones of the source block"); + assert(A.DFSIn == B.DFSIn && "Values must be in the same block"); +#endif + (void)ASrc; + (void)BSrc; + + // Use DFS numbers to compare destination blocks, to guarantee a + // deterministic order. + DomTreeNode *DomADest = DT.getNode(ADest); + DomTreeNode *DomBDest = DT.getNode(BDest); + unsigned AIn = DomADest->getDFSNumIn(); + unsigned BIn = DomBDest->getDFSNumIn(); + bool isADef = A.Def; + bool isBDef = B.Def; + assert((!A.Def || !A.U) && (!B.Def || !B.U) && + "Def and U cannot be set at the same time"); + // Now sort by edge destination and then defs before uses. + return std::tie(AIn, isADef) < std::tie(BIn, isBDef); } // Get the definition of an instruction that occurs in the middle of a block. @@ -306,10 +338,11 @@ void collectCmpOps(CmpInst *Comparison, SmallVectorImpl<Value *> &CmpOperands) { } // Add Op, PB to the list of value infos for Op, and mark Op to be renamed. -void PredicateInfo::addInfoFor(SmallPtrSetImpl<Value *> &OpsToRename, Value *Op, +void PredicateInfo::addInfoFor(SmallVectorImpl<Value *> &OpsToRename, Value *Op, PredicateBase *PB) { - OpsToRename.insert(Op); auto &OperandInfo = getOrCreateValueInfo(Op); + if (OperandInfo.Infos.empty()) + OpsToRename.push_back(Op); AllInfos.push_back(PB); OperandInfo.Infos.push_back(PB); } @@ -317,7 +350,7 @@ void PredicateInfo::addInfoFor(SmallPtrSetImpl<Value *> &OpsToRename, Value *Op, // Process an assume instruction and place relevant operations we want to rename // into OpsToRename. void PredicateInfo::processAssume(IntrinsicInst *II, BasicBlock *AssumeBB, - SmallPtrSetImpl<Value *> &OpsToRename) { + SmallVectorImpl<Value *> &OpsToRename) { // See if we have a comparison we support SmallVector<Value *, 8> CmpOperands; SmallVector<Value *, 2> ConditionsToProcess; @@ -357,7 +390,7 @@ void PredicateInfo::processAssume(IntrinsicInst *II, BasicBlock *AssumeBB, // Process a block terminating branch, and place relevant operations to be // renamed into OpsToRename. void PredicateInfo::processBranch(BranchInst *BI, BasicBlock *BranchBB, - SmallPtrSetImpl<Value *> &OpsToRename) { + SmallVectorImpl<Value *> &OpsToRename) { BasicBlock *FirstBB = BI->getSuccessor(0); BasicBlock *SecondBB = BI->getSuccessor(1); SmallVector<BasicBlock *, 2> SuccsToProcess; @@ -427,7 +460,7 @@ void PredicateInfo::processBranch(BranchInst *BI, BasicBlock *BranchBB, // Process a block terminating switch, and place relevant operations to be // renamed into OpsToRename. 
void PredicateInfo::processSwitch(SwitchInst *SI, BasicBlock *BranchBB, - SmallPtrSetImpl<Value *> &OpsToRename) { + SmallVectorImpl<Value *> &OpsToRename) { Value *Op = SI->getCondition(); if ((!isa<Instruction>(Op) && !isa<Argument>(Op)) || Op->hasOneUse()) return; @@ -457,7 +490,7 @@ void PredicateInfo::buildPredicateInfo() { DT.updateDFSNumbers(); // Collect operands to rename from all conditional branch terminators, as well // as assume statements. - SmallPtrSet<Value *, 8> OpsToRename; + SmallVector<Value *, 8> OpsToRename; for (auto DTN : depth_first(DT.getRootNode())) { BasicBlock *BranchBB = DTN->getBlock(); if (auto *BI = dyn_cast<BranchInst>(BranchBB->getTerminator())) { @@ -524,7 +557,7 @@ Value *PredicateInfo::materializeStack(unsigned int &Counter, if (isa<PredicateWithEdge>(ValInfo)) { IRBuilder<> B(getBranchTerminator(ValInfo)); Function *IF = getCopyDeclaration(F.getParent(), Op->getType()); - if (empty(IF->users())) + if (IF->users().empty()) CreatedDeclarations.insert(IF); CallInst *PIC = B.CreateCall(IF, Op, Op->getName() + "." + Twine(Counter++)); @@ -536,7 +569,7 @@ Value *PredicateInfo::materializeStack(unsigned int &Counter, "Should not have gotten here without it being an assume"); IRBuilder<> B(PAssume->AssumeInst); Function *IF = getCopyDeclaration(F.getParent(), Op->getType()); - if (empty(IF->users())) + if (IF->users().empty()) CreatedDeclarations.insert(IF); CallInst *PIC = B.CreateCall(IF, Op); PredicateMap.insert({PIC, ValInfo}); @@ -565,14 +598,8 @@ Value *PredicateInfo::materializeStack(unsigned int &Counter, // // TODO: Use this algorithm to perform fast single-variable renaming in // promotememtoreg and memoryssa. -void PredicateInfo::renameUses(SmallPtrSetImpl<Value *> &OpSet) { - // Sort OpsToRename since we are going to iterate it. - SmallVector<Value *, 8> OpsToRename(OpSet.begin(), OpSet.end()); - auto Comparator = [&](const Value *A, const Value *B) { - return valueComesBefore(OI, A, B); - }; - llvm::sort(OpsToRename, Comparator); - ValueDFS_Compare Compare(OI); +void PredicateInfo::renameUses(SmallVectorImpl<Value *> &OpsToRename) { + ValueDFS_Compare Compare(DT, OI); // Compute liveness, and rename in O(uses) per Op. 
for (auto *Op : OpsToRename) { LLVM_DEBUG(dbgs() << "Visiting " << *Op << "\n"); @@ -772,7 +799,7 @@ static void replaceCreatedSSACopys(PredicateInfo &PredInfo, Function &F) { bool PredicateInfoPrinterLegacyPass::runOnFunction(Function &F) { auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - auto PredInfo = make_unique<PredicateInfo>(F, DT, AC); + auto PredInfo = std::make_unique<PredicateInfo>(F, DT, AC); PredInfo->print(dbgs()); if (VerifyPredicateInfo) PredInfo->verifyPredicateInfo(); @@ -786,7 +813,7 @@ PreservedAnalyses PredicateInfoPrinterPass::run(Function &F, auto &DT = AM.getResult<DominatorTreeAnalysis>(F); auto &AC = AM.getResult<AssumptionAnalysis>(F); OS << "PredicateInfo for function: " << F.getName() << "\n"; - auto PredInfo = make_unique<PredicateInfo>(F, DT, AC); + auto PredInfo = std::make_unique<PredicateInfo>(F, DT, AC); PredInfo->print(OS); replaceCreatedSSACopys(*PredInfo, F); @@ -845,7 +872,7 @@ PreservedAnalyses PredicateInfoVerifierPass::run(Function &F, FunctionAnalysisManager &AM) { auto &DT = AM.getResult<DominatorTreeAnalysis>(F); auto &AC = AM.getResult<AssumptionAnalysis>(F); - make_unique<PredicateInfo>(F, DT, AC)->verifyPredicateInfo(); + std::make_unique<PredicateInfo>(F, DT, AC)->verifyPredicateInfo(); return PreservedAnalyses::all(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 6e2ef67408d9..0ea6e99e6f19 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -24,6 +24,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/EHPersonalities.h" +#include "llvm/Analysis/GuardUtils.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" @@ -94,6 +95,12 @@ static cl::opt<unsigned> PHINodeFoldingThreshold( cl::desc( "Control the amount of phi node folding to perform (default = 2)")); +static cl::opt<unsigned> TwoEntryPHINodeFoldingThreshold( + "two-entry-phi-node-folding-threshold", cl::Hidden, cl::init(4), + cl::desc("Control the maximal total instruction cost that we are willing " + "to speculatively execute to fold a 2-entry PHI node into a " + "select (default = 4)")); + static cl::opt<bool> DupRet( "simplifycfg-dup-ret", cl::Hidden, cl::init(false), cl::desc("Duplicate return instructions into unconditional branches")); @@ -332,7 +339,7 @@ static unsigned ComputeSpeculationCost(const User *I, /// CostRemaining, false is returned and CostRemaining is undefined. static bool DominatesMergePoint(Value *V, BasicBlock *BB, SmallPtrSetImpl<Instruction *> &AggressiveInsts, - unsigned &CostRemaining, + int &BudgetRemaining, const TargetTransformInfo &TTI, unsigned Depth = 0) { // It is possible to hit a zero-cost cycle (phi/gep instructions for example), @@ -375,7 +382,7 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, if (!isSafeToSpeculativelyExecute(I)) return false; - unsigned Cost = ComputeSpeculationCost(I, TTI); + BudgetRemaining -= ComputeSpeculationCost(I, TTI); // Allow exactly one instruction to be speculated regardless of its cost // (as long as it is safe to do so). @@ -383,17 +390,14 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, // or other expensive operation. 
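The switch from an unsigned CostRemaining to a signed BudgetRemaining above removes the need for wrap-around clamping; a small standalone model of the new accounting, using the default threshold of 4 basic costs from the option added in this hunk and ignoring the SpeculateOneExpensiveInst exception, is:

#include <cstdio>

int main() {
  const int TCC_Basic = 1;              // stand-in for TargetTransformInfo::TCC_Basic
  int BudgetRemaining = 4 * TCC_Basic;  // TwoEntryPHINodeFoldingThreshold default
  const int InstCosts[] = {1, 1, 4};    // speculation candidates in dominance order

  for (int Cost : InstCosts) {
    BudgetRemaining -= Cost;            // may legitimately go negative now
    if (BudgetRemaining < 0) {
      std::puts("budget exhausted, refuse to speculate");
      return 0;
    }
    std::printf("speculated one instruction, budget left: %d\n", BudgetRemaining);
  }
  return 0;
}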
The speculation of an expensive instruction // is expected to be undone in CodeGenPrepare if the speculation has not // enabled further IR optimizations. - if (Cost > CostRemaining && + if (BudgetRemaining < 0 && (!SpeculateOneExpensiveInst || !AggressiveInsts.empty() || Depth > 0)) return false; - // Avoid unsigned wrap. - CostRemaining = (Cost > CostRemaining) ? 0 : CostRemaining - Cost; - // Okay, we can only really hoist these out if their operands do // not take us over the cost threshold. for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i) - if (!DominatesMergePoint(*i, BB, AggressiveInsts, CostRemaining, TTI, + if (!DominatesMergePoint(*i, BB, AggressiveInsts, BudgetRemaining, TTI, Depth + 1)) return false; // Okay, it's safe to do this! Remember this instruction. @@ -629,8 +633,7 @@ private: /// vector. /// One "Extra" case is allowed to differ from the other. void gather(Value *V) { - Instruction *I = dyn_cast<Instruction>(V); - bool isEQ = (I->getOpcode() == Instruction::Or); + bool isEQ = (cast<Instruction>(V)->getOpcode() == Instruction::Or); // Keep a stack (SmallVector for efficiency) for depth-first traversal SmallVector<Value *, 8> DFT; @@ -1313,7 +1316,8 @@ static bool HoistThenElseCodeToIf(BranchInst *BI, LLVMContext::MD_dereferenceable, LLVMContext::MD_dereferenceable_or_null, LLVMContext::MD_mem_parallel_loop_access, - LLVMContext::MD_access_group}; + LLVMContext::MD_access_group, + LLVMContext::MD_preserve_access_index}; combineMetadata(I1, I2, KnownIDs, true); // I1 and I2 are being combined into a single instruction. Its debug @@ -1400,10 +1404,16 @@ HoistTerminator: // These values do not agree. Insert a select instruction before NT // that determines the right value. SelectInst *&SI = InsertedSelects[std::make_pair(BB1V, BB2V)]; - if (!SI) + if (!SI) { + // Propagate fast-math-flags from phi node to its replacement select. + IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); + if (isa<FPMathOperator>(PN)) + Builder.setFastMathFlags(PN.getFastMathFlags()); + SI = cast<SelectInst>( Builder.CreateSelect(BI->getCondition(), BB1V, BB2V, BB1V->getName() + "." + BB2V->getName(), BI)); + } // Make the PHI node use the select for all incoming values for BB1/BB2 for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) @@ -1420,6 +1430,20 @@ HoistTerminator: return true; } +// Check lifetime markers. +static bool isLifeTimeMarker(const Instruction *I) { + if (auto II = dyn_cast<IntrinsicInst>(I)) { + switch (II->getIntrinsicID()) { + default: + break; + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + return true; + } + } + return false; +} + // All instructions in Insts belong to different blocks that all unconditionally // branch to a common successor. Analyze each instruction and return true if it // would be possible to sink them into their successor, creating one common @@ -1472,20 +1496,25 @@ static bool canSinkInstructions( return false; } - // Because SROA can't handle speculating stores of selects, try not - // to sink loads or stores of allocas when we'd have to create a PHI for - // the address operand. Also, because it is likely that loads or stores - // of allocas will disappear when Mem2Reg/SROA is run, don't sink them. + // Because SROA can't handle speculating stores of selects, try not to sink + // loads, stores or lifetime markers of allocas when we'd have to create a + // PHI for the address operand. 
Also, because it is likely that loads or + // stores of allocas will disappear when Mem2Reg/SROA is run, don't sink + // them. // This can cause code churn which can have unintended consequences down // the line - see https://llvm.org/bugs/show_bug.cgi?id=30244. // FIXME: This is a workaround for a deficiency in SROA - see // https://llvm.org/bugs/show_bug.cgi?id=30188 if (isa<StoreInst>(I0) && any_of(Insts, [](const Instruction *I) { - return isa<AllocaInst>(I->getOperand(1)); + return isa<AllocaInst>(I->getOperand(1)->stripPointerCasts()); })) return false; if (isa<LoadInst>(I0) && any_of(Insts, [](const Instruction *I) { - return isa<AllocaInst>(I->getOperand(0)); + return isa<AllocaInst>(I->getOperand(0)->stripPointerCasts()); + })) + return false; + if (isLifeTimeMarker(I0) && any_of(Insts, [](const Instruction *I) { + return isa<AllocaInst>(I->getOperand(1)->stripPointerCasts()); })) return false; @@ -1958,7 +1987,7 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, SmallVector<Instruction *, 4> SpeculatedDbgIntrinsics; - unsigned SpeculationCost = 0; + unsigned SpeculatedInstructions = 0; Value *SpeculatedStoreValue = nullptr; StoreInst *SpeculatedStore = nullptr; for (BasicBlock::iterator BBI = ThenBB->begin(), @@ -1973,8 +2002,8 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, // Only speculatively execute a single instruction (not counting the // terminator) for now. - ++SpeculationCost; - if (SpeculationCost > 1) + ++SpeculatedInstructions; + if (SpeculatedInstructions > 1) return false; // Don't hoist the instruction if it's unsafe or expensive. @@ -2011,8 +2040,8 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, E = SinkCandidateUseCounts.end(); I != E; ++I) if (I->first->hasNUses(I->second)) { - ++SpeculationCost; - if (SpeculationCost > 1) + ++SpeculatedInstructions; + if (SpeculatedInstructions > 1) return false; } @@ -2052,8 +2081,8 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, // getting expanded into Instructions. // FIXME: This doesn't account for how many operations are combined in the // constant expression. - ++SpeculationCost; - if (SpeculationCost > 1) + ++SpeculatedInstructions; + if (SpeculatedInstructions > 1) return false; } @@ -2239,14 +2268,14 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout &DL, if (!BBI->use_empty()) TranslateMap[&*BBI] = N; } - // Insert the new instruction into its new home. - if (N) + if (N) { + // Insert the new instruction into its new home. EdgeBB->getInstList().insert(InsertPt, N); - // Register the new instruction with the assumption cache if necessary. - if (auto *II = dyn_cast_or_null<IntrinsicInst>(N)) - if (II->getIntrinsicID() == Intrinsic::assume) - AC->registerAssumption(II); + // Register the new instruction with the assumption cache if necessary. + if (AC && match(N, m_Intrinsic<Intrinsic::assume>())) + AC->registerAssumption(cast<IntrinsicInst>(N)); + } } // Loop over all of the edges from PredBB to BB, changing them to branch @@ -2301,10 +2330,8 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, // instructions. While we are at it, keep track of the instructions // that need to be moved to the dominating block. 
SmallPtrSet<Instruction *, 4> AggressiveInsts; - unsigned MaxCostVal0 = PHINodeFoldingThreshold, - MaxCostVal1 = PHINodeFoldingThreshold; - MaxCostVal0 *= TargetTransformInfo::TCC_Basic; - MaxCostVal1 *= TargetTransformInfo::TCC_Basic; + int BudgetRemaining = + TwoEntryPHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic; for (BasicBlock::iterator II = BB->begin(); isa<PHINode>(II);) { PHINode *PN = cast<PHINode>(II++); @@ -2315,9 +2342,9 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, } if (!DominatesMergePoint(PN->getIncomingValue(0), BB, AggressiveInsts, - MaxCostVal0, TTI) || + BudgetRemaining, TTI) || !DominatesMergePoint(PN->getIncomingValue(1), BB, AggressiveInsts, - MaxCostVal1, TTI)) + BudgetRemaining, TTI)) return false; } @@ -2327,12 +2354,24 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, if (!PN) return true; - // Don't fold i1 branches on PHIs which contain binary operators. These can - // often be turned into switches and other things. + // Return true if at least one of these is a 'not', and another is either + // a 'not' too, or a constant. + auto CanHoistNotFromBothValues = [](Value *V0, Value *V1) { + if (!match(V0, m_Not(m_Value()))) + std::swap(V0, V1); + auto Invertible = m_CombineOr(m_Not(m_Value()), m_AnyIntegralConstant()); + return match(V0, m_Not(m_Value())) && match(V1, Invertible); + }; + + // Don't fold i1 branches on PHIs which contain binary operators, unless one + // of the incoming values is an 'not' and another one is freely invertible. + // These can often be turned into switches and other things. if (PN->getType()->isIntegerTy(1) && (isa<BinaryOperator>(PN->getIncomingValue(0)) || isa<BinaryOperator>(PN->getIncomingValue(1)) || - isa<BinaryOperator>(IfCond))) + isa<BinaryOperator>(IfCond)) && + !CanHoistNotFromBothValues(PN->getIncomingValue(0), + PN->getIncomingValue(1))) return false; // If all PHI nodes are promotable, check to make sure that all instructions @@ -2367,6 +2406,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, return false; } } + assert(DomBlock && "Failed to find root DomBlock"); LLVM_DEBUG(dbgs() << "FOUND IF CONDITION! " << *IfCond << " T: " << IfTrue->getName() @@ -2384,7 +2424,12 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, if (IfBlock2) hoistAllInstructionsInto(DomBlock, InsertPt, IfBlock2); + // Propagate fast-math-flags from phi nodes to replacement selects. + IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) { + if (isa<FPMathOperator>(PN)) + Builder.setFastMathFlags(PN->getFastMathFlags()); + // Change the PHI node into a select instruction. Value *TrueVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfFalse); Value *FalseVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfTrue); @@ -2912,42 +2957,8 @@ static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB, BasicBlock *QTB, BasicBlock *QFB, BasicBlock *PostBB, Value *Address, bool InvertPCond, bool InvertQCond, - const DataLayout &DL) { - auto IsaBitcastOfPointerType = [](const Instruction &I) { - return Operator::getOpcode(&I) == Instruction::BitCast && - I.getType()->isPointerTy(); - }; - - // If we're not in aggressive mode, we only optimize if we have some - // confidence that by optimizing we'll allow P and/or Q to be if-converted. 
- auto IsWorthwhile = [&](BasicBlock *BB) { - if (!BB) - return true; - // Heuristic: if the block can be if-converted/phi-folded and the - // instructions inside are all cheap (arithmetic/GEPs), it's worthwhile to - // thread this store. - unsigned N = 0; - for (auto &I : BB->instructionsWithoutDebug()) { - // Cheap instructions viable for folding. - if (isa<BinaryOperator>(I) || isa<GetElementPtrInst>(I) || - isa<StoreInst>(I)) - ++N; - // Free instructions. - else if (I.isTerminator() || IsaBitcastOfPointerType(I)) - continue; - else - return false; - } - // The store we want to merge is counted in N, so add 1 to make sure - // we're counting the instructions that would be left. - return N <= (PHINodeFoldingThreshold + 1); - }; - - if (!MergeCondStoresAggressively && - (!IsWorthwhile(PTB) || !IsWorthwhile(PFB) || !IsWorthwhile(QTB) || - !IsWorthwhile(QFB))) - return false; - + const DataLayout &DL, + const TargetTransformInfo &TTI) { // For every pointer, there must be exactly two stores, one coming from // PTB or PFB, and the other from QTB or QFB. We don't support more than one // store (to any address) in PTB,PFB or QTB,QFB. @@ -2988,6 +2999,46 @@ static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB, if (&*I != PStore && I->mayReadOrWriteMemory()) return false; + // If we're not in aggressive mode, we only optimize if we have some + // confidence that by optimizing we'll allow P and/or Q to be if-converted. + auto IsWorthwhile = [&](BasicBlock *BB, ArrayRef<StoreInst *> FreeStores) { + if (!BB) + return true; + // Heuristic: if the block can be if-converted/phi-folded and the + // instructions inside are all cheap (arithmetic/GEPs), it's worthwhile to + // thread this store. + int BudgetRemaining = + PHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic; + for (auto &I : BB->instructionsWithoutDebug()) { + // Consider terminator instruction to be free. + if (I.isTerminator()) + continue; + // If this is one the stores that we want to speculate out of this BB, + // then don't count it's cost, consider it to be free. + if (auto *S = dyn_cast<StoreInst>(&I)) + if (llvm::find(FreeStores, S)) + continue; + // Else, we have a white-list of instructions that we are ak speculating. + if (!isa<BinaryOperator>(I) && !isa<GetElementPtrInst>(I)) + return false; // Not in white-list - not worthwhile folding. + // And finally, if this is a non-free instruction that we are okay + // speculating, ensure that we consider the speculation budget. + BudgetRemaining -= TTI.getUserCost(&I); + if (BudgetRemaining < 0) + return false; // Eagerly refuse to fold as soon as we're out of budget. + } + assert(BudgetRemaining >= 0 && + "When we run out of budget we will eagerly return from within the " + "per-instruction loop."); + return true; + }; + + const SmallVector<StoreInst *, 2> FreeStores = {PStore, QStore}; + if (!MergeCondStoresAggressively && + (!IsWorthwhile(PTB, FreeStores) || !IsWorthwhile(PFB, FreeStores) || + !IsWorthwhile(QTB, FreeStores) || !IsWorthwhile(QFB, FreeStores))) + return false; + // If PostBB has more than two predecessors, we need to split it so we can // sink the store. if (std::next(pred_begin(PostBB), 2) != pred_end(PostBB)) { @@ -3047,15 +3098,15 @@ static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB, // store that doesn't execute. if (MinAlignment != 0) { // Choose the minimum of all non-zero alignments. 
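The rewritten IsWorthwhile heuristic further down in this hunk charges every non-terminator instruction against a budget of PHINodeFoldingThreshold * TCC_Basic, except for the two stores that are about to be merged; this standalone sketch models that accounting with a plain cost field instead of TTI::getUserCost and the binary-operator/GEP whitelist.

#include <cstdio>
#include <vector>

struct Inst {
  int Cost;
  bool IsTerminator;
  bool IsFreeStore; // one of the stores that will be sunk and merged
};

static bool isWorthwhile(const std::vector<Inst> &BB, int Threshold) {
  int BudgetRemaining = Threshold; // PHINodeFoldingThreshold * TCC_Basic
  for (const Inst &I : BB) {
    if (I.IsTerminator || I.IsFreeStore)
      continue; // considered free, not charged against the budget
    BudgetRemaining -= I.Cost;
    if (BudgetRemaining < 0)
      return false; // eagerly refuse once over budget
  }
  return true;
}

int main() {
  // A GEP feeding the store, the store itself, and the block terminator.
  std::vector<Inst> Block = {{1, false, false}, {0, false, true}, {0, true, false}};
  std::printf("worthwhile to merge: %s\n",
              isWorthwhile(Block, 2 /* default threshold */) ? "yes" : "no");
  return 0;
}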
- SI->setAlignment(MinAlignment); + SI->setAlignment(Align(MinAlignment)); } else if (MaxAlignment != 0) { // Choose the minimal alignment between the non-zero alignment and the ABI // default alignment for the type of the stored value. - SI->setAlignment(std::min(MaxAlignment, TypeAlignment)); + SI->setAlignment(Align(std::min(MaxAlignment, TypeAlignment))); } else { // If both alignments are zero, use ABI default alignment for the type of // the stored value. - SI->setAlignment(TypeAlignment); + SI->setAlignment(Align(TypeAlignment)); } QStore->eraseFromParent(); @@ -3065,7 +3116,8 @@ static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB, } static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI, - const DataLayout &DL) { + const DataLayout &DL, + const TargetTransformInfo &TTI) { // The intention here is to find diamonds or triangles (see below) where each // conditional block contains a store to the same address. Both of these // stores are conditional, so they can't be unconditionally sunk. But it may @@ -3167,16 +3219,58 @@ static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI, bool Changed = false; for (auto *Address : CommonAddresses) Changed |= mergeConditionalStoreToAddress( - PTB, PFB, QTB, QFB, PostBB, Address, InvertPCond, InvertQCond, DL); + PTB, PFB, QTB, QFB, PostBB, Address, InvertPCond, InvertQCond, DL, TTI); return Changed; } + +/// If the previous block ended with a widenable branch, determine if reusing +/// the target block is profitable and legal. This will have the effect of +/// "widening" PBI, but doesn't require us to reason about hosting safety. +static bool tryWidenCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) { + // TODO: This can be generalized in two important ways: + // 1) We can allow phi nodes in IfFalseBB and simply reuse all the input + // values from the PBI edge. + // 2) We can sink side effecting instructions into BI's fallthrough + // successor provided they doesn't contribute to computation of + // BI's condition. + Value *CondWB, *WC; + BasicBlock *IfTrueBB, *IfFalseBB; + if (!parseWidenableBranch(PBI, CondWB, WC, IfTrueBB, IfFalseBB) || + IfTrueBB != BI->getParent() || !BI->getParent()->getSinglePredecessor()) + return false; + if (!IfFalseBB->phis().empty()) + return false; // TODO + // Use lambda to lazily compute expensive condition after cheap ones. + auto NoSideEffects = [](BasicBlock &BB) { + return !llvm::any_of(BB, [](const Instruction &I) { + return I.mayWriteToMemory() || I.mayHaveSideEffects(); + }); + }; + if (BI->getSuccessor(1) != IfFalseBB && // no inf looping + BI->getSuccessor(1)->getTerminatingDeoptimizeCall() && // profitability + NoSideEffects(*BI->getParent())) { + BI->getSuccessor(1)->removePredecessor(BI->getParent()); + BI->setSuccessor(1, IfFalseBB); + return true; + } + if (BI->getSuccessor(0) != IfFalseBB && // no inf looping + BI->getSuccessor(0)->getTerminatingDeoptimizeCall() && // profitability + NoSideEffects(*BI->getParent())) { + BI->getSuccessor(0)->removePredecessor(BI->getParent()); + BI->setSuccessor(0, IfFalseBB); + return true; + } + return false; +} + /// If we have a conditional branch as a predecessor of another block, /// this function tries to simplify it. We know /// that PBI and BI are both conditional branches, and BI is in one of the /// successor blocks of PBI - PBI branches to BI. 
static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, - const DataLayout &DL) { + const DataLayout &DL, + const TargetTransformInfo &TTI) { assert(PBI->isConditional() && BI->isConditional()); BasicBlock *BB = BI->getParent(); @@ -3225,6 +3319,12 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, } } + // If the previous block ended with a widenable branch, determine if reusing + // the target block is profitable and legal. This will have the effect of + // "widening" PBI, but doesn't require us to reason about hosting safety. + if (tryWidenCondBranchToCondBranch(PBI, BI)) + return true; + if (auto *CE = dyn_cast<ConstantExpr>(BI->getCondition())) if (CE->canTrap()) return false; @@ -3232,7 +3332,7 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, // If both branches are conditional and both contain stores to the same // address, remove the stores from the conditionals and create a conditional // merged store at the end. - if (MergeCondStores && mergeConditionalStores(PBI, BI, DL)) + if (MergeCondStores && mergeConditionalStores(PBI, BI, DL, TTI)) return true; // If this is a conditional branch in an empty block, and if any @@ -3696,12 +3796,17 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder, BasicBlock *BB = BI->getParent(); + // MSAN does not like undefs as branch condition which can be introduced + // with "explicit branch". + if (ExtraCase && BB->getParent()->hasFnAttribute(Attribute::SanitizeMemory)) + return false; + LLVM_DEBUG(dbgs() << "Converting 'icmp' chain with " << Values.size() << " cases into SWITCH. BB is:\n" << *BB); // If there are any extra values that couldn't be folded into the switch - // then we evaluate them with an explicit branch first. Split the block + // then we evaluate them with an explicit branch first. Split the block // right before the condbr to handle it. if (ExtraCase) { BasicBlock *NewBB = @@ -3850,7 +3955,7 @@ bool SimplifyCFGOpt::SimplifyCommonResume(ResumeInst *RI) { // Simplify resume that is only used by a single (non-phi) landing pad. 
bool SimplifyCFGOpt::SimplifySingleResume(ResumeInst *RI) { BasicBlock *BB = RI->getParent(); - LandingPadInst *LPInst = dyn_cast<LandingPadInst>(BB->getFirstNonPHI()); + auto *LPInst = cast<LandingPadInst>(BB->getFirstNonPHI()); assert(RI->getValue() == LPInst && "Resume must unwind the exception that caused control to here"); @@ -4177,23 +4282,22 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { IRBuilder<> Builder(TI); if (auto *BI = dyn_cast<BranchInst>(TI)) { if (BI->isUnconditional()) { - if (BI->getSuccessor(0) == BB) { - new UnreachableInst(TI->getContext(), TI); - TI->eraseFromParent(); - Changed = true; - } + assert(BI->getSuccessor(0) == BB && "Incorrect CFG"); + new UnreachableInst(TI->getContext(), TI); + TI->eraseFromParent(); + Changed = true; } else { Value* Cond = BI->getCondition(); if (BI->getSuccessor(0) == BB) { Builder.CreateAssumption(Builder.CreateNot(Cond)); Builder.CreateBr(BI->getSuccessor(1)); - EraseTerminatorAndDCECond(BI); - } else if (BI->getSuccessor(1) == BB) { + } else { + assert(BI->getSuccessor(1) == BB && "Incorrect CFG"); Builder.CreateAssumption(Cond); Builder.CreateBr(BI->getSuccessor(0)); - EraseTerminatorAndDCECond(BI); - Changed = true; } + EraseTerminatorAndDCECond(BI); + Changed = true; } } else if (auto *SI = dyn_cast<SwitchInst>(TI)) { SwitchInstProfUpdateWrapper SU(*SI); @@ -4275,6 +4379,17 @@ static bool CasesAreContiguous(SmallVectorImpl<ConstantInt *> &Cases) { return true; } +static void createUnreachableSwitchDefault(SwitchInst *Switch) { + LLVM_DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n"); + BasicBlock *NewDefaultBlock = + SplitBlockPredecessors(Switch->getDefaultDest(), Switch->getParent(), ""); + Switch->setDefaultDest(&*NewDefaultBlock); + SplitBlock(&*NewDefaultBlock, &NewDefaultBlock->front()); + auto *NewTerminator = NewDefaultBlock->getTerminator(); + new UnreachableInst(Switch->getContext(), NewTerminator); + EraseTerminatorAndDCECond(NewTerminator); +} + /// Turn a switch with two reachable destinations into an integer range /// comparison and branch. static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) { @@ -4383,6 +4498,11 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) { cast<PHINode>(BBI)->removeIncomingValue(SI->getParent()); } + // Clean up the default block - it may have phis or other instructions before + // the unreachable terminator. + if (!HasDefault) + createUnreachableSwitchDefault(SI); + // Drop the switch. SI->eraseFromParent(); @@ -4427,14 +4547,7 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC, if (HasDefault && DeadCases.empty() && NumUnknownBits < 64 /* avoid overflow */ && SI->getNumCases() == (1ULL << NumUnknownBits)) { - LLVM_DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n"); - BasicBlock *NewDefault = - SplitBlockPredecessors(SI->getDefaultDest(), SI->getParent(), ""); - SI->setDefaultDest(&*NewDefault); - SplitBlock(&*NewDefault, &NewDefault->front()); - auto *OldTI = NewDefault->getTerminator(); - new UnreachableInst(SI->getContext(), OldTI); - EraseTerminatorAndDCECond(OldTI); + createUnreachableSwitchDefault(SI); return true; } @@ -5030,7 +5143,7 @@ SwitchLookupTable::SwitchLookupTable( Array->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Set the alignment to that of an array items. We will be only loading one // value out of it. 
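TurnSwitchRangeIntoICmp applies when the reachable cases of a switch form a contiguous range leading to one destination; the whole switch then reduces to a single unsigned subtract-and-compare. A hedged C++ sketch of the before/after shape (names are illustrative):

// Sketch: cases 10..13 all take the same path, so the switch is equivalent
// to the range test (x - 10) <= 3 in unsigned arithmetic.
bool handled(unsigned x) {
  switch (x) {
  case 10: case 11: case 12: case 13:
    return true;
  default:
    return false;
  }
  // Conceptually, after the transform: return x - 10u <= 3u;
}

int main() { return handled(12) ? 0 : 1; }
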
- Array->setAlignment(DL.getPrefTypeAlignment(ValueType)); + Array->setAlignment(Align(DL.getPrefTypeAlignment(ValueType))); Kind = ArrayKind; } @@ -5259,7 +5372,7 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, // Figure out the corresponding result for each case value and phi node in the // common destination, as well as the min and max case values. - assert(!empty(SI->cases())); + assert(!SI->cases().empty()); SwitchInst::CaseIt CI = SI->case_begin(); ConstantInt *MinCaseVal = CI->getCaseValue(); ConstantInt *MaxCaseVal = CI->getCaseValue(); @@ -5891,7 +6004,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator())) if (PBI != BI && PBI->isConditional()) - if (SimplifyCondBranchToCondBranch(PBI, BI, DL)) + if (SimplifyCondBranchToCondBranch(PBI, BI, DL, TTI)) return requestResimplify(); // Look for diamond patterns. @@ -5899,7 +6012,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { if (BasicBlock *PrevBB = allPredecessorsComeFromSameSource(BB)) if (BranchInst *PBI = dyn_cast<BranchInst>(PrevBB->getTerminator())) if (PBI != BI && PBI->isConditional()) - if (mergeConditionalStores(PBI, BI, DL)) + if (mergeConditionalStores(PBI, BI, DL, TTI)) return requestResimplify(); return false; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index e938ae6cb42f..fa3a9d21f3df 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -35,6 +35,7 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/KnownBits.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/SizeOpts.h" @@ -47,7 +48,6 @@ static cl::opt<bool> cl::desc("Enable unsafe double to float " "shrinking for math lib calls")); - //===----------------------------------------------------------------------===// // Helper Functions //===----------------------------------------------------------------------===// @@ -177,7 +177,8 @@ static bool canTransformToMemCmp(CallInst *CI, Value *Str, uint64_t Len, if (!isOnlyUsedInComparisonWithZero(CI)) return false; - if (!isDereferenceableAndAlignedPointer(Str, 1, APInt(64, Len), DL)) + if (!isDereferenceableAndAlignedPointer(Str, Align::None(), APInt(64, Len), + DL)) return false; if (CI->getFunction()->hasFnAttribute(Attribute::SanitizeMemory)) @@ -186,6 +187,67 @@ static bool canTransformToMemCmp(CallInst *CI, Value *Str, uint64_t Len, return true; } +static void annotateDereferenceableBytes(CallInst *CI, + ArrayRef<unsigned> ArgNos, + uint64_t DereferenceableBytes) { + const Function *F = CI->getCaller(); + if (!F) + return; + for (unsigned ArgNo : ArgNos) { + uint64_t DerefBytes = DereferenceableBytes; + unsigned AS = CI->getArgOperand(ArgNo)->getType()->getPointerAddressSpace(); + if (!llvm::NullPointerIsDefined(F, AS) || + CI->paramHasAttr(ArgNo, Attribute::NonNull)) + DerefBytes = std::max(CI->getDereferenceableOrNullBytes( + ArgNo + AttributeList::FirstArgIndex), + DereferenceableBytes); + + if (CI->getDereferenceableBytes(ArgNo + AttributeList::FirstArgIndex) < + DerefBytes) { + CI->removeParamAttr(ArgNo, Attribute::Dereferenceable); + if 
(!llvm::NullPointerIsDefined(F, AS) || + CI->paramHasAttr(ArgNo, Attribute::NonNull)) + CI->removeParamAttr(ArgNo, Attribute::DereferenceableOrNull); + CI->addParamAttr(ArgNo, Attribute::getWithDereferenceableBytes( + CI->getContext(), DerefBytes)); + } + } +} + +static void annotateNonNullBasedOnAccess(CallInst *CI, + ArrayRef<unsigned> ArgNos) { + Function *F = CI->getCaller(); + if (!F) + return; + + for (unsigned ArgNo : ArgNos) { + if (CI->paramHasAttr(ArgNo, Attribute::NonNull)) + continue; + unsigned AS = CI->getArgOperand(ArgNo)->getType()->getPointerAddressSpace(); + if (llvm::NullPointerIsDefined(F, AS)) + continue; + + CI->addParamAttr(ArgNo, Attribute::NonNull); + annotateDereferenceableBytes(CI, ArgNo, 1); + } +} + +static void annotateNonNullAndDereferenceable(CallInst *CI, ArrayRef<unsigned> ArgNos, + Value *Size, const DataLayout &DL) { + if (ConstantInt *LenC = dyn_cast<ConstantInt>(Size)) { + annotateNonNullBasedOnAccess(CI, ArgNos); + annotateDereferenceableBytes(CI, ArgNos, LenC->getZExtValue()); + } else if (isKnownNonZero(Size, DL)) { + annotateNonNullBasedOnAccess(CI, ArgNos); + const APInt *X, *Y; + uint64_t DerefMin = 1; + if (match(Size, m_Select(m_Value(), m_APInt(X), m_APInt(Y)))) { + DerefMin = std::min(X->getZExtValue(), Y->getZExtValue()); + annotateDereferenceableBytes(CI, ArgNos, DerefMin); + } + } +} + //===----------------------------------------------------------------------===// // String and Memory Library Call Optimizations //===----------------------------------------------------------------------===// @@ -194,10 +256,13 @@ Value *LibCallSimplifier::optimizeStrCat(CallInst *CI, IRBuilder<> &B) { // Extract some information from the instruction Value *Dst = CI->getArgOperand(0); Value *Src = CI->getArgOperand(1); + annotateNonNullBasedOnAccess(CI, {0, 1}); // See if we can get the length of the input string. uint64_t Len = GetStringLength(Src); - if (Len == 0) + if (Len) + annotateDereferenceableBytes(CI, 1, Len); + else return nullptr; --Len; // Unbias length. @@ -223,8 +288,9 @@ Value *LibCallSimplifier::emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len, // We have enough information to now generate the memcpy call to do the // concatenation for us. Make a memcpy to copy the nul byte with align = 1. - B.CreateMemCpy(CpyDst, 1, Src, 1, - ConstantInt::get(DL.getIntPtrType(Src->getContext()), Len + 1)); + B.CreateMemCpy( + CpyDst, Align::None(), Src, Align::None(), + ConstantInt::get(DL.getIntPtrType(Src->getContext()), Len + 1)); return Dst; } @@ -232,24 +298,34 @@ Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilder<> &B) { // Extract some information from the instruction. Value *Dst = CI->getArgOperand(0); Value *Src = CI->getArgOperand(1); + Value *Size = CI->getArgOperand(2); uint64_t Len; + annotateNonNullBasedOnAccess(CI, 0); + if (isKnownNonZero(Size, DL)) + annotateNonNullBasedOnAccess(CI, 1); // We don't do anything if length is not constant. - if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2))) + ConstantInt *LengthArg = dyn_cast<ConstantInt>(Size); + if (LengthArg) { Len = LengthArg->getZExtValue(); - else + // strncat(x, c, 0) -> x + if (!Len) + return Dst; + } else { return nullptr; + } // See if we can get the length of the input string. uint64_t SrcLen = GetStringLength(Src); - if (SrcLen == 0) + if (SrcLen) { + annotateDereferenceableBytes(CI, 1, SrcLen); + --SrcLen; // Unbias length. + } else { return nullptr; - --SrcLen; // Unbias length. 
+ } - // Handle the simple, do-nothing cases: // strncat(x, "", c) -> x - // strncat(x, c, 0) -> x - if (SrcLen == 0 || Len == 0) + if (SrcLen == 0) return Dst; // We don't optimize this case. @@ -265,13 +341,18 @@ Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); FunctionType *FT = Callee->getFunctionType(); Value *SrcStr = CI->getArgOperand(0); + annotateNonNullBasedOnAccess(CI, 0); // If the second operand is non-constant, see if we can compute the length // of the input string and turn this into memchr. ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); if (!CharC) { uint64_t Len = GetStringLength(SrcStr); - if (Len == 0 || !FT->getParamType(1)->isIntegerTy(32)) // memchr needs i32. + if (Len) + annotateDereferenceableBytes(CI, 0, Len); + else + return nullptr; + if (!FT->getParamType(1)->isIntegerTy(32)) // memchr needs i32. return nullptr; return emitMemChr(SrcStr, CI->getArgOperand(1), // include nul. @@ -284,8 +365,8 @@ Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilder<> &B) { StringRef Str; if (!getConstantStringInfo(SrcStr, Str)) { if (CharC->isZero()) // strchr(p, 0) -> p + strlen(p) - return B.CreateGEP(B.getInt8Ty(), SrcStr, emitStrLen(SrcStr, B, DL, TLI), - "strchr"); + if (Value *StrLen = emitStrLen(SrcStr, B, DL, TLI)) + return B.CreateGEP(B.getInt8Ty(), SrcStr, StrLen, "strchr"); return nullptr; } @@ -304,6 +385,7 @@ Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilder<> &B) { Value *LibCallSimplifier::optimizeStrRChr(CallInst *CI, IRBuilder<> &B) { Value *SrcStr = CI->getArgOperand(0); ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); + annotateNonNullBasedOnAccess(CI, 0); // Cannot fold anything if we're not looking for a constant. if (!CharC) @@ -351,7 +433,12 @@ Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilder<> &B) { // strcmp(P, "x") -> memcmp(P, "x", 2) uint64_t Len1 = GetStringLength(Str1P); + if (Len1) + annotateDereferenceableBytes(CI, 0, Len1); uint64_t Len2 = GetStringLength(Str2P); + if (Len2) + annotateDereferenceableBytes(CI, 1, Len2); + if (Len1 && Len2) { return emitMemCmp(Str1P, Str2P, ConstantInt::get(DL.getIntPtrType(CI->getContext()), @@ -374,17 +461,22 @@ Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilder<> &B) { TLI); } + annotateNonNullBasedOnAccess(CI, {0, 1}); return nullptr; } Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilder<> &B) { - Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1); + Value *Str1P = CI->getArgOperand(0); + Value *Str2P = CI->getArgOperand(1); + Value *Size = CI->getArgOperand(2); if (Str1P == Str2P) // strncmp(x,x,n) -> 0 return ConstantInt::get(CI->getType(), 0); + if (isKnownNonZero(Size, DL)) + annotateNonNullBasedOnAccess(CI, {0, 1}); // Get the length argument if it is constant. 
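The strchr(p, 0) -> p + strlen(p) fold above relies on strchr treating the terminating nul as part of the searched string, so asking for '\0' always yields the end pointer. A minimal self-contained check of that identity:

#include <cassert>
#include <cstring>

int main() {
  const char *p = "simplify";
  // strchr(p, 0) and p + strlen(p) are guaranteed to be the same pointer.
  assert(std::strchr(p, 0) == p + std::strlen(p));
  return 0;
}
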
uint64_t Length; - if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2))) + if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(Size)) Length = LengthArg->getZExtValue(); else return nullptr; @@ -393,7 +485,7 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilder<> &B) { return ConstantInt::get(CI->getType(), 0); if (Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1) - return emitMemCmp(Str1P, Str2P, CI->getArgOperand(2), B, DL, TLI); + return emitMemCmp(Str1P, Str2P, Size, B, DL, TLI); StringRef Str1, Str2; bool HasStr1 = getConstantStringInfo(Str1P, Str1); @@ -415,7 +507,11 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilder<> &B) { CI->getType()); uint64_t Len1 = GetStringLength(Str1P); + if (Len1) + annotateDereferenceableBytes(CI, 0, Len1); uint64_t Len2 = GetStringLength(Str2P); + if (Len2) + annotateDereferenceableBytes(CI, 1, Len2); // strncmp to memcmp if (!HasStr1 && HasStr2) { @@ -437,20 +533,38 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilder<> &B) { return nullptr; } +Value *LibCallSimplifier::optimizeStrNDup(CallInst *CI, IRBuilder<> &B) { + Value *Src = CI->getArgOperand(0); + ConstantInt *Size = dyn_cast<ConstantInt>(CI->getArgOperand(1)); + uint64_t SrcLen = GetStringLength(Src); + if (SrcLen && Size) { + annotateDereferenceableBytes(CI, 0, SrcLen); + if (SrcLen <= Size->getZExtValue() + 1) + return emitStrDup(Src, B, TLI); + } + + return nullptr; +} + Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilder<> &B) { Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); if (Dst == Src) // strcpy(x,x) -> x return Src; - + + annotateNonNullBasedOnAccess(CI, {0, 1}); // See if we can get the length of the input string. uint64_t Len = GetStringLength(Src); - if (Len == 0) + if (Len) + annotateDereferenceableBytes(CI, 1, Len); + else return nullptr; // We have enough information to now generate the memcpy call to do the // copy for us. Make a memcpy to copy the nul byte with align = 1. - B.CreateMemCpy(Dst, 1, Src, 1, - ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len)); + CallInst *NewCI = + B.CreateMemCpy(Dst, Align::None(), Src, Align::None(), + ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len)); + NewCI->setAttributes(CI->getAttributes()); return Dst; } @@ -464,7 +578,9 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) { // See if we can get the length of the input string. uint64_t Len = GetStringLength(Src); - if (Len == 0) + if (Len) + annotateDereferenceableBytes(CI, 1, Len); + else return nullptr; Type *PT = Callee->getFunctionType()->getParamType(0); @@ -474,7 +590,9 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) { // We have enough information to now generate the memcpy call to do the // copy for us. Make a memcpy to copy the nul byte with align = 1. 
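For optimizeStrCpy, once GetStringLength proves the source length (it returns the length including the nul), the whole copy becomes a fixed-size memcpy. A rough C++ picture of the rewrite, assuming a 5-character source:

#include <cstdio>
#include <cstring>

int main() {
  char dst[8];
  const char *src = "hello";
  // strcpy(dst, src);         // original call; source length is known
  std::memcpy(dst, src, 6);    // the rewrite: 5 characters plus the nul byte
  std::puts(dst);
  return 0;
}
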
- B.CreateMemCpy(Dst, 1, Src, 1, LenV); + CallInst *NewCI = + B.CreateMemCpy(Dst, Align::None(), Src, Align::None(), LenV); + NewCI->setAttributes(CI->getAttributes()); return DstEnd; } @@ -482,37 +600,48 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); Value *Dst = CI->getArgOperand(0); Value *Src = CI->getArgOperand(1); - Value *LenOp = CI->getArgOperand(2); + Value *Size = CI->getArgOperand(2); + annotateNonNullBasedOnAccess(CI, 0); + if (isKnownNonZero(Size, DL)) + annotateNonNullBasedOnAccess(CI, 1); + + uint64_t Len; + if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(Size)) + Len = LengthArg->getZExtValue(); + else + return nullptr; + + // strncpy(x, y, 0) -> x + if (Len == 0) + return Dst; // See if we can get the length of the input string. uint64_t SrcLen = GetStringLength(Src); - if (SrcLen == 0) + if (SrcLen) { + annotateDereferenceableBytes(CI, 1, SrcLen); + --SrcLen; // Unbias length. + } else { return nullptr; - --SrcLen; + } if (SrcLen == 0) { // strncpy(x, "", y) -> memset(align 1 x, '\0', y) - B.CreateMemSet(Dst, B.getInt8('\0'), LenOp, 1); + CallInst *NewCI = B.CreateMemSet(Dst, B.getInt8('\0'), Size, Align::None()); + AttrBuilder ArgAttrs(CI->getAttributes().getParamAttributes(0)); + NewCI->setAttributes(NewCI->getAttributes().addParamAttributes( + CI->getContext(), 0, ArgAttrs)); return Dst; } - uint64_t Len; - if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(LenOp)) - Len = LengthArg->getZExtValue(); - else - return nullptr; - - if (Len == 0) - return Dst; // strncpy(x, y, 0) -> x - // Let strncpy handle the zero padding if (Len > SrcLen + 1) return nullptr; Type *PT = Callee->getFunctionType()->getParamType(0); // strncpy(x, s, c) -> memcpy(align 1 x, align 1 s, c) [s and c are constant] - B.CreateMemCpy(Dst, 1, Src, 1, ConstantInt::get(DL.getIntPtrType(PT), Len)); - + CallInst *NewCI = B.CreateMemCpy(Dst, Align::None(), Src, Align::None(), + ConstantInt::get(DL.getIntPtrType(PT), Len)); + NewCI->setAttributes(CI->getAttributes()); return Dst; } @@ -608,7 +737,10 @@ Value *LibCallSimplifier::optimizeStringLength(CallInst *CI, IRBuilder<> &B, } Value *LibCallSimplifier::optimizeStrLen(CallInst *CI, IRBuilder<> &B) { - return optimizeStringLength(CI, B, 8); + if (Value *V = optimizeStringLength(CI, B, 8)) + return V; + annotateNonNullBasedOnAccess(CI, 0); + return nullptr; } Value *LibCallSimplifier::optimizeWcslen(CallInst *CI, IRBuilder<> &B) { @@ -756,21 +888,35 @@ Value *LibCallSimplifier::optimizeStrStr(CallInst *CI, IRBuilder<> &B) { Value *StrChr = emitStrChr(CI->getArgOperand(0), ToFindStr[0], B, TLI); return StrChr ? 
B.CreateBitCast(StrChr, CI->getType()) : nullptr; } + + annotateNonNullBasedOnAccess(CI, {0, 1}); + return nullptr; +} + +Value *LibCallSimplifier::optimizeMemRChr(CallInst *CI, IRBuilder<> &B) { + if (isKnownNonZero(CI->getOperand(2), DL)) + annotateNonNullBasedOnAccess(CI, 0); return nullptr; } Value *LibCallSimplifier::optimizeMemChr(CallInst *CI, IRBuilder<> &B) { Value *SrcStr = CI->getArgOperand(0); + Value *Size = CI->getArgOperand(2); + annotateNonNullAndDereferenceable(CI, 0, Size, DL); ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); - ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getArgOperand(2)); + ConstantInt *LenC = dyn_cast<ConstantInt>(Size); // memchr(x, y, 0) -> null - if (LenC && LenC->isZero()) - return Constant::getNullValue(CI->getType()); + if (LenC) { + if (LenC->isZero()) + return Constant::getNullValue(CI->getType()); + } else { + // From now on we need at least constant length and string. + return nullptr; + } - // From now on we need at least constant length and string. StringRef Str; - if (!LenC || !getConstantStringInfo(SrcStr, Str, 0, /*TrimAtNul=*/false)) + if (!getConstantStringInfo(SrcStr, Str, 0, /*TrimAtNul=*/false)) return nullptr; // Truncate the string to LenC. If Str is smaller than LenC we will still only @@ -913,6 +1059,7 @@ static Value *optimizeMemCmpConstantSize(CallInst *CI, Value *LHS, Value *RHS, Ret = 1; return ConstantInt::get(CI->getType(), Ret); } + return nullptr; } @@ -925,12 +1072,19 @@ Value *LibCallSimplifier::optimizeMemCmpBCmpCommon(CallInst *CI, if (LHS == RHS) // memcmp(s,s,x) -> 0 return Constant::getNullValue(CI->getType()); + annotateNonNullAndDereferenceable(CI, {0, 1}, Size, DL); // Handle constant lengths. - if (ConstantInt *LenC = dyn_cast<ConstantInt>(Size)) - if (Value *Res = optimizeMemCmpConstantSize(CI, LHS, RHS, - LenC->getZExtValue(), B, DL)) - return Res; + ConstantInt *LenC = dyn_cast<ConstantInt>(Size); + if (!LenC) + return nullptr; + // memcmp(d,s,0) -> 0 + if (LenC->getZExtValue() == 0) + return Constant::getNullValue(CI->getType()); + + if (Value *Res = + optimizeMemCmpConstantSize(CI, LHS, RHS, LenC->getZExtValue(), B, DL)) + return Res; return nullptr; } @@ -939,9 +1093,9 @@ Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilder<> &B) { return V; // memcmp(x, y, Len) == 0 -> bcmp(x, y, Len) == 0 - // `bcmp` can be more efficient than memcmp because it only has to know that - // there is a difference, not where it is. - if (isOnlyUsedInZeroEqualityComparison(CI) && TLI->has(LibFunc_bcmp)) { + // bcmp can be more efficient than memcmp because it only has to know that + // there is a difference, not how different one is to the other. 
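The rewrite fires only when the memcmp result is consumed purely as an equality test against zero, as in this sketch; bcmp reports just zero or non-zero, with no ordering, which is exactly enough for such callers (the helper name is illustrative):

#include <cstring>

// Sketch: only the zero/non-zero outcome is used, so this call qualifies for
// the memcmp -> bcmp rewrite on targets where bcmp is available.
static bool blocksEqual(const void *a, const void *b, std::size_t n) {
  return std::memcmp(a, b, n) == 0;
}

int main() {
  int x[2] = {1, 2}, y[2] = {1, 2};
  return blocksEqual(x, y, sizeof(x)) ? 0 : 1;
}
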
+ if (TLI->has(LibFunc_bcmp) && isOnlyUsedInZeroEqualityComparison(CI)) { Value *LHS = CI->getArgOperand(0); Value *RHS = CI->getArgOperand(1); Value *Size = CI->getArgOperand(2); @@ -956,16 +1110,78 @@ Value *LibCallSimplifier::optimizeBCmp(CallInst *CI, IRBuilder<> &B) { } Value *LibCallSimplifier::optimizeMemCpy(CallInst *CI, IRBuilder<> &B) { + Value *Size = CI->getArgOperand(2); + annotateNonNullAndDereferenceable(CI, {0, 1}, Size, DL); + if (isa<IntrinsicInst>(CI)) + return nullptr; + // memcpy(x, y, n) -> llvm.memcpy(align 1 x, align 1 y, n) - B.CreateMemCpy(CI->getArgOperand(0), 1, CI->getArgOperand(1), 1, - CI->getArgOperand(2)); + CallInst *NewCI = B.CreateMemCpy(CI->getArgOperand(0), Align::None(), + CI->getArgOperand(1), Align::None(), Size); + NewCI->setAttributes(CI->getAttributes()); return CI->getArgOperand(0); } +Value *LibCallSimplifier::optimizeMemCCpy(CallInst *CI, IRBuilder<> &B) { + Value *Dst = CI->getArgOperand(0); + Value *Src = CI->getArgOperand(1); + ConstantInt *StopChar = dyn_cast<ConstantInt>(CI->getArgOperand(2)); + ConstantInt *N = dyn_cast<ConstantInt>(CI->getArgOperand(3)); + StringRef SrcStr; + if (CI->use_empty() && Dst == Src) + return Dst; + // memccpy(d, s, c, 0) -> nullptr + if (N) { + if (N->isNullValue()) + return Constant::getNullValue(CI->getType()); + if (!getConstantStringInfo(Src, SrcStr, /*Offset=*/0, + /*TrimAtNul=*/false) || + !StopChar) + return nullptr; + } else { + return nullptr; + } + + // Wrap arg 'c' of type int to char + size_t Pos = SrcStr.find(StopChar->getSExtValue() & 0xFF); + if (Pos == StringRef::npos) { + if (N->getZExtValue() <= SrcStr.size()) { + B.CreateMemCpy(Dst, Align::None(), Src, Align::None(), + CI->getArgOperand(3)); + return Constant::getNullValue(CI->getType()); + } + return nullptr; + } + + Value *NewN = + ConstantInt::get(N->getType(), std::min(uint64_t(Pos + 1), N->getZExtValue())); + // memccpy -> llvm.memcpy + B.CreateMemCpy(Dst, Align::None(), Src, Align::None(), NewN); + return Pos + 1 <= N->getZExtValue() + ? 
B.CreateInBoundsGEP(B.getInt8Ty(), Dst, NewN) + : Constant::getNullValue(CI->getType()); +} + +Value *LibCallSimplifier::optimizeMemPCpy(CallInst *CI, IRBuilder<> &B) { + Value *Dst = CI->getArgOperand(0); + Value *N = CI->getArgOperand(2); + // mempcpy(x, y, n) -> llvm.memcpy(align 1 x, align 1 y, n), x + n + CallInst *NewCI = B.CreateMemCpy(Dst, Align::None(), CI->getArgOperand(1), + Align::None(), N); + NewCI->setAttributes(CI->getAttributes()); + return B.CreateInBoundsGEP(B.getInt8Ty(), Dst, N); +} + Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilder<> &B) { + Value *Size = CI->getArgOperand(2); + annotateNonNullAndDereferenceable(CI, {0, 1}, Size, DL); + if (isa<IntrinsicInst>(CI)) + return nullptr; + // memmove(x, y, n) -> llvm.memmove(align 1 x, align 1 y, n) - B.CreateMemMove(CI->getArgOperand(0), 1, CI->getArgOperand(1), 1, - CI->getArgOperand(2)); + CallInst *NewCI = B.CreateMemMove(CI->getArgOperand(0), Align::None(), + CI->getArgOperand(1), Align::None(), Size); + NewCI->setAttributes(CI->getAttributes()); return CI->getArgOperand(0); } @@ -1003,25 +1219,30 @@ Value *LibCallSimplifier::foldMallocMemset(CallInst *Memset, IRBuilder<> &B) { B.SetInsertPoint(Malloc->getParent(), ++Malloc->getIterator()); const DataLayout &DL = Malloc->getModule()->getDataLayout(); IntegerType *SizeType = DL.getIntPtrType(B.GetInsertBlock()->getContext()); - Value *Calloc = emitCalloc(ConstantInt::get(SizeType, 1), - Malloc->getArgOperand(0), Malloc->getAttributes(), - B, *TLI); - if (!Calloc) - return nullptr; - - Malloc->replaceAllUsesWith(Calloc); - eraseFromParent(Malloc); + if (Value *Calloc = emitCalloc(ConstantInt::get(SizeType, 1), + Malloc->getArgOperand(0), + Malloc->getAttributes(), B, *TLI)) { + substituteInParent(Malloc, Calloc); + return Calloc; + } - return Calloc; + return nullptr; } Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilder<> &B) { + Value *Size = CI->getArgOperand(2); + annotateNonNullAndDereferenceable(CI, 0, Size, DL); + if (isa<IntrinsicInst>(CI)) + return nullptr; + if (auto *Calloc = foldMallocMemset(CI, B)) return Calloc; // memset(p, v, n) -> llvm.memset(align 1 p, v, n) Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false); - B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1); + CallInst *NewCI = + B.CreateMemSet(CI->getArgOperand(0), Val, Size, Align::None()); + NewCI->setAttributes(CI->getAttributes()); return CI->getArgOperand(0); } @@ -1096,21 +1317,18 @@ static Value *optimizeDoubleFP(CallInst *CI, IRBuilder<> &B, if (!V[0] || (isBinary && !V[1])) return nullptr; - StringRef CalleeNm = CalleeFn->getName(); - AttributeList CalleeAt = CalleeFn->getAttributes(); - bool CalleeIn = CalleeFn->isIntrinsic(); - // If call isn't an intrinsic, check that it isn't within a function with the // same name as the float version of this call, otherwise the result is an // infinite loop. 
For example, from MinGW-w64: // // float expf(float val) { return (float) exp((double) val); } - if (!CalleeIn) { - const Function *Fn = CI->getFunction(); - StringRef FnName = Fn->getName(); - if (FnName.back() == 'f' && - FnName.size() == (CalleeNm.size() + 1) && - FnName.startswith(CalleeNm)) + StringRef CalleeName = CalleeFn->getName(); + bool IsIntrinsic = CalleeFn->isIntrinsic(); + if (!IsIntrinsic) { + StringRef CallerName = CI->getFunction()->getName(); + if (!CallerName.empty() && CallerName.back() == 'f' && + CallerName.size() == (CalleeName.size() + 1) && + CallerName.startswith(CalleeName)) return nullptr; } @@ -1120,16 +1338,16 @@ static Value *optimizeDoubleFP(CallInst *CI, IRBuilder<> &B, // g((double) float) -> (double) gf(float) Value *R; - if (CalleeIn) { + if (IsIntrinsic) { Module *M = CI->getModule(); Intrinsic::ID IID = CalleeFn->getIntrinsicID(); Function *Fn = Intrinsic::getDeclaration(M, IID, B.getFloatTy()); R = isBinary ? B.CreateCall(Fn, V) : B.CreateCall(Fn, V[0]); + } else { + AttributeList CalleeAttrs = CalleeFn->getAttributes(); + R = isBinary ? emitBinaryFloatFnCall(V[0], V[1], CalleeName, B, CalleeAttrs) + : emitUnaryFloatFnCall(V[0], CalleeName, B, CalleeAttrs); } - else - R = isBinary ? emitBinaryFloatFnCall(V[0], V[1], CalleeNm, B, CalleeAt) - : emitUnaryFloatFnCall(V[0], CalleeNm, B, CalleeAt); - return B.CreateFPExt(R, B.getDoubleTy()); } @@ -1234,9 +1452,25 @@ static Value *getPow(Value *InnerChain[33], unsigned Exp, IRBuilder<> &B) { return InnerChain[Exp]; } +// Return a properly extended 32-bit integer if the operation is an itofp. +static Value *getIntToFPVal(Value *I2F, IRBuilder<> &B) { + if (isa<SIToFPInst>(I2F) || isa<UIToFPInst>(I2F)) { + Value *Op = cast<Instruction>(I2F)->getOperand(0); + // Make sure that the exponent fits inside an int32_t, + // thus avoiding any range issues that FP has not. + unsigned BitWidth = Op->getType()->getPrimitiveSizeInBits(); + if (BitWidth < 32 || + (BitWidth == 32 && isa<SIToFPInst>(I2F))) + return isa<SIToFPInst>(I2F) ? B.CreateSExt(Op, B.getInt32Ty()) + : B.CreateZExt(Op, B.getInt32Ty()); + } + + return nullptr; +} + /// Use exp{,2}(x * y) for pow(exp{,2}(x), y); -/// exp2(n * x) for pow(2.0 ** n, x); exp10(x) for pow(10.0, x); -/// exp2(log2(n) * x) for pow(n, x). +/// ldexp(1.0, x) for pow(2.0, itofp(x)); exp2(n * x) for pow(2.0 ** n, x); +/// exp10(x) for pow(10.0, x); exp2(log2(n) * x) for pow(n, x). Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) { Value *Base = Pow->getArgOperand(0), *Expo = Pow->getArgOperand(1); AttributeList Attrs = Pow->getCalledFunction()->getAttributes(); @@ -1269,9 +1503,7 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) { StringRef ExpName; Intrinsic::ID ID; Value *ExpFn; - LibFunc LibFnFloat; - LibFunc LibFnDouble; - LibFunc LibFnLongDouble; + LibFunc LibFnFloat, LibFnDouble, LibFnLongDouble; switch (LibFn) { default: @@ -1305,9 +1537,7 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) { // elimination cannot be trusted to remove it, since it may have side // effects (e.g., errno). When the only consumer for the original // exp{,2}() is pow(), then it has to be explicitly erased. 
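The replacePowWithExp fold rests on the identity pow(exp(x), y) == exp(x * y); it is guarded by fast-math because the two evaluations can round differently, and the original exp call must be erased explicitly since errno makes it look side-effecting. A purely illustrative numerical check:

#include <cmath>
#include <cstdio>

int main() {
  double x = 0.75, y = 3.5;
  // The two results agree mathematically but may differ in the last bits.
  std::printf("%.17g\n%.17g\n", std::pow(std::exp(x), y), std::exp(x * y));
  return 0;
}
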
- BaseFn->replaceAllUsesWith(ExpFn); - eraseFromParent(BaseFn); - + substituteInParent(BaseFn, ExpFn); return ExpFn; } } @@ -1318,8 +1548,18 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) { if (!match(Pow->getArgOperand(0), m_APFloat(BaseF))) return nullptr; + // pow(2.0, itofp(x)) -> ldexp(1.0, x) + if (match(Base, m_SpecificFP(2.0)) && + (isa<SIToFPInst>(Expo) || isa<UIToFPInst>(Expo)) && + hasFloatFn(TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) { + if (Value *ExpoI = getIntToFPVal(Expo, B)) + return emitBinaryFloatFnCall(ConstantFP::get(Ty, 1.0), ExpoI, TLI, + LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl, + B, Attrs); + } + // pow(2.0 ** n, x) -> exp2(n * x) - if (hasUnaryFloatFn(TLI, Ty, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l)) { + if (hasFloatFn(TLI, Ty, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l)) { APFloat BaseR = APFloat(1.0); BaseR.convert(BaseF->getSemantics(), APFloat::rmTowardZero, &Ignored); BaseR = BaseR / *BaseF; @@ -1344,7 +1584,7 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) { // pow(10.0, x) -> exp10(x) // TODO: There is no exp10() intrinsic yet, but some day there shall be one. if (match(Base, m_SpecificFP(10.0)) && - hasUnaryFloatFn(TLI, Ty, LibFunc_exp10, LibFunc_exp10f, LibFunc_exp10l)) + hasFloatFn(TLI, Ty, LibFunc_exp10, LibFunc_exp10f, LibFunc_exp10l)) return emitUnaryFloatFnCall(Expo, TLI, LibFunc_exp10, LibFunc_exp10f, LibFunc_exp10l, B, Attrs); @@ -1359,17 +1599,15 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) { if (Log) { Value *FMul = B.CreateFMul(Log, Expo, "mul"); - if (Pow->doesNotAccessMemory()) { + if (Pow->doesNotAccessMemory()) return B.CreateCall(Intrinsic::getDeclaration(Mod, Intrinsic::exp2, Ty), FMul, "exp2"); - } else { - if (hasUnaryFloatFn(TLI, Ty, LibFunc_exp2, LibFunc_exp2f, - LibFunc_exp2l)) - return emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2, LibFunc_exp2f, - LibFunc_exp2l, B, Attrs); - } + else if (hasFloatFn(TLI, Ty, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l)) + return emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2, LibFunc_exp2f, + LibFunc_exp2l, B, Attrs); } } + return nullptr; } @@ -1384,8 +1622,7 @@ static Value *getSqrtCall(Value *V, AttributeList Attrs, bool NoErrno, } // Otherwise, use the libcall for sqrt(). - if (hasUnaryFloatFn(TLI, V->getType(), LibFunc_sqrt, LibFunc_sqrtf, - LibFunc_sqrtl)) + if (hasFloatFn(TLI, V->getType(), LibFunc_sqrt, LibFunc_sqrtf, LibFunc_sqrtl)) // TODO: We also should check that the target can in fact lower the sqrt() // libcall. We currently have no way to ask this question, so we ask if // the target has a sqrt() libcall, which is not exactly the same. @@ -1407,6 +1644,11 @@ Value *LibCallSimplifier::replacePowWithSqrt(CallInst *Pow, IRBuilder<> &B) { (!ExpoF->isExactlyValue(0.5) && !ExpoF->isExactlyValue(-0.5))) return nullptr; + // Converting pow(X, -0.5) to 1/sqrt(X) may introduce an extra rounding step, + // so that requires fast-math-flags (afn or reassoc). + if (ExpoF->isNegative() && (!Pow->hasApproxFunc() && !Pow->hasAllowReassoc())) + return nullptr; + Sqrt = getSqrtCall(Base, Attrs, Pow->doesNotAccessMemory(), Mod, B, TLI); if (!Sqrt) return nullptr; @@ -1452,7 +1694,7 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilder<> &B) { bool Ignored; // Bail out if simplifying libcalls to pow() is disabled. 
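The new pow(2.0, itofp(x)) -> ldexp(1.0, x) case uses ldexp to build an exact power of two from an integer exponent. The sketch below simply prints both sides for a range of exponents so the intended equivalence can be eyeballed (how precisely the pow side is computed depends on the libm):

#include <cmath>
#include <cstdio>

int main() {
  for (int n = -8; n <= 8; ++n)
    std::printf("n=%3d  pow=%g  ldexp=%g\n", n, std::pow(2.0, (double)n),
                std::ldexp(1.0, n));
  return 0;
}
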
- if (!hasUnaryFloatFn(TLI, Ty, LibFunc_pow, LibFunc_powf, LibFunc_powl)) + if (!hasFloatFn(TLI, Ty, LibFunc_pow, LibFunc_powf, LibFunc_powl)) return nullptr; // Propagate the math semantics from the call to any created instructions. @@ -1504,7 +1746,7 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilder<> &B) { // TODO: This whole transformation should be backend specific (e.g. some // backends might prefer libcalls or the limit for the exponent might // be different) and it should also consider optimizing for size. - APFloat LimF(ExpoF->getSemantics(), 33.0), + APFloat LimF(ExpoF->getSemantics(), 33), ExpoA(abs(*ExpoF)); if (ExpoA.compare(LimF) == APFloat::cmpLessThan) { // This transformation applies to integer or integer+0.5 exponents only. @@ -1558,16 +1800,8 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilder<> &B) { // powf(x, itofp(y)) -> powi(x, y) if (AllowApprox && (isa<SIToFPInst>(Expo) || isa<UIToFPInst>(Expo))) { - Value *IntExpo = cast<Instruction>(Expo)->getOperand(0); - Value *NewExpo = nullptr; - unsigned BitWidth = IntExpo->getType()->getPrimitiveSizeInBits(); - if (isa<SIToFPInst>(Expo) && BitWidth == 32) - NewExpo = IntExpo; - else if (BitWidth < 32) - NewExpo = isa<SIToFPInst>(Expo) ? B.CreateSExt(IntExpo, B.getInt32Ty()) - : B.CreateZExt(IntExpo, B.getInt32Ty()); - if (NewExpo) - return createPowWithIntegerExponent(Base, NewExpo, M, B); + if (Value *ExpoI = getIntToFPVal(Expo, B)) + return createPowWithIntegerExponent(Base, ExpoI, M, B); } return Shrunk; @@ -1575,45 +1809,25 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilder<> &B) { Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - Value *Ret = nullptr; StringRef Name = Callee->getName(); - if (UnsafeFPShrink && Name == "exp2" && hasFloatVersion(Name)) + Value *Ret = nullptr; + if (UnsafeFPShrink && Name == TLI->getName(LibFunc_exp2) && + hasFloatVersion(Name)) Ret = optimizeUnaryDoubleFP(CI, B, true); + Type *Ty = CI->getType(); Value *Op = CI->getArgOperand(0); + // Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x)) if sizeof(x) <= 32 // Turn exp2(uitofp(x)) -> ldexp(1.0, zext(x)) if sizeof(x) < 32 - LibFunc LdExp = LibFunc_ldexpl; - if (Op->getType()->isFloatTy()) - LdExp = LibFunc_ldexpf; - else if (Op->getType()->isDoubleTy()) - LdExp = LibFunc_ldexp; - - if (TLI->has(LdExp)) { - Value *LdExpArg = nullptr; - if (SIToFPInst *OpC = dyn_cast<SIToFPInst>(Op)) { - if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() <= 32) - LdExpArg = B.CreateSExt(OpC->getOperand(0), B.getInt32Ty()); - } else if (UIToFPInst *OpC = dyn_cast<UIToFPInst>(Op)) { - if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() < 32) - LdExpArg = B.CreateZExt(OpC->getOperand(0), B.getInt32Ty()); - } - - if (LdExpArg) { - Constant *One = ConstantFP::get(CI->getContext(), APFloat(1.0f)); - if (!Op->getType()->isFloatTy()) - One = ConstantExpr::getFPExtend(One, Op->getType()); - - Module *M = CI->getModule(); - FunctionCallee NewCallee = M->getOrInsertFunction( - TLI->getName(LdExp), Op->getType(), Op->getType(), B.getInt32Ty()); - CallInst *CI = B.CreateCall(NewCallee, {One, LdExpArg}); - if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts())) - CI->setCallingConv(F->getCallingConv()); - - return CI; - } + if ((isa<SIToFPInst>(Op) || isa<UIToFPInst>(Op)) && + hasFloatFn(TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) { + if (Value *Exp = getIntToFPVal(Op, B)) + return emitBinaryFloatFnCall(ConstantFP::get(Ty, 
1.0), Exp, TLI, + LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl, + B, CI->getCalledFunction()->getAttributes()); } + return Ret; } @@ -1644,48 +1858,155 @@ Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilder<> &B) { return B.CreateCall(F, { CI->getArgOperand(0), CI->getArgOperand(1) }); } -Value *LibCallSimplifier::optimizeLog(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); +Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilder<> &B) { + Function *LogFn = Log->getCalledFunction(); + AttributeList Attrs = LogFn->getAttributes(); + StringRef LogNm = LogFn->getName(); + Intrinsic::ID LogID = LogFn->getIntrinsicID(); + Module *Mod = Log->getModule(); + Type *Ty = Log->getType(); Value *Ret = nullptr; - StringRef Name = Callee->getName(); - if (UnsafeFPShrink && hasFloatVersion(Name)) - Ret = optimizeUnaryDoubleFP(CI, B, true); - if (!CI->isFast()) - return Ret; - Value *Op1 = CI->getArgOperand(0); - auto *OpC = dyn_cast<CallInst>(Op1); + if (UnsafeFPShrink && hasFloatVersion(LogNm)) + Ret = optimizeUnaryDoubleFP(Log, B, true); // The earlier call must also be 'fast' in order to do these transforms. - if (!OpC || !OpC->isFast()) + CallInst *Arg = dyn_cast<CallInst>(Log->getArgOperand(0)); + if (!Log->isFast() || !Arg || !Arg->isFast() || !Arg->hasOneUse()) return Ret; - // log(pow(x,y)) -> y*log(x) - // This is only applicable to log, log2, log10. - if (Name != "log" && Name != "log2" && Name != "log10") + LibFunc LogLb, ExpLb, Exp2Lb, Exp10Lb, PowLb; + + // This is only applicable to log(), log2(), log10(). + if (TLI->getLibFunc(LogNm, LogLb)) + switch (LogLb) { + case LibFunc_logf: + LogID = Intrinsic::log; + ExpLb = LibFunc_expf; + Exp2Lb = LibFunc_exp2f; + Exp10Lb = LibFunc_exp10f; + PowLb = LibFunc_powf; + break; + case LibFunc_log: + LogID = Intrinsic::log; + ExpLb = LibFunc_exp; + Exp2Lb = LibFunc_exp2; + Exp10Lb = LibFunc_exp10; + PowLb = LibFunc_pow; + break; + case LibFunc_logl: + LogID = Intrinsic::log; + ExpLb = LibFunc_expl; + Exp2Lb = LibFunc_exp2l; + Exp10Lb = LibFunc_exp10l; + PowLb = LibFunc_powl; + break; + case LibFunc_log2f: + LogID = Intrinsic::log2; + ExpLb = LibFunc_expf; + Exp2Lb = LibFunc_exp2f; + Exp10Lb = LibFunc_exp10f; + PowLb = LibFunc_powf; + break; + case LibFunc_log2: + LogID = Intrinsic::log2; + ExpLb = LibFunc_exp; + Exp2Lb = LibFunc_exp2; + Exp10Lb = LibFunc_exp10; + PowLb = LibFunc_pow; + break; + case LibFunc_log2l: + LogID = Intrinsic::log2; + ExpLb = LibFunc_expl; + Exp2Lb = LibFunc_exp2l; + Exp10Lb = LibFunc_exp10l; + PowLb = LibFunc_powl; + break; + case LibFunc_log10f: + LogID = Intrinsic::log10; + ExpLb = LibFunc_expf; + Exp2Lb = LibFunc_exp2f; + Exp10Lb = LibFunc_exp10f; + PowLb = LibFunc_powf; + break; + case LibFunc_log10: + LogID = Intrinsic::log10; + ExpLb = LibFunc_exp; + Exp2Lb = LibFunc_exp2; + Exp10Lb = LibFunc_exp10; + PowLb = LibFunc_pow; + break; + case LibFunc_log10l: + LogID = Intrinsic::log10; + ExpLb = LibFunc_expl; + Exp2Lb = LibFunc_exp2l; + Exp10Lb = LibFunc_exp10l; + PowLb = LibFunc_powl; + break; + default: + return Ret; + } + else if (LogID == Intrinsic::log || LogID == Intrinsic::log2 || + LogID == Intrinsic::log10) { + if (Ty->getScalarType()->isFloatTy()) { + ExpLb = LibFunc_expf; + Exp2Lb = LibFunc_exp2f; + Exp10Lb = LibFunc_exp10f; + PowLb = LibFunc_powf; + } else if (Ty->getScalarType()->isDoubleTy()) { + ExpLb = LibFunc_exp; + Exp2Lb = LibFunc_exp2; + Exp10Lb = LibFunc_exp10; + PowLb = LibFunc_pow; + } else + return Ret; + } else return Ret; 
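The classification above feeds the folds that follow: log_b(pow(x, y)) becomes y * log_b(x) and log_b(exp_k(y)) becomes y * log_b(k), both only under fast-math on the call and its operand. A minimal numerical illustration of the two identities:

#include <cmath>
#include <cstdio>

int main() {
  double x = 3.0, y = 1.75;
  // log(pow(x, y)) vs. y * log(x)
  std::printf("%.17g  %.17g\n", std::log(std::pow(x, y)), y * std::log(x));
  // log10(exp2(y)) vs. y * log10(2)
  std::printf("%.17g  %.17g\n", std::log10(std::exp2(y)),
              y * std::log10(2.0));
  return 0;
}
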
IRBuilder<>::FastMathFlagGuard Guard(B); - FastMathFlags FMF; - FMF.setFast(); - B.setFastMathFlags(FMF); + B.setFastMathFlags(FastMathFlags::getFast()); + + Intrinsic::ID ArgID = Arg->getIntrinsicID(); + LibFunc ArgLb = NotLibFunc; + TLI->getLibFunc(Arg, ArgLb); + + // log(pow(x,y)) -> y*log(x) + if (ArgLb == PowLb || ArgID == Intrinsic::pow) { + Value *LogX = + Log->doesNotAccessMemory() + ? B.CreateCall(Intrinsic::getDeclaration(Mod, LogID, Ty), + Arg->getOperand(0), "log") + : emitUnaryFloatFnCall(Arg->getOperand(0), LogNm, B, Attrs); + Value *MulY = B.CreateFMul(Arg->getArgOperand(1), LogX, "mul"); + // Since pow() may have side effects, e.g. errno, + // dead code elimination may not be trusted to remove it. + substituteInParent(Arg, MulY); + return MulY; + } + + // log(exp{,2,10}(y)) -> y*log({e,2,10}) + // TODO: There is no exp10() intrinsic yet. + if (ArgLb == ExpLb || ArgLb == Exp2Lb || ArgLb == Exp10Lb || + ArgID == Intrinsic::exp || ArgID == Intrinsic::exp2) { + Constant *Eul; + if (ArgLb == ExpLb || ArgID == Intrinsic::exp) + // FIXME: Add more precise value of e for long double. + Eul = ConstantFP::get(Log->getType(), numbers::e); + else if (ArgLb == Exp2Lb || ArgID == Intrinsic::exp2) + Eul = ConstantFP::get(Log->getType(), 2.0); + else + Eul = ConstantFP::get(Log->getType(), 10.0); + Value *LogE = Log->doesNotAccessMemory() + ? B.CreateCall(Intrinsic::getDeclaration(Mod, LogID, Ty), + Eul, "log") + : emitUnaryFloatFnCall(Eul, LogNm, B, Attrs); + Value *MulY = B.CreateFMul(Arg->getArgOperand(0), LogE, "mul"); + // Since exp() may have side effects, e.g. errno, + // dead code elimination may not be trusted to remove it. + substituteInParent(Arg, MulY); + return MulY; + } - LibFunc Func; - Function *F = OpC->getCalledFunction(); - if (F && ((TLI->getLibFunc(F->getName(), Func) && TLI->has(Func) && - Func == LibFunc_pow) || F->getIntrinsicID() == Intrinsic::pow)) - return B.CreateFMul(OpC->getArgOperand(1), - emitUnaryFloatFnCall(OpC->getOperand(0), Callee->getName(), B, - Callee->getAttributes()), "mul"); - - // log(exp2(y)) -> y*log(2) - if (F && Name == "log" && TLI->getLibFunc(F->getName(), Func) && - TLI->has(Func) && Func == LibFunc_exp2) - return B.CreateFMul( - OpC->getArgOperand(0), - emitUnaryFloatFnCall(ConstantFP::get(CI->getType(), 2.0), - Callee->getName(), B, Callee->getAttributes()), - "logmul"); return Ret; } @@ -2137,6 +2458,7 @@ Value *LibCallSimplifier::optimizePrintF(CallInst *CI, IRBuilder<> &B) { return New; } + annotateNonNullBasedOnAccess(CI, 0); return nullptr; } @@ -2154,9 +2476,11 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, IRBuilder<> &B) { return nullptr; // we found a format specifier, bail out. // sprintf(str, fmt) -> llvm.memcpy(align 1 str, align 1 fmt, strlen(fmt)+1) - B.CreateMemCpy(CI->getArgOperand(0), 1, CI->getArgOperand(1), 1, - ConstantInt::get(DL.getIntPtrType(CI->getContext()), - FormatStr.size() + 1)); // Copy the null byte. + B.CreateMemCpy( + CI->getArgOperand(0), Align::None(), CI->getArgOperand(1), + Align::None(), + ConstantInt::get(DL.getIntPtrType(CI->getContext()), + FormatStr.size() + 1)); // Copy the null byte. 
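The sprintf fold just above applies when the format string contains no '%' at all: the call degenerates to copying the literal (including its nul) and returning its length. A small standalone sketch of that equivalence:

#include <cstdio>
#include <cstring>

int main() {
  char buf[16];
  // int n = std::sprintf(buf, "hello");   // no '%' in the format; n == 5
  std::memcpy(buf, "hello", 6);            // the rewrite also copies the nul
  std::printf("%s %zu\n", buf, std::strlen(buf));
  return 0;
}
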
return ConstantInt::get(CI->getType(), FormatStr.size()); } @@ -2191,7 +2515,8 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, IRBuilder<> &B) { return nullptr; Value *IncLen = B.CreateAdd(Len, ConstantInt::get(Len->getType(), 1), "leninc"); - B.CreateMemCpy(CI->getArgOperand(0), 1, CI->getArgOperand(2), 1, IncLen); + B.CreateMemCpy(CI->getArgOperand(0), Align::None(), CI->getArgOperand(2), + Align::None(), IncLen); // The sprintf result is the unincremented number of bytes in the string. return B.CreateIntCast(Len, CI->getType(), false); @@ -2231,21 +2556,21 @@ Value *LibCallSimplifier::optimizeSPrintF(CallInst *CI, IRBuilder<> &B) { return New; } + annotateNonNullBasedOnAccess(CI, {0, 1}); return nullptr; } Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI, IRBuilder<> &B) { - // Check for a fixed format string. - StringRef FormatStr; - if (!getConstantStringInfo(CI->getArgOperand(2), FormatStr)) - return nullptr; - // Check for size ConstantInt *Size = dyn_cast<ConstantInt>(CI->getArgOperand(1)); if (!Size) return nullptr; uint64_t N = Size->getZExtValue(); + // Check for a fixed format string. + StringRef FormatStr; + if (!getConstantStringInfo(CI->getArgOperand(2), FormatStr)) + return nullptr; // If we just have a format string (nothing else crazy) transform it. if (CI->getNumArgOperands() == 3) { @@ -2262,7 +2587,8 @@ Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI, IRBuilder<> &B) { // snprintf(dst, size, fmt) -> llvm.memcpy(align 1 dst, align 1 fmt, // strlen(fmt)+1) B.CreateMemCpy( - CI->getArgOperand(0), 1, CI->getArgOperand(2), 1, + CI->getArgOperand(0), Align::None(), CI->getArgOperand(2), + Align::None(), ConstantInt::get(DL.getIntPtrType(CI->getContext()), FormatStr.size() + 1)); // Copy the null byte. return ConstantInt::get(CI->getType(), FormatStr.size()); @@ -2303,7 +2629,8 @@ Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI, IRBuilder<> &B) { else if (N < Str.size() + 1) return nullptr; - B.CreateMemCpy(CI->getArgOperand(0), 1, CI->getArgOperand(3), 1, + B.CreateMemCpy(CI->getArgOperand(0), Align::None(), CI->getArgOperand(3), + Align::None(), ConstantInt::get(CI->getType(), Str.size() + 1)); // The snprintf result is the unincremented number of bytes in the string. @@ -2318,6 +2645,8 @@ Value *LibCallSimplifier::optimizeSnPrintF(CallInst *CI, IRBuilder<> &B) { return V; } + if (isKnownNonZero(CI->getOperand(1), DL)) + annotateNonNullBasedOnAccess(CI, 0); return nullptr; } @@ -2442,7 +2771,8 @@ Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilder<> &B) { // Don't rewrite fputs to fwrite when optimising for size because fwrite // requires more arguments and thus extra MOVs are required. 
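The tradeoff described above: with a known string length, fputs(s, F) can be rewritten to fwrite(s, strlen(s), 1, F), but the fwrite form needs two extra arguments, which is why the rewrite is skipped when optimizing for size. Illustrative call shapes:

#include <cstdio>
#include <cstring>

int main() {
  const char *s = "hello\n";
  std::fputs(s, stdout);                      // original call
  std::fwrite(s, std::strlen(s), 1, stdout);  // rewritten form: same bytes,
                                              // two extra arguments to set up
  return 0;
}
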
bool OptForSize = CI->getFunction()->hasOptSize() || - llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI); + llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI, + PGSOQueryType::IRPass); if (OptForSize) return nullptr; @@ -2503,6 +2833,7 @@ Value *LibCallSimplifier::optimizeFRead(CallInst *CI, IRBuilder<> &B) { } Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilder<> &B) { + annotateNonNullBasedOnAccess(CI, 0); if (!CI->use_empty()) return nullptr; @@ -2515,6 +2846,13 @@ Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilder<> &B) { return nullptr; } +Value *LibCallSimplifier::optimizeBCopy(CallInst *CI, IRBuilder<> &B) { + // bcopy(src, dst, n) -> llvm.memmove(dst, src, n) + return B.CreateMemMove(CI->getArgOperand(1), Align::None(), + CI->getArgOperand(0), Align::None(), + CI->getArgOperand(2)); +} + bool LibCallSimplifier::hasFloatVersion(StringRef FuncName) { LibFunc Func; SmallString<20> FloatFuncName = FuncName; @@ -2557,6 +2895,8 @@ Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI, return optimizeStrLen(CI, Builder); case LibFunc_strpbrk: return optimizeStrPBrk(CI, Builder); + case LibFunc_strndup: + return optimizeStrNDup(CI, Builder); case LibFunc_strtol: case LibFunc_strtod: case LibFunc_strtof: @@ -2573,12 +2913,18 @@ Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI, return optimizeStrStr(CI, Builder); case LibFunc_memchr: return optimizeMemChr(CI, Builder); + case LibFunc_memrchr: + return optimizeMemRChr(CI, Builder); case LibFunc_bcmp: return optimizeBCmp(CI, Builder); case LibFunc_memcmp: return optimizeMemCmp(CI, Builder); case LibFunc_memcpy: return optimizeMemCpy(CI, Builder); + case LibFunc_memccpy: + return optimizeMemCCpy(CI, Builder); + case LibFunc_mempcpy: + return optimizeMemPCpy(CI, Builder); case LibFunc_memmove: return optimizeMemMove(CI, Builder); case LibFunc_memset: @@ -2587,6 +2933,8 @@ Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI, return optimizeRealloc(CI, Builder); case LibFunc_wcslen: return optimizeWcslen(CI, Builder); + case LibFunc_bcopy: + return optimizeBCopy(CI, Builder); default: break; } @@ -2626,11 +2974,21 @@ Value *LibCallSimplifier::optimizeFloatingPointLibCall(CallInst *CI, case LibFunc_sqrt: case LibFunc_sqrtl: return optimizeSqrt(CI, Builder); + case LibFunc_logf: case LibFunc_log: + case LibFunc_logl: + case LibFunc_log10f: case LibFunc_log10: + case LibFunc_log10l: + case LibFunc_log1pf: case LibFunc_log1p: + case LibFunc_log1pl: + case LibFunc_log2f: case LibFunc_log2: + case LibFunc_log2l: + case LibFunc_logbf: case LibFunc_logb: + case LibFunc_logbl: return optimizeLog(CI, Builder); case LibFunc_tan: case LibFunc_tanf: @@ -2721,10 +3079,18 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) { case Intrinsic::exp2: return optimizeExp2(CI, Builder); case Intrinsic::log: + case Intrinsic::log2: + case Intrinsic::log10: return optimizeLog(CI, Builder); case Intrinsic::sqrt: return optimizeSqrt(CI, Builder); // TODO: Use foldMallocMemset() with memset intrinsic. + case Intrinsic::memset: + return optimizeMemSet(CI, Builder); + case Intrinsic::memcpy: + return optimizeMemCpy(CI, Builder); + case Intrinsic::memmove: + return optimizeMemMove(CI, Builder); default: return nullptr; } @@ -2740,8 +3106,7 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) { IRBuilder<> TmpBuilder(SimplifiedCI); if (Value *V = optimizeStringMemoryLibCall(SimplifiedCI, TmpBuilder)) { // If we were able to further simplify, remove the now redundant call. 
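The new optimizeBCopy hook lowers bcopy(src, dst, n) to llvm.memmove; the only wrinkle is the argument order, since bcopy takes (source, destination) while memmove takes (destination, source). A tiny sketch of the equivalence:

#include <cstdio>
#include <cstring>

int main() {
  char src[] = "abc", dst[4];
  // bcopy(src, dst, sizeof(src));      // legacy interface: (source, dest, len)
  std::memmove(dst, src, sizeof(src));  // rewrite target: (dest, source, len)
  std::puts(dst);
  return 0;
}
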
- SimplifiedCI->replaceAllUsesWith(V); - eraseFromParent(SimplifiedCI); + substituteInParent(SimplifiedCI, V); return V; } } @@ -2898,7 +3263,9 @@ FortifiedLibCallSimplifier::isFortifiedCallFoldable(CallInst *CI, uint64_t Len = GetStringLength(CI->getArgOperand(*StrOp)); // If the length is 0 we don't know how long it is and so we can't // remove the check. - if (Len == 0) + if (Len) + annotateDereferenceableBytes(CI, *StrOp, Len); + else return false; return ObjSizeCI->getZExtValue() >= Len; } @@ -2915,8 +3282,10 @@ FortifiedLibCallSimplifier::isFortifiedCallFoldable(CallInst *CI, Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI, IRBuilder<> &B) { if (isFortifiedCallFoldable(CI, 3, 2)) { - B.CreateMemCpy(CI->getArgOperand(0), 1, CI->getArgOperand(1), 1, - CI->getArgOperand(2)); + CallInst *NewCI = B.CreateMemCpy(CI->getArgOperand(0), Align::None(), + CI->getArgOperand(1), Align::None(), + CI->getArgOperand(2)); + NewCI->setAttributes(CI->getAttributes()); return CI->getArgOperand(0); } return nullptr; @@ -2925,8 +3294,10 @@ Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI, Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI, IRBuilder<> &B) { if (isFortifiedCallFoldable(CI, 3, 2)) { - B.CreateMemMove(CI->getArgOperand(0), 1, CI->getArgOperand(1), 1, - CI->getArgOperand(2)); + CallInst *NewCI = B.CreateMemMove(CI->getArgOperand(0), Align::None(), + CI->getArgOperand(1), Align::None(), + CI->getArgOperand(2)); + NewCI->setAttributes(CI->getAttributes()); return CI->getArgOperand(0); } return nullptr; @@ -2938,7 +3309,9 @@ Value *FortifiedLibCallSimplifier::optimizeMemSetChk(CallInst *CI, if (isFortifiedCallFoldable(CI, 3, 2)) { Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false); - B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1); + CallInst *NewCI = B.CreateMemSet(CI->getArgOperand(0), Val, + CI->getArgOperand(2), Align::None()); + NewCI->setAttributes(CI->getAttributes()); return CI->getArgOperand(0); } return nullptr; @@ -2974,7 +3347,9 @@ Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI, // Maybe we can stil fold __st[rp]cpy_chk to __memcpy_chk. uint64_t Len = GetStringLength(Src); - if (Len == 0) + if (Len) + annotateDereferenceableBytes(CI, 1, Len); + else return nullptr; Type *SizeTTy = DL.getIntPtrType(CI->getContext()); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SizeOpts.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SizeOpts.cpp index 1519751197d2..d2a400027d4b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/SizeOpts.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SizeOpts.cpp @@ -10,28 +10,80 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Analysis/BlockFrequencyInfo.h" -#include "llvm/Analysis/ProfileSummaryInfo.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/SizeOpts.h" + using namespace llvm; -static cl::opt<bool> ProfileGuidedSizeOpt( +cl::opt<bool> EnablePGSO( "pgso", cl::Hidden, cl::init(true), - cl::desc("Enable the profile guided size optimization. ")); - -bool llvm::shouldOptimizeForSize(Function *F, ProfileSummaryInfo *PSI, - BlockFrequencyInfo *BFI) { - assert(F); - if (!PSI || !BFI || !PSI->hasProfileSummary()) - return false; - return ProfileGuidedSizeOpt && PSI->isFunctionColdInCallGraph(F, *BFI); + cl::desc("Enable the profile guided size optimizations. 
")); + +cl::opt<bool> PGSOLargeWorkingSetSizeOnly( + "pgso-lwss-only", cl::Hidden, cl::init(true), + cl::desc("Apply the profile guided size optimizations only " + "if the working set size is large (except for cold code.)")); + +cl::opt<bool> PGSOColdCodeOnly( + "pgso-cold-code-only", cl::Hidden, cl::init(true), + cl::desc("Apply the profile guided size optimizations only " + "to cold code.")); + +cl::opt<bool> PGSOIRPassOrTestOnly( + "pgso-ir-pass-or-test-only", cl::Hidden, cl::init(false), + cl::desc("Apply the profile guided size optimizations only" + "to the IR passes or tests.")); + +cl::opt<bool> ForcePGSO( + "force-pgso", cl::Hidden, cl::init(false), + cl::desc("Force the (profiled-guided) size optimizations. ")); + +cl::opt<int> PgsoCutoffInstrProf( + "pgso-cutoff-instr-prof", cl::Hidden, cl::init(250000), cl::ZeroOrMore, + cl::desc("The profile guided size optimization profile summary cutoff " + "for instrumentation profile.")); + +cl::opt<int> PgsoCutoffSampleProf( + "pgso-cutoff-sample-prof", cl::Hidden, cl::init(800000), cl::ZeroOrMore, + cl::desc("The profile guided size optimization profile summary cutoff " + "for sample profile.")); + +namespace { +struct BasicBlockBFIAdapter { + static bool isFunctionColdInCallGraph(const Function *F, + ProfileSummaryInfo *PSI, + BlockFrequencyInfo &BFI) { + return PSI->isFunctionColdInCallGraph(F, BFI); + } + static bool isFunctionHotInCallGraphNthPercentile(int CutOff, + const Function *F, + ProfileSummaryInfo *PSI, + BlockFrequencyInfo &BFI) { + return PSI->isFunctionHotInCallGraphNthPercentile(CutOff, F, BFI); + } + static bool isColdBlock(const BasicBlock *BB, + ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI) { + return PSI->isColdBlock(BB, BFI); + } + static bool isHotBlockNthPercentile(int CutOff, + const BasicBlock *BB, + ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI) { + return PSI->isHotBlockNthPercentile(CutOff, BB, BFI); + } +}; +} // end anonymous namespace + +bool llvm::shouldOptimizeForSize(const Function *F, ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI, + PGSOQueryType QueryType) { + return shouldFuncOptimizeForSizeImpl<BasicBlockBFIAdapter>(F, PSI, BFI, + QueryType); } -bool llvm::shouldOptimizeForSize(BasicBlock *BB, ProfileSummaryInfo *PSI, - BlockFrequencyInfo *BFI) { - assert(BB); - if (!PSI || !BFI || !PSI->hasProfileSummary()) - return false; - return ProfileGuidedSizeOpt && PSI->isColdBlock(BB, BFI); +bool llvm::shouldOptimizeForSize(const BasicBlock *BB, ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI, + PGSOQueryType QueryType) { + return shouldOptimizeForSizeImpl<BasicBlockBFIAdapter>(BB, PSI, BFI, + QueryType); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/StripGCRelocates.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/StripGCRelocates.cpp index 50844cf9d1c5..7880ea1c6c47 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/StripGCRelocates.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/StripGCRelocates.cpp @@ -18,6 +18,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Statepoint.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/raw_ostream.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp index 97a4533fabe5..21cbbfb140b6 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp +++ 
b/contrib/llvm-project/llvm/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/IR/DebugInfo.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Utils.h" using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SymbolRewriter.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SymbolRewriter.cpp index 456724779b43..aacf81d83519 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/SymbolRewriter.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SymbolRewriter.cpp @@ -69,6 +69,7 @@ #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Module.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -380,11 +381,11 @@ parseRewriteFunctionDescriptor(yaml::Stream &YS, yaml::ScalarNode *K, // TODO see if there is a more elegant solution to selecting the rewrite // descriptor type if (!Target.empty()) - DL->push_back(llvm::make_unique<ExplicitRewriteFunctionDescriptor>( + DL->push_back(std::make_unique<ExplicitRewriteFunctionDescriptor>( Source, Target, Naked)); else DL->push_back( - llvm::make_unique<PatternRewriteFunctionDescriptor>(Source, Transform)); + std::make_unique<PatternRewriteFunctionDescriptor>(Source, Transform)); return true; } @@ -442,11 +443,11 @@ parseRewriteGlobalVariableDescriptor(yaml::Stream &YS, yaml::ScalarNode *K, } if (!Target.empty()) - DL->push_back(llvm::make_unique<ExplicitRewriteGlobalVariableDescriptor>( + DL->push_back(std::make_unique<ExplicitRewriteGlobalVariableDescriptor>( Source, Target, /*Naked*/ false)); else - DL->push_back(llvm::make_unique<PatternRewriteGlobalVariableDescriptor>( + DL->push_back(std::make_unique<PatternRewriteGlobalVariableDescriptor>( Source, Transform)); return true; @@ -505,11 +506,11 @@ parseRewriteGlobalAliasDescriptor(yaml::Stream &YS, yaml::ScalarNode *K, } if (!Target.empty()) - DL->push_back(llvm::make_unique<ExplicitRewriteNamedAliasDescriptor>( + DL->push_back(std::make_unique<ExplicitRewriteNamedAliasDescriptor>( Source, Target, /*Naked*/ false)); else - DL->push_back(llvm::make_unique<PatternRewriteNamedAliasDescriptor>( + DL->push_back(std::make_unique<PatternRewriteNamedAliasDescriptor>( Source, Transform)); return true; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp index 7f7bdf8a3d6d..9af39d9a0dd1 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp @@ -18,10 +18,16 @@ #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Transforms/Utils.h" using namespace llvm; char UnifyFunctionExitNodes::ID = 0; + +UnifyFunctionExitNodes::UnifyFunctionExitNodes() : FunctionPass(ID) { + initializeUnifyFunctionExitNodesPass(*PassRegistry::getPassRegistry()); +} + INITIALIZE_PASS(UnifyFunctionExitNodes, "mergereturn", "Unify function exit nodes", false, false) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/Utils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/Utils.cpp index 5272ab6e95d5..7769c7493cda 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/Utils.cpp +++ 
b/contrib/llvm-project/llvm/lib/Transforms/Utils/Utils.cpp @@ -39,6 +39,7 @@ void llvm::initializeTransformUtils(PassRegistry &Registry) { initializeMetaRenamerPass(Registry); initializeStripGCRelocatesPass(Registry); initializePredicateInfoPrinterLegacyPassPass(Registry); + initializeInjectTLIMappingsLegacyPass(Registry); } /// LLVMInitializeTransformUtils - C binding for initializeTransformUtilsPasses. diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/VNCoercion.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/VNCoercion.cpp index a77bf50fe10b..591e1fd2dbee 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/VNCoercion.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/VNCoercion.cpp @@ -431,7 +431,7 @@ Value *getLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, Type *LoadTy, PtrVal = Builder.CreateBitCast(PtrVal, DestPTy); LoadInst *NewLoad = Builder.CreateLoad(DestTy, PtrVal); NewLoad->takeName(SrcVal); - NewLoad->setAlignment(SrcVal->getAlignment()); + NewLoad->setAlignment(MaybeAlign(SrcVal->getAlignment())); LLVM_DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n"); LLVM_DEBUG(dbgs() << "TO: " << *NewLoad << "\n"); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/ValueMapper.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/ValueMapper.cpp index fbc3407c301f..da68d3713b40 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/ValueMapper.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/ValueMapper.cpp @@ -27,8 +27,8 @@ #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" -#include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalObject.h" +#include "llvm/IR/GlobalIndirectSymbol.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instruction.h" @@ -66,7 +66,7 @@ struct WorklistEntry { enum EntryKind { MapGlobalInit, MapAppendingVar, - MapGlobalAliasee, + MapGlobalIndirectSymbol, RemapFunction }; struct GVInitTy { @@ -77,9 +77,9 @@ struct WorklistEntry { GlobalVariable *GV; Constant *InitPrefix; }; - struct GlobalAliaseeTy { - GlobalAlias *GA; - Constant *Aliasee; + struct GlobalIndirectSymbolTy { + GlobalIndirectSymbol *GIS; + Constant *Target; }; unsigned Kind : 2; @@ -89,7 +89,7 @@ struct WorklistEntry { union { GVInitTy GVInit; AppendingGVTy AppendingGV; - GlobalAliaseeTy GlobalAliasee; + GlobalIndirectSymbolTy GlobalIndirectSymbol; Function *RemapF; } Data; }; @@ -161,8 +161,8 @@ public: bool IsOldCtorDtor, ArrayRef<Constant *> NewMembers, unsigned MCID); - void scheduleMapGlobalAliasee(GlobalAlias &GA, Constant &Aliasee, - unsigned MCID); + void scheduleMapGlobalIndirectSymbol(GlobalIndirectSymbol &GIS, Constant &Target, + unsigned MCID); void scheduleRemapFunction(Function &F, unsigned MCID); void flush(); @@ -172,7 +172,7 @@ private: void mapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix, bool IsOldCtorDtor, ArrayRef<Constant *> NewMembers); - void mapGlobalAliasee(GlobalAlias &GA, Constant &Aliasee); + void mapGlobalIndirectSymbol(GlobalIndirectSymbol &GIS, Constant &Target); void remapFunction(Function &F, ValueToValueMapTy &VM); ValueToValueMapTy &getVM() { return *MCs[CurrentMCID].VM; } @@ -774,20 +774,6 @@ Metadata *MDNodeMapper::mapTopLevelUniquedNode(const MDNode &FirstN) { return *getMappedOp(&FirstN); } -namespace { - -struct MapMetadataDisabler { - ValueToValueMapTy &VM; - - MapMetadataDisabler(ValueToValueMapTy &VM) : VM(VM) { - VM.disableMapMetadata(); - } - - ~MapMetadataDisabler() { VM.enableMapMetadata(); } 
-}; - -} // end anonymous namespace - Optional<Metadata *> Mapper::mapSimpleMetadata(const Metadata *MD) { // If the value already exists in the map, use it. if (Optional<Metadata *> NewMD = getVM().getMappedMD(MD)) @@ -802,9 +788,6 @@ Optional<Metadata *> Mapper::mapSimpleMetadata(const Metadata *MD) { return const_cast<Metadata *>(MD); if (auto *CMD = dyn_cast<ConstantAsMetadata>(MD)) { - // Disallow recursion into metadata mapping through mapValue. - MapMetadataDisabler MMD(getVM()); - // Don't memoize ConstantAsMetadata. Instead of lasting until the // LLVMContext is destroyed, they can be deleted when the GlobalValue they // reference is destructed. These aren't super common, so the extra @@ -846,9 +829,9 @@ void Mapper::flush() { AppendingInits.resize(PrefixSize); break; } - case WorklistEntry::MapGlobalAliasee: - E.Data.GlobalAliasee.GA->setAliasee( - mapConstant(E.Data.GlobalAliasee.Aliasee)); + case WorklistEntry::MapGlobalIndirectSymbol: + E.Data.GlobalIndirectSymbol.GIS->setIndirectSymbol( + mapConstant(E.Data.GlobalIndirectSymbol.Target)); break; case WorklistEntry::RemapFunction: remapFunction(*E.Data.RemapF); @@ -1041,16 +1024,16 @@ void Mapper::scheduleMapAppendingVariable(GlobalVariable &GV, AppendingInits.append(NewMembers.begin(), NewMembers.end()); } -void Mapper::scheduleMapGlobalAliasee(GlobalAlias &GA, Constant &Aliasee, - unsigned MCID) { - assert(AlreadyScheduled.insert(&GA).second && "Should not reschedule"); +void Mapper::scheduleMapGlobalIndirectSymbol(GlobalIndirectSymbol &GIS, + Constant &Target, unsigned MCID) { + assert(AlreadyScheduled.insert(&GIS).second && "Should not reschedule"); assert(MCID < MCs.size() && "Invalid mapping context"); WorklistEntry WE; - WE.Kind = WorklistEntry::MapGlobalAliasee; + WE.Kind = WorklistEntry::MapGlobalIndirectSymbol; WE.MCID = MCID; - WE.Data.GlobalAliasee.GA = &GA; - WE.Data.GlobalAliasee.Aliasee = &Aliasee; + WE.Data.GlobalIndirectSymbol.GIS = &GIS; + WE.Data.GlobalIndirectSymbol.Target = &Target; Worklist.push_back(WE); } @@ -1147,9 +1130,10 @@ void ValueMapper::scheduleMapAppendingVariable(GlobalVariable &GV, GV, InitPrefix, IsOldCtorDtor, NewMembers, MCID); } -void ValueMapper::scheduleMapGlobalAliasee(GlobalAlias &GA, Constant &Aliasee, - unsigned MCID) { - getAsMapper(pImpl)->scheduleMapGlobalAliasee(GA, Aliasee, MCID); +void ValueMapper::scheduleMapGlobalIndirectSymbol(GlobalIndirectSymbol &GIS, + Constant &Target, + unsigned MCID) { + getAsMapper(pImpl)->scheduleMapGlobalIndirectSymbol(GIS, Target, MCID); } void ValueMapper::scheduleRemapFunction(Function &F, unsigned MCID) { diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 4273080ddd91..7478daa2a0a5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -38,6 +38,7 @@ // could use this pass (with some modifications), but currently it implements // its own pass to do something similar to what we do here. 
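The LoadStoreVectorizer hunk below also changes areConsecutivePointers to take PtrDelta by value: after stripAndAccumulateInBoundsConstantOffsets the pointers may have been rewritten to a narrower index width, so all three byte offsets are sign-extended or truncated to one common width before they are compared. A minimal standalone sketch of that normalization, assuming only llvm::APInt; the helper name deltaMatches and the widths are illustrative, not part of the patch:

#include "llvm/ADT/APInt.h"
using llvm::APInt;

// Hypothetical helper mirroring the sextOrTrunc normalization added to
// areConsecutivePointers: APInt arithmetic asserts on mixed bit widths, so
// every operand is brought to NewPtrBitWidth before the subtraction.
static bool deltaMatches(APInt OffsetA, APInt OffsetB, APInt PtrDelta,
                         unsigned NewPtrBitWidth) {
  OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
  OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
  PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth);
  return (OffsetB - OffsetA) == PtrDelta;
}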
+#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/MapVector.h" @@ -52,7 +53,6 @@ #include "llvm/Analysis/OrderedBasicBlock.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/Attributes.h" @@ -71,14 +71,15 @@ #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Vectorize.h" -#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h" #include <algorithm> #include <cassert> #include <cstdlib> @@ -147,7 +148,7 @@ private: static const unsigned MaxDepth = 3; bool isConsecutiveAccess(Value *A, Value *B); - bool areConsecutivePointers(Value *PtrA, Value *PtrB, const APInt &PtrDelta, + bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt PtrDelta, unsigned Depth = 0) const; bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta, unsigned Depth) const; @@ -336,14 +337,29 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) { } bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB, - const APInt &PtrDelta, - unsigned Depth) const { + APInt PtrDelta, unsigned Depth) const { unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType()); APInt OffsetA(PtrBitWidth, 0); APInt OffsetB(PtrBitWidth, 0); PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); + unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType()); + + if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType())) + return false; + + // In case if we have to shrink the pointer + // stripAndAccumulateInBoundsConstantOffsets should properly handle a + // possible overflow and the value should fit into a smallest data type + // used in the cast/gep chain. + assert(OffsetA.getMinSignedBits() <= NewPtrBitWidth && + OffsetB.getMinSignedBits() <= NewPtrBitWidth); + + OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth); + OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth); + PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth); + APInt OffsetDelta = OffsetB - OffsetA; // Check if they are based on the same pointer. That makes the offsets @@ -650,7 +666,7 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) { // We can ignore the alias if the we have a load store pair and the load // is known to be invariant. The load cannot be clobbered by the store. auto IsInvariantLoad = [](const LoadInst *LI) -> bool { - return LI->getMetadata(LLVMContext::MD_invariant_load); + return LI->hasMetadata(LLVMContext::MD_invariant_load); }; // We can ignore the alias as long as the load comes before the store, @@ -1077,7 +1093,7 @@ bool Vectorizer::vectorizeLoadChain( LoadInst *L0 = cast<LoadInst>(Chain[0]); // If the vector has an int element, default to int for the whole load. 
- Type *LoadTy; + Type *LoadTy = nullptr; for (const auto &V : Chain) { LoadTy = cast<LoadInst>(V)->getType(); if (LoadTy->isIntOrIntVectorTy()) @@ -1089,6 +1105,7 @@ bool Vectorizer::vectorizeLoadChain( break; } } + assert(LoadTy && "Can't determine LoadInst type from chain"); unsigned Sz = DL.getTypeSizeInBits(LoadTy); unsigned AS = L0->getPointerAddressSpace(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 138f18e49c92..3f943f4c0688 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -13,7 +13,10 @@ // pass. It should be easy to create an analysis pass around it if there // is a need (but D45420 needs to happen first). // +#include "llvm/Transforms/Vectorize/LoopVectorize.h" #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" +#include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/IntrinsicInst.h" @@ -47,38 +50,6 @@ static const unsigned MaxInterleaveFactor = 16; namespace llvm { -#ifndef NDEBUG -static void debugVectorizationFailure(const StringRef DebugMsg, - Instruction *I) { - dbgs() << "LV: Not vectorizing: " << DebugMsg; - if (I != nullptr) - dbgs() << " " << *I; - else - dbgs() << '.'; - dbgs() << '\n'; -} -#endif - -OptimizationRemarkAnalysis createLVMissedAnalysis(const char *PassName, - StringRef RemarkName, - Loop *TheLoop, - Instruction *I) { - Value *CodeRegion = TheLoop->getHeader(); - DebugLoc DL = TheLoop->getStartLoc(); - - if (I) { - CodeRegion = I->getParent(); - // If there is no debug location attached to the instruction, revert back to - // using the loop's. - if (I->getDebugLoc()) - DL = I->getDebugLoc(); - } - - OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion); - R << "loop not vectorized: "; - return R; -} - bool LoopVectorizeHints::Hint::validate(unsigned Val) { switch (Kind) { case HK_WIDTH: @@ -88,6 +59,7 @@ bool LoopVectorizeHints::Hint::validate(unsigned Val) { case HK_FORCE: return (Val <= 1); case HK_ISVECTORIZED: + case HK_PREDICATE: return (Val == 0 || Val == 1); } return false; @@ -99,7 +71,9 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L, : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH), Interleave("interleave.count", InterleaveOnlyWhenForced, HK_UNROLL), Force("vectorize.enable", FK_Undefined, HK_FORCE), - IsVectorized("isvectorized", 0, HK_ISVECTORIZED), TheLoop(L), ORE(ORE) { + IsVectorized("isvectorized", 0, HK_ISVECTORIZED), + Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE), TheLoop(L), + ORE(ORE) { // Populate values with existing loop metadata. getHintsFromMetadata(); @@ -250,7 +224,7 @@ void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) { return; unsigned Val = C->getZExtValue(); - Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized}; + Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized, &Predicate}; for (auto H : Hints) { if (Name == H->Name) { if (H->validate(Val)) @@ -435,7 +409,8 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { const ValueToValueMap &Strides = getSymbolicStrides() ? 
*getSymbolicStrides() : ValueToValueMap(); - int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, false); + bool CanAddPredicate = !TheLoop->getHeader()->getParent()->hasOptSize(); + int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, CanAddPredicate, false); if (Stride == 1 || Stride == -1) return Stride; return 0; @@ -445,14 +420,6 @@ bool LoopVectorizationLegality::isUniform(Value *V) { return LAI->isUniform(V); } -void LoopVectorizationLegality::reportVectorizationFailure( - const StringRef DebugMsg, const StringRef OREMsg, - const StringRef ORETag, Instruction *I) const { - LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I)); - ORE->emit(createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(), - ORETag, TheLoop, I) << OREMsg); -} - bool LoopVectorizationLegality::canVectorizeOuterLoop() { assert(!TheLoop->empty() && "We are not vectorizing an outer loop."); // Store the result and return it at the end instead of exiting early, in case @@ -467,7 +434,7 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() { if (!Br) { reportVectorizationFailure("Unsupported basic block terminator", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -486,7 +453,7 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() { !LI->isLoopHeader(Br->getSuccessor(1))) { reportVectorizationFailure("Unsupported conditional branch", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -500,7 +467,7 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() { TheLoop /*context outer loop*/)) { reportVectorizationFailure("Outer loop contains divergent loops", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -511,7 +478,7 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() { if (!setupOuterLoopInductions()) { reportVectorizationFailure("Unsupported outer loop Phi(s)", "Unsupported outer loop Phi(s)", - "UnsupportedPhi"); + "UnsupportedPhi", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -618,7 +585,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { !PhiTy->isPointerTy()) { reportVectorizationFailure("Found a non-int non-pointer PHI", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); return false; } @@ -639,7 +606,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (Phi->getNumIncomingValues() != 2) { reportVectorizationFailure("Found an invalid PHI", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood", Phi); + "CFGNotUnderstood", ORE, TheLoop, Phi); return false; } @@ -691,7 +658,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { reportVectorizationFailure("Found an unidentified PHI", "value that could not be identified as " "reduction is used outside the loop", - "NonReductionValueUsedOutsideLoop", Phi); + "NonReductionValueUsedOutsideLoop", ORE, TheLoop, Phi); return false; } // end of PHI handling @@ -722,11 +689,11 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { "library call cannot be vectorized. 
" "Try compiling with -fno-math-errno, -ffast-math, " "or similar flags", - "CantVectorizeLibcall", CI); + "CantVectorizeLibcall", ORE, TheLoop, CI); } else { reportVectorizationFailure("Found a non-intrinsic callsite", "call instruction cannot be vectorized", - "CantVectorizeLibcall", CI); + "CantVectorizeLibcall", ORE, TheLoop, CI); } return false; } @@ -741,7 +708,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(i)), TheLoop)) { reportVectorizationFailure("Found unvectorizable intrinsic", "intrinsic instruction cannot be vectorized", - "CantVectorizeIntrinsic", CI); + "CantVectorizeIntrinsic", ORE, TheLoop, CI); return false; } } @@ -754,7 +721,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { isa<ExtractElementInst>(I)) { reportVectorizationFailure("Found unvectorizable type", "instruction return type cannot be vectorized", - "CantVectorizeInstructionReturnType", &I); + "CantVectorizeInstructionReturnType", ORE, TheLoop, &I); return false; } @@ -764,7 +731,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (!VectorType::isValidElementType(T)) { reportVectorizationFailure("Store instruction cannot be vectorized", "store instruction cannot be vectorized", - "CantVectorizeStore", ST); + "CantVectorizeStore", ORE, TheLoop, ST); return false; } @@ -774,12 +741,13 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Arbitrarily try a vector of 2 elements. Type *VecTy = VectorType::get(T, /*NumElements=*/2); assert(VecTy && "did not find vectorized version of stored type"); - unsigned Alignment = getLoadStoreAlignment(ST); - if (!TTI->isLegalNTStore(VecTy, Alignment)) { + const MaybeAlign Alignment = getLoadStoreAlignment(ST); + assert(Alignment && "Alignment should be set"); + if (!TTI->isLegalNTStore(VecTy, *Alignment)) { reportVectorizationFailure( "nontemporal store instruction cannot be vectorized", "nontemporal store instruction cannot be vectorized", - "CantVectorizeNontemporalStore", ST); + "CantVectorizeNontemporalStore", ORE, TheLoop, ST); return false; } } @@ -790,12 +758,13 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // supported on the target (arbitrarily try a vector of 2 elements). Type *VecTy = VectorType::get(I.getType(), /*NumElements=*/2); assert(VecTy && "did not find vectorized version of load type"); - unsigned Alignment = getLoadStoreAlignment(LD); - if (!TTI->isLegalNTLoad(VecTy, Alignment)) { + const MaybeAlign Alignment = getLoadStoreAlignment(LD); + assert(Alignment && "Alignment should be set"); + if (!TTI->isLegalNTLoad(VecTy, *Alignment)) { reportVectorizationFailure( "nontemporal load instruction cannot be vectorized", "nontemporal load instruction cannot be vectorized", - "CantVectorizeNontemporalLoad", LD); + "CantVectorizeNontemporalLoad", ORE, TheLoop, LD); return false; } } @@ -824,7 +793,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { } reportVectorizationFailure("Value cannot be used outside the loop", "value cannot be used outside the loop", - "ValueUsedOutsideLoop", &I); + "ValueUsedOutsideLoop", ORE, TheLoop, &I); return false; } } // next instr. 
@@ -834,18 +803,30 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (Inductions.empty()) { reportVectorizationFailure("Did not find one integer induction var", "loop induction variable could not be identified", - "NoInductionVariable"); + "NoInductionVariable", ORE, TheLoop); return false; } else if (!WidestIndTy) { reportVectorizationFailure("Did not find one integer induction var", "integer loop induction variable could not be identified", - "NoIntegerInductionVariable"); + "NoIntegerInductionVariable", ORE, TheLoop); return false; } else { LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); } } + // For first order recurrences, we use the previous value (incoming value from + // the latch) to check if it dominates all users of the recurrence. Bail out + // if we have to sink such an instruction for another recurrence, as the + // dominance requirement may not hold after sinking. + BasicBlock *LoopLatch = TheLoop->getLoopLatch(); + if (any_of(FirstOrderRecurrences, [LoopLatch, this](const PHINode *Phi) { + Instruction *V = + cast<Instruction>(Phi->getIncomingValueForBlock(LoopLatch)); + return SinkAfter.find(V) != SinkAfter.end(); + })) + return false; + // Now we know the widest induction type, check if our found induction // is the same size. If it's not, unset it here and InnerLoopVectorizer // will create another. @@ -870,7 +851,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { if (LAI->hasDependenceInvolvingLoopInvariantAddress()) { reportVectorizationFailure("Stores to a uniform address", "write to a loop invariant address could not be vectorized", - "CantVectorizeStoreToLoopInvariantAddress"); + "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop); return false; } Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks()); @@ -906,7 +887,7 @@ bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { } bool LoopVectorizationLegality::blockCanBePredicated( - BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs) { + BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs, bool PreserveGuards) { const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); for (Instruction &I : *BB) { @@ -925,7 +906,7 @@ bool LoopVectorizationLegality::blockCanBePredicated( // !llvm.mem.parallel_loop_access implies if-conversion safety. // Otherwise, record that the load needs (real or emulated) masking // and let the cost model decide. - if (!IsAnnotatedParallel) + if (!IsAnnotatedParallel || PreserveGuards) MaskedOp.insert(LI); continue; } @@ -954,23 +935,41 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { if (!EnableIfConversion) { reportVectorizationFailure("If-conversion is disabled", "if-conversion is disabled", - "IfConversionDisabled"); + "IfConversionDisabled", + ORE, TheLoop); return false; } assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable"); - // A list of pointers that we can safely read and write to. + // A list of pointers which are known to be dereferenceable within scope of + // the loop body for each iteration of the loop which executes. That is, + // the memory pointed to can be dereferenced (with the access size implied by + // the value's type) unconditionally within the loop header without + // introducing a new fault. SmallPtrSet<Value *, 8> SafePointes; // Collect safe addresses. 
for (BasicBlock *BB : TheLoop->blocks()) { - if (blockNeedsPredication(BB)) + if (!blockNeedsPredication(BB)) { + for (Instruction &I : *BB) + if (auto *Ptr = getLoadStorePointerOperand(&I)) + SafePointes.insert(Ptr); continue; + } - for (Instruction &I : *BB) - if (auto *Ptr = getLoadStorePointerOperand(&I)) - SafePointes.insert(Ptr); + // For a block which requires predication, an address may be safe to access + // in the loop w/o predication if we can prove dereferenceability facts + // sufficient to ensure it'll never fault within the loop. For the moment, + // we restrict this to loads; stores are more complicated due to + // concurrency restrictions. + ScalarEvolution &SE = *PSE.getSE(); + for (Instruction &I : *BB) { + LoadInst *LI = dyn_cast<LoadInst>(&I); + if (LI && !mustSuppressSpeculation(*LI) && + isDereferenceableAndAlignedInLoop(LI, TheLoop, SE, *DT)) + SafePointes.insert(LI->getPointerOperand()); + } } // Collect the blocks that need predication. @@ -980,7 +979,8 @@ if (!isa<BranchInst>(BB->getTerminator())) { reportVectorizationFailure("Loop contains a switch statement", "loop contains a switch statement", - "LoopContainsSwitch", BB->getTerminator()); + "LoopContainsSwitch", ORE, TheLoop, + BB->getTerminator()); return false; } @@ -990,14 +990,16 @@ reportVectorizationFailure( "Control flow cannot be substituted for a select", "control flow cannot be substituted for a select", - "NoCFGForSelect", BB->getTerminator()); + "NoCFGForSelect", ORE, TheLoop, + BB->getTerminator()); return false; } } else if (BB != Header && !canIfConvertPHINodes(BB)) { reportVectorizationFailure( "Control flow cannot be substituted for a select", "control flow cannot be substituted for a select", - "NoCFGForSelect", BB->getTerminator()); + "NoCFGForSelect", ORE, TheLoop, + BB->getTerminator()); return false; } } @@ -1027,7 +1029,7 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp, if (!Lp->getLoopPreheader()) { reportVectorizationFailure("Loop doesn't have a legal pre-header", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -1038,7 +1040,7 @@ if (Lp->getNumBackEdges() != 1) { reportVectorizationFailure("The loop must have a single backedge", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -1049,7 +1051,7 @@ if (!Lp->getExitingBlock()) { reportVectorizationFailure("The loop must have an exiting block", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -1062,7 +1064,7 @@ if (Lp->getExitingBlock() != Lp->getLoopLatch()) { reportVectorizationFailure("The exiting block is not the loop latch", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -1125,7 +1127,8 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { if (!canVectorizeOuterLoop()) { reportVectorizationFailure("Unsupported outer loop", "unsupported outer loop", - 
"UnsupportedOuterLoop"); + "UnsupportedOuterLoop", + ORE, TheLoop); // TODO: Implement DoExtraAnalysis when subsequent legal checks support // outer loops. return false; @@ -1177,7 +1180,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) { reportVectorizationFailure("Too many SCEV checks needed", "Too many SCEV assumptions need to be made and checked at runtime", - "TooManySCEVRunTimeChecks"); + "TooManySCEVRunTimeChecks", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -1191,7 +1194,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { return Result; } -bool LoopVectorizationLegality::canFoldTailByMasking() { +bool LoopVectorizationLegality::prepareToFoldTailByMasking() { LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n"); @@ -1200,22 +1203,21 @@ bool LoopVectorizationLegality::canFoldTailByMasking() { "No primary induction, cannot fold tail by masking", "Missing a primary induction variable in the loop, which is " "needed in order to fold tail by masking as required.", - "NoPrimaryInduction"); + "NoPrimaryInduction", ORE, TheLoop); return false; } - // TODO: handle reductions when tail is folded by masking. - if (!Reductions.empty()) { - reportVectorizationFailure( - "Loop has reductions, cannot fold tail by masking", - "Cannot fold tail by masking in the presence of reductions.", - "ReductionFoldingTailByMasking"); - return false; - } + SmallPtrSet<const Value *, 8> ReductionLiveOuts; - // TODO: handle outside users when tail is folded by masking. + for (auto &Reduction : *getReductionVars()) + ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr()); + + // TODO: handle non-reduction outside users when tail is folded by masking. for (auto *AE : AllowedExit) { - // Check that all users of allowed exit values are inside the loop. + // Check that all users of allowed exit values are inside the loop or + // are the live-out of a reduction. + if (ReductionLiveOuts.count(AE)) + continue; for (User *U : AE->users()) { Instruction *UI = cast<Instruction>(U); if (TheLoop->contains(UI)) @@ -1223,7 +1225,7 @@ bool LoopVectorizationLegality::canFoldTailByMasking() { reportVectorizationFailure( "Cannot fold tail by masking, loop has an outside user for", "Cannot fold tail by masking in the presence of live outs.", - "LiveOutFoldingTailByMasking", UI); + "LiveOutFoldingTailByMasking", ORE, TheLoop, UI); return false; } } @@ -1234,11 +1236,12 @@ bool LoopVectorizationLegality::canFoldTailByMasking() { // Check and mark all blocks for predication, including those that ordinarily // do not need predication such as the header block. 
for (BasicBlock *BB : TheLoop->blocks()) { - if (!blockCanBePredicated(BB, SafePointers)) { + if (!blockCanBePredicated(BB, SafePointers, /* PreserveGuards= */ true)) { reportVectorizationFailure( "Cannot fold tail by masking as required", "control flow cannot be substituted for a select", - "NoCFGForSelect", BB->getTerminator()); + "NoCFGForSelect", ORE, TheLoop, + BB->getTerminator()); return false; } } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 97077cce83e3..e5edd305d3d5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -201,6 +201,9 @@ class LoopVectorizationPlanner { /// The profitability analysis. LoopVectorizationCostModel &CM; + /// The interleaved access analysis. + InterleavedAccessInfo &IAI; + SmallVector<VPlanPtr, 4> VPlans; /// This class is used to enable the VPlan to invoke a method of ILV. This is @@ -211,6 +214,8 @@ class LoopVectorizationPlanner { VPCallbackILV(InnerLoopVectorizer &ILV) : ILV(ILV) {} Value *getOrCreateVectorValues(Value *V, unsigned Part) override; + Value *getOrCreateScalarValue(Value *V, + const VPIteration &Instance) override; }; /// A builder used to construct the current plan. @@ -223,16 +228,18 @@ public: LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, LoopVectorizationLegality *Legal, - LoopVectorizationCostModel &CM) - : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM) {} + LoopVectorizationCostModel &CM, + InterleavedAccessInfo &IAI) + : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), + IAI(IAI) {} /// Plan how to best vectorize, return the best VF and its cost, or None if /// vectorization and interleaving should be avoided up front. - Optional<VectorizationFactor> plan(bool OptForSize, unsigned UserVF); + Optional<VectorizationFactor> plan(unsigned UserVF); /// Use the VPlan-native path to plan how to best vectorize, return the best /// VF and its cost. - VectorizationFactor planInVPlanNativePath(bool OptForSize, unsigned UserVF); + VectorizationFactor planInVPlanNativePath(unsigned UserVF); /// Finalize the best decision and dispose of all other VPlans. void setBestPlan(unsigned VF, unsigned UF); @@ -272,9 +279,10 @@ private: /// Build a VPlan using VPRecipes according to the information gathered by /// Legal. This method is only used for the legacy inner loop vectorizer. 
- VPlanPtr - buildVPlanWithVPRecipes(VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, - SmallPtrSetImpl<Instruction *> &DeadInstructions); + VPlanPtr buildVPlanWithVPRecipes( + VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, + SmallPtrSetImpl<Instruction *> &DeadInstructions, + const DenseMap<Instruction *, Instruction *> &SinkAfter); /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 46265e3f3e13..ebfd5fe8b762 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -58,8 +58,8 @@ #include "VPRecipeBuilder.h" #include "VPlan.h" #include "VPlanHCFGBuilder.h" -#include "VPlanHCFGTransforms.h" #include "VPlanPredicator.h" +#include "VPlanTransforms.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" @@ -124,6 +124,7 @@ #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -149,7 +150,6 @@ #include <string> #include <tuple> #include <utility> -#include <vector> using namespace llvm; @@ -177,6 +177,14 @@ static cl::opt<unsigned> TinyTripCountVectorThreshold( "value are vectorized only if no scalar iteration overheads " "are incurred.")); +// Indicates that an epilogue is undesired; predication is preferred. +// This means that the vectorizer will try to fold the loop-tail (epilogue) +// into the loop and predicate the loop body accordingly. +static cl::opt<bool> PreferPredicateOverEpilog( + "prefer-predicate-over-epilog", cl::init(false), cl::Hidden, + cl::desc("Indicate that an epilogue is undesired; predication should be " + "used instead.")); + static cl::opt<bool> MaximizeBandwidth( + "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " @@ -192,9 +200,10 @@ static cl::opt<bool> EnableMaskedInterleavedMemAccesses( "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); -/// We don't interleave loops with a known constant trip count below this -/// number. -static const unsigned TinyTripCountInterleaveThreshold = 128; +static cl::opt<unsigned> TinyTripCountInterleaveThreshold( + "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden, + cl::desc("We don't interleave loops with an estimated constant trip count " + "below this number")); static cl::opt<unsigned> ForceTargetNumScalarRegs( "force-target-num-scalar-regs", cl::init(0), cl::Hidden, @@ -347,6 +356,29 @@ static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { : ConstantFP::get(Ty, C); } +/// Returns the "best known" trip count for the specified loop \p L as defined by +/// the following procedure: +/// 1) Returns exact trip count if it is known. +/// 2) Returns expected trip count according to profile data if any. +/// 3) Returns upper bound estimate if it is known. +/// 4) Returns None if all of the above failed. +static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) { + // Check if exact trip count is known. 
+ if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L)) + return ExpectedTC; + + // Check if there is an expected trip count available from profile data. + if (LoopVectorizeWithBlockFrequency) + if (auto EstimatedTC = getLoopEstimatedTripCount(L)) + return EstimatedTC; + + // Check if upper bound estimate is known. + if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L)) + return ExpectedTC; + + return None; +} + namespace llvm { /// InnerLoopVectorizer vectorizes loops which contain only one basic @@ -396,6 +428,11 @@ public: /// new unrolled loop, where UF is the unroll factor. using VectorParts = SmallVector<Value *, 2>; + /// Vectorize a single GetElementPtrInst based on information gathered and + /// decisions taken during planning. + void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF, + bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant); + /// Vectorize a single PHINode in a block. This method handles the induction /// variable canonicalization. It supports both VF = 1 for unrolled loops and /// arbitrary length vectors. @@ -445,15 +482,20 @@ public: /// Construct the vector value of a scalarized value \p V one lane at a time. void packScalarIntoVectorValue(Value *V, const VPIteration &Instance); - /// Try to vectorize the interleaved access group that \p Instr belongs to, - /// optionally masking the vector operations if \p BlockInMask is non-null. - void vectorizeInterleaveGroup(Instruction *Instr, - VectorParts *BlockInMask = nullptr); - - /// Vectorize Load and Store instructions, optionally masking the vector - /// operations if \p BlockInMask is non-null. - void vectorizeMemoryInstruction(Instruction *Instr, - VectorParts *BlockInMask = nullptr); + /// Try to vectorize the interleaved access group that \p Instr belongs to + /// with the base address given in \p Addr, optionally masking the vector + /// operations if \p BlockInMask is non-null. Use \p State to translate given + /// VPValues to IR values in the vectorized loop. + void vectorizeInterleaveGroup(Instruction *Instr, VPTransformState &State, + VPValue *Addr, VPValue *BlockInMask = nullptr); + + /// Vectorize Load and Store instructions with the base address given in \p + /// Addr, optionally masking the vector operations if \p BlockInMask is + /// non-null. Use \p State to translate given VPValues to IR values in the + /// vectorized loop. + void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, + VPValue *Addr, + VPValue *BlockInMask = nullptr); /// Set the debug location in the builder using the debug location in /// the instruction. @@ -494,6 +536,9 @@ protected: /// vectorizing this phi node. void fixReduction(PHINode *Phi); + /// Clear NSW/NUW flags from reduction instructions if necessary. + void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc); + /// The Loop exit block may have single value PHI nodes with some /// incoming value. While vectorizing we only handled real values /// that were defined inside the loop and we should have one value for @@ -508,10 +553,6 @@ protected: /// represented as. void truncateToMinimalBitwidths(); - /// Insert the new loop to the loop hierarchy and pass manager - /// and update the analysis passes. - void updateAnalysis(); - /// Create a broadcast instruction. This method generates a broadcast /// instruction (shuffle) for loop invariant values and for the induction /// value. If this is the induction variable then we extend it to N, N+1, ... 
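The getSmallBestKnownTC helper above encodes a strict preference order between trip-count sources. A self-contained toy analogue of that fallback chain, using std::optional in place of llvm::Optional and plain integers in place of the ScalarEvolution and profile queries (all names here are illustrative, not LLVM API):

#include <optional>

// Each source reports 0 / std::nullopt when it knows nothing; the first
// source with an answer wins, matching steps 1-4 of the comment above.
std::optional<unsigned> bestKnownTripCount(unsigned ExactTC,
                                           std::optional<unsigned> ProfileTC,
                                           unsigned MaxTC) {
  if (ExactTC)         // 1) exact trip count
    return ExactTC;
  if (ProfileTC)       // 2) expected trip count from profile data
    return ProfileTC;
  if (MaxTC)           // 3) known upper-bound estimate
    return MaxTC;
  return std::nullopt; // 4) nothing known
}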
@@ -795,6 +836,59 @@ void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) B.SetCurrentDebugLocation(DebugLoc()); } +/// Write a record \p DebugMsg about vectorization failure to the debug +/// output stream. If \p I is passed, it is an instruction that prevents +/// vectorization. +#ifndef NDEBUG +static void debugVectorizationFailure(const StringRef DebugMsg, + Instruction *I) { + dbgs() << "LV: Not vectorizing: " << DebugMsg; + if (I != nullptr) + dbgs() << " " << *I; + else + dbgs() << '.'; + dbgs() << '\n'; +} +#endif + +/// Create an analysis remark that explains why vectorization failed +/// +/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p +/// RemarkName is the identifier for the remark. If \p I is passed it is an +/// instruction that prevents vectorization. Otherwise \p TheLoop is used for +/// the location of the remark. \return the remark object that can be +/// streamed to. +static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, + StringRef RemarkName, Loop *TheLoop, Instruction *I) { + Value *CodeRegion = TheLoop->getHeader(); + DebugLoc DL = TheLoop->getStartLoc(); + + if (I) { + CodeRegion = I->getParent(); + // If there is no debug location attached to the instruction, revert to + // using the loop's. + if (I->getDebugLoc()) + DL = I->getDebugLoc(); + } + + OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion); + R << "loop not vectorized: "; + return R; +} + +namespace llvm { + +void reportVectorizationFailure(const StringRef DebugMsg, + const StringRef OREMsg, const StringRef ORETag, + OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) { + LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I)); + LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); + ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), + ORETag, TheLoop, I) << OREMsg); +} + +} // end namespace llvm + #ifndef NDEBUG /// \return string containing a file name and a line # for the given loop. static std::string getDebugLocString(const Loop *L) { @@ -836,6 +930,26 @@ void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, namespace llvm { +// Loop vectorization cost-model hints how the scalar epilogue loop should be +// lowered. +enum ScalarEpilogueLowering { + + // The default: allowing scalar epilogues. + CM_ScalarEpilogueAllowed, + + // Vectorization with OptForSize: don't allow epilogues. + CM_ScalarEpilogueNotAllowedOptSize, + + // A special case of vectorization with OptForSize: loops with a very small + // trip count are considered for vectorization under OptForSize, thereby + // making sure the cost of their loop body is dominant, free of runtime + // guards and scalar iteration overheads. + CM_ScalarEpilogueNotAllowedLowTripLoop, + + // Loop hint predicate indicating an epilogue is undesired. + CM_ScalarEpilogueNotNeededUsePredicate +}; + /// LoopVectorizationCostModel - estimates the expected speedups due to /// vectorization. /// In many cases vectorization is not profitable. This can happen because of @@ -845,20 +959,26 @@ namespace llvm { /// different operations. 
class LoopVectorizationCostModel { public: - LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE, - LoopInfo *LI, LoopVectorizationLegality *Legal, + LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, + PredicatedScalarEvolution &PSE, LoopInfo *LI, + LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI) - : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), - AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {} + : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), + TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), + Hints(Hints), InterleaveInfo(IAI) {} /// \return An upper bound for the vectorization factor, or None if /// vectorization and interleaving should be avoided up front. - Optional<unsigned> computeMaxVF(bool OptForSize); + Optional<unsigned> computeMaxVF(); + + /// \return True if runtime checks are required for vectorization, and false + /// otherwise. + bool runtimeChecksRequired(); /// \return The most profitable vectorization factor and the cost of that VF. /// This method checks every power of two up to MaxVF. If UserVF is not ZERO @@ -881,8 +1001,7 @@ public: /// If interleave count has been specified by metadata it will be returned. /// Otherwise, the interleave count is computed and returned. VF and LoopCost /// are the selected vectorization factor and the cost of the selected VF. - unsigned selectInterleaveCount(bool OptForSize, unsigned VF, - unsigned LoopCost); + unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost); /// Memory access instruction may be vectorized in more than one way. /// Form of instruction after vectorization depends on cost. @@ -897,10 +1016,11 @@ public: /// of a loop. struct RegisterUsage { /// Holds the number of loop invariant values that are used in the loop. - unsigned LoopInvariantRegs; - + /// The key is ClassID of target-provided register class. + SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; /// Holds the maximum number of concurrent live intervals in the loop. - unsigned MaxLocalUsers; + /// The key is ClassID of target-provided register class. + SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; }; /// \return Returns information about the register usages of the loop for the @@ -1080,26 +1200,28 @@ public: /// Returns true if the target machine supports masked store operation /// for the given \p DataType and kind of access to \p Ptr. - bool isLegalMaskedStore(Type *DataType, Value *Ptr) { - return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType); + bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) { + return Legal->isConsecutivePtr(Ptr) && + TTI.isLegalMaskedStore(DataType, Alignment); } /// Returns true if the target machine supports masked load operation /// for the given \p DataType and kind of access to \p Ptr. - bool isLegalMaskedLoad(Type *DataType, Value *Ptr) { - return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType); + bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) { + return Legal->isConsecutivePtr(Ptr) && + TTI.isLegalMaskedLoad(DataType, Alignment); } /// Returns true if the target machine supports masked scatter operation /// for the given \p DataType. 
- bool isLegalMaskedScatter(Type *DataType) { - return TTI.isLegalMaskedScatter(DataType); + bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) { + return TTI.isLegalMaskedScatter(DataType, Alignment); } /// Returns true if the target machine supports masked gather operation /// for the given \p DataType. - bool isLegalMaskedGather(Type *DataType) { - return TTI.isLegalMaskedGather(DataType); + bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) { + return TTI.isLegalMaskedGather(DataType, Alignment); } /// Returns true if the target machine can represent \p V as a masked gather @@ -1110,7 +1232,9 @@ public: if (!LI && !SI) return false; auto *Ty = getMemInstValueType(V); - return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty)); + MaybeAlign Align = getLoadStoreAlignment(V); + return (LI && isLegalMaskedGather(Ty, Align)) || + (SI && isLegalMaskedScatter(Ty, Align)); } /// Returns true if \p I is an instruction that will be scalarized with @@ -1157,11 +1281,14 @@ public: /// to handle accesses with gaps, and there is nothing preventing us from /// creating a scalar epilogue. bool requiresScalarEpilogue() const { - return IsScalarEpilogueAllowed && InterleaveInfo.requiresScalarEpilogue(); + return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue(); } - /// Returns true if a scalar epilogue is not allowed due to optsize. - bool isScalarEpilogueAllowed() const { return IsScalarEpilogueAllowed; } + /// Returns true if a scalar epilogue is not allowed due to optsize or a + /// loop hint annotation. + bool isScalarEpilogueAllowed() const { + return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; + } /// Returns true if all loop blocks should be masked to fold tail loop. bool foldTailByMasking() const { return FoldTailByMasking; } @@ -1187,7 +1314,7 @@ private: /// \return An upper bound for the vectorization factor, larger than zero. /// One is returned if vectorization should best be avoided due to cost. - unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount); + unsigned computeFeasibleMaxVF(unsigned ConstTripCount); /// The vectorization cost is a combination of the cost itself and a boolean /// indicating whether any of the contributing operations will actually @@ -1246,15 +1373,6 @@ private: /// should be used. bool useEmulatedMaskMemRefHack(Instruction *I); - /// Create an analysis remark that explains why vectorization failed - /// - /// \p RemarkName is the identifier for the remark. \return the remark object - /// that can be streamed to. - OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) { - return createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(), - RemarkName, TheLoop); - } - /// Map of scalar integer values to the smallest bitwidth they can be legally /// represented as. The vector equivalents of these values should be truncated /// to this type. @@ -1270,13 +1388,13 @@ private: SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization; /// Records whether it is allowed to have the original scalar loop execute at - /// least once. This may be needed as a fallback loop in case runtime + /// least once. This may be needed as a fallback loop in case runtime /// aliasing/dependence checks fail, or to handle the tail/remainder /// iterations when the trip count is unknown or doesn't divide by the VF, /// or as a peel-loop to handle gaps in interleave-groups. /// Under optsize and when the trip count is very small we don't allow any /// iterations to execute in the scalar loop. 
- bool IsScalarEpilogueAllowed = true; + ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; /// All blocks of loop are to be masked to fold tail of scalar iterations. bool FoldTailByMasking = false; @@ -1496,7 +1614,7 @@ struct LoopVectorize : public FunctionPass { auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); - auto *TLI = TLIP ? &TLIP->getTLI() : nullptr; + auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); @@ -2049,7 +2167,9 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, - VectorParts *BlockInMask) { + VPTransformState &State, + VPValue *Addr, + VPValue *BlockInMask) { const InterleaveGroup<Instruction> *Group = Cost->getInterleavedAccessGroup(Instr); assert(Group && "Fail to get an interleaved access group."); @@ -2059,27 +2179,19 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, return; const DataLayout &DL = Instr->getModule()->getDataLayout(); - Value *Ptr = getLoadStorePointerOperand(Instr); // Prepare for the vector type of the interleaved load/store. Type *ScalarTy = getMemInstValueType(Instr); unsigned InterleaveFactor = Group->getFactor(); Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF); - Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr)); // Prepare for the new pointers. - setDebugLocFromInst(Builder, Ptr); - SmallVector<Value *, 2> NewPtrs; + SmallVector<Value *, 2> AddrParts; unsigned Index = Group->getIndex(Instr); - VectorParts Mask; - bool IsMaskForCondRequired = BlockInMask; - if (IsMaskForCondRequired) { - Mask = *BlockInMask; - // TODO: extend the masked interleaved-group support to reversed access. - assert(!Group->isReverse() && "Reversed masked interleave-group " - "not supported."); - } + // TODO: extend the masked interleaved-group support to reversed access. + assert((!BlockInMask || !Group->isReverse()) && + "Reversed masked interleave-group not supported."); // If the group is reverse, adjust the index to refer to the last vector lane // instead of the first. We adjust the index from the first vector lane, @@ -2090,12 +2202,9 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, if (Group->isReverse()) Index += (VF - 1) * Group->getFactor(); - bool InBounds = false; - if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) - InBounds = gep->isInBounds(); - for (unsigned Part = 0; Part < UF; Part++) { - Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0}); + Value *AddrPart = State.get(Addr, {Part, 0}); + setDebugLocFromInst(Builder, AddrPart); // Notice that the current instruction could be at any index. We need to adjust // the address to the member of index 0. @@ -2108,12 +2217,17 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, // A[i] = b; // Member of index 0 // A[i+2] = c; // Member of index 2 (Current instruction) // The current pointer points to A[i+2]; adjust it to A[i]. 
- NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index)); - if (InBounds) - cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true); + + bool InBounds = false; + if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) + InBounds = gep->isInBounds(); + AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); + cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); // Cast to the vector pointer type. - NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy)); + unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); + Type *PtrTy = VecTy->getPointerTo(AddressSpace); + AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); } setDebugLocFromInst(Builder, Instr); @@ -2131,26 +2245,27 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, SmallVector<Value *, 2> NewLoads; for (unsigned Part = 0; Part < UF; Part++) { Instruction *NewLoad; - if (IsMaskForCondRequired || MaskForGaps) { + if (BlockInMask || MaskForGaps) { assert(useMaskedInterleavedAccesses(*TTI) && "masked interleaved groups are not allowed."); Value *GroupMask = MaskForGaps; - if (IsMaskForCondRequired) { - auto *Undefs = UndefValue::get(Mask[Part]->getType()); + if (BlockInMask) { + Value *BlockInMaskPart = State.get(BlockInMask, Part); + auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); Value *ShuffledMask = Builder.CreateShuffleVector( - Mask[Part], Undefs, RepMask, "interleaved.mask"); + BlockInMaskPart, Undefs, RepMask, "interleaved.mask"); GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask, MaskForGaps) : ShuffledMask; } NewLoad = - Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(), + Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlignment(), GroupMask, UndefVec, "wide.masked.vec"); } else - NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part], + NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], Group->getAlignment(), "wide.vec"); Group->addMetadata(NewLoad); NewLoads.push_back(NewLoad); @@ -2219,24 +2334,27 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, "interleaved.vec"); Instruction *NewStoreInstr; - if (IsMaskForCondRequired) { - auto *Undefs = UndefValue::get(Mask[Part]->getType()); + if (BlockInMask) { + Value *BlockInMaskPart = State.get(BlockInMask, Part); + auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); Value *ShuffledMask = Builder.CreateShuffleVector( - Mask[Part], Undefs, RepMask, "interleaved.mask"); + BlockInMaskPart, Undefs, RepMask, "interleaved.mask"); NewStoreInstr = Builder.CreateMaskedStore( - IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask); + IVec, AddrParts[Part], Group->getAlignment(), ShuffledMask); } else - NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part], - Group->getAlignment()); + NewStoreInstr = Builder.CreateAlignedStore(IVec, AddrParts[Part], + Group->getAlignment()); Group->addMetadata(NewStoreInstr); } } void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, - VectorParts *BlockInMask) { + VPTransformState &State, + VPValue *Addr, + VPValue *BlockInMask) { // Attempt to issue a wide load. 
LoadInst *LI = dyn_cast<LoadInst>(Instr); StoreInst *SI = dyn_cast<StoreInst>(Instr); @@ -2248,18 +2366,15 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, assert(Decision != LoopVectorizationCostModel::CM_Unknown && "CM decision should be taken at this point"); if (Decision == LoopVectorizationCostModel::CM_Interleave) - return vectorizeInterleaveGroup(Instr); + return vectorizeInterleaveGroup(Instr, State, Addr, BlockInMask); Type *ScalarDataTy = getMemInstValueType(Instr); Type *DataTy = VectorType::get(ScalarDataTy, VF); - Value *Ptr = getLoadStorePointerOperand(Instr); - unsigned Alignment = getLoadStoreAlignment(Instr); // An alignment of 0 means target abi alignment. We need to use the scalar's // target abi alignment in such a case. const DataLayout &DL = Instr->getModule()->getDataLayout(); - if (!Alignment) - Alignment = DL.getABITypeAlignment(ScalarDataTy); - unsigned AddressSpace = getLoadStoreAddressSpace(Instr); + const Align Alignment = + DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy); // Determine if the pointer operand of the access is either consecutive or // reverse consecutive. @@ -2273,25 +2388,22 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, // gather/scatter. Otherwise Decision should have been to Scalarize. assert((ConsecutiveStride || CreateGatherScatter) && "The instruction should be scalarized"); + (void)ConsecutiveStride; - // Handle consecutive loads/stores. - if (ConsecutiveStride) - Ptr = getOrCreateScalarValue(Ptr, {0, 0}); - - VectorParts Mask; + VectorParts BlockInMaskParts(UF); bool isMaskRequired = BlockInMask; if (isMaskRequired) - Mask = *BlockInMask; - - bool InBounds = false; - if (auto *gep = dyn_cast<GetElementPtrInst>( - getLoadStorePointerOperand(Instr)->stripPointerCasts())) - InBounds = gep->isInBounds(); + for (unsigned Part = 0; Part < UF; ++Part) + BlockInMaskParts[Part] = State.get(BlockInMask, Part); const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { // Calculate the pointer for the specific unroll-part. GetElementPtrInst *PartPtr = nullptr; + bool InBounds = false; + if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) + InBounds = gep->isInBounds(); + if (Reverse) { // If the address is consecutive but reversed, then the // wide store needs to start at the last vector element. @@ -2302,13 +2414,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); PartPtr->setIsInBounds(InBounds); if (isMaskRequired) // Reverse of a null all-one mask is a null mask. - Mask[Part] = reverseVector(Mask[Part]); + BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); } else { PartPtr = cast<GetElementPtrInst>( Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); PartPtr->setIsInBounds(InBounds); } + unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); }; @@ -2320,10 +2433,10 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, Instruction *NewSI = nullptr; Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part); if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr; - Value *VectorGep = getOrCreateVectorValue(Ptr, Part); - NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, - MaskPart); + Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; + Value *VectorGep = State.get(Addr, Part); + NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, + Alignment.value(), MaskPart); } else { if (Reverse) { // If we store to reverse consecutive memory locations, then we need @@ -2332,12 +2445,13 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, // We don't want to update the value in the map as it might be used in // another expression. So don't call resetVectorValue(StoredVal). } - auto *VecPtr = CreateVecPtr(Part, Ptr); + auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); if (isMaskRequired) - NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, - Mask[Part]); + NewSI = Builder.CreateMaskedStore( + StoredVal, VecPtr, Alignment.value(), BlockInMaskParts[Part]); else - NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); + NewSI = + Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value()); } addMetadata(NewSI, SI); } @@ -2350,20 +2464,20 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, for (unsigned Part = 0; Part < UF; ++Part) { Value *NewLI; if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr; - Value *VectorGep = getOrCreateVectorValue(Ptr, Part); - NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, + Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; + Value *VectorGep = State.get(Addr, Part); + NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart, nullptr, "wide.masked.gather"); addMetadata(NewLI, LI); } else { - auto *VecPtr = CreateVecPtr(Part, Ptr); + auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); if (isMaskRequired) - NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part], - UndefValue::get(DataTy), - "wide.masked.load"); + NewLI = Builder.CreateMaskedLoad( + VecPtr, Alignment.value(), BlockInMaskParts[Part], + UndefValue::get(DataTy), "wide.masked.load"); else - NewLI = - Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); + NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(), + "wide.load"); // Add metadata to the load, but setVectorValue to the reverse shuffle. addMetadata(NewLI, LI); @@ -2570,8 +2684,10 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass) { Value *Count = getOrCreateTripCount(L); - BasicBlock *BB = L->getLoopPreheader(); - IRBuilder<> Builder(BB->getTerminator()); + // Reuse existing vector loop preheader for TC checks. + // Note that new preheader block is generated for vector loop. + BasicBlock *const TCCheckBlock = LoopVectorPreHeader; + IRBuilder<> Builder(TCCheckBlock->getTerminator()); // Generate code to check if the loop's trip count is less than VF * UF, or // equal to it in case a scalar epilogue is required; this implies that the @@ -2588,47 +2704,61 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, P, Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check"); - BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); - // Update dominator tree immediately if the generated block is a - // LoopBypassBlock because SCEV expansions to generate loop bypass - // checks may query it before the current function is finished. 
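// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): the control flow that the
// min.iters.check above guards, written as plain C++. The vector body
// consumes VF * UF iterations at a time, so it is bypassed (branch to the
// scalar preheader) unless at least that many iterations remain; the
// comparison becomes ULE instead of ULT when a scalar epilogue is required.
// Function and constant names here are made up.
#include <cstddef>

void saxpy(float *Y, const float *X, float A, std::size_t N) {
  const std::size_t VF = 4, UF = 2; // vectorization and unroll factors
  std::size_t I = 0;
  if (!(N < VF * UF)) {             // min.iters.check; false => vector.ph
    std::size_t VectorN = N - N % (VF * UF);
    for (; I < VectorN; ++I)        // stands in for vector.body, which
      Y[I] += A * X[I];             // really advances by VF * UF per trip
  }
  for (; I < N; ++I)                // scalar epilogue reached via scalar.ph
    Y[I] += A * X[I];
}
// ---------------------------------------------------------------------------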
- DT->addNewBlock(NewBB, BB); - if (L->getParentLoop()) - L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); - ReplaceInstWithInst(BB->getTerminator(), - BranchInst::Create(Bypass, NewBB, CheckMinIters)); - LoopBypassBlocks.push_back(BB); + // Create new preheader for vector loop. + LoopVectorPreHeader = + SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, + "vector.ph"); + + assert(DT->properlyDominates(DT->getNode(TCCheckBlock), + DT->getNode(Bypass)->getIDom()) && + "TC check is expected to dominate Bypass"); + + // Update dominator for Bypass & LoopExit. + DT->changeImmediateDominator(Bypass, TCCheckBlock); + DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); + + ReplaceInstWithInst( + TCCheckBlock->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); + LoopBypassBlocks.push_back(TCCheckBlock); } void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { - BasicBlock *BB = L->getLoopPreheader(); + // Reuse existing vector loop preheader for SCEV checks. + // Note that new preheader block is generated for vector loop. + BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; // Generate the code to check that the SCEV assumptions that we made. // We want the new basic block to start at the first instruction in a // sequence of instructions that form a check. SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), "scev.check"); - Value *SCEVCheck = - Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator()); + Value *SCEVCheck = Exp.expandCodeForPredicate( + &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) if (C->isZero()) return; - assert(!Cost->foldTailByMasking() && - "Cannot SCEV check stride or overflow when folding tail"); - // Create a new block containing the stride check. - BB->setName("vector.scevcheck"); - auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); - // Update dominator tree immediately if the generated block is a - // LoopBypassBlock because SCEV expansions to generate loop bypass - // checks may query it before the current function is finished. - DT->addNewBlock(NewBB, BB); - if (L->getParentLoop()) - L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); - ReplaceInstWithInst(BB->getTerminator(), - BranchInst::Create(Bypass, NewBB, SCEVCheck)); - LoopBypassBlocks.push_back(BB); + assert(!SCEVCheckBlock->getParent()->hasOptSize() && + "Cannot SCEV check stride or overflow when optimizing for size"); + + SCEVCheckBlock->setName("vector.scevcheck"); + // Create new preheader for vector loop. + LoopVectorPreHeader = + SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, + nullptr, "vector.ph"); + + // Update dominator only if this is first RT check. + if (LoopBypassBlocks.empty()) { + DT->changeImmediateDominator(Bypass, SCEVCheckBlock); + DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); + } + + ReplaceInstWithInst( + SCEVCheckBlock->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); + LoopBypassBlocks.push_back(SCEVCheckBlock); AddedSafetyChecks = true; } @@ -2637,7 +2767,9 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { if (EnableVPlanNativePath) return; - BasicBlock *BB = L->getLoopPreheader(); + // Reuse existing vector loop preheader for runtime memory checks. + // Note that new preheader block is generated for vector loop. 
+ BasicBlock *const MemCheckBlock = L->getLoopPreheader(); // Generate the code that checks in runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements @@ -2645,29 +2777,46 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { Instruction *FirstCheckInst; Instruction *MemRuntimeCheck; std::tie(FirstCheckInst, MemRuntimeCheck) = - Legal->getLAI()->addRuntimeChecks(BB->getTerminator()); + Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator()); if (!MemRuntimeCheck) return; - assert(!Cost->foldTailByMasking() && "Cannot check memory when folding tail"); - // Create a new block containing the memory check. - BB->setName("vector.memcheck"); - auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); - // Update dominator tree immediately if the generated block is a - // LoopBypassBlock because SCEV expansions to generate loop bypass - // checks may query it before the current function is finished. - DT->addNewBlock(NewBB, BB); - if (L->getParentLoop()) - L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); - ReplaceInstWithInst(BB->getTerminator(), - BranchInst::Create(Bypass, NewBB, MemRuntimeCheck)); - LoopBypassBlocks.push_back(BB); + if (MemCheckBlock->getParent()->hasOptSize()) { + assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && + "Cannot emit memory checks when optimizing for size, unless forced " + "to vectorize."); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", + L->getStartLoc(), L->getHeader()) + << "Code-size may be reduced by not forcing " + "vectorization, or by source-code modifications " + "eliminating the need for runtime checks " + "(e.g., adding 'restrict')."; + }); + } + + MemCheckBlock->setName("vector.memcheck"); + // Create new preheader for vector loop. + LoopVectorPreHeader = + SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, + "vector.ph"); + + // Update dominator only if this is first RT check. + if (LoopBypassBlocks.empty()) { + DT->changeImmediateDominator(Bypass, MemCheckBlock); + DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); + } + + ReplaceInstWithInst( + MemCheckBlock->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck)); + LoopBypassBlocks.push_back(MemCheckBlock); AddedSafetyChecks = true; // We currently don't use LoopVersioning for the actual loop cloning but we // still use it to add the noalias metadata. - LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, - PSE.getSE()); + LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, + PSE.getSE()); LVer->prepareNoAliasMetadata(); } @@ -2792,12 +2941,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { ... */ - BasicBlock *OldBasicBlock = OrigLoop->getHeader(); - BasicBlock *VectorPH = OrigLoop->getLoopPreheader(); - BasicBlock *ExitBlock = OrigLoop->getExitBlock(); MDNode *OrigLoopID = OrigLoop->getLoopID(); - assert(VectorPH && "Invalid loop structure"); - assert(ExitBlock && "Must have an exit block"); // Some loops have a single integer induction variable, while other loops // don't. One example is c++ iterators that often have multiple pointer @@ -2814,12 +2958,27 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { Type *IdxTy = Legal->getWidestInductionType(); // Split the single block loop into the two loop structure described above. 
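// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): the kind of predicate that
// addRuntimeChecks() materializes in vector.memcheck. Two accessed byte
// ranges are proven disjoint so the vector loop may reorder their accesses;
// if any pair may overlap, control branches to the scalar loop instead.
// Plain C++; names are made up.
#include <cstddef>
#include <cstdint>

// Half-open ranges [AStart, AEnd) and [BStart, BEnd) do not overlap.
static bool rangesDisjoint(std::uintptr_t AStart, std::uintptr_t AEnd,
                           std::uintptr_t BStart, std::uintptr_t BEnd) {
  return AEnd <= BStart || BEnd <= AStart;
}

bool mayRunVectorLoop(const float *Dst, const float *Src, std::size_t N) {
  auto D = reinterpret_cast<std::uintptr_t>(Dst);
  auto S = reinterpret_cast<std::uintptr_t>(Src);
  return rangesDisjoint(D, D + N * sizeof(float), S, S + N * sizeof(float));
}
// ---------------------------------------------------------------------------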
-  BasicBlock *VecBody =
-      VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
-  BasicBlock *MiddleBlock =
-      VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
-  BasicBlock *ScalarPH =
-      MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
+  LoopScalarBody = OrigLoop->getHeader();
+  LoopVectorPreHeader = OrigLoop->getLoopPreheader();
+  LoopExitBlock = OrigLoop->getExitBlock();
+  assert(LoopExitBlock && "Must have an exit block");
+  assert(LoopVectorPreHeader && "Invalid loop structure");
+
+  LoopMiddleBlock =
+      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
+                 LI, nullptr, "middle.block");
+  LoopScalarPreHeader =
+      SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
+                 nullptr, "scalar.ph");
+  // We intentionally don't let SplitBlock update LoopInfo since
+  // LoopVectorBody should belong to another loop than LoopVectorPreHeader.
+  // LoopVectorBody is explicitly added to the correct place a few lines later.
+  LoopVectorBody =
+      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
+                 nullptr, nullptr, "vector.body");
+
+  // Update dominator for loop exit.
+  DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);

   // Create and register the new vector loop.
   Loop *Lp = LI->AllocateLoop();
@@ -2829,12 +2988,10 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
   // before calling any utilities such as SCEV that require valid LoopInfo.
   if (ParentLoop) {
     ParentLoop->addChildLoop(Lp);
-    ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
-    ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
   } else {
     LI->addTopLevelLoop(Lp);
   }
-  Lp->addBasicBlockToLoop(VecBody, *LI);
+  Lp->addBasicBlockToLoop(LoopVectorBody, *LI);

   // Find the loop boundaries.
   Value *Count = getOrCreateTripCount(Lp);
@@ -2846,16 +3003,16 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
   // backedge-taken count is uint##_max: adding one to it will overflow leading
   // to an incorrect trip count of zero. In this (rare) case we will also jump
   // to the scalar loop.
-  emitMinimumIterationCountCheck(Lp, ScalarPH);
+  emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);

   // Generate the code to check any assumptions that we've made for SCEV
   // expressions.
-  emitSCEVChecks(Lp, ScalarPH);
+  emitSCEVChecks(Lp, LoopScalarPreHeader);

   // Generate the code that checks in runtime if arrays overlap. We put the
   // checks into a separate block to make the more common case of few elements
   // faster.
-  emitMemRuntimeChecks(Lp, ScalarPH);
+  emitMemRuntimeChecks(Lp, LoopScalarPreHeader);

   // Generate the induction variable.
   // The loop step is equal to the vectorization factor (num of SIMD elements)
@@ -2883,8 +3040,9 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
     InductionDescriptor II = InductionEntry.second;

     // Create phi nodes to merge from the backedge-taken check block.
-    PHINode *BCResumeVal = PHINode::Create(
-        OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
+    PHINode *BCResumeVal =
+        PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
+                        LoopScalarPreHeader->getTerminator());
     // Copy original phi DL over to the new one.
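// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): the block layout the three
// SplitBlock() calls above produce, before emitMinimumIterationCountCheck /
// emitSCEVChecks / emitMemRuntimeChecks split further check blocks out of
// vector.ph (each such check block gets a bypass edge to scalar.ph):
//
//   [ original preheader ]   <- LoopVectorPreHeader ("vector.ph")
//             |
//       [ vector.body ]      <- LoopVectorBody, the new vector loop
//             |
//      [ middle.block ]      <- LoopMiddleBlock, branches on cmp.n
//         /        \
//  [ scalar.ph ]  [ exit ]   <- LoopScalarPreHeader, LoopExitBlock
//        |
//  [ scalar loop ]           <- LoopScalarBody (the original loop)
// ---------------------------------------------------------------------------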
BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); Value *&EndValue = IVEndValues[OrigPhi]; @@ -2895,23 +3053,23 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { IRBuilder<> B(Lp->getLoopPreheader()->getTerminator()); Type *StepType = II.getStep()->getType(); Instruction::CastOps CastOp = - CastInst::getCastOpcode(CountRoundDown, true, StepType, true); + CastInst::getCastOpcode(CountRoundDown, true, StepType, true); Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); - const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); + const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); EndValue->setName("ind.end"); } // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. - BCResumeVal->addIncoming(EndValue, MiddleBlock); + BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); // Fix the scalar body counter (PHI node). // The old induction's phi node in the scalar body needs the truncated // value. for (BasicBlock *BB : LoopBypassBlocks) BCResumeVal->addIncoming(II.getStartValue(), BB); - OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal); + OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); } // We need the OrigLoop (scalar loop part) latch terminator to help @@ -2929,9 +3087,9 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // If tail is to be folded, we know we don't need to run the remainder. Value *CmpN = Builder.getTrue(); if (!Cost->foldTailByMasking()) { - CmpN = - CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, - CountRoundDown, "cmp.n", MiddleBlock->getTerminator()); + CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, + CountRoundDown, "cmp.n", + LoopMiddleBlock->getTerminator()); // Here we use the same DebugLoc as the scalar loop latch branch instead // of the corresponding compare because they may have ended up with @@ -2940,20 +3098,15 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); } - BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN); + BranchInst *BrInst = + BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); - ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst); + ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); // Get ready to start creating new instructions into the vectorized body. - Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt()); - - // Save the state. 
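// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): what the bc.resume.val phi
// built above computes. scalar.ph has one predecessor per bypass block plus
// middle.block; if any runtime check failed the scalar loop starts from the
// original start value, otherwise it resumes after the iterations the vector
// loop already executed (ind.end). Plain C++; names are made up.
#include <cstddef>

std::size_t bcResumeVal(bool TookBypass, std::size_t StartValue,
                        std::size_t IndEnd) {
  // One incoming value per predecessor of scalar.ph:
  //   StartValue from every bypass block, IndEnd from middle.block.
  return TookBypass ? StartValue : IndEnd;
}
// ---------------------------------------------------------------------------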
- LoopVectorPreHeader = Lp->getLoopPreheader(); - LoopScalarPreHeader = ScalarPH; - LoopMiddleBlock = MiddleBlock; - LoopExitBlock = ExitBlock; - LoopVectorBody = VecBody; - LoopScalarBody = OldBasicBlock; + assert(LoopVectorPreHeader == Lp->getLoopPreheader() && + "Inconsistent vector loop preheader"); + Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); Optional<MDNode *> VectorizedLoopID = makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, @@ -2974,6 +3127,11 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { LoopVectorizeHints Hints(Lp, true, *ORE); Hints.setAlreadyVectorized(); +#ifdef EXPENSIVE_CHECKS + assert(DT->verify(DominatorTree::VerificationLevel::Fast)); + LI->verify(*DT); +#endif + return LoopVectorPreHeader; } @@ -3309,15 +3467,8 @@ void InnerLoopVectorizer::fixVectorizedLoop() { // This is the second stage of vectorizing recurrences. fixCrossIterationPHIs(); - // Update the dominator tree. - // - // FIXME: After creating the structure of the new loop, the dominator tree is - // no longer up-to-date, and it remains that way until we update it - // here. An out-of-date dominator tree is problematic for SCEV, - // because SCEVExpander uses it to guide code generation. The - // vectorizer use SCEVExpanders in several places. Instead, we should - // keep the dominator tree up-to-date as we go. - updateAnalysis(); + // Forget the original basic block. + PSE.getSE()->forgetLoop(OrigLoop); // Fix-up external users of the induction variables. for (auto &Entry : *Legal->getInductionVars()) @@ -3430,17 +3581,27 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // among all unrolled iterations, due to the order of their construction. Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); - // Set the insertion point after the previous value if it is an instruction. + // Find and set the insertion point after the previous value if it is an + // instruction. + BasicBlock::iterator InsertPt; // Note that the previous value may have been constant-folded so it is not - // guaranteed to be an instruction in the vector loop. Also, if the previous - // value is a phi node, we should insert after all the phi nodes to avoid - // breaking basic block verification. - if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) || - isa<PHINode>(PreviousLastPart)) - Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); - else - Builder.SetInsertPoint( - &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart))); + // guaranteed to be an instruction in the vector loop. + // FIXME: Loop invariant values do not form recurrences. We should deal with + // them earlier. + if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) + InsertPt = LoopVectorBody->getFirstInsertionPt(); + else { + Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); + if (isa<PHINode>(PreviousLastPart)) + // If the previous value is a phi node, we should insert after all the phi + // nodes in the block containing the PHI to avoid breaking basic block + // verification. Note that the basic block may be different to + // LoopVectorBody, in case we predicate the loop. + InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); + else + InsertPt = ++PreviousInst->getIterator(); + } + Builder.SetInsertPoint(&*InsertPt); // We will construct a vector for the recurrence by combining the values for // the current and previous iterations. This is the required shuffle mask. 
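// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): the shuffle that stitches a
// first-order recurrence together, as described above. Lane 0 of the combined
// vector takes the last lane of the previous iteration's vector; the other
// lanes take lanes 0..VF-2 of the current one, i.e. the shuffle mask is
// {VF-1, VF, ..., 2*VF-2} over the concatenation (Previous, Current).
// Plain C++ stand-in with made-up values.
#include <array>
#include <cstdio>

int main() {
  constexpr unsigned VF = 4;
  std::array<int, VF> Previous = {0, 1, 2, 3}; // parts from the last iteration
  std::array<int, VF> Current = {4, 5, 6, 7};  // parts from this iteration

  std::array<int, VF> Combined;
  for (unsigned I = 0; I < VF; ++I) {
    unsigned MaskElt = VF - 1 + I; // {3, 4, 5, 6} for VF == 4
    Combined[I] = MaskElt < VF ? Previous[MaskElt] : Current[MaskElt - VF];
  }
  for (int V : Combined)
    std::printf("%d ", V); // prints: 3 4 5 6
  std::printf("\n");
  return 0;
}
// ---------------------------------------------------------------------------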
@@ -3573,16 +3734,20 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { } } + // Wrap flags are in general invalid after vectorization, clear them. + clearReductionWrapFlags(RdxDesc); + // Fix the vector-loop phi. // Reductions do not have to start at zero. They can start with // any loop invariant values. BasicBlock *Latch = OrigLoop->getLoopLatch(); Value *LoopVal = Phi->getIncomingValueForBlock(Latch); + for (unsigned Part = 0; Part < UF; ++Part) { Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); Value *Val = getOrCreateVectorValue(LoopVal, Part); - // Make sure to add the reduction stat value only to the + // Make sure to add the reduction start value only to the // first unroll part. Value *StartVal = (Part == 0) ? VectorStart : Identity; cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader); @@ -3598,6 +3763,26 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { setDebugLocFromInst(Builder, LoopExitInst); + // If tail is folded by masking, the vector value to leave the loop should be + // a Select choosing between the vectorized LoopExitInst and vectorized Phi, + // instead of the former. + if (Cost->foldTailByMasking()) { + for (unsigned Part = 0; Part < UF; ++Part) { + Value *VecLoopExitInst = + VectorLoopValueMap.getVectorValue(LoopExitInst, Part); + Value *Sel = nullptr; + for (User *U : VecLoopExitInst->users()) { + if (isa<SelectInst>(U)) { + assert(!Sel && "Reduction exit feeding two selects"); + Sel = U; + } else + assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); + } + assert(Sel && "Reduction exit feeds no select"); + VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); + } + } + // If the vector reduction can be performed in a smaller type, we truncate // then extend the loop exit value to enable InstCombine to evaluate the // entire expression in the smaller type. @@ -3699,6 +3884,37 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); } +void InnerLoopVectorizer::clearReductionWrapFlags( + RecurrenceDescriptor &RdxDesc) { + RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); + if (RK != RecurrenceDescriptor::RK_IntegerAdd && + RK != RecurrenceDescriptor::RK_IntegerMult) + return; + + Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); + assert(LoopExitInstr && "null loop exit instruction"); + SmallVector<Instruction *, 8> Worklist; + SmallPtrSet<Instruction *, 8> Visited; + Worklist.push_back(LoopExitInstr); + Visited.insert(LoopExitInstr); + + while (!Worklist.empty()) { + Instruction *Cur = Worklist.pop_back_val(); + if (isa<OverflowingBinaryOperator>(Cur)) + for (unsigned Part = 0; Part < UF; ++Part) { + Value *V = getOrCreateVectorValue(Cur, Part); + cast<Instruction>(V)->dropPoisonGeneratingFlags(); + } + + for (User *U : Cur->users()) { + Instruction *UI = cast<Instruction>(U); + if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && + Visited.insert(UI).second) + Worklist.push_back(UI); + } + } +} + void InnerLoopVectorizer::fixLCSSAPHIs() { for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { if (LCSSAPhi.getNumIncomingValues() == 1) { @@ -3820,6 +4036,75 @@ void InnerLoopVectorizer::fixNonInductionPHIs() { } } +void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF, + unsigned VF, bool IsPtrLoopInvariant, + SmallBitVector &IsIndexLoopInvariant) { + // Construct a vector GEP by widening the operands of the scalar GEP as + // necessary. We mark the vector GEP 'inbounds' if appropriate. 
A GEP + // results in a vector of pointers when at least one operand of the GEP + // is vector-typed. Thus, to keep the representation compact, we only use + // vector-typed operands for loop-varying values. + + if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { + // If we are vectorizing, but the GEP has only loop-invariant operands, + // the GEP we build (by only using vector-typed operands for + // loop-varying values) would be a scalar pointer. Thus, to ensure we + // produce a vector of pointers, we need to either arbitrarily pick an + // operand to broadcast, or broadcast a clone of the original GEP. + // Here, we broadcast a clone of the original. + // + // TODO: If at some point we decide to scalarize instructions having + // loop-invariant operands, this special case will no longer be + // required. We would add the scalarization decision to + // collectLoopScalars() and teach getVectorValue() to broadcast + // the lane-zero scalar value. + auto *Clone = Builder.Insert(GEP->clone()); + for (unsigned Part = 0; Part < UF; ++Part) { + Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); + VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart); + addMetadata(EntryPart, GEP); + } + } else { + // If the GEP has at least one loop-varying operand, we are sure to + // produce a vector of pointers. But if we are only unrolling, we want + // to produce a scalar GEP for each unroll part. Thus, the GEP we + // produce with the code below will be scalar (if VF == 1) or vector + // (otherwise). Note that for the unroll-only case, we still maintain + // values in the vector mapping with initVector, as we do for other + // instructions. + for (unsigned Part = 0; Part < UF; ++Part) { + // The pointer operand of the new GEP. If it's loop-invariant, we + // won't broadcast it. + auto *Ptr = IsPtrLoopInvariant + ? GEP->getPointerOperand() + : getOrCreateVectorValue(GEP->getPointerOperand(), Part); + + // Collect all the indices for the new GEP. If any index is + // loop-invariant, we won't broadcast it. + SmallVector<Value *, 4> Indices; + for (auto Index : enumerate(GEP->indices())) { + Value *User = Index.value().get(); + if (IsIndexLoopInvariant[Index.index()]) + Indices.push_back(User); + else + Indices.push_back(getOrCreateVectorValue(User, Part)); + } + + // Create the new GEP. Note that this GEP may be a scalar if VF == 1, + // but it should be a vector, otherwise. + auto *NewGEP = + GEP->isInBounds() + ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, + Indices) + : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); + assert((VF == 1 || NewGEP->getType()->isVectorTy()) && + "NewGEP is not a pointer vector"); + VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP); + addMetadata(NewGEP, GEP); + } + } +} + void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF) { PHINode *P = cast<PHINode>(PN); @@ -3922,76 +4207,8 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { switch (I.getOpcode()) { case Instruction::Br: case Instruction::PHI: + case Instruction::GetElementPtr: llvm_unreachable("This instruction is handled by a different recipe."); - case Instruction::GetElementPtr: { - // Construct a vector GEP by widening the operands of the scalar GEP as - // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP - // results in a vector of pointers when at least one operand of the GEP - // is vector-typed. 
Thus, to keep the representation compact, we only use - // vector-typed operands for loop-varying values. - auto *GEP = cast<GetElementPtrInst>(&I); - - if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) { - // If we are vectorizing, but the GEP has only loop-invariant operands, - // the GEP we build (by only using vector-typed operands for - // loop-varying values) would be a scalar pointer. Thus, to ensure we - // produce a vector of pointers, we need to either arbitrarily pick an - // operand to broadcast, or broadcast a clone of the original GEP. - // Here, we broadcast a clone of the original. - // - // TODO: If at some point we decide to scalarize instructions having - // loop-invariant operands, this special case will no longer be - // required. We would add the scalarization decision to - // collectLoopScalars() and teach getVectorValue() to broadcast - // the lane-zero scalar value. - auto *Clone = Builder.Insert(GEP->clone()); - for (unsigned Part = 0; Part < UF; ++Part) { - Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); - VectorLoopValueMap.setVectorValue(&I, Part, EntryPart); - addMetadata(EntryPart, GEP); - } - } else { - // If the GEP has at least one loop-varying operand, we are sure to - // produce a vector of pointers. But if we are only unrolling, we want - // to produce a scalar GEP for each unroll part. Thus, the GEP we - // produce with the code below will be scalar (if VF == 1) or vector - // (otherwise). Note that for the unroll-only case, we still maintain - // values in the vector mapping with initVector, as we do for other - // instructions. - for (unsigned Part = 0; Part < UF; ++Part) { - // The pointer operand of the new GEP. If it's loop-invariant, we - // won't broadcast it. - auto *Ptr = - OrigLoop->isLoopInvariant(GEP->getPointerOperand()) - ? GEP->getPointerOperand() - : getOrCreateVectorValue(GEP->getPointerOperand(), Part); - - // Collect all the indices for the new GEP. If any index is - // loop-invariant, we won't broadcast it. - SmallVector<Value *, 4> Indices; - for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) { - if (OrigLoop->isLoopInvariant(U.get())) - Indices.push_back(U.get()); - else - Indices.push_back(getOrCreateVectorValue(U.get(), Part)); - } - - // Create the new GEP. Note that this GEP may be a scalar if VF == 1, - // but it should be a vector, otherwise. - auto *NewGEP = - GEP->isInBounds() - ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, - Indices) - : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); - assert((VF == 1 || NewGEP->getType()->isVectorTy()) && - "NewGEP is not a pointer vector"); - VectorLoopValueMap.setVectorValue(&I, Part, NewGEP); - addMetadata(NewGEP, GEP); - } - } - - break; - } case Instruction::UDiv: case Instruction::SDiv: case Instruction::SRem: @@ -4064,7 +4281,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { case Instruction::FCmp: { // Widen compares. Generate vector compares. bool FCmp = (I.getOpcode() == Instruction::FCmp); - auto *Cmp = dyn_cast<CmpInst>(&I); + auto *Cmp = cast<CmpInst>(&I); setDebugLocFromInst(Builder, Cmp); for (unsigned Part = 0; Part < UF; ++Part) { Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part); @@ -4097,7 +4314,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { - auto *CI = dyn_cast<CastInst>(&I); + auto *CI = cast<CastInst>(&I); setDebugLocFromInst(Builder, CI); /// Vectorize casts. 
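// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): the two cases the new
// widenGEP() above distinguishes, modeled lane-by-lane in plain C++. With a
// loop-varying index every lane gets its own address (a true vector of
// pointers); with all-invariant operands the scalar address is computed once
// and broadcast so the result is still vector-typed. Names are made up.
#include <array>
#include <cstddef>

constexpr unsigned VF = 4;

// Loop-varying index: one address per lane.
std::array<const float *, VF>
widenVaryingGEP(const float *Base, std::array<std::size_t, VF> Idx) {
  std::array<const float *, VF> Lanes;
  for (unsigned I = 0; I < VF; ++I)
    Lanes[I] = Base + Idx[I];
  return Lanes;
}

// All operands loop-invariant: compute once, splat the clone.
std::array<const float *, VF> widenInvariantGEP(const float *Base,
                                                std::size_t Idx) {
  std::array<const float *, VF> Lanes;
  Lanes.fill(Base + Idx); // CreateVectorSplat(VF, Clone)
  return Lanes;
}
// ---------------------------------------------------------------------------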
@@ -4195,26 +4412,6 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { } // end of switch. } -void InnerLoopVectorizer::updateAnalysis() { - // Forget the original basic block. - PSE.getSE()->forgetLoop(OrigLoop); - - // DT is not kept up-to-date for outer loop vectorization - if (EnableVPlanNativePath) - return; - - // Update the dominator tree information. - assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && - "Entry does not dominate exit."); - - DT->addNewBlock(LoopMiddleBlock, - LI->getLoopFor(LoopVectorBody)->getLoopLatch()); - DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); - DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); - DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); - assert(DT->verify(DominatorTree::VerificationLevel::Fast)); -} - void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { // We should not collect Scalars more than once per VF. Right now, this // function is called from collectUniformsAndScalars(), which already does @@ -4421,9 +4618,11 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne "Widening decision should be ready at this moment"); return WideningDecision == CM_Scalarize; } - return isa<LoadInst>(I) ? - !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty)) - : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty)); + const MaybeAlign Alignment = getLoadStoreAlignment(I); + return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || + isLegalMaskedGather(Ty, Alignment)) + : !(isLegalMaskedStore(Ty, Ptr, Alignment) || + isLegalMaskedScatter(Ty, Alignment)); } case Instruction::UDiv: case Instruction::SDiv: @@ -4452,10 +4651,10 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, // Check if masking is required. // A Group may need masking for one of two reasons: it resides in a block that // needs predication, or it was decided to use masking to deal with gaps. - bool PredicatedAccessRequiresMasking = + bool PredicatedAccessRequiresMasking = Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); - bool AccessWithGapsRequiresMasking = - Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed; + bool AccessWithGapsRequiresMasking = + Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) return true; @@ -4466,8 +4665,9 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, "Masked interleave-groups for predicated accesses are not enabled."); auto *Ty = getMemInstValueType(I); - return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty) - : TTI.isLegalMaskedStore(Ty); + const MaybeAlign Alignment = getLoadStoreAlignment(I); + return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) + : TTI.isLegalMaskedStore(Ty, Alignment); } bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, @@ -4525,14 +4725,26 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { SetVector<Instruction *> Worklist; BasicBlock *Latch = TheLoop->getLoopLatch(); + // Instructions that are scalar with predication must not be considered + // uniform after vectorization, because that would create an erroneous + // replicating region where only a single instance out of VF should be formed. + // TODO: optimize such seldom cases if found important, see PR40816. 
+ auto addToWorklistIfAllowed = [&](Instruction *I) -> void { + if (isScalarWithPredication(I, VF)) { + LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " + << *I << "\n"); + return; + } + LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); + Worklist.insert(I); + }; + // Start with the conditional branch. If the branch condition is an // instruction contained in the loop that is only used by the branch, it is // uniform. auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); - if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) { - Worklist.insert(Cmp); - LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n"); - } + if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) + addToWorklistIfAllowed(Cmp); // Holds consecutive and consecutive-like pointers. Consecutive-like pointers // are pointers that are treated like consecutive pointers during @@ -4591,10 +4803,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { // Add to the Worklist all consecutive and consecutive-like pointers that // aren't also identified as possibly non-uniform. for (auto *V : ConsecutiveLikePtrs) - if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) { - LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n"); - Worklist.insert(V); - } + if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) + addToWorklistIfAllowed(V); // Expand Worklist in topological order: whenever a new instruction // is added , its users should be already inside Worklist. It ensures @@ -4620,10 +4830,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { return Worklist.count(J) || (OI == getLoadStorePointerOperand(J) && isUniformDecision(J, VF)); - })) { - Worklist.insert(OI); - LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n"); - } + })) + addToWorklistIfAllowed(OI); } } @@ -4665,92 +4873,103 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { continue; // The induction variable and its update instruction will remain uniform. - Worklist.insert(Ind); - Worklist.insert(IndUpdate); - LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n"); - LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate - << "\n"); + addToWorklistIfAllowed(Ind); + addToWorklistIfAllowed(IndUpdate); } Uniforms[VF].insert(Worklist.begin(), Worklist.end()); } -Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) { - if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { - // TODO: It may by useful to do since it's still likely to be dynamically - // uniform if the target can skip. - LLVM_DEBUG( - dbgs() << "LV: Not inserting runtime ptr check for divergent target"); - - ORE->emit( - createMissedAnalysis("CantVersionLoopWithDivergentTarget") - << "runtime pointer checks needed. Not enabled for divergent target"); - - return None; - } - - unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); - if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize. - return computeFeasibleMaxVF(OptForSize, TC); +bool LoopVectorizationCostModel::runtimeChecksRequired() { + LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); if (Legal->getRuntimePointerChecking()->Need) { - ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize") - << "runtime pointer checks needed. 
Enable vectorization of this "
-                 "loop with '#pragma clang loop vectorize(enable)' when "
-                 "compiling with -Os/-Oz");
-    LLVM_DEBUG(
-        dbgs()
-        << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
-    return None;
+    reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
+        "runtime pointer checks needed. Enable vectorization of this "
+        "loop with '#pragma clang loop vectorize(enable)' when "
+        "compiling with -Os/-Oz",
+        "CantVersionLoopWithOptForSize", ORE, TheLoop);
+    return true;
   }

   if (!PSE.getUnionPredicate().getPredicates().empty()) {
-    ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
-              << "runtime SCEV checks needed. Enable vectorization of this "
-                 "loop with '#pragma clang loop vectorize(enable)' when "
-                 "compiling with -Os/-Oz");
-    LLVM_DEBUG(
-        dbgs()
-        << "LV: Aborting. Runtime SCEV check is required with -Os/-Oz.\n");
-    return None;
+    reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
+        "runtime SCEV checks needed. Enable vectorization of this "
+        "loop with '#pragma clang loop vectorize(enable)' when "
+        "compiling with -Os/-Oz",
+        "CantVersionLoopWithOptForSize", ORE, TheLoop);
+    return true;
   }

   // FIXME: Avoid specializing for stride==1 instead of bailing out.
   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
-    ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
-              << "runtime stride == 1 checks needed. Enable vectorization of "
-                 "this loop with '#pragma clang loop vectorize(enable)' when "
-                 "compiling with -Os/-Oz");
-    LLVM_DEBUG(
-        dbgs()
-        << "LV: Aborting. Runtime stride check is required with -Os/-Oz.\n");
+    reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
+        "runtime stride == 1 checks needed. Enable vectorization of "
+        "this loop with '#pragma clang loop vectorize(enable)' when "
+        "compiling with -Os/-Oz",
+        "CantVersionLoopWithOptForSize", ORE, TheLoop);
+    return true;
+  }
+
+  return false;
+}
+
+Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
+  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
+    // TODO: It may be useful to do since it's still likely to be dynamically
+    // uniform if the target can skip.
+    reportVectorizationFailure(
+        "Not inserting runtime ptr check for divergent target",
+        "runtime pointer checks needed. Not enabled for divergent target",
+        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
     return None;
   }

-  // If we optimize the program for size, avoid creating the tail loop.
+  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
   if (TC == 1) {
-    ORE->emit(createMissedAnalysis("SingleIterationLoop")
-              << "loop trip count is one, irrelevant for vectorization");
-    LLVM_DEBUG(dbgs() << "LV: Aborting, single iteration (non) loop.\n");
+    reportVectorizationFailure("Single iteration (non) loop",
+        "loop trip count is one, irrelevant for vectorization",
+        "SingleIterationLoop", ORE, TheLoop);
     return None;
   }

-  // Record that scalar epilogue is not allowed.
- LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); + switch (ScalarEpilogueStatus) { + case CM_ScalarEpilogueAllowed: + return computeFeasibleMaxVF(TC); + case CM_ScalarEpilogueNotNeededUsePredicate: + LLVM_DEBUG( + dbgs() << "LV: vector predicate hint/switch found.\n" + << "LV: Not allowing scalar epilogue, creating predicated " + << "vector loop.\n"); + break; + case CM_ScalarEpilogueNotAllowedLowTripLoop: + // fallthrough as a special case of OptForSize + case CM_ScalarEpilogueNotAllowedOptSize: + if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) + LLVM_DEBUG( + dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); + else + LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " + << "count.\n"); - IsScalarEpilogueAllowed = !OptForSize; + // Bail if runtime checks are required, which are not good when optimising + // for size. + if (runtimeChecksRequired()) + return None; + break; + } + + // Now try the tail folding - // We don't create an epilogue when optimizing for size. // Invalidate interleave groups that require an epilogue if we can't mask // the interleave-group. - if (!useMaskedInterleavedAccesses(TTI)) + if (!useMaskedInterleavedAccesses(TTI)) InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); - unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC); - + unsigned MaxVF = computeFeasibleMaxVF(TC); if (TC > 0 && TC % MaxVF == 0) { + // Accept MaxVF if we do not have a tail. LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); return MaxVF; } @@ -4759,28 +4978,30 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) { // found modulo the vectorization factor is not zero, try to fold the tail // by masking. // FIXME: look for a smaller MaxVF that does divide TC rather than masking. - if (Legal->canFoldTailByMasking()) { + if (Legal->prepareToFoldTailByMasking()) { FoldTailByMasking = true; return MaxVF; } if (TC == 0) { - ORE->emit( - createMissedAnalysis("UnknownLoopCountComplexCFG") - << "unable to calculate the loop count due to complex control flow"); + reportVectorizationFailure( + "Unable to calculate the loop count due to complex control flow", + "unable to calculate the loop count due to complex control flow", + "UnknownLoopCountComplexCFG", ORE, TheLoop); return None; } - ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize") - << "cannot optimize for size and vectorize at the same time. " - "Enable vectorization of this loop with '#pragma clang loop " - "vectorize(enable)' when compiling with -Os/-Oz"); + reportVectorizationFailure( + "Cannot optimize for size and vectorize at the same time.", + "cannot optimize for size and vectorize at the same time. 
" + "Enable vectorization of this loop with '#pragma clang loop " + "vectorize(enable)' when compiling with -Os/-Oz", + "NoTailLoopWithOptForSize", ORE, TheLoop); return None; } unsigned -LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize, - unsigned ConstTripCount) { +LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); unsigned SmallestType, WidestType; std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); @@ -4818,8 +5039,8 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize, } unsigned MaxVF = MaxVectorSize; - if (TTI.shouldMaximizeVectorBandwidth(OptForSize) || - (MaximizeBandwidth && !OptForSize)) { + if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || + (MaximizeBandwidth && isScalarEpilogueAllowed())) { // Collect all viable vectorization factors larger than the default MaxVF // (i.e. MaxVectorSize). SmallVector<unsigned, 8> VFs; @@ -4832,9 +5053,14 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize, // Select the largest VF which doesn't require more registers than existing // ones. - unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true); for (int i = RUs.size() - 1; i >= 0; --i) { - if (RUs[i].MaxLocalUsers <= TargetNumRegisters) { + bool Selected = true; + for (auto& pair : RUs[i].MaxLocalUsers) { + unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); + if (pair.second > TargetNumRegisters) + Selected = false; + } + if (Selected) { MaxVF = VFs[i]; break; } @@ -4886,10 +5112,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { } if (!EnableCondStoresVectorization && NumPredStores) { - ORE->emit(createMissedAnalysis("ConditionalStore") - << "store that is conditionally executed prevents vectorization"); - LLVM_DEBUG( - dbgs() << "LV: No vectorization. There are conditional stores.\n"); + reportVectorizationFailure("There are conditional stores.", + "store that is conditionally executed prevents vectorization", + "ConditionalStore", ORE, TheLoop); Width = 1; Cost = ScalarCost; } @@ -4958,8 +5183,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { return {MinWidth, MaxWidth}; } -unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, - unsigned VF, +unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, unsigned LoopCost) { // -- The interleave heuristics -- // We interleave the loop in order to expose ILP and reduce the loop overhead. @@ -4975,35 +5199,25 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, // 3. We don't interleave if we think that we will spill registers to memory // due to the increased register pressure. - // When we optimize for size, we don't interleave. - if (OptForSize) + if (!isScalarEpilogueAllowed()) return 1; // We used the distance for the interleave count. if (Legal->getMaxSafeDepDistBytes() != -1U) return 1; - // Do not interleave loops with a relatively small trip count. - unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); - if (TC > 1 && TC < TinyTripCountInterleaveThreshold) + // Do not interleave loops with a relatively small known or estimated trip + // count. 
+ auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); + if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold) return 1; - unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1); - LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters - << " registers\n"); - - if (VF == 1) { - if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) - TargetNumRegisters = ForceTargetNumScalarRegs; - } else { - if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) - TargetNumRegisters = ForceTargetNumVectorRegs; - } - RegisterUsage R = calculateRegisterUsage({VF})[0]; // We divide by these constants so assume that we have at least one // instruction that uses at least one register. - R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); + for (auto& pair : R.MaxLocalUsers) { + pair.second = std::max(pair.second, 1U); + } // We calculate the interleave count using the following formula. // Subtract the number of loop invariants from the number of available @@ -5016,13 +5230,35 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, // We also want power of two interleave counts to ensure that the induction // variable of the vector loop wraps to zero, when tail is folded by masking; // this currently happens when OptForSize, in which case IC is set to 1 above. - unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) / - R.MaxLocalUsers); + unsigned IC = UINT_MAX; - // Don't count the induction variable as interleaved. - if (EnableIndVarRegisterHeur) - IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) / - std::max(1U, (R.MaxLocalUsers - 1))); + for (auto& pair : R.MaxLocalUsers) { + unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); + LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters + << " registers of " + << TTI.getRegisterClassName(pair.first) << " register class\n"); + if (VF == 1) { + if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) + TargetNumRegisters = ForceTargetNumScalarRegs; + } else { + if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) + TargetNumRegisters = ForceTargetNumVectorRegs; + } + unsigned MaxLocalUsers = pair.second; + unsigned LoopInvariantRegs = 0; + if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) + LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; + + unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); + // Don't count the induction variable as interleaved. + if (EnableIndVarRegisterHeur) { + TmpIC = + PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / + std::max(1U, (MaxLocalUsers - 1))); + } + + IC = std::min(IC, TmpIC); + } // Clamp the interleave ranges to reasonable counts. unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); @@ -5036,6 +5272,12 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; } + // If trip count is known or estimated compile time constant, limit the + // interleave count to be less than the trip count divided by VF. + if (BestKnownTC) { + MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount); + } + // If we did not calculate the cost for VF (because the user selected the VF) // then we calculate the cost of VF here. 
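// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): the interleave-count formula
// above as plain arithmetic. Per register class, the registers left over
// after loop-invariant values are divided among the live values of one copy
// of the loop body and rounded down to a power of two; the final IC is the
// minimum over all classes, later clamped by the target's maximum interleave
// factor and by trip count / VF. The example numbers are made up.
#include <algorithm>
#include <climits>
#include <cstdio>

static unsigned powerOf2Floor(unsigned X) {
  unsigned P = 1;
  while (P * 2 <= X)
    P *= 2;
  return X ? P : 0;
}

int main() {
  // Per class: {available registers, loop-invariant values, max live values}.
  struct ClassRU {
    unsigned TargetNumRegisters, LoopInvariantRegs, MaxLocalUsers;
  };
  const ClassRU Classes[] = {{32, 2, 5}, {16, 1, 3}}; // e.g. GPR and vector

  unsigned IC = UINT_MAX;
  for (const ClassRU &C : Classes)
    IC = std::min(IC, powerOf2Floor(
                          (C.TargetNumRegisters - C.LoopInvariantRegs) /
                          C.MaxLocalUsers));
  std::printf("IC = %u\n", IC); // (32-2)/5=6 -> 4; (16-1)/3=5 -> 4; IC = 4
  return 0;
}
// ---------------------------------------------------------------------------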
if (LoopCost == 0) @@ -5044,7 +5286,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, assert(LoopCost && "Non-zero loop cost expected"); // Clamp the calculated IC to be between the 1 and the max interleave count - // that the target allows. + // that the target and trip count allows. if (IC > MaxInterleaveCount) IC = MaxInterleaveCount; else if (IC < 1) @@ -5196,7 +5438,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { const DataLayout &DL = TheFunction->getParent()->getDataLayout(); SmallVector<RegisterUsage, 8> RUs(VFs.size()); - SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0); + SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); @@ -5226,21 +5468,45 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { // For each VF find the maximum usage of registers. for (unsigned j = 0, e = VFs.size(); j < e; ++j) { + // Count the number of live intervals. + SmallMapVector<unsigned, unsigned, 4> RegUsage; + if (VFs[j] == 1) { - MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size()); - continue; + for (auto Inst : OpenIntervals) { + unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); + if (RegUsage.find(ClassID) == RegUsage.end()) + RegUsage[ClassID] = 1; + else + RegUsage[ClassID] += 1; + } + } else { + collectUniformsAndScalars(VFs[j]); + for (auto Inst : OpenIntervals) { + // Skip ignored values for VF > 1. + if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end()) + continue; + if (isScalarAfterVectorization(Inst, VFs[j])) { + unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); + if (RegUsage.find(ClassID) == RegUsage.end()) + RegUsage[ClassID] = 1; + else + RegUsage[ClassID] += 1; + } else { + unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); + if (RegUsage.find(ClassID) == RegUsage.end()) + RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); + else + RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); + } + } } - collectUniformsAndScalars(VFs[j]); - // Count the number of live intervals. - unsigned RegUsage = 0; - for (auto Inst : OpenIntervals) { - // Skip ignored values for VF > 1. - if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() || - isScalarAfterVectorization(Inst, VFs[j])) - continue; - RegUsage += GetRegUsage(Inst->getType(), VFs[j]); + + for (auto& pair : RegUsage) { + if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) + MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); + else + MaxUsages[j][pair.first] = pair.second; } - MaxUsages[j] = std::max(MaxUsages[j], RegUsage); } LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " @@ -5251,18 +5517,34 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { } for (unsigned i = 0, e = VFs.size(); i < e; ++i) { - unsigned Invariant = 0; - if (VFs[i] == 1) - Invariant = LoopInvariants.size(); - else { - for (auto Inst : LoopInvariants) - Invariant += GetRegUsage(Inst->getType(), VFs[i]); + SmallMapVector<unsigned, unsigned, 4> Invariant; + + for (auto Inst : LoopInvariants) { + unsigned Usage = VFs[i] == 1 ? 
1 : GetRegUsage(Inst->getType(), VFs[i]); + unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType()); + if (Invariant.find(ClassID) == Invariant.end()) + Invariant[ClassID] = Usage; + else + Invariant[ClassID] += Usage; } - LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n'); - LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n'); - LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant - << '\n'); + LLVM_DEBUG({ + dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; + dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() + << " item\n"; + for (const auto &pair : MaxUsages[i]) { + dbgs() << "LV(REG): RegisterClass: " + << TTI.getRegisterClassName(pair.first) << ", " << pair.second + << " registers\n"; + } + dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() + << " item\n"; + for (const auto &pair : Invariant) { + dbgs() << "LV(REG): RegisterClass: " + << TTI.getRegisterClassName(pair.first) << ", " << pair.second + << " registers\n"; + } + }); RU.LoopInvariantRegs = Invariant; RU.MaxLocalUsers = MaxUsages[i]; @@ -5511,7 +5793,6 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, Type *ValTy = getMemInstValueType(I); auto SE = PSE.getSE(); - unsigned Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); Value *Ptr = getLoadStorePointerOperand(I); Type *PtrTy = ToVectorTy(Ptr->getType(), VF); @@ -5525,9 +5806,9 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, // Don't pass *I here, since it is scalar but will actually be part of a // vectorized loop where the user of it is a vectorized instruction. - Cost += VF * - TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, - AS); + const MaybeAlign Alignment = getLoadStoreAlignment(I); + Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), + Alignment, AS); // Get the overhead of the extractelement and insertelement instructions // we might create due to scalarization. @@ -5552,16 +5833,17 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, unsigned VF) { Type *ValTy = getMemInstValueType(I); Type *VectorTy = ToVectorTy(ValTy, VF); - unsigned Alignment = getLoadStoreAlignment(I); Value *Ptr = getLoadStorePointerOperand(I); unsigned AS = getLoadStoreAddressSpace(I); int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && "Stride should be 1 or -1 for consecutive memory access"); + const MaybeAlign Alignment = getLoadStoreAlignment(I); unsigned Cost = 0; if (Legal->isMaskRequired(I)) - Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); + Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, + Alignment ? 
Alignment->value() : 0, AS); else Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I); @@ -5575,7 +5857,7 @@ unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, unsigned VF) { Type *ValTy = getMemInstValueType(I); Type *VectorTy = ToVectorTy(ValTy, VF); - unsigned Alignment = getLoadStoreAlignment(I); + const MaybeAlign Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); if (isa<LoadInst>(I)) { return TTI.getAddressComputationCost(ValTy) + @@ -5587,21 +5869,23 @@ unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); return TTI.getAddressComputationCost(ValTy) + TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) + - (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost( - Instruction::ExtractElement, - VectorTy, VF - 1)); + (isLoopInvariantStoreValue + ? 0 + : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, + VF - 1)); } unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, unsigned VF) { Type *ValTy = getMemInstValueType(I); Type *VectorTy = ToVectorTy(ValTy, VF); - unsigned Alignment = getLoadStoreAlignment(I); + const MaybeAlign Alignment = getLoadStoreAlignment(I); Value *Ptr = getLoadStorePointerOperand(I); return TTI.getAddressComputationCost(VectorTy) + TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, - Legal->isMaskRequired(I), Alignment); + Legal->isMaskRequired(I), + Alignment ? Alignment->value() : 0); } unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, @@ -5626,8 +5910,8 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, } // Calculate the cost of the whole interleaved group. - bool UseMaskForGaps = - Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed; + bool UseMaskForGaps = + Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); unsigned Cost = TTI.getInterleavedMemoryOpCost( I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps); @@ -5648,7 +5932,7 @@ unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, // moment. if (VF == 1) { Type *ValTy = getMemInstValueType(I); - unsigned Alignment = getLoadStoreAlignment(I); + const MaybeAlign Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); return TTI.getAddressComputationCost(ValTy) + @@ -5991,7 +6275,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; return N * TTI.getArithmeticInstrCost( I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, - Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands); + Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); } case Instruction::FNeg: { unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; @@ -5999,7 +6283,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, - I->getOperand(0)); + I->getOperand(0), I); } case Instruction::Select: { SelectInst *SI = cast<SelectInst>(I); @@ -6167,8 +6451,7 @@ static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, } VectorizationFactor -LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize, - unsigned UserVF) { +LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { unsigned VF = UserVF; // Outer loop handling: They may require CFG and instruction level // transformations before even evaluating whether vectorization is profitable. @@ -6207,10 +6490,9 @@ LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize, return VectorizationFactor::Disabled(); } -Optional<VectorizationFactor> LoopVectorizationPlanner::plan(bool OptForSize, - unsigned UserVF) { +Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) { assert(OrigLoop->empty() && "Inner loop expected."); - Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize); + Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(); if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. return None; @@ -6434,7 +6716,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); assert(BI && "Unexpected terminator found"); - if (!BI->isConditional()) + if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) return EdgeMaskCache[Edge] = SrcMask; VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); @@ -6490,37 +6772,6 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { return BlockMaskCache[BB] = BlockMask; } -VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I, - VFRange &Range, - VPlanPtr &Plan) { - const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I); - if (!IG) - return nullptr; - - // Now check if IG is relevant for VF's in the given range. - auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> { - return [=](unsigned VF) -> bool { - return (VF >= 2 && // Query is illegal for VF == 1 - CM.getWideningDecision(I, VF) == - LoopVectorizationCostModel::CM_Interleave); - }; - }; - if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range)) - return nullptr; - - // I is a member of an InterleaveGroup for VF's in the (possibly trimmed) - // range. If it's the primary member of the IG construct a VPInterleaveRecipe. - // Otherwise, it's an adjunct member of the IG, do not construct any Recipe. 
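The removed tryToInterleaveMemory above, like the willWiden and applyIG lambdas elsewhere in this change, hands the planner a per-VF predicate and lets it clamp the VF range so one decision covers a whole sub-range. A minimal standalone sketch of that idea, assuming power-of-two candidate VFs and using only the standard library (the names here are illustrative stand-ins, not the planner's own API):

    #include <functional>

    // A half-open range of candidate vectorization factors [Start, End).
    struct VFRange {
      unsigned Start;
      unsigned End;
    };

    // Evaluate Predicate at Range.Start, then shrink Range.End so that every
    // VF still inside the range gives the same answer; the answer for
    // Range.Start is returned.
    static bool decideAndClampRange(const std::function<bool(unsigned)> &Predicate,
                                    VFRange &Range) {
      bool DecisionAtStart = Predicate(Range.Start);
      for (unsigned VF = Range.Start * 2; VF < Range.End; VF *= 2) {
        if (Predicate(VF) != DecisionAtStart) {
          Range.End = VF; // the first VF that disagrees closes this sub-range
          break;
        }
      }
      return DecisionAtStart;
    }

The applyIG lambda later in this diff plays the role of Predicate: an interleave group is kept only for the sub-range of VFs in which the cost model chose CM_Interleave.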
- assert(I == IG->getInsertPos() && - "Generating a recipe for an adjunct member of an interleave group"); - - VPValue *Mask = nullptr; - if (Legal->isMaskRequired(I)) - Mask = createBlockInMask(I->getParent(), Plan); - - return new VPInterleaveRecipe(IG, Mask); -} - VPWidenMemoryInstructionRecipe * VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, VPlanPtr &Plan) { @@ -6530,15 +6781,15 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, auto willWiden = [&](unsigned VF) -> bool { if (VF == 1) return false; - if (CM.isScalarAfterVectorization(I, VF) || - CM.isProfitableToScalarize(I, VF)) - return false; LoopVectorizationCostModel::InstWidening Decision = CM.getWideningDecision(I, VF); assert(Decision != LoopVectorizationCostModel::CM_Unknown && "CM decision should be taken at this point."); - assert(Decision != LoopVectorizationCostModel::CM_Interleave && - "Interleave memory opportunity should be caught earlier."); + if (Decision == LoopVectorizationCostModel::CM_Interleave) + return true; + if (CM.isScalarAfterVectorization(I, VF) || + CM.isProfitableToScalarize(I, VF)) + return false; return Decision != LoopVectorizationCostModel::CM_Scalarize; }; @@ -6549,7 +6800,8 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, if (Legal->isMaskRequired(I)) Mask = createBlockInMask(I->getParent(), Plan); - return new VPWidenMemoryInstructionRecipe(*I, Mask); + VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); + return new VPWidenMemoryInstructionRecipe(*I, Addr, Mask); } VPWidenIntOrFpInductionRecipe * @@ -6637,7 +6889,6 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, case Instruction::FPTrunc: case Instruction::FRem: case Instruction::FSub: - case Instruction::GetElementPtr: case Instruction::ICmp: case Instruction::IntToPtr: case Instruction::Load: @@ -6702,16 +6953,23 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) return false; + // If this ingredient's recipe is to be recorded, keep its recipe a singleton + // to avoid having to split recipes later. + bool IsSingleton = Ingredient2Recipe.count(I); + + // Success: widen this instruction. - // Success: widen this instruction. We optimize the common case where + // Use the default widening recipe. We optimize the common case where // consecutive instructions can be represented by a single recipe. - if (!VPBB->empty()) { - VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back()); - if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I)) - return true; - } + if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() && + LastExtensibleRecipe->appendInstruction(I)) + return true; - VPBB->appendRecipe(new VPWidenRecipe(I)); + VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I); + if (!IsSingleton) + LastExtensibleRecipe = WidenRecipe; + setRecipe(I, WidenRecipe); + VPBB->appendRecipe(WidenRecipe); return true; } @@ -6727,6 +6985,7 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated); + setRecipe(I, Recipe); // Find if I uses a predicated instruction. If so, it will use its scalar // value. 
Avoid hoisting the insert-element which packs the scalar value into @@ -6785,36 +7044,36 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range, VPlanPtr &Plan, VPBasicBlock *VPBB) { VPRecipeBase *Recipe = nullptr; - // Check if Instr should belong to an interleave memory recipe, or already - // does. In the latter case Instr is irrelevant. - if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) { - VPBB->appendRecipe(Recipe); - return true; - } - // Check if Instr is a memory operation that should be widened. - if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) { + // First, check for specific widening recipes that deal with memory + // operations, inductions and Phi nodes. + if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) || + (Recipe = tryToOptimizeInduction(Instr, Range)) || + (Recipe = tryToBlend(Instr, Plan)) || + (isa<PHINode>(Instr) && + (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) { + setRecipe(Instr, Recipe); VPBB->appendRecipe(Recipe); return true; } - // Check if Instr should form some PHI recipe. - if ((Recipe = tryToOptimizeInduction(Instr, Range))) { - VPBB->appendRecipe(Recipe); - return true; - } - if ((Recipe = tryToBlend(Instr, Plan))) { + // Handle GEP widening. + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) { + auto Scalarize = [&](unsigned VF) { + return CM.isScalarWithPredication(Instr, VF) || + CM.isScalarAfterVectorization(Instr, VF) || + CM.isProfitableToScalarize(Instr, VF); + }; + if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range)) + return false; + VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop); + setRecipe(Instr, Recipe); VPBB->appendRecipe(Recipe); return true; } - if (PHINode *Phi = dyn_cast<PHINode>(Instr)) { - VPBB->appendRecipe(new VPWidenPHIRecipe(Phi)); - return true; - } // Check if Instr is to be widened by a general VPWidenRecipe, after - // having first checked for specific widening recipes that deal with - // Interleave Groups, Inductions and Phi nodes. + // having first checked for specific widening recipes. if (tryToWiden(Instr, VPBB, Range)) return true; @@ -6840,8 +7099,15 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, // If the tail is to be folded by masking, the primary induction variable // needs to be represented in VPlan for it to model early-exit masking. - if (CM.foldTailByMasking()) + // Also, both the Phi and the live-out instruction of each reduction are + // required in order to introduce a select between them in VPlan. + if (CM.foldTailByMasking()) { NeedDef.insert(Legal->getPrimaryInduction()); + for (auto &Reduction : *Legal->getReductionVars()) { + NeedDef.insert(Reduction.first); + NeedDef.insert(Reduction.second.getLoopExitInstr()); + } + } // Collect instructions from the original loop that will become trivially dead // in the vectorized loop. We don't need to vectorize these instructions. For @@ -6852,30 +7118,72 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, SmallPtrSet<Instruction *, 4> DeadInstructions; collectTriviallyDeadInstructions(DeadInstructions); + DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); + // Dead instructions do not need sinking. Remove them from SinkAfter. 
+ for (Instruction *I : DeadInstructions) + SinkAfter.erase(I); + for (unsigned VF = MinVF; VF < MaxVF + 1;) { VFRange SubRange = {VF, MaxVF + 1}; - VPlans.push_back( - buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions)); + VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef, + DeadInstructions, SinkAfter)); VF = SubRange.End; } } VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, - SmallPtrSetImpl<Instruction *> &DeadInstructions) { + SmallPtrSetImpl<Instruction *> &DeadInstructions, + const DenseMap<Instruction *, Instruction *> &SinkAfter) { + // Hold a mapping from predicated instructions to their recipes, in order to // fix their AlsoPack behavior if a user is determined to replicate and use a // scalar instead of vector value. DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; - DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); - DenseMap<Instruction *, Instruction *> SinkAfterInverse; + SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; + + VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder); + + // --------------------------------------------------------------------------- + // Pre-construction: record ingredients whose recipes we'll need to further + // process after constructing the initial VPlan. + // --------------------------------------------------------------------------- + + // Mark instructions we'll need to sink later and their targets as + // ingredients whose recipe we'll need to record. + for (auto &Entry : SinkAfter) { + RecipeBuilder.recordRecipeOf(Entry.first); + RecipeBuilder.recordRecipeOf(Entry.second); + } + + // For each interleave group which is relevant for this (possibly trimmed) + // Range, add it to the set of groups to be later applied to the VPlan and add + // placeholders for its members' Recipes which we'll be replacing with a + // single VPInterleaveRecipe. + for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { + auto applyIG = [IG, this](unsigned VF) -> bool { + return (VF >= 2 && // Query is illegal for VF == 1 + CM.getWideningDecision(IG->getInsertPos(), VF) == + LoopVectorizationCostModel::CM_Interleave); + }; + if (!getDecisionAndClampRange(applyIG, Range)) + continue; + InterleaveGroups.insert(IG); + for (unsigned i = 0; i < IG->getFactor(); i++) + if (Instruction *Member = IG->getMember(i)) + RecipeBuilder.recordRecipeOf(Member); + }; + + // --------------------------------------------------------------------------- + // Build initial VPlan: Scan the body of the loop in a topological order to + // visit each basic block after having visited its predecessor basic blocks. + // --------------------------------------------------------------------------- // Create a dummy pre-entry VPBasicBlock to start building the VPlan. VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); - auto Plan = llvm::make_unique<VPlan>(VPBB); + auto Plan = std::make_unique<VPlan>(VPBB); - VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder); // Represent values that will have defs inside VPlan. for (Value *V : NeedDef) Plan->addVPValue(V); @@ -6894,10 +7202,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPBB = FirstVPBBForBB; Builder.setInsertPoint(VPBB); - std::vector<Instruction *> Ingredients; - - // Organize the ingredients to vectorize from current basic block in the - // right order. + // Introduce each ingredient into VPlan. 
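The pre-construction pass above only records which instructions will need their recipe replaced; the replacement itself happens after the initial plan is built, when each relevant interleave group collapses its members' memory recipes into a single recipe at the group's insert position. A plain-container sketch of that collapse step, with ints standing in for instructions (not VPlan code):

    #include <map>
    #include <memory>
    #include <vector>

    struct MemRecipe {
      int Instr;                // stand-in for the member instruction
      std::vector<int> Members; // non-empty only for the merged group recipe
    };

    using RecipeByInstr = std::map<int, std::shared_ptr<MemRecipe>>;

    // Replace the members' individual recipes with one grouped recipe, keyed
    // at the insert-position instruction; the other members simply disappear.
    void formInterleaveRecipe(RecipeByInstr &Recipes,
                              const std::vector<int> &Members, int InsertPos) {
      auto Grouped = std::make_shared<MemRecipe>();
      Grouped->Instr = InsertPos;
      Grouped->Members = Members;
      for (int M : Members)
        Recipes.erase(M);          // drop each member's placeholder recipe
      Recipes[InsertPos] = Grouped; // the group recipe sits at the insert position
    }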
for (Instruction &I : BB->instructionsWithoutDebug()) { Instruction *Instr = &I; @@ -6907,43 +7212,6 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( DeadInstructions.find(Instr) != DeadInstructions.end()) continue; - // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct - // member of the IG, do not construct any Recipe for it. - const InterleaveGroup<Instruction> *IG = - CM.getInterleavedAccessGroup(Instr); - if (IG && Instr != IG->getInsertPos() && - Range.Start >= 2 && // Query is illegal for VF == 1 - CM.getWideningDecision(Instr, Range.Start) == - LoopVectorizationCostModel::CM_Interleave) { - auto SinkCandidate = SinkAfterInverse.find(Instr); - if (SinkCandidate != SinkAfterInverse.end()) - Ingredients.push_back(SinkCandidate->second); - continue; - } - - // Move instructions to handle first-order recurrences, step 1: avoid - // handling this instruction until after we've handled the instruction it - // should follow. - auto SAIt = SinkAfter.find(Instr); - if (SAIt != SinkAfter.end()) { - LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after" - << *SAIt->second - << " to vectorize a 1st order recurrence.\n"); - SinkAfterInverse[SAIt->second] = Instr; - continue; - } - - Ingredients.push_back(Instr); - - // Move instructions to handle first-order recurrences, step 2: push the - // instruction to be sunk at its insertion point. - auto SAInvIt = SinkAfterInverse.find(Instr); - if (SAInvIt != SinkAfterInverse.end()) - Ingredients.push_back(SAInvIt->second); - } - - // Introduce each ingredient into VPlan. - for (Instruction *Instr : Ingredients) { if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB)) continue; @@ -6968,6 +7236,45 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPBlockUtils::disconnectBlocks(PreEntry, Entry); delete PreEntry; + // --------------------------------------------------------------------------- + // Transform initial VPlan: Apply previously taken decisions, in order, to + // bring the VPlan to its final state. + // --------------------------------------------------------------------------- + + // Apply Sink-After legal constraints. + for (auto &Entry : SinkAfter) { + VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); + VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); + Sink->moveAfter(Target); + } + + // Interleave memory: for each Interleave Group we marked earlier as relevant + // for this VPlan, replace the Recipes widening its memory instructions with a + // single VPInterleaveRecipe at its insertion point. + for (auto IG : InterleaveGroups) { + auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( + RecipeBuilder.getRecipe(IG->getInsertPos())); + (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask())) + ->insertBefore(Recipe); + + for (unsigned i = 0; i < IG->getFactor(); ++i) + if (Instruction *Member = IG->getMember(i)) { + RecipeBuilder.getRecipe(Member)->eraseFromParent(); + } + } + + // Finally, if tail is folded by masking, introduce selects between the phi + // and the live-out instruction of each reduction, at the end of the latch. 
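A scalar model of why this select is needed when the tail is folded by masking: in the final, partially masked iteration the inactive lanes must keep the reduction phi's old value, otherwise their leftover contents would be folded into the final horizontal reduction. An add reduction is assumed here purely for concreteness:

    #include <array>
    #include <cstddef>
    #include <cstdint>

    template <std::size_t W>
    std::array<int64_t, W> maskedReductionStep(const std::array<int64_t, W> &Phi,
                                               const std::array<int64_t, W> &Val,
                                               const std::array<bool, W> &Mask) {
      std::array<int64_t, W> Next{};
      for (std::size_t Lane = 0; Lane < W; ++Lane) {
        int64_t Updated = Phi[Lane] + Val[Lane];       // the loop-exit value
        Next[Lane] = Mask[Lane] ? Updated : Phi[Lane]; // the select added here
      }
      return Next;
    }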
+ if (CM.foldTailByMasking()) { + Builder.setInsertPoint(VPBB); + auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); + for (auto &Reduction : *Legal->getReductionVars()) { + VPValue *Phi = Plan->getVPValue(Reduction.first); + VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr()); + Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); + } + } + std::string PlanName; raw_string_ostream RSO(PlanName); unsigned VF = Range.Start; @@ -6993,7 +7300,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); // Create new empty VPlan - auto Plan = llvm::make_unique<VPlan>(); + auto Plan = std::make_unique<VPlan>(); // Build hierarchical CFG VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); @@ -7012,9 +7319,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { } SmallPtrSet<Instruction *, 1> DeadInstructions; - VPlanHCFGTransforms::VPInstructionsToVPRecipes( - Plan, Legal->getInductionVars(), DeadInstructions); - + VPlanTransforms::VPInstructionsToVPRecipes( + OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions); return Plan; } @@ -7023,13 +7329,21 @@ getOrCreateVectorValues(Value *V, unsigned Part) { return ILV.getOrCreateVectorValue(V, Part); } +Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( + Value *V, const VPIteration &Instance) { + return ILV.getOrCreateScalarValue(V, Instance); +} + void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const { O << " +\n" << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; IG->getInsertPos()->printAsOperand(O, false); - if (User) { + O << ", "; + getAddr()->printAsOperand(O); + VPValue *Mask = getMask(); + if (Mask) { O << ", "; - User->getOperand(0)->printAsOperand(O); + Mask->printAsOperand(O); } O << "\\l\""; for (unsigned i = 0; i < IG->getFactor(); ++i) @@ -7043,6 +7357,11 @@ void VPWidenRecipe::execute(VPTransformState &State) { State.ILV->widenInstruction(Instr); } +void VPWidenGEPRecipe::execute(VPTransformState &State) { + State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant, + IsIndexLoopInvariant); +} + void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Int or FP induction being replicated."); State.ILV->widenIntOrFpInduction(IV, Trunc); @@ -7093,15 +7412,8 @@ void VPBlendRecipe::execute(VPTransformState &State) { void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Interleave group being replicated."); - if (!User) - return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos()); - - // Last (and currently only) operand is a mask. - InnerLoopVectorizer::VectorParts MaskValues(State.UF); - VPValue *Mask = User->getOperand(User->getNumOperands() - 1); - for (unsigned Part = 0; Part < State.UF; ++Part) - MaskValues[Part] = State.get(Mask, Part); - State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues); + State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), State, getAddr(), + getMask()); } void VPReplicateRecipe::execute(VPTransformState &State) { @@ -7188,15 +7500,46 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) { } void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { - if (!User) - return State.ILV->vectorizeMemoryInstruction(&Instr); + State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), getMask()); +} - // Last (and currently only) operand is a mask. 
- InnerLoopVectorizer::VectorParts MaskValues(State.UF); - VPValue *Mask = User->getOperand(User->getNumOperands() - 1); - for (unsigned Part = 0; Part < State.UF; ++Part) - MaskValues[Part] = State.get(Mask, Part); - State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues); +// Determine how to lower the scalar epilogue, which depends on 1) optimising +// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing +// predication, and 4) a TTI hook that analyses whether the loop is suitable +// for predication. +static ScalarEpilogueLowering getScalarEpilogueLowering( + Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, + AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, + LoopVectorizationLegality &LVL) { + bool OptSize = + F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, + PGSOQueryType::IRPass); + // 1) OptSize takes precedence over all other options, i.e. if this is set, + // don't look at hints or options, and don't request a scalar epilogue. + if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) + return CM_ScalarEpilogueNotAllowedOptSize; + + bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() && + !PreferPredicateOverEpilog; + + // 2) Next, if disabling predication is requested on the command line, honour + // this and request a scalar epilogue. Also do this if we don't have a + // primary induction variable, which is required for predication. + if (PredicateOptDisabled || !LVL.getPrimaryInduction()) + return CM_ScalarEpilogueAllowed; + + // 3) and 4) look if enabling predication is requested on the command line, + // with a loop hint, or if the TTI hook indicates this is profitable, request + // predication . + if (PreferPredicateOverEpilog || + Hints.getPredicate() == LoopVectorizeHints::FK_Enabled || + (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, + LVL.getLAI()) && + Hints.getPredicate() != LoopVectorizeHints::FK_Disabled)) + return CM_ScalarEpilogueNotNeededUsePredicate; + + return CM_ScalarEpilogueAllowed; } // Process the loop in the VPlan-native vectorization path. This path builds @@ -7213,25 +7556,22 @@ static bool processLoopInVPlanNativePath( assert(EnableVPlanNativePath && "VPlan-native path is disabled."); Function *F = L->getHeader()->getParent(); InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); - LoopVectorizationCostModel CM(L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, + + ScalarEpilogueLowering SEL = getScalarEpilogueLowering( + F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); + + LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, &Hints, IAI); // Use the planner for outer loop vectorization. // TODO: CM is not used at this point inside the planner. Turn CM into an // optional argument if we don't need it in the future. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI); // Get user vectorization factor. const unsigned UserVF = Hints.getWidth(); - // Check the function attributes and profiles to find out if this function - // should be optimized for size. - bool OptForSize = - Hints.getForce() != LoopVectorizeHints::FK_Enabled && - (F->hasOptSize() || - llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)); - // Plan how to best vectorize, return the best VF and its cost. 
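The four outcomes that getScalarEpilogueLowering can produce are scattered across this change; restated in one place for orientation (the enum's actual declaration and ordering live outside this hunk):

    enum ScalarEpilogueLowering {
      CM_ScalarEpilogueAllowed,               // default: emit a scalar remainder loop
      CM_ScalarEpilogueNotAllowedOptSize,     // optimizing for size: no epilogue
      CM_ScalarEpilogueNotAllowedLowTripLoop, // tiny trip count: no epilogue
      CM_ScalarEpilogueNotNeededUsePredicate  // fold the tail by masking instead
    };

Optimizing for size wins over everything else, an explicit request not to predicate comes next, then predication by flag, hint, or the TTI hook; the low-trip-count case is assigned separately in processLoop when the estimated trip count is tiny.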
- const VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF); + const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); // If we are stress testing VPlan builds, do not attempt to generate vector // code. Masked vector code generation support will follow soon. @@ -7310,10 +7650,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Check the function attributes and profiles to find out if this function // should be optimized for size. - bool OptForSize = - Hints.getForce() != LoopVectorizeHints::FK_Enabled && - (F->hasOptSize() || - llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)); + ScalarEpilogueLowering SEL = getScalarEpilogueLowering( + F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); // Entrance to the VPlan-native vectorization path. Outer loops are processed // here. They may require CFG and instruction level transformations before @@ -7325,36 +7663,11 @@ bool LoopVectorizePass::processLoop(Loop *L) { ORE, BFI, PSI, Hints); assert(L->empty() && "Inner loop expected."); + // Check the loop for a trip count threshold: vectorize loops with a tiny trip // count by optimizing for size, to minimize overheads. - // Prefer constant trip counts over profile data, over upper bound estimate. - unsigned ExpectedTC = 0; - bool HasExpectedTC = false; - if (const SCEVConstant *ConstExits = - dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) { - const APInt &ExitsCount = ConstExits->getAPInt(); - // We are interested in small values for ExpectedTC. Skip over those that - // can't fit an unsigned. - if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) { - ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1; - HasExpectedTC = true; - } - } - // ExpectedTC may be large because it's bound by a variable. Check - // profiling information to validate we should vectorize. - if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) { - auto EstimatedTC = getLoopEstimatedTripCount(L); - if (EstimatedTC) { - ExpectedTC = *EstimatedTC; - HasExpectedTC = true; - } - } - if (!HasExpectedTC) { - ExpectedTC = SE->getSmallConstantMaxTripCount(L); - HasExpectedTC = (ExpectedTC > 0); - } - - if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) { + auto ExpectedTC = getSmallBestKnownTC(*SE, L); + if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << "This loop is worth vectorizing only if no scalar " << "iteration overheads are incurred."); @@ -7362,10 +7675,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); else { LLVM_DEBUG(dbgs() << "\n"); - // Loops with a very small trip count are considered for vectorization - // under OptForSize, thereby making sure the cost of their loop body is - // dominant, free of runtime guards and scalar iteration overheads. - OptForSize = true; + SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; } } @@ -7374,11 +7684,10 @@ bool LoopVectorizePass::processLoop(Loop *L) { // an integer loop and the vector instructions selected are purely integer // vector instructions? 
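The block of trip-count logic removed above is what the new getSmallBestKnownTC(*SE, L) call stands for: prefer an exact constant trip count, then a profile-based estimate, then the maximum trip-count bound. A behavioural sketch over plain inputs (the real helper queries ScalarEvolution and profile metadata; this signature is made up for illustration):

    #include <optional>

    std::optional<unsigned>
    smallBestKnownTripCount(std::optional<unsigned> ConstantTC,
                            std::optional<unsigned> ProfileEstimatedTC,
                            unsigned MaxTripCountBound) {
      if (ConstantTC)
        return ConstantTC;         // exact count known at compile time
      if (ProfileEstimatedTC)
        return ProfileEstimatedTC; // PGO-based estimate
      if (MaxTripCountBound > 0)
        return MaxTripCountBound;  // conservative upper bound
      return std::nullopt;         // nothing known about the trip count
    }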
if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { - LLVM_DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat" - "attribute is used.\n"); - ORE->emit(createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(), - "NoImplicitFloat", L) - << "loop not vectorized due to NoImplicitFloat attribute"); + reportVectorizationFailure( + "Can't vectorize when the NoImplicitFloat attribute is used", + "loop not vectorized due to NoImplicitFloat attribute", + "NoImplicitFloat", ORE, L); Hints.emitRemarkWithHints(); return false; } @@ -7389,11 +7698,10 @@ bool LoopVectorizePass::processLoop(Loop *L) { // additional fp-math flags can help. if (Hints.isPotentiallyUnsafe() && TTI->isFPVectorizationPotentiallyUnsafe()) { - LLVM_DEBUG( - dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n"); - ORE->emit( - createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L) - << "loop not vectorized due to unsafe FP support."); + reportVectorizationFailure( + "Potentially unsafe FP op prevents vectorization", + "loop not vectorized due to unsafe FP support.", + "UnsafeFP", ORE, L); Hints.emitRemarkWithHints(); return false; } @@ -7411,18 +7719,18 @@ bool LoopVectorizePass::processLoop(Loop *L) { } // Use the cost model. - LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F, - &Hints, IAI); + LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, + F, &Hints, IAI); CM.collectValuesToIgnore(); // Use the planner for vectorization. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI); // Get user vectorization factor. unsigned UserVF = Hints.getWidth(); // Plan how to best vectorize, return the best VF and its cost. - Optional<VectorizationFactor> MaybeVF = LVP.plan(OptForSize, UserVF); + Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF); VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; @@ -7431,7 +7739,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { if (MaybeVF) { VF = *MaybeVF; // Select the interleave count. - IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost); + IC = CM.selectInterleaveCount(VF.Width, VF.Cost); } // Identify the diagnostic messages that should be produced. @@ -7609,7 +7917,8 @@ bool LoopVectorizePass::runImpl( // The second condition is necessary because, even if the target has no // vector registers, loop vectorization may still enable scalar // interleaving. 
- if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2) + if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && + TTI->getMaxInterleaveFactor(1) < 2) return false; bool Changed = false; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 27a86c0bca91..479bca83b51e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -26,6 +26,7 @@ #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" @@ -72,6 +73,7 @@ #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -127,6 +129,10 @@ static cl::opt<int> MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits")); +static cl::opt<int> +MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden, + cl::desc("Maximum depth of the lookup for consecutive stores.")); + /// Limits the size of scheduling regions in a block. /// It avoid long compile times for _very_ large blocks where vector /// instructions are spread over a wide range. @@ -147,6 +153,20 @@ static cl::opt<unsigned> MinTreeSize( "slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable")); +// The maximum depth that the look-ahead score heuristic will explore. +// The higher this value, the higher the compilation time overhead. +static cl::opt<int> LookAheadMaxDepth( + "slp-max-look-ahead-depth", cl::init(2), cl::Hidden, + cl::desc("The maximum look-ahead depth for operand reordering scores")); + +// The Look-ahead heuristic goes through the users of the bundle to calculate +// the users cost in getExternalUsesCost(). To avoid compilation time increase +// we limit the number of users visited to this value. +static cl::opt<unsigned> LookAheadUsersBudget( + "slp-look-ahead-users-budget", cl::init(2), cl::Hidden, + cl::desc("The maximum number of users to visit while visiting the " + "predecessors. This prevents compilation time increase.")); + static cl::opt<bool> ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz")); @@ -194,10 +214,13 @@ static bool allSameBlock(ArrayRef<Value *> VL) { return true; } -/// \returns True if all of the values in \p VL are constants. +/// \returns True if all of the values in \p VL are constants (but not +/// globals/constant expressions). static bool allConstant(ArrayRef<Value *> VL) { + // Constant expressions and globals can't be vectorized like normal integer/FP + // constants. for (Value *i : VL) - if (!isa<Constant>(i)) + if (!isa<Constant>(i) || isa<ConstantExpr>(i) || isa<GlobalValue>(i)) return false; return true; } @@ -354,6 +377,18 @@ static Value *isOneOf(const InstructionsState &S, Value *Op) { return S.OpValue; } +/// \returns true if \p Opcode is allowed as part of of the main/alternate +/// instruction for SLP vectorization. +/// +/// Example of unsupported opcode is SDIV that can potentially cause UB if the +/// "shuffled out" lane would result in division by zero. 
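A concrete illustration of the comment above. In an alternate-opcode bundle both operations are executed on every lane and a shuffle then picks the wanted lane from each result, so the lanes that get shuffled out are still evaluated; that is harmless for add/sub but would be undefined behaviour for sdiv/srem when a discarded lane divides by zero:

    #include <array>

    std::array<int, 2> altShuffleAddSub(std::array<int, 2> A,
                                        std::array<int, 2> B) {
      std::array<int, 2> Sums{A[0] + B[0], A[1] + B[1]};  // opcode 0 on all lanes
      std::array<int, 2> Diffs{A[0] - B[0], A[1] - B[1]}; // opcode 1 on all lanes
      return {Sums[0], Diffs[1]}; // shuffle: lane 0 from the adds, lane 1 from
                                  // the subs; the discarded lanes cost nothing.
                                  // With division, the discarded lanes would
                                  // still have been evaluated and might trap.
    }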
+static bool isValidForAlternation(unsigned Opcode) { + if (Instruction::isIntDivRem(Opcode)) + return false; + + return true; +} + /// \returns analysis of the Instructions in \p VL described in /// InstructionsState, the Opcode that we suppose the whole list /// could be vectorized even if its structure is diverse. @@ -376,7 +411,8 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL, if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) { if (InstOpcode == Opcode || InstOpcode == AltOpcode) continue; - if (Opcode == AltOpcode) { + if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) && + isValidForAlternation(Opcode)) { AltOpcode = InstOpcode; AltIndex = Cnt; continue; @@ -388,6 +424,9 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL, if (InstOpcode == Opcode || InstOpcode == AltOpcode) continue; if (Opcode == AltOpcode) { + assert(isValidForAlternation(Opcode) && + isValidForAlternation(InstOpcode) && + "Cast isn't safe for alternation, logic needs to be updated!"); AltOpcode = InstOpcode; AltIndex = Cnt; continue; @@ -486,6 +525,7 @@ namespace slpvectorizer { /// Bottom Up SLP Vectorizer. class BoUpSLP { struct TreeEntry; + struct ScheduleData; public: using ValueList = SmallVector<Value *, 8>; @@ -543,7 +583,7 @@ public: /// Construct a vectorizable tree that starts at \p Roots, ignoring users for /// the purpose of scheduling and extraction in the \p UserIgnoreLst taking - /// into account (anf updating it, if required) list of externally used + /// into account (and updating it, if required) list of externally used /// values stored in \p ExternallyUsedValues. void buildTree(ArrayRef<Value *> Roots, ExtraValueToDebugLocsMap &ExternallyUsedValues, @@ -605,7 +645,10 @@ public: return MinVecRegSize; } - /// Check if ArrayType or StructType is isomorphic to some VectorType. + /// Check if homogeneous aggregate is isomorphic to some VectorType. + /// Accepts homogeneous multidimensional aggregate of scalars/vectors like + /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> }, + /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on. /// /// \returns number of elements in vector if isomorphism exists, 0 otherwise. unsigned canMapToVector(Type *T, const DataLayout &DL) const; @@ -614,6 +657,15 @@ public: /// vectorizable. We do not vectorize such trees. bool isTreeTinyAndNotFullyVectorizable() const; + /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values + /// can be load combined in the backend. Load combining may not be allowed in + /// the IR optimizer, so we do not want to alter the pattern. For example, + /// partially transforming a scalar bswap() pattern into vector code is + /// effectively impossible for the backend to undo. + /// TODO: If load combining is allowed in the IR optimizer, this analysis + /// may not be necessary. + bool isLoadCombineReductionCandidate(unsigned ReductionOpcode) const; + OptimizationRemarkEmitter *getORE() { return ORE; } /// This structure holds any data we need about the edges being traversed @@ -708,6 +760,7 @@ public: const DataLayout &DL; ScalarEvolution &SE; + const BoUpSLP &R; /// \returns the operand data at \p OpIdx and \p Lane. OperandData &getData(unsigned OpIdx, unsigned Lane) { @@ -733,6 +786,227 @@ public: std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]); } + // The hard-coded scores listed here are not very important. When computing + // the scores of matching one sub-tree with another, we are basically + // counting the number of values that are matching. 
So even if all scores + // are set to 1, we would still get a decent matching result. + // However, sometimes we have to break ties. For example we may have to + // choose between matching loads vs matching opcodes. This is what these + // scores are helping us with: they provide the order of preference. + + /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]). + static const int ScoreConsecutiveLoads = 3; + /// ExtractElementInst from same vector and consecutive indexes. + static const int ScoreConsecutiveExtracts = 3; + /// Constants. + static const int ScoreConstants = 2; + /// Instructions with the same opcode. + static const int ScoreSameOpcode = 2; + /// Instructions with alt opcodes (e.g, add + sub). + static const int ScoreAltOpcodes = 1; + /// Identical instructions (a.k.a. splat or broadcast). + static const int ScoreSplat = 1; + /// Matching with an undef is preferable to failing. + static const int ScoreUndef = 1; + /// Score for failing to find a decent match. + static const int ScoreFail = 0; + /// User external to the vectorized code. + static const int ExternalUseCost = 1; + /// The user is internal but in a different lane. + static const int UserInDiffLaneCost = ExternalUseCost; + + /// \returns the score of placing \p V1 and \p V2 in consecutive lanes. + static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL, + ScalarEvolution &SE) { + auto *LI1 = dyn_cast<LoadInst>(V1); + auto *LI2 = dyn_cast<LoadInst>(V2); + if (LI1 && LI2) + return isConsecutiveAccess(LI1, LI2, DL, SE) + ? VLOperands::ScoreConsecutiveLoads + : VLOperands::ScoreFail; + + auto *C1 = dyn_cast<Constant>(V1); + auto *C2 = dyn_cast<Constant>(V2); + if (C1 && C2) + return VLOperands::ScoreConstants; + + // Extracts from consecutive indexes of the same vector better score as + // the extracts could be optimized away. + auto *Ex1 = dyn_cast<ExtractElementInst>(V1); + auto *Ex2 = dyn_cast<ExtractElementInst>(V2); + if (Ex1 && Ex2 && Ex1->getVectorOperand() == Ex2->getVectorOperand() && + cast<ConstantInt>(Ex1->getIndexOperand())->getZExtValue() + 1 == + cast<ConstantInt>(Ex2->getIndexOperand())->getZExtValue()) { + return VLOperands::ScoreConsecutiveExtracts; + } + + auto *I1 = dyn_cast<Instruction>(V1); + auto *I2 = dyn_cast<Instruction>(V2); + if (I1 && I2) { + if (I1 == I2) + return VLOperands::ScoreSplat; + InstructionsState S = getSameOpcode({I1, I2}); + // Note: Only consider instructions with <= 2 operands to avoid + // complexity explosion. + if (S.getOpcode() && S.MainOp->getNumOperands() <= 2) + return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes + : VLOperands::ScoreSameOpcode; + } + + if (isa<UndefValue>(V2)) + return VLOperands::ScoreUndef; + + return VLOperands::ScoreFail; + } + + /// Holds the values and their lane that are taking part in the look-ahead + /// score calculation. This is used in the external uses cost calculation. + SmallDenseMap<Value *, int> InLookAheadValues; + + /// \Returns the additional cost due to uses of \p LHS and \p RHS that are + /// either external to the vectorized code, or require shuffling. + int getExternalUsesCost(const std::pair<Value *, int> &LHS, + const std::pair<Value *, int> &RHS) { + int Cost = 0; + SmallVector<std::pair<Value *, int>, 2> Values = {LHS, RHS}; + for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) { + Value *V = Values[Idx].first; + // Calculate the absolute lane, using the minimum relative lane of LHS + // and RHS as base and Idx as the offset. 
+ int Ln = std::min(LHS.second, RHS.second) + Idx; + assert(Ln >= 0 && "Bad lane calculation"); + unsigned UsersBudget = LookAheadUsersBudget; + for (User *U : V->users()) { + if (const TreeEntry *UserTE = R.getTreeEntry(U)) { + // The user is in the VectorizableTree. Check if we need to insert. + auto It = llvm::find(UserTE->Scalars, U); + assert(It != UserTE->Scalars.end() && "U is in UserTE"); + int UserLn = std::distance(UserTE->Scalars.begin(), It); + assert(UserLn >= 0 && "Bad lane"); + if (UserLn != Ln) + Cost += UserInDiffLaneCost; + } else { + // Check if the user is in the look-ahead code. + auto It2 = InLookAheadValues.find(U); + if (It2 != InLookAheadValues.end()) { + // The user is in the look-ahead code. Check the lane. + if (It2->second != Ln) + Cost += UserInDiffLaneCost; + } else { + // The user is neither in SLP tree nor in the look-ahead code. + Cost += ExternalUseCost; + } + } + // Limit the number of visited uses to cap compilation time. + if (--UsersBudget == 0) + break; + } + } + return Cost; + } + + /// Go through the operands of \p LHS and \p RHS recursively until \p + /// MaxLevel, and return the cumulative score. For example: + /// \verbatim + /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1] + /// \ / \ / \ / \ / + /// + + + + + /// G1 G2 G3 G4 + /// \endverbatim + /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at + /// each level recursively, accumulating the score. It starts from matching + /// the additions at level 0, then moves on to the loads (level 1). The + /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and + /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while + /// {A[0],C[0]} has a score of VLOperands::ScoreFail. + /// Please note that the order of the operands does not matter, as we + /// evaluate the score of all profitable combinations of operands. In + /// other words the score of G1 and G4 is the same as G1 and G2. This + /// heuristic is based on ideas described in: + /// Look-ahead SLP: Auto-vectorization in the presence of commutative + /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha, + /// Luís F. W. Góes + int getScoreAtLevelRec(const std::pair<Value *, int> &LHS, + const std::pair<Value *, int> &RHS, int CurrLevel, + int MaxLevel) { + + Value *V1 = LHS.first; + Value *V2 = RHS.first; + // Get the shallow score of V1 and V2. + int ShallowScoreAtThisLevel = + std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) - + getExternalUsesCost(LHS, RHS)); + int Lane1 = LHS.second; + int Lane2 = RHS.second; + + // If reached MaxLevel, + // or if V1 and V2 are not instructions, + // or if they are SPLAT, + // or if they are not consecutive, early return the current cost. + auto *I1 = dyn_cast<Instruction>(V1); + auto *I2 = dyn_cast<Instruction>(V2); + if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 || + ShallowScoreAtThisLevel == VLOperands::ScoreFail || + (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel)) + return ShallowScoreAtThisLevel; + assert(I1 && I2 && "Should have early exited."); + + // Keep track of in-tree values for determining the external-use cost. + InLookAheadValues[V1] = Lane1; + InLookAheadValues[V2] = Lane2; + + // Contains the I2 operand indexes that got matched with I1 operands. + SmallSet<unsigned, 4> Op2Used; + + // Recursion towards the operands of I1 and I2. We are trying all possible + // operand pairs, and keeping track of the best score. 
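A numeric walk-through of the G1..G4 example from the comment block above, using the documented score constants and ignoring the external-use cost term: matching the two adds scores 2, and each pair of loads from consecutive addresses scores 3, so pairing G1 with G2 (or, per the comment, equivalently with G4) beats pairing G1 with G3, whose operands do not match at all:

    #include <cassert>

    int main() {
      const int ScoreSameOpcode = 2, ScoreConsecutiveLoads = 3, ScoreFail = 0;
      int ScoreG1G2 = ScoreSameOpcode + ScoreConsecutiveLoads + ScoreConsecutiveLoads;
      int ScoreG1G3 = ScoreSameOpcode + ScoreFail + ScoreFail;
      assert(ScoreG1G2 == 8 && ScoreG1G3 == 2);
      // Operand reordering therefore places G2 in the lane next to G1: the
      // deeper operand match promises consecutive vector loads, not gathers.
      return ScoreG1G2 > ScoreG1G3 ? 0 : 1;
    }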
+ for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands(); + OpIdx1 != NumOperands1; ++OpIdx1) { + // Try to pair op1I with the best operand of I2. + int MaxTmpScore = 0; + unsigned MaxOpIdx2 = 0; + bool FoundBest = false; + // If I2 is commutative try all combinations. + unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1; + unsigned ToIdx = isCommutative(I2) + ? I2->getNumOperands() + : std::min(I2->getNumOperands(), OpIdx1 + 1); + assert(FromIdx <= ToIdx && "Bad index"); + for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) { + // Skip operands already paired with OpIdx1. + if (Op2Used.count(OpIdx2)) + continue; + // Recursively calculate the cost at each level + int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1}, + {I2->getOperand(OpIdx2), Lane2}, + CurrLevel + 1, MaxLevel); + // Look for the best score. + if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) { + MaxTmpScore = TmpScore; + MaxOpIdx2 = OpIdx2; + FoundBest = true; + } + } + if (FoundBest) { + // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it. + Op2Used.insert(MaxOpIdx2); + ShallowScoreAtThisLevel += MaxTmpScore; + } + } + return ShallowScoreAtThisLevel; + } + + /// \Returns the look-ahead score, which tells us how much the sub-trees + /// rooted at \p LHS and \p RHS match, the more they match the higher the + /// score. This helps break ties in an informed way when we cannot decide on + /// the order of the operands by just considering the immediate + /// predecessors. + int getLookAheadScore(const std::pair<Value *, int> &LHS, + const std::pair<Value *, int> &RHS) { + InLookAheadValues.clear(); + return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth); + } + // Search all operands in Ops[*][Lane] for the one that matches best // Ops[OpIdx][LastLane] and return its opreand index. // If no good match can be found, return None. @@ -750,9 +1024,6 @@ public: // The linearized opcode of the operand at OpIdx, Lane. bool OpIdxAPO = getData(OpIdx, Lane).APO; - const unsigned BestScore = 2; - const unsigned GoodScore = 1; - // The best operand index and its score. // Sometimes we have more than one option (e.g., Opcode and Undefs), so we // are using the score to differentiate between the two. @@ -781,41 +1052,19 @@ public: // Look for an operand that matches the current mode. switch (RMode) { case ReorderingMode::Load: - if (isa<LoadInst>(Op)) { - // Figure out which is left and right, so that we can check for - // consecutive loads - bool LeftToRight = Lane > LastLane; - Value *OpLeft = (LeftToRight) ? OpLastLane : Op; - Value *OpRight = (LeftToRight) ? Op : OpLastLane; - if (isConsecutiveAccess(cast<LoadInst>(OpLeft), - cast<LoadInst>(OpRight), DL, SE)) - BestOp.Idx = Idx; - } - break; - case ReorderingMode::Opcode: - // We accept both Instructions and Undefs, but with different scores. - if ((isa<Instruction>(Op) && isa<Instruction>(OpLastLane) && - cast<Instruction>(Op)->getOpcode() == - cast<Instruction>(OpLastLane)->getOpcode()) || - (isa<UndefValue>(OpLastLane) && isa<Instruction>(Op)) || - isa<UndefValue>(Op)) { - // An instruction has a higher score than an undef. - unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore; - if (Score > BestOp.Score) { - BestOp.Idx = Idx; - BestOp.Score = Score; - } - } - break; case ReorderingMode::Constant: - if (isa<Constant>(Op)) { - unsigned Score = (isa<UndefValue>(Op)) ? 
GoodScore : BestScore; - if (Score > BestOp.Score) { - BestOp.Idx = Idx; - BestOp.Score = Score; - } + case ReorderingMode::Opcode: { + bool LeftToRight = Lane > LastLane; + Value *OpLeft = (LeftToRight) ? OpLastLane : Op; + Value *OpRight = (LeftToRight) ? Op : OpLastLane; + unsigned Score = + getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane}); + if (Score > BestOp.Score) { + BestOp.Idx = Idx; + BestOp.Score = Score; } break; + } case ReorderingMode::Splat: if (Op == OpLastLane) BestOp.Idx = Idx; @@ -946,8 +1195,8 @@ public: public: /// Initialize with all the operands of the instruction vector \p RootVL. VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL, - ScalarEvolution &SE) - : DL(DL), SE(SE) { + ScalarEvolution &SE, const BoUpSLP &R) + : DL(DL), SE(SE), R(R) { // Append all the operands of RootVL. appendOperandsOfVL(RootVL); } @@ -1117,6 +1366,14 @@ public: #endif }; + /// Checks if the instruction is marked for deletion. + bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); } + + /// Marks values operands for later deletion by replacing them with Undefs. + void eraseInstructions(ArrayRef<Value *> AV); + + ~BoUpSLP(); + private: /// Checks if all users of \p I are the part of the vectorization tree. bool areAllUsersVectorized(Instruction *I) const; @@ -1153,8 +1410,7 @@ private: /// Set the Builder insert point to one after the last instruction in /// the bundle - void setInsertPointAfterBundle(ArrayRef<Value *> VL, - const InstructionsState &S); + void setInsertPointAfterBundle(TreeEntry *E); /// \returns a vector from a collection of scalars in \p VL. Value *Gather(ArrayRef<Value *> VL, VectorType *Ty); @@ -1169,7 +1425,8 @@ private: SmallVectorImpl<Value *> &Left, SmallVectorImpl<Value *> &Right, const DataLayout &DL, - ScalarEvolution &SE); + ScalarEvolution &SE, + const BoUpSLP &R); struct TreeEntry { using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>; TreeEntry(VecTreeTy &Container) : Container(Container) {} @@ -1191,7 +1448,8 @@ private: Value *VectorizedValue = nullptr; /// Do we need to gather this sequence ? - bool NeedToGather = false; + enum EntryState { Vectorize, NeedToGather }; + EntryState State; /// Does this sequence require some shuffling? SmallVector<unsigned, 4> ReuseShuffleIndices; @@ -1220,27 +1478,37 @@ private: /// reordering of operands during buildTree_rec() and vectorizeTree(). SmallVector<ValueList, 2> Operands; + /// The main/alternate instruction. + Instruction *MainOp = nullptr; + Instruction *AltOp = nullptr; + public: /// Set this bundle's \p OpIdx'th operand to \p OpVL. - void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL, - ArrayRef<unsigned> ReuseShuffleIndices) { + void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) { if (Operands.size() < OpIdx + 1) Operands.resize(OpIdx + 1); assert(Operands[OpIdx].size() == 0 && "Already resized?"); Operands[OpIdx].resize(Scalars.size()); for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane) - Operands[OpIdx][Lane] = (!ReuseShuffleIndices.empty()) - ? OpVL[ReuseShuffleIndices[Lane]] - : OpVL[Lane]; - } - - /// If there is a user TreeEntry, then set its operand. - void trySetUserTEOperand(const EdgeInfo &UserTreeIdx, - ArrayRef<Value *> OpVL, - ArrayRef<unsigned> ReuseShuffleIndices) { - if (UserTreeIdx.UserTE) - UserTreeIdx.UserTE->setOperand(UserTreeIdx.EdgeIdx, OpVL, - ReuseShuffleIndices); + Operands[OpIdx][Lane] = OpVL[Lane]; + } + + /// Set the operands of this bundle in their original order. 
+ void setOperandsInOrder() { + assert(Operands.empty() && "Already initialized?"); + auto *I0 = cast<Instruction>(Scalars[0]); + Operands.resize(I0->getNumOperands()); + unsigned NumLanes = Scalars.size(); + for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands(); + OpIdx != NumOperands; ++OpIdx) { + Operands[OpIdx].resize(NumLanes); + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + auto *I = cast<Instruction>(Scalars[Lane]); + assert(I->getNumOperands() == NumOperands && + "Expected same number of operands"); + Operands[OpIdx][Lane] = I->getOperand(OpIdx); + } + } } /// \returns the \p OpIdx operand of this TreeEntry. @@ -1249,6 +1517,9 @@ private: return Operands[OpIdx]; } + /// \returns the number of operands. + unsigned getNumOperands() const { return Operands.size(); } + /// \return the single \p OpIdx operand. Value *getSingleOperand(unsigned OpIdx) const { assert(OpIdx < Operands.size() && "Off bounds"); @@ -1256,6 +1527,58 @@ private: return Operands[OpIdx][0]; } + /// Some of the instructions in the list have alternate opcodes. + bool isAltShuffle() const { + return getOpcode() != getAltOpcode(); + } + + bool isOpcodeOrAlt(Instruction *I) const { + unsigned CheckedOpcode = I->getOpcode(); + return (getOpcode() == CheckedOpcode || + getAltOpcode() == CheckedOpcode); + } + + /// Chooses the correct key for scheduling data. If \p Op has the same (or + /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is + /// \p OpValue. + Value *isOneOf(Value *Op) const { + auto *I = dyn_cast<Instruction>(Op); + if (I && isOpcodeOrAlt(I)) + return Op; + return MainOp; + } + + void setOperations(const InstructionsState &S) { + MainOp = S.MainOp; + AltOp = S.AltOp; + } + + Instruction *getMainOp() const { + return MainOp; + } + + Instruction *getAltOp() const { + return AltOp; + } + + /// The main/alternate opcodes for the list of instructions. + unsigned getOpcode() const { + return MainOp ? MainOp->getOpcode() : 0; + } + + unsigned getAltOpcode() const { + return AltOp ? AltOp->getOpcode() : 0; + } + + /// Update operations state of this entry if reorder occurred. + bool updateStateIfReorder() { + if (ReorderIndices.empty()) + return false; + InstructionsState S = getSameOpcode(Scalars, ReorderIndices.front()); + setOperations(S); + return true; + } + #ifndef NDEBUG /// Debug printer. 
LLVM_DUMP_METHOD void dump() const { @@ -1268,23 +1591,40 @@ private: dbgs() << "Scalars: \n"; for (Value *V : Scalars) dbgs().indent(2) << *V << "\n"; - dbgs() << "NeedToGather: " << NeedToGather << "\n"; + dbgs() << "State: "; + switch (State) { + case Vectorize: + dbgs() << "Vectorize\n"; + break; + case NeedToGather: + dbgs() << "NeedToGather\n"; + break; + } + dbgs() << "MainOp: "; + if (MainOp) + dbgs() << *MainOp << "\n"; + else + dbgs() << "NULL\n"; + dbgs() << "AltOp: "; + if (AltOp) + dbgs() << *AltOp << "\n"; + else + dbgs() << "NULL\n"; dbgs() << "VectorizedValue: "; if (VectorizedValue) - dbgs() << *VectorizedValue; + dbgs() << *VectorizedValue << "\n"; else - dbgs() << "NULL"; - dbgs() << "\n"; + dbgs() << "NULL\n"; dbgs() << "ReuseShuffleIndices: "; if (ReuseShuffleIndices.empty()) dbgs() << "Emtpy"; else - for (unsigned Idx : ReuseShuffleIndices) - dbgs() << Idx << ", "; + for (unsigned ReuseIdx : ReuseShuffleIndices) + dbgs() << ReuseIdx << ", "; dbgs() << "\n"; dbgs() << "ReorderIndices: "; - for (unsigned Idx : ReorderIndices) - dbgs() << Idx << ", "; + for (unsigned ReorderIdx : ReorderIndices) + dbgs() << ReorderIdx << ", "; dbgs() << "\n"; dbgs() << "UserTreeIndices: "; for (const auto &EInfo : UserTreeIndices) @@ -1295,23 +1635,36 @@ private: }; /// Create a new VectorizableTree entry. - TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, + TreeEntry *newTreeEntry(ArrayRef<Value *> VL, Optional<ScheduleData *> Bundle, + const InstructionsState &S, const EdgeInfo &UserTreeIdx, ArrayRef<unsigned> ReuseShuffleIndices = None, ArrayRef<unsigned> ReorderIndices = None) { - VectorizableTree.push_back(llvm::make_unique<TreeEntry>(VectorizableTree)); + bool Vectorized = (bool)Bundle; + VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree)); TreeEntry *Last = VectorizableTree.back().get(); Last->Idx = VectorizableTree.size() - 1; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); - Last->NeedToGather = !Vectorized; + Last->State = Vectorized ? TreeEntry::Vectorize : TreeEntry::NeedToGather; Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end()); Last->ReorderIndices = ReorderIndices; + Last->setOperations(S); if (Vectorized) { for (int i = 0, e = VL.size(); i != e; ++i) { assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); - ScalarToTreeEntry[VL[i]] = Last->Idx; - } + ScalarToTreeEntry[VL[i]] = Last; + } + // Update the scheduler bundle to point to this TreeEntry. + unsigned Lane = 0; + for (ScheduleData *BundleMember = Bundle.getValue(); BundleMember; + BundleMember = BundleMember->NextInBundle) { + BundleMember->TE = Last; + BundleMember->Lane = Lane; + ++Lane; + } + assert((!Bundle.getValue() || Lane == VL.size()) && + "Bundle and VL out of sync"); } else { MustGather.insert(VL.begin(), VL.end()); } @@ -1319,7 +1672,6 @@ private: if (UserTreeIdx.UserTE) Last->UserTreeIndices.push_back(UserTreeIdx); - Last->trySetUserTEOperand(UserTreeIdx, VL, ReuseShuffleIndices); return Last; } @@ -1340,19 +1692,19 @@ private: TreeEntry *getTreeEntry(Value *V) { auto I = ScalarToTreeEntry.find(V); if (I != ScalarToTreeEntry.end()) - return VectorizableTree[I->second].get(); + return I->second; return nullptr; } const TreeEntry *getTreeEntry(Value *V) const { auto I = ScalarToTreeEntry.find(V); if (I != ScalarToTreeEntry.end()) - return VectorizableTree[I->second].get(); + return I->second; return nullptr; } /// Maps a specific scalar to its tree entry. 
- SmallDenseMap<Value*, int> ScalarToTreeEntry; + SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry; /// A list of scalars that we found that we need to keep as scalars. ValueSet MustGather; @@ -1408,15 +1760,14 @@ private: /// This is required to ensure that there are no incorrect collisions in the /// AliasCache, which can happen if a new instruction is allocated at the /// same address as a previously deleted instruction. - void eraseInstruction(Instruction *I) { - I->removeFromParent(); - I->dropAllReferences(); - DeletedInstructions.emplace_back(I); + void eraseInstruction(Instruction *I, bool ReplaceOpsWithUndef = false) { + auto It = DeletedInstructions.try_emplace(I, ReplaceOpsWithUndef).first; + It->getSecond() = It->getSecond() && ReplaceOpsWithUndef; } /// Temporary store for deleted instructions. Instructions will be deleted /// eventually when the BoUpSLP is destructed. - SmallVector<unique_value, 8> DeletedInstructions; + DenseMap<Instruction *, bool> DeletedInstructions; /// A list of values that need to extracted out of the tree. /// This list holds pairs of (Internal Scalar : External User). External User @@ -1453,6 +1804,8 @@ private: UnscheduledDepsInBundle = UnscheduledDeps; clearDependencies(); OpValue = OpVal; + TE = nullptr; + Lane = -1; } /// Returns true if the dependency information has been calculated. @@ -1559,6 +1912,12 @@ private: /// Opcode of the current instruction in the schedule data. Value *OpValue = nullptr; + + /// The TreeEntry that this instruction corresponds to. + TreeEntry *TE = nullptr; + + /// The lane of this node in the TreeEntry. + int Lane = -1; }; #ifndef NDEBUG @@ -1615,7 +1974,7 @@ private: return nullptr; } - bool isInSchedulingRegion(ScheduleData *SD) { + bool isInSchedulingRegion(ScheduleData *SD) const { return SD->SchedulingRegionID == SchedulingRegionID; } @@ -1633,10 +1992,9 @@ private: continue; } // Handle the def-use chain dependencies. - for (Use &U : BundleMember->Inst->operands()) { - auto *I = dyn_cast<Instruction>(U.get()); - if (!I) - continue; + + // Decrement the unscheduled counter and insert to ready list if ready. + auto &&DecrUnsched = [this, &ReadyList](Instruction *I) { doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) { if (OpDef && OpDef->hasValidDependencies() && OpDef->incrementUnscheduledDeps(-1) == 0) { @@ -1651,6 +2009,24 @@ private: << "SLP: gets ready (def): " << *DepBundle << "\n"); } }); + }; + + // If BundleMember is a vector bundle, its operands may have been + // reordered duiring buildTree(). We therefore need to get its operands + // through the TreeEntry. + if (TreeEntry *TE = BundleMember->TE) { + int Lane = BundleMember->Lane; + assert(Lane >= 0 && "Lane not set"); + for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands(); + OpIdx != NumOperands; ++OpIdx) + if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane])) + DecrUnsched(I); + } else { + // If BundleMember is a stand-alone instruction, no operand reordering + // has taken place, so we directly access its operands. + for (Use &U : BundleMember->Inst->operands()) + if (auto *I = dyn_cast<Instruction>(U.get())) + DecrUnsched(I); } // Handle the memory dependencies. for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) { @@ -1697,8 +2073,11 @@ private: /// Checks if a bundle of instructions can be scheduled, i.e. has no /// cyclic dependencies. This is only a dry-run, no instructions are /// actually moved at this stage. 
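The eraseInstruction change above moves to a deferred-deletion scheme: instructions are only marked while vectorizing, so their addresses stay valid for pointer-keyed caches such as the alias cache, and the stored flag ends up true only if every marking asked for the uses to be replaced with undef; the actual erasure happens in the destructor added further below. A plain-map model of the marking rule (illustrative, not the BoUpSLP member):

    #include <map>
    #include <string>

    struct DeferredEraser {
      std::map<std::string, bool> Deleted; // instruction -> replace-uses-with-undef

      void markErased(const std::string &Inst, bool ReplaceWithUndef = false) {
        auto It = Deleted.emplace(Inst, ReplaceWithUndef).first;
        It->second = It->second && ReplaceWithUndef; // AND across all markings
      }
    };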
- bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, - const InstructionsState &S); + /// \returns the scheduling bundle. The returned Optional value is non-None + /// if \p VL is allowed to be scheduled. + Optional<ScheduleData *> + tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, + const InstructionsState &S); /// Un-bundles a group of instructions. void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue); @@ -1937,7 +2316,7 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *) { - if (Entry->NeedToGather) + if (Entry->State == TreeEntry::NeedToGather) return "color=red"; return ""; } @@ -1945,6 +2324,30 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { } // end namespace llvm +BoUpSLP::~BoUpSLP() { + for (const auto &Pair : DeletedInstructions) { + // Replace operands of ignored instructions with Undefs in case if they were + // marked for deletion. + if (Pair.getSecond()) { + Value *Undef = UndefValue::get(Pair.getFirst()->getType()); + Pair.getFirst()->replaceAllUsesWith(Undef); + } + Pair.getFirst()->dropAllReferences(); + } + for (const auto &Pair : DeletedInstructions) { + assert(Pair.getFirst()->use_empty() && + "trying to erase instruction with users."); + Pair.getFirst()->eraseFromParent(); + } +} + +void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) { + for (auto *V : AV) { + if (auto *I = dyn_cast<Instruction>(V)) + eraseInstruction(I, /*ReplaceWithUndef=*/true); + }; +} + void BoUpSLP::buildTree(ArrayRef<Value *> Roots, ArrayRef<Value *> UserIgnoreLst) { ExtraValueToDebugLocsMap ExternallyUsedValues; @@ -1965,7 +2368,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. - if (Entry->NeedToGather) + if (Entry->State == TreeEntry::NeedToGather) continue; // For each lane: @@ -2002,7 +2405,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U << ".\n"); - assert(!UseEntry->NeedToGather && "Bad state"); + assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state"); continue; } } @@ -2026,28 +2429,28 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, InstructionsState S = getSameOpcode(VL); if (Depth == RecursionMaxDepth) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } // Don't handle vectors. if (S.OpValue->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue)) if (SI->getValueOperand()->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } // If all of the operands are identical or constant we have a simple solution. if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. 
\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } @@ -2055,11 +2458,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // the same block. // Don't vectorize ephemeral values. - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - if (EphValues.count(VL[i])) { - LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] + for (Value *V : VL) { + if (EphValues.count(V)) { + LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V << ") is ephemeral.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } } @@ -2069,7 +2472,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); if (!E->isSame(VL)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } // Record the reuse of the tree node. FIXME, currently this is only used to @@ -2077,19 +2480,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, E->UserTreeIndices.push_back(UserTreeIdx); LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue << ".\n"); - E->trySetUserTEOperand(UserTreeIdx, VL, None); return; } // Check that none of the instructions in the bundle are already in the tree. - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - auto *I = dyn_cast<Instruction>(VL[i]); + for (Value *V : VL) { + auto *I = dyn_cast<Instruction>(V); if (!I) continue; if (getTreeEntry(I)) { - LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] + LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V << ") is already in tree.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } } @@ -2097,10 +2499,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // If any of the scalars is marked as a value that needs to stay scalar, then // we need to gather the scalars. // The reduction nodes (stored in UserIgnoreList) also should stay scalar. - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - if (MustGather.count(VL[i]) || is_contained(UserIgnoreList, VL[i])) { + for (Value *V : VL) { + if (MustGather.count(V) || is_contained(UserIgnoreList, V)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } } @@ -2114,7 +2516,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Don't go into unreachable blocks. They may contain instructions with // dependency cycles which confuse the final scheduling. 
LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } @@ -2128,13 +2530,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (Res.second) UniqueValues.emplace_back(V); } - if (UniqueValues.size() == VL.size()) { + size_t NumUniqueScalarValues = UniqueValues.size(); + if (NumUniqueScalarValues == VL.size()) { ReuseShuffleIndicies.clear(); } else { LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); - if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) { + if (NumUniqueScalarValues <= 1 || + !llvm::isPowerOf2_32(NumUniqueScalarValues)) { LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } VL = UniqueValues; @@ -2142,16 +2546,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, auto &BSRef = BlocksSchedules[BB]; if (!BSRef) - BSRef = llvm::make_unique<BlockScheduling>(BB); + BSRef = std::make_unique<BlockScheduling>(BB); BlockScheduling &BS = *BSRef.get(); - if (!BS.tryScheduleBundle(VL, this, S)) { + Optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S); + if (!Bundle) { LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); assert((!BS.getScheduleData(VL0) || !BS.getScheduleData(VL0)->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); @@ -2160,7 +2566,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, (unsigned) Instruction::ShuffleVector : S.getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: { - PHINode *PH = dyn_cast<PHINode>(VL0); + auto *PH = cast<PHINode>(VL0); // Check for terminator values (e.g. invoke). for (unsigned j = 0; j < VL.size(); ++j) @@ -2172,23 +2578,29 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (terminator use).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = + newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); + // Keeps the reordered operands to avoid code duplication. + SmallVector<ValueList, 2> OperandsVec; for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. 
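
// A self-contained sketch of the reused-scalars preparation earlier in this
// hunk (UniqueValues / ReuseShuffleIndicies): the bundle is deduplicated,
// each original lane remembers which unique scalar it maps to, and
// vectorization is abandoned when the unique count is 1 or not a power of
// two. Plain ints stand in for llvm::Value*.
#include <cstdio>
#include <unordered_map>
#include <vector>

bool isPowerOf2(unsigned X) { return X && !(X & (X - 1)); }

bool prepareReuse(const std::vector<int> &VL, std::vector<int> &Unique,
                  std::vector<unsigned> &ReuseIdx) {
  std::unordered_map<int, unsigned> Pos;
  for (int V : VL) {
    auto It = Pos.try_emplace(V, Unique.size()).first;
    if (It->second == Unique.size()) // first time we see this scalar
      Unique.push_back(V);
    ReuseIdx.push_back(It->second);  // lane -> unique element it reuses
  }
  if (Unique.size() == VL.size()) {  // nothing was duplicated
    ReuseIdx.clear();
    return true;
  }
  return Unique.size() > 1 && isPowerOf2(Unique.size());
}

int main() {
  std::vector<int> VL{7, 9, 7, 9}, Unique;
  std::vector<unsigned> ReuseIdx;
  bool OK = prepareReuse(VL, Unique, ReuseIdx);
  std::printf("ok=%d unique=%zu lane3->%u\n", OK, Unique.size(), ReuseIdx[3]);
  return 0; // prints: ok=1 unique=2 lane3->1
}
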
for (Value *j : VL) Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock( PH->getIncomingBlock(i))); - - buildTree_rec(Operands, Depth + 1, {TE, i}); + TE->setOperand(i, Operands); + OperandsVec.push_back(Operands); } + for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx) + buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx}); return; } case Instruction::ExtractValue: @@ -2198,13 +2610,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (Reuse) { LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); ++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); // This is a special case, as it does not gather, but at the same time // we are not extending buildTree_rec() towards the operands. ValueList Op0; Op0.assign(VL.size(), VL0->getOperand(0)); - VectorizableTree.back()->setOperand(0, Op0, ReuseShuffleIndicies); + VectorizableTree.back()->setOperand(0, Op0); return; } if (!CurrentOrder.empty()) { @@ -2220,17 +2632,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, auto StoredCurrentOrderAndNum = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++StoredCurrentOrderAndNum->getSecond(); - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies, + newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies, StoredCurrentOrderAndNum->getFirst()); // This is a special case, as it does not gather, but at the same time // we are not extending buildTree_rec() towards the operands. ValueList Op0; Op0.assign(VL.size(), VL0->getOperand(0)); - VectorizableTree.back()->setOperand(0, Op0, ReuseShuffleIndicies); + VectorizableTree.back()->setOperand(0, Op0); return; } LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); - newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); BS.cancelScheduling(VL, VL0); return; } @@ -2246,7 +2660,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } @@ -2259,7 +2674,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, auto *L = cast<LoadInst>(V); if (!L->isSimple()) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } @@ -2285,19 +2701,22 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0)); uint64_t Size = DL->getTypeAllocSize(ScalarTy); // Check that the sorted loads are consecutive. - if (Diff && Diff->getAPInt().getZExtValue() == (VL.size() - 1) * Size) { + if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) { if (CurrentOrder.empty()) { // Original loads are consecutive and does not require reordering. 
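
// The consecutiveness test just above, in standalone form. Once the sorted
// pointer order is known, the pass confirms with one SCEV subtraction that
// last - first == (N - 1) * sizeof(element); this sketch spells out the
// per-gap version of the same condition with plain integer addresses
// standing in for pointers and SCEV.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

bool areConsecutive(std::vector<uint64_t> Addrs, uint64_t EltSize) {
  std::sort(Addrs.begin(), Addrs.end()); // the pass gets this from sorting
  for (size_t I = 1; I < Addrs.size(); ++I)
    if (Addrs[I] - Addrs[I - 1] != EltSize)
      return false;                      // gap or duplicate: not consecutive
  return true;
}

int main() {
  std::printf("%d\n", areConsecutive({0x108, 0x100, 0x104, 0x10c}, 4)); // 1
  std::printf("%d\n", areConsecutive({0x100, 0x108, 0x110, 0x118}, 4)); // 0
  return 0;
}
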
++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, - ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, + UserTreeIdx, ReuseShuffleIndicies); + TE->setOperandsInOrder(); LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { // Need to reorder. auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++I->getSecond(); - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, - ReuseShuffleIndicies, I->getFirst()); + TreeEntry *TE = + newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies, I->getFirst()); + TE->setOperandsInOrder(); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); } return; @@ -2306,7 +2725,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } case Instruction::ZExt: @@ -2322,24 +2742,27 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, case Instruction::FPTrunc: case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); - for (unsigned i = 0; i < VL.size(); ++i) { - Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType(); + for (Value *V : VL) { + Type *Ty = cast<Instruction>(V)->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); return; } } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); + TE->setOperandsInOrder(); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. 
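
// A sketch of the order "voting" seen above: every vectorizable bundle
// either votes for keeping the original order or for one concrete
// permutation, and the most popular choice wins when the tree is finalized.
// The free function and the std::map are assumptions of this toy model.
#include <cstdio>
#include <map>
#include <vector>

std::map<std::vector<unsigned>, int> NumOpsWantToKeepOrder;
int NumOpsWantToKeepOriginalOrder = 0;

void vote(const std::vector<unsigned> &CurrentOrder) {
  if (CurrentOrder.empty())
    ++NumOpsWantToKeepOriginalOrder; // bundle is already consecutive
  else
    ++NumOpsWantToKeepOrder.try_emplace(CurrentOrder, 0).first->second;
}

int main() {
  vote({});           // one bundle that is fine as-is
  vote({1, 0, 3, 2}); // two jumbled bundles agreeing on a permutation
  vote({1, 0, 3, 2});
  std::printf("original=%d best=%d\n", NumOpsWantToKeepOriginalOrder,
              NumOpsWantToKeepOrder[{1, 0, 3, 2}]); // original=1 best=2
  return 0;
}
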
- for (Value *j : VL) - Operands.push_back(cast<Instruction>(j)->getOperand(i)); + for (Value *V : VL) + Operands.push_back(cast<Instruction>(V)->getOperand(i)); buildTree_rec(Operands, Depth + 1, {TE, i}); } @@ -2351,19 +2774,21 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0); Type *ComparedTy = VL0->getOperand(0)->getType(); - for (unsigned i = 1, e = VL.size(); i < e; ++i) { - CmpInst *Cmp = cast<CmpInst>(VL[i]); + for (Value *V : VL) { + CmpInst *Cmp = cast<CmpInst>(V); if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) || Cmp->getOperand(0)->getType() != ComparedTy) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); return; } } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n"); ValueList Left, Right; @@ -2371,7 +2796,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Commutative predicate - collect + sort operands of the instructions // so that each side is more likely to have the same opcode. assert(P0 == SwapP0 && "Commutative Predicate mismatch"); - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); } else { // Collect operands - commute if it uses the swapped predicate. for (Value *V : VL) { @@ -2384,7 +2809,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, Right.push_back(RHS); } } - + TE->setOperand(0, Left); + TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); buildTree_rec(Right, Depth + 1, {TE, 1}); return; @@ -2409,19 +2835,23 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, case Instruction::And: case Instruction::Or: case Instruction::Xor: { - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n"); // Sort operands of the instructions so that each side is more likely to // have the same opcode. if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) { ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); + TE->setOperand(0, Left); + TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); buildTree_rec(Right, Depth + 1, {TE, 1}); return; } + TE->setOperandsInOrder(); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. @@ -2434,11 +2864,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } case Instruction::GetElementPtr: { // We don't combine GEPs with complicated (nested) indexing. 
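
// The swapped-predicate canonicalisation from the compare handling above,
// as a standalone toy: a bundle may mix `a > b` with `b < a`, and commuting
// the operands of the swapped form lets every lane share one predicate.
// The enum and struct are stand-ins, not llvm::CmpInst.
#include <cstdio>
#include <vector>

enum Pred { SLT, SGT };
Pred swapped(Pred P) { return P == SLT ? SGT : SLT; }

struct Cmp { Pred P; int L, R; };

bool splitOperands(const std::vector<Cmp> &VL, Pred P0,
                   std::vector<int> &Left, std::vector<int> &Right) {
  for (const Cmp &C : VL) {
    if (C.P == P0) {                 // same predicate: keep operand order
      Left.push_back(C.L);
      Right.push_back(C.R);
    } else if (C.P == swapped(P0)) { // swapped predicate: commute operands
      Left.push_back(C.R);
      Right.push_back(C.L);
    } else {
      return false;                  // anything else forces a gather
    }
  }
  return true;
}

int main() {
  std::vector<Cmp> VL{{SGT, 1, 2}, {SLT, 4, 3}}; // 1 > 2, and 4 < 3
  std::vector<int> L, R;
  bool OK = splitOperands(VL, SGT, L, R);
  std::printf("ok=%d lane1: %d > %d\n", OK, L[1], R[1]); // ok=1 lane1: 3 > 4
  return 0;
}
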
- for (unsigned j = 0; j < VL.size(); ++j) { - if (cast<Instruction>(VL[j])->getNumOperands() != 2) { + for (Value *V : VL) { + if (cast<Instruction>(V)->getNumOperands() != 2) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } } @@ -2446,59 +2877,120 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // We can't combine several GEPs into one vector if they operate on // different types. Type *Ty0 = VL0->getOperand(0)->getType(); - for (unsigned j = 0; j < VL.size(); ++j) { - Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType(); + for (Value *V : VL) { + Type *CurTy = cast<Instruction>(V)->getOperand(0)->getType(); if (Ty0 != CurTy) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } } // We don't combine GEPs with non-constant indexes. - for (unsigned j = 0; j < VL.size(); ++j) { - auto Op = cast<Instruction>(VL[j])->getOperand(1); - if (!isa<ConstantInt>(Op)) { + Type *Ty1 = VL0->getOperand(1)->getType(); + for (Value *V : VL) { + auto Op = cast<Instruction>(V)->getOperand(1); + if (!isa<ConstantInt>(Op) || + (Op->getType() != Ty1 && + Op->getType()->getScalarSizeInBits() > + DL->getIndexSizeInBits( + V->getType()->getPointerAddressSpace()))) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); + TE->setOperandsInOrder(); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast<Instruction>(j)->getOperand(i)); + for (Value *V : VL) + Operands.push_back(cast<Instruction>(V)->getOperand(i)); buildTree_rec(Operands, Depth + 1, {TE, i}); } return; } case Instruction::Store: { - // Check if the stores are consecutive or of we need to swizzle them. - for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) - if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { + // Check if the stores are consecutive or if we need to swizzle them. + llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType(); + // Make sure all stores in the bundle are simple - we can't vectorize + // atomic or volatile stores. 
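
// Why the GEP index-width constraint above exists: the constant indices are
// later normalised to one common index type, and a value must sign-extend or
// zero-extend cleanly for that to be lossless. The choice is made from the
// sign bit in the original width (the isSignBitSet() test used when the GEP
// is actually vectorized further down). Fixed i8 -> i64 widths keep this
// sketch simple.
#include <cstdint>
#include <cstdio>

int64_t widenIndex(uint8_t Raw) {
  bool SignBitSet = (Raw & 0x80) != 0;
  return SignBitSet ? (int64_t)(int8_t)Raw // sign-extend: 0xFF becomes -1
                    : (int64_t)Raw;        // zero-extend small positives
}

int main() {
  std::printf("%lld %lld\n", (long long)widenIndex(0xFF),
              (long long)widenIndex(0x7F)); // -1 127
  return 0;
}
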
+ SmallVector<Value *, 4> PointerOps(VL.size()); + ValueList Operands(VL.size()); + auto POIter = PointerOps.begin(); + auto OIter = Operands.begin(); + for (Value *V : VL) { + auto *SI = cast<StoreInst>(V); + if (!SI->isSimple()) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n"); return; } + *POIter = SI->getPointerOperand(); + *OIter = SI->getValueOperand(); + ++POIter; + ++OIter; + } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); - - ValueList Operands; - for (Value *j : VL) - Operands.push_back(cast<Instruction>(j)->getOperand(0)); + OrdersType CurrentOrder; + // Check the order of pointer operands. + if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) { + Value *Ptr0; + Value *PtrN; + if (CurrentOrder.empty()) { + Ptr0 = PointerOps.front(); + PtrN = PointerOps.back(); + } else { + Ptr0 = PointerOps[CurrentOrder.front()]; + PtrN = PointerOps[CurrentOrder.back()]; + } + const SCEV *Scev0 = SE->getSCEV(Ptr0); + const SCEV *ScevN = SE->getSCEV(PtrN); + const auto *Diff = + dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0)); + uint64_t Size = DL->getTypeAllocSize(ScalarTy); + // Check that the sorted pointer operands are consecutive. + if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) { + if (CurrentOrder.empty()) { + // Original stores are consecutive and does not require reordering. + ++NumOpsWantToKeepOriginalOrder; + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, + UserTreeIdx, ReuseShuffleIndicies); + TE->setOperandsInOrder(); + buildTree_rec(Operands, Depth + 1, {TE, 0}); + LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); + } else { + // Need to reorder. 
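
// A sketch of the pointer-sorting step above that feeds the jumbled-store
// path continuing below: compute the permutation that sorts the addresses,
// and report "original order" with an empty permutation, matching the
// CurrentOrder.empty() convention. Plain integers stand in for SCEV-derived
// offsets.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <numeric>
#include <vector>

std::vector<unsigned> sortedOrder(const std::vector<uint64_t> &Addrs) {
  std::vector<unsigned> Order(Addrs.size());
  std::iota(Order.begin(), Order.end(), 0u);
  std::stable_sort(Order.begin(), Order.end(),
                   [&](unsigned A, unsigned B) { return Addrs[A] < Addrs[B]; });
  if (std::is_sorted(Addrs.begin(), Addrs.end()))
    Order.clear(); // already in program order: no reordering needed
  return Order;
}

int main() {
  for (unsigned I : sortedOrder({0x108, 0x100, 0x104, 0x10c}))
    std::printf("%u ", I); // 1 2 0 3
  std::printf("\n");
  return 0;
}
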
+ auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; + ++(I->getSecond()); + TreeEntry *TE = + newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies, I->getFirst()); + TE->setOperandsInOrder(); + buildTree_rec(Operands, Depth + 1, {TE, 0}); + LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n"); + } + return; + } + } - buildTree_rec(Operands, Depth + 1, {TE, 0}); + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } case Instruction::Call: { @@ -2509,7 +3001,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (!isTriviallyVectorizable(ID)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } @@ -2519,14 +3012,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (unsigned j = 0; j != NumArgs; ++j) if (hasVectorInstrinsicScalarOpd(ID, j)) ScalarArgs[j] = CI->getArgOperand(j); - for (unsigned i = 1, e = VL.size(); i != e; ++i) { - CallInst *CI2 = dyn_cast<CallInst>(VL[i]); + for (Value *V : VL) { + CallInst *CI2 = dyn_cast<CallInst>(V); if (!CI2 || CI2->getCalledFunction() != Int || getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V << "\n"); return; } @@ -2537,7 +3031,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, Value *A1J = CI2->getArgOperand(j); if (ScalarArgs[j] != A1J) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI << " argument " << ScalarArgs[j] << "!=" << A1J << "\n"); @@ -2551,19 +3046,22 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" - << *CI << "!=" << *VL[i] << '\n'); + << *CI << "!=" << *V << '\n'); return; } } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + TE->setOperandsInOrder(); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) { - CallInst *CI2 = dyn_cast<CallInst>(j); + for (Value *V : VL) { + auto *CI2 = cast<CallInst>(V); Operands.push_back(CI2->getArgOperand(i)); } buildTree_rec(Operands, Depth + 1, {TE, i}); @@ -2575,27 +3073,32 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // then do not vectorize this instruction. 
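
// The call-bundle screening above, reduced to a toy: every lane must invoke
// the same trivially-vectorizable intrinsic, and any operand that has to
// stay scalar (like the exponent of powi) must be identical in all lanes.
// The callee name and the scalar-argument index are illustrative
// assumptions, not the real matching logic.
#include <cstdio>
#include <string>
#include <vector>

struct Call { std::string Callee; std::vector<int> Args; };

bool canVectorizeCalls(const std::vector<Call> &VL, unsigned ScalarArgIdx) {
  const Call &C0 = VL.front();
  for (const Call &C : VL) {
    if (C.Callee != C0.Callee)
      return false; // mismatched callees: the bundle must be gathered
    if (C.Args[ScalarArgIdx] != C0.Args[ScalarArgIdx])
      return false; // the scalar operand has to match across lanes
  }
  return true;
}

int main() {
  std::vector<Call> VL{{"llvm.powi.f32", {0, 3}}, {"llvm.powi.f32", {1, 3}}};
  std::printf("%d\n", canVectorizeCalls(VL, 1)); // 1: same callee, same exp
  return 0;
}
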
if (!S.isAltShuffle()) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. if (isa<BinaryOperator>(VL0)) { ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); + TE->setOperand(0, Left); + TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); buildTree_rec(Right, Depth + 1, {TE, 1}); return; } + TE->setOperandsInOrder(); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast<Instruction>(j)->getOperand(i)); + for (Value *V : VL) + Operands.push_back(cast<Instruction>(V)->getOperand(i)); buildTree_rec(Operands, Depth + 1, {TE, i}); } @@ -2603,34 +3106,37 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } default: BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; } } unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { - unsigned N; - Type *EltTy; - auto *ST = dyn_cast<StructType>(T); - if (ST) { - N = ST->getNumElements(); - EltTy = *ST->element_begin(); - } else { - N = cast<ArrayType>(T)->getNumElements(); - EltTy = cast<ArrayType>(T)->getElementType(); + unsigned N = 1; + Type *EltTy = T; + + while (isa<CompositeType>(EltTy)) { + if (auto *ST = dyn_cast<StructType>(EltTy)) { + // Check that struct is homogeneous. + for (const auto *Ty : ST->elements()) + if (Ty != *ST->element_begin()) + return 0; + N *= ST->getNumElements(); + EltTy = *ST->element_begin(); + } else { + auto *SeqT = cast<SequentialType>(EltTy); + N *= SeqT->getNumElements(); + EltTy = SeqT->getElementType(); + } } + if (!isValidElementType(EltTy)) return 0; uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N)); if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T)) return 0; - if (ST) { - // Check that struct is homogeneous. 
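
// The rewritten canMapToVector() loop above, modelled standalone: walk into
// nested arrays and homogeneous structs, multiplying the element counts,
// until a scalar element type remains. A toy type node replaces llvm::Type.
#include <cstdio>
#include <vector>

struct Ty {
  bool IsScalar;
  std::vector<Ty *> Elts; // members for structs, {element} for arrays
  unsigned NumElts;       // aggregate element count
};

unsigned flatten(Ty *T, Ty **EltTy) {
  unsigned N = 1;
  while (!T->IsScalar) {
    for (Ty *E : T->Elts)           // homogeneity: all members identical
      if (E != T->Elts.front())
        return 0;
    N *= T->NumElts;
    T = T->Elts.front();
  }
  *EltTy = T;
  return N;
}

int main() {
  Ty F{true, {}, 0};                // float
  Ty A{false, {&F}, 4};             // [4 x float]
  Ty S{false, {&A, &A}, 2};         // { [4 x float], [4 x float] }
  Ty *Elt = nullptr;
  std::printf("N=%u\n", flatten(&S, &Elt)); // N=8 -> maps to <8 x float>
  return 0;
}
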
- for (const auto *Ty : ST->elements()) - if (Ty != EltTy) - return 0; - } return N; } @@ -2731,14 +3237,14 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { ReuseShuffleCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy); } - if (E->NeedToGather) { + if (E->State == TreeEntry::NeedToGather) { if (allConstant(VL)) return 0; if (isSplat(VL)) { return ReuseShuffleCost + TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); } - if (getSameOpcode(VL).getOpcode() == Instruction::ExtractElement && + if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) && allSameBlock(VL)) { Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL); if (ShuffleKind.hasValue()) { @@ -2761,11 +3267,10 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } return ReuseShuffleCost + getGatherCost(VL); } - InstructionsState S = getSameOpcode(VL); - assert(S.getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); - Instruction *VL0 = cast<Instruction>(S.OpValue); - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); + Instruction *VL0 = E->getMainOp(); + unsigned ShuffleOrOp = + E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: return 0; @@ -2800,7 +3305,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx); } } - if (!E->NeedToGather) { + if (E->State == TreeEntry::Vectorize) { int DeadCost = ReuseShuffleCost; if (!E->ReorderIndices.empty()) { // TODO: Merge this shuffle with the ReuseShuffleCost. @@ -2851,7 +3356,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); int ScalarEltCost = - TTI->getCastInstrCost(S.getOpcode(), ScalarTy, SrcTy, VL0); + TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } @@ -2864,7 +3369,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { // Check if the values are candidates to demote. if (!MinBWs.count(VL0) || VecTy != SrcVecTy) { VecCost = ReuseShuffleCost + - TTI->getCastInstrCost(S.getOpcode(), VecTy, SrcVecTy, VL0); + TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy, VL0); } return VecCost - ScalarCost; } @@ -2872,14 +3377,14 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { case Instruction::ICmp: case Instruction::Select: { // Calculate the cost of this instruction. 
- int ScalarEltCost = TTI->getCmpSelInstrCost(S.getOpcode(), ScalarTy, + int ScalarEltCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(), VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getCmpSelInstrCost(S.getOpcode(), VecTy, MaskTy, VL0); + int VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VL0); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::FNeg: @@ -2940,13 +3445,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { SmallVector<const Value *, 4> Operands(VL0->operand_values()); int ScalarEltCost = TTI->getArithmeticInstrCost( - S.getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); + E->getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy, Op1VK, - Op2VK, Op1VP, Op2VP, Operands); + int VecCost = TTI->getArithmeticInstrCost( + E->getOpcode(), VecTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::GetElementPtr: { @@ -2967,7 +3472,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } case Instruction::Load: { // Cost of wide load - cost of scalar loads. - unsigned alignment = cast<LoadInst>(VL0)->getAlignment(); + MaybeAlign alignment(cast<LoadInst>(VL0)->getAlignment()); int ScalarEltCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0); if (NeedToShuffleReuses) { @@ -2985,15 +3490,22 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } case Instruction::Store: { // We know that we can merge the stores. Calculate the cost. - unsigned alignment = cast<StoreInst>(VL0)->getAlignment(); + bool IsReorder = !E->ReorderIndices.empty(); + auto *SI = + cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0); + MaybeAlign Alignment(SI->getAlignment()); int ScalarEltCost = - TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0); - if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; - } + TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0, VL0); + if (NeedToShuffleReuses) + ReuseShuffleCost = -(ReuseShuffleNumbers - VL.size()) * ScalarEltCost; int ScalarStCost = VecTy->getNumElements() * ScalarEltCost; - int VecStCost = - TTI->getMemoryOpCost(Instruction::Store, VecTy, alignment, 0, VL0); + int VecStCost = TTI->getMemoryOpCost(Instruction::Store, + VecTy, Alignment, 0, VL0); + if (IsReorder) { + // TODO: Merge this shuffle with the ReuseShuffleCost. 
+ VecStCost += TTI->getShuffleCost( + TargetTransformInfo::SK_PermuteSingleSrc, VecTy); + } return ReuseShuffleCost + VecStCost - ScalarStCost; } case Instruction::Call: { @@ -3027,11 +3539,11 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { return ReuseShuffleCost + VecCallCost - ScalarCallCost; } case Instruction::ShuffleVector: { - assert(S.isAltShuffle() && - ((Instruction::isBinaryOp(S.getOpcode()) && - Instruction::isBinaryOp(S.getAltOpcode())) || - (Instruction::isCast(S.getOpcode()) && - Instruction::isCast(S.getAltOpcode()))) && + assert(E->isAltShuffle() && + ((Instruction::isBinaryOp(E->getOpcode()) && + Instruction::isBinaryOp(E->getAltOpcode())) || + (Instruction::isCast(E->getOpcode()) && + Instruction::isCast(E->getAltOpcode()))) && "Invalid Shuffle Vector Operand"); int ScalarCost = 0; if (NeedToShuffleReuses) { @@ -3046,25 +3558,25 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { I, TargetTransformInfo::TCK_RecipThroughput); } } - for (Value *i : VL) { - Instruction *I = cast<Instruction>(i); - assert(S.isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); + for (Value *V : VL) { + Instruction *I = cast<Instruction>(V); + assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); ScalarCost += TTI->getInstructionCost( I, TargetTransformInfo::TCK_RecipThroughput); } // VecCost is equal to sum of the cost of creating 2 vectors // and the cost of creating shuffle. int VecCost = 0; - if (Instruction::isBinaryOp(S.getOpcode())) { - VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy); - VecCost += TTI->getArithmeticInstrCost(S.getAltOpcode(), VecTy); + if (Instruction::isBinaryOp(E->getOpcode())) { + VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy); + VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy); } else { - Type *Src0SclTy = S.MainOp->getOperand(0)->getType(); - Type *Src1SclTy = S.AltOp->getOperand(0)->getType(); + Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType(); + Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType(); VectorType *Src0Ty = VectorType::get(Src0SclTy, VL.size()); VectorType *Src1Ty = VectorType::get(Src1SclTy, VL.size()); - VecCost = TTI->getCastInstrCost(S.getOpcode(), VecTy, Src0Ty); - VecCost += TTI->getCastInstrCost(S.getAltOpcode(), VecTy, Src1Ty); + VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty); + VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty); } VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0); return ReuseShuffleCost + VecCost - ScalarCost; @@ -3079,25 +3591,64 @@ bool BoUpSLP::isFullyVectorizableTinyTree() const { << VectorizableTree.size() << " is fully vectorizable .\n"); // We only handle trees of heights 1 and 2. - if (VectorizableTree.size() == 1 && !VectorizableTree[0]->NeedToGather) + if (VectorizableTree.size() == 1 && + VectorizableTree[0]->State == TreeEntry::Vectorize) return true; if (VectorizableTree.size() != 2) return false; // Handle splat and all-constants stores. - if (!VectorizableTree[0]->NeedToGather && + if (VectorizableTree[0]->State == TreeEntry::Vectorize && (allConstant(VectorizableTree[1]->Scalars) || isSplat(VectorizableTree[1]->Scalars))) return true; // Gathering cost would be too much for tiny trees. 
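
// The sign convention used throughout getEntryCost() above, shown
// concretely: each case returns (vector cost) - (sum of scalar costs), plus
// a shuffle cost when lanes are reused, so a negative total means
// vectorization wins. The numbers below are made up for illustration.
#include <cstdio>

int entryCostDelta(int NumLanes, int ScalarEltCost, int VecOpCost,
                   int ReuseShuffleCost = 0) {
  int ScalarCost = NumLanes * ScalarEltCost;
  return ReuseShuffleCost + VecOpCost - ScalarCost;
}

int main() {
  // Four scalar adds at cost 1 each versus one vector add at cost 1.
  std::printf("delta=%d\n", entryCostDelta(4, 1, 1)); // -3: profitable
  return 0;
}
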
- if (VectorizableTree[0]->NeedToGather || VectorizableTree[1]->NeedToGather) + if (VectorizableTree[0]->State == TreeEntry::NeedToGather || + VectorizableTree[1]->State == TreeEntry::NeedToGather) return false; return true; } +bool BoUpSLP::isLoadCombineReductionCandidate(unsigned RdxOpcode) const { + if (RdxOpcode != Instruction::Or) + return false; + + unsigned NumElts = VectorizableTree[0]->Scalars.size(); + Value *FirstReduced = VectorizableTree[0]->Scalars[0]; + + // Look past the reduction to find a source value. Arbitrarily follow the + // path through operand 0 of any 'or'. Also, peek through optional + // shift-left-by-constant. + Value *ZextLoad = FirstReduced; + while (match(ZextLoad, m_Or(m_Value(), m_Value())) || + match(ZextLoad, m_Shl(m_Value(), m_Constant()))) + ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0); + + // Check if the input to the reduction is an extended load. + Value *LoadPtr; + if (!match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr))))) + return false; + + // Require that the total load bit width is a legal integer type. + // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target. + // But <16 x i8> --> i128 is not, so the backend probably can't reduce it. + Type *SrcTy = LoadPtr->getType()->getPointerElementType(); + unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts; + LLVMContext &Context = FirstReduced->getContext(); + if (!TTI->isTypeLegal(IntegerType::get(Context, LoadBitWidth))) + return false; + + // Everything matched - assume that we can fold the whole sequence using + // load combining. + LLVM_DEBUG(dbgs() << "SLP: Assume load combining for scalar reduction of " + << *(cast<Instruction>(FirstReduced)) << "\n"); + + return true; +} + bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const { // We can vectorize the tree if its size is greater than or equal to the // minimum size specified by the MinTreeSize command line option. @@ -3165,7 +3716,7 @@ int BoUpSLP::getSpillCost() const { continue; } - // Debug informations don't impact spill cost. + // Debug information does not impact spill cost. if ((isa<CallInst>(&*PrevInstIt) && !isa<DbgInfoIntrinsic>(&*PrevInstIt)) && &*PrevInstIt != PrevInst) @@ -3209,12 +3760,13 @@ int BoUpSLP::getTreeCost() { // their uses. Since such an approach results in fewer total entries, // existing heuristics based on tree size may yield different results. // - if (TE.NeedToGather && - std::any_of( - std::next(VectorizableTree.begin(), I + 1), VectorizableTree.end(), - [TE](const std::unique_ptr<TreeEntry> &EntryPtr) { - return EntryPtr->NeedToGather && EntryPtr->isSame(TE.Scalars); - })) + if (TE.State == TreeEntry::NeedToGather && + std::any_of(std::next(VectorizableTree.begin(), I + 1), + VectorizableTree.end(), + [TE](const std::unique_ptr<TreeEntry> &EntryPtr) { + return EntryPtr->State == TreeEntry::NeedToGather && + EntryPtr->isSame(TE.Scalars); + })) continue; int C = getEntryCost(&TE); @@ -3306,29 +3858,31 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const { // Perform operand reordering on the instructions in VL and return the reordered // operands in Left and Right. 
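
// What isLoadCombineReductionCandidate() above is really detecting, written
// out in plain C++: an `or` reduction over zext'd loads shifted into
// disjoint byte positions is just a wide little-endian load, so the backend
// folds it better than a vectorized reduction would. Illustrative only.
#include <cstdint>
#include <cstdio>
#include <cstring>

uint32_t orReduction(const uint8_t *P) {
  return (uint32_t)P[0] | ((uint32_t)P[1] << 8) |
         ((uint32_t)P[2] << 16) | ((uint32_t)P[3] << 24);
}

uint32_t wideLoad(const uint8_t *P) {
  uint32_t V;
  std::memcpy(&V, P, sizeof(V)); // what load combining produces (LE host)
  return V;
}

int main() {
  uint8_t Buf[4] = {0x44, 0x33, 0x22, 0x11};
  std::printf("%08x %08x\n", orReduction(Buf), wideLoad(Buf)); // equal on LE
  return 0;
}
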
-void BoUpSLP::reorderInputsAccordingToOpcode( - ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left, - SmallVectorImpl<Value *> &Right, const DataLayout &DL, - ScalarEvolution &SE) { +void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, + SmallVectorImpl<Value *> &Left, + SmallVectorImpl<Value *> &Right, + const DataLayout &DL, + ScalarEvolution &SE, + const BoUpSLP &R) { if (VL.empty()) return; - VLOperands Ops(VL, DL, SE); + VLOperands Ops(VL, DL, SE, R); // Reorder the operands in place. Ops.reorder(); Left = Ops.getVL(0); Right = Ops.getVL(1); } -void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, - const InstructionsState &S) { +void BoUpSLP::setInsertPointAfterBundle(TreeEntry *E) { // Get the basic block this bundle is in. All instructions in the bundle // should be in this block. - auto *Front = cast<Instruction>(S.OpValue); + auto *Front = E->getMainOp(); auto *BB = Front->getParent(); - assert(llvm::all_of(make_range(VL.begin(), VL.end()), [=](Value *V) -> bool { - auto *I = cast<Instruction>(V); - return !S.isOpcodeOrAlt(I) || I->getParent() == BB; - })); + assert(llvm::all_of(make_range(E->Scalars.begin(), E->Scalars.end()), + [=](Value *V) -> bool { + auto *I = cast<Instruction>(V); + return !E->isOpcodeOrAlt(I) || I->getParent() == BB; + })); // The last instruction in the bundle in program order. Instruction *LastInst = nullptr; @@ -3339,7 +3893,7 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, // bundle. The end of the bundle is marked by null ScheduleData. if (BlocksSchedules.count(BB)) { auto *Bundle = - BlocksSchedules[BB]->getScheduleData(isOneOf(S, VL.back())); + BlocksSchedules[BB]->getScheduleData(E->isOneOf(E->Scalars.back())); if (Bundle && Bundle->isPartOfBundle()) for (; Bundle; Bundle = Bundle->NextInBundle) if (Bundle->OpValue == Bundle->Inst) @@ -3365,14 +3919,15 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, // we both exit early from buildTree_rec and that the bundle be out-of-order // (causing us to iterate all the way to the end of the block). if (!LastInst) { - SmallPtrSet<Value *, 16> Bundle(VL.begin(), VL.end()); + SmallPtrSet<Value *, 16> Bundle(E->Scalars.begin(), E->Scalars.end()); for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) { - if (Bundle.erase(&I) && S.isOpcodeOrAlt(&I)) + if (Bundle.erase(&I) && E->isOpcodeOrAlt(&I)) LastInst = &I; if (Bundle.empty()) break; } } + assert(LastInst && "Failed to find last instruction in bundle"); // Set the insertion point after the last instruction in the bundle. Set the // debug location to Front. @@ -3385,7 +3940,7 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) { // Generate the 'InsertElement' instruction. 
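
// The fallback scan in setInsertPointAfterBundle() above, as a toy: walk the
// block from the first bundle member, remember the last member seen in
// program order, and stop once every member is accounted for; the vector
// instruction must be inserted after that point. Plain chars model
// instructions here, and the isOpcodeOrAlt() refinement is omitted.
#include <cstdio>
#include <set>
#include <vector>

int lastBundleMember(const std::vector<char> &Block, std::set<char> Bundle) {
  int Last = -1;
  for (unsigned I = 0; I < Block.size(); ++I) {
    if (Bundle.erase(Block[I]))
      Last = (int)I; // most recent bundle member so far
    if (Bundle.empty())
      break;         // nothing left to find: stop scanning early
  }
  return Last;
}

int main() {
  // block: a b x c y, bundle {a, c} -> last member is 'c' at position 3
  std::printf("%d\n", lastBundleMember({'a', 'b', 'x', 'c', 'y'}, {'a', 'c'}));
  return 0;
}
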
for (unsigned i = 0; i < Ty->getNumElements(); ++i) { Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i)); - if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) { + if (auto *Insrt = dyn_cast<InsertElementInst>(Vec)) { GatherSeq.insert(Insrt); CSEBlocks.insert(Insrt->getParent()); @@ -3494,8 +4049,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return E->VectorizedValue; } - InstructionsState S = getSameOpcode(E->Scalars); - Instruction *VL0 = cast<Instruction>(S.OpValue); + Instruction *VL0 = E->getMainOp(); Type *ScalarTy = VL0->getType(); if (StoreInst *SI = dyn_cast<StoreInst>(VL0)) ScalarTy = SI->getValueOperand()->getType(); @@ -3503,8 +4057,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); - if (E->NeedToGather) { - setInsertPointAfterBundle(E->Scalars, S); + if (E->State == TreeEntry::NeedToGather) { + setInsertPointAfterBundle(E); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3518,11 +4072,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + unsigned ShuffleOrOp = + E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: { - PHINode *PH = dyn_cast<PHINode>(VL0); + auto *PH = cast<PHINode>(VL0); Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); @@ -3558,7 +4112,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::ExtractElement: { - if (!E->NeedToGather) { + if (E->State == TreeEntry::Vectorize) { Value *V = E->getSingleOperand(0); if (!E->ReorderIndices.empty()) { OrdersType Mask; @@ -3577,7 +4131,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { E->VectorizedValue = V; return V; } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3591,7 +4145,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::ExtractValue: { - if (!E->NeedToGather) { + if (E->State == TreeEntry::Vectorize) { LoadInst *LI = cast<LoadInst>(E->getSingleOperand(0)); Builder.SetInsertPoint(LI); PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); @@ -3612,7 +4166,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { E->VectorizedValue = NewV; return NewV; } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3637,7 +4191,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *InVec = vectorizeTree(E->getOperand(0)); @@ -3646,7 +4200,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return E->VectorizedValue; } - CastInst *CI = dyn_cast<CastInst>(VL0); + auto *CI = cast<CastInst>(VL0); Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3658,7 +4212,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::FCmp: case 
Instruction::ICmp: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *L = vectorizeTree(E->getOperand(0)); Value *R = vectorizeTree(E->getOperand(1)); @@ -3670,7 +4224,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); Value *V; - if (S.getOpcode() == Instruction::FCmp) + if (E->getOpcode() == Instruction::FCmp) V = Builder.CreateFCmp(P0, L, R); else V = Builder.CreateICmp(P0, L, R); @@ -3685,7 +4239,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::Select: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *Cond = vectorizeTree(E->getOperand(0)); Value *True = vectorizeTree(E->getOperand(1)); @@ -3706,7 +4260,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::FNeg: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *Op = vectorizeTree(E->getOperand(0)); @@ -3716,7 +4270,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *V = Builder.CreateUnOp( - static_cast<Instruction::UnaryOps>(S.getOpcode()), Op); + static_cast<Instruction::UnaryOps>(E->getOpcode()), Op); propagateIRFlags(V, E->Scalars, VL0); if (auto *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); @@ -3748,7 +4302,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::And: case Instruction::Or: case Instruction::Xor: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *LHS = vectorizeTree(E->getOperand(0)); Value *RHS = vectorizeTree(E->getOperand(1)); @@ -3759,7 +4313,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *V = Builder.CreateBinOp( - static_cast<Instruction::BinaryOps>(S.getOpcode()), LHS, RHS); + static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, + RHS); propagateIRFlags(V, E->Scalars, VL0); if (auto *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); @@ -3776,12 +4331,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::Load: { // Loads are inserted at the head of the tree because we don't want to // sink them all the way down past store instructions. - bool IsReorder = !E->ReorderIndices.empty(); - if (IsReorder) { - S = getSameOpcode(E->Scalars, E->ReorderIndices.front()); - VL0 = cast<Instruction>(S.OpValue); - } - setInsertPointAfterBundle(E->Scalars, S); + bool IsReorder = E->updateStateIfReorder(); + if (IsReorder) + VL0 = E->getMainOp(); + setInsertPointAfterBundle(E); LoadInst *LI = cast<LoadInst>(VL0); Type *ScalarLoadTy = LI->getType(); @@ -3797,11 +4350,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (getTreeEntry(PO)) ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0)); - unsigned Alignment = LI->getAlignment(); + MaybeAlign Alignment = MaybeAlign(LI->getAlignment()); LI = Builder.CreateLoad(VecTy, VecPtr); - if (!Alignment) { - Alignment = DL->getABITypeAlignment(ScalarLoadTy); - } + if (!Alignment) + Alignment = MaybeAlign(DL->getABITypeAlignment(ScalarLoadTy)); LI->setAlignment(Alignment); Value *V = propagateMetadata(LI, E->Scalars); if (IsReorder) { @@ -3820,15 +4372,25 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::Store: { - StoreInst *SI = cast<StoreInst>(VL0); + bool IsReorder = !E->ReorderIndices.empty(); + auto *SI = cast<StoreInst>( + IsReorder ? 
E->Scalars[E->ReorderIndices.front()] : VL0); unsigned Alignment = SI->getAlignment(); unsigned AS = SI->getPointerAddressSpace(); - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *VecValue = vectorizeTree(E->getOperand(0)); + if (IsReorder) { + OrdersType Mask; + inversePermutation(E->ReorderIndices, Mask); + VecValue = Builder.CreateShuffleVector( + VecValue, UndefValue::get(VecValue->getType()), E->ReorderIndices, + "reorder_shuffle"); + } Value *ScalarPtr = SI->getPointerOperand(); - Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS)); + Value *VecPtr = Builder.CreateBitCast( + ScalarPtr, VecValue->getType()->getPointerTo(AS)); StoreInst *ST = Builder.CreateStore(VecValue, VecPtr); // The pointer operand uses an in-tree scalar, so add the new BitCast to @@ -3840,7 +4402,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (!Alignment) Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); - ST->setAlignment(Alignment); + ST->setAlignment(Align(Alignment)); Value *V = propagateMetadata(ST, E->Scalars); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3851,14 +4413,29 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::GetElementPtr: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *Op0 = vectorizeTree(E->getOperand(0)); std::vector<Value *> OpVecs; for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e; ++j) { - Value *OpVec = vectorizeTree(E->getOperand(j)); + ValueList &VL = E->getOperand(j); + // Need to cast all elements to the same type before vectorization to + // avoid crash. + Type *VL0Ty = VL0->getOperand(j)->getType(); + Type *Ty = llvm::all_of( + VL, [VL0Ty](Value *V) { return VL0Ty == V->getType(); }) + ? 
VL0Ty + : DL->getIndexType(cast<GetElementPtrInst>(VL0) + ->getPointerOperandType() + ->getScalarType()); + for (Value *&V : VL) { + auto *CI = cast<ConstantInt>(V); + V = ConstantExpr::getIntegerCast(CI, Ty, + CI->getValue().isSignBitSet()); + } + Value *OpVec = vectorizeTree(VL); OpVecs.push_back(OpVec); } @@ -3878,13 +4455,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::Call: { CallInst *CI = cast<CallInst>(VL0); - setInsertPointAfterBundle(E->Scalars, S); - Function *FI; + setInsertPointAfterBundle(E); + Intrinsic::ID IID = Intrinsic::not_intrinsic; - Value *ScalarArg = nullptr; - if (CI && (FI = CI->getCalledFunction())) { + if (Function *FI = CI->getCalledFunction()) IID = FI->getIntrinsicID(); - } + + Value *ScalarArg = nullptr; std::vector<Value *> OpVecs; for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) { ValueList OpVL; @@ -3926,20 +4503,20 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::ShuffleVector: { - assert(S.isAltShuffle() && - ((Instruction::isBinaryOp(S.getOpcode()) && - Instruction::isBinaryOp(S.getAltOpcode())) || - (Instruction::isCast(S.getOpcode()) && - Instruction::isCast(S.getAltOpcode()))) && + assert(E->isAltShuffle() && + ((Instruction::isBinaryOp(E->getOpcode()) && + Instruction::isBinaryOp(E->getAltOpcode())) || + (Instruction::isCast(E->getOpcode()) && + Instruction::isCast(E->getAltOpcode()))) && "Invalid Shuffle Vector Operand"); - Value *LHS, *RHS; - if (Instruction::isBinaryOp(S.getOpcode())) { - setInsertPointAfterBundle(E->Scalars, S); + Value *LHS = nullptr, *RHS = nullptr; + if (Instruction::isBinaryOp(E->getOpcode())) { + setInsertPointAfterBundle(E); LHS = vectorizeTree(E->getOperand(0)); RHS = vectorizeTree(E->getOperand(1)); } else { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); LHS = vectorizeTree(E->getOperand(0)); } @@ -3949,16 +4526,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *V0, *V1; - if (Instruction::isBinaryOp(S.getOpcode())) { + if (Instruction::isBinaryOp(E->getOpcode())) { V0 = Builder.CreateBinOp( - static_cast<Instruction::BinaryOps>(S.getOpcode()), LHS, RHS); + static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS); V1 = Builder.CreateBinOp( - static_cast<Instruction::BinaryOps>(S.getAltOpcode()), LHS, RHS); + static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS); } else { V0 = Builder.CreateCast( - static_cast<Instruction::CastOps>(S.getOpcode()), LHS, VecTy); + static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy); V1 = Builder.CreateCast( - static_cast<Instruction::CastOps>(S.getAltOpcode()), LHS, VecTy); + static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy); } // Create shuffle to take alternate operations from the vector. 
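
// What the comment above sets up, emulated standalone: both full vectors are
// built (V0 with the main opcode, V1 with the alternate one), and a
// shufflevector-style mask picks lane i either from V0 (index i) or from V1
// (index e + i), which is the Mask[i] = e + i case in the hunk that follows.
// Toy add/sub with alternate ops on odd lanes is an assumption.
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> A{1, 2, 3, 4}, B{10, 20, 30, 40};
  const unsigned e = 4;
  std::vector<int> V0(e), V1(e), Out(e);
  std::vector<unsigned> Mask(e);
  for (unsigned i = 0; i < e; ++i) {
    V0[i] = A[i] + B[i];           // main opcode: add
    V1[i] = A[i] - B[i];           // alternate opcode: sub
    Mask[i] = (i % 2) ? e + i : i; // odd lanes take the alternate op
  }
  for (unsigned i = 0; i < e; ++i)
    Out[i] = Mask[i] < e ? V0[Mask[i]] : V1[Mask[i] - e];
  for (int X : Out)
    std::printf("%d ", X); // 11 -18 33 -36
  std::printf("\n");
  return 0;
}
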
@@ -3969,8 +4546,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { SmallVector<Constant *, 8> Mask(e); for (unsigned i = 0; i < e; ++i) { auto *OpInst = cast<Instruction>(E->Scalars[i]); - assert(S.isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); - if (OpInst->getOpcode() == S.getAltOpcode()) { + assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); + if (OpInst->getOpcode() == E->getAltOpcode()) { Mask[i] = Builder.getInt32(e + i); AltScalars.push_back(E->Scalars[i]); } else { @@ -4054,7 +4631,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { continue; TreeEntry *E = getTreeEntry(Scalar); assert(E && "Invalid scalar"); - assert(!E->NeedToGather && "Extracting from a gather list"); + assert(E->State == TreeEntry::Vectorize && "Extracting from a gather list"); Value *Vec = E->VectorizedValue; assert(Vec && "Can't find vectorizable value"); @@ -4127,7 +4704,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. - if (Entry->NeedToGather) + if (Entry->State == TreeEntry::NeedToGather) continue; assert(Entry->VectorizedValue && "Can't find vectorizable value"); @@ -4136,20 +4713,18 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; +#ifndef NDEBUG Type *Ty = Scalar->getType(); if (!Ty->isVoidTy()) { -#ifndef NDEBUG for (User *U : Scalar->users()) { LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); - // It is legal to replace users in the ignorelist by undef. + // It is legal to delete users in the ignorelist. assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) && - "Replacing out-of-tree value with undef"); + "Deleting out-of-tree value"); } -#endif - Value *Undef = UndefValue::get(Ty); - Scalar->replaceAllUsesWith(Undef); } +#endif LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n"); eraseInstruction(cast<Instruction>(Scalar)); } @@ -4165,7 +4740,7 @@ void BoUpSLP::optimizeGatherSequence() { << " gather sequences instructions.\n"); // LICM InsertElementInst sequences. for (Instruction *I : GatherSeq) { - if (!isa<InsertElementInst>(I) && !isa<ShuffleVectorInst>(I)) + if (isDeleted(I)) continue; // Check if this block is inside a loop. @@ -4219,6 +4794,8 @@ void BoUpSLP::optimizeGatherSequence() { // For all instructions in blocks containing gather sequences: for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) { Instruction *In = &*it++; + if (isDeleted(In)) + continue; if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In)) continue; @@ -4245,11 +4822,11 @@ void BoUpSLP::optimizeGatherSequence() { // Groups the instructions to a bundle (which is then a single scheduling entity) // and schedules instructions until the bundle gets ready. -bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, - BoUpSLP *SLP, - const InstructionsState &S) { +Optional<BoUpSLP::ScheduleData *> +BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, + const InstructionsState &S) { if (isa<PHINode>(S.OpValue)) - return true; + return nullptr; // Initialize the instruction bundle. Instruction *OldScheduleEnd = ScheduleEnd; @@ -4262,7 +4839,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, // instructions of the bundle. 
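
// A sketch of the region growing done by the loop that follows: the block
// scheduler tracks one contiguous window of instructions, and each bundle
// member must fall inside it, extending the window on demand up to a size
// limit (loosely mirroring the ScheduleRegionSizeLimit bail-out). Integer
// positions model instructions; the details are assumptions of this toy.
#include <algorithm>
#include <cstdio>

struct Region {
  int Start = -1, End = -1; // half-open window, empty when negative
  bool extend(int Pos, int Limit) {
    if (Start < 0) { Start = Pos; End = Pos + 1; return true; }
    if (End - Start > Limit)
      return false;         // region too large: give up on this bundle
    Start = std::min(Start, Pos);
    End = std::max(End, Pos + 1);
    return true;
  }
};

int main() {
  Region R;
  R.extend(10, 100);
  R.extend(4, 100);         // window grows downwards to cover position 4
  std::printf("[%d,%d)\n", R.Start, R.End); // [4,11)
  return 0;
}
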
for (Value *V : VL) { if (!extendSchedulingRegion(V, S)) - return false; + return None; } for (Value *V : VL) { @@ -4308,6 +4885,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, resetSchedule(); initialFillReadyList(ReadyInsts); } + assert(Bundle && "Failed to find schedule bundle"); LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block " << BB->getName() << "\n"); @@ -4329,9 +4907,9 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, } if (!Bundle->isReady()) { cancelScheduling(VL, S.OpValue); - return false; + return None; } - return true; + return Bundle; } void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL, @@ -4364,7 +4942,7 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL, BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() { // Allocate a new ScheduleData for the instruction. if (ChunkPos >= ChunkSize) { - ScheduleDataChunks.push_back(llvm::make_unique<ScheduleData[]>(ChunkSize)); + ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize)); ChunkPos = 0; } return &(ScheduleDataChunks.back()[ChunkPos++]); @@ -4977,7 +5555,7 @@ struct SLPVectorizer : public FunctionPass { auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); - auto *TLI = TLIP ? &TLIP->getTLI() : nullptr; + auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); @@ -5052,7 +5630,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, // If the target claims to have no vector registers don't attempt // vectorization. - if (!TTI->getNumberOfRegisters(true)) + if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) return false; // Don't vectorize when the attribute NoImplicitFloat is used. @@ -5100,139 +5678,141 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, return Changed; } -/// Check that the Values in the slice in VL array are still existent in -/// the WeakTrackingVH array. -/// Vectorization of part of the VL array may cause later values in the VL array -/// to become invalid. We track when this has happened in the WeakTrackingVH -/// array. -static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, - ArrayRef<WeakTrackingVH> VH, unsigned SliceBegin, - unsigned SliceSize) { - VL = VL.slice(SliceBegin, SliceSize); - VH = VH.slice(SliceBegin, SliceSize); - return !std::equal(VL.begin(), VL.end(), VH.begin()); -} - bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, - unsigned VecRegSize) { - const unsigned ChainLen = Chain.size(); - LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen + unsigned Idx) { + LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size() << "\n"); const unsigned Sz = R.getVectorElementSize(Chain[0]); - const unsigned VF = VecRegSize / Sz; + const unsigned MinVF = R.getMinVecRegSize() / Sz; + unsigned VF = Chain.size(); - if (!isPowerOf2_32(Sz) || VF < 2) + if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) return false; - // Keep track of values that were deleted by vectorizing in the loop below. 
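// vectorizeStoreChain above now vectorizes the whole chain at once: it asks
// the tree for its best element order and, when one covers every element,
// rebuilds the tree over the permuted chain. The permutation step on its
// own, as a standalone sketch (the Order values here are hypothetical; in
// the diff they come from R.bestOrder()):
#include <cstdio>
#include <vector>

int main() {
  std::vector<const char *> Chain = {"s0", "s1", "s2", "s3"};
  std::vector<unsigned> Order = {2, 0, 3, 1}; // hypothetical best order
  std::vector<const char *> Reordered(Chain.size());
  for (size_t I = 0; I < Order.size(); ++I)
    Reordered[I] = Chain[Order[I]]; // llvm::transform(*Order, ...) in the diff
  for (const char *S : Reordered)
    std::printf("%s ", S); // s2 s0 s3 s1
  return 0;
}
// The removed WeakTrackingVH bookkeeping, made unnecessary by BoUpSLP's new
// deletion tracking, follows: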
- const SmallVector<WeakTrackingVH, 8> TrackValues(Chain.begin(), Chain.end()); - - bool Changed = false; - // Look for profitable vectorizable trees at all offsets, starting at zero. - for (unsigned i = 0, e = ChainLen; i + VF <= e; ++i) { - - // Check that a previous iteration of this loop did not delete the Value. - if (hasValueBeenRAUWed(Chain, TrackValues, i, VF)) - continue; - - LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i - << "\n"); - ArrayRef<Value *> Operands = Chain.slice(i, VF); - - R.buildTree(Operands); - if (R.isTreeTinyAndNotFullyVectorizable()) - continue; + LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx + << "\n"); - R.computeMinimumValueSizes(); + R.buildTree(Chain); + Optional<ArrayRef<unsigned>> Order = R.bestOrder(); + // TODO: Handle orders of size less than number of elements in the vector. + if (Order && Order->size() == Chain.size()) { + // TODO: reorder tree nodes without tree rebuilding. + SmallVector<Value *, 4> ReorderedOps(Chain.rbegin(), Chain.rend()); + llvm::transform(*Order, ReorderedOps.begin(), + [Chain](const unsigned Idx) { return Chain[Idx]; }); + R.buildTree(ReorderedOps); + } + if (R.isTreeTinyAndNotFullyVectorizable()) + return false; - int Cost = R.getTreeCost(); + R.computeMinimumValueSizes(); - LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF - << "\n"); - if (Cost < -SLPCostThreshold) { - LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); + int Cost = R.getTreeCost(); - using namespace ore; + LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); + if (Cost < -SLPCostThreshold) { + LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); - R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized", - cast<StoreInst>(Chain[i])) - << "Stores SLP vectorized with cost " << NV("Cost", Cost) - << " and with tree size " - << NV("TreeSize", R.getTreeSize())); + using namespace ore; - R.vectorizeTree(); + R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized", + cast<StoreInst>(Chain[0])) + << "Stores SLP vectorized with cost " << NV("Cost", Cost) + << " and with tree size " + << NV("TreeSize", R.getTreeSize())); - // Move to the next bundle. - i += VF - 1; - Changed = true; - } + R.vectorizeTree(); + return true; } - return Changed; + return false; } bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, BoUpSLP &R) { - SetVector<StoreInst *> Heads; - SmallDenseSet<StoreInst *> Tails; - SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain; - // We may run into multiple chains that merge into a single chain. We mark the // stores that we vectorized so that we don't visit the same store twice. 
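// The rewritten search below drops the Heads/Tails pointer sets and the
// StoreInst*-to-StoreInst* map in favor of two index-based structures: a bit
// vector marking stores that already follow another store, and
// ConsecutiveChain[K] holding the index of the store that follows store K,
// with E + 1 as the "no successor" sentinel. Following a chain then becomes
// plain index hopping. A standalone sketch with the same conventions:
#include <cstdio>
#include <vector>

int main() {
  const int E = 5; // number of stores
  std::vector<int> ConsecutiveChain(E, E + 1);
  std::vector<bool> Tails(E, false);
  // Suppose stores 0 -> 2 -> 4 were found to be consecutive.
  ConsecutiveChain[0] = 2; Tails[2] = true;
  ConsecutiveChain[2] = 4; Tails[4] = true;
  // Chains start at stores that are not tails; walk until the sentinel.
  for (int I = 0; I < E; ++I) {
    if (Tails[I] || ConsecutiveChain[I] == E + 1)
      continue;
    std::printf("chain:");
    for (int K = I; K != E + 1; K = ConsecutiveChain[K])
      std::printf(" %d", K);
    std::printf("\n"); // chain: 0 2 4
  }
  return 0;
}
// VectorizedStores, declared next, additionally keeps a chain from
// revisiting stores an earlier chain already vectorized: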
BoUpSLP::ValueSet VectorizedStores; bool Changed = false; - auto &&FindConsecutiveAccess = - [this, &Stores, &Heads, &Tails, &ConsecutiveChain] (int K, int Idx) { - if (!isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE)) - return false; - - Tails.insert(Stores[Idx]); - Heads.insert(Stores[K]); - ConsecutiveChain[Stores[K]] = Stores[Idx]; - return true; - }; + int E = Stores.size(); + SmallBitVector Tails(E, false); + SmallVector<int, 16> ConsecutiveChain(E, E + 1); + int MaxIter = MaxStoreLookup.getValue(); + int IterCnt; + auto &&FindConsecutiveAccess = [this, &Stores, &Tails, &IterCnt, MaxIter, + &ConsecutiveChain](int K, int Idx) { + if (IterCnt >= MaxIter) + return true; + ++IterCnt; + if (!isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE)) + return false; + Tails.set(Idx); + ConsecutiveChain[K] = Idx; + return true; + }; // Do a quadratic search on all of the given stores in reverse order and find // all of the pairs of stores that follow each other. - int E = Stores.size(); for (int Idx = E - 1; Idx >= 0; --Idx) { // If a store has multiple consecutive store candidates, search according // to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ... // This is because usually pairing with immediate succeeding or preceding // candidate create the best chance to find slp vectorization opportunity. - for (int Offset = 1, F = std::max(E - Idx, Idx + 1); Offset < F; ++Offset) + const int MaxLookDepth = std::max(E - Idx, Idx + 1); + IterCnt = 0; + for (int Offset = 1, F = MaxLookDepth; Offset < F; ++Offset) if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) || (Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx))) break; } // For stores that start but don't end a link in the chain: - for (auto *SI : llvm::reverse(Heads)) { - if (Tails.count(SI)) + for (int Cnt = E; Cnt > 0; --Cnt) { + int I = Cnt - 1; + if (ConsecutiveChain[I] == E + 1 || Tails.test(I)) continue; - // We found a store instr that starts a chain. Now follow the chain and try // to vectorize it. BoUpSLP::ValueList Operands; - StoreInst *I = SI; // Collect the chain into a list. - while ((Tails.count(I) || Heads.count(I)) && !VectorizedStores.count(I)) { - Operands.push_back(I); + while (I != E + 1 && !VectorizedStores.count(Stores[I])) { + Operands.push_back(Stores[I]); // Move to the next value in the chain. I = ConsecutiveChain[I]; } + // If a vector register can't hold 1 element, we are done. + unsigned MaxVecRegSize = R.getMaxVecRegSize(); + unsigned EltSize = R.getVectorElementSize(Stores[0]); + if (MaxVecRegSize % EltSize != 0) + continue; + + unsigned MaxElts = MaxVecRegSize / EltSize; // FIXME: Is division-by-2 the correct step? Should we assert that the // register size is a power-of-2? - for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize(); - Size /= 2) { - if (vectorizeStoreChain(Operands, R, Size)) { - // Mark the vectorized stores so that we don't vectorize them again. - VectorizedStores.insert(Operands.begin(), Operands.end()); - Changed = true; - break; + unsigned StartIdx = 0; + for (unsigned Size = llvm::PowerOf2Ceil(MaxElts); Size >= 2; Size /= 2) { + for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) { + ArrayRef<Value *> Slice = makeArrayRef(Operands).slice(Cnt, Size); + if (!VectorizedStores.count(Slice.front()) && + !VectorizedStores.count(Slice.back()) && + vectorizeStoreChain(Slice, R, Cnt)) { + // Mark the vectorized stores so that we don't vectorize them again. 
+ VectorizedStores.insert(Slice.begin(), Slice.end()); + Changed = true; + // If we vectorized initial block, no need to try to vectorize it + // again. + if (Cnt == StartIdx) + StartIdx += Size; + Cnt += Size; + continue; + } + ++Cnt; } + // Check if the whole array was vectorized already - exit. + if (StartIdx >= Operands.size()) + break; } } @@ -5329,12 +5909,8 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, bool CandidateFound = false; int MinCost = SLPCostThreshold; - // Keep track of values that were deleted by vectorizing in the loop below. - SmallVector<WeakTrackingVH, 8> TrackValues(VL.begin(), VL.end()); - unsigned NextInst = 0, MaxInst = VL.size(); - for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; - VF /= 2) { + for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) { // No actual vectorization should happen, if number of parts is the same as // provided vectorization factor (i.e. the scalar type is used for vector // code during codegen). @@ -5352,13 +5928,16 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2) break; + ArrayRef<Value *> Ops = VL.slice(I, OpsWidth); // Check that a previous iteration of this loop did not delete the Value. - if (hasValueBeenRAUWed(VL, TrackValues, I, OpsWidth)) + if (llvm::any_of(Ops, [&R](Value *V) { + auto *I = dyn_cast<Instruction>(V); + return I && R.isDeleted(I); + })) continue; LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations " << "\n"); - ArrayRef<Value *> Ops = VL.slice(I, OpsWidth); R.buildTree(Ops); Optional<ArrayRef<unsigned>> Order = R.bestOrder(); @@ -5571,7 +6150,7 @@ class HorizontalReduction { Value *createOp(IRBuilder<> &Builder, const Twine &Name) const { assert(isVectorizable() && "Expected add|fadd or min/max reduction operation."); - Value *Cmp; + Value *Cmp = nullptr; switch (Kind) { case RK_Arithmetic: return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, LHS, RHS, @@ -5579,23 +6158,23 @@ class HorizontalReduction { case RK_Min: Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSLT(LHS, RHS) : Builder.CreateFCmpOLT(LHS, RHS); - break; + return Builder.CreateSelect(Cmp, LHS, RHS, Name); case RK_Max: Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSGT(LHS, RHS) : Builder.CreateFCmpOGT(LHS, RHS); - break; + return Builder.CreateSelect(Cmp, LHS, RHS, Name); case RK_UMin: assert(Opcode == Instruction::ICmp && "Expected integer types."); Cmp = Builder.CreateICmpULT(LHS, RHS); - break; + return Builder.CreateSelect(Cmp, LHS, RHS, Name); case RK_UMax: assert(Opcode == Instruction::ICmp && "Expected integer types."); Cmp = Builder.CreateICmpUGT(LHS, RHS); - break; + return Builder.CreateSelect(Cmp, LHS, RHS, Name); case RK_None: - llvm_unreachable("Unknown reduction operation."); + break; } - return Builder.CreateSelect(Cmp, LHS, RHS, Name); + llvm_unreachable("Unknown reduction operation."); } public: @@ -5618,38 +6197,36 @@ class HorizontalReduction { explicit operator bool() const { return Opcode; } - /// Get the index of the first operand. - unsigned getFirstOperandIndex() const { - assert(!!*this && "The opcode is not set."); + /// Return true if this operation is any kind of minimum or maximum. 
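// In createOp above, every min/max kind now lowers to the same two-step
// pattern: emit the comparison for that kind, then select between the
// original operands. What the generated IR computes, modeled for signed max
// on plain integers (a standalone sketch, not IRBuilder output):
#include <cassert>

int smax(int LHS, int RHS) {
  bool Cmp = LHS > RHS;   // Builder.CreateICmpSGT(LHS, RHS)
  return Cmp ? LHS : RHS; // Builder.CreateSelect(Cmp, LHS, RHS, Name)
}

int main() {
  assert(smax(3, 7) == 7 && smax(-1, -5) == -1);
  return 0;
}
// isMinMax(), documented just above, centralizes the classification that the
// remaining helpers used to re-derive with their own switch statements: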
+ bool isMinMax() const { switch (Kind) { + case RK_Arithmetic: + return false; case RK_Min: - case RK_UMin: case RK_Max: + case RK_UMin: case RK_UMax: - return 1; - case RK_Arithmetic: + return true; case RK_None: break; } - return 0; + llvm_unreachable("Reduction kind is not set"); + } + + /// Get the index of the first operand. + unsigned getFirstOperandIndex() const { + assert(!!*this && "The opcode is not set."); + // We allow calling this before 'Kind' is set, so handle that specially. + if (Kind == RK_None) + return 0; + return isMinMax() ? 1 : 0; } /// Total number of operands in the reduction operation. unsigned getNumberOfOperands() const { assert(Kind != RK_None && !!*this && LHS && RHS && "Expected reduction operation."); - switch (Kind) { - case RK_Arithmetic: - return 2; - case RK_Min: - case RK_UMin: - case RK_Max: - case RK_UMax: - return 3; - case RK_None: - break; - } - llvm_unreachable("Reduction kind is not set"); + return isMinMax() ? 3 : 2; } /// Checks if the operation has the same parent as \p P. @@ -5658,79 +6235,46 @@ class HorizontalReduction { "Expected reduction operation."); if (!IsRedOp) return I->getParent() == P; - switch (Kind) { - case RK_Arithmetic: - // Arithmetic reduction operation must be used once only. - return I->getParent() == P; - case RK_Min: - case RK_UMin: - case RK_Max: - case RK_UMax: { + if (isMinMax()) { // SelectInst must be used twice while the condition op must have single // use only. auto *Cmp = cast<Instruction>(cast<SelectInst>(I)->getCondition()); return I->getParent() == P && Cmp && Cmp->getParent() == P; } - case RK_None: - break; - } - llvm_unreachable("Reduction kind is not set"); + // Arithmetic reduction operation must be used once only. + return I->getParent() == P; } + /// Expected number of uses for reduction operations/reduced values. bool hasRequiredNumberOfUses(Instruction *I, bool IsReductionOp) const { assert(Kind != RK_None && !!*this && LHS && RHS && "Expected reduction operation."); - switch (Kind) { - case RK_Arithmetic: - return I->hasOneUse(); - case RK_Min: - case RK_UMin: - case RK_Max: - case RK_UMax: + if (isMinMax()) return I->hasNUses(2) && (!IsReductionOp || cast<SelectInst>(I)->getCondition()->hasOneUse()); - case RK_None: - break; - } - llvm_unreachable("Reduction kind is not set"); + return I->hasOneUse(); } /// Initializes the list of reduction operations. void initReductionOps(ReductionOpsListType &ReductionOps) { assert(Kind != RK_None && !!*this && LHS && RHS && "Expected reduction operation."); - switch (Kind) { - case RK_Arithmetic: - ReductionOps.assign(1, ReductionOpsType()); - break; - case RK_Min: - case RK_UMin: - case RK_Max: - case RK_UMax: + if (isMinMax()) ReductionOps.assign(2, ReductionOpsType()); - break; - case RK_None: - llvm_unreachable("Reduction kind is not set"); - } + else + ReductionOps.assign(1, ReductionOpsType()); } + /// Add all reduction operations for the reduction instruction \p I. 
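// Every helper above keys off one structural fact: a min/max reduction step
// is a select whose operands are (compare, LHS, RHS), so it has three
// operands and its first data operand sits at index 1, while an arithmetic
// step has the usual two. Restated as a standalone sketch:
#include <cassert>

enum Kind { Arithmetic, MinMax };

unsigned numberOfOperands(Kind K) { return K == MinMax ? 3 : 2; }
unsigned firstOperandIndex(Kind K) { return K == MinMax ? 1 : 0; }

int main() {
  assert(numberOfOperands(MinMax) == 3 && firstOperandIndex(MinMax) == 1);
  assert(numberOfOperands(Arithmetic) == 2 &&
         firstOperandIndex(Arithmetic) == 0);
  return 0;
}
// addReductionOps, documented just above, likewise records two operation
// lists for min/max (the compare and the select) but only one for
// arithmetic: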
void addReductionOps(Instruction *I, ReductionOpsListType &ReductionOps) { assert(Kind != RK_None && !!*this && LHS && RHS && "Expected reduction operation."); - switch (Kind) { - case RK_Arithmetic: - ReductionOps[0].emplace_back(I); - break; - case RK_Min: - case RK_UMin: - case RK_Max: - case RK_UMax: + if (isMinMax()) { ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition()); ReductionOps[1].emplace_back(I); - break; - case RK_None: - llvm_unreachable("Reduction kind is not set"); + } else { + ReductionOps[0].emplace_back(I); } } @@ -5763,12 +6307,12 @@ class HorizontalReduction { /// Checks if two operation data are both a reduction op or both a reduced /// value. - bool operator==(const OperationData &OD) { + bool operator==(const OperationData &OD) const { assert(((Kind != OD.Kind) || ((!LHS == !OD.LHS) && (!RHS == !OD.RHS))) && "One of the comparing operations is incorrect."); return this == &OD || (Kind == OD.Kind && Opcode == OD.Opcode); } - bool operator!=(const OperationData &OD) { return !(*this == OD); } + bool operator!=(const OperationData &OD) const { return !(*this == OD); } void clear() { Opcode = 0; LHS = nullptr; @@ -5788,18 +6332,7 @@ class HorizontalReduction { Value *getLHS() const { return LHS; } Value *getRHS() const { return RHS; } Type *getConditionType() const { - switch (Kind) { - case RK_Arithmetic: - return nullptr; - case RK_Min: - case RK_Max: - case RK_UMin: - case RK_UMax: - return CmpInst::makeCmpResultType(LHS->getType()); - case RK_None: - break; - } - llvm_unreachable("Reduction kind is not set"); + return isMinMax() ? CmpInst::makeCmpResultType(LHS->getType()) : nullptr; } /// Creates reduction operation with the current opcode with the IR flags @@ -6183,6 +6716,18 @@ public: assert(Pair.first && "DebugLoc must be set."); ExternallyUsedValues[Pair.second].push_back(Pair.first); } + + // The compare instruction of a min/max is the insertion point for new + // instructions and may be replaced with a new compare instruction. + auto getCmpForMinMaxReduction = [](Instruction *RdxRootInst) { + assert(isa<SelectInst>(RdxRootInst) && + "Expected min/max reduction to have select root instruction"); + Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition(); + assert(isa<Instruction>(ScalarCond) && + "Expected min/max reduction to have compare condition"); + return cast<Instruction>(ScalarCond); + }; + // The reduction root is used as the insertion point for new instructions, // so set it as externally used to prevent it from being deleted. ExternallyUsedValues[ReductionRoot]; @@ -6203,6 +6748,8 @@ public: } if (V.isTreeTinyAndNotFullyVectorizable()) break; + if (V.isLoadCombineReductionCandidate(ReductionData.getOpcode())) + break; V.computeMinimumValueSizes(); @@ -6236,8 +6783,14 @@ public: DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc(); Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues); - // Emit a reduction. - Builder.SetInsertPoint(cast<Instruction>(ReductionRoot)); + // Emit a reduction. For min/max, the root is a select, but the insertion + // point is the compare condition of that select. 
+ Instruction *RdxRootInst = cast<Instruction>(ReductionRoot); + if (ReductionData.isMinMax()) + Builder.SetInsertPoint(getCmpForMinMaxReduction(RdxRootInst)); + else + Builder.SetInsertPoint(RdxRootInst); + Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI); if (VectorizedTree) { @@ -6273,8 +6826,23 @@ public: VectorizedTree = VectReductionData.createOp(Builder, "op.extra", I); } } - // Update users. + + // Update users. For a min/max reduction that ends with a compare and + // select, we also have to RAUW for the compare instruction feeding the + // reduction root. That's because the original compare may have extra uses + // besides the final select of the reduction. + if (ReductionData.isMinMax()) { + if (auto *VecSelect = dyn_cast<SelectInst>(VectorizedTree)) { + Instruction *ScalarCmp = + getCmpForMinMaxReduction(cast<Instruction>(ReductionRoot)); + ScalarCmp->replaceAllUsesWith(VecSelect->getCondition()); + } + } ReductionRoot->replaceAllUsesWith(VectorizedTree); + + // Mark all scalar reduction ops for deletion, they are replaced by the + // vector reductions. + V.eraseInstructions(IgnoreList); } return VectorizedTree != nullptr; } @@ -6323,7 +6891,7 @@ private: IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost; int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost; - int ScalarReduxCost; + int ScalarReduxCost = 0; switch (ReductionData.getKind()) { case RK_Arithmetic: ScalarReduxCost = @@ -6397,46 +6965,54 @@ private: /// %rb = insertelement <4 x float> %ra, float %s1, i32 1 /// %rc = insertelement <4 x float> %rb, float %s2, i32 2 /// %rd = insertelement <4 x float> %rc, float %s3, i32 3 -/// starting from the last insertelement instruction. +/// starting from the last insertelement or insertvalue instruction. /// -/// Returns true if it matches -static bool findBuildVector(InsertElementInst *LastInsertElem, - TargetTransformInfo *TTI, - SmallVectorImpl<Value *> &BuildVectorOpds, - int &UserCost) { - UserCost = 0; - Value *V = nullptr; - do { - if (auto *CI = dyn_cast<ConstantInt>(LastInsertElem->getOperand(2))) { - UserCost += TTI->getVectorInstrCost(Instruction::InsertElement, - LastInsertElem->getType(), - CI->getZExtValue()); - } - BuildVectorOpds.push_back(LastInsertElem->getOperand(1)); - V = LastInsertElem->getOperand(0); - if (isa<UndefValue>(V)) - break; - LastInsertElem = dyn_cast<InsertElementInst>(V); - if (!LastInsertElem || !LastInsertElem->hasOneUse()) - return false; - } while (true); - std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end()); - return true; -} - -/// Like findBuildVector, but looks for construction of aggregate. +/// Also recognize aggregates like {<2 x float>, <2 x float>}, +/// {{float, float}, {float, float}}, [2 x {float, float}] and so on. +/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples. +/// +/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type. /// /// \return true if it matches. 
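// Both the old findBuildAggregate below and its replacement walk such an
// insertion chain backwards from the last insert toward the undef base,
// collecting each inserted scalar and reversing at the end so the operands
// come out in lane order. The core walk as a standalone sketch (toy node
// type; the real code also enforces hasOneUse and recurses into nested
// aggregates):
#include <algorithm>
#include <cstdio>
#include <vector>

struct Insert {
  Insert *Base; // previous insert in the chain, or nullptr for the undef base
  int Scalar;   // the scalar this step inserts
};

std::vector<int> collectBuildVector(Insert *Last) {
  std::vector<int> Ops;
  for (Insert *I = Last; I; I = I->Base)
    Ops.push_back(I->Scalar);           // collected last-to-first
  std::reverse(Ops.begin(), Ops.end()); // restore lane order
  return Ops;
}

int main() {
  Insert I0{nullptr, 10}, I1{&I0, 20}, I2{&I1, 30};
  for (int V : collectBuildVector(&I2))
    std::printf("%d ", V); // 10 20 30
  return 0;
}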
-static bool findBuildAggregate(InsertValueInst *IV,
-                               SmallVectorImpl<Value *> &BuildVectorOpds) {
-  Value *V;
+static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI,
+                               SmallVectorImpl<Value *> &BuildVectorOpds,
+                               int &UserCost) {
+  assert((isa<InsertElementInst>(LastInsertInst) ||
+          isa<InsertValueInst>(LastInsertInst)) &&
+         "Expected insertelement or insertvalue instruction!");
+  UserCost = 0;
   do {
-    BuildVectorOpds.push_back(IV->getInsertedValueOperand());
-    V = IV->getAggregateOperand();
-    if (isa<UndefValue>(V))
+    Value *InsertedOperand;
+    if (auto *IE = dyn_cast<InsertElementInst>(LastInsertInst)) {
+      InsertedOperand = IE->getOperand(1);
+      LastInsertInst = IE->getOperand(0);
+      if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {
+        UserCost += TTI->getVectorInstrCost(Instruction::InsertElement,
+                                            IE->getType(), CI->getZExtValue());
+      }
+    } else {
+      auto *IV = cast<InsertValueInst>(LastInsertInst);
+      InsertedOperand = IV->getInsertedValueOperand();
+      LastInsertInst = IV->getAggregateOperand();
+    }
+    if (isa<InsertElementInst>(InsertedOperand) ||
+        isa<InsertValueInst>(InsertedOperand)) {
+      int TmpUserCost;
+      SmallVector<Value *, 8> TmpBuildVectorOpds;
+      if (!findBuildAggregate(InsertedOperand, TTI, TmpBuildVectorOpds,
+                              TmpUserCost))
+        return false;
+      BuildVectorOpds.append(TmpBuildVectorOpds.rbegin(),
+                             TmpBuildVectorOpds.rend());
+      UserCost += TmpUserCost;
+    } else {
+      BuildVectorOpds.push_back(InsertedOperand);
+    }
+    if (isa<UndefValue>(LastInsertInst))
       break;
-    IV = dyn_cast<InsertValueInst>(V);
-    if (!IV || !IV->hasOneUse())
+    if ((!isa<InsertValueInst>(LastInsertInst) &&
+         !isa<InsertElementInst>(LastInsertInst)) ||
+        !LastInsertInst->hasOneUse())
       return false;
   } while (true);
   std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
@@ -6530,18 +7106,13 @@ static bool tryToVectorizeHorReductionOrInstOperands(
   // horizontal reduction.
   // Interrupt the process if the Root instruction itself was vectorized or all
   // sub-trees no higher than RecursionMaxDepth were analyzed/vectorized.
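// The traversal below keeps an explicit worklist of (instruction, depth)
// pairs and refuses to descend past RecursionMaxDepth, trading recursion for
// a stack it can filter. The same shape as a standalone sketch over a toy
// operand tree (the depth limit here is made up):
#include <cstdio>
#include <utility>
#include <vector>

struct Node {
  std::vector<Node *> Operands;
  int Id;
};

void visitUpToDepth(Node *Root, unsigned MaxDepth) {
  std::vector<std::pair<Node *, unsigned>> Stack{{Root, 0}};
  while (!Stack.empty()) {
    auto [N, Level] = Stack.back();
    Stack.pop_back();
    std::printf("visit %d at depth %u\n", N->Id, Level);
    if (Level >= MaxDepth)
      continue; // analyzed, but don't push its operands
    for (Node *Op : N->Operands)
      Stack.push_back({Op, Level + 1});
  }
}

int main() {
  Node C{{}, 2}, B{{&C}, 1}, A{{&B}, 0};
  visitUpToDepth(&A, 1); // visits 0 and 1, never reaches 2
  return 0;
}
// In the diff below, the stack entries become plain Instruction pointers,
// since deleted instructions can now be filtered with R.isDeleted():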
- SmallVector<std::pair<WeakTrackingVH, unsigned>, 8> Stack(1, {Root, 0}); + SmallVector<std::pair<Instruction *, unsigned>, 8> Stack(1, {Root, 0}); SmallPtrSet<Value *, 8> VisitedInstrs; bool Res = false; while (!Stack.empty()) { - Value *V; + Instruction *Inst; unsigned Level; - std::tie(V, Level) = Stack.pop_back_val(); - if (!V) - continue; - auto *Inst = dyn_cast<Instruction>(V); - if (!Inst) - continue; + std::tie(Inst, Level) = Stack.pop_back_val(); auto *BI = dyn_cast<BinaryOperator>(Inst); auto *SI = dyn_cast<SelectInst>(Inst); if (BI || SI) { @@ -6582,8 +7153,8 @@ static bool tryToVectorizeHorReductionOrInstOperands( for (auto *Op : Inst->operand_values()) if (VisitedInstrs.insert(Op).second) if (auto *I = dyn_cast<Instruction>(Op)) - if (!isa<PHINode>(I) && I->getParent() == BB) - Stack.emplace_back(Op, Level); + if (!isa<PHINode>(I) && !R.isDeleted(I) && I->getParent() == BB) + Stack.emplace_back(I, Level); } return Res; } @@ -6609,25 +7180,26 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V, bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, BasicBlock *BB, BoUpSLP &R) { + int UserCost = 0; const DataLayout &DL = BB->getModule()->getDataLayout(); if (!R.canMapToVector(IVI->getType(), DL)) return false; SmallVector<Value *, 16> BuildVectorOpds; - if (!findBuildAggregate(IVI, BuildVectorOpds)) + if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, UserCost)) return false; LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n"); // Aggregate value is unlikely to be processed in vector register, we need to // extract scalars into scalar registers, so NeedExtraction is set true. - return tryToVectorizeList(BuildVectorOpds, R); + return tryToVectorizeList(BuildVectorOpds, R, UserCost); } bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, BasicBlock *BB, BoUpSLP &R) { int UserCost; SmallVector<Value *, 16> BuildVectorOpds; - if (!findBuildVector(IEI, TTI, BuildVectorOpds, UserCost) || + if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, UserCost) || (llvm::all_of(BuildVectorOpds, [](Value *V) { return isa<ExtractElementInst>(V); }) && isShuffle(BuildVectorOpds))) @@ -6652,11 +7224,10 @@ bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB, } bool SLPVectorizerPass::vectorizeSimpleInstructions( - SmallVectorImpl<WeakVH> &Instructions, BasicBlock *BB, BoUpSLP &R) { + SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R) { bool OpsChanged = false; - for (auto &VH : reverse(Instructions)) { - auto *I = dyn_cast_or_null<Instruction>(VH); - if (!I) + for (auto *I : reverse(Instructions)) { + if (R.isDeleted(I)) continue; if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R); @@ -6685,7 +7256,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { if (!P) break; - if (!VisitedInstrs.count(P)) + if (!VisitedInstrs.count(P) && !R.isDeleted(P)) Incoming.push_back(P); } @@ -6729,9 +7300,12 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { VisitedInstrs.clear(); - SmallVector<WeakVH, 8> PostProcessInstructions; + SmallVector<Instruction *, 8> PostProcessInstructions; SmallDenseSet<Instruction *, 4> KeyNodes; for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + // Skip instructions marked for the deletion. + if (R.isDeleted(&*it)) + continue; // We may go through BB multiple times so skip the one we have checked. 
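// Throughout this diff, WeakVH/WeakTrackingVH handles (which are nulled
// behind the scenes when their value dies) give way to plain Instruction
// pointers plus an explicit isDeleted() query: the vectorizer defers actual
// erasure and records doomed instructions in a set, so a stale pointer stays
// safe to test, just not to use. A standalone sketch of that scheme:
#include <cassert>
#include <unordered_set>

struct Instr { int Id; };

struct Deleter {
  std::unordered_set<const Instr *> Deleted;
  void eraseInstruction(const Instr *I) { Deleted.insert(I); } // deferred
  bool isDeleted(const Instr *I) const { return Deleted.count(I) != 0; }
};

int main() {
  Instr A{0}, B{1};
  Deleter D;
  D.eraseInstruction(&A);
  assert(D.isDeleted(&A) && !D.isDeleted(&B));
  return 0;
}
// The VisitedInstrs check below performs the skip the comment above
// describes: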
if (!VisitedInstrs.insert(&*it).second) { if (it->use_empty() && KeyNodes.count(&*it) > 0 && @@ -6811,10 +7385,16 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) { LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length " << Entry.second.size() << ".\n"); - // We process the getelementptr list in chunks of 16 (like we do for - // stores) to minimize compile-time. - for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += 16) { - auto Len = std::min<unsigned>(BE - BI, 16); + // Process the GEP list in chunks suitable for the target's supported + // vector size. If a vector register can't hold 1 element, we are done. + unsigned MaxVecRegSize = R.getMaxVecRegSize(); + unsigned EltSize = R.getVectorElementSize(Entry.second[0]); + if (MaxVecRegSize < EltSize) + continue; + + unsigned MaxElts = MaxVecRegSize / EltSize; + for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) { + auto Len = std::min<unsigned>(BE - BI, MaxElts); auto GEPList = makeArrayRef(&Entry.second[BI], Len); // Initialize a set a candidate getelementptrs. Note that we use a @@ -6824,10 +7404,10 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) { SetVector<Value *> Candidates(GEPList.begin(), GEPList.end()); // Some of the candidates may have already been vectorized after we - // initially collected them. If so, the WeakTrackingVHs will have - // nullified the - // values, so remove them from the set of candidates. - Candidates.remove(nullptr); + // initially collected them. If so, they are marked as deleted, so remove + // them from the set of candidates. + Candidates.remove_if( + [&R](Value *I) { return R.isDeleted(cast<Instruction>(I)); }); // Remove from the set of candidates all pairs of getelementptrs with // constant differences. Such getelementptrs are likely not good @@ -6835,18 +7415,18 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) { // computed from the other. We also ensure all candidate getelementptr // indices are unique. for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) { - auto *GEPI = cast<GetElementPtrInst>(GEPList[I]); + auto *GEPI = GEPList[I]; if (!Candidates.count(GEPI)) continue; auto *SCEVI = SE->getSCEV(GEPList[I]); for (int J = I + 1; J < E && Candidates.size() > 1; ++J) { - auto *GEPJ = cast<GetElementPtrInst>(GEPList[J]); + auto *GEPJ = GEPList[J]; auto *SCEVJ = SE->getSCEV(GEPList[J]); if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) { - Candidates.remove(GEPList[I]); - Candidates.remove(GEPList[J]); + Candidates.remove(GEPI); + Candidates.remove(GEPJ); } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) { - Candidates.remove(GEPList[J]); + Candidates.remove(GEPJ); } } } @@ -6894,14 +7474,7 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << it->second.size() << ".\n"); - // Process the stores in chunks of 16. - // TODO: The limit of 16 inhibits greater vectorization factors. - // For example, AVX2 supports v32i8. Increasing this limit, however, - // may cause a significant compile-time increase. 
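// Here, as in the GEP hunk above, a hard-coded chunk size of 16 is replaced
// by one derived from the target: MaxElts = MaxVecRegSize / EltSize, so a
// 256-bit register with 8-bit elements now yields 32-element chunks instead
// of capping at 16. The slicing arithmetic as a standalone sketch:
#include <algorithm>
#include <cstdio>

void processInChunks(unsigned NumStores, unsigned MaxVecRegSizeBits,
                     unsigned EltSizeBits) {
  if (MaxVecRegSizeBits < EltSizeBits)
    return; // a vector register can't hold even one element
  unsigned MaxElts = MaxVecRegSizeBits / EltSizeBits;
  for (unsigned BI = 0; BI < NumStores; BI += MaxElts) {
    unsigned Len = std::min(NumStores - BI, MaxElts);
    std::printf("chunk [%u, %u)\n", BI, BI + Len);
  }
}

int main() {
  processInChunks(70, 256, 8); // chunk [0, 32)  [32, 64)  [64, 70)
  return 0;
}
// The fixed-size loop being removed follows: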
- for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI += 16) { - unsigned Len = std::min<unsigned>(CE - CI, 16); - Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), R); - } + Changed |= vectorizeStores(it->second, R); } return Changed; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index 0ca6a6b93cfd..598fb00e956e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -47,6 +47,24 @@ class VPRecipeBuilder { EdgeMaskCacheTy EdgeMaskCache; BlockMaskCacheTy BlockMaskCache; + // VPlan-VPlan transformations support: Hold a mapping from ingredients to + // their recipe. To save on memory, only do so for selected ingredients, + // marked by having a nullptr entry in this map. If those ingredients get a + // VPWidenRecipe, also avoid compressing other ingredients into it to avoid + // having to split such recipes later. + DenseMap<Instruction *, VPRecipeBase *> Ingredient2Recipe; + VPWidenRecipe *LastExtensibleRecipe = nullptr; + + /// Set the recipe created for given ingredient. This operation is a no-op for + /// ingredients that were not marked using a nullptr entry in the map. + void setRecipe(Instruction *I, VPRecipeBase *R) { + if (!Ingredient2Recipe.count(I)) + return; + assert(Ingredient2Recipe[I] == nullptr && + "Recipe already set for ingredient"); + Ingredient2Recipe[I] = R; + } + public: /// A helper function that computes the predicate of the block BB, assuming /// that the header block of the loop is set to True. It returns the *entry* @@ -57,16 +75,22 @@ public: /// and DST. VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlanPtr &Plan); - /// Check if \I belongs to an Interleave Group within the given VF \p Range, - /// \return true in the first returned value if so and false otherwise. - /// Build a new VPInterleaveGroup Recipe if \I is the primary member of an IG - /// for \p Range.Start, and provide it as the second returned value. - /// Note that if \I is an adjunct member of an IG for \p Range.Start, the - /// \return value is <true, nullptr>, as it is handled by another recipe. - /// \p Range.End may be decreased to ensure same decision from \p Range.Start - /// to \p Range.End. - VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range, - VPlanPtr &Plan); + /// Mark given ingredient for recording its recipe once one is created for + /// it. + void recordRecipeOf(Instruction *I) { + assert((!Ingredient2Recipe.count(I) || Ingredient2Recipe[I] == nullptr) && + "Recipe already set for ingredient"); + Ingredient2Recipe[I] = nullptr; + } + + /// Return the recipe created for given ingredient. + VPRecipeBase *getRecipe(Instruction *I) { + assert(Ingredient2Recipe.count(I) && + "Recording this ingredients recipe was not requested"); + assert(Ingredient2Recipe[I] != nullptr && + "Ingredient doesn't have a recipe"); + return Ingredient2Recipe[I]; + } /// Check if \I is a memory instruction to be widened for \p Range.Start and /// potentially masked. 
Such instructions are handled by a recipe that takes diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp index 517d759d7bfc..f1c708720ccf 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -31,6 +31,7 @@ #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GenericDomTreeConstruction.h" @@ -275,14 +276,37 @@ void VPRegionBlock::execute(VPTransformState *State) { } void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { + assert(!Parent && "Recipe already in some VPBasicBlock"); + assert(InsertPos->getParent() && + "Insertion position not in any VPBasicBlock"); Parent = InsertPos->getParent(); Parent->getRecipeList().insert(InsertPos->getIterator(), this); } +void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) { + assert(!Parent && "Recipe already in some VPBasicBlock"); + assert(InsertPos->getParent() && + "Insertion position not in any VPBasicBlock"); + Parent = InsertPos->getParent(); + Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this); +} + +void VPRecipeBase::removeFromParent() { + assert(getParent() && "Recipe not in any VPBasicBlock"); + getParent()->getRecipeList().remove(getIterator()); + Parent = nullptr; +} + iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() { + assert(getParent() && "Recipe not in any VPBasicBlock"); return getParent()->getRecipeList().erase(getIterator()); } +void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { + removeFromParent(); + insertAfter(InsertPos); +} + void VPInstruction::generateInstruction(VPTransformState &State, unsigned Part) { IRBuilder<> &Builder = State.Builder; @@ -309,6 +333,14 @@ void VPInstruction::generateInstruction(VPTransformState &State, State.set(this, V, Part); break; } + case Instruction::Select: { + Value *Cond = State.get(getOperand(0), Part); + Value *Op1 = State.get(getOperand(1), Part); + Value *Op2 = State.get(getOperand(2), Part); + Value *V = Builder.CreateSelect(Cond, Op1, Op2); + State.set(this, V, Part); + break; + } default: llvm_unreachable("Unsupported opcode for instruction"); } @@ -433,14 +465,20 @@ void VPlan::execute(VPTransformState *State) { // We do not attempt to preserve DT for outer loop vectorization currently. if (!EnableVPlanNativePath) - updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB); + updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB, + L->getExitBlock()); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD +void VPlan::dump() const { dbgs() << *this << '\n'; } +#endif + void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB, - BasicBlock *LoopLatchBB) { + BasicBlock *LoopLatchBB, + BasicBlock *LoopExitBB) { BasicBlock *LoopHeaderBB = LoopPreHeaderBB->getSingleSuccessor(); assert(LoopHeaderBB && "Loop preheader does not have a single successor."); - DT->addNewBlock(LoopHeaderBB, LoopPreHeaderBB); // The vector body may be more than a single basic-block by this point. // Update the dominator tree information inside the vector body by propagating // it from header to latch, expecting only triangular control-flow, if any. 
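The new removeFromParent, insertAfter, and moveAfter helpers above give recipes splice semantics on the intrusive recipe list of their parent VPBasicBlock: moveAfter is exactly unlink-then-relink, never a copy, and the assertions guarantee a recipe is linked into at most one block at a time. A standalone analogue of that move, assuming std::list in place of LLVM's iplist:

#include <cstdio>
#include <list>

int main() {
  std::list<int> Block = {1, 2, 3, 4};
  auto From = Block.begin();              // the recipe to move (1)
  auto Pos = std::next(Block.begin(), 2); // the insertion point (3)
  // splice(before-iterator, source, element): relink 1 right after 3,
  // without copying or invalidating the moved element.
  Block.splice(std::next(Pos), Block, From);
  for (int V : Block)
    std::printf("%d ", V); // 2 3 1 4
  return 0;
}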
@@ -471,6 +509,9 @@ void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB, DT->addNewBlock(InterimSucc, BB); DT->addNewBlock(PostDomSucc, BB); } + // Latch block is a new dominator for the loop exit. + DT->changeImmediateDominator(LoopExitBB, LoopLatchBB); + assert(DT->verify(DominatorTree::VerificationLevel::Fast)); } const Twine VPlanPrinter::getUID(const VPBlockBase *Block) { @@ -495,8 +536,7 @@ void VPlanPrinter::dump() { if (!Plan.Value2VPValue.empty() || Plan.BackedgeTakenCount) { OS << ", where:"; if (Plan.BackedgeTakenCount) - OS << "\\n" - << *Plan.getOrCreateBackedgeTakenCount() << " := BackedgeTakenCount"; + OS << "\\n" << *Plan.BackedgeTakenCount << " := BackedgeTakenCount"; for (auto Entry : Plan.Value2VPValue) { OS << "\\n" << *Entry.second; OS << DOT::EscapeString(" := "); @@ -508,7 +548,7 @@ void VPlanPrinter::dump() { OS << "edge [fontname=Courier, fontsize=30]\n"; OS << "compound=true\n"; - for (VPBlockBase *Block : depth_first(Plan.getEntry())) + for (const VPBlockBase *Block : depth_first(Plan.getEntry())) dumpBlock(Block); OS << "}\n"; @@ -647,6 +687,16 @@ void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, O << " " << VPlanIngredient(IV) << "\\l\""; } +void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent) const { + O << " +\n" << Indent << "\"WIDEN-GEP "; + O << (IsPtrLoopInvariant ? "Inv" : "Var"); + size_t IndicesNumber = IsIndexLoopInvariant.size(); + for (size_t I = 0; I < IndicesNumber; ++I) + O << "[" << (IsIndexLoopInvariant[I] ? "Inv" : "Var") << "]"; + O << "\\l\""; + O << " +\n" << Indent << "\" " << VPlanIngredient(GEP) << "\\l\""; +} + void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent) const { O << " +\n" << Indent << "\"WIDEN-PHI " << VPlanIngredient(Phi) << "\\l\""; } @@ -689,9 +739,12 @@ void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent) const { void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent) const { O << " +\n" << Indent << "\"WIDEN " << VPlanIngredient(&Instr); - if (User) { + O << ", "; + getAddr()->printAsOperand(O); + VPValue *Mask = getMask(); + if (Mask) { O << ", "; - User->getOperand(0)->printAsOperand(O); + Mask->printAsOperand(O); } O << "\\l\""; } @@ -728,7 +781,7 @@ void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New, auto NewIGIter = Old2New.find(IG); if (NewIGIter == Old2New.end()) Old2New[IG] = new InterleaveGroup<VPInstruction>( - IG->getFactor(), IG->isReverse(), IG->getAlignment()); + IG->getFactor(), IG->isReverse(), Align(IG->getAlignment())); if (Inst == IG->getInsertPos()) Old2New[IG]->setInsertPos(VPInst); @@ -736,7 +789,8 @@ void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New, InterleaveGroupMap[VPInst] = Old2New[IG]; InterleaveGroupMap[VPInst]->insertMember( VPInst, IG->getIndex(Inst), - IG->isReverse() ? (-1) * int(IG->getFactor()) : IG->getFactor()); + Align(IG->isReverse() ? 
(-1) * int(IG->getFactor()) + : IG->getFactor())); } } else if (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block)) visitRegion(Region, Old2New, IAI); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h index 8a06412ad590..c65abc3639d7 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h @@ -31,6 +31,7 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" @@ -226,6 +227,8 @@ public: struct VPCallback { virtual ~VPCallback() {} virtual Value *getOrCreateVectorValues(Value *V, unsigned Part) = 0; + virtual Value *getOrCreateScalarValue(Value *V, + const VPIteration &Instance) = 0; }; /// VPTransformState holds information passed down when "executing" a VPlan, @@ -268,6 +271,13 @@ struct VPTransformState { return Callback.getOrCreateVectorValues(VPValue2Value[Def], Part); } + /// Get the generated Value for a given VPValue and given Part and Lane. Note + /// that as per-lane Defs are still created by ILV and managed in its ValueMap + /// this method currently just delegates the call to ILV. + Value *get(VPValue *Def, const VPIteration &Instance) { + return Callback.getOrCreateScalarValue(VPValue2Value[Def], Instance); + } + /// Set the generated Value for a given VPValue and a given Part. void set(VPValue *Def, Value *V, unsigned Part) { if (!Data.PerPartOutput.count(Def)) { @@ -567,6 +577,7 @@ public: /// instructions. class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock> { friend VPBasicBlock; + friend class VPBlockUtils; private: const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast). @@ -586,6 +597,7 @@ public: VPInterleaveSC, VPPredInstPHISC, VPReplicateSC, + VPWidenGEPSC, VPWidenIntOrFpInductionSC, VPWidenMemoryInstructionSC, VPWidenPHISC, @@ -615,6 +627,18 @@ public: /// the specified recipe. void insertBefore(VPRecipeBase *InsertPos); + /// Insert an unlinked Recipe into a basic block immediately after + /// the specified Recipe. + void insertAfter(VPRecipeBase *InsertPos); + + /// Unlink this recipe from its current VPBasicBlock and insert it into + /// the VPBasicBlock that MovePos lives in, right after MovePos. + void moveAfter(VPRecipeBase *MovePos); + + /// This method unlinks 'this' from the containing basic block, but does not + /// delete it. + void removeFromParent(); + /// This method unlinks 'this' from the containing basic block and deletes it. /// /// \returns an iterator pointing to the element after the erased one @@ -626,7 +650,6 @@ public: /// executed, these instructions would always form a single-def expression as /// the VPInstruction is also a single def-use vertex. class VPInstruction : public VPUser, public VPRecipeBase { - friend class VPlanHCFGTransforms; friend class VPlanSlp; public: @@ -736,6 +759,36 @@ public: void print(raw_ostream &O, const Twine &Indent) const override; }; +/// A recipe for handling GEP instructions. 
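// The recipe declared below caches, at construction time, whether the GEP's
// pointer operand and each of its indices are invariant in the original
// loop, one bit per index, so execute() can later widen only the operands
// that actually vary. The per-operand classification as a standalone sketch
// (toy Loop/Value types, not LLVM's):
#include <cstdio>
#include <vector>

struct Value { bool DefinedOutsideLoop; };

struct Loop {
  bool isLoopInvariant(const Value *V) const { return V->DefinedOutsideLoop; }
};

int main() {
  Loop L;
  Value Ptr{true}, Idx0{true}, Idx1{false};
  std::vector<const Value *> Indices = {&Idx0, &Idx1};
  bool IsPtrLoopInvariant = L.isLoopInvariant(&Ptr);
  std::vector<bool> IsIndexLoopInvariant(Indices.size(), false);
  for (size_t I = 0; I < Indices.size(); ++I)
    IsIndexLoopInvariant[I] = L.isLoopInvariant(Indices[I]);
  std::printf("ptr=%s idx0=%s idx1=%s\n", // matches the recipe's Inv/Var dump
              IsPtrLoopInvariant ? "Inv" : "Var",
              IsIndexLoopInvariant[0] ? "Inv" : "Var",
              IsIndexLoopInvariant[1] ? "Inv" : "Var");
  return 0;
}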
+class VPWidenGEPRecipe : public VPRecipeBase { +private: + GetElementPtrInst *GEP; + bool IsPtrLoopInvariant; + SmallBitVector IsIndexLoopInvariant; + +public: + VPWidenGEPRecipe(GetElementPtrInst *GEP, Loop *OrigLoop) + : VPRecipeBase(VPWidenGEPSC), GEP(GEP), + IsIndexLoopInvariant(GEP->getNumIndices(), false) { + IsPtrLoopInvariant = OrigLoop->isLoopInvariant(GEP->getPointerOperand()); + for (auto Index : enumerate(GEP->indices())) + IsIndexLoopInvariant[Index.index()] = + OrigLoop->isLoopInvariant(Index.value().get()); + } + ~VPWidenGEPRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *V) { + return V->getVPRecipeID() == VPRecipeBase::VPWidenGEPSC; + } + + /// Generate the gep nodes. + void execute(VPTransformState &State) override; + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent) const override; +}; + /// A recipe for handling phi nodes of integer and floating-point inductions, /// producing their vector and scalar values. class VPWidenIntOrFpInductionRecipe : public VPRecipeBase { @@ -818,13 +871,14 @@ public: class VPInterleaveRecipe : public VPRecipeBase { private: const InterleaveGroup<Instruction> *IG; - std::unique_ptr<VPUser> User; + VPUser User; public: - VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Mask) - : VPRecipeBase(VPInterleaveSC), IG(IG) { - if (Mask) // Create a VPInstruction to register as a user of the mask. - User.reset(new VPUser({Mask})); + VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr, + VPValue *Mask) + : VPRecipeBase(VPInterleaveSC), IG(IG), User({Addr}) { + if (Mask) + User.addOperand(Mask); } ~VPInterleaveRecipe() override = default; @@ -833,6 +887,18 @@ public: return V->getVPRecipeID() == VPRecipeBase::VPInterleaveSC; } + /// Return the address accessed by this recipe. + VPValue *getAddr() const { + return User.getOperand(0); // Address is the 1st, mandatory operand. + } + + /// Return the mask used by this recipe. Note that a full mask is represented + /// by a nullptr. + VPValue *getMask() const { + // Mask is optional and therefore the last, currently 2nd operand. + return User.getNumOperands() == 2 ? User.getOperand(1) : nullptr; + } + /// Generate the wide load or store, and shuffles. void execute(VPTransformState &State) override; @@ -955,13 +1021,14 @@ public: class VPWidenMemoryInstructionRecipe : public VPRecipeBase { private: Instruction &Instr; - std::unique_ptr<VPUser> User; + VPUser User; public: - VPWidenMemoryInstructionRecipe(Instruction &Instr, VPValue *Mask) - : VPRecipeBase(VPWidenMemoryInstructionSC), Instr(Instr) { - if (Mask) // Create a VPInstruction to register as a user of the mask. - User.reset(new VPUser({Mask})); + VPWidenMemoryInstructionRecipe(Instruction &Instr, VPValue *Addr, + VPValue *Mask) + : VPRecipeBase(VPWidenMemoryInstructionSC), Instr(Instr), User({Addr}) { + if (Mask) + User.addOperand(Mask); } /// Method to support type inquiry through isa, cast, and dyn_cast. @@ -969,6 +1036,18 @@ public: return V->getVPRecipeID() == VPRecipeBase::VPWidenMemoryInstructionSC; } + /// Return the address accessed by this recipe. + VPValue *getAddr() const { + return User.getOperand(0); // Address is the 1st, mandatory operand. + } + + /// Return the mask used by this recipe. Note that a full mask is represented + /// by a nullptr. + VPValue *getMask() const { + // Mask is optional and therefore the last, currently 2nd operand. 
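// Both memory recipes now store the address as a mandatory first operand and
// the mask as an optional trailing one, so "unmasked" is simply "only one
// operand" and no separate wrapper object is needed. That operand convention
// as a standalone sketch (toy VPValue, not LLVM's):
#include <cassert>
#include <vector>

struct VPValue {};

struct MemRecipe {
  std::vector<VPValue *> Operands;
  MemRecipe(VPValue *Addr, VPValue *Mask) : Operands{Addr} {
    if (Mask)
      Operands.push_back(Mask); // optional, always last
  }
  VPValue *getAddr() const { return Operands[0]; }
  VPValue *getMask() const { // nullptr represents a full (all-ones) mask
    return Operands.size() == 2 ? Operands[1] : nullptr;
  }
};

int main() {
  VPValue Addr, Mask;
  assert(MemRecipe(&Addr, nullptr).getMask() == nullptr);
  assert(MemRecipe(&Addr, &Mask).getMask() == &Mask);
  return 0;
}
// The return just below implements exactly that check: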
+ return User.getNumOperands() == 2 ? User.getOperand(1) : nullptr; + } + /// Generate the wide load/store. void execute(VPTransformState &State) override; @@ -1139,6 +1218,128 @@ public: void execute(struct VPTransformState *State) override; }; +//===----------------------------------------------------------------------===// +// GraphTraits specializations for VPlan Hierarchical Control-Flow Graphs // +//===----------------------------------------------------------------------===// + +// The following set of template specializations implement GraphTraits to treat +// any VPBlockBase as a node in a graph of VPBlockBases. It's important to note +// that VPBlockBase traits don't recurse into VPRegioBlocks, i.e., if the +// VPBlockBase is a VPRegionBlock, this specialization provides access to its +// successors/predecessors but not to the blocks inside the region. + +template <> struct GraphTraits<VPBlockBase *> { + using NodeRef = VPBlockBase *; + using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator; + + static NodeRef getEntryNode(NodeRef N) { return N; } + + static inline ChildIteratorType child_begin(NodeRef N) { + return N->getSuccessors().begin(); + } + + static inline ChildIteratorType child_end(NodeRef N) { + return N->getSuccessors().end(); + } +}; + +template <> struct GraphTraits<const VPBlockBase *> { + using NodeRef = const VPBlockBase *; + using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::const_iterator; + + static NodeRef getEntryNode(NodeRef N) { return N; } + + static inline ChildIteratorType child_begin(NodeRef N) { + return N->getSuccessors().begin(); + } + + static inline ChildIteratorType child_end(NodeRef N) { + return N->getSuccessors().end(); + } +}; + +// Inverse order specialization for VPBasicBlocks. Predecessors are used instead +// of successors for the inverse traversal. +template <> struct GraphTraits<Inverse<VPBlockBase *>> { + using NodeRef = VPBlockBase *; + using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator; + + static NodeRef getEntryNode(Inverse<NodeRef> B) { return B.Graph; } + + static inline ChildIteratorType child_begin(NodeRef N) { + return N->getPredecessors().begin(); + } + + static inline ChildIteratorType child_end(NodeRef N) { + return N->getPredecessors().end(); + } +}; + +// The following set of template specializations implement GraphTraits to +// treat VPRegionBlock as a graph and recurse inside its nodes. It's important +// to note that the blocks inside the VPRegionBlock are treated as VPBlockBases +// (i.e., no dyn_cast is performed, VPBlockBases specialization is used), so +// there won't be automatic recursion into other VPBlockBases that turn to be +// VPRegionBlocks. + +template <> +struct GraphTraits<VPRegionBlock *> : public GraphTraits<VPBlockBase *> { + using GraphRef = VPRegionBlock *; + using nodes_iterator = df_iterator<NodeRef>; + + static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); } + + static nodes_iterator nodes_begin(GraphRef N) { + return nodes_iterator::begin(N->getEntry()); + } + + static nodes_iterator nodes_end(GraphRef N) { + // df_iterator::end() returns an empty iterator so the node used doesn't + // matter. 
+ return nodes_iterator::end(N); + } +}; + +template <> +struct GraphTraits<const VPRegionBlock *> + : public GraphTraits<const VPBlockBase *> { + using GraphRef = const VPRegionBlock *; + using nodes_iterator = df_iterator<NodeRef>; + + static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); } + + static nodes_iterator nodes_begin(GraphRef N) { + return nodes_iterator::begin(N->getEntry()); + } + + static nodes_iterator nodes_end(GraphRef N) { + // df_iterator::end() returns an empty iterator so the node used doesn't + // matter. + return nodes_iterator::end(N); + } +}; + +template <> +struct GraphTraits<Inverse<VPRegionBlock *>> + : public GraphTraits<Inverse<VPBlockBase *>> { + using GraphRef = VPRegionBlock *; + using nodes_iterator = df_iterator<NodeRef>; + + static NodeRef getEntryNode(Inverse<GraphRef> N) { + return N.Graph->getExit(); + } + + static nodes_iterator nodes_begin(GraphRef N) { + return nodes_iterator::begin(N->getExit()); + } + + static nodes_iterator nodes_end(GraphRef N) { + // df_iterator::end() returns an empty iterator so the node used doesn't + // matter. + return nodes_iterator::end(N); + } +}; + /// VPlan models a candidate for vectorization, encoding various decisions take /// to produce efficient output IR, including which branches, basic-blocks and /// output IR instructions to generate, and their cost. VPlan holds a @@ -1241,35 +1442,45 @@ public: return Value2VPValue[V]; } + VPValue *getOrAddVPValue(Value *V) { + assert(V && "Trying to get or add the VPValue of a null Value"); + if (!Value2VPValue.count(V)) + addVPValue(V); + return getVPValue(V); + } + /// Return the VPLoopInfo analysis for this VPlan. VPLoopInfo &getVPLoopInfo() { return VPLInfo; } const VPLoopInfo &getVPLoopInfo() const { return VPLInfo; } + /// Dump the plan to stderr (for debugging). + void dump() const; + private: /// Add to the given dominator tree the header block and every new basic block /// that was created between it and the latch block, inclusive. - static void updateDominatorTree(DominatorTree *DT, + static void updateDominatorTree(DominatorTree *DT, BasicBlock *LoopLatchBB, BasicBlock *LoopPreHeaderBB, - BasicBlock *LoopLatchBB); + BasicBlock *LoopExitBB); }; /// VPlanPrinter prints a given VPlan to a given output stream. The printing is /// indented and follows the dot format. class VPlanPrinter { - friend inline raw_ostream &operator<<(raw_ostream &OS, VPlan &Plan); + friend inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan); friend inline raw_ostream &operator<<(raw_ostream &OS, const struct VPlanIngredient &I); private: raw_ostream &OS; - VPlan &Plan; - unsigned Depth; + const VPlan &Plan; + unsigned Depth = 0; unsigned TabWidth = 2; std::string Indent; unsigned BID = 0; SmallDenseMap<const VPBlockBase *, unsigned> BlockID; - VPlanPrinter(raw_ostream &O, VPlan &P) : OS(O), Plan(P) {} + VPlanPrinter(raw_ostream &O, const VPlan &P) : OS(O), Plan(P) {} /// Handle indentation. 
void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); } @@ -1316,135 +1527,13 @@ inline raw_ostream &operator<<(raw_ostream &OS, const VPlanIngredient &I) { return OS; } -inline raw_ostream &operator<<(raw_ostream &OS, VPlan &Plan) { +inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan) { VPlanPrinter Printer(OS, Plan); Printer.dump(); return OS; } //===----------------------------------------------------------------------===// -// GraphTraits specializations for VPlan Hierarchical Control-Flow Graphs // -//===----------------------------------------------------------------------===// - -// The following set of template specializations implement GraphTraits to treat -// any VPBlockBase as a node in a graph of VPBlockBases. It's important to note -// that VPBlockBase traits don't recurse into VPRegioBlocks, i.e., if the -// VPBlockBase is a VPRegionBlock, this specialization provides access to its -// successors/predecessors but not to the blocks inside the region. - -template <> struct GraphTraits<VPBlockBase *> { - using NodeRef = VPBlockBase *; - using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator; - - static NodeRef getEntryNode(NodeRef N) { return N; } - - static inline ChildIteratorType child_begin(NodeRef N) { - return N->getSuccessors().begin(); - } - - static inline ChildIteratorType child_end(NodeRef N) { - return N->getSuccessors().end(); - } -}; - -template <> struct GraphTraits<const VPBlockBase *> { - using NodeRef = const VPBlockBase *; - using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::const_iterator; - - static NodeRef getEntryNode(NodeRef N) { return N; } - - static inline ChildIteratorType child_begin(NodeRef N) { - return N->getSuccessors().begin(); - } - - static inline ChildIteratorType child_end(NodeRef N) { - return N->getSuccessors().end(); - } -}; - -// Inverse order specialization for VPBasicBlocks. Predecessors are used instead -// of successors for the inverse traversal. -template <> struct GraphTraits<Inverse<VPBlockBase *>> { - using NodeRef = VPBlockBase *; - using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator; - - static NodeRef getEntryNode(Inverse<NodeRef> B) { return B.Graph; } - - static inline ChildIteratorType child_begin(NodeRef N) { - return N->getPredecessors().begin(); - } - - static inline ChildIteratorType child_end(NodeRef N) { - return N->getPredecessors().end(); - } -}; - -// The following set of template specializations implement GraphTraits to -// treat VPRegionBlock as a graph and recurse inside its nodes. It's important -// to note that the blocks inside the VPRegionBlock are treated as VPBlockBases -// (i.e., no dyn_cast is performed, VPBlockBases specialization is used), so -// there won't be automatic recursion into other VPBlockBases that turn to be -// VPRegionBlocks. - -template <> -struct GraphTraits<VPRegionBlock *> : public GraphTraits<VPBlockBase *> { - using GraphRef = VPRegionBlock *; - using nodes_iterator = df_iterator<NodeRef>; - - static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); } - - static nodes_iterator nodes_begin(GraphRef N) { - return nodes_iterator::begin(N->getEntry()); - } - - static nodes_iterator nodes_end(GraphRef N) { - // df_iterator::end() returns an empty iterator so the node used doesn't - // matter. 
- return nodes_iterator::end(N); - } -}; - -template <> -struct GraphTraits<const VPRegionBlock *> - : public GraphTraits<const VPBlockBase *> { - using GraphRef = const VPRegionBlock *; - using nodes_iterator = df_iterator<NodeRef>; - - static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); } - - static nodes_iterator nodes_begin(GraphRef N) { - return nodes_iterator::begin(N->getEntry()); - } - - static nodes_iterator nodes_end(GraphRef N) { - // df_iterator::end() returns an empty iterator so the node used doesn't - // matter. - return nodes_iterator::end(N); - } -}; - -template <> -struct GraphTraits<Inverse<VPRegionBlock *>> - : public GraphTraits<Inverse<VPBlockBase *>> { - using GraphRef = VPRegionBlock *; - using nodes_iterator = df_iterator<NodeRef>; - - static NodeRef getEntryNode(Inverse<GraphRef> N) { - return N.Graph->getExit(); - } - - static nodes_iterator nodes_begin(GraphRef N) { - return nodes_iterator::begin(N->getExit()); - } - - static nodes_iterator nodes_end(GraphRef N) { - // df_iterator::end() returns an empty iterator so the node used doesn't - // matter. - return nodes_iterator::end(N); - } -}; - -//===----------------------------------------------------------------------===// // VPlan Utilities //===----------------------------------------------------------------------===// diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp index e5ab24e52df6..9019ed15ec5f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp @@ -346,11 +346,14 @@ SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() { void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) { dbgs() << " Ops: "; - for (auto Op : Values) - if (auto *Instr = cast_or_null<VPInstruction>(Op)->getUnderlyingInstr()) - dbgs() << *Instr << " | "; - else - dbgs() << " nullptr | "; + for (auto Op : Values) { + if (auto *VPInstr = cast_or_null<VPInstruction>(Op)) + if (auto *Instr = VPInstr->getUnderlyingInstr()) { + dbgs() << *Instr << " | "; + continue; + } + dbgs() << " nullptr | "; + } dbgs() << "\n"; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 7ed7d21b6caa..3f6a2efd55cc 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1,4 +1,4 @@ -//===-- VPlanHCFGTransforms.cpp - Utility VPlan to VPlan transforms -------===// +//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
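The VPlanSlp::dumpBundle hunk above fixes a latent null dereference: the old code called getUnderlyingInstr() directly on the result of cast_or_null<VPInstruction>(Op), which is null exactly when Op is null, so the "nullptr" branch could never be reached safely. The rewrite binds the cast result and tests it before the call. The same bug shape in a standalone sketch (toy types standing in for the VPlan classes):

#include <cstdio>

struct Instr { const char *Name; };
struct VPInstr {
  Instr *Underlying;
  Instr *getUnderlyingInstr() { return Underlying; }
};

void dumpOp(VPInstr *Op) {
  if (VPInstr *VPI = Op) { // stands in for cast_or_null<VPInstruction>(Op)
    if (Instr *I = VPI->getUnderlyingInstr()) {
      std::printf("%s | ", I->Name);
      return;
    }
  }
  std::printf(" nullptr | ");
}

int main() {
  Instr Add{"add"};
  VPInstr V{&Add}, NoUnderlying{nullptr};
  dumpOp(&V);            // add |
  dumpOp(&NoUnderlying); // nullptr |
  dumpOp(nullptr);       // nullptr |  (the case the old code crashed on)
  return 0;
}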
@@ -11,17 +11,17 @@ /// //===----------------------------------------------------------------------===// -#include "VPlanHCFGTransforms.h" +#include "VPlanTransforms.h" #include "llvm/ADT/PostOrderIterator.h" using namespace llvm; -void VPlanHCFGTransforms::VPInstructionsToVPRecipes( - VPlanPtr &Plan, +void VPlanTransforms::VPInstructionsToVPRecipes( + Loop *OrigLoop, VPlanPtr &Plan, LoopVectorizationLegality::InductionList *Inductions, SmallPtrSetImpl<Instruction *> &DeadInstructions) { - VPRegionBlock *TopRegion = dyn_cast<VPRegionBlock>(Plan->getEntry()); + auto *TopRegion = cast<VPRegionBlock>(Plan->getEntry()); ReversePostOrderTraversal<VPBlockBase *> RPOT(TopRegion->getEntry()); // Condition bit VPValues get deleted during transformation to VPRecipes. @@ -56,7 +56,9 @@ void VPlanHCFGTransforms::VPInstructionsToVPRecipes( VPRecipeBase *NewRecipe = nullptr; // Create VPWidenMemoryInstructionRecipe for loads and stores. if (isa<LoadInst>(Inst) || isa<StoreInst>(Inst)) - NewRecipe = new VPWidenMemoryInstructionRecipe(*Inst, nullptr /*Mask*/); + NewRecipe = new VPWidenMemoryInstructionRecipe( + *Inst, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)), + nullptr /*Mask*/); else if (PHINode *Phi = dyn_cast<PHINode>(Inst)) { InductionDescriptor II = Inductions->lookup(Phi); if (II.getKind() == InductionDescriptor::IK_IntInduction || @@ -64,6 +66,8 @@ void VPlanHCFGTransforms::VPInstructionsToVPRecipes( NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi); } else NewRecipe = new VPWidenPHIRecipe(Phi); + } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) { + NewRecipe = new VPWidenGEPRecipe(GEP, OrigLoop); } else { // If the last recipe is a VPWidenRecipe, add Inst to it instead of // creating a new recipe. diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 79a23c33184f..0d3bd7da09a7 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -1,4 +1,4 @@ -//===- VPlanHCFGTransforms.h - Utility VPlan to VPlan transforms ----------===// +//===- VPlanTransforms.h - Utility VPlan to VPlan transforms --------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -10,8 +10,8 @@ /// This file provides utility VPlan to VPlan transformations. //===----------------------------------------------------------------------===// -#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANHCFGTRANSFORMS_H -#define LLVM_TRANSFORMS_VECTORIZE_VPLANHCFGTRANSFORMS_H +#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H +#define LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H #include "VPlan.h" #include "llvm/IR/Instruction.h" @@ -19,17 +19,17 @@ namespace llvm { -class VPlanHCFGTransforms { +class VPlanTransforms { public: /// Replaces the VPInstructions in \p Plan with corresponding /// widen recipes. 
 static void VPInstructionsToVPRecipes(
-      VPlanPtr &Plan,
+      Loop *OrigLoop, VPlanPtr &Plan,
       LoopVectorizationLegality::InductionList *Inductions,
       SmallPtrSetImpl<Instruction *> &DeadInstructions);
 };

 } // namespace llvm

-#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANHCFGTRANSFORMS_H
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 7b6c228c229e..464498c29d89 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -37,7 +37,7 @@ class VPUser;
 // and live-outs which the VPlan will need to fix accordingly.
 class VPValue {
   friend class VPBuilder;
-  friend class VPlanHCFGTransforms;
+  friend class VPlanTransforms;
   friend class VPBasicBlock;
   friend class VPInterleavedAccessInfo;
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 394b1b93113b..ab3e7e2282e7 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -14,6 +14,7 @@
 #include "VPlanVerifier.h"
 #include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/Support/CommandLine.h"

 #define DEBUG_TYPE "loop-vectorize"
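
The GraphTraits specializations removed from VPlan.h above follow a standard LLVM idiom: generic algorithms such as df_iterator and ReversePostOrderTraversal never touch a node type directly, only a traits adapter that exposes an entry node and a child-iterator range. The sketch below is a minimal self-contained rendering of that contract; Block, depthFirst, and the trait hooks are toy stand-ins chosen for illustration, not LLVM's actual VPBlockBase API.

// toy_graph_traits.cpp -- illustrative only; LLVM's real GraphTraits lives in
// llvm/include/llvm/ADT/GraphTraits.h and is consumed by df_iterator.
#include <cstdio>
#include <set>
#include <vector>

// Toy stand-in for VPBlockBase: a named node that exposes its successors.
struct Block {
  const char *Name;
  std::vector<Block *> Succs;
};

// The adapter contract: algorithms never touch Block directly, only these
// three hooks, so any node type can opt in via a specialization.
template <typename GraphT> struct GraphTraits; // primary template, undefined

template <> struct GraphTraits<Block *> {
  using NodeRef = Block *;
  using ChildIteratorType = std::vector<Block *>::iterator;
  static NodeRef getEntryNode(NodeRef N) { return N; }
  static ChildIteratorType child_begin(NodeRef N) { return N->Succs.begin(); }
  static ChildIteratorType child_end(NodeRef N) { return N->Succs.end(); }
};

// A generic depth-first walk written purely against the traits, the same way
// llvm::df_iterator is; the Seen set keeps cycles from recursing forever.
template <typename NodeT> void depthFirst(NodeT Entry, std::set<NodeT> &Seen) {
  using GT = GraphTraits<NodeT>;
  if (!Seen.insert(Entry).second)
    return;
  std::printf("%s\n", Entry->Name);
  for (auto I = GT::child_begin(Entry), E = GT::child_end(Entry); I != E; ++I)
    depthFirst(*I, Seen);
}

int main() {
  Block Entry{"entry"}, Body{"body"}, Exit{"exit"};
  Entry.Succs = {&Body};
  Body.Succs = {&Body, &Exit}; // back-edge, handled by the Seen set
  std::set<Block *> Seen;
  depthFirst(&Entry, Seen); // prints: entry, body, exit
  return 0;
}

The same shape explains why the removed Inverse<VPBlockBase *> specialization only swaps getSuccessors() for getPredecessors(): the traversal algorithm is unchanged, and only the adapter differs.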

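The VPlanSLP.cpp hunk earlier in this diff fixes a latent crash in dumpBundle: cast_or_null returns null for a null operand, so calling getUnderlyingInstr() on its result dereferenced a null pointer before the else branch could print " nullptr | ". The fixed code tests the outer pointer first, then the wrapped instruction, and shares one fallback. Below is a self-contained sketch of the hazard and the corrected control flow; Instr, Wrapper, and dumpOp are hypothetical stand-ins for LLVM's VPInstruction machinery, not its real API.

// toy_dump_bundle.cpp -- illustrative stand-ins, not LLVM's classes.
#include <cstdio>

struct Instr {
  const char *Text;
};

// Like VPInstruction: may wrap an underlying instruction, or wrap nothing.
struct Wrapper {
  Instr *Underlying;
  Instr *getUnderlyingInstr() const { return Underlying; }
};

// Before the fix, the two "empty" cases were conflated: the code called
// W->getUnderlyingInstr() without first checking W itself, crashing when the
// outer pointer (the cast_or_null result) was null. The fixed shape tests
// the outer pointer, then the payload, and falls through to one fallback.
void dumpOp(const Wrapper *W) {
  if (W)
    if (const Instr *I = W->getUnderlyingInstr()) {
      std::printf("%s | ", I->Text);
      return;
    }
  std::printf(" nullptr | ");
}

int main() {
  Instr Add{"add i32 %a, %b"};
  Wrapper WithInstr{&Add}, Empty{nullptr};
  dumpOp(&WithInstr); // "add i32 %a, %b | "
  dumpOp(&Empty);     // " nullptr | "
  dumpOp(nullptr);    // " nullptr | ", the case that used to dereference null
  std::printf("\n");
  return 0;
}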