1 files changed, 574 insertions, 121 deletions
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 7e08120f923d..ad7d34b61470 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -19,6 +19,7 @@
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
 #include "llvm/Analysis/GuardUtils.h"
 #include "llvm/Analysis/LoopAnalysisManager.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -42,6 +43,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ProfDataUtils.h"
 #include "llvm/IR/Use.h"
 #include "llvm/IR/Value.h"
 #include "llvm/InitializePasses.h"
@@ -73,11 +75,14 @@ using namespace llvm::PatternMatch;
 
 STATISTIC(NumBranches, "Number of branches unswitched");
 STATISTIC(NumSwitches, "Number of switches unswitched");
+STATISTIC(NumSelects, "Number of selects turned into branches for unswitching");
 STATISTIC(NumGuards, "Number of guards turned into branches for unswitching");
 STATISTIC(NumTrivial, "Number of unswitches that are trivial");
 STATISTIC(
     NumCostMultiplierSkipped,
     "Number of unswitch candidates that had their cost multiplier skipped");
+STATISTIC(NumInvariantConditionsInjected,
+          "Number of invariant conditions injected and unswitched");
 
 static cl::opt<bool> EnableNonTrivialUnswitch(
     "enable-nontrivial-unswitch", cl::init(false), cl::Hidden,
@@ -118,15 +123,53 @@ static cl::opt<bool> FreezeLoopUnswitchCond(
     cl::desc("If enabled, the freeze instruction will be added to condition "
              "of loop unswitch to prevent miscompilation."));
 
+static cl::opt<bool> InjectInvariantConditions(
+    "simple-loop-unswitch-inject-invariant-conditions", cl::Hidden,
+    cl::desc("Whether we should inject new invariants and unswitch them to "
+             "eliminate some existing (non-invariant) conditions."),
+    cl::init(true));
+
+static cl::opt<unsigned> InjectInvariantConditionHotnesThreshold(
+    "simple-loop-unswitch-inject-invariant-condition-hotness-threshold",
+    cl::Hidden, cl::desc("Only try to inject loop invariant conditions and "
+                         "unswitch on them to eliminate branches that are "
+                         "not-taken 1/<this option> times or less."),
+    cl::init(16));
+
 namespace {
+struct CompareDesc {
+  BranchInst *Term;
+  Value *Invariant;
+  BasicBlock *InLoopSucc;
+
+  CompareDesc(BranchInst *Term, Value *Invariant, BasicBlock *InLoopSucc)
+      : Term(Term), Invariant(Invariant), InLoopSucc(InLoopSucc) {}
+};
+
+struct InjectedInvariant {
+  ICmpInst::Predicate Pred;
+  Value *LHS;
+  Value *RHS;
+  BasicBlock *InLoopSucc;
+
+  InjectedInvariant(ICmpInst::Predicate Pred, Value *LHS, Value *RHS,
+                    BasicBlock *InLoopSucc)
+      : Pred(Pred), LHS(LHS), RHS(RHS), InLoopSucc(InLoopSucc) {}
+};
+
 struct NonTrivialUnswitchCandidate {
   Instruction *TI = nullptr;
   TinyPtrVector<Value *> Invariants;
   std::optional<InstructionCost> Cost;
+  std::optional<InjectedInvariant> PendingInjection;
   NonTrivialUnswitchCandidate(
       Instruction *TI, ArrayRef<Value *> Invariants,
-      std::optional<InstructionCost> Cost = std::nullopt)
-      : TI(TI), Invariants(Invariants), Cost(Cost){};
+      std::optional<InstructionCost> Cost = std::nullopt,
+      std::optional<InjectedInvariant> PendingInjection = std::nullopt)
+      : TI(TI), Invariants(Invariants), Cost(Cost),
+        PendingInjection(PendingInjection) {};
+
+  bool hasPendingInjection() const { return PendingInjection.has_value(); }
 };
 } // end anonymous namespace.
 
@@ -434,10 +477,10 @@ static void hoistLoopToNewParent(Loop &L, BasicBlock &Preheader,
 // Return the top-most loop containing ExitBB and having ExitBB as exiting block
 // or the loop containing ExitBB, if there is no parent loop containing ExitBB
 // as exiting block.
-static const Loop *getTopMostExitingLoop(const BasicBlock *ExitBB,
-                                         const LoopInfo &LI) {
-  const Loop *TopMost = LI.getLoopFor(ExitBB);
-  const Loop *Current = TopMost;
+static Loop *getTopMostExitingLoop(const BasicBlock *ExitBB,
+                                   const LoopInfo &LI) {
+  Loop *TopMost = LI.getLoopFor(ExitBB);
+  Loop *Current = TopMost;
   while (Current) {
     if (Current->isLoopExiting(ExitBB))
       TopMost = Current;
@@ -750,15 +793,32 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
   Loop *OuterL = &L;
 
   if (DefaultExitBB) {
-    // Clear out the default destination temporarily to allow accurate
-    // predecessor lists to be examined below.
-    SI.setDefaultDest(nullptr);
     // Check the loop containing this exit.
-    Loop *ExitL = LI.getLoopFor(DefaultExitBB);
+    Loop *ExitL = getTopMostExitingLoop(DefaultExitBB, LI);
+    if (!ExitL || ExitL->contains(OuterL))
+      OuterL = ExitL;
+  }
+  for (unsigned Index : ExitCaseIndices) {
+    auto CaseI = SI.case_begin() + Index;
+    // Compute the outer loop from this exit.
+    Loop *ExitL = getTopMostExitingLoop(CaseI->getCaseSuccessor(), LI);
     if (!ExitL || ExitL->contains(OuterL))
       OuterL = ExitL;
   }
 
+  if (SE) {
+    if (OuterL)
+      SE->forgetLoop(OuterL);
+    else
+      SE->forgetTopmostLoop(&L);
+  }
+
+  if (DefaultExitBB) {
+    // Clear out the default destination temporarily to allow accurate
+    // predecessor lists to be examined below.
+    SI.setDefaultDest(nullptr);
+  }
+
   // Store the exit cases into a separate data structure and remove them from
   // the switch.
   SmallVector<std::tuple<ConstantInt *, BasicBlock *,
@@ -770,10 +830,6 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
   // and don't disrupt the earlier indices.
   for (unsigned Index : reverse(ExitCaseIndices)) {
     auto CaseI = SI.case_begin() + Index;
-    // Compute the outer loop from this exit.
-    Loop *ExitL = LI.getLoopFor(CaseI->getCaseSuccessor());
-    if (!ExitL || ExitL->contains(OuterL))
-      OuterL = ExitL;
     // Save the value of this case.
     auto W = SIW.getSuccessorWeight(CaseI->getSuccessorIndex());
     ExitCases.emplace_back(CaseI->getCaseValue(), CaseI->getCaseSuccessor(), W);
@@ -781,13 +837,6 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
     SIW.removeCase(CaseI);
   }
 
-  if (SE) {
-    if (OuterL)
-      SE->forgetLoop(OuterL);
-    else
-      SE->forgetTopmostLoop(&L);
-  }
-
   // Check if after this all of the remaining cases point at the same
   // successor.
   BasicBlock *CommonSuccBB = nullptr;
@@ -2079,7 +2128,7 @@ static void unswitchNontrivialInvariants(
     AssumptionCache &AC,
     function_ref<void(bool, bool, ArrayRef<Loop *>)> UnswitchCB,
     ScalarEvolution *SE, MemorySSAUpdater *MSSAU,
-    function_ref<void(Loop &, StringRef)> DestroyLoopCB) {
+    function_ref<void(Loop &, StringRef)> DestroyLoopCB, bool InsertFreeze) {
   auto *ParentBB = TI.getParent();
   BranchInst *BI = dyn_cast<BranchInst>(&TI);
   SwitchInst *SI = BI ? nullptr : cast<SwitchInst>(&TI);
@@ -2160,7 +2209,9 @@ static void unswitchNontrivialInvariants(
   SmallVector<BasicBlock *, 4> ExitBlocks;
   L.getUniqueExitBlocks(ExitBlocks);
   for (auto *ExitBB : ExitBlocks) {
-    Loop *NewOuterExitL = LI.getLoopFor(ExitBB);
+    // ExitBB can be an exit block for several levels in the loop nest. Make
+    // sure we find the top most.
+    Loop *NewOuterExitL = getTopMostExitingLoop(ExitBB, LI);
     if (!NewOuterExitL) {
       // We exited the entire nest with this block, so we're done.
       OuterExitL = nullptr;
@@ -2181,25 +2232,6 @@ static void unswitchNontrivialInvariants(
     SE->forgetBlockAndLoopDispositions();
   }
 
-  bool InsertFreeze = false;
-  if (FreezeLoopUnswitchCond) {
-    ICFLoopSafetyInfo SafetyInfo;
-    SafetyInfo.computeLoopSafetyInfo(&L);
-    InsertFreeze = !SafetyInfo.isGuaranteedToExecute(TI, &DT, &L);
-  }
-
-  // Perform the isGuaranteedNotToBeUndefOrPoison() query before the transform,
-  // otherwise the branch instruction will have been moved outside the loop
-  // already, and may imply that a poison condition is always UB.
-  Value *FullUnswitchCond = nullptr;
-  if (FullUnswitch) {
-    FullUnswitchCond =
-        BI ? skipTrivialSelect(BI->getCondition()) : SI->getCondition();
-    if (InsertFreeze)
-      InsertFreeze = !isGuaranteedNotToBeUndefOrPoison(
-          FullUnswitchCond, &AC, L.getLoopPreheader()->getTerminator(), &DT);
-  }
-
   // If the edge from this terminator to a successor dominates that successor,
   // store a map from each block in its dominator subtree to it. This lets us
   // tell when cloning for a particular successor if a block is dominated by
@@ -2274,10 +2306,11 @@ static void unswitchNontrivialInvariants(
       BasicBlock *ClonedPH = ClonedPHs.begin()->second;
       BI->setSuccessor(ClonedSucc, ClonedPH);
       BI->setSuccessor(1 - ClonedSucc, LoopPH);
+      Value *Cond = skipTrivialSelect(BI->getCondition());
       if (InsertFreeze)
-        FullUnswitchCond = new FreezeInst(
-            FullUnswitchCond, FullUnswitchCond->getName() + ".fr", BI);
-      BI->setCondition(FullUnswitchCond);
+        Cond = new FreezeInst(
+            Cond, Cond->getName() + ".fr", BI);
+      BI->setCondition(Cond);
       DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH});
     } else {
       assert(SI && "Must either be a branch or switch!");
@@ -2294,7 +2327,7 @@ static void unswitchNontrivialInvariants(
 
       if (InsertFreeze)
         SI->setCondition(new FreezeInst(
-            FullUnswitchCond, FullUnswitchCond->getName() + ".fr", SI));
+            SI->getCondition(), SI->getCondition()->getName() + ".fr", SI));
 
       // We need to use the set to populate domtree updates as even when there
       // are multiple cases pointing at the same successor we only want to
@@ -2593,6 +2626,57 @@ static InstructionCost computeDomSubtreeCost(
   return Cost;
 }
 
+/// Turns a select instruction into implicit control flow branch,
+/// making the following replacement:
+///
+/// head:
+///   --code before select--
+///   select %cond, %trueval, %falseval
+///   --code after select--
+///
+/// into
+///
+/// head:
+///   --code before select--
+///   br i1 %cond, label %then, label %tail
+///
+/// then:
+///   br %tail
+///
+/// tail:
+///   phi [ %trueval, %then ], [ %falseval, %head]
+///   unreachable
+///
+/// It also makes all relevant DT and LI updates, so that all structures are in
+/// valid state after this transform.
+static BranchInst *turnSelectIntoBranch(SelectInst *SI, DominatorTree &DT,
+                                        LoopInfo &LI, MemorySSAUpdater *MSSAU,
+                                        AssumptionCache *AC) {
+  LLVM_DEBUG(dbgs() << "Turning " << *SI << " into a branch.\n");
+  BasicBlock *HeadBB = SI->getParent();
+
+  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+  SplitBlockAndInsertIfThen(SI->getCondition(), SI, false,
+                            SI->getMetadata(LLVMContext::MD_prof), &DTU, &LI);
+  auto *CondBr = cast<BranchInst>(HeadBB->getTerminator());
+  BasicBlock *ThenBB = CondBr->getSuccessor(0),
+             *TailBB = CondBr->getSuccessor(1);
+  if (MSSAU)
+    MSSAU->moveAllAfterSpliceBlocks(HeadBB, TailBB, SI);
+
+  PHINode *Phi = PHINode::Create(SI->getType(), 2, "unswitched.select", SI);
+  Phi->addIncoming(SI->getTrueValue(), ThenBB);
+  Phi->addIncoming(SI->getFalseValue(), HeadBB);
+  SI->replaceAllUsesWith(Phi);
+  SI->eraseFromParent();
+
+  if (MSSAU && VerifyMemorySSA)
+    MSSAU->getMemorySSA()->verifyMemorySSA();
+
+  ++NumSelects;
+  return CondBr;
+}
+
 /// Turns a llvm.experimental.guard intrinsic into implicit control flow branch,
 /// making the following replacement:
 ///
@@ -2624,15 +2708,10 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L,
   if (MSSAU && VerifyMemorySSA)
      MSSAU->getMemorySSA()->verifyMemorySSA();
 
-  // Remove all CheckBB's successors from DomTree. A block can be seen among
-  // successors more than once, but for DomTree it should be added only once.
-  SmallPtrSet<BasicBlock *, 4> Successors;
-  for (auto *Succ : successors(CheckBB))
-    if (Successors.insert(Succ).second)
-      DTUpdates.push_back({DominatorTree::Delete, CheckBB, Succ});
-
+  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
   Instruction *DeoptBlockTerm =
-      SplitBlockAndInsertIfThen(GI->getArgOperand(0), GI, true);
+      SplitBlockAndInsertIfThen(GI->getArgOperand(0), GI, true,
+                                GI->getMetadata(LLVMContext::MD_prof), &DTU, &LI);
   BranchInst *CheckBI = cast<BranchInst>(CheckBB->getTerminator());
   // SplitBlockAndInsertIfThen inserts control flow that branches to
   // DeoptBlockTerm if the condition is true.  We want the opposite.
@@ -2649,20 +2728,6 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L,
   GI->moveBefore(DeoptBlockTerm);
   GI->setArgOperand(0, ConstantInt::getFalse(GI->getContext()));
 
-  // Add new successors of CheckBB into DomTree.
-  for (auto *Succ : successors(CheckBB))
-    DTUpdates.push_back({DominatorTree::Insert, CheckBB, Succ});
-
-  // Now the blocks that used to be CheckBB's successors are GuardedBlock's
-  // successors.
-  for (auto *Succ : Successors)
-    DTUpdates.push_back({DominatorTree::Insert, GuardedBlock, Succ});
-
-  // Make proper changes to DT.
-  DT.applyUpdates(DTUpdates);
-  // Inform LI of a new loop block.
-  L.addBasicBlockToLoop(GuardedBlock, LI);
-
   if (MSSAU) {
     MemoryDef *MD = cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(GI));
     MSSAU->moveToPlace(MD, DeoptBlock, MemorySSA::BeforeTerminator);
@@ -2670,6 +2735,8 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L,
       MSSAU->getMemorySSA()->verifyMemorySSA();
   }
 
+  if (VerifyLoopInfo)
+    LI.verify(DT);
   ++NumGuards;
   return CheckBI;
 }
@@ -2700,9 +2767,10 @@ static int CalculateUnswitchCostMultiplier(
   const BasicBlock *CondBlock = TI.getParent();
   if (DT.dominates(CondBlock, Latch) &&
       (isGuard(&TI) ||
-       llvm::count_if(successors(&TI), [&L](const BasicBlock *SuccBB) {
-         return L.contains(SuccBB);
-       }) <= 1)) {
+       (TI.isTerminator() &&
+        llvm::count_if(successors(&TI), [&L](const BasicBlock *SuccBB) {
+          return L.contains(SuccBB);
+        }) <= 1))) {
     NumCostMultiplierSkipped++;
     return 1;
   }
@@ -2711,12 +2779,17 @@ static int CalculateUnswitchCostMultiplier(
   int SiblingsCount = (ParentL ? ParentL->getSubLoopsVector().size()
                                : std::distance(LI.begin(), LI.end()));
   // Count amount of clones that all the candidates might cause during
-  // unswitching. Branch/guard counts as 1, switch counts as log2 of its cases.
+  // unswitching. Branch/guard/select counts as 1, switch counts as log2 of its
+  // cases.
   int UnswitchedClones = 0;
-  for (auto Candidate : UnswitchCandidates) {
+  for (const auto &Candidate : UnswitchCandidates) {
     const Instruction *CI = Candidate.TI;
     const BasicBlock *CondBlock = CI->getParent();
     bool SkipExitingSuccessors = DT.dominates(CondBlock, Latch);
+    if (isa<SelectInst>(CI)) {
+      UnswitchedClones++;
+      continue;
+    }
     if (isGuard(CI)) {
       if (!SkipExitingSuccessors)
         UnswitchedClones++;
@@ -2766,6 +2839,24 @@ static bool collectUnswitchCandidates(
     const Loop &L, const LoopInfo &LI, AAResults &AA,
     const MemorySSAUpdater *MSSAU) {
   assert(UnswitchCandidates.empty() && "Should be!");
+
+  auto AddUnswitchCandidatesForInst = [&](Instruction *I, Value *Cond) {
+    Cond = skipTrivialSelect(Cond);
+    if (isa<Constant>(Cond))
+      return;
+    if (L.isLoopInvariant(Cond)) {
+      UnswitchCandidates.push_back({I, {Cond}});
+      return;
+    }
+    if (match(Cond, m_CombineOr(m_LogicalAnd(), m_LogicalOr()))) {
+      TinyPtrVector<Value *> Invariants =
+          collectHomogenousInstGraphLoopInvariants(
+              L, *static_cast<Instruction *>(Cond), LI);
+      if (!Invariants.empty())
+        UnswitchCandidates.push_back({I, std::move(Invariants)});
+    }
+  };
+
   // Whether or not we should also collect guards in the loop.
   bool CollectGuards = false;
   if (UnswitchGuards) {
@@ -2779,15 +2870,20 @@ static bool collectUnswitchCandidates(
     if (LI.getLoopFor(BB) != &L)
       continue;
 
-    if (CollectGuards)
-      for (auto &I : *BB)
-        if (isGuard(&I)) {
-          auto *Cond =
-              skipTrivialSelect(cast<IntrinsicInst>(&I)->getArgOperand(0));
-          // TODO: Support AND, OR conditions and partial unswitching.
-          if (!isa<Constant>(Cond) && L.isLoopInvariant(Cond))
-            UnswitchCandidates.push_back({&I, {Cond}});
-        }
+    for (auto &I : *BB) {
+      if (auto *SI = dyn_cast<SelectInst>(&I)) {
+        auto *Cond = SI->getCondition();
+        // Do not unswitch vector selects and logical and/or selects
+        if (Cond->getType()->isIntegerTy(1) && !SI->getType()->isIntegerTy(1))
+          AddUnswitchCandidatesForInst(SI, Cond);
+      } else if (CollectGuards && isGuard(&I)) {
+        auto *Cond =
+            skipTrivialSelect(cast<IntrinsicInst>(&I)->getArgOperand(0));
+        // TODO: Support AND, OR conditions and partial unswitching.
+        if (!isa<Constant>(Cond) && L.isLoopInvariant(Cond))
+          UnswitchCandidates.push_back({&I, {Cond}});
+      }
+    }
 
     if (auto *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
       // We can only consider fully loop-invariant switch conditions as we need
@@ -2799,29 +2895,11 @@ static bool collectUnswitchCandidates(
     }
 
     auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
-    if (!BI || !BI->isConditional() || isa<Constant>(BI->getCondition()) ||
+    if (!BI || !BI->isConditional() ||
         BI->getSuccessor(0) == BI->getSuccessor(1))
       continue;
 
-    Value *Cond = skipTrivialSelect(BI->getCondition());
-    if (isa<Constant>(Cond))
-      continue;
-
-    if (L.isLoopInvariant(Cond)) {
-      UnswitchCandidates.push_back({BI, {Cond}});
-      continue;
-    }
-
-    Instruction &CondI = *cast<Instruction>(Cond);
-    if (match(&CondI, m_CombineOr(m_LogicalAnd(), m_LogicalOr()))) {
-      TinyPtrVector<Value *> Invariants =
-          collectHomogenousInstGraphLoopInvariants(L, CondI, LI);
-      if (Invariants.empty())
-        continue;
-
-      UnswitchCandidates.push_back({BI, std::move(Invariants)});
-      continue;
-    }
+    AddUnswitchCandidatesForInst(BI, BI->getCondition());
   }
 
   if (MSSAU && !findOptionMDForLoop(&L, "llvm.loop.unswitch.partial.disable") &&
@@ -2844,6 +2922,303 @@ static bool collectUnswitchCandidates(
   return !UnswitchCandidates.empty();
 }
 
+/// Tries to canonicalize condition described by:
+///
+///   br (LHS pred RHS), label IfTrue, label IfFalse
+///
+/// into its equivalent where `Pred` is something that we support for injected
+/// invariants (so far it is limited to ult), LHS in canonicalized form is
+/// non-invariant and RHS is an invariant.
+static void canonicalizeForInvariantConditionInjection(
+    ICmpInst::Predicate &Pred, Value *&LHS, Value *&RHS, BasicBlock *&IfTrue,
+    BasicBlock *&IfFalse, const Loop &L) {
+  if (!L.contains(IfTrue)) {
+    Pred = ICmpInst::getInversePredicate(Pred);
+    std::swap(IfTrue, IfFalse);
+  }
+
+  // Move loop-invariant argument to RHS position.
+  if (L.isLoopInvariant(LHS)) {
+    Pred = ICmpInst::getSwappedPredicate(Pred);
+    std::swap(LHS, RHS);
+  }
+
+  if (Pred == ICmpInst::ICMP_SGE && match(RHS, m_Zero())) {
+    // Turn "x >=s 0" into "x <u UMIN_INT"
+    Pred = ICmpInst::ICMP_ULT;
+    RHS = ConstantInt::get(
+        RHS->getContext(),
+        APInt::getSignedMinValue(RHS->getType()->getIntegerBitWidth()));
+  }
+}
+
+/// Returns true, if predicate described by ( \p Pred, \p LHS, \p RHS )
+/// succeeding into blocks ( \p IfTrue, \p IfFalse) can be optimized by
+/// injecting a loop-invariant condition.
+static bool shouldTryInjectInvariantCondition(
+    const ICmpInst::Predicate Pred, const Value *LHS, const Value *RHS,
+    const BasicBlock *IfTrue, const BasicBlock *IfFalse, const Loop &L) {
+  if (L.isLoopInvariant(LHS) || !L.isLoopInvariant(RHS))
+    return false;
+  // TODO: Support other predicates.
+  if (Pred != ICmpInst::ICMP_ULT)
+    return false;
+  // TODO: Support non-loop-exiting branches?
+  if (!L.contains(IfTrue) || L.contains(IfFalse))
+    return false;
+  // FIXME: For some reason this causes problems with MSSA updates, need to
+  // investigate why. So far, just don't unswitch latch.
+  if (L.getHeader() == IfTrue)
+    return false;
+  return true;
+}
+
+/// Returns true, if metadata on \p BI allows us to optimize branching into \p
+/// TakenSucc via injection of invariant conditions. The branch should be not
+/// enough and not previously unswitched, the information about this comes from
+/// the metadata.
+bool shouldTryInjectBasingOnMetadata(const BranchInst *BI,
+                                     const BasicBlock *TakenSucc) {
+  // Skip branches that have already been unswithed this way. After successful
+  // unswitching of injected condition, we will still have a copy of this loop
+  // which looks exactly the same as original one. To prevent the 2nd attempt
+  // of unswitching it in the same pass, mark this branch as "nothing to do
+  // here".
+  if (BI->hasMetadata("llvm.invariant.condition.injection.disabled"))
+    return false;
+  SmallVector<uint32_t> Weights;
+  if (!extractBranchWeights(*BI, Weights))
+    return false;
+  unsigned T = InjectInvariantConditionHotnesThreshold;
+  BranchProbability LikelyTaken(T - 1, T);
+
+  assert(Weights.size() == 2 && "Unexpected profile data!");
+  size_t Idx = BI->getSuccessor(0) == TakenSucc ? 0 : 1;
+  auto Num = Weights[Idx];
+  auto Denom = Weights[0] + Weights[1];
+  // Degenerate or overflowed metadata.
+  if (Denom == 0 || Num > Denom)
+    return false;
+  BranchProbability ActualTaken(Num, Denom);
+  if (LikelyTaken > ActualTaken)
+    return false;
+  return true;
+}
+
+/// Materialize pending invariant condition of the given candidate into IR. The
+/// injected loop-invariant condition implies the original loop-variant branch
+/// condition, so the materialization turns
+///
+/// loop_block:
+///   ...
+///   br i1 %variant_cond, label InLoopSucc, label OutOfLoopSucc
+///
+/// into
+///
+/// preheader:
+///   %invariant_cond = LHS pred RHS
+/// ...
+/// loop_block:
+///   br i1 %invariant_cond, label InLoopSucc, label OriginalCheck
+/// OriginalCheck:
+///   br i1 %variant_cond, label InLoopSucc, label OutOfLoopSucc
+/// ...
+static NonTrivialUnswitchCandidate
+injectPendingInvariantConditions(NonTrivialUnswitchCandidate Candidate, Loop &L,
+                                 DominatorTree &DT, LoopInfo &LI,
+                                 AssumptionCache &AC, MemorySSAUpdater *MSSAU) {
+  assert(Candidate.hasPendingInjection() && "Nothing to inject!");
+  BasicBlock *Preheader = L.getLoopPreheader();
+  assert(Preheader && "Loop is not in simplified form?");
+  assert(LI.getLoopFor(Candidate.TI->getParent()) == &L &&
+         "Unswitching branch of inner loop!");
+
+  auto Pred = Candidate.PendingInjection->Pred;
+  auto *LHS = Candidate.PendingInjection->LHS;
+  auto *RHS = Candidate.PendingInjection->RHS;
+  auto *InLoopSucc = Candidate.PendingInjection->InLoopSucc;
+  auto *TI = cast<BranchInst>(Candidate.TI);
+  auto *BB = Candidate.TI->getParent();
+  auto *OutOfLoopSucc = InLoopSucc == TI->getSuccessor(0) ? TI->getSuccessor(1)
+                                                          : TI->getSuccessor(0);
+  // FIXME: Remove this once limitation on successors is lifted.
+  assert(L.contains(InLoopSucc) && "Not supported yet!");
+  assert(!L.contains(OutOfLoopSucc) && "Not supported yet!");
+  auto &Ctx = BB->getContext();
+
+  IRBuilder<> Builder(Preheader->getTerminator());
+  assert(ICmpInst::isUnsigned(Pred) && "Not supported yet!");
+  if (LHS->getType() != RHS->getType()) {
+    if (LHS->getType()->getIntegerBitWidth() <
+        RHS->getType()->getIntegerBitWidth())
+      LHS = Builder.CreateZExt(LHS, RHS->getType(), LHS->getName() + ".wide");
+    else
+      RHS = Builder.CreateZExt(RHS, LHS->getType(), RHS->getName() + ".wide");
+  }
+  // Do not use builder here: CreateICmp may simplify this into a constant and
+  // unswitching will break. Better optimize it away later.
+  auto *InjectedCond =
+      ICmpInst::Create(Instruction::ICmp, Pred, LHS, RHS, "injected.cond",
+                       Preheader->getTerminator());
+  auto *OldCond = TI->getCondition();
+
+  BasicBlock *CheckBlock = BasicBlock::Create(Ctx, BB->getName() + ".check",
+                                              BB->getParent(), InLoopSucc);
+  Builder.SetInsertPoint(TI);
+  auto *InvariantBr =
+      Builder.CreateCondBr(InjectedCond, InLoopSucc, CheckBlock);
+
+  Builder.SetInsertPoint(CheckBlock);
+  auto *NewTerm = Builder.CreateCondBr(OldCond, InLoopSucc, OutOfLoopSucc);
+
+  TI->eraseFromParent();
+  // Prevent infinite unswitching.
+  NewTerm->setMetadata("llvm.invariant.condition.injection.disabled",
+                       MDNode::get(BB->getContext(), {}));
+
+  // Fixup phis.
+  for (auto &I : *InLoopSucc) {
+    auto *PN = dyn_cast<PHINode>(&I);
+    if (!PN)
+      break;
+    auto *Inc = PN->getIncomingValueForBlock(BB);
+    PN->addIncoming(Inc, CheckBlock);
+  }
+  OutOfLoopSucc->replacePhiUsesWith(BB, CheckBlock);
+
+  SmallVector<DominatorTree::UpdateType, 4> DTUpdates = {
+    { DominatorTree::Insert, BB, CheckBlock },
+    { DominatorTree::Insert, CheckBlock, InLoopSucc },
+    { DominatorTree::Insert, CheckBlock, OutOfLoopSucc },
+    { DominatorTree::Delete, BB, OutOfLoopSucc }
+  };
+
+  DT.applyUpdates(DTUpdates);
+  if (MSSAU)
+    MSSAU->applyUpdates(DTUpdates, DT);
+  L.addBasicBlockToLoop(CheckBlock, LI);
+
+#ifndef NDEBUG
+  DT.verify();
+  LI.verify(DT);
+  if (MSSAU && VerifyMemorySSA)
+    MSSAU->getMemorySSA()->verifyMemorySSA();
+#endif
+
+  // TODO: In fact, cost of unswitching a new invariant candidate is *slightly*
+  // higher because we have just inserted a new block. Need to think how to
+  // adjust the cost of injected candidates when it was first computed.
+  LLVM_DEBUG(dbgs() << "Injected a new loop-invariant branch " << *InvariantBr
+                    << " and considering it for unswitching.");
+  ++NumInvariantConditionsInjected;
+  return NonTrivialUnswitchCandidate(InvariantBr, { InjectedCond },
+                                     Candidate.Cost);
+}
+
+/// Given chain of loop branch conditions looking like:
+///   br (Variant < Invariant1)
+///   br (Variant < Invariant2)
+///   br (Variant < Invariant3)
+///   ...
+/// collect set of invariant conditions on which we want to unswitch, which
+/// look like:
+///   Invariant1 <= Invariant2
+///   Invariant2 <= Invariant3
+///   ...
+/// Though they might not immediately exist in the IR, we can still inject them.
+static bool insertCandidatesWithPendingInjections(
+    SmallVectorImpl<NonTrivialUnswitchCandidate> &UnswitchCandidates, Loop &L,
+    ICmpInst::Predicate Pred, ArrayRef<CompareDesc> Compares,
+    const DominatorTree &DT) {
+
+  assert(ICmpInst::isRelational(Pred));
+  assert(ICmpInst::isStrictPredicate(Pred));
+  if (Compares.size() < 2)
+    return false;
+  ICmpInst::Predicate NonStrictPred = ICmpInst::getNonStrictPredicate(Pred);
+  for (auto Prev = Compares.begin(), Next = Compares.begin() + 1;
+       Next != Compares.end(); ++Prev, ++Next) {
+    Value *LHS = Next->Invariant;
+    Value *RHS = Prev->Invariant;
+    BasicBlock *InLoopSucc = Prev->InLoopSucc;
+    InjectedInvariant ToInject(NonStrictPred, LHS, RHS, InLoopSucc);
+    NonTrivialUnswitchCandidate Candidate(Prev->Term, { LHS, RHS },
+                                          std::nullopt, std::move(ToInject));
+    UnswitchCandidates.push_back(std::move(Candidate));
+  }
+  return true;
+}
+
+/// Collect unswitch candidates by invariant conditions that are not immediately
+/// present in the loop. However, they can be injected into the code if we
+/// decide it's profitable.
+/// An example of such conditions is following:
+///
+///   for (...) {
+///     x = load ...
+///     if (! x <u C1) break;
+///     if (! x <u C2) break;
+///     <do something>
+///   }
+///
+/// We can unswitch by condition "C1 <=u C2". If that is true, then "x <u C1 <=
+/// C2" automatically implies "x <u C2", so we can get rid of one of
+/// loop-variant checks in unswitched loop version.
+static bool collectUnswitchCandidatesWithInjections(
+    SmallVectorImpl<NonTrivialUnswitchCandidate> &UnswitchCandidates,
+    IVConditionInfo &PartialIVInfo, Instruction *&PartialIVCondBranch, Loop &L,
+    const DominatorTree &DT, const LoopInfo &LI, AAResults &AA,
+    const MemorySSAUpdater *MSSAU) {
+  if (!InjectInvariantConditions)
+    return false;
+
+  if (!DT.isReachableFromEntry(L.getHeader()))
+    return false;
+  auto *Latch = L.getLoopLatch();
+  // Need to have a single latch and a preheader.
+  if (!Latch)
+    return false;
+  assert(L.getLoopPreheader() && "Must have a preheader!");
+
+  DenseMap<Value *, SmallVector<CompareDesc, 4> > CandidatesULT;
+  // Traverse the conditions that dominate latch (and therefore dominate each
+  // other).
+  for (auto *DTN = DT.getNode(Latch); L.contains(DTN->getBlock());
+       DTN = DTN->getIDom()) {
+    ICmpInst::Predicate Pred;
+    Value *LHS = nullptr, *RHS = nullptr;
+    BasicBlock *IfTrue = nullptr, *IfFalse = nullptr;
+    auto *BB = DTN->getBlock();
+    // Ignore inner loops.
+    if (LI.getLoopFor(BB) != &L)
+      continue;
+    auto *Term = BB->getTerminator();
+    if (!match(Term, m_Br(m_ICmp(Pred, m_Value(LHS), m_Value(RHS)),
+                          m_BasicBlock(IfTrue), m_BasicBlock(IfFalse))))
+      continue;
+    if (!LHS->getType()->isIntegerTy())
+      continue;
+    canonicalizeForInvariantConditionInjection(Pred, LHS, RHS, IfTrue, IfFalse,
+                                               L);
+    if (!shouldTryInjectInvariantCondition(Pred, LHS, RHS, IfTrue, IfFalse, L))
+      continue;
+    if (!shouldTryInjectBasingOnMetadata(cast<BranchInst>(Term), IfTrue))
+      continue;
+    // Strip ZEXT for unsigned predicate.
+    // TODO: once signed predicates are supported, also strip SEXT.
+    CompareDesc Desc(cast<BranchInst>(Term), RHS, IfTrue);
+    while (auto *Zext = dyn_cast<ZExtInst>(LHS))
+      LHS = Zext->getOperand(0);
+    CandidatesULT[LHS].push_back(Desc);
+  }
+
+  bool Found = false;
+  for (auto &It : CandidatesULT)
+    Found |= insertCandidatesWithPendingInjections(
+        UnswitchCandidates, L, ICmpInst::ICMP_ULT, It.second, DT);
+  return Found;
+}
+
 static bool isSafeForNoNTrivialUnswitching(Loop &L, LoopInfo &LI) {
   if (!L.isSafeToClone())
     return false;
@@ -2943,6 +3318,10 @@ static NonTrivialUnswitchCandidate findBestNonTrivialUnswitchCandidate(
   // cost for that terminator.
   auto ComputeUnswitchedCost = [&](Instruction &TI,
                                    bool FullUnswitch) -> InstructionCost {
+    // Unswitching selects unswitches the entire loop.
+    if (isa<SelectInst>(TI))
+      return LoopCost;
+
     BasicBlock &BB = *TI.getParent();
     SmallPtrSet<BasicBlock *, 4> Visited;
 
@@ -3003,10 +3382,11 @@ static NonTrivialUnswitchCandidate findBestNonTrivialUnswitchCandidate(
     Instruction &TI = *Candidate.TI;
     ArrayRef<Value *> Invariants = Candidate.Invariants;
     BranchInst *BI = dyn_cast<BranchInst>(&TI);
-    InstructionCost CandidateCost = ComputeUnswitchedCost(
-        TI, /*FullUnswitch*/ !BI ||
-                (Invariants.size() == 1 &&
-                 Invariants[0] == skipTrivialSelect(BI->getCondition())));
+    bool FullUnswitch =
+        !BI || Candidate.hasPendingInjection() ||
+        (Invariants.size() == 1 &&
+         Invariants[0] == skipTrivialSelect(BI->getCondition()));
+    InstructionCost CandidateCost = ComputeUnswitchedCost(TI, FullUnswitch);
     // Calculate cost multiplier which is a tool to limit potentially
     // exponential behavior of loop-unswitch.
     if (EnableUnswitchCostMultiplier) {
@@ -3033,6 +3413,32 @@ static NonTrivialUnswitchCandidate findBestNonTrivialUnswitchCandidate(
   return *Best;
 }
 
+// Insert a freeze on an unswitched branch if all is true:
+// 1. freeze-loop-unswitch-cond option is true
+// 2. The branch may not execute in the loop pre-transformation. If a branch may
+// not execute and could cause UB, it would always cause UB if it is hoisted outside
+// of the loop. Insert a freeze to prevent this case.
+// 3. The branch condition may be poison or undef
+static bool shouldInsertFreeze(Loop &L, Instruction &TI, DominatorTree &DT,
+                               AssumptionCache &AC) {
+  assert(isa<BranchInst>(TI) || isa<SwitchInst>(TI));
+  if (!FreezeLoopUnswitchCond)
+    return false;
+
+  ICFLoopSafetyInfo SafetyInfo;
+  SafetyInfo.computeLoopSafetyInfo(&L);
+  if (SafetyInfo.isGuaranteedToExecute(TI, &DT, &L))
+    return false;
+
+  Value *Cond;
+  if (BranchInst *BI = dyn_cast<BranchInst>(&TI))
+    Cond = skipTrivialSelect(BI->getCondition());
+  else
+    Cond = skipTrivialSelect(cast<SwitchInst>(&TI)->getCondition());
+  return !isGuaranteedNotToBeUndefOrPoison(
+      Cond, &AC, L.getLoopPreheader()->getTerminator(), &DT);
+}
+
 static bool unswitchBestCondition(
     Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
     AAResults &AA, TargetTransformInfo &TTI,
@@ -3044,9 +3450,13 @@ static bool unswitchBestCondition(
   SmallVector<NonTrivialUnswitchCandidate, 4> UnswitchCandidates;
   IVConditionInfo PartialIVInfo;
   Instruction *PartialIVCondBranch = nullptr;
+  collectUnswitchCandidates(UnswitchCandidates, PartialIVInfo,
+                            PartialIVCondBranch, L, LI, AA, MSSAU);
+  collectUnswitchCandidatesWithInjections(UnswitchCandidates, PartialIVInfo,
+                                          PartialIVCondBranch, L, DT, LI, AA,
+                                          MSSAU);
   // If we didn't find any candidates, we're done.
-  if (!collectUnswitchCandidates(UnswitchCandidates, PartialIVInfo,
-                                 PartialIVCondBranch, L, LI, AA, MSSAU))
+  if (UnswitchCandidates.empty())
     return false;
 
   LLVM_DEBUG(
@@ -3065,18 +3475,36 @@ static bool unswitchBestCondition(
     return false;
   }
 
+  if (Best.hasPendingInjection())
+    Best = injectPendingInvariantConditions(Best, L, DT, LI, AC, MSSAU);
+  assert(!Best.hasPendingInjection() &&
+         "All injections should have been done by now!");
+
   if (Best.TI != PartialIVCondBranch)
     PartialIVInfo.InstToDuplicate.clear();
 
-  // If the best candidate is a guard, turn it into a branch.
-  if (isGuard(Best.TI))
-    Best.TI =
-        turnGuardIntoBranch(cast<IntrinsicInst>(Best.TI), L, DT, LI, MSSAU);
+  bool InsertFreeze;
+  if (auto *SI = dyn_cast<SelectInst>(Best.TI)) {
+    // If the best candidate is a select, turn it into a branch. Select
+    // instructions with a poison conditional do not propagate poison, but
+    // branching on poison causes UB. Insert a freeze on the select
+    // conditional to prevent UB after turning the select into a branch.
+    InsertFreeze = !isGuaranteedNotToBeUndefOrPoison(
+        SI->getCondition(), &AC, L.getLoopPreheader()->getTerminator(), &DT);
+    Best.TI = turnSelectIntoBranch(SI, DT, LI, MSSAU, &AC);
+  } else {
+    // If the best candidate is a guard, turn it into a branch.
+    if (isGuard(Best.TI))
+      Best.TI =
+          turnGuardIntoBranch(cast<IntrinsicInst>(Best.TI), L, DT, LI, MSSAU);
+    InsertFreeze = shouldInsertFreeze(L, *Best.TI, DT, AC);
+  }
 
   LLVM_DEBUG(dbgs() << "  Unswitching non-trivial (cost = " << Best.Cost
                     << ") terminator: " << *Best.TI << "\n");
   unswitchNontrivialInvariants(L, *Best.TI, Best.Invariants, PartialIVInfo, DT,
-                               LI, AC, UnswitchCB, SE, MSSAU, DestroyLoopCB);
+                               LI, AC, UnswitchCB, SE, MSSAU, DestroyLoopCB,
+                               InsertFreeze);
   return true;
 }
 
@@ -3124,6 +3552,8 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
     return true;
   }
 
+  const Function *F = L.getHeader()->getParent();
+
   // Check whether we should continue with non-trivial conditions.
   // EnableNonTrivialUnswitch: Global variable that forces non-trivial
   //                           unswitching for testing and debugging.
@@ -3136,18 +3566,41 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
   // branches even on targets that have divergence.
   // https://bugs.llvm.org/show_bug.cgi?id=48819
   bool ContinueWithNonTrivial =
-      EnableNonTrivialUnswitch || (NonTrivial && !TTI.hasBranchDivergence());
+      EnableNonTrivialUnswitch || (NonTrivial && !TTI.hasBranchDivergence(F));
   if (!ContinueWithNonTrivial)
     return false;
 
   // Skip non-trivial unswitching for optsize functions.
-  if (L.getHeader()->getParent()->hasOptSize())
+  if (F->hasOptSize())
     return false;
 
-  // Skip cold loops, as unswitching them brings little benefit
-  // but increases the code size
-  if (PSI && PSI->hasProfileSummary() && BFI &&
-      PSI->isFunctionColdInCallGraph(L.getHeader()->getParent(), *BFI)) {
+  // Returns true if Loop L's loop nest is cold, i.e. if the headers of L,
+  // of the loops L is nested in, and of the loops nested in L are all cold.
+  auto IsLoopNestCold = [&](const Loop *L) {
+    // Check L and all of its parent loops.
+    auto *Parent = L;
+    while (Parent) {
+      if (!PSI->isColdBlock(Parent->getHeader(), BFI))
+        return false;
+      Parent = Parent->getParentLoop();
+    }
+    // Next check all loops nested within L.
+    SmallVector<const Loop *, 4> Worklist;
+    Worklist.insert(Worklist.end(), L->getSubLoops().begin(),
+                    L->getSubLoops().end());
+    while (!Worklist.empty()) {
+      auto *CurLoop = Worklist.pop_back_val();
+      if (!PSI->isColdBlock(CurLoop->getHeader(), BFI))
+        return false;
+      Worklist.insert(Worklist.end(), CurLoop->getSubLoops().begin(),
+                      CurLoop->getSubLoops().end());
+    }
+    return true;
+  };
+
+  // Skip cold loops in cold loop nests, as unswitching them brings little
+  // benefit but increases the code size
+  if (PSI && PSI->hasProfileSummary() && BFI && IsLoopNestCold(&L)) {
     LLVM_DEBUG(dbgs() << " Skip cold loop: " << L << "\n");
     return false;
   }
@@ -3249,10 +3702,10 @@ void SimpleLoopUnswitchPass::printPipeline(
   static_cast<PassInfoMixin<SimpleLoopUnswitchPass> *>(this)->printPipeline(
       OS, MapClassName2PassName);
 
-  OS << "<";
+  OS << '<';
   OS << (NonTrivial ? "" : "no-") << "nontrivial;";
   OS << (Trivial ? "" : "no-") << "trivial";
-  OS << ">";
+  OS << '>';
 }
 
 namespace {