diff options
Diffstat (limited to 'llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp')
-rw-r--r-- | llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp | 695 |
1 files changed, 574 insertions, 121 deletions
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index 7e08120f923d..ad7d34b61470 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -19,6 +19,7 @@ #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/GuardUtils.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" @@ -42,6 +43,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" @@ -73,11 +75,14 @@ using namespace llvm::PatternMatch; STATISTIC(NumBranches, "Number of branches unswitched"); STATISTIC(NumSwitches, "Number of switches unswitched"); +STATISTIC(NumSelects, "Number of selects turned into branches for unswitching"); STATISTIC(NumGuards, "Number of guards turned into branches for unswitching"); STATISTIC(NumTrivial, "Number of unswitches that are trivial"); STATISTIC( NumCostMultiplierSkipped, "Number of unswitch candidates that had their cost multiplier skipped"); +STATISTIC(NumInvariantConditionsInjected, + "Number of invariant conditions injected and unswitched"); static cl::opt<bool> EnableNonTrivialUnswitch( "enable-nontrivial-unswitch", cl::init(false), cl::Hidden, @@ -118,15 +123,53 @@ static cl::opt<bool> FreezeLoopUnswitchCond( cl::desc("If enabled, the freeze instruction will be added to condition " "of loop unswitch to prevent miscompilation.")); +static cl::opt<bool> InjectInvariantConditions( + "simple-loop-unswitch-inject-invariant-conditions", cl::Hidden, + cl::desc("Whether we should inject new invariants and unswitch them to " + "eliminate some existing (non-invariant) conditions."), + cl::init(true)); + +static cl::opt<unsigned> InjectInvariantConditionHotnesThreshold( + "simple-loop-unswitch-inject-invariant-condition-hotness-threshold", + cl::Hidden, cl::desc("Only try to inject loop invariant conditions and " + "unswitch on them to eliminate branches that are " + "not-taken 1/<this option> times or less."), + cl::init(16)); + namespace { +struct CompareDesc { + BranchInst *Term; + Value *Invariant; + BasicBlock *InLoopSucc; + + CompareDesc(BranchInst *Term, Value *Invariant, BasicBlock *InLoopSucc) + : Term(Term), Invariant(Invariant), InLoopSucc(InLoopSucc) {} +}; + +struct InjectedInvariant { + ICmpInst::Predicate Pred; + Value *LHS; + Value *RHS; + BasicBlock *InLoopSucc; + + InjectedInvariant(ICmpInst::Predicate Pred, Value *LHS, Value *RHS, + BasicBlock *InLoopSucc) + : Pred(Pred), LHS(LHS), RHS(RHS), InLoopSucc(InLoopSucc) {} +}; + struct NonTrivialUnswitchCandidate { Instruction *TI = nullptr; TinyPtrVector<Value *> Invariants; std::optional<InstructionCost> Cost; + std::optional<InjectedInvariant> PendingInjection; NonTrivialUnswitchCandidate( Instruction *TI, ArrayRef<Value *> Invariants, - std::optional<InstructionCost> Cost = std::nullopt) - : TI(TI), Invariants(Invariants), Cost(Cost){}; + std::optional<InstructionCost> Cost = std::nullopt, + std::optional<InjectedInvariant> PendingInjection = std::nullopt) + : TI(TI), Invariants(Invariants), Cost(Cost), + PendingInjection(PendingInjection) {}; + + bool hasPendingInjection() const { return PendingInjection.has_value(); } }; } // end anonymous namespace. @@ -434,10 +477,10 @@ static void hoistLoopToNewParent(Loop &L, BasicBlock &Preheader, // Return the top-most loop containing ExitBB and having ExitBB as exiting block // or the loop containing ExitBB, if there is no parent loop containing ExitBB // as exiting block. -static const Loop *getTopMostExitingLoop(const BasicBlock *ExitBB, - const LoopInfo &LI) { - const Loop *TopMost = LI.getLoopFor(ExitBB); - const Loop *Current = TopMost; +static Loop *getTopMostExitingLoop(const BasicBlock *ExitBB, + const LoopInfo &LI) { + Loop *TopMost = LI.getLoopFor(ExitBB); + Loop *Current = TopMost; while (Current) { if (Current->isLoopExiting(ExitBB)) TopMost = Current; @@ -750,15 +793,32 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, Loop *OuterL = &L; if (DefaultExitBB) { - // Clear out the default destination temporarily to allow accurate - // predecessor lists to be examined below. - SI.setDefaultDest(nullptr); // Check the loop containing this exit. - Loop *ExitL = LI.getLoopFor(DefaultExitBB); + Loop *ExitL = getTopMostExitingLoop(DefaultExitBB, LI); + if (!ExitL || ExitL->contains(OuterL)) + OuterL = ExitL; + } + for (unsigned Index : ExitCaseIndices) { + auto CaseI = SI.case_begin() + Index; + // Compute the outer loop from this exit. + Loop *ExitL = getTopMostExitingLoop(CaseI->getCaseSuccessor(), LI); if (!ExitL || ExitL->contains(OuterL)) OuterL = ExitL; } + if (SE) { + if (OuterL) + SE->forgetLoop(OuterL); + else + SE->forgetTopmostLoop(&L); + } + + if (DefaultExitBB) { + // Clear out the default destination temporarily to allow accurate + // predecessor lists to be examined below. + SI.setDefaultDest(nullptr); + } + // Store the exit cases into a separate data structure and remove them from // the switch. SmallVector<std::tuple<ConstantInt *, BasicBlock *, @@ -770,10 +830,6 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, // and don't disrupt the earlier indices. for (unsigned Index : reverse(ExitCaseIndices)) { auto CaseI = SI.case_begin() + Index; - // Compute the outer loop from this exit. - Loop *ExitL = LI.getLoopFor(CaseI->getCaseSuccessor()); - if (!ExitL || ExitL->contains(OuterL)) - OuterL = ExitL; // Save the value of this case. auto W = SIW.getSuccessorWeight(CaseI->getSuccessorIndex()); ExitCases.emplace_back(CaseI->getCaseValue(), CaseI->getCaseSuccessor(), W); @@ -781,13 +837,6 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, SIW.removeCase(CaseI); } - if (SE) { - if (OuterL) - SE->forgetLoop(OuterL); - else - SE->forgetTopmostLoop(&L); - } - // Check if after this all of the remaining cases point at the same // successor. BasicBlock *CommonSuccBB = nullptr; @@ -2079,7 +2128,7 @@ static void unswitchNontrivialInvariants( AssumptionCache &AC, function_ref<void(bool, bool, ArrayRef<Loop *>)> UnswitchCB, ScalarEvolution *SE, MemorySSAUpdater *MSSAU, - function_ref<void(Loop &, StringRef)> DestroyLoopCB) { + function_ref<void(Loop &, StringRef)> DestroyLoopCB, bool InsertFreeze) { auto *ParentBB = TI.getParent(); BranchInst *BI = dyn_cast<BranchInst>(&TI); SwitchInst *SI = BI ? nullptr : cast<SwitchInst>(&TI); @@ -2160,7 +2209,9 @@ static void unswitchNontrivialInvariants( SmallVector<BasicBlock *, 4> ExitBlocks; L.getUniqueExitBlocks(ExitBlocks); for (auto *ExitBB : ExitBlocks) { - Loop *NewOuterExitL = LI.getLoopFor(ExitBB); + // ExitBB can be an exit block for several levels in the loop nest. Make + // sure we find the top most. + Loop *NewOuterExitL = getTopMostExitingLoop(ExitBB, LI); if (!NewOuterExitL) { // We exited the entire nest with this block, so we're done. OuterExitL = nullptr; @@ -2181,25 +2232,6 @@ static void unswitchNontrivialInvariants( SE->forgetBlockAndLoopDispositions(); } - bool InsertFreeze = false; - if (FreezeLoopUnswitchCond) { - ICFLoopSafetyInfo SafetyInfo; - SafetyInfo.computeLoopSafetyInfo(&L); - InsertFreeze = !SafetyInfo.isGuaranteedToExecute(TI, &DT, &L); - } - - // Perform the isGuaranteedNotToBeUndefOrPoison() query before the transform, - // otherwise the branch instruction will have been moved outside the loop - // already, and may imply that a poison condition is always UB. - Value *FullUnswitchCond = nullptr; - if (FullUnswitch) { - FullUnswitchCond = - BI ? skipTrivialSelect(BI->getCondition()) : SI->getCondition(); - if (InsertFreeze) - InsertFreeze = !isGuaranteedNotToBeUndefOrPoison( - FullUnswitchCond, &AC, L.getLoopPreheader()->getTerminator(), &DT); - } - // If the edge from this terminator to a successor dominates that successor, // store a map from each block in its dominator subtree to it. This lets us // tell when cloning for a particular successor if a block is dominated by @@ -2274,10 +2306,11 @@ static void unswitchNontrivialInvariants( BasicBlock *ClonedPH = ClonedPHs.begin()->second; BI->setSuccessor(ClonedSucc, ClonedPH); BI->setSuccessor(1 - ClonedSucc, LoopPH); + Value *Cond = skipTrivialSelect(BI->getCondition()); if (InsertFreeze) - FullUnswitchCond = new FreezeInst( - FullUnswitchCond, FullUnswitchCond->getName() + ".fr", BI); - BI->setCondition(FullUnswitchCond); + Cond = new FreezeInst( + Cond, Cond->getName() + ".fr", BI); + BI->setCondition(Cond); DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH}); } else { assert(SI && "Must either be a branch or switch!"); @@ -2294,7 +2327,7 @@ static void unswitchNontrivialInvariants( if (InsertFreeze) SI->setCondition(new FreezeInst( - FullUnswitchCond, FullUnswitchCond->getName() + ".fr", SI)); + SI->getCondition(), SI->getCondition()->getName() + ".fr", SI)); // We need to use the set to populate domtree updates as even when there // are multiple cases pointing at the same successor we only want to @@ -2593,6 +2626,57 @@ static InstructionCost computeDomSubtreeCost( return Cost; } +/// Turns a select instruction into implicit control flow branch, +/// making the following replacement: +/// +/// head: +/// --code before select-- +/// select %cond, %trueval, %falseval +/// --code after select-- +/// +/// into +/// +/// head: +/// --code before select-- +/// br i1 %cond, label %then, label %tail +/// +/// then: +/// br %tail +/// +/// tail: +/// phi [ %trueval, %then ], [ %falseval, %head] +/// unreachable +/// +/// It also makes all relevant DT and LI updates, so that all structures are in +/// valid state after this transform. +static BranchInst *turnSelectIntoBranch(SelectInst *SI, DominatorTree &DT, + LoopInfo &LI, MemorySSAUpdater *MSSAU, + AssumptionCache *AC) { + LLVM_DEBUG(dbgs() << "Turning " << *SI << " into a branch.\n"); + BasicBlock *HeadBB = SI->getParent(); + + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + SplitBlockAndInsertIfThen(SI->getCondition(), SI, false, + SI->getMetadata(LLVMContext::MD_prof), &DTU, &LI); + auto *CondBr = cast<BranchInst>(HeadBB->getTerminator()); + BasicBlock *ThenBB = CondBr->getSuccessor(0), + *TailBB = CondBr->getSuccessor(1); + if (MSSAU) + MSSAU->moveAllAfterSpliceBlocks(HeadBB, TailBB, SI); + + PHINode *Phi = PHINode::Create(SI->getType(), 2, "unswitched.select", SI); + Phi->addIncoming(SI->getTrueValue(), ThenBB); + Phi->addIncoming(SI->getFalseValue(), HeadBB); + SI->replaceAllUsesWith(Phi); + SI->eraseFromParent(); + + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); + + ++NumSelects; + return CondBr; +} + /// Turns a llvm.experimental.guard intrinsic into implicit control flow branch, /// making the following replacement: /// @@ -2624,15 +2708,10 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, if (MSSAU && VerifyMemorySSA) MSSAU->getMemorySSA()->verifyMemorySSA(); - // Remove all CheckBB's successors from DomTree. A block can be seen among - // successors more than once, but for DomTree it should be added only once. - SmallPtrSet<BasicBlock *, 4> Successors; - for (auto *Succ : successors(CheckBB)) - if (Successors.insert(Succ).second) - DTUpdates.push_back({DominatorTree::Delete, CheckBB, Succ}); - + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); Instruction *DeoptBlockTerm = - SplitBlockAndInsertIfThen(GI->getArgOperand(0), GI, true); + SplitBlockAndInsertIfThen(GI->getArgOperand(0), GI, true, + GI->getMetadata(LLVMContext::MD_prof), &DTU, &LI); BranchInst *CheckBI = cast<BranchInst>(CheckBB->getTerminator()); // SplitBlockAndInsertIfThen inserts control flow that branches to // DeoptBlockTerm if the condition is true. We want the opposite. @@ -2649,20 +2728,6 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, GI->moveBefore(DeoptBlockTerm); GI->setArgOperand(0, ConstantInt::getFalse(GI->getContext())); - // Add new successors of CheckBB into DomTree. - for (auto *Succ : successors(CheckBB)) - DTUpdates.push_back({DominatorTree::Insert, CheckBB, Succ}); - - // Now the blocks that used to be CheckBB's successors are GuardedBlock's - // successors. - for (auto *Succ : Successors) - DTUpdates.push_back({DominatorTree::Insert, GuardedBlock, Succ}); - - // Make proper changes to DT. - DT.applyUpdates(DTUpdates); - // Inform LI of a new loop block. - L.addBasicBlockToLoop(GuardedBlock, LI); - if (MSSAU) { MemoryDef *MD = cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(GI)); MSSAU->moveToPlace(MD, DeoptBlock, MemorySSA::BeforeTerminator); @@ -2670,6 +2735,8 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, MSSAU->getMemorySSA()->verifyMemorySSA(); } + if (VerifyLoopInfo) + LI.verify(DT); ++NumGuards; return CheckBI; } @@ -2700,9 +2767,10 @@ static int CalculateUnswitchCostMultiplier( const BasicBlock *CondBlock = TI.getParent(); if (DT.dominates(CondBlock, Latch) && (isGuard(&TI) || - llvm::count_if(successors(&TI), [&L](const BasicBlock *SuccBB) { - return L.contains(SuccBB); - }) <= 1)) { + (TI.isTerminator() && + llvm::count_if(successors(&TI), [&L](const BasicBlock *SuccBB) { + return L.contains(SuccBB); + }) <= 1))) { NumCostMultiplierSkipped++; return 1; } @@ -2711,12 +2779,17 @@ static int CalculateUnswitchCostMultiplier( int SiblingsCount = (ParentL ? ParentL->getSubLoopsVector().size() : std::distance(LI.begin(), LI.end())); // Count amount of clones that all the candidates might cause during - // unswitching. Branch/guard counts as 1, switch counts as log2 of its cases. + // unswitching. Branch/guard/select counts as 1, switch counts as log2 of its + // cases. int UnswitchedClones = 0; - for (auto Candidate : UnswitchCandidates) { + for (const auto &Candidate : UnswitchCandidates) { const Instruction *CI = Candidate.TI; const BasicBlock *CondBlock = CI->getParent(); bool SkipExitingSuccessors = DT.dominates(CondBlock, Latch); + if (isa<SelectInst>(CI)) { + UnswitchedClones++; + continue; + } if (isGuard(CI)) { if (!SkipExitingSuccessors) UnswitchedClones++; @@ -2766,6 +2839,24 @@ static bool collectUnswitchCandidates( const Loop &L, const LoopInfo &LI, AAResults &AA, const MemorySSAUpdater *MSSAU) { assert(UnswitchCandidates.empty() && "Should be!"); + + auto AddUnswitchCandidatesForInst = [&](Instruction *I, Value *Cond) { + Cond = skipTrivialSelect(Cond); + if (isa<Constant>(Cond)) + return; + if (L.isLoopInvariant(Cond)) { + UnswitchCandidates.push_back({I, {Cond}}); + return; + } + if (match(Cond, m_CombineOr(m_LogicalAnd(), m_LogicalOr()))) { + TinyPtrVector<Value *> Invariants = + collectHomogenousInstGraphLoopInvariants( + L, *static_cast<Instruction *>(Cond), LI); + if (!Invariants.empty()) + UnswitchCandidates.push_back({I, std::move(Invariants)}); + } + }; + // Whether or not we should also collect guards in the loop. bool CollectGuards = false; if (UnswitchGuards) { @@ -2779,15 +2870,20 @@ static bool collectUnswitchCandidates( if (LI.getLoopFor(BB) != &L) continue; - if (CollectGuards) - for (auto &I : *BB) - if (isGuard(&I)) { - auto *Cond = - skipTrivialSelect(cast<IntrinsicInst>(&I)->getArgOperand(0)); - // TODO: Support AND, OR conditions and partial unswitching. - if (!isa<Constant>(Cond) && L.isLoopInvariant(Cond)) - UnswitchCandidates.push_back({&I, {Cond}}); - } + for (auto &I : *BB) { + if (auto *SI = dyn_cast<SelectInst>(&I)) { + auto *Cond = SI->getCondition(); + // Do not unswitch vector selects and logical and/or selects + if (Cond->getType()->isIntegerTy(1) && !SI->getType()->isIntegerTy(1)) + AddUnswitchCandidatesForInst(SI, Cond); + } else if (CollectGuards && isGuard(&I)) { + auto *Cond = + skipTrivialSelect(cast<IntrinsicInst>(&I)->getArgOperand(0)); + // TODO: Support AND, OR conditions and partial unswitching. + if (!isa<Constant>(Cond) && L.isLoopInvariant(Cond)) + UnswitchCandidates.push_back({&I, {Cond}}); + } + } if (auto *SI = dyn_cast<SwitchInst>(BB->getTerminator())) { // We can only consider fully loop-invariant switch conditions as we need @@ -2799,29 +2895,11 @@ static bool collectUnswitchCandidates( } auto *BI = dyn_cast<BranchInst>(BB->getTerminator()); - if (!BI || !BI->isConditional() || isa<Constant>(BI->getCondition()) || + if (!BI || !BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) continue; - Value *Cond = skipTrivialSelect(BI->getCondition()); - if (isa<Constant>(Cond)) - continue; - - if (L.isLoopInvariant(Cond)) { - UnswitchCandidates.push_back({BI, {Cond}}); - continue; - } - - Instruction &CondI = *cast<Instruction>(Cond); - if (match(&CondI, m_CombineOr(m_LogicalAnd(), m_LogicalOr()))) { - TinyPtrVector<Value *> Invariants = - collectHomogenousInstGraphLoopInvariants(L, CondI, LI); - if (Invariants.empty()) - continue; - - UnswitchCandidates.push_back({BI, std::move(Invariants)}); - continue; - } + AddUnswitchCandidatesForInst(BI, BI->getCondition()); } if (MSSAU && !findOptionMDForLoop(&L, "llvm.loop.unswitch.partial.disable") && @@ -2844,6 +2922,303 @@ static bool collectUnswitchCandidates( return !UnswitchCandidates.empty(); } +/// Tries to canonicalize condition described by: +/// +/// br (LHS pred RHS), label IfTrue, label IfFalse +/// +/// into its equivalent where `Pred` is something that we support for injected +/// invariants (so far it is limited to ult), LHS in canonicalized form is +/// non-invariant and RHS is an invariant. +static void canonicalizeForInvariantConditionInjection( + ICmpInst::Predicate &Pred, Value *&LHS, Value *&RHS, BasicBlock *&IfTrue, + BasicBlock *&IfFalse, const Loop &L) { + if (!L.contains(IfTrue)) { + Pred = ICmpInst::getInversePredicate(Pred); + std::swap(IfTrue, IfFalse); + } + + // Move loop-invariant argument to RHS position. + if (L.isLoopInvariant(LHS)) { + Pred = ICmpInst::getSwappedPredicate(Pred); + std::swap(LHS, RHS); + } + + if (Pred == ICmpInst::ICMP_SGE && match(RHS, m_Zero())) { + // Turn "x >=s 0" into "x <u UMIN_INT" + Pred = ICmpInst::ICMP_ULT; + RHS = ConstantInt::get( + RHS->getContext(), + APInt::getSignedMinValue(RHS->getType()->getIntegerBitWidth())); + } +} + +/// Returns true, if predicate described by ( \p Pred, \p LHS, \p RHS ) +/// succeeding into blocks ( \p IfTrue, \p IfFalse) can be optimized by +/// injecting a loop-invariant condition. +static bool shouldTryInjectInvariantCondition( + const ICmpInst::Predicate Pred, const Value *LHS, const Value *RHS, + const BasicBlock *IfTrue, const BasicBlock *IfFalse, const Loop &L) { + if (L.isLoopInvariant(LHS) || !L.isLoopInvariant(RHS)) + return false; + // TODO: Support other predicates. + if (Pred != ICmpInst::ICMP_ULT) + return false; + // TODO: Support non-loop-exiting branches? + if (!L.contains(IfTrue) || L.contains(IfFalse)) + return false; + // FIXME: For some reason this causes problems with MSSA updates, need to + // investigate why. So far, just don't unswitch latch. + if (L.getHeader() == IfTrue) + return false; + return true; +} + +/// Returns true, if metadata on \p BI allows us to optimize branching into \p +/// TakenSucc via injection of invariant conditions. The branch should be not +/// enough and not previously unswitched, the information about this comes from +/// the metadata. +bool shouldTryInjectBasingOnMetadata(const BranchInst *BI, + const BasicBlock *TakenSucc) { + // Skip branches that have already been unswithed this way. After successful + // unswitching of injected condition, we will still have a copy of this loop + // which looks exactly the same as original one. To prevent the 2nd attempt + // of unswitching it in the same pass, mark this branch as "nothing to do + // here". + if (BI->hasMetadata("llvm.invariant.condition.injection.disabled")) + return false; + SmallVector<uint32_t> Weights; + if (!extractBranchWeights(*BI, Weights)) + return false; + unsigned T = InjectInvariantConditionHotnesThreshold; + BranchProbability LikelyTaken(T - 1, T); + + assert(Weights.size() == 2 && "Unexpected profile data!"); + size_t Idx = BI->getSuccessor(0) == TakenSucc ? 0 : 1; + auto Num = Weights[Idx]; + auto Denom = Weights[0] + Weights[1]; + // Degenerate or overflowed metadata. + if (Denom == 0 || Num > Denom) + return false; + BranchProbability ActualTaken(Num, Denom); + if (LikelyTaken > ActualTaken) + return false; + return true; +} + +/// Materialize pending invariant condition of the given candidate into IR. The +/// injected loop-invariant condition implies the original loop-variant branch +/// condition, so the materialization turns +/// +/// loop_block: +/// ... +/// br i1 %variant_cond, label InLoopSucc, label OutOfLoopSucc +/// +/// into +/// +/// preheader: +/// %invariant_cond = LHS pred RHS +/// ... +/// loop_block: +/// br i1 %invariant_cond, label InLoopSucc, label OriginalCheck +/// OriginalCheck: +/// br i1 %variant_cond, label InLoopSucc, label OutOfLoopSucc +/// ... +static NonTrivialUnswitchCandidate +injectPendingInvariantConditions(NonTrivialUnswitchCandidate Candidate, Loop &L, + DominatorTree &DT, LoopInfo &LI, + AssumptionCache &AC, MemorySSAUpdater *MSSAU) { + assert(Candidate.hasPendingInjection() && "Nothing to inject!"); + BasicBlock *Preheader = L.getLoopPreheader(); + assert(Preheader && "Loop is not in simplified form?"); + assert(LI.getLoopFor(Candidate.TI->getParent()) == &L && + "Unswitching branch of inner loop!"); + + auto Pred = Candidate.PendingInjection->Pred; + auto *LHS = Candidate.PendingInjection->LHS; + auto *RHS = Candidate.PendingInjection->RHS; + auto *InLoopSucc = Candidate.PendingInjection->InLoopSucc; + auto *TI = cast<BranchInst>(Candidate.TI); + auto *BB = Candidate.TI->getParent(); + auto *OutOfLoopSucc = InLoopSucc == TI->getSuccessor(0) ? TI->getSuccessor(1) + : TI->getSuccessor(0); + // FIXME: Remove this once limitation on successors is lifted. + assert(L.contains(InLoopSucc) && "Not supported yet!"); + assert(!L.contains(OutOfLoopSucc) && "Not supported yet!"); + auto &Ctx = BB->getContext(); + + IRBuilder<> Builder(Preheader->getTerminator()); + assert(ICmpInst::isUnsigned(Pred) && "Not supported yet!"); + if (LHS->getType() != RHS->getType()) { + if (LHS->getType()->getIntegerBitWidth() < + RHS->getType()->getIntegerBitWidth()) + LHS = Builder.CreateZExt(LHS, RHS->getType(), LHS->getName() + ".wide"); + else + RHS = Builder.CreateZExt(RHS, LHS->getType(), RHS->getName() + ".wide"); + } + // Do not use builder here: CreateICmp may simplify this into a constant and + // unswitching will break. Better optimize it away later. + auto *InjectedCond = + ICmpInst::Create(Instruction::ICmp, Pred, LHS, RHS, "injected.cond", + Preheader->getTerminator()); + auto *OldCond = TI->getCondition(); + + BasicBlock *CheckBlock = BasicBlock::Create(Ctx, BB->getName() + ".check", + BB->getParent(), InLoopSucc); + Builder.SetInsertPoint(TI); + auto *InvariantBr = + Builder.CreateCondBr(InjectedCond, InLoopSucc, CheckBlock); + + Builder.SetInsertPoint(CheckBlock); + auto *NewTerm = Builder.CreateCondBr(OldCond, InLoopSucc, OutOfLoopSucc); + + TI->eraseFromParent(); + // Prevent infinite unswitching. + NewTerm->setMetadata("llvm.invariant.condition.injection.disabled", + MDNode::get(BB->getContext(), {})); + + // Fixup phis. + for (auto &I : *InLoopSucc) { + auto *PN = dyn_cast<PHINode>(&I); + if (!PN) + break; + auto *Inc = PN->getIncomingValueForBlock(BB); + PN->addIncoming(Inc, CheckBlock); + } + OutOfLoopSucc->replacePhiUsesWith(BB, CheckBlock); + + SmallVector<DominatorTree::UpdateType, 4> DTUpdates = { + { DominatorTree::Insert, BB, CheckBlock }, + { DominatorTree::Insert, CheckBlock, InLoopSucc }, + { DominatorTree::Insert, CheckBlock, OutOfLoopSucc }, + { DominatorTree::Delete, BB, OutOfLoopSucc } + }; + + DT.applyUpdates(DTUpdates); + if (MSSAU) + MSSAU->applyUpdates(DTUpdates, DT); + L.addBasicBlockToLoop(CheckBlock, LI); + +#ifndef NDEBUG + DT.verify(); + LI.verify(DT); + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); +#endif + + // TODO: In fact, cost of unswitching a new invariant candidate is *slightly* + // higher because we have just inserted a new block. Need to think how to + // adjust the cost of injected candidates when it was first computed. + LLVM_DEBUG(dbgs() << "Injected a new loop-invariant branch " << *InvariantBr + << " and considering it for unswitching."); + ++NumInvariantConditionsInjected; + return NonTrivialUnswitchCandidate(InvariantBr, { InjectedCond }, + Candidate.Cost); +} + +/// Given chain of loop branch conditions looking like: +/// br (Variant < Invariant1) +/// br (Variant < Invariant2) +/// br (Variant < Invariant3) +/// ... +/// collect set of invariant conditions on which we want to unswitch, which +/// look like: +/// Invariant1 <= Invariant2 +/// Invariant2 <= Invariant3 +/// ... +/// Though they might not immediately exist in the IR, we can still inject them. +static bool insertCandidatesWithPendingInjections( + SmallVectorImpl<NonTrivialUnswitchCandidate> &UnswitchCandidates, Loop &L, + ICmpInst::Predicate Pred, ArrayRef<CompareDesc> Compares, + const DominatorTree &DT) { + + assert(ICmpInst::isRelational(Pred)); + assert(ICmpInst::isStrictPredicate(Pred)); + if (Compares.size() < 2) + return false; + ICmpInst::Predicate NonStrictPred = ICmpInst::getNonStrictPredicate(Pred); + for (auto Prev = Compares.begin(), Next = Compares.begin() + 1; + Next != Compares.end(); ++Prev, ++Next) { + Value *LHS = Next->Invariant; + Value *RHS = Prev->Invariant; + BasicBlock *InLoopSucc = Prev->InLoopSucc; + InjectedInvariant ToInject(NonStrictPred, LHS, RHS, InLoopSucc); + NonTrivialUnswitchCandidate Candidate(Prev->Term, { LHS, RHS }, + std::nullopt, std::move(ToInject)); + UnswitchCandidates.push_back(std::move(Candidate)); + } + return true; +} + +/// Collect unswitch candidates by invariant conditions that are not immediately +/// present in the loop. However, they can be injected into the code if we +/// decide it's profitable. +/// An example of such conditions is following: +/// +/// for (...) { +/// x = load ... +/// if (! x <u C1) break; +/// if (! x <u C2) break; +/// <do something> +/// } +/// +/// We can unswitch by condition "C1 <=u C2". If that is true, then "x <u C1 <= +/// C2" automatically implies "x <u C2", so we can get rid of one of +/// loop-variant checks in unswitched loop version. +static bool collectUnswitchCandidatesWithInjections( + SmallVectorImpl<NonTrivialUnswitchCandidate> &UnswitchCandidates, + IVConditionInfo &PartialIVInfo, Instruction *&PartialIVCondBranch, Loop &L, + const DominatorTree &DT, const LoopInfo &LI, AAResults &AA, + const MemorySSAUpdater *MSSAU) { + if (!InjectInvariantConditions) + return false; + + if (!DT.isReachableFromEntry(L.getHeader())) + return false; + auto *Latch = L.getLoopLatch(); + // Need to have a single latch and a preheader. + if (!Latch) + return false; + assert(L.getLoopPreheader() && "Must have a preheader!"); + + DenseMap<Value *, SmallVector<CompareDesc, 4> > CandidatesULT; + // Traverse the conditions that dominate latch (and therefore dominate each + // other). + for (auto *DTN = DT.getNode(Latch); L.contains(DTN->getBlock()); + DTN = DTN->getIDom()) { + ICmpInst::Predicate Pred; + Value *LHS = nullptr, *RHS = nullptr; + BasicBlock *IfTrue = nullptr, *IfFalse = nullptr; + auto *BB = DTN->getBlock(); + // Ignore inner loops. + if (LI.getLoopFor(BB) != &L) + continue; + auto *Term = BB->getTerminator(); + if (!match(Term, m_Br(m_ICmp(Pred, m_Value(LHS), m_Value(RHS)), + m_BasicBlock(IfTrue), m_BasicBlock(IfFalse)))) + continue; + if (!LHS->getType()->isIntegerTy()) + continue; + canonicalizeForInvariantConditionInjection(Pred, LHS, RHS, IfTrue, IfFalse, + L); + if (!shouldTryInjectInvariantCondition(Pred, LHS, RHS, IfTrue, IfFalse, L)) + continue; + if (!shouldTryInjectBasingOnMetadata(cast<BranchInst>(Term), IfTrue)) + continue; + // Strip ZEXT for unsigned predicate. + // TODO: once signed predicates are supported, also strip SEXT. + CompareDesc Desc(cast<BranchInst>(Term), RHS, IfTrue); + while (auto *Zext = dyn_cast<ZExtInst>(LHS)) + LHS = Zext->getOperand(0); + CandidatesULT[LHS].push_back(Desc); + } + + bool Found = false; + for (auto &It : CandidatesULT) + Found |= insertCandidatesWithPendingInjections( + UnswitchCandidates, L, ICmpInst::ICMP_ULT, It.second, DT); + return Found; +} + static bool isSafeForNoNTrivialUnswitching(Loop &L, LoopInfo &LI) { if (!L.isSafeToClone()) return false; @@ -2943,6 +3318,10 @@ static NonTrivialUnswitchCandidate findBestNonTrivialUnswitchCandidate( // cost for that terminator. auto ComputeUnswitchedCost = [&](Instruction &TI, bool FullUnswitch) -> InstructionCost { + // Unswitching selects unswitches the entire loop. + if (isa<SelectInst>(TI)) + return LoopCost; + BasicBlock &BB = *TI.getParent(); SmallPtrSet<BasicBlock *, 4> Visited; @@ -3003,10 +3382,11 @@ static NonTrivialUnswitchCandidate findBestNonTrivialUnswitchCandidate( Instruction &TI = *Candidate.TI; ArrayRef<Value *> Invariants = Candidate.Invariants; BranchInst *BI = dyn_cast<BranchInst>(&TI); - InstructionCost CandidateCost = ComputeUnswitchedCost( - TI, /*FullUnswitch*/ !BI || - (Invariants.size() == 1 && - Invariants[0] == skipTrivialSelect(BI->getCondition()))); + bool FullUnswitch = + !BI || Candidate.hasPendingInjection() || + (Invariants.size() == 1 && + Invariants[0] == skipTrivialSelect(BI->getCondition())); + InstructionCost CandidateCost = ComputeUnswitchedCost(TI, FullUnswitch); // Calculate cost multiplier which is a tool to limit potentially // exponential behavior of loop-unswitch. if (EnableUnswitchCostMultiplier) { @@ -3033,6 +3413,32 @@ static NonTrivialUnswitchCandidate findBestNonTrivialUnswitchCandidate( return *Best; } +// Insert a freeze on an unswitched branch if all is true: +// 1. freeze-loop-unswitch-cond option is true +// 2. The branch may not execute in the loop pre-transformation. If a branch may +// not execute and could cause UB, it would always cause UB if it is hoisted outside +// of the loop. Insert a freeze to prevent this case. +// 3. The branch condition may be poison or undef +static bool shouldInsertFreeze(Loop &L, Instruction &TI, DominatorTree &DT, + AssumptionCache &AC) { + assert(isa<BranchInst>(TI) || isa<SwitchInst>(TI)); + if (!FreezeLoopUnswitchCond) + return false; + + ICFLoopSafetyInfo SafetyInfo; + SafetyInfo.computeLoopSafetyInfo(&L); + if (SafetyInfo.isGuaranteedToExecute(TI, &DT, &L)) + return false; + + Value *Cond; + if (BranchInst *BI = dyn_cast<BranchInst>(&TI)) + Cond = skipTrivialSelect(BI->getCondition()); + else + Cond = skipTrivialSelect(cast<SwitchInst>(&TI)->getCondition()); + return !isGuaranteedNotToBeUndefOrPoison( + Cond, &AC, L.getLoopPreheader()->getTerminator(), &DT); +} + static bool unswitchBestCondition( Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC, AAResults &AA, TargetTransformInfo &TTI, @@ -3044,9 +3450,13 @@ static bool unswitchBestCondition( SmallVector<NonTrivialUnswitchCandidate, 4> UnswitchCandidates; IVConditionInfo PartialIVInfo; Instruction *PartialIVCondBranch = nullptr; + collectUnswitchCandidates(UnswitchCandidates, PartialIVInfo, + PartialIVCondBranch, L, LI, AA, MSSAU); + collectUnswitchCandidatesWithInjections(UnswitchCandidates, PartialIVInfo, + PartialIVCondBranch, L, DT, LI, AA, + MSSAU); // If we didn't find any candidates, we're done. - if (!collectUnswitchCandidates(UnswitchCandidates, PartialIVInfo, - PartialIVCondBranch, L, LI, AA, MSSAU)) + if (UnswitchCandidates.empty()) return false; LLVM_DEBUG( @@ -3065,18 +3475,36 @@ static bool unswitchBestCondition( return false; } + if (Best.hasPendingInjection()) + Best = injectPendingInvariantConditions(Best, L, DT, LI, AC, MSSAU); + assert(!Best.hasPendingInjection() && + "All injections should have been done by now!"); + if (Best.TI != PartialIVCondBranch) PartialIVInfo.InstToDuplicate.clear(); - // If the best candidate is a guard, turn it into a branch. - if (isGuard(Best.TI)) - Best.TI = - turnGuardIntoBranch(cast<IntrinsicInst>(Best.TI), L, DT, LI, MSSAU); + bool InsertFreeze; + if (auto *SI = dyn_cast<SelectInst>(Best.TI)) { + // If the best candidate is a select, turn it into a branch. Select + // instructions with a poison conditional do not propagate poison, but + // branching on poison causes UB. Insert a freeze on the select + // conditional to prevent UB after turning the select into a branch. + InsertFreeze = !isGuaranteedNotToBeUndefOrPoison( + SI->getCondition(), &AC, L.getLoopPreheader()->getTerminator(), &DT); + Best.TI = turnSelectIntoBranch(SI, DT, LI, MSSAU, &AC); + } else { + // If the best candidate is a guard, turn it into a branch. + if (isGuard(Best.TI)) + Best.TI = + turnGuardIntoBranch(cast<IntrinsicInst>(Best.TI), L, DT, LI, MSSAU); + InsertFreeze = shouldInsertFreeze(L, *Best.TI, DT, AC); + } LLVM_DEBUG(dbgs() << " Unswitching non-trivial (cost = " << Best.Cost << ") terminator: " << *Best.TI << "\n"); unswitchNontrivialInvariants(L, *Best.TI, Best.Invariants, PartialIVInfo, DT, - LI, AC, UnswitchCB, SE, MSSAU, DestroyLoopCB); + LI, AC, UnswitchCB, SE, MSSAU, DestroyLoopCB, + InsertFreeze); return true; } @@ -3124,6 +3552,8 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC, return true; } + const Function *F = L.getHeader()->getParent(); + // Check whether we should continue with non-trivial conditions. // EnableNonTrivialUnswitch: Global variable that forces non-trivial // unswitching for testing and debugging. @@ -3136,18 +3566,41 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC, // branches even on targets that have divergence. // https://bugs.llvm.org/show_bug.cgi?id=48819 bool ContinueWithNonTrivial = - EnableNonTrivialUnswitch || (NonTrivial && !TTI.hasBranchDivergence()); + EnableNonTrivialUnswitch || (NonTrivial && !TTI.hasBranchDivergence(F)); if (!ContinueWithNonTrivial) return false; // Skip non-trivial unswitching for optsize functions. - if (L.getHeader()->getParent()->hasOptSize()) + if (F->hasOptSize()) return false; - // Skip cold loops, as unswitching them brings little benefit - // but increases the code size - if (PSI && PSI->hasProfileSummary() && BFI && - PSI->isFunctionColdInCallGraph(L.getHeader()->getParent(), *BFI)) { + // Returns true if Loop L's loop nest is cold, i.e. if the headers of L, + // of the loops L is nested in, and of the loops nested in L are all cold. + auto IsLoopNestCold = [&](const Loop *L) { + // Check L and all of its parent loops. + auto *Parent = L; + while (Parent) { + if (!PSI->isColdBlock(Parent->getHeader(), BFI)) + return false; + Parent = Parent->getParentLoop(); + } + // Next check all loops nested within L. + SmallVector<const Loop *, 4> Worklist; + Worklist.insert(Worklist.end(), L->getSubLoops().begin(), + L->getSubLoops().end()); + while (!Worklist.empty()) { + auto *CurLoop = Worklist.pop_back_val(); + if (!PSI->isColdBlock(CurLoop->getHeader(), BFI)) + return false; + Worklist.insert(Worklist.end(), CurLoop->getSubLoops().begin(), + CurLoop->getSubLoops().end()); + } + return true; + }; + + // Skip cold loops in cold loop nests, as unswitching them brings little + // benefit but increases the code size + if (PSI && PSI->hasProfileSummary() && BFI && IsLoopNestCold(&L)) { LLVM_DEBUG(dbgs() << " Skip cold loop: " << L << "\n"); return false; } @@ -3249,10 +3702,10 @@ void SimpleLoopUnswitchPass::printPipeline( static_cast<PassInfoMixin<SimpleLoopUnswitchPass> *>(this)->printPipeline( OS, MapClassName2PassName); - OS << "<"; + OS << '<'; OS << (NonTrivial ? "" : "no-") << "nontrivial;"; OS << (Trivial ? "" : "no-") << "trivial"; - OS << ">"; + OS << '>'; } namespace { |