diff options
Diffstat (limited to 'llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp')
-rw-r--r-- | llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp | 319 |
1 files changed, 197 insertions, 122 deletions
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index 0535608244cc..7e08120f923d 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/GuardUtils.h" @@ -26,6 +27,7 @@ #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/MustExecute.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -61,6 +63,7 @@ #include <cassert> #include <iterator> #include <numeric> +#include <optional> #include <utility> #define DEBUG_TYPE "simple-loop-unswitch" @@ -115,6 +118,18 @@ static cl::opt<bool> FreezeLoopUnswitchCond( cl::desc("If enabled, the freeze instruction will be added to condition " "of loop unswitch to prevent miscompilation.")); +namespace { +struct NonTrivialUnswitchCandidate { + Instruction *TI = nullptr; + TinyPtrVector<Value *> Invariants; + std::optional<InstructionCost> Cost; + NonTrivialUnswitchCandidate( + Instruction *TI, ArrayRef<Value *> Invariants, + std::optional<InstructionCost> Cost = std::nullopt) + : TI(TI), Invariants(Invariants), Cost(Cost){}; +}; +} // end anonymous namespace. + // Helper to skip (select x, true, false), which matches both a logical AND and // OR and can confuse code that tries to determine if \p Cond is either a // logical AND or OR but not both. @@ -133,8 +148,8 @@ static Value *skipTrivialSelect(Value *Cond) { /// inputs which are loop invariant. For some operations these can be /// re-associated and unswitched out of the loop entirely. static TinyPtrVector<Value *> -collectHomogenousInstGraphLoopInvariants(Loop &L, Instruction &Root, - LoopInfo &LI) { +collectHomogenousInstGraphLoopInvariants(const Loop &L, Instruction &Root, + const LoopInfo &LI) { assert(!L.isLoopInvariant(&Root) && "Only need to walk the graph if root itself is not invariant."); TinyPtrVector<Value *> Invariants; @@ -175,7 +190,7 @@ collectHomogenousInstGraphLoopInvariants(Loop &L, Instruction &Root, return Invariants; } -static void replaceLoopInvariantUses(Loop &L, Value *Invariant, +static void replaceLoopInvariantUses(const Loop &L, Value *Invariant, Constant &Replacement) { assert(!isa<Constant>(Invariant) && "Why are we unswitching on a constant?"); @@ -192,9 +207,10 @@ static void replaceLoopInvariantUses(Loop &L, Value *Invariant, /// Check that all the LCSSA PHI nodes in the loop exit block have trivial /// incoming values along this edge. -static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB, - BasicBlock &ExitBB) { - for (Instruction &I : ExitBB) { +static bool areLoopExitPHIsLoopInvariant(const Loop &L, + const BasicBlock &ExitingBB, + const BasicBlock &ExitBB) { + for (const Instruction &I : ExitBB) { auto *PN = dyn_cast<PHINode>(&I); if (!PN) // No more PHIs to check. @@ -214,7 +230,7 @@ static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB, static void buildPartialUnswitchConditionalBranch( BasicBlock &BB, ArrayRef<Value *> Invariants, bool Direction, BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, bool InsertFreeze, - Instruction *I, AssumptionCache *AC, DominatorTree &DT) { + const Instruction *I, AssumptionCache *AC, const DominatorTree &DT) { IRBuilder<> IRB(&BB); SmallVector<Value *> FrozenInvariants; @@ -239,7 +255,7 @@ static void buildPartialInvariantUnswitchConditionalBranch( for (auto *Val : reverse(ToDuplicate)) { Instruction *Inst = cast<Instruction>(Val); Instruction *NewInst = Inst->clone(); - BB.getInstList().insert(BB.end(), NewInst); + NewInst->insertInto(&BB, BB.end()); RemapInstruction(NewInst, VMap, RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); VMap[Val] = NewInst; @@ -418,9 +434,10 @@ static void hoistLoopToNewParent(Loop &L, BasicBlock &Preheader, // Return the top-most loop containing ExitBB and having ExitBB as exiting block // or the loop containing ExitBB, if there is no parent loop containing ExitBB // as exiting block. -static Loop *getTopMostExitingLoop(BasicBlock *ExitBB, LoopInfo &LI) { - Loop *TopMost = LI.getLoopFor(ExitBB); - Loop *Current = TopMost; +static const Loop *getTopMostExitingLoop(const BasicBlock *ExitBB, + const LoopInfo &LI) { + const Loop *TopMost = LI.getLoopFor(ExitBB); + const Loop *Current = TopMost; while (Current) { if (Current->isLoopExiting(ExitBB)) TopMost = Current; @@ -521,11 +538,12 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, // loop, the loop containing the exit block and the topmost parent loop // exiting via LoopExitBB. if (SE) { - if (Loop *ExitL = getTopMostExitingLoop(LoopExitBB, LI)) + if (const Loop *ExitL = getTopMostExitingLoop(LoopExitBB, LI)) SE->forgetLoop(ExitL); else // Forget the entire nest as this exits the entire nest. SE->forgetTopmostLoop(&L); + SE->forgetBlockAndLoopDispositions(); } if (MSSAU && VerifyMemorySSA) @@ -562,13 +580,12 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, // If fully unswitching, we can use the existing branch instruction. // Splice it into the old PH to gate reaching the new preheader and re-point // its successors. - OldPH->getInstList().splice(OldPH->end(), BI.getParent()->getInstList(), - BI); + OldPH->splice(OldPH->end(), BI.getParent(), BI.getIterator()); BI.setCondition(Cond); if (MSSAU) { // Temporarily clone the terminator, to make MSSA update cheaper by // separating "insert edge" updates from "remove edge" ones. - ParentBB->getInstList().push_back(BI.clone()); + BI.clone()->insertInto(ParentBB, ParentBB->end()); } else { // Create a new unconditional branch that will continue the loop as a new // terminator. @@ -1098,7 +1115,8 @@ static BasicBlock *buildClonedLoopBlocks( const SmallDenseMap<BasicBlock *, BasicBlock *, 16> &DominatingSucc, ValueToValueMapTy &VMap, SmallVectorImpl<DominatorTree::UpdateType> &DTUpdates, AssumptionCache &AC, - DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU) { + DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU, + ScalarEvolution *SE) { SmallVector<BasicBlock *, 4> NewBlocks; NewBlocks.reserve(L.getNumBlocks() + ExitBlocks.size()); @@ -1174,6 +1192,10 @@ static BasicBlock *buildClonedLoopBlocks( // We should have a value map between the instruction and its clone. assert(VMap.lookup(&I) == &ClonedI && "Mismatch in the value map!"); + // Forget SCEVs based on exit phis in case SCEV looked through the phi. + if (SE && isa<PHINode>(I)) + SE->forgetValue(&I); + auto *MergePN = PHINode::Create(I.getType(), /*NumReservedValues*/ 2, ".us-phi", &*MergeBB->getFirstInsertionPt()); @@ -1550,7 +1572,7 @@ static void buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks, // We need a stable insertion order. We use the order of the original loop // order and map into the correct parent loop. for (auto *BB : llvm::concat<BasicBlock *const>( - makeArrayRef(ClonedPH), ClonedLoopBlocks, ClonedExitsInLoops)) + ArrayRef(ClonedPH), ClonedLoopBlocks, ClonedExitsInLoops)) if (Loop *OuterL = ExitLoopMap.lookup(BB)) OuterL->addBasicBlockToLoop(BB, LI); @@ -1590,7 +1612,7 @@ deleteDeadClonedBlocks(Loop &L, ArrayRef<BasicBlock *> ExitBlocks, // Find all the dead clones, and remove them from their successors. SmallVector<BasicBlock *, 16> DeadBlocks; for (BasicBlock *BB : llvm::concat<BasicBlock *const>(L.blocks(), ExitBlocks)) - for (auto &VMap : VMaps) + for (const auto &VMap : VMaps) if (BasicBlock *ClonedBB = cast_or_null<BasicBlock>(VMap->lookup(BB))) if (!DT.isReachableFromEntry(ClonedBB)) { for (BasicBlock *SuccBB : successors(ClonedBB)) @@ -1618,6 +1640,7 @@ deleteDeadBlocksFromLoop(Loop &L, SmallVectorImpl<BasicBlock *> &ExitBlocks, DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU, + ScalarEvolution *SE, function_ref<void(Loop &, StringRef)> DestroyLoopCB) { // Find all the dead blocks tied to this loop, and remove them from their // successors. @@ -1669,6 +1692,8 @@ deleteDeadBlocksFromLoop(Loop &L, "If the child loop header is dead all blocks in the child loop must " "be dead as well!"); DestroyLoopCB(*ChildL, ChildL->getName()); + if (SE) + SE->forgetBlockAndLoopDispositions(); LI.destroy(ChildL); return true; }); @@ -1818,7 +1843,8 @@ static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L, /// referenced). static bool rebuildLoopAfterUnswitch(Loop &L, ArrayRef<BasicBlock *> ExitBlocks, LoopInfo &LI, - SmallVectorImpl<Loop *> &HoistedLoops) { + SmallVectorImpl<Loop *> &HoistedLoops, + ScalarEvolution *SE) { auto *PH = L.getLoopPreheader(); // Compute the actual parent loop from the exit blocks. Because we may have @@ -2011,6 +2037,8 @@ static bool rebuildLoopAfterUnswitch(Loop &L, ArrayRef<BasicBlock *> ExitBlocks, LI.removeLoop(llvm::find(LI, &L)); // markLoopAsDeleted for L should be triggered by the caller (it is typically // done by using the UnswitchCB callback). + if (SE) + SE->forgetBlockAndLoopDispositions(); LI.destroy(&L); return false; } @@ -2047,8 +2075,8 @@ void visitDomSubTree(DominatorTree &DT, BasicBlock *BB, CallableT Callable) { static void unswitchNontrivialInvariants( Loop &L, Instruction &TI, ArrayRef<Value *> Invariants, - SmallVectorImpl<BasicBlock *> &ExitBlocks, IVConditionInfo &PartialIVInfo, - DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC, + IVConditionInfo &PartialIVInfo, DominatorTree &DT, LoopInfo &LI, + AssumptionCache &AC, function_ref<void(bool, bool, ArrayRef<Loop *>)> UnswitchCB, ScalarEvolution *SE, MemorySSAUpdater *MSSAU, function_ref<void(Loop &, StringRef)> DestroyLoopCB) { @@ -2129,6 +2157,8 @@ static void unswitchNontrivialInvariants( // furthest up our loopnest which can be mutated, which we will use below to // update things. Loop *OuterExitL = &L; + SmallVector<BasicBlock *, 4> ExitBlocks; + L.getUniqueExitBlocks(ExitBlocks); for (auto *ExitBB : ExitBlocks) { Loop *NewOuterExitL = LI.getLoopFor(ExitBB); if (!NewOuterExitL) { @@ -2148,6 +2178,7 @@ static void unswitchNontrivialInvariants( SE->forgetLoop(OuterExitL); else SE->forgetTopmostLoop(&L); + SE->forgetBlockAndLoopDispositions(); } bool InsertFreeze = false; @@ -2157,14 +2188,26 @@ static void unswitchNontrivialInvariants( InsertFreeze = !SafetyInfo.isGuaranteedToExecute(TI, &DT, &L); } + // Perform the isGuaranteedNotToBeUndefOrPoison() query before the transform, + // otherwise the branch instruction will have been moved outside the loop + // already, and may imply that a poison condition is always UB. + Value *FullUnswitchCond = nullptr; + if (FullUnswitch) { + FullUnswitchCond = + BI ? skipTrivialSelect(BI->getCondition()) : SI->getCondition(); + if (InsertFreeze) + InsertFreeze = !isGuaranteedNotToBeUndefOrPoison( + FullUnswitchCond, &AC, L.getLoopPreheader()->getTerminator(), &DT); + } + // If the edge from this terminator to a successor dominates that successor, // store a map from each block in its dominator subtree to it. This lets us // tell when cloning for a particular successor if a block is dominated by // some *other* successor with a single data structure. We use this to // significantly reduce cloning. SmallDenseMap<BasicBlock *, BasicBlock *, 16> DominatingSucc; - for (auto *SuccBB : llvm::concat<BasicBlock *const>( - makeArrayRef(RetainedSuccBB), UnswitchedSuccBBs)) + for (auto *SuccBB : llvm::concat<BasicBlock *const>(ArrayRef(RetainedSuccBB), + UnswitchedSuccBBs)) if (SuccBB->getUniquePredecessor() || llvm::all_of(predecessors(SuccBB), [&](BasicBlock *PredBB) { return PredBB == ParentBB || DT.dominates(SuccBB, PredBB); @@ -2193,7 +2236,7 @@ static void unswitchNontrivialInvariants( VMaps.emplace_back(new ValueToValueMapTy()); ClonedPHs[SuccBB] = buildClonedLoopBlocks( L, LoopPH, SplitBB, ExitBlocks, ParentBB, SuccBB, RetainedSuccBB, - DominatingSucc, *VMaps.back(), DTUpdates, AC, DT, LI, MSSAU); + DominatingSucc, *VMaps.back(), DTUpdates, AC, DT, LI, MSSAU, SE); } // Drop metadata if we may break its semantics by moving this instr into the @@ -2220,23 +2263,21 @@ static void unswitchNontrivialInvariants( if (FullUnswitch) { // Splice the terminator from the original loop and rewrite its // successors. - SplitBB->getInstList().splice(SplitBB->end(), ParentBB->getInstList(), TI); + SplitBB->splice(SplitBB->end(), ParentBB, TI.getIterator()); // Keep a clone of the terminator for MSSA updates. Instruction *NewTI = TI.clone(); - ParentBB->getInstList().push_back(NewTI); + NewTI->insertInto(ParentBB, ParentBB->end()); // First wire up the moved terminator to the preheaders. if (BI) { BasicBlock *ClonedPH = ClonedPHs.begin()->second; BI->setSuccessor(ClonedSucc, ClonedPH); BI->setSuccessor(1 - ClonedSucc, LoopPH); - Value *Cond = skipTrivialSelect(BI->getCondition()); - if (InsertFreeze) { - if (!isGuaranteedNotToBeUndefOrPoison(Cond, &AC, BI, &DT)) - Cond = new FreezeInst(Cond, Cond->getName() + ".fr", BI); - } - BI->setCondition(Cond); + if (InsertFreeze) + FullUnswitchCond = new FreezeInst( + FullUnswitchCond, FullUnswitchCond->getName() + ".fr", BI); + BI->setCondition(FullUnswitchCond); DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH}); } else { assert(SI && "Must either be a branch or switch!"); @@ -2245,17 +2286,16 @@ static void unswitchNontrivialInvariants( assert(SI->getDefaultDest() == RetainedSuccBB && "Not retaining default successor!"); SI->setDefaultDest(LoopPH); - for (auto &Case : SI->cases()) + for (const auto &Case : SI->cases()) if (Case.getCaseSuccessor() == RetainedSuccBB) Case.setSuccessor(LoopPH); else Case.setSuccessor(ClonedPHs.find(Case.getCaseSuccessor())->second); - if (InsertFreeze) { - auto Cond = SI->getCondition(); - if (!isGuaranteedNotToBeUndefOrPoison(Cond, &AC, SI, &DT)) - SI->setCondition(new FreezeInst(Cond, Cond->getName() + ".fr", SI)); - } + if (InsertFreeze) + SI->setCondition(new FreezeInst( + FullUnswitchCond, FullUnswitchCond->getName() + ".fr", SI)); + // We need to use the set to populate domtree updates as even when there // are multiple cases pointing at the same successor we only want to // remove and insert one edge in the domtree. @@ -2306,7 +2346,7 @@ static void unswitchNontrivialInvariants( SwitchInst *NewSI = cast<SwitchInst>(NewTI); assert(NewSI->getDefaultDest() == RetainedSuccBB && "Not retaining default successor!"); - for (auto &Case : NewSI->cases()) + for (const auto &Case : NewSI->cases()) Case.getCaseSuccessor()->removePredecessor( ParentBB, /*KeepOneInputPHIs*/ true); @@ -2372,13 +2412,14 @@ static void unswitchNontrivialInvariants( // Now that our cloned loops have been built, we can update the original loop. // First we delete the dead blocks from it and then we rebuild the loop // structure taking these deletions into account. - deleteDeadBlocksFromLoop(L, ExitBlocks, DT, LI, MSSAU, DestroyLoopCB); + deleteDeadBlocksFromLoop(L, ExitBlocks, DT, LI, MSSAU, SE,DestroyLoopCB); if (MSSAU && VerifyMemorySSA) MSSAU->getMemorySSA()->verifyMemorySSA(); SmallVector<Loop *, 4> HoistedLoops; - bool IsStillLoop = rebuildLoopAfterUnswitch(L, ExitBlocks, LI, HoistedLoops); + bool IsStillLoop = + rebuildLoopAfterUnswitch(L, ExitBlocks, LI, HoistedLoops, SE); if (MSSAU && VerifyMemorySSA) MSSAU->getMemorySSA()->verifyMemorySSA(); @@ -2573,10 +2614,9 @@ static InstructionCost computeDomSubtreeCost( /// /// It also makes all relevant DT and LI updates, so that all structures are in /// valid state after this transform. -static BranchInst * -turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, - SmallVectorImpl<BasicBlock *> &ExitBlocks, - DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU) { +static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, + DominatorTree &DT, LoopInfo &LI, + MemorySSAUpdater *MSSAU) { SmallVector<DominatorTree::UpdateType, 4> DTUpdates; LLVM_DEBUG(dbgs() << "Turning " << *GI << " into a branch.\n"); BasicBlock *CheckBB = GI->getParent(); @@ -2603,9 +2643,6 @@ turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, CheckBI->getSuccessor(1)->setName("deopt"); BasicBlock *DeoptBlock = CheckBI->getSuccessor(1); - // We now have a new exit block. - ExitBlocks.push_back(CheckBI->getSuccessor(1)); - if (MSSAU) MSSAU->moveAllAfterSpliceBlocks(CheckBB, GuardedBlock, GI); @@ -2651,19 +2688,19 @@ turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, /// That requires knowing not just the number of "remaining" candidates but /// also costs of unswitching for each of these candidates. static int CalculateUnswitchCostMultiplier( - Instruction &TI, Loop &L, LoopInfo &LI, DominatorTree &DT, - ArrayRef<std::pair<Instruction *, TinyPtrVector<Value *>>> - UnswitchCandidates) { + const Instruction &TI, const Loop &L, const LoopInfo &LI, + const DominatorTree &DT, + ArrayRef<NonTrivialUnswitchCandidate> UnswitchCandidates) { // Guards and other exiting conditions do not contribute to exponential // explosion as soon as they dominate the latch (otherwise there might be // another path to the latch remaining that does not allow to eliminate the // loop copy on unswitch). - BasicBlock *Latch = L.getLoopLatch(); - BasicBlock *CondBlock = TI.getParent(); + const BasicBlock *Latch = L.getLoopLatch(); + const BasicBlock *CondBlock = TI.getParent(); if (DT.dominates(CondBlock, Latch) && (isGuard(&TI) || - llvm::count_if(successors(&TI), [&L](BasicBlock *SuccBB) { + llvm::count_if(successors(&TI), [&L](const BasicBlock *SuccBB) { return L.contains(SuccBB); }) <= 1)) { NumCostMultiplierSkipped++; @@ -2677,16 +2714,17 @@ static int CalculateUnswitchCostMultiplier( // unswitching. Branch/guard counts as 1, switch counts as log2 of its cases. int UnswitchedClones = 0; for (auto Candidate : UnswitchCandidates) { - Instruction *CI = Candidate.first; - BasicBlock *CondBlock = CI->getParent(); + const Instruction *CI = Candidate.TI; + const BasicBlock *CondBlock = CI->getParent(); bool SkipExitingSuccessors = DT.dominates(CondBlock, Latch); if (isGuard(CI)) { if (!SkipExitingSuccessors) UnswitchedClones++; continue; } - int NonExitingSuccessors = llvm::count_if( - successors(CondBlock), [SkipExitingSuccessors, &L](BasicBlock *SuccBB) { + int NonExitingSuccessors = + llvm::count_if(successors(CondBlock), + [SkipExitingSuccessors, &L](const BasicBlock *SuccBB) { return !SkipExitingSuccessors || L.contains(SuccBB); }); UnswitchedClones += Log2_32(NonExitingSuccessors); @@ -2722,17 +2760,12 @@ static int CalculateUnswitchCostMultiplier( return CostMultiplier; } -static bool unswitchBestCondition( - Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC, - AAResults &AA, TargetTransformInfo &TTI, - function_ref<void(bool, bool, ArrayRef<Loop *>)> UnswitchCB, - ScalarEvolution *SE, MemorySSAUpdater *MSSAU, - function_ref<void(Loop &, StringRef)> DestroyLoopCB) { - // Collect all invariant conditions within this loop (as opposed to an inner - // loop which would be handled when visiting that inner loop). - SmallVector<std::pair<Instruction *, TinyPtrVector<Value *>>, 4> - UnswitchCandidates; - +static bool collectUnswitchCandidates( + SmallVectorImpl<NonTrivialUnswitchCandidate> &UnswitchCandidates, + IVConditionInfo &PartialIVInfo, Instruction *&PartialIVCondBranch, + const Loop &L, const LoopInfo &LI, AAResults &AA, + const MemorySSAUpdater *MSSAU) { + assert(UnswitchCandidates.empty() && "Should be!"); // Whether or not we should also collect guards in the loop. bool CollectGuards = false; if (UnswitchGuards) { @@ -2742,7 +2775,6 @@ static bool unswitchBestCondition( CollectGuards = true; } - IVConditionInfo PartialIVInfo; for (auto *BB : L.blocks()) { if (LI.getLoopFor(BB) != &L) continue; @@ -2750,7 +2782,8 @@ static bool unswitchBestCondition( if (CollectGuards) for (auto &I : *BB) if (isGuard(&I)) { - auto *Cond = cast<IntrinsicInst>(&I)->getArgOperand(0); + auto *Cond = + skipTrivialSelect(cast<IntrinsicInst>(&I)->getArgOperand(0)); // TODO: Support AND, OR conditions and partial unswitching. if (!isa<Constant>(Cond) && L.isLoopInvariant(Cond)) UnswitchCandidates.push_back({&I, {Cond}}); @@ -2791,11 +2824,10 @@ static bool unswitchBestCondition( } } - Instruction *PartialIVCondBranch = nullptr; if (MSSAU && !findOptionMDForLoop(&L, "llvm.loop.unswitch.partial.disable") && !any_of(UnswitchCandidates, [&L](auto &TerminatorAndInvariants) { - return TerminatorAndInvariants.first == L.getHeader()->getTerminator(); - })) { + return TerminatorAndInvariants.TI == L.getHeader()->getTerminator(); + })) { MemorySSA *MSSA = MSSAU->getMemorySSA(); if (auto Info = hasPartialIVCondition(L, MSSAThreshold, *MSSA, AA)) { LLVM_DEBUG( @@ -2809,10 +2841,22 @@ static bool unswitchBestCondition( {L.getHeader()->getTerminator(), std::move(ValsToDuplicate)}); } } + return !UnswitchCandidates.empty(); +} - // If we didn't find any candidates, we're done. - if (UnswitchCandidates.empty()) +static bool isSafeForNoNTrivialUnswitching(Loop &L, LoopInfo &LI) { + if (!L.isSafeToClone()) return false; + for (auto *BB : L.blocks()) + for (auto &I : *BB) { + if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB)) + return false; + if (auto *CB = dyn_cast<CallBase>(&I)) { + assert(!CB->cannotDuplicate() && "Checked by L.isSafeToClone()."); + if (CB->isConvergent()) + return false; + } + } // Check if there are irreducible CFG cycles in this loop. If so, we cannot // easily unswitch non-trivial edges out of the loop. Doing so might turn the @@ -2827,7 +2871,6 @@ static bool unswitchBestCondition( SmallVector<BasicBlock *, 4> ExitBlocks; L.getUniqueExitBlocks(ExitBlocks); - // We cannot unswitch if exit blocks contain a cleanuppad/catchswitch // instruction as we don't know how to split those exit blocks. // FIXME: We should teach SplitBlock to handle this and remove this @@ -2841,10 +2884,13 @@ static bool unswitchBestCondition( } } - LLVM_DEBUG( - dbgs() << "Considering " << UnswitchCandidates.size() - << " non-trivial loop invariant conditions for unswitching.\n"); + return true; +} +static NonTrivialUnswitchCandidate findBestNonTrivialUnswitchCandidate( + ArrayRef<NonTrivialUnswitchCandidate> UnswitchCandidates, const Loop &L, + const DominatorTree &DT, const LoopInfo &LI, AssumptionCache &AC, + const TargetTransformInfo &TTI, const IVConditionInfo &PartialIVInfo) { // Given that unswitching these terminators will require duplicating parts of // the loop, so we need to be able to model that cost. Compute the ephemeral // values and set up a data structure to hold per-BB costs. We cache each @@ -2869,14 +2915,7 @@ static bool unswitchBestCondition( for (auto &I : *BB) { if (EphValues.count(&I)) continue; - - if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB)) - return false; - if (auto *CB = dyn_cast<CallBase>(&I)) - if (CB->isConvergent() || CB->cannotDuplicate()) - return false; - - Cost += TTI.getUserCost(&I, CostKind); + Cost += TTI.getInstructionCost(&I, CostKind); } assert(Cost >= 0 && "Must not have negative costs!"); LoopCost += Cost; @@ -2958,12 +2997,11 @@ static bool unswitchBestCondition( "Cannot unswitch a condition without multiple distinct successors!"); return (LoopCost - Cost) * (SuccessorsCount - 1); }; - Instruction *BestUnswitchTI = nullptr; - InstructionCost BestUnswitchCost = 0; - ArrayRef<Value *> BestUnswitchInvariants; - for (auto &TerminatorAndInvariants : UnswitchCandidates) { - Instruction &TI = *TerminatorAndInvariants.first; - ArrayRef<Value *> Invariants = TerminatorAndInvariants.second; + + std::optional<NonTrivialUnswitchCandidate> Best; + for (auto &Candidate : UnswitchCandidates) { + Instruction &TI = *Candidate.TI; + ArrayRef<Value *> Invariants = Candidate.Invariants; BranchInst *BI = dyn_cast<BranchInst>(&TI); InstructionCost CandidateCost = ComputeUnswitchedCost( TI, /*FullUnswitch*/ !BI || @@ -2986,34 +3024,59 @@ static bool unswitchBestCondition( << " for unswitch candidate: " << TI << "\n"); } - if (!BestUnswitchTI || CandidateCost < BestUnswitchCost) { - BestUnswitchTI = &TI; - BestUnswitchCost = CandidateCost; - BestUnswitchInvariants = Invariants; + if (!Best || CandidateCost < Best->Cost) { + Best = Candidate; + Best->Cost = CandidateCost; } } - assert(BestUnswitchTI && "Failed to find loop unswitch candidate"); + assert(Best && "Must be!"); + return *Best; +} - if (BestUnswitchCost >= UnswitchThreshold) { - LLVM_DEBUG(dbgs() << "Cannot unswitch, lowest cost found: " - << BestUnswitchCost << "\n"); +static bool unswitchBestCondition( + Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC, + AAResults &AA, TargetTransformInfo &TTI, + function_ref<void(bool, bool, ArrayRef<Loop *>)> UnswitchCB, + ScalarEvolution *SE, MemorySSAUpdater *MSSAU, + function_ref<void(Loop &, StringRef)> DestroyLoopCB) { + // Collect all invariant conditions within this loop (as opposed to an inner + // loop which would be handled when visiting that inner loop). + SmallVector<NonTrivialUnswitchCandidate, 4> UnswitchCandidates; + IVConditionInfo PartialIVInfo; + Instruction *PartialIVCondBranch = nullptr; + // If we didn't find any candidates, we're done. + if (!collectUnswitchCandidates(UnswitchCandidates, PartialIVInfo, + PartialIVCondBranch, L, LI, AA, MSSAU)) + return false; + + LLVM_DEBUG( + dbgs() << "Considering " << UnswitchCandidates.size() + << " non-trivial loop invariant conditions for unswitching.\n"); + + NonTrivialUnswitchCandidate Best = findBestNonTrivialUnswitchCandidate( + UnswitchCandidates, L, DT, LI, AC, TTI, PartialIVInfo); + + assert(Best.TI && "Failed to find loop unswitch candidate"); + assert(Best.Cost && "Failed to compute cost"); + + if (*Best.Cost >= UnswitchThreshold) { + LLVM_DEBUG(dbgs() << "Cannot unswitch, lowest cost found: " << *Best.Cost + << "\n"); return false; } - if (BestUnswitchTI != PartialIVCondBranch) + if (Best.TI != PartialIVCondBranch) PartialIVInfo.InstToDuplicate.clear(); // If the best candidate is a guard, turn it into a branch. - if (isGuard(BestUnswitchTI)) - BestUnswitchTI = turnGuardIntoBranch(cast<IntrinsicInst>(BestUnswitchTI), L, - ExitBlocks, DT, LI, MSSAU); - - LLVM_DEBUG(dbgs() << " Unswitching non-trivial (cost = " - << BestUnswitchCost << ") terminator: " << *BestUnswitchTI - << "\n"); - unswitchNontrivialInvariants(L, *BestUnswitchTI, BestUnswitchInvariants, - ExitBlocks, PartialIVInfo, DT, LI, AC, - UnswitchCB, SE, MSSAU, DestroyLoopCB); + if (isGuard(Best.TI)) + Best.TI = + turnGuardIntoBranch(cast<IntrinsicInst>(Best.TI), L, DT, LI, MSSAU); + + LLVM_DEBUG(dbgs() << " Unswitching non-trivial (cost = " << Best.Cost + << ") terminator: " << *Best.TI << "\n"); + unswitchNontrivialInvariants(L, *Best.TI, Best.Invariants, PartialIVInfo, DT, + LI, AC, UnswitchCB, SE, MSSAU, DestroyLoopCB); return true; } @@ -3044,6 +3107,7 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC, bool NonTrivial, function_ref<void(bool, bool, ArrayRef<Loop *>)> UnswitchCB, ScalarEvolution *SE, MemorySSAUpdater *MSSAU, + ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, function_ref<void(Loop &, StringRef)> DestroyLoopCB) { assert(L.isRecursivelyLCSSAForm(DT, LI) && "Loops must be in LCSSA form before unswitching."); @@ -3080,8 +3144,16 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC, if (L.getHeader()->getParent()->hasOptSize()) return false; - // Skip non-trivial unswitching for loops that cannot be cloned. - if (!L.isSafeToClone()) + // Skip cold loops, as unswitching them brings little benefit + // but increases the code size + if (PSI && PSI->hasProfileSummary() && BFI && + PSI->isFunctionColdInCallGraph(L.getHeader()->getParent(), *BFI)) { + LLVM_DEBUG(dbgs() << " Skip cold loop: " << L << "\n"); + return false; + } + + // Perform legality checks. + if (!isSafeForNoNTrivialUnswitching(L, LI)) return false; // For non-trivial unswitching, because it often creates new loops, we rely on @@ -3105,7 +3177,11 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM, LPMUpdater &U) { Function &F = *L.getHeader()->getParent(); (void)F; - + ProfileSummaryInfo *PSI = nullptr; + if (auto OuterProxy = + AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR) + .getCachedResult<ModuleAnalysisManagerFunctionProxy>(F)) + PSI = OuterProxy->getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); LLVM_DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << L << "\n"); @@ -3144,14 +3220,14 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM, U.markLoopAsDeleted(L, Name); }; - Optional<MemorySSAUpdater> MSSAU; + std::optional<MemorySSAUpdater> MSSAU; if (AR.MSSA) { MSSAU = MemorySSAUpdater(AR.MSSA); if (VerifyMemorySSA) AR.MSSA->verifyMemorySSA(); } if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC, AR.AA, AR.TTI, Trivial, NonTrivial, - UnswitchCB, &AR.SE, MSSAU ? MSSAU.getPointer() : nullptr, + UnswitchCB, &AR.SE, MSSAU ? &*MSSAU : nullptr, PSI, AR.BFI, DestroyLoopCB)) return PreservedAnalyses::all(); @@ -3214,7 +3290,6 @@ bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) { LLVM_DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << *L << "\n"); - auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); @@ -3251,9 +3326,9 @@ bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) { if (VerifyMemorySSA) MSSA->verifyMemorySSA(); - - bool Changed = unswitchLoop(*L, DT, LI, AC, AA, TTI, true, NonTrivial, - UnswitchCB, SE, &MSSAU, DestroyLoopCB); + bool Changed = + unswitchLoop(*L, DT, LI, AC, AA, TTI, true, NonTrivial, UnswitchCB, SE, + &MSSAU, nullptr, nullptr, DestroyLoopCB); if (VerifyMemorySSA) MSSA->verifyMemorySSA(); |