diff options
Diffstat (limited to 'lib/Transforms/Scalar/SimpleLoopUnswitch.cpp')
-rw-r--r-- | lib/Transforms/Scalar/SimpleLoopUnswitch.cpp | 1517 |
1 files changed, 945 insertions, 572 deletions
diff --git a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index 3d0fca0bc3a5..34510cb40732 100644 --- a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -1,4 +1,4 @@ -//===- SimpleLoopUnswitch.cpp - Hoist loop-invariant control flow ---------===// +///===- SimpleLoopUnswitch.cpp - Hoist loop-invariant control flow ---------===// // // The LLVM Compiler Infrastructure // @@ -17,10 +17,14 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/CFG.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/Utils/Local.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" @@ -66,180 +70,65 @@ static cl::opt<int> UnswitchThreshold("unswitch-threshold", cl::init(50), cl::Hidden, cl::desc("The cost threshold for unswitching a loop.")); -static void replaceLoopUsesWithConstant(Loop &L, Value &LIC, - Constant &Replacement) { - assert(!isa<Constant>(LIC) && "Why are we unswitching on a constant?"); - - // Replace uses of LIC in the loop with the given constant. - for (auto UI = LIC.use_begin(), UE = LIC.use_end(); UI != UE;) { - // Grab the use and walk past it so we can clobber it in the use list. - Use *U = &*UI++; - Instruction *UserI = dyn_cast<Instruction>(U->getUser()); - if (!UserI || !L.contains(UserI)) - continue; - - // Replace this use within the loop body. - *U = &Replacement; - } -} - -/// Update the IDom for a basic block whose predecessor set has changed. -/// -/// This routine is designed to work when the domtree update is relatively -/// localized by leveraging a known common dominator, often a loop header. -/// -/// FIXME: Should consider hand-rolling a slightly more efficient non-DFS -/// approach here as we can do that easily by persisting the candidate IDom's -/// dominating set between each predecessor. +/// Collect all of the loop invariant input values transitively used by the +/// homogeneous instruction graph from a given root. /// -/// FIXME: Longer term, many uses of this can be replaced by an incremental -/// domtree update strategy that starts from a known dominating block and -/// rebuilds that subtree. -static bool updateIDomWithKnownCommonDominator(BasicBlock *BB, - BasicBlock *KnownDominatingBB, - DominatorTree &DT) { - assert(pred_begin(BB) != pred_end(BB) && - "This routine does not handle unreachable blocks!"); - - BasicBlock *OrigIDom = DT[BB]->getIDom()->getBlock(); - - BasicBlock *IDom = *pred_begin(BB); - assert(DT.dominates(KnownDominatingBB, IDom) && - "Bad known dominating block!"); - - // Walk all of the other predecessors finding the nearest common dominator - // until all predecessors are covered or we reach the loop header. The loop - // header necessarily dominates all loop exit blocks in loop simplified form - // so we can early-exit the moment we hit that block. - for (auto PI = std::next(pred_begin(BB)), PE = pred_end(BB); - PI != PE && IDom != KnownDominatingBB; ++PI) { - assert(DT.dominates(KnownDominatingBB, *PI) && - "Bad known dominating block!"); - IDom = DT.findNearestCommonDominator(IDom, *PI); - } +/// This essentially walks from a root recursively through loop variant operands +/// which have the exact same opcode and finds all inputs which are loop +/// invariant. For some operations these can be re-associated and unswitched out +/// of the loop entirely. +static TinyPtrVector<Value *> +collectHomogenousInstGraphLoopInvariants(Loop &L, Instruction &Root, + LoopInfo &LI) { + assert(!L.isLoopInvariant(&Root) && + "Only need to walk the graph if root itself is not invariant."); + TinyPtrVector<Value *> Invariants; + + // Build a worklist and recurse through operators collecting invariants. + SmallVector<Instruction *, 4> Worklist; + SmallPtrSet<Instruction *, 8> Visited; + Worklist.push_back(&Root); + Visited.insert(&Root); + do { + Instruction &I = *Worklist.pop_back_val(); + for (Value *OpV : I.operand_values()) { + // Skip constants as unswitching isn't interesting for them. + if (isa<Constant>(OpV)) + continue; - if (IDom == OrigIDom) - return false; + // Add it to our result if loop invariant. + if (L.isLoopInvariant(OpV)) { + Invariants.push_back(OpV); + continue; + } - DT.changeImmediateDominator(BB, IDom); - return true; -} + // If not an instruction with the same opcode, nothing we can do. + Instruction *OpI = dyn_cast<Instruction>(OpV); + if (!OpI || OpI->getOpcode() != Root.getOpcode()) + continue; -// Note that we don't currently use the IDFCalculator here for two reasons: -// 1) It computes dominator tree levels for the entire function on each run -// of 'compute'. While this isn't terrible, given that we expect to update -// relatively small subtrees of the domtree, it isn't necessarily the right -// tradeoff. -// 2) The interface doesn't fit this usage well. It doesn't operate in -// append-only, and builds several sets that we don't need. -// -// FIXME: Neither of these issues are a big deal and could be addressed with -// some amount of refactoring of IDFCalculator. That would allow us to share -// the core logic here (which is solving the same core problem). -static void appendDomFrontier(DomTreeNode *Node, - SmallSetVector<BasicBlock *, 4> &Worklist, - SmallVectorImpl<DomTreeNode *> &DomNodes, - SmallPtrSetImpl<BasicBlock *> &DomSet) { - assert(DomNodes.empty() && "Must start with no dominator nodes."); - assert(DomSet.empty() && "Must start with an empty dominator set."); - - // First flatten this subtree into sequence of nodes by doing a pre-order - // walk. - DomNodes.push_back(Node); - // We intentionally re-evaluate the size as each node can add new children. - // Because this is a tree walk, this cannot add any duplicates. - for (int i = 0; i < (int)DomNodes.size(); ++i) - DomNodes.insert(DomNodes.end(), DomNodes[i]->begin(), DomNodes[i]->end()); - - // Now create a set of the basic blocks so we can quickly test for - // dominated successors. We could in theory use the DFS numbers of the - // dominator tree for this, but we want this to remain predictably fast - // even while we mutate the dominator tree in ways that would invalidate - // the DFS numbering. - for (DomTreeNode *InnerN : DomNodes) - DomSet.insert(InnerN->getBlock()); - - // Now re-walk the nodes, appending every successor of every node that isn't - // in the set. Note that we don't append the node itself, even though if it - // is a successor it does not strictly dominate itself and thus it would be - // part of the dominance frontier. The reason we don't append it is that - // the node passed in came *from* the worklist and so it has already been - // processed. - for (DomTreeNode *InnerN : DomNodes) - for (BasicBlock *SuccBB : successors(InnerN->getBlock())) - if (!DomSet.count(SuccBB)) - Worklist.insert(SuccBB); - - DomNodes.clear(); - DomSet.clear(); -} + // Visit this operand. + if (Visited.insert(OpI).second) + Worklist.push_back(OpI); + } + } while (!Worklist.empty()); -/// Update the dominator tree after unswitching a particular former exit block. -/// -/// This handles the full update of the dominator tree after hoisting a block -/// that previously was an exit block (or split off of an exit block) up to be -/// reached from the new immediate dominator of the preheader. -/// -/// The common case is simple -- we just move the unswitched block to have an -/// immediate dominator of the old preheader. But in complex cases, there may -/// be other blocks reachable from the unswitched block that are immediately -/// dominated by some node between the unswitched one and the old preheader. -/// All of these also need to be hoisted in the dominator tree. We also want to -/// minimize queries to the dominator tree because each step of this -/// invalidates any DFS numbers that would make queries fast. -static void updateDTAfterUnswitch(BasicBlock *UnswitchedBB, BasicBlock *OldPH, - DominatorTree &DT) { - DomTreeNode *OldPHNode = DT[OldPH]; - DomTreeNode *UnswitchedNode = DT[UnswitchedBB]; - // If the dominator tree has already been updated for this unswitched node, - // we're done. This makes it easier to use this routine if there are multiple - // paths to the same unswitched destination. - if (UnswitchedNode->getIDom() == OldPHNode) - return; + return Invariants; +} - // First collect the domtree nodes that we are hoisting over. These are the - // set of nodes which may have children that need to be hoisted as well. - SmallPtrSet<DomTreeNode *, 4> DomChain; - for (auto *IDom = UnswitchedNode->getIDom(); IDom != OldPHNode; - IDom = IDom->getIDom()) - DomChain.insert(IDom); - - // The unswitched block ends up immediately dominated by the old preheader -- - // regardless of whether it is the loop exit block or split off of the loop - // exit block. - DT.changeImmediateDominator(UnswitchedNode, OldPHNode); - - // For everything that moves up the dominator tree, we need to examine the - // dominator frontier to see if it additionally should move up the dominator - // tree. This lambda appends the dominator frontier for a node on the - // worklist. - SmallSetVector<BasicBlock *, 4> Worklist; - - // Scratch data structures reused by domfrontier finding. - SmallVector<DomTreeNode *, 4> DomNodes; - SmallPtrSet<BasicBlock *, 4> DomSet; - - // Append the initial dom frontier nodes. - appendDomFrontier(UnswitchedNode, Worklist, DomNodes, DomSet); - - // Walk the worklist. We grow the list in the loop and so must recompute size. - for (int i = 0; i < (int)Worklist.size(); ++i) { - auto *BB = Worklist[i]; - - DomTreeNode *Node = DT[BB]; - assert(!DomChain.count(Node) && - "Cannot be dominated by a block you can reach!"); - - // If this block had an immediate dominator somewhere in the chain - // we hoisted over, then its position in the domtree needs to move as it is - // reachable from a node hoisted over this chain. - if (!DomChain.count(Node->getIDom())) - continue; +static void replaceLoopInvariantUses(Loop &L, Value *Invariant, + Constant &Replacement) { + assert(!isa<Constant>(Invariant) && "Why are we unswitching on a constant?"); - DT.changeImmediateDominator(Node, OldPHNode); + // Replace uses of LIC in the loop with the given constant. + for (auto UI = Invariant->use_begin(), UE = Invariant->use_end(); UI != UE;) { + // Grab the use and walk past it so we can clobber it in the use list. + Use *U = &*UI++; + Instruction *UserI = dyn_cast<Instruction>(U->getUser()); - // Now add this node's dominator frontier to the worklist as well. - appendDomFrontier(Node, Worklist, DomNodes, DomSet); + // Replace this use within the loop body. + if (UserI && L.contains(UserI)) + U->set(&Replacement); } } @@ -261,6 +150,26 @@ static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB, llvm_unreachable("Basic blocks should never be empty!"); } +/// Insert code to test a set of loop invariant values, and conditionally branch +/// on them. +static void buildPartialUnswitchConditionalBranch(BasicBlock &BB, + ArrayRef<Value *> Invariants, + bool Direction, + BasicBlock &UnswitchedSucc, + BasicBlock &NormalSucc) { + IRBuilder<> IRB(&BB); + Value *Cond = Invariants.front(); + for (Value *Invariant : + make_range(std::next(Invariants.begin()), Invariants.end())) + if (Direction) + Cond = IRB.CreateOr(Cond, Invariant); + else + Cond = IRB.CreateAnd(Cond, Invariant); + + IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc, + Direction ? &NormalSucc : &UnswitchedSucc); +} + /// Rewrite the PHI nodes in an unswitched loop exit basic block. /// /// Requires that the loop exit and unswitched basic block are the same, and @@ -271,19 +180,14 @@ static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB, static void rewritePHINodesForUnswitchedExitBlock(BasicBlock &UnswitchedBB, BasicBlock &OldExitingBB, BasicBlock &OldPH) { - for (Instruction &I : UnswitchedBB) { - auto *PN = dyn_cast<PHINode>(&I); - if (!PN) - // No more PHIs to check. - break; - + for (PHINode &PN : UnswitchedBB.phis()) { // When the loop exit is directly unswitched we just need to update the // incoming basic block. We loop to handle weird cases with repeated // incoming blocks, but expect to typically only have one operand here. - for (auto i : seq<int>(0, PN->getNumOperands())) { - assert(PN->getIncomingBlock(i) == &OldExitingBB && + for (auto i : seq<int>(0, PN.getNumOperands())) { + assert(PN.getIncomingBlock(i) == &OldExitingBB && "Found incoming block different from unique predecessor!"); - PN->setIncomingBlock(i, &OldPH); + PN.setIncomingBlock(i, &OldPH); } } } @@ -298,18 +202,14 @@ static void rewritePHINodesForUnswitchedExitBlock(BasicBlock &UnswitchedBB, static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB, BasicBlock &UnswitchedBB, BasicBlock &OldExitingBB, - BasicBlock &OldPH) { + BasicBlock &OldPH, + bool FullUnswitch) { assert(&ExitBB != &UnswitchedBB && "Must have different loop exit and unswitched blocks!"); Instruction *InsertPt = &*UnswitchedBB.begin(); - for (Instruction &I : ExitBB) { - auto *PN = dyn_cast<PHINode>(&I); - if (!PN) - // No more PHIs to check. - break; - - auto *NewPN = PHINode::Create(PN->getType(), /*NumReservedValues*/ 2, - PN->getName() + ".split", InsertPt); + for (PHINode &PN : ExitBB.phis()) { + auto *NewPN = PHINode::Create(PN.getType(), /*NumReservedValues*/ 2, + PN.getName() + ".split", InsertPt); // Walk backwards over the old PHI node's inputs to minimize the cost of // removing each one. We have to do this weird loop manually so that we @@ -320,18 +220,92 @@ static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB, // allowed us to create a single entry for a predecessor block without // having separate entries for each "edge" even though these edges are // required to produce identical results. - for (int i = PN->getNumIncomingValues() - 1; i >= 0; --i) { - if (PN->getIncomingBlock(i) != &OldExitingBB) + for (int i = PN.getNumIncomingValues() - 1; i >= 0; --i) { + if (PN.getIncomingBlock(i) != &OldExitingBB) continue; - Value *Incoming = PN->removeIncomingValue(i); + Value *Incoming = PN.getIncomingValue(i); + if (FullUnswitch) + // No more edge from the old exiting block to the exit block. + PN.removeIncomingValue(i); + NewPN->addIncoming(Incoming, &OldPH); } // Now replace the old PHI with the new one and wire the old one in as an // input to the new one. - PN->replaceAllUsesWith(NewPN); - NewPN->addIncoming(PN, &ExitBB); + PN.replaceAllUsesWith(NewPN); + NewPN->addIncoming(&PN, &ExitBB); + } +} + +/// Hoist the current loop up to the innermost loop containing a remaining exit. +/// +/// Because we've removed an exit from the loop, we may have changed the set of +/// loops reachable and need to move the current loop up the loop nest or even +/// to an entirely separate nest. +static void hoistLoopToNewParent(Loop &L, BasicBlock &Preheader, + DominatorTree &DT, LoopInfo &LI) { + // If the loop is already at the top level, we can't hoist it anywhere. + Loop *OldParentL = L.getParentLoop(); + if (!OldParentL) + return; + + SmallVector<BasicBlock *, 4> Exits; + L.getExitBlocks(Exits); + Loop *NewParentL = nullptr; + for (auto *ExitBB : Exits) + if (Loop *ExitL = LI.getLoopFor(ExitBB)) + if (!NewParentL || NewParentL->contains(ExitL)) + NewParentL = ExitL; + + if (NewParentL == OldParentL) + return; + + // The new parent loop (if different) should always contain the old one. + if (NewParentL) + assert(NewParentL->contains(OldParentL) && + "Can only hoist this loop up the nest!"); + + // The preheader will need to move with the body of this loop. However, + // because it isn't in this loop we also need to update the primary loop map. + assert(OldParentL == LI.getLoopFor(&Preheader) && + "Parent loop of this loop should contain this loop's preheader!"); + LI.changeLoopFor(&Preheader, NewParentL); + + // Remove this loop from its old parent. + OldParentL->removeChildLoop(&L); + + // Add the loop either to the new parent or as a top-level loop. + if (NewParentL) + NewParentL->addChildLoop(&L); + else + LI.addTopLevelLoop(&L); + + // Remove this loops blocks from the old parent and every other loop up the + // nest until reaching the new parent. Also update all of these + // no-longer-containing loops to reflect the nesting change. + for (Loop *OldContainingL = OldParentL; OldContainingL != NewParentL; + OldContainingL = OldContainingL->getParentLoop()) { + llvm::erase_if(OldContainingL->getBlocksVector(), + [&](const BasicBlock *BB) { + return BB == &Preheader || L.contains(BB); + }); + + OldContainingL->getBlocksSet().erase(&Preheader); + for (BasicBlock *BB : L.blocks()) + OldContainingL->getBlocksSet().erase(BB); + + // Because we just hoisted a loop out of this one, we have essentially + // created new exit paths from it. That means we need to form LCSSA PHI + // nodes for values used in the no-longer-nested loop. + formLCSSA(*OldContainingL, DT, &LI, nullptr); + + // We shouldn't need to form dedicated exits because the exit introduced + // here is the (just split by unswitching) preheader. As such, it is + // necessarily dedicated. + assert(OldContainingL->hasDedicatedExits() && + "Unexpected predecessor of hoisted loop preheader!"); } } @@ -349,48 +323,83 @@ static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB, /// (splitting the exit block as necessary). It simplifies the branch within /// the loop to an unconditional branch but doesn't remove it entirely. Further /// cleanup can be done with some simplify-cfg like pass. +/// +/// If `SE` is not null, it will be updated based on the potential loop SCEVs +/// invalidated by this. static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, - LoopInfo &LI) { + LoopInfo &LI, ScalarEvolution *SE) { assert(BI.isConditional() && "Can only unswitch a conditional branch!"); - DEBUG(dbgs() << " Trying to unswitch branch: " << BI << "\n"); + LLVM_DEBUG(dbgs() << " Trying to unswitch branch: " << BI << "\n"); - Value *LoopCond = BI.getCondition(); + // The loop invariant values that we want to unswitch. + TinyPtrVector<Value *> Invariants; - // Need a trivial loop condition to unswitch. - if (!L.isLoopInvariant(LoopCond)) - return false; + // When true, we're fully unswitching the branch rather than just unswitching + // some input conditions to the branch. + bool FullUnswitch = false; - // FIXME: We should compute this once at the start and update it! - SmallVector<BasicBlock *, 16> ExitBlocks; - L.getExitBlocks(ExitBlocks); - SmallPtrSet<BasicBlock *, 16> ExitBlockSet(ExitBlocks.begin(), - ExitBlocks.end()); - - // Check to see if a successor of the branch is guaranteed to - // exit through a unique exit block without having any - // side-effects. If so, determine the value of Cond that causes - // it to do this. - ConstantInt *CondVal = ConstantInt::getTrue(BI.getContext()); - ConstantInt *Replacement = ConstantInt::getFalse(BI.getContext()); + if (L.isLoopInvariant(BI.getCondition())) { + Invariants.push_back(BI.getCondition()); + FullUnswitch = true; + } else { + if (auto *CondInst = dyn_cast<Instruction>(BI.getCondition())) + Invariants = collectHomogenousInstGraphLoopInvariants(L, *CondInst, LI); + if (Invariants.empty()) + // Couldn't find invariant inputs! + return false; + } + + // Check that one of the branch's successors exits, and which one. + bool ExitDirection = true; int LoopExitSuccIdx = 0; auto *LoopExitBB = BI.getSuccessor(0); - if (!ExitBlockSet.count(LoopExitBB)) { - std::swap(CondVal, Replacement); + if (L.contains(LoopExitBB)) { + ExitDirection = false; LoopExitSuccIdx = 1; LoopExitBB = BI.getSuccessor(1); - if (!ExitBlockSet.count(LoopExitBB)) + if (L.contains(LoopExitBB)) return false; } auto *ContinueBB = BI.getSuccessor(1 - LoopExitSuccIdx); - assert(L.contains(ContinueBB) && - "Cannot have both successors exit and still be in the loop!"); - auto *ParentBB = BI.getParent(); if (!areLoopExitPHIsLoopInvariant(L, *ParentBB, *LoopExitBB)) return false; - DEBUG(dbgs() << " unswitching trivial branch when: " << CondVal - << " == " << LoopCond << "\n"); + // When unswitching only part of the branch's condition, we need the exit + // block to be reached directly from the partially unswitched input. This can + // be done when the exit block is along the true edge and the branch condition + // is a graph of `or` operations, or the exit block is along the false edge + // and the condition is a graph of `and` operations. + if (!FullUnswitch) { + if (ExitDirection) { + if (cast<Instruction>(BI.getCondition())->getOpcode() != Instruction::Or) + return false; + } else { + if (cast<Instruction>(BI.getCondition())->getOpcode() != Instruction::And) + return false; + } + } + + LLVM_DEBUG({ + dbgs() << " unswitching trivial invariant conditions for: " << BI + << "\n"; + for (Value *Invariant : Invariants) { + dbgs() << " " << *Invariant << " == true"; + if (Invariant != Invariants.back()) + dbgs() << " ||"; + dbgs() << "\n"; + } + }); + + // If we have scalar evolutions, we need to invalidate them including this + // loop and the loop containing the exit block. + if (SE) { + if (Loop *ExitL = LI.getLoopFor(LoopExitBB)) + SE->forgetLoop(ExitL); + else + // Forget the entire nest as this exits the entire nest. + SE->forgetTopmostLoop(&L); + } // Split the preheader, so that we know that there is a safe place to insert // the conditional branch. We will change the preheader to have a conditional @@ -403,45 +412,73 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, // unswitching. We need to split this if there are other loop predecessors. // Because the loop is in simplified form, *any* other predecessor is enough. BasicBlock *UnswitchedBB; - if (BasicBlock *PredBB = LoopExitBB->getUniquePredecessor()) { - (void)PredBB; - assert(PredBB == BI.getParent() && + if (FullUnswitch && LoopExitBB->getUniquePredecessor()) { + assert(LoopExitBB->getUniquePredecessor() == BI.getParent() && "A branch's parent isn't a predecessor!"); UnswitchedBB = LoopExitBB; } else { UnswitchedBB = SplitBlock(LoopExitBB, &LoopExitBB->front(), &DT, &LI); } - // Now splice the branch to gate reaching the new preheader and re-point its - // successors. - OldPH->getInstList().splice(std::prev(OldPH->end()), - BI.getParent()->getInstList(), BI); + // Actually move the invariant uses into the unswitched position. If possible, + // we do this by moving the instructions, but when doing partial unswitching + // we do it by building a new merge of the values in the unswitched position. OldPH->getTerminator()->eraseFromParent(); - BI.setSuccessor(LoopExitSuccIdx, UnswitchedBB); - BI.setSuccessor(1 - LoopExitSuccIdx, NewPH); - - // Create a new unconditional branch that will continue the loop as a new - // terminator. - BranchInst::Create(ContinueBB, ParentBB); + if (FullUnswitch) { + // If fully unswitching, we can use the existing branch instruction. + // Splice it into the old PH to gate reaching the new preheader and re-point + // its successors. + OldPH->getInstList().splice(OldPH->end(), BI.getParent()->getInstList(), + BI); + BI.setSuccessor(LoopExitSuccIdx, UnswitchedBB); + BI.setSuccessor(1 - LoopExitSuccIdx, NewPH); + + // Create a new unconditional branch that will continue the loop as a new + // terminator. + BranchInst::Create(ContinueBB, ParentBB); + } else { + // Only unswitching a subset of inputs to the condition, so we will need to + // build a new branch that merges the invariant inputs. + if (ExitDirection) + assert(cast<Instruction>(BI.getCondition())->getOpcode() == + Instruction::Or && + "Must have an `or` of `i1`s for the condition!"); + else + assert(cast<Instruction>(BI.getCondition())->getOpcode() == + Instruction::And && + "Must have an `and` of `i1`s for the condition!"); + buildPartialUnswitchConditionalBranch(*OldPH, Invariants, ExitDirection, + *UnswitchedBB, *NewPH); + } // Rewrite the relevant PHI nodes. if (UnswitchedBB == LoopExitBB) rewritePHINodesForUnswitchedExitBlock(*UnswitchedBB, *ParentBB, *OldPH); else rewritePHINodesForExitAndUnswitchedBlocks(*LoopExitBB, *UnswitchedBB, - *ParentBB, *OldPH); + *ParentBB, *OldPH, FullUnswitch); // Now we need to update the dominator tree. - updateDTAfterUnswitch(UnswitchedBB, OldPH, DT); - // But if we split something off of the loop exit block then we also removed - // one of the predecessors for the loop exit block and may need to update its - // idom. - if (UnswitchedBB != LoopExitBB) - updateIDomWithKnownCommonDominator(LoopExitBB, L.getHeader(), DT); + DT.insertEdge(OldPH, UnswitchedBB); + if (FullUnswitch) + DT.deleteEdge(ParentBB, UnswitchedBB); + + // The constant we can replace all of our invariants with inside the loop + // body. If any of the invariants have a value other than this the loop won't + // be entered. + ConstantInt *Replacement = ExitDirection + ? ConstantInt::getFalse(BI.getContext()) + : ConstantInt::getTrue(BI.getContext()); // Since this is an i1 condition we can also trivially replace uses of it // within the loop with a constant. - replaceLoopUsesWithConstant(L, *LoopCond, *Replacement); + for (Value *Invariant : Invariants) + replaceLoopInvariantUses(L, Invariant, *Replacement); + + // If this was full unswitching, we may have changed the nesting relationship + // for this loop so hoist it to its correct parent if needed. + if (FullUnswitch) + hoistLoopToNewParent(L, *NewPH, DT, LI); ++NumTrivial; ++NumBranches; @@ -471,9 +508,12 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, /// switch will not be revisited. If after unswitching there is only a single /// in-loop successor, the switch is further simplified to an unconditional /// branch. Still more cleanup can be done with some simplify-cfg like pass. +/// +/// If `SE` is not null, it will be updated based on the potential loop SCEVs +/// invalidated by this. static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, - LoopInfo &LI) { - DEBUG(dbgs() << " Trying to unswitch switch: " << SI << "\n"); + LoopInfo &LI, ScalarEvolution *SE) { + LLVM_DEBUG(dbgs() << " Trying to unswitch switch: " << SI << "\n"); Value *LoopCond = SI.getCondition(); // If this isn't switching on an invariant condition, we can't unswitch it. @@ -482,41 +522,62 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, auto *ParentBB = SI.getParent(); - // FIXME: We should compute this once at the start and update it! - SmallVector<BasicBlock *, 16> ExitBlocks; - L.getExitBlocks(ExitBlocks); - SmallPtrSet<BasicBlock *, 16> ExitBlockSet(ExitBlocks.begin(), - ExitBlocks.end()); - SmallVector<int, 4> ExitCaseIndices; for (auto Case : SI.cases()) { auto *SuccBB = Case.getCaseSuccessor(); - if (ExitBlockSet.count(SuccBB) && + if (!L.contains(SuccBB) && areLoopExitPHIsLoopInvariant(L, *ParentBB, *SuccBB)) ExitCaseIndices.push_back(Case.getCaseIndex()); } BasicBlock *DefaultExitBB = nullptr; - if (ExitBlockSet.count(SI.getDefaultDest()) && + if (!L.contains(SI.getDefaultDest()) && areLoopExitPHIsLoopInvariant(L, *ParentBB, *SI.getDefaultDest()) && !isa<UnreachableInst>(SI.getDefaultDest()->getTerminator())) DefaultExitBB = SI.getDefaultDest(); else if (ExitCaseIndices.empty()) return false; - DEBUG(dbgs() << " unswitching trivial cases...\n"); + LLVM_DEBUG(dbgs() << " unswitching trivial cases...\n"); + // We may need to invalidate SCEVs for the outermost loop reached by any of + // the exits. + Loop *OuterL = &L; + + if (DefaultExitBB) { + // Clear out the default destination temporarily to allow accurate + // predecessor lists to be examined below. + SI.setDefaultDest(nullptr); + // Check the loop containing this exit. + Loop *ExitL = LI.getLoopFor(DefaultExitBB); + if (!ExitL || ExitL->contains(OuterL)) + OuterL = ExitL; + } + + // Store the exit cases into a separate data structure and remove them from + // the switch. SmallVector<std::pair<ConstantInt *, BasicBlock *>, 4> ExitCases; ExitCases.reserve(ExitCaseIndices.size()); // We walk the case indices backwards so that we remove the last case first // and don't disrupt the earlier indices. for (unsigned Index : reverse(ExitCaseIndices)) { auto CaseI = SI.case_begin() + Index; + // Compute the outer loop from this exit. + Loop *ExitL = LI.getLoopFor(CaseI->getCaseSuccessor()); + if (!ExitL || ExitL->contains(OuterL)) + OuterL = ExitL; // Save the value of this case. ExitCases.push_back({CaseI->getCaseValue(), CaseI->getCaseSuccessor()}); // Delete the unswitched cases. SI.removeCase(CaseI); } + if (SE) { + if (OuterL) + SE->forgetLoop(OuterL); + else + SE->forgetTopmostLoop(&L); + } + // Check if after this all of the remaining cases point at the same // successor. BasicBlock *CommonSuccBB = nullptr; @@ -527,23 +588,7 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, SI.case_begin()->getCaseSuccessor(); })) CommonSuccBB = SI.case_begin()->getCaseSuccessor(); - - if (DefaultExitBB) { - // We can't remove the default edge so replace it with an edge to either - // the single common remaining successor (if we have one) or an unreachable - // block. - if (CommonSuccBB) { - SI.setDefaultDest(CommonSuccBB); - } else { - BasicBlock *UnreachableBB = BasicBlock::Create( - ParentBB->getContext(), - Twine(ParentBB->getName()) + ".unreachable_default", - ParentBB->getParent()); - new UnreachableInst(ParentBB->getContext(), UnreachableBB); - SI.setDefaultDest(UnreachableBB); - DT.addNewBlock(UnreachableBB, ParentBB); - } - } else { + if (!DefaultExitBB) { // If we're not unswitching the default, we need it to match any cases to // have a common successor or if we have no cases it is the common // successor. @@ -580,9 +625,8 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, } else { auto *SplitBB = SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI); - rewritePHINodesForExitAndUnswitchedBlocks(*DefaultExitBB, *SplitBB, - *ParentBB, *OldPH); - updateIDomWithKnownCommonDominator(DefaultExitBB, L.getHeader(), DT); + rewritePHINodesForExitAndUnswitchedBlocks( + *DefaultExitBB, *SplitBB, *ParentBB, *OldPH, /*FullUnswitch*/ true); DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB; } } @@ -607,9 +651,8 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, if (!SplitExitBB) { // If this is the first time we see this, do the split and remember it. SplitExitBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI); - rewritePHINodesForExitAndUnswitchedBlocks(*ExitBB, *SplitExitBB, - *ParentBB, *OldPH); - updateIDomWithKnownCommonDominator(ExitBB, L.getHeader(), DT); + rewritePHINodesForExitAndUnswitchedBlocks( + *ExitBB, *SplitExitBB, *ParentBB, *OldPH, /*FullUnswitch*/ true); } // Update the case pair to point to the split block. CasePair.second = SplitExitBB; @@ -622,14 +665,12 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, BasicBlock *UnswitchedBB = CasePair.second; NewSI->addCase(CaseVal, UnswitchedBB); - updateDTAfterUnswitch(UnswitchedBB, OldPH, DT); } // If the default was unswitched, re-point it and add explicit cases for // entering the loop. if (DefaultExitBB) { NewSI->setDefaultDest(DefaultExitBB); - updateDTAfterUnswitch(DefaultExitBB, OldPH, DT); // We removed all the exit cases, so we just copy the cases to the // unswitched switch. @@ -643,11 +684,57 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, // pointing at unreachable and other complexity. if (CommonSuccBB) { BasicBlock *BB = SI.getParent(); + // We may have had multiple edges to this common successor block, so remove + // them as predecessors. We skip the first one, either the default or the + // actual first case. + bool SkippedFirst = DefaultExitBB == nullptr; + for (auto Case : SI.cases()) { + assert(Case.getCaseSuccessor() == CommonSuccBB && + "Non-common successor!"); + (void)Case; + if (!SkippedFirst) { + SkippedFirst = true; + continue; + } + CommonSuccBB->removePredecessor(BB, + /*DontDeleteUselessPHIs*/ true); + } + // Now nuke the switch and replace it with a direct branch. SI.eraseFromParent(); BranchInst::Create(CommonSuccBB, BB); + } else if (DefaultExitBB) { + assert(SI.getNumCases() > 0 && + "If we had no cases we'd have a common successor!"); + // Move the last case to the default successor. This is valid as if the + // default got unswitched it cannot be reached. This has the advantage of + // being simple and keeping the number of edges from this switch to + // successors the same, and avoiding any PHI update complexity. + auto LastCaseI = std::prev(SI.case_end()); + SI.setDefaultDest(LastCaseI->getCaseSuccessor()); + SI.removeCase(LastCaseI); + } + + // Walk the unswitched exit blocks and the unswitched split blocks and update + // the dominator tree based on the CFG edits. While we are walking unordered + // containers here, the API for applyUpdates takes an unordered list of + // updates and requires them to not contain duplicates. + SmallVector<DominatorTree::UpdateType, 4> DTUpdates; + for (auto *UnswitchedExitBB : UnswitchedExitBBs) { + DTUpdates.push_back({DT.Delete, ParentBB, UnswitchedExitBB}); + DTUpdates.push_back({DT.Insert, OldPH, UnswitchedExitBB}); } + for (auto SplitUnswitchedPair : SplitExitBBMap) { + auto *UnswitchedBB = SplitUnswitchedPair.second; + DTUpdates.push_back({DT.Delete, ParentBB, UnswitchedBB}); + DTUpdates.push_back({DT.Insert, OldPH, UnswitchedBB}); + } + DT.applyUpdates(DTUpdates); + assert(DT.verify(DominatorTree::VerificationLevel::Fast)); + + // We may have changed the nesting relationship for this loop so hoist it to + // its correct parent if needed. + hoistLoopToNewParent(L, *NewPH, DT, LI); - DT.verifyDomTree(); ++NumTrivial; ++NumSwitches; return true; @@ -662,8 +749,11 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, /// /// The return value indicates whether anything was unswitched (and therefore /// changed). +/// +/// If `SE` is not null, it will be updated based on the potential loop SCEVs +/// invalidated by this. static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT, - LoopInfo &LI) { + LoopInfo &LI, ScalarEvolution *SE) { bool Changed = false; // If loop header has only one reachable successor we should keep looking for @@ -697,8 +787,8 @@ static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT, if (isa<Constant>(SI->getCondition())) return Changed; - if (!unswitchTrivialSwitch(L, *SI, DT, LI)) - // Coludn't unswitch this one so we're done. + if (!unswitchTrivialSwitch(L, *SI, DT, LI, SE)) + // Couldn't unswitch this one so we're done. return Changed; // Mark that we managed to unswitch something. @@ -729,17 +819,19 @@ static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT, // Found a trivial condition candidate: non-foldable conditional branch. If // we fail to unswitch this, we can't do anything else that is trivial. - if (!unswitchTrivialBranch(L, *BI, DT, LI)) + if (!unswitchTrivialBranch(L, *BI, DT, LI, SE)) return Changed; // Mark that we managed to unswitch something. Changed = true; - // We unswitched the branch. This should always leave us with an - // unconditional branch that we can follow now. + // If we only unswitched some of the conditions feeding the branch, we won't + // have collapsed it to a single successor. BI = cast<BranchInst>(CurrentBB->getTerminator()); - assert(!BI->isConditional() && - "Cannot form a conditional branch by unswitching1"); + if (BI->isConditional()) + return Changed; + + // Follow the newly unconditional branch into its successor. CurrentBB = BI->getSuccessor(0); // When continuing, if we exit the loop or reach a previous visited block, @@ -758,8 +850,12 @@ static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT, /// /// This routine handles cloning all of the necessary loop blocks and exit /// blocks including rewriting their instructions and the relevant PHI nodes. -/// It skips loop and exit blocks that are not necessary based on the provided -/// set. It also correctly creates the unconditional branch in the cloned +/// Any loop blocks or exit blocks which are dominated by a different successor +/// than the one for this clone of the loop blocks can be trivially skipped. We +/// use the `DominatingSucc` map to determine whether a block satisfies that +/// property with a simple map lookup. +/// +/// It also correctly creates the unconditional branch in the cloned /// unswitched parent block to only point at the unswitched successor. /// /// This does not handle most of the necessary updates to `LoopInfo`. Only exit @@ -773,9 +869,10 @@ static BasicBlock *buildClonedLoopBlocks( Loop &L, BasicBlock *LoopPH, BasicBlock *SplitBB, ArrayRef<BasicBlock *> ExitBlocks, BasicBlock *ParentBB, BasicBlock *UnswitchedSuccBB, BasicBlock *ContinueSuccBB, - const SmallPtrSetImpl<BasicBlock *> &SkippedLoopAndExitBlocks, - ValueToValueMapTy &VMap, AssumptionCache &AC, DominatorTree &DT, - LoopInfo &LI) { + const SmallDenseMap<BasicBlock *, BasicBlock *, 16> &DominatingSucc, + ValueToValueMapTy &VMap, + SmallVectorImpl<DominatorTree::UpdateType> &DTUpdates, AssumptionCache &AC, + DominatorTree &DT, LoopInfo &LI) { SmallVector<BasicBlock *, 4> NewBlocks; NewBlocks.reserve(L.getNumBlocks() + ExitBlocks.size()); @@ -790,26 +887,29 @@ static BasicBlock *buildClonedLoopBlocks( NewBlocks.push_back(NewBB); VMap[OldBB] = NewBB; - // Add the block to the domtree. We'll move it to the correct position - // below. - DT.addNewBlock(NewBB, SplitBB); - return NewBB; }; + // We skip cloning blocks when they have a dominating succ that is not the + // succ we are cloning for. + auto SkipBlock = [&](BasicBlock *BB) { + auto It = DominatingSucc.find(BB); + return It != DominatingSucc.end() && It->second != UnswitchedSuccBB; + }; + // First, clone the preheader. auto *ClonedPH = CloneBlock(LoopPH); // Then clone all the loop blocks, skipping the ones that aren't necessary. for (auto *LoopBB : L.blocks()) - if (!SkippedLoopAndExitBlocks.count(LoopBB)) + if (!SkipBlock(LoopBB)) CloneBlock(LoopBB); // Split all the loop exit edges so that when we clone the exit blocks, if // any of the exit blocks are *also* a preheader for some other loop, we // don't create multiple predecessors entering the loop header. for (auto *ExitBB : ExitBlocks) { - if (SkippedLoopAndExitBlocks.count(ExitBB)) + if (SkipBlock(ExitBB)) continue; // When we are going to clone an exit, we don't need to clone all the @@ -832,17 +932,6 @@ static BasicBlock *buildClonedLoopBlocks( assert(ClonedExitBB->getTerminator()->getSuccessor(0) == MergeBB && "Cloned exit block has the wrong successor!"); - // Move the merge block's idom to be the split point as one exit is - // dominated by one header, and the other by another, so we know the split - // point dominates both. While the dominator tree isn't fully accurate, we - // want sub-trees within the original loop to be correctly reflect - // dominance within that original loop (at least) and that requires moving - // the merge block out of that subtree. - // FIXME: This is very brittle as we essentially have a partial contract on - // the dominator tree. We really need to instead update it and keep it - // valid or stop relying on it. - DT.changeImmediateDominator(MergeBB, SplitBB); - // Remap any cloned instructions and create a merge phi node for them. for (auto ZippedInsts : llvm::zip_first( llvm::make_range(ExitBB->begin(), std::prev(ExitBB->end())), @@ -882,28 +971,63 @@ static BasicBlock *buildClonedLoopBlocks( AC.registerAssumption(II); } - // Remove the cloned parent as a predecessor of the cloned continue successor - // if we did in fact clone it. - auto *ClonedParentBB = cast<BasicBlock>(VMap.lookup(ParentBB)); - if (auto *ClonedContinueSuccBB = - cast_or_null<BasicBlock>(VMap.lookup(ContinueSuccBB))) - ClonedContinueSuccBB->removePredecessor(ClonedParentBB, - /*DontDeleteUselessPHIs*/ true); - // Replace the cloned branch with an unconditional branch to the cloneed - // unswitched successor. - auto *ClonedSuccBB = cast<BasicBlock>(VMap.lookup(UnswitchedSuccBB)); - ClonedParentBB->getTerminator()->eraseFromParent(); - BranchInst::Create(ClonedSuccBB, ClonedParentBB); - // Update any PHI nodes in the cloned successors of the skipped blocks to not // have spurious incoming values. for (auto *LoopBB : L.blocks()) - if (SkippedLoopAndExitBlocks.count(LoopBB)) + if (SkipBlock(LoopBB)) for (auto *SuccBB : successors(LoopBB)) if (auto *ClonedSuccBB = cast_or_null<BasicBlock>(VMap.lookup(SuccBB))) for (PHINode &PN : ClonedSuccBB->phis()) PN.removeIncomingValue(LoopBB, /*DeletePHIIfEmpty*/ false); + // Remove the cloned parent as a predecessor of any successor we ended up + // cloning other than the unswitched one. + auto *ClonedParentBB = cast<BasicBlock>(VMap.lookup(ParentBB)); + for (auto *SuccBB : successors(ParentBB)) { + if (SuccBB == UnswitchedSuccBB) + continue; + + auto *ClonedSuccBB = cast_or_null<BasicBlock>(VMap.lookup(SuccBB)); + if (!ClonedSuccBB) + continue; + + ClonedSuccBB->removePredecessor(ClonedParentBB, + /*DontDeleteUselessPHIs*/ true); + } + + // Replace the cloned branch with an unconditional branch to the cloned + // unswitched successor. + auto *ClonedSuccBB = cast<BasicBlock>(VMap.lookup(UnswitchedSuccBB)); + ClonedParentBB->getTerminator()->eraseFromParent(); + BranchInst::Create(ClonedSuccBB, ClonedParentBB); + + // If there are duplicate entries in the PHI nodes because of multiple edges + // to the unswitched successor, we need to nuke all but one as we replaced it + // with a direct branch. + for (PHINode &PN : ClonedSuccBB->phis()) { + bool Found = false; + // Loop over the incoming operands backwards so we can easily delete as we + // go without invalidating the index. + for (int i = PN.getNumOperands() - 1; i >= 0; --i) { + if (PN.getIncomingBlock(i) != ClonedParentBB) + continue; + if (!Found) { + Found = true; + continue; + } + PN.removeIncomingValue(i, /*DeletePHIIfEmpty*/ false); + } + } + + // Record the domtree updates for the new blocks. + SmallPtrSet<BasicBlock *, 4> SuccSet; + for (auto *ClonedBB : NewBlocks) { + for (auto *SuccBB : successors(ClonedBB)) + if (SuccSet.insert(SuccBB).second) + DTUpdates.push_back({DominatorTree::Insert, ClonedBB, SuccBB}); + SuccSet.clear(); + } + return ClonedPH; } @@ -921,11 +1045,8 @@ static Loop *cloneLoopNest(Loop &OrigRootL, Loop *RootParentL, for (auto *BB : OrigL.blocks()) { auto *ClonedBB = cast<BasicBlock>(VMap.lookup(BB)); ClonedL.addBlockEntry(ClonedBB); - if (LI.getLoopFor(BB) == &OrigL) { - assert(!LI.getLoopFor(ClonedBB) && - "Should not have an existing loop for this block!"); + if (LI.getLoopFor(BB) == &OrigL) LI.changeLoopFor(ClonedBB, &ClonedL); - } } }; @@ -975,9 +1096,9 @@ static Loop *cloneLoopNest(Loop &OrigRootL, Loop *RootParentL, /// original loop, multiple cloned sibling loops may be created. All of them /// are returned so that the newly introduced loop nest roots can be /// identified. -static Loop *buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks, - const ValueToValueMapTy &VMap, LoopInfo &LI, - SmallVectorImpl<Loop *> &NonChildClonedLoops) { +static void buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks, + const ValueToValueMapTy &VMap, LoopInfo &LI, + SmallVectorImpl<Loop *> &NonChildClonedLoops) { Loop *ClonedL = nullptr; auto *OrigPH = OrigL.getLoopPreheader(); @@ -1070,6 +1191,7 @@ static Loop *buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks, } else { LI.addTopLevelLoop(ClonedL); } + NonChildClonedLoops.push_back(ClonedL); ClonedL->reserveBlocks(BlocksInClonedLoop.size()); // We don't want to just add the cloned loop blocks based on how we @@ -1138,11 +1260,11 @@ static Loop *buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks, // matter as we're just trying to build up the map from inside-out; we use // the map in a more stably ordered way below. auto OrderedClonedExitsInLoops = ClonedExitsInLoops; - std::sort(OrderedClonedExitsInLoops.begin(), OrderedClonedExitsInLoops.end(), - [&](BasicBlock *LHS, BasicBlock *RHS) { - return ExitLoopMap.lookup(LHS)->getLoopDepth() < - ExitLoopMap.lookup(RHS)->getLoopDepth(); - }); + llvm::sort(OrderedClonedExitsInLoops.begin(), OrderedClonedExitsInLoops.end(), + [&](BasicBlock *LHS, BasicBlock *RHS) { + return ExitLoopMap.lookup(LHS)->getLoopDepth() < + ExitLoopMap.lookup(RHS)->getLoopDepth(); + }); // Populate the existing ExitLoopMap with everything reachable from each // exit, starting from the inner most exit. @@ -1222,60 +1344,69 @@ static Loop *buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks, NonChildClonedLoops.push_back(cloneLoopNest( *ChildL, ExitLoopMap.lookup(ClonedChildHeader), VMap, LI)); } +} + +static void +deleteDeadClonedBlocks(Loop &L, ArrayRef<BasicBlock *> ExitBlocks, + ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps, + DominatorTree &DT) { + // Find all the dead clones, and remove them from their successors. + SmallVector<BasicBlock *, 16> DeadBlocks; + for (BasicBlock *BB : llvm::concat<BasicBlock *const>(L.blocks(), ExitBlocks)) + for (auto &VMap : VMaps) + if (BasicBlock *ClonedBB = cast_or_null<BasicBlock>(VMap->lookup(BB))) + if (!DT.isReachableFromEntry(ClonedBB)) { + for (BasicBlock *SuccBB : successors(ClonedBB)) + SuccBB->removePredecessor(ClonedBB); + DeadBlocks.push_back(ClonedBB); + } - // Return the main cloned loop if any. - return ClonedL; + // Drop any remaining references to break cycles. + for (BasicBlock *BB : DeadBlocks) + BB->dropAllReferences(); + // Erase them from the IR. + for (BasicBlock *BB : DeadBlocks) + BB->eraseFromParent(); } -static void deleteDeadBlocksFromLoop(Loop &L, BasicBlock *DeadSubtreeRoot, - SmallVectorImpl<BasicBlock *> &ExitBlocks, - DominatorTree &DT, LoopInfo &LI) { - // Walk the dominator tree to build up the set of blocks we will delete here. - // The order is designed to allow us to always delete bottom-up and avoid any - // dangling uses. - SmallSetVector<BasicBlock *, 16> DeadBlocks; - DeadBlocks.insert(DeadSubtreeRoot); - for (int i = 0; i < (int)DeadBlocks.size(); ++i) - for (DomTreeNode *ChildN : *DT[DeadBlocks[i]]) { - // FIXME: This assert should pass and that means we don't change nearly - // as much below! Consider rewriting all of this to avoid deleting - // blocks. They are always cloned before being deleted, and so instead - // could just be moved. - // FIXME: This in turn means that we might actually be more able to - // update the domtree. - assert((L.contains(ChildN->getBlock()) || - llvm::find(ExitBlocks, ChildN->getBlock()) != ExitBlocks.end()) && - "Should never reach beyond the loop and exits when deleting!"); - DeadBlocks.insert(ChildN->getBlock()); +static void +deleteDeadBlocksFromLoop(Loop &L, + SmallVectorImpl<BasicBlock *> &ExitBlocks, + DominatorTree &DT, LoopInfo &LI) { + // Find all the dead blocks, and remove them from their successors. + SmallVector<BasicBlock *, 16> DeadBlocks; + for (BasicBlock *BB : llvm::concat<BasicBlock *const>(L.blocks(), ExitBlocks)) + if (!DT.isReachableFromEntry(BB)) { + for (BasicBlock *SuccBB : successors(BB)) + SuccBB->removePredecessor(BB); + DeadBlocks.push_back(BB); } + SmallPtrSet<BasicBlock *, 16> DeadBlockSet(DeadBlocks.begin(), + DeadBlocks.end()); + // Filter out the dead blocks from the exit blocks list so that it can be // used in the caller. llvm::erase_if(ExitBlocks, - [&](BasicBlock *BB) { return DeadBlocks.count(BB); }); - - // Remove these blocks from their successors. - for (auto *BB : DeadBlocks) - for (BasicBlock *SuccBB : successors(BB)) - SuccBB->removePredecessor(BB, /*DontDeleteUselessPHIs*/ true); + [&](BasicBlock *BB) { return DeadBlockSet.count(BB); }); // Walk from this loop up through its parents removing all of the dead blocks. for (Loop *ParentL = &L; ParentL; ParentL = ParentL->getParentLoop()) { for (auto *BB : DeadBlocks) ParentL->getBlocksSet().erase(BB); llvm::erase_if(ParentL->getBlocksVector(), - [&](BasicBlock *BB) { return DeadBlocks.count(BB); }); + [&](BasicBlock *BB) { return DeadBlockSet.count(BB); }); } // Now delete the dead child loops. This raw delete will clear them // recursively. llvm::erase_if(L.getSubLoopsVector(), [&](Loop *ChildL) { - if (!DeadBlocks.count(ChildL->getHeader())) + if (!DeadBlockSet.count(ChildL->getHeader())) return false; assert(llvm::all_of(ChildL->blocks(), [&](BasicBlock *ChildBB) { - return DeadBlocks.count(ChildBB); + return DeadBlockSet.count(ChildBB); }) && "If the child loop header is dead all blocks in the child loop must " "be dead as well!"); @@ -1283,19 +1414,20 @@ static void deleteDeadBlocksFromLoop(Loop &L, BasicBlock *DeadSubtreeRoot, return true; }); - // Remove the mappings for the dead blocks. - for (auto *BB : DeadBlocks) + // Remove the loop mappings for the dead blocks and drop all the references + // from these blocks to others to handle cyclic references as we start + // deleting the blocks themselves. + for (auto *BB : DeadBlocks) { + // Check that the dominator tree has already been updated. + assert(!DT.getNode(BB) && "Should already have cleared domtree!"); LI.changeLoopFor(BB, nullptr); - - // Drop all the references from these blocks to others to handle cyclic - // references as we start deleting the blocks themselves. - for (auto *BB : DeadBlocks) BB->dropAllReferences(); + } - for (auto *BB : llvm::reverse(DeadBlocks)) { - DT.eraseNode(BB); + // Actually delete the blocks now that they've been fully unhooked from the + // IR. + for (auto *BB : DeadBlocks) BB->eraseFromParent(); - } } /// Recompute the set of blocks in a loop after unswitching. @@ -1343,14 +1475,15 @@ static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L, if (LoopBlockSet.empty()) return LoopBlockSet; - // Add the loop header to the set. - LoopBlockSet.insert(Header); - // We found backedges, recurse through them to identify the loop blocks. while (!Worklist.empty()) { BasicBlock *BB = Worklist.pop_back_val(); assert(LoopBlockSet.count(BB) && "Didn't put block into the loop set!"); + // No need to walk past the header. + if (BB == Header) + continue; + // Because we know the inner loop structure remains valid we can use the // loop structure to jump immediately across the entire nested loop. // Further, because it is in loop simplified form, we can directly jump @@ -1371,9 +1504,10 @@ static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L, continue; // Insert all of the blocks (other than those already present) into - // the loop set. The only block we expect to already be in the set is - // the one we used to find this loop as we immediately handle the - // others the first time we encounter the loop. + // the loop set. We expect at least the block that led us to find the + // inner loop to be in the block set, but we may also have other loop + // blocks if they were already enqueued as predecessors of some other + // outer loop block. for (auto *InnerBB : InnerL->blocks()) { if (InnerBB == BB) { assert(LoopBlockSet.count(InnerBB) && @@ -1381,9 +1515,7 @@ static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L, continue; } - bool Inserted = LoopBlockSet.insert(InnerBB).second; - (void)Inserted; - assert(Inserted && "Should only insert an inner loop once!"); + LoopBlockSet.insert(InnerBB); } // Add the preheader to the worklist so we will continue past the @@ -1399,6 +1531,8 @@ static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L, Worklist.push_back(Pred); } + assert(LoopBlockSet.count(Header) && "Cannot fail to add the header!"); + // We've found all the blocks participating in the loop, return our completed // set. return LoopBlockSet; @@ -1646,32 +1780,58 @@ void visitDomSubTree(DominatorTree &DT, BasicBlock *BB, CallableT Callable) { } while (!DomWorklist.empty()); } -/// Take an invariant branch that has been determined to be safe and worthwhile -/// to unswitch despite being non-trivial to do so and perform the unswitch. -/// -/// This directly updates the CFG to hoist the predicate out of the loop, and -/// clone the necessary parts of the loop to maintain behavior. -/// -/// It also updates both dominator tree and loopinfo based on the unswitching. -/// -/// Once unswitching has been performed it runs the provided callback to report -/// the new loops and no-longer valid loops to the caller. -static bool unswitchInvariantBranch( - Loop &L, BranchInst &BI, DominatorTree &DT, LoopInfo &LI, - AssumptionCache &AC, - function_ref<void(bool, ArrayRef<Loop *>)> NonTrivialUnswitchCB) { - assert(BI.isConditional() && "Can only unswitch a conditional branch!"); - assert(L.isLoopInvariant(BI.getCondition()) && - "Can only unswitch an invariant branch condition!"); +static bool unswitchNontrivialInvariants( + Loop &L, TerminatorInst &TI, ArrayRef<Value *> Invariants, + DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC, + function_ref<void(bool, ArrayRef<Loop *>)> UnswitchCB, + ScalarEvolution *SE) { + auto *ParentBB = TI.getParent(); + BranchInst *BI = dyn_cast<BranchInst>(&TI); + SwitchInst *SI = BI ? nullptr : cast<SwitchInst>(&TI); + + // We can only unswitch switches, conditional branches with an invariant + // condition, or combining invariant conditions with an instruction. + assert((SI || BI->isConditional()) && + "Can only unswitch switches and conditional branch!"); + bool FullUnswitch = SI || BI->getCondition() == Invariants[0]; + if (FullUnswitch) + assert(Invariants.size() == 1 && + "Cannot have other invariants with full unswitching!"); + else + assert(isa<Instruction>(BI->getCondition()) && + "Partial unswitching requires an instruction as the condition!"); + + // Constant and BBs tracking the cloned and continuing successor. When we are + // unswitching the entire condition, this can just be trivially chosen to + // unswitch towards `true`. However, when we are unswitching a set of + // invariants combined with `and` or `or`, the combining operation determines + // the best direction to unswitch: we want to unswitch the direction that will + // collapse the branch. + bool Direction = true; + int ClonedSucc = 0; + if (!FullUnswitch) { + if (cast<Instruction>(BI->getCondition())->getOpcode() != Instruction::Or) { + assert(cast<Instruction>(BI->getCondition())->getOpcode() == + Instruction::And && + "Only `or` and `and` instructions can combine invariants being " + "unswitched."); + Direction = false; + ClonedSucc = 1; + } + } - // Constant and BBs tracking the cloned and continuing successor. - const int ClonedSucc = 0; - auto *ParentBB = BI.getParent(); - auto *UnswitchedSuccBB = BI.getSuccessor(ClonedSucc); - auto *ContinueSuccBB = BI.getSuccessor(1 - ClonedSucc); + BasicBlock *RetainedSuccBB = + BI ? BI->getSuccessor(1 - ClonedSucc) : SI->getDefaultDest(); + SmallSetVector<BasicBlock *, 4> UnswitchedSuccBBs; + if (BI) + UnswitchedSuccBBs.insert(BI->getSuccessor(ClonedSucc)); + else + for (auto Case : SI->cases()) + if (Case.getCaseSuccessor() != RetainedSuccBB) + UnswitchedSuccBBs.insert(Case.getCaseSuccessor()); - assert(UnswitchedSuccBB != ContinueSuccBB && - "Should not unswitch a branch that always goes to the same place!"); + assert(!UnswitchedSuccBBs.count(RetainedSuccBB) && + "Should not unswitch the same successor we are retaining!"); // The branch should be in this exact loop. Any inner loop's invariant branch // should be handled by unswitching that inner loop. The caller of this @@ -1690,9 +1850,6 @@ static bool unswitchInvariantBranch( if (isa<CleanupPadInst>(ExitBB->getFirstNonPHI())) return false; - SmallPtrSet<BasicBlock *, 4> ExitBlockSet(ExitBlocks.begin(), - ExitBlocks.end()); - // Compute the parent loop now before we start hacking on things. Loop *ParentL = L.getParentLoop(); @@ -1711,27 +1868,31 @@ static bool unswitchInvariantBranch( OuterExitL = NewOuterExitL; } - // If the edge we *aren't* cloning in the unswitch (the continuing edge) - // dominates its target, we can skip cloning the dominated region of the loop - // and its exits. We compute this as a set of nodes to be skipped. - SmallPtrSet<BasicBlock *, 4> SkippedLoopAndExitBlocks; - if (ContinueSuccBB->getUniquePredecessor() || - llvm::all_of(predecessors(ContinueSuccBB), [&](BasicBlock *PredBB) { - return PredBB == ParentBB || DT.dominates(ContinueSuccBB, PredBB); - })) { - visitDomSubTree(DT, ContinueSuccBB, [&](BasicBlock *BB) { - SkippedLoopAndExitBlocks.insert(BB); - return true; - }); + // At this point, we're definitely going to unswitch something so invalidate + // any cached information in ScalarEvolution for the outer most loop + // containing an exit block and all nested loops. + if (SE) { + if (OuterExitL) + SE->forgetLoop(OuterExitL); + else + SE->forgetTopmostLoop(&L); } - // Similarly, if the edge we *are* cloning in the unswitch (the unswitched - // edge) dominates its target, we will end up with dead nodes in the original - // loop and its exits that will need to be deleted. Here, we just retain that - // the property holds and will compute the deleted set later. - bool DeleteUnswitchedSucc = - UnswitchedSuccBB->getUniquePredecessor() || - llvm::all_of(predecessors(UnswitchedSuccBB), [&](BasicBlock *PredBB) { - return PredBB == ParentBB || DT.dominates(UnswitchedSuccBB, PredBB); + + // If the edge from this terminator to a successor dominates that successor, + // store a map from each block in its dominator subtree to it. This lets us + // tell when cloning for a particular successor if a block is dominated by + // some *other* successor with a single data structure. We use this to + // significantly reduce cloning. + SmallDenseMap<BasicBlock *, BasicBlock *, 16> DominatingSucc; + for (auto *SuccBB : llvm::concat<BasicBlock *const>( + makeArrayRef(RetainedSuccBB), UnswitchedSuccBBs)) + if (SuccBB->getUniquePredecessor() || + llvm::all_of(predecessors(SuccBB), [&](BasicBlock *PredBB) { + return PredBB == ParentBB || DT.dominates(SuccBB, PredBB); + })) + visitDomSubTree(DT, SuccBB, [&](BasicBlock *BB) { + DominatingSucc[BB] = SuccBB; + return true; }); // Split the preheader, so that we know that there is a safe place to insert @@ -1742,52 +1903,162 @@ static bool unswitchInvariantBranch( BasicBlock *SplitBB = L.getLoopPreheader(); BasicBlock *LoopPH = SplitEdge(SplitBB, L.getHeader(), &DT, &LI); - // Keep a mapping for the cloned values. - ValueToValueMapTy VMap; + // Keep track of the dominator tree updates needed. + SmallVector<DominatorTree::UpdateType, 4> DTUpdates; + + // Clone the loop for each unswitched successor. + SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps; + VMaps.reserve(UnswitchedSuccBBs.size()); + SmallDenseMap<BasicBlock *, BasicBlock *, 4> ClonedPHs; + for (auto *SuccBB : UnswitchedSuccBBs) { + VMaps.emplace_back(new ValueToValueMapTy()); + ClonedPHs[SuccBB] = buildClonedLoopBlocks( + L, LoopPH, SplitBB, ExitBlocks, ParentBB, SuccBB, RetainedSuccBB, + DominatingSucc, *VMaps.back(), DTUpdates, AC, DT, LI); + } + + // The stitching of the branched code back together depends on whether we're + // doing full unswitching or not with the exception that we always want to + // nuke the initial terminator placed in the split block. + SplitBB->getTerminator()->eraseFromParent(); + if (FullUnswitch) { + // First we need to unhook the successor relationship as we'll be replacing + // the terminator with a direct branch. This is much simpler for branches + // than switches so we handle those first. + if (BI) { + // Remove the parent as a predecessor of the unswitched successor. + assert(UnswitchedSuccBBs.size() == 1 && + "Only one possible unswitched block for a branch!"); + BasicBlock *UnswitchedSuccBB = *UnswitchedSuccBBs.begin(); + UnswitchedSuccBB->removePredecessor(ParentBB, + /*DontDeleteUselessPHIs*/ true); + DTUpdates.push_back({DominatorTree::Delete, ParentBB, UnswitchedSuccBB}); + } else { + // Note that we actually want to remove the parent block as a predecessor + // of *every* case successor. The case successor is either unswitched, + // completely eliminating an edge from the parent to that successor, or it + // is a duplicate edge to the retained successor as the retained successor + // is always the default successor and as we'll replace this with a direct + // branch we no longer need the duplicate entries in the PHI nodes. + assert(SI->getDefaultDest() == RetainedSuccBB && + "Not retaining default successor!"); + for (auto &Case : SI->cases()) + Case.getCaseSuccessor()->removePredecessor( + ParentBB, + /*DontDeleteUselessPHIs*/ true); + + // We need to use the set to populate domtree updates as even when there + // are multiple cases pointing at the same successor we only want to + // remove and insert one edge in the domtree. + for (BasicBlock *SuccBB : UnswitchedSuccBBs) + DTUpdates.push_back({DominatorTree::Delete, ParentBB, SuccBB}); + } + + // Now that we've unhooked the successor relationship, splice the terminator + // from the original loop to the split. + SplitBB->getInstList().splice(SplitBB->end(), ParentBB->getInstList(), TI); + + // Now wire up the terminator to the preheaders. + if (BI) { + BasicBlock *ClonedPH = ClonedPHs.begin()->second; + BI->setSuccessor(ClonedSucc, ClonedPH); + BI->setSuccessor(1 - ClonedSucc, LoopPH); + DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH}); + } else { + assert(SI && "Must either be a branch or switch!"); + + // Walk the cases and directly update their successors. + SI->setDefaultDest(LoopPH); + for (auto &Case : SI->cases()) + if (Case.getCaseSuccessor() == RetainedSuccBB) + Case.setSuccessor(LoopPH); + else + Case.setSuccessor(ClonedPHs.find(Case.getCaseSuccessor())->second); + + // We need to use the set to populate domtree updates as even when there + // are multiple cases pointing at the same successor we only want to + // remove and insert one edge in the domtree. + for (BasicBlock *SuccBB : UnswitchedSuccBBs) + DTUpdates.push_back( + {DominatorTree::Insert, SplitBB, ClonedPHs.find(SuccBB)->second}); + } + + // Create a new unconditional branch to the continuing block (as opposed to + // the one cloned). + BranchInst::Create(RetainedSuccBB, ParentBB); + } else { + assert(BI && "Only branches have partial unswitching."); + assert(UnswitchedSuccBBs.size() == 1 && + "Only one possible unswitched block for a branch!"); + BasicBlock *ClonedPH = ClonedPHs.begin()->second; + // When doing a partial unswitch, we have to do a bit more work to build up + // the branch in the split block. + buildPartialUnswitchConditionalBranch(*SplitBB, Invariants, Direction, + *ClonedPH, *LoopPH); + DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH}); + } - // Build the cloned blocks from the loop. - auto *ClonedPH = buildClonedLoopBlocks( - L, LoopPH, SplitBB, ExitBlocks, ParentBB, UnswitchedSuccBB, - ContinueSuccBB, SkippedLoopAndExitBlocks, VMap, AC, DT, LI); + // Apply the updates accumulated above to get an up-to-date dominator tree. + DT.applyUpdates(DTUpdates); + + // Now that we have an accurate dominator tree, first delete the dead cloned + // blocks so that we can accurately build any cloned loops. It is important to + // not delete the blocks from the original loop yet because we still want to + // reference the original loop to understand the cloned loop's structure. + deleteDeadClonedBlocks(L, ExitBlocks, VMaps, DT); // Build the cloned loop structure itself. This may be substantially // different from the original structure due to the simplified CFG. This also // handles inserting all the cloned blocks into the correct loops. SmallVector<Loop *, 4> NonChildClonedLoops; - Loop *ClonedL = - buildClonedLoops(L, ExitBlocks, VMap, LI, NonChildClonedLoops); - - // Remove the parent as a predecessor of the unswitched successor. - UnswitchedSuccBB->removePredecessor(ParentBB, /*DontDeleteUselessPHIs*/ true); - - // Now splice the branch from the original loop and use it to select between - // the two loops. - SplitBB->getTerminator()->eraseFromParent(); - SplitBB->getInstList().splice(SplitBB->end(), ParentBB->getInstList(), BI); - BI.setSuccessor(ClonedSucc, ClonedPH); - BI.setSuccessor(1 - ClonedSucc, LoopPH); - - // Create a new unconditional branch to the continuing block (as opposed to - // the one cloned). - BranchInst::Create(ContinueSuccBB, ParentBB); - - // Delete anything that was made dead in the original loop due to - // unswitching. - if (DeleteUnswitchedSucc) - deleteDeadBlocksFromLoop(L, UnswitchedSuccBB, ExitBlocks, DT, LI); + for (std::unique_ptr<ValueToValueMapTy> &VMap : VMaps) + buildClonedLoops(L, ExitBlocks, *VMap, LI, NonChildClonedLoops); + // Now that our cloned loops have been built, we can update the original loop. + // First we delete the dead blocks from it and then we rebuild the loop + // structure taking these deletions into account. + deleteDeadBlocksFromLoop(L, ExitBlocks, DT, LI); SmallVector<Loop *, 4> HoistedLoops; bool IsStillLoop = rebuildLoopAfterUnswitch(L, ExitBlocks, LI, HoistedLoops); - // This will have completely invalidated the dominator tree. We can't easily - // bound how much is invalid because in some cases we will refine the - // predecessor set of exit blocks of the loop which can move large unrelated - // regions of code into a new subtree. - // - // FIXME: Eventually, we should use an incremental update utility that - // leverages the existing information in the dominator tree (and potentially - // the nature of the change) to more efficiently update things. - DT.recalculate(*SplitBB->getParent()); + // This transformation has a high risk of corrupting the dominator tree, and + // the below steps to rebuild loop structures will result in hard to debug + // errors in that case so verify that the dominator tree is sane first. + // FIXME: Remove this when the bugs stop showing up and rely on existing + // verification steps. + assert(DT.verify(DominatorTree::VerificationLevel::Fast)); + + if (BI) { + // If we unswitched a branch which collapses the condition to a known + // constant we want to replace all the uses of the invariants within both + // the original and cloned blocks. We do this here so that we can use the + // now updated dominator tree to identify which side the users are on. + assert(UnswitchedSuccBBs.size() == 1 && + "Only one possible unswitched block for a branch!"); + BasicBlock *ClonedPH = ClonedPHs.begin()->second; + ConstantInt *UnswitchedReplacement = + Direction ? ConstantInt::getTrue(BI->getContext()) + : ConstantInt::getFalse(BI->getContext()); + ConstantInt *ContinueReplacement = + Direction ? ConstantInt::getFalse(BI->getContext()) + : ConstantInt::getTrue(BI->getContext()); + for (Value *Invariant : Invariants) + for (auto UI = Invariant->use_begin(), UE = Invariant->use_end(); + UI != UE;) { + // Grab the use and walk past it so we can clobber it in the use list. + Use *U = &*UI++; + Instruction *UserI = dyn_cast<Instruction>(U->getUser()); + if (!UserI) + continue; + + // Replace it with the 'continue' side if in the main loop body, and the + // unswitched if in the cloned blocks. + if (DT.dominates(LoopPH, UserI->getParent())) + U->set(ContinueReplacement); + else if (DT.dominates(ClonedPH, UserI->getParent())) + U->set(UnswitchedReplacement); + } + } // We can change which blocks are exit blocks of all the cloned sibling // loops, the current loop, and any parent loops which shared exit blocks @@ -1801,57 +2072,50 @@ static bool unswitchInvariantBranch( // also need to cover any intervening loops. We add all of these loops to // a list and sort them by loop depth to achieve this without updating // unnecessary loops. - auto UpdateLCSSA = [&](Loop &UpdateL) { + auto UpdateLoop = [&](Loop &UpdateL) { #ifndef NDEBUG - for (Loop *ChildL : UpdateL) + UpdateL.verifyLoop(); + for (Loop *ChildL : UpdateL) { + ChildL->verifyLoop(); assert(ChildL->isRecursivelyLCSSAForm(DT, LI) && "Perturbed a child loop's LCSSA form!"); + } #endif + // First build LCSSA for this loop so that we can preserve it when + // forming dedicated exits. We don't want to perturb some other loop's + // LCSSA while doing that CFG edit. formLCSSA(UpdateL, DT, &LI, nullptr); + + // For loops reached by this loop's original exit blocks we may + // introduced new, non-dedicated exits. At least try to re-form dedicated + // exits for these loops. This may fail if they couldn't have dedicated + // exits to start with. + formDedicatedExitBlocks(&UpdateL, &DT, &LI, /*PreserveLCSSA*/ true); }; // For non-child cloned loops and hoisted loops, we just need to update LCSSA // and we can do it in any order as they don't nest relative to each other. - for (Loop *UpdatedL : llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops)) - UpdateLCSSA(*UpdatedL); + // + // Also check if any of the loops we have updated have become top-level loops + // as that will necessitate widening the outer loop scope. + for (Loop *UpdatedL : + llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops)) { + UpdateLoop(*UpdatedL); + if (!UpdatedL->getParentLoop()) + OuterExitL = nullptr; + } + if (IsStillLoop) { + UpdateLoop(L); + if (!L.getParentLoop()) + OuterExitL = nullptr; + } // If the original loop had exit blocks, walk up through the outer most loop // of those exit blocks to update LCSSA and form updated dedicated exits. - if (OuterExitL != &L) { - SmallVector<Loop *, 4> OuterLoops; - // We start with the cloned loop and the current loop if they are loops and - // move toward OuterExitL. Also, if either the cloned loop or the current - // loop have become top level loops we need to walk all the way out. - if (ClonedL) { - OuterLoops.push_back(ClonedL); - if (!ClonedL->getParentLoop()) - OuterExitL = nullptr; - } - if (IsStillLoop) { - OuterLoops.push_back(&L); - if (!L.getParentLoop()) - OuterExitL = nullptr; - } - // Grab all of the enclosing loops now. + if (OuterExitL != &L) for (Loop *OuterL = ParentL; OuterL != OuterExitL; OuterL = OuterL->getParentLoop()) - OuterLoops.push_back(OuterL); - - // Finally, update our list of outer loops. This is nicely ordered to work - // inside-out. - for (Loop *OuterL : OuterLoops) { - // First build LCSSA for this loop so that we can preserve it when - // forming dedicated exits. We don't want to perturb some other loop's - // LCSSA while doing that CFG edit. - UpdateLCSSA(*OuterL); - - // For loops reached by this loop's original exit blocks we may - // introduced new, non-dedicated exits. At least try to re-form dedicated - // exits for these loops. This may fail if they couldn't have dedicated - // exits to start with. - formDedicatedExitBlocks(OuterL, &DT, &LI, /*PreserveLCSSA*/ true); - } - } + UpdateLoop(*OuterL); #ifndef NDEBUG // Verify the entire loop structure to catch any incorrect updates before we @@ -1866,7 +2130,7 @@ static bool unswitchInvariantBranch( for (Loop *UpdatedL : llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops)) if (UpdatedL->getParentLoop() == ParentL) SibLoops.push_back(UpdatedL); - NonTrivialUnswitchCB(IsStillLoop, SibLoops); + UnswitchCB(IsStillLoop, SibLoops); ++NumBranches; return true; @@ -1905,50 +2169,69 @@ computeDomSubtreeCost(DomTreeNode &N, return Cost; } -/// Unswitch control flow predicated on loop invariant conditions. -/// -/// This first hoists all branches or switches which are trivial (IE, do not -/// require duplicating any part of the loop) out of the loop body. It then -/// looks at other loop invariant control flows and tries to unswitch those as -/// well by cloning the loop if the result is small enough. static bool -unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC, - TargetTransformInfo &TTI, bool NonTrivial, - function_ref<void(bool, ArrayRef<Loop *>)> NonTrivialUnswitchCB) { - assert(L.isRecursivelyLCSSAForm(DT, LI) && - "Loops must be in LCSSA form before unswitching."); - bool Changed = false; +unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI, + AssumptionCache &AC, TargetTransformInfo &TTI, + function_ref<void(bool, ArrayRef<Loop *>)> UnswitchCB, + ScalarEvolution *SE) { + // Collect all invariant conditions within this loop (as opposed to an inner + // loop which would be handled when visiting that inner loop). + SmallVector<std::pair<TerminatorInst *, TinyPtrVector<Value *>>, 4> + UnswitchCandidates; + for (auto *BB : L.blocks()) { + if (LI.getLoopFor(BB) != &L) + continue; - // Must be in loop simplified form: we need a preheader and dedicated exits. - if (!L.isLoopSimplifyForm()) - return false; + if (auto *SI = dyn_cast<SwitchInst>(BB->getTerminator())) { + // We can only consider fully loop-invariant switch conditions as we need + // to completely eliminate the switch after unswitching. + if (!isa<Constant>(SI->getCondition()) && + L.isLoopInvariant(SI->getCondition())) + UnswitchCandidates.push_back({SI, {SI->getCondition()}}); + continue; + } - // Try trivial unswitch first before loop over other basic blocks in the loop. - Changed |= unswitchAllTrivialConditions(L, DT, LI); + auto *BI = dyn_cast<BranchInst>(BB->getTerminator()); + if (!BI || !BI->isConditional() || isa<Constant>(BI->getCondition()) || + BI->getSuccessor(0) == BI->getSuccessor(1)) + continue; - // If we're not doing non-trivial unswitching, we're done. We both accept - // a parameter but also check a local flag that can be used for testing - // a debugging. - if (!NonTrivial && !EnableNonTrivialUnswitch) - return Changed; - - // Collect all remaining invariant branch conditions within this loop (as - // opposed to an inner loop which would be handled when visiting that inner - // loop). - SmallVector<TerminatorInst *, 4> UnswitchCandidates; - for (auto *BB : L.blocks()) - if (LI.getLoopFor(BB) == &L) - if (auto *BI = dyn_cast<BranchInst>(BB->getTerminator())) - if (BI->isConditional() && L.isLoopInvariant(BI->getCondition()) && - BI->getSuccessor(0) != BI->getSuccessor(1)) - UnswitchCandidates.push_back(BI); + if (L.isLoopInvariant(BI->getCondition())) { + UnswitchCandidates.push_back({BI, {BI->getCondition()}}); + continue; + } + + Instruction &CondI = *cast<Instruction>(BI->getCondition()); + if (CondI.getOpcode() != Instruction::And && + CondI.getOpcode() != Instruction::Or) + continue; + + TinyPtrVector<Value *> Invariants = + collectHomogenousInstGraphLoopInvariants(L, CondI, LI); + if (Invariants.empty()) + continue; + + UnswitchCandidates.push_back({BI, std::move(Invariants)}); + } // If we didn't find any candidates, we're done. if (UnswitchCandidates.empty()) - return Changed; + return false; + + // Check if there are irreducible CFG cycles in this loop. If so, we cannot + // easily unswitch non-trivial edges out of the loop. Doing so might turn the + // irreducible control flow into reducible control flow and introduce new + // loops "out of thin air". If we ever discover important use cases for doing + // this, we can add support to loop unswitch, but it is a lot of complexity + // for what seems little or no real world benefit. + LoopBlocksRPO RPOT(&L); + RPOT.perform(&LI); + if (containsIrreducibleCFG<const BasicBlock *>(RPOT, LI)) + return false; - DEBUG(dbgs() << "Considering " << UnswitchCandidates.size() - << " non-trivial loop invariant conditions for unswitching.\n"); + LLVM_DEBUG( + dbgs() << "Considering " << UnswitchCandidates.size() + << " non-trivial loop invariant conditions for unswitching.\n"); // Given that unswitching these terminators will require duplicating parts of // the loop, so we need to be able to model that cost. Compute the ephemeral @@ -1972,10 +2255,10 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC, continue; if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB)) - return Changed; + return false; if (auto CS = CallSite(&I)) if (CS.isConvergent() || CS.cannotDuplicate()) - return Changed; + return false; Cost += TTI.getUserCost(&I); } @@ -1984,7 +2267,7 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC, assert(LoopCost >= 0 && "Must not have negative loop costs!"); BBCostMap[BB] = Cost; } - DEBUG(dbgs() << " Total loop cost: " << LoopCost << "\n"); + LLVM_DEBUG(dbgs() << " Total loop cost: " << LoopCost << "\n"); // Now we find the best candidate by searching for the one with the following // properties in order: @@ -2003,8 +2286,8 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC, SmallDenseMap<DomTreeNode *, int, 4> DTCostMap; // Given a terminator which might be unswitched, computes the non-duplicated // cost for that terminator. - auto ComputeUnswitchedCost = [&](TerminatorInst *TI) { - BasicBlock &BB = *TI->getParent(); + auto ComputeUnswitchedCost = [&](TerminatorInst &TI, bool FullUnswitch) { + BasicBlock &BB = *TI.getParent(); SmallPtrSet<BasicBlock *, 4> Visited; int Cost = LoopCost; @@ -2013,6 +2296,26 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC, if (!Visited.insert(SuccBB).second) continue; + // If this is a partial unswitch candidate, then it must be a conditional + // branch with a condition of either `or` or `and`. In that case, one of + // the successors is necessarily duplicated, so don't even try to remove + // its cost. + if (!FullUnswitch) { + auto &BI = cast<BranchInst>(TI); + if (cast<Instruction>(BI.getCondition())->getOpcode() == + Instruction::And) { + if (SuccBB == BI.getSuccessor(1)) + continue; + } else { + assert(cast<Instruction>(BI.getCondition())->getOpcode() == + Instruction::Or && + "Only `and` and `or` conditions can result in a partial " + "unswitch!"); + if (SuccBB == BI.getSuccessor(0)) + continue; + } + } + // This successor's domtree will not need to be duplicated after // unswitching if the edge to the successor dominates it (and thus the // entire tree). This essentially means there is no other path into this @@ -2036,27 +2339,95 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC, }; TerminatorInst *BestUnswitchTI = nullptr; int BestUnswitchCost; - for (TerminatorInst *CandidateTI : UnswitchCandidates) { - int CandidateCost = ComputeUnswitchedCost(CandidateTI); - DEBUG(dbgs() << " Computed cost of " << CandidateCost - << " for unswitch candidate: " << *CandidateTI << "\n"); + ArrayRef<Value *> BestUnswitchInvariants; + for (auto &TerminatorAndInvariants : UnswitchCandidates) { + TerminatorInst &TI = *TerminatorAndInvariants.first; + ArrayRef<Value *> Invariants = TerminatorAndInvariants.second; + BranchInst *BI = dyn_cast<BranchInst>(&TI); + int CandidateCost = ComputeUnswitchedCost( + TI, /*FullUnswitch*/ !BI || (Invariants.size() == 1 && + Invariants[0] == BI->getCondition())); + LLVM_DEBUG(dbgs() << " Computed cost of " << CandidateCost + << " for unswitch candidate: " << TI << "\n"); if (!BestUnswitchTI || CandidateCost < BestUnswitchCost) { - BestUnswitchTI = CandidateTI; + BestUnswitchTI = &TI; BestUnswitchCost = CandidateCost; + BestUnswitchInvariants = Invariants; } } - if (BestUnswitchCost < UnswitchThreshold) { - DEBUG(dbgs() << " Trying to unswitch non-trivial (cost = " - << BestUnswitchCost << ") branch: " << *BestUnswitchTI - << "\n"); - Changed |= unswitchInvariantBranch(L, cast<BranchInst>(*BestUnswitchTI), DT, - LI, AC, NonTrivialUnswitchCB); - } else { - DEBUG(dbgs() << "Cannot unswitch, lowest cost found: " << BestUnswitchCost - << "\n"); + if (BestUnswitchCost >= UnswitchThreshold) { + LLVM_DEBUG(dbgs() << "Cannot unswitch, lowest cost found: " + << BestUnswitchCost << "\n"); + return false; + } + + LLVM_DEBUG(dbgs() << " Trying to unswitch non-trivial (cost = " + << BestUnswitchCost << ") terminator: " << *BestUnswitchTI + << "\n"); + return unswitchNontrivialInvariants( + L, *BestUnswitchTI, BestUnswitchInvariants, DT, LI, AC, UnswitchCB, SE); +} + +/// Unswitch control flow predicated on loop invariant conditions. +/// +/// This first hoists all branches or switches which are trivial (IE, do not +/// require duplicating any part of the loop) out of the loop body. It then +/// looks at other loop invariant control flows and tries to unswitch those as +/// well by cloning the loop if the result is small enough. +/// +/// The `DT`, `LI`, `AC`, `TTI` parameters are required analyses that are also +/// updated based on the unswitch. +/// +/// If either `NonTrivial` is true or the flag `EnableNonTrivialUnswitch` is +/// true, we will attempt to do non-trivial unswitching as well as trivial +/// unswitching. +/// +/// The `UnswitchCB` callback provided will be run after unswitching is +/// complete, with the first parameter set to `true` if the provided loop +/// remains a loop, and a list of new sibling loops created. +/// +/// If `SE` is non-null, we will update that analysis based on the unswitching +/// done. +static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, + AssumptionCache &AC, TargetTransformInfo &TTI, + bool NonTrivial, + function_ref<void(bool, ArrayRef<Loop *>)> UnswitchCB, + ScalarEvolution *SE) { + assert(L.isRecursivelyLCSSAForm(DT, LI) && + "Loops must be in LCSSA form before unswitching."); + bool Changed = false; + + // Must be in loop simplified form: we need a preheader and dedicated exits. + if (!L.isLoopSimplifyForm()) + return false; + + // Try trivial unswitch first before loop over other basic blocks in the loop. + if (unswitchAllTrivialConditions(L, DT, LI, SE)) { + // If we unswitched successfully we will want to clean up the loop before + // processing it further so just mark it as unswitched and return. + UnswitchCB(/*CurrentLoopValid*/ true, {}); + return true; } + // If we're not doing non-trivial unswitching, we're done. We both accept + // a parameter but also check a local flag that can be used for testing + // a debugging. + if (!NonTrivial && !EnableNonTrivialUnswitch) + return false; + + // For non-trivial unswitching, because it often creates new loops, we rely on + // the pass manager to iterate on the loops rather than trying to immediately + // reach a fixed point. There is no substantial advantage to iterating + // internally, and if any of the new loops are simplified enough to contain + // trivial unswitching we want to prefer those. + + // Try to unswitch the best invariant condition. We prefer this full unswitch to + // a partial unswitch when possible below the threshold. + if (unswitchBestCondition(L, DT, LI, AC, TTI, UnswitchCB, SE)) + return true; + + // No other opportunities to unswitch. return Changed; } @@ -2066,16 +2437,18 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM, Function &F = *L.getHeader()->getParent(); (void)F; - DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << L << "\n"); + LLVM_DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << L + << "\n"); // Save the current loop name in a variable so that we can report it even // after it has been deleted. std::string LoopName = L.getName(); - auto NonTrivialUnswitchCB = [&L, &U, &LoopName](bool CurrentLoopValid, - ArrayRef<Loop *> NewLoops) { + auto UnswitchCB = [&L, &U, &LoopName](bool CurrentLoopValid, + ArrayRef<Loop *> NewLoops) { // If we did a non-trivial unswitch, we have added new (cloned) loops. - U.addSiblingLoops(NewLoops); + if (!NewLoops.empty()) + U.addSiblingLoops(NewLoops); // If the current loop remains valid, we should revisit it to catch any // other unswitch opportunities. Otherwise, we need to mark it as deleted. @@ -2085,15 +2458,13 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM, U.markLoopAsDeleted(L, LoopName); }; - if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC, AR.TTI, NonTrivial, - NonTrivialUnswitchCB)) + if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC, AR.TTI, NonTrivial, UnswitchCB, + &AR.SE)) return PreservedAnalyses::all(); -#ifndef NDEBUG // Historically this pass has had issues with the dominator tree so verify it // in asserts builds. - AR.DT.verifyDomTree(); -#endif + assert(AR.DT.verify(DominatorTree::VerificationLevel::Fast)); return getLoopPassPreservedAnalyses(); } @@ -2128,15 +2499,19 @@ bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) { Function &F = *L->getHeader()->getParent(); - DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << *L << "\n"); + LLVM_DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << *L + << "\n"); auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); - auto NonTrivialUnswitchCB = [&L, &LPM](bool CurrentLoopValid, - ArrayRef<Loop *> NewLoops) { + auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); + auto *SE = SEWP ? &SEWP->getSE() : nullptr; + + auto UnswitchCB = [&L, &LPM](bool CurrentLoopValid, + ArrayRef<Loop *> NewLoops) { // If we did a non-trivial unswitch, we have added new (cloned) loops. for (auto *NewL : NewLoops) LPM.addLoop(*NewL); @@ -2150,18 +2525,16 @@ bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) { LPM.markLoopAsDeleted(*L); }; - bool Changed = - unswitchLoop(*L, DT, LI, AC, TTI, NonTrivial, NonTrivialUnswitchCB); + bool Changed = unswitchLoop(*L, DT, LI, AC, TTI, NonTrivial, UnswitchCB, SE); // If anything was unswitched, also clear any cached information about this // loop. LPM.deleteSimpleAnalysisLoop(L); -#ifndef NDEBUG // Historically this pass has had issues with the dominator tree so verify it // in asserts builds. - DT.verifyDomTree(); -#endif + assert(DT.verify(DominatorTree::VerificationLevel::Fast)); + return Changed; } |