| author | Dimitry Andric <dim@FreeBSD.org> | 2017-12-18 20:10:56 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2017-12-18 20:10:56 +0000 |
| commit | 044eb2f6afba375a914ac9d8024f8f5142bb912e | |
| tree | 1475247dc9f9fe5be155ebd4c9069c75aadf8c20 /lib/Transforms/Scalar | |
| parent | eb70dddbd77e120e5d490bd8fbe7ff3f8fa81c6b | |
Diffstat (limited to 'lib/Transforms/Scalar')
54 files changed, 9667 insertions, 3020 deletions
diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp index 5b467dc9fe12..1e683db50206 100644 --- a/lib/Transforms/Scalar/ADCE.cpp +++ b/lib/Transforms/Scalar/ADCE.cpp @@ -15,8 +15,10 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/ADCE.h" - +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/GraphTraits.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" @@ -27,13 +29,29 @@ #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/Value.h" #include "llvm/Pass.h" #include "llvm/ProfileData/InstrProf.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include <cassert> +#include <cstddef> +#include <utility> + using namespace llvm; #define DEBUG_TYPE "adce" @@ -52,10 +70,12 @@ static cl::opt<bool> RemoveLoops("adce-remove-loops", cl::init(false), cl::Hidden); namespace { + /// Information about Instructions struct InstInfoType { /// True if the associated instruction is live. bool Live = false; + /// Quick access to information for block containing associated Instruction. struct BlockInfoType *Block = nullptr; }; @@ -64,10 +84,13 @@ struct InstInfoType { struct BlockInfoType { /// True when this block contains a live instructions. bool Live = false; + /// True when this block ends in an unconditional branch. bool UnconditionalBranch = false; + /// True when this block is known to have live PHI nodes. bool HasLivePhiNodes = false; + /// Control dependence sources need to be live for this block. bool CFLive = false; @@ -75,8 +98,6 @@ struct BlockInfoType { /// holds the value &InstInfo[Terminator] InstInfoType *TerminatorLiveInfo = nullptr; - bool terminatorIsLive() const { return TerminatorLiveInfo->Live; } - /// Corresponding BasicBlock. BasicBlock *BB = nullptr; @@ -85,14 +106,21 @@ struct BlockInfoType { /// Post-order numbering of reverse control flow graph. unsigned PostOrder; + + bool terminatorIsLive() const { return TerminatorLiveInfo->Live; } }; class AggressiveDeadCodeElimination { Function &F; + + // ADCE does not use DominatorTree per se, but it updates it to preserve the + // analysis. + DominatorTree &DT; PostDominatorTree &PDT; /// Mapping of blocks to associated information, an element in BlockInfoVec. - DenseMap<BasicBlock *, BlockInfoType> BlockInfo; + /// Use MapVector to get deterministic iteration order. + MapVector<BasicBlock *, BlockInfoType> BlockInfo; bool isLive(BasicBlock *BB) { return BlockInfo[BB].Live; } /// Mapping of instructions to associated information. @@ -102,6 +130,7 @@ class AggressiveDeadCodeElimination { /// Instructions known to be live where we need to mark /// reaching definitions as live. SmallVector<Instruction *, 128> Worklist; + /// Debug info scopes around a live instruction. 
SmallPtrSet<const Metadata *, 32> AliveScopes; @@ -116,15 +145,19 @@ class AggressiveDeadCodeElimination { /// Set up auxiliary data structures for Instructions and BasicBlocks and /// initialize the Worklist to the set of must-be-live Instructions. void initialize(); + /// Return true for operations which are always treated as live. bool isAlwaysLive(Instruction &I); + /// Return true for instrumentation instructions for value profiling. bool isInstrumentsConstant(Instruction &I); /// Propagate liveness to reaching definitions. void markLiveInstructions(); + /// Mark an instruction as live. void markLive(Instruction *I); + /// Mark a block as live. void markLive(BlockInfoType &BB); void markLive(BasicBlock *BB) { markLive(BlockInfo[BB]); } @@ -157,11 +190,14 @@ class AggressiveDeadCodeElimination { void makeUnconditional(BasicBlock *BB, BasicBlock *Target); public: - AggressiveDeadCodeElimination(Function &F, PostDominatorTree &PDT) - : F(F), PDT(PDT) {} + AggressiveDeadCodeElimination(Function &F, DominatorTree &DT, + PostDominatorTree &PDT) + : F(F), DT(DT), PDT(PDT) {} + bool performDeadCodeElimination(); }; -} + +} // end anonymous namespace bool AggressiveDeadCodeElimination::performDeadCodeElimination() { initialize(); @@ -175,7 +211,6 @@ static bool isUnconditionalBranch(TerminatorInst *Term) { } void AggressiveDeadCodeElimination::initialize() { - auto NumBlocks = F.size(); // We will have an entry in the map for each block so we grow the @@ -217,7 +252,8 @@ void AggressiveDeadCodeElimination::initialize() { // to recording which nodes have been visited we also record whether // a node is currently on the "stack" of active ancestors of the current // node. - typedef DenseMap<BasicBlock *, bool> StatusMap ; + using StatusMap = DenseMap<BasicBlock *, bool>; + class DFState : public StatusMap { public: std::pair<StatusMap::iterator, bool> insert(BasicBlock *BB) { @@ -253,27 +289,23 @@ void AggressiveDeadCodeElimination::initialize() { } } - // Mark blocks live if there is no path from the block to the - // return of the function or a successor for which this is true. - // This protects IDFCalculator which cannot handle such blocks. - for (auto &BBInfoPair : BlockInfo) { - auto &BBInfo = BBInfoPair.second; - if (BBInfo.terminatorIsLive()) - continue; - auto *BB = BBInfo.BB; - if (!PDT.getNode(BB)) { - DEBUG(dbgs() << "Not post-dominated by return: " << BB->getName() + // Mark blocks live if there is no path from the block to a + // return of the function. + // We do this by seeing which of the postdomtree root children exit the + // program, and for all others, mark the subtree live. + for (auto &PDTChild : children<DomTreeNode *>(PDT.getRootNode())) { + auto *BB = PDTChild->getBlock(); + auto &Info = BlockInfo[BB]; + // Real function return + if (isa<ReturnInst>(Info.Terminator)) { + DEBUG(dbgs() << "post-dom root child is a return: " << BB->getName() << '\n';); - markLive(BBInfo.Terminator); continue; } - for (auto *Succ : successors(BB)) if (!PDT.getNode(Succ)) { - DEBUG(dbgs() << "Successor not post-dominated by return: " - << BB->getName() << '\n';); - markLive(BBInfo.Terminator); - break; - } + + // This child is something else, like an infinite loop.
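The loop that follows marks whole post-dominator subtrees live when a root child does not end in a return. For context, a minimal sketch (hypothetical function, not part of the patch) of the kind of code this style of liveness marking lets ADCE delete:

```cpp
// Hypothetical example: the loop result is never used and has no side
// effects. ADCE seeds its worklist only with must-be-live roots (here, the
// return) and propagates liveness to reaching definitions; the loop body,
// and with -adce-remove-loops even the loop's control flow, is removable.
int f(int n) {
  int dead = 0;
  for (int i = 0; i < n; ++i)
    dead += i;    // never read afterwards
  return n + 1;   // the only live computation
}
```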
+ for (auto DFNode : depth_first(PDTChild)) + markLive(BlockInfo[DFNode->getBlock()].Terminator); } // Treat the entry block as always live @@ -318,7 +350,6 @@ bool AggressiveDeadCodeElimination::isInstrumentsConstant(Instruction &I) { } void AggressiveDeadCodeElimination::markLiveInstructions() { - // Propagate liveness backwards to operands. do { // Worklist holds newly discovered live instructions @@ -343,7 +374,6 @@ void AggressiveDeadCodeElimination::markLiveInstructions() { } void AggressiveDeadCodeElimination::markLive(Instruction *I) { - auto &Info = InstInfo[I]; if (Info.Live) return; @@ -430,7 +460,6 @@ void AggressiveDeadCodeElimination::markPhiLive(PHINode *PN) { } void AggressiveDeadCodeElimination::markLiveBranchesFromControlDependences() { - if (BlocksWithDeadTerminators.empty()) return; @@ -469,7 +498,6 @@ void AggressiveDeadCodeElimination::markLiveBranchesFromControlDependences() { // //===----------------------------------------------------------------------===// bool AggressiveDeadCodeElimination::removeDeadInstructions() { - // Updates control and dataflow around dead blocks updateDeadRegions(); @@ -527,7 +555,6 @@ bool AggressiveDeadCodeElimination::removeDeadInstructions() { // A dead region is the set of dead blocks with a common live post-dominator. void AggressiveDeadCodeElimination::updateDeadRegions() { - DEBUG({ dbgs() << "final dead terminator blocks: " << '\n'; for (auto *BB : BlocksWithDeadTerminators) @@ -561,21 +588,40 @@ void AggressiveDeadCodeElimination::updateDeadRegions() { } assert((PreferredSucc && PreferredSucc->PostOrder > 0) && "Failed to find safe successor for dead branch"); + + // Collect removed successors to update the (Post)DominatorTrees. + SmallPtrSet<BasicBlock *, 4> RemovedSuccessors; bool First = true; for (auto *Succ : successors(BB)) { - if (!First || Succ != PreferredSucc->BB) + if (!First || Succ != PreferredSucc->BB) { Succ->removePredecessor(BB); - else + RemovedSuccessors.insert(Succ); + } else First = false; } makeUnconditional(BB, PreferredSucc->BB); + + // Inform the dominators about the deleted CFG edges. + SmallVector<DominatorTree::UpdateType, 4> DeletedEdges; + for (auto *Succ : RemovedSuccessors) { + // It might have happened that the same successor appeared multiple times + // and the CFG edge wasn't really removed. 
+ if (Succ != PreferredSucc->BB) { + DEBUG(dbgs() << "ADCE: (Post)DomTree edge enqueued for deletion" + << BB->getName() << " -> " << Succ->getName() << "\n"); + DeletedEdges.push_back({DominatorTree::Delete, BB, Succ}); + } + } + + DT.applyUpdates(DeletedEdges); + PDT.applyUpdates(DeletedEdges); + NumBranchesRemoved += 1; } } // reverse top-sort order void AggressiveDeadCodeElimination::computeReversePostOrder() { - // This provides a post-order numbering of the reverse control flow graph // Note that it is incomplete in the presence of infinite loops but we don't // need numbers blocks which don't reach the end of the functions since @@ -613,6 +659,9 @@ void AggressiveDeadCodeElimination::makeUnconditional(BasicBlock *BB, InstInfo[NewTerm].Live = true; if (const DILocation *DL = PredTerm->getDebugLoc()) NewTerm->setDebugLoc(DL); + + InstInfo.erase(PredTerm); + PredTerm->eraseFromParent(); } //===----------------------------------------------------------------------===// @@ -621,19 +670,24 @@ void AggressiveDeadCodeElimination::makeUnconditional(BasicBlock *BB, // //===----------------------------------------------------------------------===// PreservedAnalyses ADCEPass::run(Function &F, FunctionAnalysisManager &FAM) { + auto &DT = FAM.getResult<DominatorTreeAnalysis>(F); auto &PDT = FAM.getResult<PostDominatorTreeAnalysis>(F); - if (!AggressiveDeadCodeElimination(F, PDT).performDeadCodeElimination()) + if (!AggressiveDeadCodeElimination(F, DT, PDT).performDeadCodeElimination()) return PreservedAnalyses::all(); PreservedAnalyses PA; PA.preserveSet<CFGAnalyses>(); PA.preserve<GlobalsAA>(); + PA.preserve<DominatorTreeAnalysis>(); + PA.preserve<PostDominatorTreeAnalysis>(); return PA; } namespace { + struct ADCELegacyPass : public FunctionPass { static char ID; // Pass identification, replacement for typeid + ADCELegacyPass() : FunctionPass(ID) { initializeADCELegacyPassPass(*PassRegistry::getPassRegistry()); } @@ -641,22 +695,34 @@ struct ADCELegacyPass : public FunctionPass { bool runOnFunction(Function &F) override { if (skipFunction(F)) return false; + + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(); - return AggressiveDeadCodeElimination(F, PDT).performDeadCodeElimination(); + return AggressiveDeadCodeElimination(F, DT, PDT) + .performDeadCodeElimination(); } void getAnalysisUsage(AnalysisUsage &AU) const override { + // We require DominatorTree here only to update and thus preserve it. 
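The DeletedEdges bookkeeping above relies on the incremental applyUpdates API rather than recomputing either tree. A minimal sketch of that pattern in isolation (the helper name is ours, not the patch's; it assumes the IR edge From to To has already been disconnected):

```cpp
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/IR/Dominators.h"
using namespace llvm;

// Hypothetical helper: inform both trees that one CFG edge was deleted,
// instead of rebuilding them from scratch.
static void notifyEdgeDeleted(DominatorTree &DT, PostDominatorTree &PDT,
                              BasicBlock *From, BasicBlock *To) {
  SmallVector<DominatorTree::UpdateType, 1> Updates;
  Updates.push_back({DominatorTree::Delete, From, To});
  DT.applyUpdates(Updates);
  PDT.applyUpdates(Updates);
}
```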
+ AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<PostDominatorTreeWrapperPass>(); if (!RemoveControlFlowFlag) AU.setPreservesCFG(); + else { + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<PostDominatorTreeWrapperPass>(); + } AU.addPreserved<GlobalsAAWrapperPass>(); } }; -} + +} // end anonymous namespace char ADCELegacyPass::ID = 0; + INITIALIZE_PASS_BEGIN(ADCELegacyPass, "adce", "Aggressive Dead Code Elimination", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) INITIALIZE_PASS_END(ADCELegacyPass, "adce", "Aggressive Dead Code Elimination", false, false) diff --git a/lib/Transforms/Scalar/BDCE.cpp b/lib/Transforms/Scalar/BDCE.cpp index 2e5618686ec2..851efa000f65 100644 --- a/lib/Transforms/Scalar/BDCE.cpp +++ b/lib/Transforms/Scalar/BDCE.cpp @@ -20,11 +20,8 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/DemandedBits.h" #include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/IR/CFG.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Operator.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -48,8 +45,18 @@ static void clearAssumptionsOfUsers(Instruction *I, DemandedBits &DB) { // If all bits of a user are demanded, then we know that nothing below that // in the def-use chain needs to be changed. auto *J = dyn_cast<Instruction>(JU); - if (J && !DB.getDemandedBits(J).isAllOnesValue()) + if (J && J->getType()->isSized() && + !DB.getDemandedBits(J).isAllOnesValue()) WorkList.push_back(J); + + // Note that we need to check for unsized types above before asking for + // demanded bits. Normally, the only way to reach an instruction with an + // unsized type is via an instruction that has side effects (or otherwise + // will demand its input bits). However, if we have a readnone function + // that returns an unsized type (e.g., void), we must avoid asking for the + // demanded bits of the function call's return value. A void-returning + // readnone function is always dead (and so we can stop walking the use/def + // chain here), but the check is necessary to avoid asserting. } // DFS through subsequent users while tracking visits to avoid cycles. @@ -70,7 +77,8 @@ static void clearAssumptionsOfUsers(Instruction *I, DemandedBits &DB) { // If all bits of a user are demanded, then we know that nothing below // that in the def-use chain needs to be changed. 
auto *K = dyn_cast<Instruction>(KU); - if (K && !Visited.count(K) && !DB.getDemandedBits(K).isAllOnesValue()) + if (K && !Visited.count(K) && K->getType()->isSized() && + !DB.getDemandedBits(K).isAllOnesValue()) WorkList.push_back(K); } } diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index 457c9427ab9a..0562d3882f8b 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -2,11 +2,13 @@ add_llvm_library(LLVMScalarOpts ADCE.cpp AlignmentFromAssumptions.cpp BDCE.cpp + CallSiteSplitting.cpp ConstantHoisting.cpp ConstantProp.cpp CorrelatedValuePropagation.cpp DCE.cpp DeadStoreElimination.cpp + DivRemPairs.cpp EarlyCSE.cpp FlattenCFGPass.cpp Float2Int.cpp @@ -42,6 +44,7 @@ add_llvm_library(LLVMScalarOpts LowerExpectIntrinsic.cpp LowerGuardIntrinsic.cpp MemCpyOptimizer.cpp + MergeICmps.cpp MergedLoadStoreMotion.cpp NaryReassociate.cpp NewGVN.cpp @@ -59,6 +62,7 @@ add_llvm_library(LLVMScalarOpts SimplifyCFGPass.cpp Sink.cpp SpeculativeExecution.cpp + SpeculateAroundPHIs.cpp StraightLineStrengthReduce.cpp StructurizeCFG.cpp TailRecursionElimination.cpp diff --git a/lib/Transforms/Scalar/CallSiteSplitting.cpp b/lib/Transforms/Scalar/CallSiteSplitting.cpp new file mode 100644 index 000000000000..d8c408035038 --- /dev/null +++ b/lib/Transforms/Scalar/CallSiteSplitting.cpp @@ -0,0 +1,428 @@ +//===- CallSiteSplitting.cpp ----------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a transformation that tries to split a call-site to pass +// more constrained arguments if its argument is predicated in the control flow +// so that we can expose better context to the later passes (e.g, inliner, jump +// threading, or IPA-CP based function cloning, etc.). +// As of now we support two cases : +// +// 1) If a call site is dominated by an OR condition and if any of its arguments +// are predicated on this OR condition, try to split the condition with more +// constrained arguments. For example, in the code below, we try to split the +// call site since we can predicate the argument(ptr) based on the OR condition. 
+// +// Split from : +// if (!ptr || c) +// callee(ptr); +// to : +// if (!ptr) +// callee(null) // set the known constant value +// else if (c) +// callee(nonnull ptr) // set non-null attribute in the argument +// +// 2) We can also split a call-site based on constant incoming values of a PHI +// For example, +// from : +// Header: +// %c = icmp eq i32 %i1, %i2 +// br i1 %c, label %Tail, label %TBB +// TBB: +// br label Tail% +// Tail: +// %p = phi i32 [ 0, %Header], [ 1, %TBB] +// call void @bar(i32 %p) +// to +// Header: +// %c = icmp eq i32 %i1, %i2 +// br i1 %c, label %Tail-split0, label %TBB +// TBB: +// br label %Tail-split1 +// Tail-split0: +// call void @bar(i32 0) +// br label %Tail +// Tail-split1: +// call void @bar(i32 1) +// br label %Tail +// Tail: +// %p = phi i32 [ 0, %Tail-split0 ], [ 1, %Tail-split1 ] +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/CallSiteSplitting.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" + +using namespace llvm; +using namespace PatternMatch; + +#define DEBUG_TYPE "callsite-splitting" + +STATISTIC(NumCallSiteSplit, "Number of call-site split"); + +static void addNonNullAttribute(Instruction *CallI, Instruction *NewCallI, + Value *Op) { + CallSite CS(NewCallI); + unsigned ArgNo = 0; + for (auto &I : CS.args()) { + if (&*I == Op) + CS.addParamAttr(ArgNo, Attribute::NonNull); + ++ArgNo; + } +} + +static void setConstantInArgument(Instruction *CallI, Instruction *NewCallI, + Value *Op, Constant *ConstValue) { + CallSite CS(NewCallI); + unsigned ArgNo = 0; + for (auto &I : CS.args()) { + if (&*I == Op) + CS.setArgument(ArgNo, ConstValue); + ++ArgNo; + } +} + +static bool isCondRelevantToAnyCallArgument(ICmpInst *Cmp, CallSite CS) { + assert(isa<Constant>(Cmp->getOperand(1)) && "Expected a constant operand."); + Value *Op0 = Cmp->getOperand(0); + unsigned ArgNo = 0; + for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E; + ++I, ++ArgNo) { + // Don't consider constant or arguments that are already known non-null. + if (isa<Constant>(*I) || CS.paramHasAttr(ArgNo, Attribute::NonNull)) + continue; + + if (*I == Op0) + return true; + } + return false; +} + +/// If From has a conditional jump to To, add the condition to Conditions, +/// if it is relevant to any argument at CS. +static void +recordCondition(const CallSite &CS, BasicBlock *From, BasicBlock *To, + SmallVectorImpl<std::pair<ICmpInst *, unsigned>> &Conditions) { + auto *BI = dyn_cast<BranchInst>(From->getTerminator()); + if (!BI || !BI->isConditional()) + return; + + CmpInst::Predicate Pred; + Value *Cond = BI->getCondition(); + if (!match(Cond, m_ICmp(Pred, m_Value(), m_Constant()))) + return; + + ICmpInst *Cmp = cast<ICmpInst>(Cond); + if (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) + if (isCondRelevantToAnyCallArgument(Cmp, CS)) + Conditions.push_back({Cmp, From->getTerminator()->getSuccessor(0) == To + ? Pred + : Cmp->getInversePredicate()}); +} + +/// Record ICmp conditions relevant to any argument in CS following Pred's +/// single successors. If there are conflicting conditions along a path, like +/// x == 1 and x == 0, the first condition will be used. 
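At the source level, the OR-predicated case described in the file header corresponds roughly to this hand-written before/after (hypothetical functions, rendered in C++ rather than IR):

```cpp
int use(int *p, bool c);

// Before: one call site; the argument is only weakly known.
int caller(int *p, bool c) {
  if (!p || c)
    return use(p, c);
  return 0;
}

// After splitting (conceptually): each path gets its own call site, so later
// passes see a constant argument on one path and a non-null pointer on the
// other.
int callerSplit(int *p, bool c) {
  if (!p)
    return use(nullptr, c); // argument known to be null here
  if (c)
    return use(p, c);       // p known non-null here
  return 0;
}
```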
+static void +recordConditions(const CallSite &CS, BasicBlock *Pred, + SmallVectorImpl<std::pair<ICmpInst *, unsigned>> &Conditions) { + recordCondition(CS, Pred, CS.getInstruction()->getParent(), Conditions); + BasicBlock *From = Pred; + BasicBlock *To = Pred; + SmallPtrSet<BasicBlock *, 4> Visited = {From}; + while (!Visited.count(From->getSinglePredecessor()) && + (From = From->getSinglePredecessor())) { + recordCondition(CS, From, To, Conditions); + To = From; + } +} + +static Instruction * +addConditions(CallSite &CS, + SmallVectorImpl<std::pair<ICmpInst *, unsigned>> &Conditions) { + if (Conditions.empty()) + return nullptr; + + Instruction *NewCI = CS.getInstruction()->clone(); + for (auto &Cond : Conditions) { + Value *Arg = Cond.first->getOperand(0); + Constant *ConstVal = cast<Constant>(Cond.first->getOperand(1)); + if (Cond.second == ICmpInst::ICMP_EQ) + setConstantInArgument(CS.getInstruction(), NewCI, Arg, ConstVal); + else if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue()) { + assert(Cond.second == ICmpInst::ICMP_NE); + addNonNullAttribute(CS.getInstruction(), NewCI, Arg); + } + } + return NewCI; +} + +static SmallVector<BasicBlock *, 2> getTwoPredecessors(BasicBlock *BB) { + SmallVector<BasicBlock *, 2> Preds(predecessors((BB))); + assert(Preds.size() == 2 && "Expected exactly 2 predecessors!"); + return Preds; +} + +static bool canSplitCallSite(CallSite CS) { + // FIXME: As of now we handle only CallInst. InvokeInst could be handled + // without too much effort. + Instruction *Instr = CS.getInstruction(); + if (!isa<CallInst>(Instr)) + return false; + + // Allow splitting a call-site only when there is no instruction before the + // call-site in the basic block. Based on this constraint, we only clone the + // call instruction, and we do not move a call-site across any other + // instruction. + BasicBlock *CallSiteBB = Instr->getParent(); + if (Instr != CallSiteBB->getFirstNonPHIOrDbg()) + return false; + + // Need 2 predecessors and cannot split an edge from an IndirectBrInst. + SmallVector<BasicBlock *, 2> Preds(predecessors(CallSiteBB)); + if (Preds.size() != 2 || isa<IndirectBrInst>(Preds[0]->getTerminator()) || + isa<IndirectBrInst>(Preds[1]->getTerminator())) + return false; + + return CallSiteBB->canSplitPredecessors(); +} + +/// Return true if the CS is split into its new predecessors which are directly +/// hooked to each of its original predecessors pointed by PredBB1 and PredBB2. +/// In the OR predicated case, PredBB1 will point to the header, and PredBB2 will point +/// to the second compare block. CallInst1 and CallInst2 will be the new +/// call-sites placed in the new predecessors split for PredBB1 and PredBB2, +/// respectively. Therefore, CallInst1 will be the call-site placed +/// between Header and Tail, and CallInst2 will be the call-site between TBB and +/// Tail.
For example, in the IR below with an OR condition, the call-site can +/// be split +/// +/// from : +/// +/// Header: +/// %c = icmp eq i32* %a, null +/// br i1 %c %Tail, %TBB +/// TBB: +/// %c2 = icmp eq i32* %b, null +/// br i1 %c %Tail, %End +/// Tail: +/// %ca = call i1 @callee (i32* %a, i32* %b) +/// +/// to : +/// +/// Header: // PredBB1 is Header +/// %c = icmp eq i32* %a, null +/// br i1 %c %Tail-split1, %TBB +/// TBB: // PredBB2 is TBB +/// %c2 = icmp eq i32* %b, null +/// br i1 %c %Tail-split2, %End +/// Tail-split1: +/// %ca1 = call @callee (i32* null, i32* %b) // CallInst1 +/// br %Tail +/// Tail-split2: +/// %ca2 = call @callee (i32* nonnull %a, i32* null) // CallInst2 +/// br %Tail +/// Tail: +/// %p = phi i1 [%ca1, %Tail-split1],[%ca2, %Tail-split2] +/// +/// Note that for an OR predicated case, CallInst1 and CallInst2 should be +/// created with more constrained arguments in +/// createCallSitesOnOrPredicatedArgument(). +static void splitCallSite(CallSite CS, BasicBlock *PredBB1, BasicBlock *PredBB2, + Instruction *CallInst1, Instruction *CallInst2) { + Instruction *Instr = CS.getInstruction(); + BasicBlock *TailBB = Instr->getParent(); + assert(Instr == (TailBB->getFirstNonPHIOrDbg()) && "Unexpected call-site"); + + BasicBlock *SplitBlock1 = + SplitBlockPredecessors(TailBB, PredBB1, ".predBB1.split"); + BasicBlock *SplitBlock2 = + SplitBlockPredecessors(TailBB, PredBB2, ".predBB2.split"); + + assert((SplitBlock1 && SplitBlock2) && "Unexpected new basic block split."); + + if (!CallInst1) + CallInst1 = Instr->clone(); + if (!CallInst2) + CallInst2 = Instr->clone(); + + CallInst1->insertBefore(&*SplitBlock1->getFirstInsertionPt()); + CallInst2->insertBefore(&*SplitBlock2->getFirstInsertionPt()); + + CallSite CS1(CallInst1); + CallSite CS2(CallInst2); + + // Handle PHIs used as arguments in the call-site. + for (auto &PI : *TailBB) { + PHINode *PN = dyn_cast<PHINode>(&PI); + if (!PN) + break; + unsigned ArgNo = 0; + for (auto &CI : CS.args()) { + if (&*CI == PN) { + CS1.setArgument(ArgNo, PN->getIncomingValueForBlock(SplitBlock1)); + CS2.setArgument(ArgNo, PN->getIncomingValueForBlock(SplitBlock2)); + } + ++ArgNo; + } + } + + // Replace users of the original call with a PHI merging call-sites split. + if (Instr->getNumUses()) { + PHINode *PN = PHINode::Create(Instr->getType(), 2, "phi.call", + TailBB->getFirstNonPHI()); + PN->addIncoming(CallInst1, SplitBlock1); + PN->addIncoming(CallInst2, SplitBlock2); + Instr->replaceAllUsesWith(PN); + } + DEBUG(dbgs() << "split call-site : " << *Instr << " into \n"); + DEBUG(dbgs() << " " << *CallInst1 << " in " << SplitBlock1->getName() + << "\n"); + DEBUG(dbgs() << " " << *CallInst2 << " in " << SplitBlock2->getName() + << "\n"); + Instr->eraseFromParent(); + NumCallSiteSplit++; +} + +// Return true if the call-site has an argument which is a PHI with only +// constant incoming values.
+static bool isPredicatedOnPHI(CallSite CS) { + Instruction *Instr = CS.getInstruction(); + BasicBlock *Parent = Instr->getParent(); + if (Instr != Parent->getFirstNonPHIOrDbg()) + return false; + + for (auto &BI : *Parent) { + if (PHINode *PN = dyn_cast<PHINode>(&BI)) { + for (auto &I : CS.args()) + if (&*I == PN) { + assert(PN->getNumIncomingValues() == 2 && + "Unexpected number of incoming values"); + if (PN->getIncomingBlock(0) == PN->getIncomingBlock(1)) + return false; + if (PN->getIncomingValue(0) == PN->getIncomingValue(1)) + continue; + if (isa<Constant>(PN->getIncomingValue(0)) && + isa<Constant>(PN->getIncomingValue(1))) + return true; + } + } + break; + } + return false; +} + +static bool tryToSplitOnPHIPredicatedArgument(CallSite CS) { + if (!isPredicatedOnPHI(CS)) + return false; + + auto Preds = getTwoPredecessors(CS.getInstruction()->getParent()); + splitCallSite(CS, Preds[0], Preds[1], nullptr, nullptr); + return true; +} +// Check if one of the predecessors is a single predecessor of the other. +// This is a requirement for control flow modeling an OR. HeaderBB points to +// the single predecessor and OrBB points to the other node. HeaderBB potentially +// contains the first compare of the OR and OrBB the second. +static bool isOrHeader(BasicBlock *HeaderBB, BasicBlock *OrBB) { + return OrBB->getSinglePredecessor() == HeaderBB && + HeaderBB->getTerminator()->getNumSuccessors() == 2; +} + +static bool tryToSplitOnOrPredicatedArgument(CallSite CS) { + auto Preds = getTwoPredecessors(CS.getInstruction()->getParent()); + if (!isOrHeader(Preds[0], Preds[1]) && !isOrHeader(Preds[1], Preds[0])) + return false; + + SmallVector<std::pair<ICmpInst *, unsigned>, 2> C1, C2; + recordConditions(CS, Preds[0], C1); + recordConditions(CS, Preds[1], C2); + + Instruction *CallInst1 = addConditions(CS, C1); + Instruction *CallInst2 = addConditions(CS, C2); + if (!CallInst1 && !CallInst2) + return false; + + splitCallSite(CS, Preds[1], Preds[0], CallInst2, CallInst1); + return true; +} + +static bool tryToSplitCallSite(CallSite CS) { + if (!CS.arg_size() || !canSplitCallSite(CS)) + return false; + return tryToSplitOnOrPredicatedArgument(CS) || + tryToSplitOnPHIPredicatedArgument(CS); +} + +static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI) { + bool Changed = false; + for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE;) { + BasicBlock &BB = *BI++; + for (BasicBlock::iterator II = BB.begin(), IE = BB.end(); II != IE;) { + Instruction *I = &*II++; + CallSite CS(cast<Value>(I)); + if (!CS || isa<IntrinsicInst>(I) || isInstructionTriviallyDead(I, &TLI)) + continue; + + Function *Callee = CS.getCalledFunction(); + if (!Callee || Callee->isDeclaration()) + continue; + Changed |= tryToSplitCallSite(CS); + } + } + return Changed; +} + +namespace { +struct CallSiteSplittingLegacyPass : public FunctionPass { + static char ID; + CallSiteSplittingLegacyPass() : FunctionPass(ID) { + initializeCallSiteSplittingLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetLibraryInfoWrapperPass>(); + FunctionPass::getAnalysisUsage(AU); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + + auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + return doCallSiteSplitting(F, TLI); + } +}; +} // namespace + +char CallSiteSplittingLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(CallSiteSplittingLegacyPass, "callsite-splitting", + "Call-site splitting", false,
false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(CallSiteSplittingLegacyPass, "callsite-splitting", + "Call-site splitting", false, false) +FunctionPass *llvm::createCallSiteSplittingPass() { + return new CallSiteSplittingLegacyPass(); +} + +PreservedAnalyses CallSiteSplittingPass::run(Function &F, + FunctionAnalysisManager &AM) { + auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); + + if (!doCallSiteSplitting(F, TLI)) + return PreservedAnalyses::all(); + PreservedAnalyses PA; + return PA; +} diff --git a/lib/Transforms/Scalar/ConstantHoisting.cpp b/lib/Transforms/Scalar/ConstantHoisting.cpp index 122c9314e022..e4b08c5ed305 100644 --- a/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -34,18 +34,39 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/ConstantHoisting.h" -#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Value.h" #include "llvm/Pass.h" +#include "llvm/Support/BlockFrequency.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <iterator> #include <tuple> +#include <utility> using namespace llvm; using namespace consthoist; @@ -62,10 +83,12 @@ static cl::opt<bool> ConstHoistWithBlockFrequency( "without hoisting.")); namespace { + /// \brief The constant hoisting pass. class ConstantHoistingLegacyPass : public FunctionPass { public: static char ID; // Pass identification, replacement for typeid + ConstantHoistingLegacyPass() : FunctionPass(ID) { initializeConstantHoistingLegacyPassPass(*PassRegistry::getPassRegistry()); } @@ -87,9 +110,11 @@ public: private: ConstantHoistingPass Impl; }; -} + +} // end anonymous namespace char ConstantHoistingLegacyPass::ID = 0; + INITIALIZE_PASS_BEGIN(ConstantHoistingLegacyPass, "consthoist", "Constant Hoisting", false, false) INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) @@ -128,7 +153,6 @@ bool ConstantHoistingLegacyPass::runOnFunction(Function &Fn) { return MadeChange; } - /// \brief Find the constant materialization insertion point. Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst, unsigned Idx) const { @@ -217,8 +241,9 @@ static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, } // Visit Orders in bottom-up order. - typedef std::pair<SmallPtrSet<BasicBlock *, 16>, BlockFrequency> - InsertPtsCostPair; + using InsertPtsCostPair = + std::pair<SmallPtrSet<BasicBlock *, 16>, BlockFrequency>; + // InsertPtsMap is a map from a BB to the best insertion points for the // subtree of BB (subtree not including the BB itself). 
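The insertion-point search above exists so that one expensive immediate can be materialized once and nearby constants rebased from it. Roughly, at the source level (hypothetical values; the real decision is driven by TTI->getIntImmCodeSizeCost):

```cpp
#include <cstdint>

// Two large immediates that differ by a small offset become one hoisted base
// plus a cheap add.
uint64_t g(bool c, uint64_t x) {
  const uint64_t Base = 0x123456789ABC0000ULL; // materialized once
  if (c)
    return x + Base;          // was: x + 0x123456789ABC0000
  return x + (Base + 0x10);   // was: x + 0x123456789ABC0010
}
```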
DenseMap<BasicBlock *, InsertPtsCostPair> InsertPtsMap; @@ -310,7 +335,6 @@ SmallPtrSet<Instruction *, 8> ConstantHoistingPass::findConstantInsertionPoint( return InsertPts; } - /// \brief Record constant integer ConstInt for instruction Inst at operand /// index Idx. /// @@ -351,7 +375,6 @@ void ConstantHoistingPass::collectConstantCandidates( } } - /// \brief Check the operand for instruction Inst at index Idx. void ConstantHoistingPass::collectConstantCandidates( ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx) { @@ -393,7 +416,6 @@ void ConstantHoistingPass::collectConstantCandidates( } } - /// \brief Scan the instruction for expensive integer constants and record them /// in the constant candidate vector. void ConstantHoistingPass::collectConstantCandidates( @@ -427,9 +449,8 @@ void ConstantHoistingPass::collectConstantCandidates(Function &Fn) { // bit widths (APInt Operator- does not like that). If the value cannot be // represented in uint64 we return an "empty" APInt. This is then interpreted // as the value is not in range. -static llvm::Optional<APInt> calculateOffsetDiff(const APInt &V1, - const APInt &V2) { - llvm::Optional<APInt> Res = None; +static Optional<APInt> calculateOffsetDiff(const APInt &V1, const APInt &V2) { + Optional<APInt> Res = None; unsigned BW = V1.getBitWidth() > V2.getBitWidth() ? V1.getBitWidth() : V2.getBitWidth(); uint64_t LimVal1 = V1.getLimitedValue(); @@ -496,9 +517,9 @@ ConstantHoistingPass::maximizeConstantsInRange(ConstCandVecType::iterator S, DEBUG(dbgs() << "Cost: " << Cost << "\n"); for (auto C2 = S; C2 != E; ++C2) { - llvm::Optional<APInt> Diff = calculateOffsetDiff( - C2->ConstInt->getValue(), - ConstCand->ConstInt->getValue()); + Optional<APInt> Diff = calculateOffsetDiff( + C2->ConstInt->getValue(), + ConstCand->ConstInt->getValue()); if (Diff) { const int ImmCosts = TTI->getIntImmCodeSizeCost(Opcode, OpndIdx, Diff.getValue(), Ty); @@ -696,6 +717,9 @@ bool ConstantHoistingPass::emitBaseConstants() { IntegerType *Ty = ConstInfo.BaseConstant->getType(); Instruction *Base = new BitCastInst(ConstInfo.BaseConstant, Ty, "const", IP); + + Base->setDebugLoc(IP->getDebugLoc()); + DEBUG(dbgs() << "Hoist constant (" << *ConstInfo.BaseConstant << ") to BB " << IP->getParent()->getName() << '\n' << *Base << '\n'); @@ -714,6 +738,8 @@ bool ConstantHoistingPass::emitBaseConstants() { emitBaseConstants(Base, RCI.Offset, U); ReBasesNum++; } + + Base->setDebugLoc(DILocation::getMergedLocation(Base->getDebugLoc(), U.Inst->getDebugLoc())); } } UsesNum = Uses; @@ -722,7 +748,6 @@ bool ConstantHoistingPass::emitBaseConstants() { assert(!Base->use_empty() && "The use list is empty!?"); assert(isa<Instruction>(Base->user_back()) && "All uses should be instructions."); - Base->setDebugLoc(cast<Instruction>(Base->user_back())->getDebugLoc()); } (void)UsesNum; (void)ReBasesNum; diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 28157783daa7..8f468ebf8949 100644 --- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -12,22 +12,41 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/GlobalsModRef.h" #include 
"llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LazyValueInfo.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Operator.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" #include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" +#include <cassert> +#include <utility> + using namespace llvm; #define DEBUG_TYPE "correlated-value-propagation" @@ -41,13 +60,16 @@ STATISTIC(NumDeadCases, "Number of switch cases removed"); STATISTIC(NumSDivs, "Number of sdiv converted to udiv"); STATISTIC(NumAShrs, "Number of ashr converted to lshr"); STATISTIC(NumSRems, "Number of srem converted to urem"); +STATISTIC(NumOverflows, "Number of overflow checks removed"); static cl::opt<bool> DontProcessAdds("cvp-dont-process-adds", cl::init(true)); namespace { + class CorrelatedValuePropagation : public FunctionPass { public: static char ID; + CorrelatedValuePropagation(): FunctionPass(ID) { initializeCorrelatedValuePropagationPass(*PassRegistry::getPassRegistry()); } @@ -59,9 +81,11 @@ namespace { AU.addPreserved<GlobalsAAWrapperPass>(); } }; -} + +} // end anonymous namespace char CorrelatedValuePropagation::ID = 0; + INITIALIZE_PASS_BEGIN(CorrelatedValuePropagation, "correlated-propagation", "Value Propagation", false, false) INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass) @@ -302,11 +326,72 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) { return Changed; } +// See if we can prove that the given overflow intrinsic will not overflow. +static bool willNotOverflow(IntrinsicInst *II, LazyValueInfo *LVI) { + using OBO = OverflowingBinaryOperator; + auto NoWrap = [&] (Instruction::BinaryOps BinOp, unsigned NoWrapKind) { + Value *RHS = II->getOperand(1); + ConstantRange RRange = LVI->getConstantRange(RHS, II->getParent(), II); + ConstantRange NWRegion = ConstantRange::makeGuaranteedNoWrapRegion( + BinOp, RRange, NoWrapKind); + // As an optimization, do not compute LRange if we do not need it. 
+ if (NWRegion.isEmptySet()) + return false; + Value *LHS = II->getOperand(0); + ConstantRange LRange = LVI->getConstantRange(LHS, II->getParent(), II); + return NWRegion.contains(LRange); + }; + switch (II->getIntrinsicID()) { + default: + break; + case Intrinsic::uadd_with_overflow: + return NoWrap(Instruction::Add, OBO::NoUnsignedWrap); + case Intrinsic::sadd_with_overflow: + return NoWrap(Instruction::Add, OBO::NoSignedWrap); + case Intrinsic::usub_with_overflow: + return NoWrap(Instruction::Sub, OBO::NoUnsignedWrap); + case Intrinsic::ssub_with_overflow: + return NoWrap(Instruction::Sub, OBO::NoSignedWrap); + } + return false; +} + +static void processOverflowIntrinsic(IntrinsicInst *II) { + Value *NewOp = nullptr; + switch (II->getIntrinsicID()) { + default: + llvm_unreachable("Unexpected instruction."); + case Intrinsic::uadd_with_overflow: + case Intrinsic::sadd_with_overflow: + NewOp = BinaryOperator::CreateAdd(II->getOperand(0), II->getOperand(1), + II->getName(), II); + break; + case Intrinsic::usub_with_overflow: + case Intrinsic::ssub_with_overflow: + NewOp = BinaryOperator::CreateSub(II->getOperand(0), II->getOperand(1), + II->getName(), II); + break; + } + ++NumOverflows; + IRBuilder<> B(II); + Value *NewI = B.CreateInsertValue(UndefValue::get(II->getType()), NewOp, 0); + NewI = B.CreateInsertValue(NewI, ConstantInt::getFalse(II->getContext()), 1); + II->replaceAllUsesWith(NewI); + II->eraseFromParent(); +} + /// Infer nonnull attributes for the arguments at the specified callsite. static bool processCallSite(CallSite CS, LazyValueInfo *LVI) { SmallVector<unsigned, 4> ArgNos; unsigned ArgNo = 0; + if (auto *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) { + if (willNotOverflow(II, LVI)) { + processOverflowIntrinsic(II); + return true; + } + } + for (Value *V : CS.args()) { PointerType *Type = dyn_cast<PointerType>(V->getType()); // Try to mark pointer typed parameters as non-null. We skip the @@ -335,18 +420,6 @@ static bool processCallSite(CallSite CS, LazyValueInfo *LVI) { return true; } -// Helper function to rewrite srem and sdiv. As a policy choice, we choose not -// to waste compile time on anything where the operands are local defs. While -// LVI can sometimes reason about such cases, it's not its primary purpose. -static bool hasLocalDefs(BinaryOperator *SDI) { - for (Value *O : SDI->operands()) { - auto *I = dyn_cast<Instruction>(O); - if (I && I->getParent() == SDI->getParent()) - return true; - } - return false; -} - static bool hasPositiveOperands(BinaryOperator *SDI, LazyValueInfo *LVI) { Constant *Zero = ConstantInt::get(SDI->getType(), 0); for (Value *O : SDI->operands()) { @@ -358,7 +431,7 @@ static bool hasPositiveOperands(BinaryOperator *SDI, LazyValueInfo *LVI) { } static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) { - if (SDI->getType()->isVectorTy() || hasLocalDefs(SDI) || + if (SDI->getType()->isVectorTy() || !hasPositiveOperands(SDI, LVI)) return false; @@ -376,7 +449,7 @@ static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) { /// conditions, this can sometimes prove conditions instcombine can't by /// exploiting range information. 
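The effect of willNotOverflow can be sanity-checked concretely. A runnable sketch, with a hypothetical operand range, of why a proven range makes the overflow bit constant false:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // If analysis proves x is in [0, 100], an 8-bit unsigned add of 1 cannot
  // wrap, so a check like llvm.uadd.with.overflow.i8 folds to {x + 1, false}.
  for (uint8_t X = 0; X <= 100; ++X) {
    uint8_t Sum;
    if (__builtin_add_overflow(X, (uint8_t)1, &Sum))
      return 1; // unreachable for this range
  }
  puts("overflow bit is always false for x in [0, 100]");
  return 0;
}
```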
static bool processSDiv(BinaryOperator *SDI, LazyValueInfo *LVI) { - if (SDI->getType()->isVectorTy() || hasLocalDefs(SDI) || + if (SDI->getType()->isVectorTy() || !hasPositiveOperands(SDI, LVI)) return false; @@ -391,7 +464,7 @@ static bool processSDiv(BinaryOperator *SDI, LazyValueInfo *LVI) { } static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) { - if (SDI->getType()->isVectorTy() || hasLocalDefs(SDI)) + if (SDI->getType()->isVectorTy()) return false; Constant *Zero = ConstantInt::get(SDI->getType(), 0); @@ -410,12 +483,12 @@ static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) { } static bool processAdd(BinaryOperator *AddOp, LazyValueInfo *LVI) { - typedef OverflowingBinaryOperator OBO; + using OBO = OverflowingBinaryOperator; if (DontProcessAdds) return false; - if (AddOp->getType()->isVectorTy() || hasLocalDefs(AddOp)) + if (AddOp->getType()->isVectorTy()) return false; bool NSW = AddOp->hasNoSignedWrap(); @@ -492,7 +565,7 @@ static bool runImpl(Function &F, LazyValueInfo *LVI, const SimplifyQuery &SQ) { // blocks before querying later blocks (which require us to analyze early // blocks). Eagerly simplifying shallow blocks means there is strictly less // work to do for deep blocks. This also means we don't visit unreachable - // blocks. + // blocks. for (BasicBlock *BB : depth_first(&F.getEntryBlock())) { bool BBChanged = false; for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp index 1ec38e56aa4c..e703014bb0e6 100644 --- a/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -16,31 +16,55 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/DeadStoreElimination.h" +#include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" +#include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" -#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/Value.h" #include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <cstddef> +#include <iterator> #include <map> +#include <utility> + using namespace llvm; #define 
DEBUG_TYPE "dse" @@ -49,18 +73,23 @@ STATISTIC(NumRedundantStores, "Number of redundant stores deleted"); STATISTIC(NumFastStores, "Number of stores deleted"); STATISTIC(NumFastOther , "Number of other instrs removed"); STATISTIC(NumCompletePartials, "Number of stores dead by later partials"); +STATISTIC(NumModifiedStores, "Number of stores modified"); static cl::opt<bool> EnablePartialOverwriteTracking("enable-dse-partial-overwrite-tracking", cl::init(true), cl::Hidden, cl::desc("Enable partial-overwrite tracking in DSE")); +static cl::opt<bool> +EnablePartialStoreMerging("enable-dse-partial-store-merging", + cl::init(true), cl::Hidden, + cl::desc("Enable partial store merging in DSE")); //===----------------------------------------------------------------------===// // Helper functions //===----------------------------------------------------------------------===// -typedef std::map<int64_t, int64_t> OverlapIntervalsTy; -typedef DenseMap<Instruction *, OverlapIntervalsTy> InstOverlapIntervalsTy; +using OverlapIntervalsTy = std::map<int64_t, int64_t>; +using InstOverlapIntervalsTy = DenseMap<Instruction *, OverlapIntervalsTy>; /// Delete this instruction. Before we do, go through and zero out all the /// operands of this instruction. If any of them become dead, delete them and @@ -209,7 +238,6 @@ static bool isRemovable(Instruction *I) { case Intrinsic::init_trampoline: // Always safe to remove init_trampoline. return true; - case Intrinsic::memset: case Intrinsic::memmove: case Intrinsic::memcpy: @@ -224,7 +252,6 @@ static bool isRemovable(Instruction *I) { return false; } - /// Returns true if the end of this instruction can be safely shortened in /// length. static bool isShortenableAtTheEnd(Instruction *I) { @@ -287,14 +314,24 @@ static uint64_t getPointerSize(const Value *V, const DataLayout &DL, } namespace { -enum OverwriteResult { OW_Begin, OW_Complete, OW_End, OW_Unknown }; -} + +enum OverwriteResult { + OW_Begin, + OW_Complete, + OW_End, + OW_PartialEarlierWithFullLater, + OW_Unknown +}; + +} // end anonymous namespace /// Return 'OW_Complete' if a store to the 'Later' location completely /// overwrites a store to the 'Earlier' location, 'OW_End' if the end of the /// 'Earlier' location is completely overwritten by 'Later', 'OW_Begin' if the -/// beginning of the 'Earlier' location is overwritten by 'Later', or -/// 'OW_Unknown' if nothing can be determined. +/// beginning of the 'Earlier' location is overwritten by 'Later'. +/// 'OW_PartialEarlierWithFullLater' means that an earlier (big) store was +/// overwritten by a latter (smaller) store which doesn't write outside the big +/// store's memory locations. Returns 'OW_Unknown' if nothing can be determined. static OverwriteResult isOverwrite(const MemoryLocation &Later, const MemoryLocation &Earlier, const DataLayout &DL, @@ -427,6 +464,19 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later, } } + // Check for an earlier store which writes to all the memory locations that + // the later store writes to. + if (EnablePartialStoreMerging && LaterOff >= EarlierOff && + int64_t(EarlierOff + Earlier.Size) > LaterOff && + uint64_t(LaterOff - EarlierOff) + Later.Size <= Earlier.Size) { + DEBUG(dbgs() << "DSE: Partial overwrite an earlier load [" << EarlierOff + << ", " << int64_t(EarlierOff + Earlier.Size) + << ") by a later store [" << LaterOff << ", " + << int64_t(LaterOff + Later.Size) << ")\n"); + // TODO: Maybe come up with a better name? 
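The merge that OW_PartialEarlierWithFullLater enables (implemented further down in eliminateDeadStores) reduces to mask-and-shift arithmetic. A runnable little-endian sketch with made-up store values:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t Earlier = 0x11223344;   // earlier 4-byte store
  uint8_t Later = 0xAB;            // later 1-byte store at byte offset 1
  unsigned LShiftAmount = 1 * 8;   // little-endian: byte offset in bits
  uint32_t Mask = 0xFFu << LShiftAmount;
  uint32_t Merged = (Earlier & ~Mask) | (uint32_t(Later) << LShiftAmount);
  printf("0x%08X\n", Merged);      // prints 0x1122AB44
}
```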
+ return OW_PartialEarlierWithFullLater; + } + // Another interesting case is if the later store overwrites the end of the // earlier store. // @@ -544,11 +594,9 @@ static bool memoryIsNotModifiedBetween(Instruction *FirstI, } for (; BI != EI; ++BI) { Instruction *I = &*BI; - if (I->mayWriteToMemory() && I != SecondI) { - auto Res = AA->getModRefInfo(I, MemLoc); - if (Res & MRI_Mod) + if (I->mayWriteToMemory() && I != SecondI) + if (isModSet(AA->getModRefInfo(I, MemLoc))) return false; - } } if (B != FirstBB) { assert(B != &FirstBB->getParent()->getEntryBlock() && @@ -772,9 +820,7 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA, // the call is live. DeadStackObjects.remove_if([&](Value *I) { // See if the call site touches the value. - ModRefInfo A = AA->getModRefInfo(CS, I, getPointerSize(I, DL, *TLI)); - - return A == MRI_ModRef || A == MRI_Ref; + return isRefSet(AA->getModRefInfo(CS, I, getPointerSize(I, DL, *TLI))); }); // If all of the allocas were clobbered by the call then we're not going @@ -840,7 +886,7 @@ static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierOffset, if (!IsOverwriteEnd) LaterOffset = int64_t(LaterOffset + LaterSize); - if (!(llvm::isPowerOf2_64(LaterOffset) && EarlierWriteAlign <= LaterOffset) && + if (!(isPowerOf2_64(LaterOffset) && EarlierWriteAlign <= LaterOffset) && !((EarlierWriteAlign != 0) && LaterOffset % EarlierWriteAlign == 0)) return false; @@ -1094,6 +1140,8 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, // If we find a write that is a) removable (i.e., non-volatile), b) is // completely obliterated by the store to 'Loc', and c) which we know that // 'Inst' doesn't load from, then we can remove it. + // Also try to merge two stores if a later one only touches memory written + // to by the earlier one. if (isRemovable(DepWrite) && !isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) { int64_t InstWriteOffset, DepWriteOffset; @@ -1123,6 +1171,72 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, bool IsOverwriteEnd = (OR == OW_End); MadeChange |= tryToShorten(DepWrite, DepWriteOffset, EarlierSize, InstWriteOffset, LaterSize, IsOverwriteEnd); + } else if (EnablePartialStoreMerging && + OR == OW_PartialEarlierWithFullLater) { + auto *Earlier = dyn_cast<StoreInst>(DepWrite); + auto *Later = dyn_cast<StoreInst>(Inst); + if (Earlier && isa<ConstantInt>(Earlier->getValueOperand()) && + Later && isa<ConstantInt>(Later->getValueOperand())) { + // If the store we find is: + // a) partially overwritten by the store to 'Loc' + // b) the later store is fully contained in the earlier one and + // c) they both have a constant value + // Merge the two stores, replacing the earlier store's value with a + // merge of both values. + // TODO: Deal with other constant types (vectors, etc), and probably + // some mem intrinsics (if needed) + + APInt EarlierValue = + cast<ConstantInt>(Earlier->getValueOperand())->getValue(); + APInt LaterValue = + cast<ConstantInt>(Later->getValueOperand())->getValue(); + unsigned LaterBits = LaterValue.getBitWidth(); + assert(EarlierValue.getBitWidth() > LaterValue.getBitWidth()); + LaterValue = LaterValue.zext(EarlierValue.getBitWidth()); + + // Offset of the smaller store inside the larger store + unsigned BitOffsetDiff = (InstWriteOffset - DepWriteOffset) * 8; + unsigned LShiftAmount = + DL.isBigEndian() + ? 
EarlierValue.getBitWidth() - BitOffsetDiff - LaterBits + : BitOffsetDiff; + APInt Mask = + APInt::getBitsSet(EarlierValue.getBitWidth(), LShiftAmount, + LShiftAmount + LaterBits); + // Clear the bits we'll be replacing, then OR with the smaller + // store, shifted appropriately. + APInt Merged = + (EarlierValue & ~Mask) | (LaterValue << LShiftAmount); + DEBUG(dbgs() << "DSE: Merge Stores:\n Earlier: " << *DepWrite + << "\n Later: " << *Inst + << "\n Merged Value: " << Merged << '\n'); + + auto *SI = new StoreInst( + ConstantInt::get(Earlier->getValueOperand()->getType(), Merged), + Earlier->getPointerOperand(), false, Earlier->getAlignment(), + Earlier->getOrdering(), Earlier->getSyncScopeID(), DepWrite); + + unsigned MDToKeep[] = {LLVMContext::MD_dbg, LLVMContext::MD_tbaa, + LLVMContext::MD_alias_scope, + LLVMContext::MD_noalias, + LLVMContext::MD_nontemporal}; + SI->copyMetadata(*DepWrite, MDToKeep); + ++NumModifiedStores; + + // Remove earlier, wider, store + size_t Idx = InstrOrdering.lookup(DepWrite); + InstrOrdering.erase(DepWrite); + InstrOrdering.insert(std::make_pair(SI, Idx)); + + // Delete the old stores and now-dead instructions that feed them. + deleteDeadInstruction(Inst, &BBI, *MD, *TLI, IOL, &InstrOrdering); + deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL, + &InstrOrdering); + MadeChange = true; + + // We erased DepWrite and Inst (Loc); start over. + break; + } } } @@ -1137,7 +1251,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, if (DepWrite == &BB.front()) break; // Can't look past this instruction if it might read 'Loc'. - if (AA->getModRefInfo(DepWrite, Loc) & MRI_Ref) + if (isRefSet(AA->getModRefInfo(DepWrite, Loc))) break; InstDep = MD->getPointerDependencyFrom(Loc, /*isLoad=*/ false, @@ -1190,9 +1304,12 @@ PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) { } namespace { + /// A legacy pass for the legacy pass manager that wraps \c DSEPass. class DSELegacyPass : public FunctionPass { public: + static char ID; // Pass identification, replacement for typeid + DSELegacyPass() : FunctionPass(ID) { initializeDSELegacyPassPass(*PassRegistry::getPassRegistry()); } @@ -1221,12 +1338,12 @@ public: AU.addPreserved<GlobalsAAWrapperPass>(); AU.addPreserved<MemoryDependenceWrapperPass>(); } - - static char ID; // Pass identification, replacement for typeid }; + } // end anonymous namespace char DSELegacyPass::ID = 0; + INITIALIZE_PASS_BEGIN(DSELegacyPass, "dse", "Dead Store Elimination", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) diff --git a/lib/Transforms/Scalar/DivRemPairs.cpp b/lib/Transforms/Scalar/DivRemPairs.cpp new file mode 100644 index 000000000000..e383af89a384 --- /dev/null +++ b/lib/Transforms/Scalar/DivRemPairs.cpp @@ -0,0 +1,206 @@ +//===- DivRemPairs.cpp - Hoist/decompose division and remainder -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass hoists and/or decomposes integer division and remainder +// instructions to enable CFG improvements and better codegen. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/DivRemPairs.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BypassSlowDivision.h" +using namespace llvm; + +#define DEBUG_TYPE "div-rem-pairs" +STATISTIC(NumPairs, "Number of div/rem pairs"); +STATISTIC(NumHoisted, "Number of instructions hoisted"); +STATISTIC(NumDecomposed, "Number of instructions decomposed"); + +/// Find matching pairs of integer div/rem ops (they have the same numerator, +/// denominator, and signedness). If they exist in different basic blocks, bring +/// them together by hoisting or replace the common division operation that is +/// implicit in the remainder: +/// X % Y <--> X - ((X / Y) * Y). +/// +/// We can largely ignore the normal safety and cost constraints on speculation +/// of these ops when we find a matching pair. This is because we are already +/// guaranteed that any exceptions and most cost are already incurred by the +/// first member of the pair. +/// +/// Note: This transform could be an oddball enhancement to EarlyCSE, GVN, or +/// SimplifyCFG, but it's split off on its own because it's different enough +/// that it doesn't quite match the stated objectives of those passes. +static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI, + const DominatorTree &DT) { + bool Changed = false; + + // Insert all divide and remainder instructions into maps keyed by their + // operands and opcode (signed or unsigned). + DenseMap<DivRemMapKey, Instruction *> DivMap, RemMap; + for (auto &BB : F) { + for (auto &I : BB) { + if (I.getOpcode() == Instruction::SDiv) + DivMap[DivRemMapKey(true, I.getOperand(0), I.getOperand(1))] = &I; + else if (I.getOpcode() == Instruction::UDiv) + DivMap[DivRemMapKey(false, I.getOperand(0), I.getOperand(1))] = &I; + else if (I.getOpcode() == Instruction::SRem) + RemMap[DivRemMapKey(true, I.getOperand(0), I.getOperand(1))] = &I; + else if (I.getOpcode() == Instruction::URem) + RemMap[DivRemMapKey(false, I.getOperand(0), I.getOperand(1))] = &I; + } + } + + // We can iterate over either map because we are only looking for matched + // pairs. Choose remainders for efficiency because they are usually even more + // rare than division. + for (auto &RemPair : RemMap) { + // Find the matching division instruction from the division map. + Instruction *DivInst = DivMap[RemPair.getFirst()]; + if (!DivInst) + continue; + + // We have a matching pair of div/rem instructions. If one dominates the + // other, hoist and/or replace one. + NumPairs++; + Instruction *RemInst = RemPair.getSecond(); + bool IsSigned = DivInst->getOpcode() == Instruction::SDiv; + bool HasDivRemOp = TTI.hasDivRemOp(DivInst->getType(), IsSigned); + + // If the target supports div+rem and the instructions are in the same block + // already, there's nothing to do. The backend should handle this. If the + // target does not support div+rem, then we will decompose the rem. + if (HasDivRemOp && RemInst->getParent() == DivInst->getParent()) + continue; + + bool DivDominates = DT.dominates(DivInst, RemInst); + if (!DivDominates && !DT.dominates(RemInst, DivInst)) + continue; + + if (HasDivRemOp) { + // The target has a single div/rem operation. 
Hoist the lower instruction + // to make the matched pair visible to the backend. + if (DivDominates) + RemInst->moveAfter(DivInst); + else + DivInst->moveAfter(RemInst); + NumHoisted++; + } else { + // The target does not have a single div/rem operation. Decompose the + // remainder calculation as: + // X % Y --> X - ((X / Y) * Y). + Value *X = RemInst->getOperand(0); + Value *Y = RemInst->getOperand(1); + Instruction *Mul = BinaryOperator::CreateMul(DivInst, Y); + Instruction *Sub = BinaryOperator::CreateSub(X, Mul); + + // If the remainder dominates, then hoist the division up to that block: + // + // bb1: + // %rem = srem %x, %y + // bb2: + // %div = sdiv %x, %y + // --> + // bb1: + // %div = sdiv %x, %y + // %mul = mul %div, %y + // %rem = sub %x, %mul + // + // If the division dominates, it's already in the right place. The mul+sub + // will be in a different block because we don't assume that they are + // cheap to speculatively execute: + // + // bb1: + // %div = sdiv %x, %y + // bb2: + // %rem = srem %x, %y + // --> + // bb1: + // %div = sdiv %x, %y + // bb2: + // %mul = mul %div, %y + // %rem = sub %x, %mul + // + // If the div and rem are in the same block, we do the same transform, + // but any code movement would be within the same block. + + if (!DivDominates) + DivInst->moveBefore(RemInst); + Mul->insertAfter(RemInst); + Sub->insertAfter(Mul); + + // Now kill the explicit remainder. We have replaced it with: + // (sub X, (mul (div X, Y), Y) + RemInst->replaceAllUsesWith(Sub); + RemInst->eraseFromParent(); + NumDecomposed++; + } + Changed = true; + } + + return Changed; +} + +// Pass manager boilerplate below here. + +namespace { +struct DivRemPairsLegacyPass : public FunctionPass { + static char ID; + DivRemPairsLegacyPass() : FunctionPass(ID) { + initializeDivRemPairsLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.setPreservesCFG(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + FunctionPass::getAnalysisUsage(AU); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + return optimizeDivRem(F, TTI, DT); + } +}; +} + +char DivRemPairsLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(DivRemPairsLegacyPass, "div-rem-pairs", + "Hoist/decompose integer division and remainder", false, + false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(DivRemPairsLegacyPass, "div-rem-pairs", + "Hoist/decompose integer division and remainder", false, + false) +FunctionPass *llvm::createDivRemPairsPass() { + return new DivRemPairsLegacyPass(); +} + +PreservedAnalyses DivRemPairsPass::run(Function &F, + FunctionAnalysisManager &FAM) { + TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F); + DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F); + if (!optimizeDivRem(F, TTI, DT)) + return PreservedAnalyses::all(); + // TODO: This pass just hoists/replaces math ops - all analyses are preserved? 
+ PreservedAnalyses PA; + PA.preserveSet<CFGAnalyses>(); + PA.preserve<GlobalsAA>(); + return PA; +} diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp index c5c9b2c185d6..5798e1c4ee99 100644 --- a/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/lib/Transforms/Scalar/EarlyCSE.cpp @@ -13,9 +13,12 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/EarlyCSE.h" +#include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/Hashing.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopedHashTable.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/GlobalsModRef.h" @@ -24,18 +27,37 @@ #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/Value.h" #include "llvm/Pass.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/AtomicOrdering.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/RecyclingAllocator.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" +#include <cassert> #include <deque> +#include <memory> +#include <utility> + using namespace llvm; using namespace llvm::PatternMatch; @@ -53,6 +75,7 @@ STATISTIC(NumDSE, "Number of trivial dead stores removed"); //===----------------------------------------------------------------------===// namespace { + /// \brief Struct representing the available values in the scoped hash table. struct SimpleValue { Instruction *Inst; @@ -77,20 +100,25 @@ struct SimpleValue { isa<ExtractValueInst>(Inst) || isa<InsertValueInst>(Inst); } }; -} + +} // end anonymous namespace namespace llvm { + template <> struct DenseMapInfo<SimpleValue> { static inline SimpleValue getEmptyKey() { return DenseMapInfo<Instruction *>::getEmptyKey(); } + static inline SimpleValue getTombstoneKey() { return DenseMapInfo<Instruction *>::getTombstoneKey(); } + static unsigned getHashValue(SimpleValue Val); static bool isEqual(SimpleValue LHS, SimpleValue RHS); }; -} + +} // end namespace llvm unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) { Instruction *Inst = Val.Inst; @@ -115,6 +143,21 @@ unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) { return hash_combine(Inst->getOpcode(), Pred, LHS, RHS); } + // Hash min/max/abs (cmp + select) to allow for commuted operands. + // Min/max may also have non-canonical compare predicate (eg, the compare for + // smin may use 'sgt' rather than 'slt'), and non-canonical operands in the + // compare. + Value *A, *B; + SelectPatternFlavor SPF = matchSelectPattern(Inst, A, B).Flavor; + // TODO: We should also detect FP min/max. 
+ if (SPF == SPF_SMIN || SPF == SPF_SMAX || + SPF == SPF_UMIN || SPF == SPF_UMAX || + SPF == SPF_ABS || SPF == SPF_NABS) { + if (A > B) + std::swap(A, B); + return hash_combine(Inst->getOpcode(), SPF, A, B); + } + if (CastInst *CI = dyn_cast<CastInst>(Inst)) return hash_combine(CI->getOpcode(), CI->getType(), CI->getOperand(0)); @@ -173,6 +216,20 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) { LHSCmp->getSwappedPredicate() == RHSCmp->getPredicate(); } + // Min/max/abs can occur with commuted operands, non-canonical predicates, + // and/or non-canonical operands. + Value *LHSA, *LHSB; + SelectPatternFlavor LSPF = matchSelectPattern(LHSI, LHSA, LHSB).Flavor; + // TODO: We should also detect FP min/max. + if (LSPF == SPF_SMIN || LSPF == SPF_SMAX || + LSPF == SPF_UMIN || LSPF == SPF_UMAX || + LSPF == SPF_ABS || LSPF == SPF_NABS) { + Value *RHSA, *RHSB; + SelectPatternFlavor RSPF = matchSelectPattern(RHSI, RHSA, RHSB).Flavor; + return (LSPF == RSPF && ((LHSA == RHSA && LHSB == RHSB) || + (LHSA == RHSB && LHSB == RHSA))); + } + return false; } @@ -181,6 +238,7 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) { //===----------------------------------------------------------------------===// namespace { + /// \brief Struct representing the available call values in the scoped hash /// table. struct CallValue { @@ -206,20 +264,25 @@ struct CallValue { return true; } }; -} + +} // end anonymous namespace namespace llvm { + template <> struct DenseMapInfo<CallValue> { static inline CallValue getEmptyKey() { return DenseMapInfo<Instruction *>::getEmptyKey(); } + static inline CallValue getTombstoneKey() { return DenseMapInfo<Instruction *>::getTombstoneKey(); } + static unsigned getHashValue(CallValue Val); static bool isEqual(CallValue LHS, CallValue RHS); }; -} + +} // end namespace llvm unsigned DenseMapInfo<CallValue>::getHashValue(CallValue Val) { Instruction *Inst = Val.Inst; @@ -241,6 +304,7 @@ bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) { //===----------------------------------------------------------------------===// namespace { + /// \brief A simple and fast domtree-based CSE pass. /// /// This pass does a simple depth-first walk over the dominator tree, @@ -257,10 +321,13 @@ public: const SimplifyQuery SQ; MemorySSA *MSSA; std::unique_ptr<MemorySSAUpdater> MSSAUpdater; - typedef RecyclingAllocator< - BumpPtrAllocator, ScopedHashTableVal<SimpleValue, Value *>> AllocatorTy; - typedef ScopedHashTable<SimpleValue, Value *, DenseMapInfo<SimpleValue>, - AllocatorTy> ScopedHTType; + + using AllocatorTy = + RecyclingAllocator<BumpPtrAllocator, + ScopedHashTableVal<SimpleValue, Value *>>; + using ScopedHTType = + ScopedHashTable<SimpleValue, Value *, DenseMapInfo<SimpleValue>, + AllocatorTy>; /// \brief A scoped hash table of the current values of all of our simple /// scalar expressions. @@ -285,44 +352,45 @@ public: /// present the table; it is the responsibility of the consumer to inspect /// the atomicity/volatility if needed. 
struct LoadValue { - Instruction *DefInst; - unsigned Generation; - int MatchingId; - bool IsAtomic; - bool IsInvariant; - LoadValue() - : DefInst(nullptr), Generation(0), MatchingId(-1), IsAtomic(false), - IsInvariant(false) {} + Instruction *DefInst = nullptr; + unsigned Generation = 0; + int MatchingId = -1; + bool IsAtomic = false; + bool IsInvariant = false; + + LoadValue() = default; LoadValue(Instruction *Inst, unsigned Generation, unsigned MatchingId, bool IsAtomic, bool IsInvariant) : DefInst(Inst), Generation(Generation), MatchingId(MatchingId), IsAtomic(IsAtomic), IsInvariant(IsInvariant) {} }; - typedef RecyclingAllocator<BumpPtrAllocator, - ScopedHashTableVal<Value *, LoadValue>> - LoadMapAllocator; - typedef ScopedHashTable<Value *, LoadValue, DenseMapInfo<Value *>, - LoadMapAllocator> LoadHTType; + + using LoadMapAllocator = + RecyclingAllocator<BumpPtrAllocator, + ScopedHashTableVal<Value *, LoadValue>>; + using LoadHTType = + ScopedHashTable<Value *, LoadValue, DenseMapInfo<Value *>, + LoadMapAllocator>; + LoadHTType AvailableLoads; /// \brief A scoped hash table of the current values of read-only call /// values. /// /// It uses the same generation count as loads. - typedef ScopedHashTable<CallValue, std::pair<Instruction *, unsigned>> - CallHTType; + using CallHTType = + ScopedHashTable<CallValue, std::pair<Instruction *, unsigned>>; CallHTType AvailableCalls; /// \brief This is the current generation of the memory value. - unsigned CurrentGeneration; + unsigned CurrentGeneration = 0; /// \brief Set up the EarlyCSE runner for a particular function. EarlyCSE(const DataLayout &DL, const TargetLibraryInfo &TLI, const TargetTransformInfo &TTI, DominatorTree &DT, AssumptionCache &AC, MemorySSA *MSSA) : TLI(TLI), TTI(TTI), DT(DT), AC(AC), SQ(DL, &TLI, &DT, &AC), MSSA(MSSA), - MSSAUpdater(make_unique<MemorySSAUpdater>(MSSA)), CurrentGeneration(0) { - } + MSSAUpdater(llvm::make_unique<MemorySSAUpdater>(MSSA)) {} bool run(); @@ -336,11 +404,10 @@ private: CallHTType &AvailableCalls) : Scope(AvailableValues), LoadScope(AvailableLoads), CallScope(AvailableCalls) {} - - private: NodeScope(const NodeScope &) = delete; - void operator=(const NodeScope &) = delete; + NodeScope &operator=(const NodeScope &) = delete; + private: ScopedHTType::ScopeTy Scope; LoadHTType::ScopeTy LoadScope; CallHTType::ScopeTy CallScope; @@ -356,8 +423,10 @@ private: CallHTType &AvailableCalls, unsigned cg, DomTreeNode *n, DomTreeNode::iterator child, DomTreeNode::iterator end) : CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child), - EndIter(end), Scopes(AvailableValues, AvailableLoads, AvailableCalls), - Processed(false) {} + EndIter(end), Scopes(AvailableValues, AvailableLoads, AvailableCalls) + {} + StackNode(const StackNode &) = delete; + StackNode &operator=(const StackNode &) = delete; // Accessors. unsigned currentGeneration() { return CurrentGeneration; } @@ -365,27 +434,25 @@ private: void childGeneration(unsigned generation) { ChildGeneration = generation; } DomTreeNode *node() { return Node; } DomTreeNode::iterator childIter() { return ChildIter; } + DomTreeNode *nextChild() { DomTreeNode *child = *ChildIter; ++ChildIter; return child; } + DomTreeNode::iterator end() { return EndIter; } bool isProcessed() { return Processed; } void process() { Processed = true; } private: - StackNode(const StackNode &) = delete; - void operator=(const StackNode &) = delete; - - // Members. 
unsigned CurrentGeneration; unsigned ChildGeneration; DomTreeNode *Node; DomTreeNode::iterator ChildIter; DomTreeNode::iterator EndIter; NodeScope Scopes; - bool Processed; + bool Processed = false; }; /// \brief Wrapper class to handle memory instructions, including loads, @@ -393,24 +460,28 @@ private: class ParseMemoryInst { public: ParseMemoryInst(Instruction *Inst, const TargetTransformInfo &TTI) - : IsTargetMemInst(false), Inst(Inst) { + : Inst(Inst) { if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) if (TTI.getTgtMemIntrinsic(II, Info)) IsTargetMemInst = true; } + bool isLoad() const { if (IsTargetMemInst) return Info.ReadMem; return isa<LoadInst>(Inst); } + bool isStore() const { if (IsTargetMemInst) return Info.WriteMem; return isa<StoreInst>(Inst); } + bool isAtomic() const { if (IsTargetMemInst) return Info.Ordering != AtomicOrdering::NotAtomic; return Inst->isAtomic(); } + bool isUnordered() const { if (IsTargetMemInst) return Info.isUnordered(); @@ -447,6 +518,7 @@ private: return (getPointerOperand() == Inst.getPointerOperand() && getMatchingId() == Inst.getMatchingId()); } + bool isValid() const { return getPointerOperand() != nullptr; } // For regular (non-intrinsic) loads/stores, this is set to -1. For @@ -457,6 +529,7 @@ private: if (IsTargetMemInst) return Info.MatchingId; return -1; } + Value *getPointerOperand() const { if (IsTargetMemInst) return Info.PtrVal; if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { @@ -466,17 +539,19 @@ private: } return nullptr; } + bool mayReadFromMemory() const { if (IsTargetMemInst) return Info.ReadMem; return Inst->mayReadFromMemory(); } + bool mayWriteToMemory() const { if (IsTargetMemInst) return Info.WriteMem; return Inst->mayWriteToMemory(); } private: - bool IsTargetMemInst; + bool IsTargetMemInst = false; MemIntrinsicInfo Info; Instruction *Inst; }; @@ -524,8 +599,8 @@ private: for (MemoryPhi *MP : PhisToCheck) { MemoryAccess *FirstIn = MP->getIncomingValue(0); - if (all_of(MP->incoming_values(), - [=](Use &In) { return In == FirstIn; })) + if (llvm::all_of(MP->incoming_values(), + [=](Use &In) { return In == FirstIn; })) WorkQueue.push_back(MP); } PhisToCheck.clear(); @@ -533,7 +608,8 @@ private: } } }; -} + +} // end anonymous namespace /// Determine if the memory referenced by LaterInst is from the same heap /// version as EarlierInst. @@ -663,6 +739,12 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { continue; } + // Skip sideeffect intrinsics, for the same reason as assume intrinsics. + if (match(Inst, m_Intrinsic<Intrinsic::sideeffect>())) { + DEBUG(dbgs() << "EarlyCSE skipping sideeffect: " << *Inst << '\n'); + continue; + } + // Skip invariant.start intrinsics since they only read memory, and we can // forward values across it. Also, we don't need to consume the last store // since the semantics of invariant.start allow us to perform DSE of the @@ -1014,6 +1096,7 @@ PreservedAnalyses EarlyCSEPass::run(Function &F, } namespace { + /// \brief A simple and fast domtree-based CSE pass.
/// /// This pass does a simple depth-first walk over the dominator tree, @@ -1062,7 +1145,8 @@ public: AU.setPreservesCFG(); } }; -} + +} // end anonymous namespace using EarlyCSELegacyPass = EarlyCSELegacyCommonPass</*UseMemorySSA=*/false>; diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index ea28705e684d..e2c1eaf58e43 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -20,39 +20,64 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CFG.h" -#include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" -#include "llvm/Analysis/OptimizationDiagnosticInfo.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/PHITransAddr.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/IR/Dominators.h" -#include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include "llvm/Transforms/Utils/VNCoercion.h" - +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <utility> #include <vector> + using namespace llvm; using namespace llvm::gvn; using namespace llvm::VNCoercion; @@ -80,6 +105,7 @@ MaxRecurseDepth("max-recurse-depth", cl::Hidden, cl::init(1000), cl::ZeroOrMore, struct llvm::GVN::Expression { uint32_t opcode; Type *type; + bool commutative = false; SmallVector<uint32_t, 4> varargs; Expression(uint32_t o = ~2U) : opcode(o) {} @@ -104,20 +130,23 @@ struct llvm::GVN::Expression { }; namespace llvm { + template <> struct DenseMapInfo<GVN::Expression> { static inline GVN::Expression getEmptyKey() { return ~0U; } - static inline GVN::Expression getTombstoneKey() { return ~1U; } static unsigned getHashValue(const GVN::Expression &e) { using llvm::hash_value; + return static_cast<unsigned>(hash_value(e)); } + static bool isEqual(const GVN::Expression &LHS, const GVN::Expression &RHS) { return LHS 
== RHS; } }; -} // End llvm namespace. + +} // end namespace llvm /// Represents a particular available value that we know how to materialize. /// Materialization of an AvailableValue never fails. An AvailableValue is @@ -216,6 +245,7 @@ struct llvm::gvn::AvailableValueInBlock { unsigned Offset = 0) { return get(BB, AvailableValue::get(V, Offset)); } + static AvailableValueInBlock getUndef(BasicBlock *BB) { return get(BB, AvailableValue::getUndef()); } @@ -246,6 +276,7 @@ GVN::Expression GVN::ValueTable::createExpr(Instruction *I) { assert(I->getNumOperands() == 2 && "Unsupported commutative instruction!"); if (e.varargs[0] > e.varargs[1]) std::swap(e.varargs[0], e.varargs[1]); + e.commutative = true; } if (CmpInst *C = dyn_cast<CmpInst>(I)) { @@ -256,6 +287,7 @@ GVN::Expression GVN::ValueTable::createExpr(Instruction *I) { Predicate = CmpInst::getSwappedPredicate(Predicate); } e.opcode = (C->getOpcode() << 8) | Predicate; + e.commutative = true; } else if (InsertValueInst *E = dyn_cast<InsertValueInst>(I)) { for (InsertValueInst::idx_iterator II = E->idx_begin(), IE = E->idx_end(); II != IE; ++II) @@ -281,6 +313,7 @@ GVN::Expression GVN::ValueTable::createCmpExpr(unsigned Opcode, Predicate = CmpInst::getSwappedPredicate(Predicate); } e.opcode = (Opcode << 8) | Predicate; + e.commutative = true; return e; } @@ -340,7 +373,7 @@ GVN::Expression GVN::ValueTable::createExtractvalueExpr(ExtractValueInst *EI) { // ValueTable External Functions //===----------------------------------------------------------------------===// -GVN::ValueTable::ValueTable() : nextValueNumber(1) {} +GVN::ValueTable::ValueTable() = default; GVN::ValueTable::ValueTable(const ValueTable &) = default; GVN::ValueTable::ValueTable(ValueTable &&) = default; GVN::ValueTable::~ValueTable() = default; @@ -348,25 +381,25 @@ GVN::ValueTable::~ValueTable() = default; /// add - Insert a value into the table with a specified value number. 
void GVN::ValueTable::add(Value *V, uint32_t num) { valueNumbering.insert(std::make_pair(V, num)); + if (PHINode *PN = dyn_cast<PHINode>(V)) + NumberingPhi[num] = PN; } uint32_t GVN::ValueTable::lookupOrAddCall(CallInst *C) { if (AA->doesNotAccessMemory(C)) { Expression exp = createExpr(C); - uint32_t &e = expressionNumbering[exp]; - if (!e) e = nextValueNumber++; + uint32_t e = assignExpNewValueNum(exp).first; valueNumbering[C] = e; return e; } else if (AA->onlyReadsMemory(C)) { Expression exp = createExpr(C); - uint32_t &e = expressionNumbering[exp]; - if (!e) { - e = nextValueNumber++; - valueNumbering[C] = e; - return e; + auto ValNum = assignExpNewValueNum(exp); + if (ValNum.second) { + valueNumbering[C] = ValNum.first; + return ValNum.first; } if (!MD) { - e = nextValueNumber++; + uint32_t e = assignExpNewValueNum(exp).first; valueNumbering[C] = e; return e; } @@ -452,7 +485,6 @@ uint32_t GVN::ValueTable::lookupOrAddCall(CallInst *C) { uint32_t v = lookupOrAdd(cdep); valueNumbering[C] = v; return v; - } else { valueNumbering[C] = nextValueNumber; return nextValueNumber++; @@ -522,23 +554,29 @@ uint32_t GVN::ValueTable::lookupOrAdd(Value *V) { case Instruction::ExtractValue: exp = createExtractvalueExpr(cast<ExtractValueInst>(I)); break; + case Instruction::PHI: + valueNumbering[V] = nextValueNumber; + NumberingPhi[nextValueNumber] = cast<PHINode>(V); + return nextValueNumber++; default: valueNumbering[V] = nextValueNumber; return nextValueNumber++; } - uint32_t& e = expressionNumbering[exp]; - if (!e) e = nextValueNumber++; + uint32_t e = assignExpNewValueNum(exp).first; valueNumbering[V] = e; return e; } /// Returns the value number of the specified value. Fails if /// the value has not yet been numbered. -uint32_t GVN::ValueTable::lookup(Value *V) const { +uint32_t GVN::ValueTable::lookup(Value *V, bool Verify) const { DenseMap<Value*, uint32_t>::const_iterator VI = valueNumbering.find(V); - assert(VI != valueNumbering.end() && "Value not numbered?"); - return VI->second; + if (Verify) { + assert(VI != valueNumbering.end() && "Value not numbered?"); + return VI->second; + } + return (VI != valueNumbering.end()) ? VI->second : 0; } /// Returns the value number of the given comparison, @@ -549,21 +587,28 @@ uint32_t GVN::ValueTable::lookupOrAddCmp(unsigned Opcode, CmpInst::Predicate Predicate, Value *LHS, Value *RHS) { Expression exp = createCmpExpr(Opcode, Predicate, LHS, RHS); - uint32_t& e = expressionNumbering[exp]; - if (!e) e = nextValueNumber++; - return e; + return assignExpNewValueNum(exp).first; } /// Remove all entries from the ValueTable. void GVN::ValueTable::clear() { valueNumbering.clear(); expressionNumbering.clear(); + NumberingPhi.clear(); + PhiTranslateTable.clear(); nextValueNumber = 1; + Expressions.clear(); + ExprIdx.clear(); + nextExprNumber = 0; } /// Remove a value from the value numbering. void GVN::ValueTable::erase(Value *V) { + uint32_t Num = valueNumbering.lookup(V); valueNumbering.erase(V); + // If V is a PHINode, V <--> value number is a one-to-one mapping. + if (isa<PHINode>(V)) + NumberingPhi.erase(Num); } /// verifyRemoved - Verify that the value is removed from all internal data @@ -693,9 +738,6 @@ SpeculationFailure: return false; } - - - /// Given a set of loads specified by ValuesPerBlock, /// construct SSA form, allowing us to eliminate LI. This returns the value /// that should be used at LI's definition site.
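The SSA construction referred to here happens outside the hunks shown. A rough sketch of the idea, assuming the llvm::SSAUpdater API (the helper name and the shape of the availability list are illustrative, not the pass's actual code):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include <utility>

// Given per-block available values for a load, build SSA form (inserting
// PHIs where needed) and return the value to use at the load itself.
static llvm::Value *
constructSSAForLoad(llvm::LoadInst *LI,
                    llvm::ArrayRef<std::pair<llvm::BasicBlock *, llvm::Value *>>
                        Available) {
  llvm::SSAUpdater Updater;
  Updater.Initialize(LI->getType(), LI->getName());
  // Register each block's available value; SSAUpdater places PHI nodes at
  // the necessary join points.
  for (const auto &BV : Available)
    Updater.AddAvailableValue(BV.first, BV.second);
  // Query the value live at the load's block.
  return Updater.GetValueInMiddleOfBlock(LI->getParent());
}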
@@ -789,6 +831,7 @@ static void reportMayClobberedLoad(LoadInst *LI, MemDepResult DepInfo, DominatorTree *DT, OptimizationRemarkEmitter *ORE) { using namespace ore; + User *OtherAccess = nullptr; OptimizationRemarkMissed R(DEBUG_TYPE, "LoadClobbered", LI); @@ -817,7 +860,6 @@ static void reportMayClobberedLoad(LoadInst *LI, MemDepResult DepInfo, bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo, Value *Address, AvailableValue &Res) { - assert((DepInfo.isDef() || DepInfo.isClobber()) && "expected a local dependence"); assert(LI->isUnordered() && "rules below are incorrect for ordered access"); @@ -879,8 +921,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo, Instruction *I = DepInfo.getInst(); dbgs() << " is clobbered by " << *I << '\n'; ); - - if (ORE->allowExtraAnalysis()) + if (ORE->allowExtraAnalysis(DEBUG_TYPE)) reportMayClobberedLoad(LI, DepInfo, DT, ORE); return false; @@ -949,7 +990,6 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo, void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, AvailValInBlkVect &ValuesPerBlock, UnavailBlkVect &UnavailableBlocks) { - // Filter out useless results (non-locals, etc). Keep track of the blocks // where we have a value available in repl, also keep track of whether we see // dependencies that produce an unknown value for the load (such as a call @@ -1009,7 +1049,32 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // backwards through predecessors if needed. BasicBlock *LoadBB = LI->getParent(); BasicBlock *TmpBB = LoadBB; + bool IsSafeToSpeculativelyExecute = isSafeToSpeculativelyExecute(LI); + // Check that there are no implicit control flow instructions above our load in + // its block. If there is an instruction that doesn't always pass the + // execution to the following instruction, then moving through it may become + // invalid. For example: + // + // int arr[LEN]; + // int index = ???; + // ... + // guard(0 <= index && index < LEN); + // use(arr[index]); + // + // It is illegal to move the array access to any point above the guard, + // because if the index is out of bounds we should deoptimize rather than + // access the array. + // Check that there is no guard in this block above our instruction. + if (!IsSafeToSpeculativelyExecute) { + auto It = FirstImplicitControlFlowInsts.find(TmpBB); + if (It != FirstImplicitControlFlowInsts.end()) { + assert(It->second->getParent() == TmpBB && + "Implicit control flow map broken?"); + if (OI->dominates(It->second, LI)) + return false; + } + } while (TmpBB->getSinglePredecessor()) { TmpBB = TmpBB->getSinglePredecessor(); if (TmpBB == LoadBB) // Infinite (unreachable) loop. @@ -1024,6 +1089,11 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // which it was not previously executed. if (TmpBB->getTerminator()->getNumSuccessors() != 1) return false; + + // Check that there is no implicit control flow in a block above. + if (!IsSafeToSpeculativelyExecute && + FirstImplicitControlFlowInsts.count(TmpBB)) + return false; } assert(TmpBB); @@ -1128,8 +1198,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, if (!CanDoPRE) { while (!NewInsts.empty()) { Instruction *I = NewInsts.pop_back_val(); - if (MD) MD->removeInstruction(I); - I->eraseFromParent(); + markInstructionForDeletion(I); } // HINT: Don't revert the edge-splitting as the following transformation may // also need to split these critical edges.
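A small C++ analogue of the guard scenario in the comment above (the guard function is a hypothetical stand-in for a deoptimization side exit): the load itself never traps, yet hoisting it above the guard would read out of bounds, which is exactly why the code above refuses to move loads past implicit control flow.

#include <cstdlib>

// Hypothetical guard: never returns when Cond is false (models a side exit).
static void guard(bool Cond) {
  if (!Cond)
    std::abort();
}

int useElement(const int *Arr, long Len, long Index) {
  guard(0 <= Index && Index < Len);
  // Safe only because the guard already executed; a compiler that hoisted
  // this load above the guard could access the array out of bounds.
  return Arr[Index];
}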
@@ -1206,8 +1275,10 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, if (V->getType()->isPtrOrPtrVectorTy()) MD->invalidateCachedPointerInfo(V); markInstructionForDeletion(LI); - ORE->emit(OptimizationRemark(DEBUG_TYPE, "LoadPRE", LI) - << "load eliminated by PRE"); + ORE->emit([&]() { + return OptimizationRemark(DEBUG_TYPE, "LoadPRE", LI) + << "load eliminated by PRE"; + }); ++NumPRELoad; return true; } @@ -1215,17 +1286,23 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, static void reportLoadElim(LoadInst *LI, Value *AvailableValue, OptimizationRemarkEmitter *ORE) { using namespace ore; - ORE->emit(OptimizationRemark(DEBUG_TYPE, "LoadElim", LI) - << "load of type " << NV("Type", LI->getType()) << " eliminated" - << setExtraArgs() << " in favor of " - << NV("InfavorOfValue", AvailableValue)); + + ORE->emit([&]() { + return OptimizationRemark(DEBUG_TYPE, "LoadElim", LI) + << "load of type " << NV("Type", LI->getType()) << " eliminated" + << setExtraArgs() << " in favor of " + << NV("InfavorOfValue", AvailableValue); + }); } /// Attempt to eliminate a load whose dependencies are /// non-local by performing PHI construction. bool GVN::processNonLocalLoad(LoadInst *LI) { // non-local speculations are not allowed under asan. - if (LI->getParent()->getParent()->hasFnAttribute(Attribute::SanitizeAddress)) + if (LI->getParent()->getParent()->hasFnAttribute( + Attribute::SanitizeAddress) || + LI->getParent()->getParent()->hasFnAttribute( + Attribute::SanitizeHWAddress)) return false; // Step 1: Find the non-local dependencies of the load. @@ -1322,6 +1399,11 @@ bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) { } markInstructionForDeletion(IntrinsicI); return false; + } else if (isa<Constant>(V)) { + // If it's not false, and constant, it must evaluate to true. This means our + // assume is assume(true), and thus, pointless, and we don't want to do + // anything more here. + return false; } Constant *True = ConstantInt::getTrue(V->getContext()); @@ -1452,6 +1534,106 @@ bool GVN::processLoad(LoadInst *L) { return false; } +/// Return a pair the first field showing the value number of \p Exp and the +/// second field showing whether it is a value number newly created. +std::pair<uint32_t, bool> +GVN::ValueTable::assignExpNewValueNum(Expression &Exp) { + uint32_t &e = expressionNumbering[Exp]; + bool CreateNewValNum = !e; + if (CreateNewValNum) { + Expressions.push_back(Exp); + if (ExprIdx.size() < nextValueNumber + 1) + ExprIdx.resize(nextValueNumber * 2); + e = nextValueNumber; + ExprIdx[nextValueNumber++] = nextExprNumber++; + } + return {e, CreateNewValNum}; +} + +/// Return whether all the values related with the same \p num are +/// defined in \p BB. +bool GVN::ValueTable::areAllValsInBB(uint32_t Num, const BasicBlock *BB, + GVN &Gvn) { + LeaderTableEntry *Vals = &Gvn.LeaderTable[Num]; + while (Vals && Vals->BB == BB) + Vals = Vals->Next; + return !Vals; +} + +/// Wrap phiTranslateImpl to provide caching functionality. +uint32_t GVN::ValueTable::phiTranslate(const BasicBlock *Pred, + const BasicBlock *PhiBlock, uint32_t Num, + GVN &Gvn) { + auto FindRes = PhiTranslateTable.find({Num, Pred}); + if (FindRes != PhiTranslateTable.end()) + return FindRes->second; + uint32_t NewNum = phiTranslateImpl(Pred, PhiBlock, Num, Gvn); + PhiTranslateTable.insert({{Num, Pred}, NewNum}); + return NewNum; +} + +/// Translate value number \p Num using phis, so that it has the values of +/// the phis in BB. 
+uint32_t GVN::ValueTable::phiTranslateImpl(const BasicBlock *Pred, + const BasicBlock *PhiBlock, + uint32_t Num, GVN &Gvn) { + if (PHINode *PN = NumberingPhi[Num]) { + for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) { + if (PN->getParent() == PhiBlock && PN->getIncomingBlock(i) == Pred) + if (uint32_t TransVal = lookup(PN->getIncomingValue(i), false)) + return TransVal; + } + return Num; + } + + // If any value related with Num is defined in a BB other than + // PhiBlock, it cannot depend on a phi in PhiBlock without going through + // a backedge. We can do an early exit in that case to save compile time. + if (!areAllValsInBB(Num, PhiBlock, Gvn)) + return Num; + + if (Num >= ExprIdx.size() || ExprIdx[Num] == 0) + return Num; + Expression Exp = Expressions[ExprIdx[Num]]; + + for (unsigned i = 0; i < Exp.varargs.size(); i++) { + // For InsertValue and ExtractValue, some varargs are index numbers + // instead of value numbers. Those index numbers should not be + // translated. + if ((i > 1 && Exp.opcode == Instruction::InsertValue) || + (i > 0 && Exp.opcode == Instruction::ExtractValue)) + continue; + Exp.varargs[i] = phiTranslate(Pred, PhiBlock, Exp.varargs[i], Gvn); + } + + if (Exp.commutative) { + assert(Exp.varargs.size() == 2 && "Unsupported commutative expression!"); + if (Exp.varargs[0] > Exp.varargs[1]) { + std::swap(Exp.varargs[0], Exp.varargs[1]); + uint32_t Opcode = Exp.opcode >> 8; + if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) + Exp.opcode = (Opcode << 8) | + CmpInst::getSwappedPredicate( + static_cast<CmpInst::Predicate>(Exp.opcode & 255)); + } + } + + if (uint32_t NewNum = expressionNumbering[Exp]) + return NewNum; + return Num; +} + +/// Erase stale entry from phiTranslate cache so phiTranslate can be computed +/// again. +void GVN::ValueTable::eraseTranslateCacheEntry(uint32_t Num, + const BasicBlock &CurrBlock) { + for (const BasicBlock *Pred : predecessors(&CurrBlock)) { + auto FindRes = PhiTranslateTable.find({Num, Pred}); + if (FindRes != PhiTranslateTable.end()) + PhiTranslateTable.erase(FindRes); + } +} + // In order to find a leader for a given value number at a // specific basic block, we first obtain the list of all Values for that number, // and then scan the list to find one whose block dominates the block in @@ -1496,6 +1678,13 @@ static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E, return Pred != nullptr; } +void GVN::assignBlockRPONumber(Function &F) { + uint32_t NextBlockNumber = 1; + ReversePostOrderTraversal<Function *> RPOT(&F); + for (BasicBlock *BB : RPOT) + BlockRPONumber[BB] = NextBlockNumber++; +} + // Tries to replace instruction with const, using information from // ReplaceWithConstMap. bool GVN::replaceOperandsWithConsts(Instruction *Instr) const { @@ -1827,6 +2016,8 @@ bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT, TLI = &RunTLI; VN.setAliasAnalysis(&RunAA); MD = RunMD; + OrderedInstructions OrderedInstrs(DT); + OI = &OrderedInstrs; VN.setMemDep(MD); ORE = RunORE; @@ -1857,6 +2048,7 @@ bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT, // Fabricate val-num for dead-code in order to suppress assertion in // performPRE().
assignValNumForDeadCode(); + assignBlockRPONumber(F); bool PREChanged = true; while (PREChanged) { PREChanged = performPRE(F); @@ -1908,14 +2100,26 @@ bool GVN::processBlock(BasicBlock *BB) { if (!AtStart) --BI; - for (SmallVectorImpl<Instruction *>::iterator I = InstrsToErase.begin(), - E = InstrsToErase.end(); I != E; ++I) { - DEBUG(dbgs() << "GVN removed: " << **I << '\n'); - if (MD) MD->removeInstruction(*I); - DEBUG(verifyRemoved(*I)); - (*I)->eraseFromParent(); + bool InvalidateImplicitCF = false; + const Instruction *MaybeFirstICF = FirstImplicitControlFlowInsts.lookup(BB); + for (auto *I : InstrsToErase) { + assert(I->getParent() == BB && "Removing instruction from wrong block?"); + DEBUG(dbgs() << "GVN removed: " << *I << '\n'); + if (MD) MD->removeInstruction(I); + DEBUG(verifyRemoved(I)); + if (MaybeFirstICF == I) { + // We have erased the first ICF in block. The map needs to be updated. + InvalidateImplicitCF = true; + // Do not keep dangling pointer on the erased instruction. + MaybeFirstICF = nullptr; + } + I->eraseFromParent(); } + + OI->invalidateBlock(BB); InstrsToErase.clear(); + if (InvalidateImplicitCF) + fillImplicitControlFlowInfo(BB); if (AtStart) BI = BB->begin(); @@ -1928,7 +2132,7 @@ bool GVN::processBlock(BasicBlock *BB) { // Instantiate an expression in a predecessor that lacked it. bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, - unsigned int ValNo) { + BasicBlock *Curr, unsigned int ValNo) { // Because we are going top-down through the block, all value numbers // will be available in the predecessor by the time we need them. Any // that weren't originally present will have been instantiated earlier @@ -1946,7 +2150,9 @@ bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, success = false; break; } - if (Value *V = findLeader(Pred, VN.lookup(Op))) { + uint32_t TValNo = + VN.phiTranslate(Pred, Curr, VN.lookup(Op), *this); + if (Value *V = findLeader(Pred, TValNo)) { Instr->setOperand(i, V); } else { success = false; @@ -1963,10 +2169,12 @@ bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, Instr->insertBefore(Pred->getTerminator()); Instr->setName(Instr->getName() + ".pre"); Instr->setDebugLoc(Instr->getDebugLoc()); - VN.add(Instr, ValNo); + + unsigned Num = VN.lookupOrAdd(Instr); + VN.add(Instr, Num); // Update the availability map to include the new instruction. - addToLeaderTable(ValNo, Instr, Pred); + addToLeaderTable(Num, Instr, Pred); return true; } @@ -2004,18 +2212,27 @@ bool GVN::performScalarPRE(Instruction *CurInst) { SmallVector<std::pair<Value *, BasicBlock *>, 8> predMap; for (BasicBlock *P : predecessors(CurrentBlock)) { - // We're not interested in PRE where the block is its - // own predecessor, or in blocks with predecessors - // that are not reachable. - if (P == CurrentBlock) { + // We're not interested in PRE where blocks with predecessors that are + // not reachable. + if (!DT->isReachableFromEntry(P)) { NumWithout = 2; break; - } else if (!DT->isReachableFromEntry(P)) { + } + // It is not safe to do PRE when P->CurrentBlock is a loop backedge, and + // when CurInst has operand defined in CurrentBlock (so it may be defined + // by phi in the loop header). 
+ if (BlockRPONumber[P] >= BlockRPONumber[CurrentBlock] && + llvm::any_of(CurInst->operands(), [&](const Use &U) { + if (auto *Inst = dyn_cast<Instruction>(U.get())) + return Inst->getParent() == CurrentBlock; + return false; + })) { NumWithout = 2; break; } - Value *predV = findLeader(P, ValNo); + uint32_t TValNo = VN.phiTranslate(P, CurrentBlock, ValNo, *this); + Value *predV = findLeader(P, TValNo); if (!predV) { predMap.push_back(std::make_pair(static_cast<Value *>(nullptr), P)); PREPred = P; @@ -2041,6 +2258,20 @@ bool GVN::performScalarPRE(Instruction *CurInst) { Instruction *PREInstr = nullptr; if (NumWithout != 0) { + if (!isSafeToSpeculativelyExecute(CurInst)) { + // It is only valid to insert a new instruction if the current instruction + // is always executed. An instruction with implicit control flow could + // prevent us from doing it. If we cannot speculate the execution, then + // PRE should be prohibited. + auto It = FirstImplicitControlFlowInsts.find(CurrentBlock); + if (It != FirstImplicitControlFlowInsts.end()) { + assert(It->second->getParent() == CurrentBlock && + "Implicit control flow map broken?"); + if (OI->dominates(It->second, CurInst)) + return false; + } + } + // Don't do PRE across indirect branch. if (isa<IndirectBrInst>(PREPred->getTerminator())) return false; @@ -2055,7 +2286,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) { } // We need to insert somewhere, so let's give it a shot PREInstr = CurInst->clone(); - if (!performScalarPREInsertion(PREInstr, PREPred, ValNo)) { + if (!performScalarPREInsertion(PREInstr, PREPred, CurrentBlock, ValNo)) { // If we failed insertion, make sure we remove the instruction. DEBUG(verifyRemoved(PREInstr)); PREInstr->deleteValue(); @@ -2065,7 +2296,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) { // Either we should have filled in the PRE instruction, or we should // not have needed insertions. - assert (PREInstr != nullptr || NumWithout == 0); + assert(PREInstr != nullptr || NumWithout == 0); ++NumGVNPRE; @@ -2074,13 +2305,19 @@ bool GVN::performScalarPRE(Instruction *CurInst) { PHINode::Create(CurInst->getType(), predMap.size(), CurInst->getName() + ".pre-phi", &CurrentBlock->front()); for (unsigned i = 0, e = predMap.size(); i != e; ++i) { - if (Value *V = predMap[i].first) + if (Value *V = predMap[i].first) { + // If we use an existing value in this phi, we have to patch the original + // value because the phi will be used to replace a later value. + patchReplacementInstruction(CurInst, V); Phi->addIncoming(V, predMap[i].second); - else + } else Phi->addIncoming(PREInstr, PREPred); } VN.add(Phi, ValNo); + // After creating a new PHI for ValNo, the phi translate result for ValNo will + // be changed, so erase the related stale entries in phi translate cache. + VN.eraseTranslateCacheEntry(ValNo, *CurrentBlock); addToLeaderTable(ValNo, Phi, CurrentBlock); Phi->setDebugLoc(CurInst->getDebugLoc()); CurInst->replaceAllUsesWith(Phi); @@ -2093,7 +2330,14 @@ bool GVN::performScalarPRE(Instruction *CurInst) { if (MD) MD->removeInstruction(CurInst); DEBUG(verifyRemoved(CurInst)); + bool InvalidateImplicitCF = + FirstImplicitControlFlowInsts.lookup(CurInst->getParent()) == CurInst; + // FIXME: Intended to be markInstructionForDeletion(CurInst), but it causes + // some assertion failures. 
+ OI->invalidateBlock(CurrentBlock); CurInst->eraseFromParent(); + if (InvalidateImplicitCF) + fillImplicitControlFlowInfo(CurrentBlock); ++NumGVNInstr; return true; @@ -2160,6 +2404,9 @@ bool GVN::iterateOnFunction(Function &F) { // RPOT walks the graph in its constructor and will not be invalidated during // processBlock. ReversePostOrderTraversal<Function *> RPOT(&F); + + for (BasicBlock *BB : RPOT) + fillImplicitControlFlowInfo(BB); for (BasicBlock *BB : RPOT) Changed |= processBlock(BB); @@ -2169,7 +2416,50 @@ void GVN::cleanupGlobalSets() { VN.clear(); LeaderTable.clear(); + BlockRPONumber.clear(); TableAllocator.Reset(); + FirstImplicitControlFlowInsts.clear(); +} + +void +GVN::fillImplicitControlFlowInfo(BasicBlock *BB) { + // Make sure that all marked instructions are actually deleted by this point, + // so that we don't need to care about omitting them. + assert(InstrsToErase.empty() && "Filling before removed all marked insns?"); + auto MayNotTransferExecutionToSuccessor = [&](const Instruction *I) { + // If a block's instruction doesn't always pass the control to its successor + // instruction, mark the block as having implicit control flow. We use this + // information to avoid wrong assumptions of the sort "if A is executed and + // B post-dominates A, then B is also executed". This is not true if there + // is an implicit control flow instruction (e.g. a guard) between them. + // + // TODO: Currently, isGuaranteedToTransferExecutionToSuccessor returns false + // for volatile stores and loads because they can trap. The discussion on + // whether or not it is correct is still ongoing. We might want to get rid + // of this logic in the future. Anyway, trapping instructions shouldn't + // introduce implicit control flow, so we explicitly allow them here. This + // must be removed once isGuaranteedToTransferExecutionToSuccessor is fixed. + if (isGuaranteedToTransferExecutionToSuccessor(I)) + return false; + if (isa<LoadInst>(I)) { + assert(cast<LoadInst>(I)->isVolatile() && + "Non-volatile load should transfer execution to successor!"); + return false; + } + if (isa<StoreInst>(I)) { + assert(cast<StoreInst>(I)->isVolatile() && + "Non-volatile store should transfer execution to successor!"); + return false; + } + return true; + }; + FirstImplicitControlFlowInsts.erase(BB); + + for (auto &I : *BB) + if (MayNotTransferExecutionToSuccessor(&I)) { + FirstImplicitControlFlowInsts[BB] = &I; + break; + } } /// Verify that the specified instruction does not occur in our @@ -2317,6 +2607,7 @@ void GVN::assignValNumForDeadCode() { class llvm::gvn::GVNLegacyPass : public FunctionPass { public: static char ID; // Pass identification, replacement for typeid + explicit GVNLegacyPass(bool NoLoads = false) : FunctionPass(ID), NoLoads(NoLoads) { initializeGVNLegacyPassPass(*PassRegistry::getPassRegistry()); } @@ -2360,11 +2651,6 @@ private: char GVNLegacyPass::ID = 0; -// The public interface to this file...
-FunctionPass *llvm::createGVNPass(bool NoLoads) { - return new GVNLegacyPass(NoLoads); -} - INITIALIZE_PASS_BEGIN(GVNLegacyPass, "gvn", "Global Value Numbering", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass) @@ -2374,3 +2660,8 @@ INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) INITIALIZE_PASS_END(GVNLegacyPass, "gvn", "Global Value Numbering", false, false) + +// The public interface to this file... +FunctionPass *llvm::createGVNPass(bool NoLoads) { + return new GVNLegacyPass(NoLoads); +} diff --git a/lib/Transforms/Scalar/GVNHoist.cpp b/lib/Transforms/Scalar/GVNHoist.cpp index 29de792bd248..c0cd1ea74a74 100644 --- a/lib/Transforms/Scalar/GVNHoist.cpp +++ b/lib/Transforms/Scalar/GVNHoist.cpp @@ -13,44 +13,72 @@ // 1. To reduce the code size. // 2. In some cases reduce critical path (by exposing more ILP). // +// The algorithm factors out the reachability of values such that multiple +// queries to find reachability of values are fast. This is based on finding the +// ANTIC points in the CFG which do not change during hoisting. The ANTIC points +// are basically the dominance-frontiers in the inverse graph. So we introduce a +// data structure (CHI nodes) to keep track of values flowing out of a basic +// block. We only do this for values with multiple occurrences in the function +// as they are the potential hoistable candidates. This approach allows us to +// hoist instructions to a basic block with more than two successors, as well as +// deal with infinite loops in a trivial way. +// +// Limitations: This pass does not hoist fully redundant expressions because +// they are already handled by GVN-PRE. It is advisable to run gvn-hoist before +// and after gvn-pre because gvn-pre creates opportunities for more instructions +// to be hoisted. +// // Hoisting may affect the performance in some cases. To mitigate that, hoisting // is disabled in the following cases. // 1. Scalars across calls. // 2. geps when corresponding load/store cannot be hoisted. -// -// TODO: Hoist from >2 successors. Currently GVNHoist will not hoist stores -// in this case because it works on two instructions at a time. 
-// entry: -// switch i32 %c1, label %exit1 [ -// i32 0, label %sw0 -// i32 1, label %sw1 -// ] -// -// sw0: -// store i32 1, i32* @G -// br label %exit -// -// sw1: -// store i32 1, i32* @G -// br label %exit -// -// exit1: -// store i32 1, i32* @G -// ret void -// exit: -// ret void //===----------------------------------------------------------------------===// #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/IteratedDominanceFrontier.h" +#include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" +#include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Utils/Local.h" +#include <algorithm> +#include <cassert> +#include <iterator> +#include <memory> +#include <utility> +#include <vector> using namespace llvm; @@ -69,6 +97,7 @@ static cl::opt<int> MaxHoistedThreshold("gvn-max-hoisted", cl::Hidden, cl::init(-1), cl::desc("Max number of instructions to hoist " "(default unlimited = -1)")); + static cl::opt<int> MaxNumberOfBBSInPath( "gvn-hoist-max-bbs", cl::Hidden, cl::init(4), cl::desc("Max number of basic blocks on the path between " @@ -86,34 +115,50 @@ static cl::opt<int> namespace llvm { -// Provides a sorting function based on the execution order of two instructions. -struct SortByDFSIn { -private: - DenseMap<const Value *, unsigned> &DFSNumber; +using BBSideEffectsSet = DenseMap<const BasicBlock *, bool>; +using SmallVecInsn = SmallVector<Instruction *, 4>; +using SmallVecImplInsn = SmallVectorImpl<Instruction *>; -public: - SortByDFSIn(DenseMap<const Value *, unsigned> &D) : DFSNumber(D) {} +// Each element of a hoisting list contains the basic block where to hoist and +// a list of instructions to be hoisted. +using HoistingPointInfo = std::pair<BasicBlock *, SmallVecInsn>; - // Returns true when A executes before B. - bool operator()(const Instruction *A, const Instruction *B) const { - const BasicBlock *BA = A->getParent(); - const BasicBlock *BB = B->getParent(); - unsigned ADFS, BDFS; - if (BA == BB) { - ADFS = DFSNumber.lookup(A); - BDFS = DFSNumber.lookup(B); - } else { - ADFS = DFSNumber.lookup(BA); - BDFS = DFSNumber.lookup(BB); - } - assert(ADFS && BDFS); - return ADFS < BDFS; - } -}; +using HoistingPointList = SmallVector<HoistingPointInfo, 4>; // A map from a pair of VNs to all the instructions with those VNs. 
-typedef DenseMap<std::pair<unsigned, unsigned>, SmallVector<Instruction *, 4>> - VNtoInsns; +using VNType = std::pair<unsigned, unsigned>; + +using VNtoInsns = DenseMap<VNType, SmallVector<Instruction *, 4>>; + +// CHI keeps information about values flowing out of a basic block. It is +// similar to PHI but in the inverse graph, and used for outgoing values on each +// edge. For conciseness, it is computed only for instructions with multiple +// occurrences in the CFG because they are the only hoistable candidates. +// A (CHI[{V, B, I1}, {V, C, I2}] +// / \ +// / \ +// B(I1) C (I2) +// The Value number for both I1 and I2 is V; the CHI node will save the +// instruction as well as the edge where the value is flowing to. +struct CHIArg { + VNType VN; + + // Edge destination (shows the direction of flow); may not be where I is. + BasicBlock *Dest; + + // The instruction (VN) which uses the values flowing out of CHI. + Instruction *I; + + bool operator==(const CHIArg &A) { return VN == A.VN; } + bool operator!=(const CHIArg &A) { return !(*this == A); } +}; + +using CHIIt = SmallVectorImpl<CHIArg>::iterator; +using CHIArgs = iterator_range<CHIIt>; +using OutValuesType = DenseMap<BasicBlock *, SmallVector<CHIArg, 2>>; +using InValuesType = + DenseMap<BasicBlock *, SmallVector<std::pair<VNType, Instruction *>, 2>>; + // An invalid value number used when inserting a single value number into // VNtoInsns. enum : unsigned { InvalidVN = ~2U }; @@ -192,16 +237,10 @@ public: } const VNtoInsns &getScalarVNTable() const { return VNtoCallsScalars; } - const VNtoInsns &getLoadVNTable() const { return VNtoCallsLoads; } - const VNtoInsns &getStoreVNTable() const { return VNtoCallsStores; } }; -typedef DenseMap<const BasicBlock *, bool> BBSideEffectsSet; -typedef SmallVector<Instruction *, 4> SmallVecInsn; -typedef SmallVectorImpl<Instruction *> SmallVecImplInsn; - static void combineKnownMetadata(Instruction *ReplInst, Instruction *I) { static const unsigned KnownIDs[] = { LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, @@ -216,15 +255,13 @@ static void combineKnownMetadata(Instruction *ReplInst, Instruction *I) { // cases reduce critical path (by exposing more ILP). class GVNHoist { public: - GVNHoist(DominatorTree *DT, AliasAnalysis *AA, MemoryDependenceResults *MD, - MemorySSA *MSSA) - : DT(DT), AA(AA), MD(MD), MSSA(MSSA), - MSSAUpdater(make_unique<MemorySSAUpdater>(MSSA)), - HoistingGeps(false), - HoistedCtr(0) - { } + GVNHoist(DominatorTree *DT, PostDominatorTree *PDT, AliasAnalysis *AA, + MemoryDependenceResults *MD, MemorySSA *MSSA) + : DT(DT), PDT(PDT), AA(AA), MD(MD), MSSA(MSSA), + MSSAUpdater(llvm::make_unique<MemorySSAUpdater>(MSSA)) {} bool run(Function &F) { + NumFuncArgs = F.arg_size(); VN.setDomTree(DT); VN.setAliasAnalysis(AA); VN.setMemDep(MD); @@ -241,7 +278,7 @@ public: int ChainLength = 0; // FIXME: use lazy evaluation of VN to avoid the fix-point computation. - while (1) { + while (true) { if (MaxChainLength != -1 && ++ChainLength >= MaxChainLength) return Res; @@ -261,18 +298,48 @@ public: return Res; } + // Copied from NewGVN.cpp + // This function provides global ranking of operations so that we can place + // them in a canonical order. Note that rank alone is not necessarily enough + // for a complete ordering, as constants all have the same rank. However, + // generally, we will simplify an operation with all constants so that it + // doesn't matter what order they appear in.
+ unsigned int rank(const Value *V) const { + // Prefer constants to undef to anything else + // Undef is a constant, have to check it first. + // Prefer smaller constants to constantexprs + if (isa<ConstantExpr>(V)) + return 2; + if (isa<UndefValue>(V)) + return 1; + if (isa<Constant>(V)) + return 0; + else if (auto *A = dyn_cast<Argument>(V)) + return 3 + A->getArgNo(); + + // Need to shift the instruction DFS by number of arguments + 3 to account + // for the constant and argument ranking above. + auto Result = DFSNumber.lookup(V); + if (Result > 0) + return 4 + NumFuncArgs + Result; + // Unreachable or something else, just return a really large number. + return ~0; + } + private: GVN::ValueTable VN; DominatorTree *DT; + PostDominatorTree *PDT; AliasAnalysis *AA; MemoryDependenceResults *MD; MemorySSA *MSSA; std::unique_ptr<MemorySSAUpdater> MSSAUpdater; - const bool HoistingGeps; DenseMap<const Value *, unsigned> DFSNumber; BBSideEffectsSet BBSideEffects; - DenseSet<const BasicBlock*> HoistBarrier; - int HoistedCtr; + DenseSet<const BasicBlock *> HoistBarrier; + SmallVector<BasicBlock *, 32> IDFBlocks; + unsigned NumFuncArgs; + const bool HoistingGeps = false; enum InsKind { Unknown, Scalar, Load, Store }; @@ -305,45 +372,7 @@ private: return false; } - // Return true when all paths from HoistBB to the end of the function pass - // through one of the blocks in WL. - bool hoistingFromAllPaths(const BasicBlock *HoistBB, - SmallPtrSetImpl<const BasicBlock *> &WL) { - - // Copy WL as the loop will remove elements from it. - SmallPtrSet<const BasicBlock *, 2> WorkList(WL.begin(), WL.end()); - - for (auto It = df_begin(HoistBB), E = df_end(HoistBB); It != E;) { - // There exists a path from HoistBB to the exit of the function if we are - // still iterating in DF traversal and we removed all instructions from - // the work list. - if (WorkList.empty()) - return false; - - const BasicBlock *BB = *It; - if (WorkList.erase(BB)) { - // Stop DFS traversal when BB is in the work list. - It.skipChildren(); - continue; - } - - // We reached the leaf Basic Block => not all paths have this instruction. - if (!BB->getTerminator()->getNumSuccessors()) - return false; - - // When reaching the back-edge of a loop, there may be a path through the - // loop that does not pass through B or C before exiting the loop. - if (successorDominate(BB, HoistBB)) - return false; - - // Increment DFS traversal when not skipping children. - ++It; - } - - return true; - } - - /* Return true when I1 appears before I2 in the instructions of BB. */ + // Return true when I1 appears before I2 in the instructions of BB. bool firstInBB(const Instruction *I1, const Instruction *I2) { assert(I1->getParent() == I2->getParent()); unsigned I1DFS = DFSNumber.lookup(I1); @@ -387,6 +416,25 @@ private: return false; } + bool hasEHhelper(const BasicBlock *BB, const BasicBlock *SrcBB, + int &NBBsOnAllPaths) { + // Stop walk once the limit is reached. + if (NBBsOnAllPaths == 0) + return true; + + // Impossible to hoist with exceptions on the path. + if (hasEH(BB)) + return true; + + // No such instruction after HoistBarrier in a basic block was + // selected for hoisting so instructions selected within basic block with + // a hoist barrier can be hoisted. + if ((BB != SrcBB) && HoistBarrier.count(BB)) + return true; + + return false; + } + // Return true when there are exception handling or loads of memory Def // between Def and NewPt. This function is only called for stores: Def is // the MemoryDef of the store to be hoisted. 
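To make the rank() tiers from earlier in this hunk concrete, here is a standalone toy with hypothetical inputs (only the tier arithmetic mirrors the pass; none of these names exist in LLVM):

#include <cassert>

// Toy analogue of rank(): plain constants sort first, then undef, then
// constant expressions, then arguments by position, then instructions in
// DFS order, shifted past all argument ranks.
enum class Kind { Constant, Undef, ConstExpr, Argument, Instruction };

static unsigned toyRank(Kind K, unsigned NumFuncArgs, unsigned Idx) {
  switch (K) {
  case Kind::Constant:    return 0;
  case Kind::Undef:       return 1;
  case Kind::ConstExpr:   return 2;
  case Kind::Argument:    return 3 + Idx;               // Idx = argument number
  case Kind::Instruction: return 4 + NumFuncArgs + Idx; // Idx = DFS number
  }
  return ~0U; // unreachable / unknown
}

int main() {
  // Constants sort before arguments, which sort before instructions.
  assert(toyRank(Kind::Constant, 2, 0) < toyRank(Kind::Argument, 2, 0));
  assert(toyRank(Kind::Argument, 2, 1) < toyRank(Kind::Instruction, 2, 1));
  return 0;
}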
@@ -414,18 +462,7 @@ private:
         continue;
       }
 
-      // Stop walk once the limit is reached.
-      if (NBBsOnAllPaths == 0)
-        return true;
-
-      // Impossible to hoist with exceptions on the path.
-      if (hasEH(BB))
-        return true;
-
-      // No such instruction after HoistBarrier in a basic block was
-      // selected for hoisting so instructions selected within basic block with
-      // a hoist barrier can be hoisted.
-      if ((BB != OldBB) && HoistBarrier.count(BB))
+      if (hasEHhelper(BB, OldBB, NBBsOnAllPaths))
         return true;
 
       // Check that we do not move a store past loads.
@@ -463,18 +500,7 @@ private:
         continue;
       }
 
-      // Stop walk once the limit is reached.
-      if (NBBsOnAllPaths == 0)
-        return true;
-
-      // Impossible to hoist with exceptions on the path.
-      if (hasEH(BB))
-        return true;
-
-      // No such instruction after HoistBarrier in a basic block was
-      // selected for hoisting so instructions selected within basic block with
-      // a hoist barrier can be hoisted.
-      if ((BB != SrcBB) && HoistBarrier.count(BB))
+      if (hasEHhelper(BB, SrcBB, NBBsOnAllPaths))
        return true;
 
       // -1 is unlimited number of blocks on all paths.
@@ -491,7 +517,6 @@ private:
   // to NewPt.
   bool safeToHoistLdSt(const Instruction *NewPt, const Instruction *OldPt,
                        MemoryUseOrDef *U, InsKind K, int &NBBsOnAllPaths) {
-
     // In place hoisting is safe.
     if (NewPt == OldPt)
       return true;
@@ -533,141 +558,258 @@ private:
 
   // Return true when it is safe to hoist scalar instructions from all blocks in
   // WL to HoistBB.
-  bool safeToHoistScalar(const BasicBlock *HoistBB,
-                         SmallPtrSetImpl<const BasicBlock *> &WL,
+  bool safeToHoistScalar(const BasicBlock *HoistBB, const BasicBlock *BB,
                          int &NBBsOnAllPaths) {
-    // Check that the hoisted expression is needed on all paths.
-    if (!hoistingFromAllPaths(HoistBB, WL))
-      return false;
+    return !hasEHOnPath(HoistBB, BB, NBBsOnAllPaths);
+  }
 
-    for (const BasicBlock *BB : WL)
-      if (hasEHOnPath(HoistBB, BB, NBBsOnAllPaths))
-        return false;
+  // In the inverse CFG, the dominance frontier of basic block (BB) is the
+  // point where ANTIC needs to be computed for instructions which are going
+  // to be hoisted. Since this point does not change during gvn-hoist,
+  // we compute it only once (on demand).
+  // The idea is inspired by:
+  // "Partial Redundancy Elimination in SSA Form"
+  // ROBERT KENNEDY, SUN CHAN, SHIN-MING LIU, RAYMOND LO, PENG TU and FRED CHOW
+  // They use a similar idea in the forward graph to find fully redundant and
+  // partially redundant expressions; here it is used in the inverse graph to
+  // find fully anticipable instructions at a merge point (post-dominator in
+  // the inverse CFG).
+  // Returns the edge via which an instruction in BB will get its values from.
+
+  // Returns true when the values are flowing out to each edge.
+  bool valueAnticipable(CHIArgs C, TerminatorInst *TI) const {
+    if (TI->getNumSuccessors() > (unsigned)std::distance(C.begin(), C.end()))
+      return false; // Not enough args in this CHI.
 
+    for (auto CHI : C) {
+      BasicBlock *Dest = CHI.Dest;
+      // Find if all the edges have values flowing out of BB.
+      bool Found = llvm::any_of(TI->successors(), [Dest](const BasicBlock *BB) {
+                     return BB == Dest; });
+      if (!Found)
+        return false;
+    }
     return true;
   }
 
-  // Each element of a hoisting list contains the basic block where to hoist and
-  // a list of instructions to be hoisted.
- typedef std::pair<BasicBlock *, SmallVecInsn> HoistingPointInfo; - typedef SmallVector<HoistingPointInfo, 4> HoistingPointList; + // Check if it is safe to hoist values tracked by CHI in the range + // [Begin, End) and accumulate them in Safe. + void checkSafety(CHIArgs C, BasicBlock *BB, InsKind K, + SmallVectorImpl<CHIArg> &Safe) { + int NumBBsOnAllPaths = MaxNumberOfBBSInPath; + for (auto CHI : C) { + Instruction *Insn = CHI.I; + if (!Insn) // No instruction was inserted in this CHI. + continue; + if (K == InsKind::Scalar) { + if (safeToHoistScalar(BB, Insn->getParent(), NumBBsOnAllPaths)) + Safe.push_back(CHI); + } else { + MemoryUseOrDef *UD = MSSA->getMemoryAccess(Insn); + if (safeToHoistLdSt(BB->getTerminator(), Insn, UD, K, NumBBsOnAllPaths)) + Safe.push_back(CHI); + } + } + } + + using RenameStackType = DenseMap<VNType, SmallVector<Instruction *, 2>>; - // Partition InstructionsToHoist into a set of candidates which can share a - // common hoisting point. The partitions are collected in HPL. IsScalar is - // true when the instructions in InstructionsToHoist are scalars. IsLoad is - // true when the InstructionsToHoist are loads, false when they are stores. - void partitionCandidates(SmallVecImplInsn &InstructionsToHoist, - HoistingPointList &HPL, InsKind K) { - // No need to sort for two instructions. - if (InstructionsToHoist.size() > 2) { - SortByDFSIn Pred(DFSNumber); - std::sort(InstructionsToHoist.begin(), InstructionsToHoist.end(), Pred); + // Push all the VNs corresponding to BB into RenameStack. + void fillRenameStack(BasicBlock *BB, InValuesType &ValueBBs, + RenameStackType &RenameStack) { + auto it1 = ValueBBs.find(BB); + if (it1 != ValueBBs.end()) { + // Iterate in reverse order to keep lower ranked values on the top. + for (std::pair<VNType, Instruction *> &VI : reverse(it1->second)) { + // Get the value of instruction I + DEBUG(dbgs() << "\nPushing on stack: " << *VI.second); + RenameStack[VI.first].push_back(VI.second); + } } + } - int NumBBsOnAllPaths = MaxNumberOfBBSInPath; + void fillChiArgs(BasicBlock *BB, OutValuesType &CHIBBs, + RenameStackType &RenameStack) { + // For each *predecessor* (because Post-DOM) of BB check if it has a CHI + for (auto Pred : predecessors(BB)) { + auto P = CHIBBs.find(Pred); + if (P == CHIBBs.end()) { + continue; + } + DEBUG(dbgs() << "\nLooking at CHIs in: " << Pred->getName();); + // A CHI is found (BB -> Pred is an edge in the CFG) + // Pop the stack until Top(V) = Ve. + auto &VCHI = P->second; + for (auto It = VCHI.begin(), E = VCHI.end(); It != E;) { + CHIArg &C = *It; + if (!C.Dest) { + auto si = RenameStack.find(C.VN); + // The Basic Block where CHI is must dominate the value we want to + // track in a CHI. In the PDom walk, there can be values in the + // stack which are not control dependent e.g., nested loop. 
+          if (si != RenameStack.end() && si->second.size() &&
+              DT->dominates(Pred, si->second.back()->getParent())) {
+            C.Dest = BB;                     // Assign the edge
+            C.I = si->second.pop_back_val(); // Assign the argument
+            DEBUG(dbgs() << "\nCHI Inserted in BB: " << C.Dest->getName()
+                         << *C.I << ", VN: " << C.VN.first << ", "
+                         << C.VN.second);
+          }
+          // Move to next CHI of a different value
+          It = std::find_if(It, VCHI.end(),
+                            [It](CHIArg &A) { return A != *It; });
+        } else
+          ++It;
+      }
+    }
+  }
 
-    SmallVecImplInsn::iterator II = InstructionsToHoist.begin();
-    SmallVecImplInsn::iterator Start = II;
-    Instruction *HoistPt = *II;
-    BasicBlock *HoistBB = HoistPt->getParent();
-    MemoryUseOrDef *UD;
-    if (K != InsKind::Scalar)
-      UD = MSSA->getMemoryAccess(HoistPt);
+  // Walk the post-dominator tree top-down and use a stack for each value to
+  // store the last value you see. When you hit a CHI from a given edge, the
+  // value to use as the argument is at the top of the stack; add the value to
+  // the CHI and pop.
+  void insertCHI(InValuesType &ValueBBs, OutValuesType &CHIBBs) {
+    auto Root = PDT->getNode(nullptr);
+    if (!Root)
+      return;
+    // Depth first walk on PDom tree to fill the CHIargs at each PDF.
+    RenameStackType RenameStack;
+    for (auto Node : depth_first(Root)) {
+      BasicBlock *BB = Node->getBlock();
+      if (!BB)
+        continue;
 
-    for (++II; II != InstructionsToHoist.end(); ++II) {
-      Instruction *Insn = *II;
-      BasicBlock *BB = Insn->getParent();
-      BasicBlock *NewHoistBB;
-      Instruction *NewHoistPt;
+      // Collect all values in BB and push to stack.
+      fillRenameStack(BB, ValueBBs, RenameStack);
 
-      if (BB == HoistBB) { // Both are in the same Basic Block.
-        NewHoistBB = HoistBB;
-        NewHoistPt = firstInBB(Insn, HoistPt) ? Insn : HoistPt;
-      } else {
-        // If the hoisting point contains one of the instructions,
-        // then hoist there, otherwise hoist before the terminator.
-        NewHoistBB = DT->findNearestCommonDominator(HoistBB, BB);
-        if (NewHoistBB == BB)
-          NewHoistPt = Insn;
-        else if (NewHoistBB == HoistBB)
-          NewHoistPt = HoistPt;
-        else
-          NewHoistPt = NewHoistBB->getTerminator();
-      }
+      // Fill outgoing values in each CHI corresponding to BB.
+      fillChiArgs(BB, CHIBBs, RenameStack);
+    }
+  }
 
-      SmallPtrSet<const BasicBlock *, 2> WL;
-      WL.insert(HoistBB);
-      WL.insert(BB);
+  // Walk all the CHI-nodes to find ones which have an empty entry and remove
+  // them. Then collect all the instructions which are safe to hoist and see if
+  // they form a list of anticipable values. OutValues contains CHIs
+  // corresponding to each basic block.
+  void findHoistableCandidates(OutValuesType &CHIBBs, InsKind K,
+                               HoistingPointList &HPL) {
+    auto cmpVN = [](const CHIArg &A, const CHIArg &B) { return A.VN < B.VN; };
 
-      if (K == InsKind::Scalar) {
-        if (safeToHoistScalar(NewHoistBB, WL, NumBBsOnAllPaths)) {
-          // Extend HoistPt to NewHoistPt.
-          HoistPt = NewHoistPt;
-          HoistBB = NewHoistBB;
-          continue;
-        }
-      } else {
-        // When NewBB already contains an instruction to be hoisted, the
-        // expression is needed on all paths.
-        // Check that the hoisted expression is needed on all paths: it is
-        // unsafe to hoist loads to a place where there may be a path not
-        // loading from the same address: for instance there may be a branch on
-        // which the address of the load may not be initialized.
-        if ((HoistBB == NewHoistBB || BB == NewHoistBB ||
-             hoistingFromAllPaths(NewHoistBB, WL)) &&
-            // Also check that it is safe to move the load or store from HoistPt
-            // to NewHoistPt.
-            safeToHoistLdSt(NewHoistPt, HoistPt, UD, K, NumBBsOnAllPaths) &&
-            safeToHoistLdSt(NewHoistPt, Insn, MSSA->getMemoryAccess(Insn),
-                            K, NumBBsOnAllPaths)) {
-          // Extend HoistPt to NewHoistPt.
-          HoistPt = NewHoistPt;
-          HoistBB = NewHoistBB;
-          continue;
-        }
-      }
+    // CHIArgs now have the outgoing values, so check for anticipability and
+    // accumulate hoistable candidates in HPL.
+    for (std::pair<BasicBlock *, SmallVector<CHIArg, 2>> &A : CHIBBs) {
+      BasicBlock *BB = A.first;
+      SmallVectorImpl<CHIArg> &CHIs = A.second;
+      // The vector of CHIs contains CHIs for different instructions.
+      // Sort the args according to their VNs, such that identical
+      // instructions are together.
+      std::stable_sort(CHIs.begin(), CHIs.end(), cmpVN);
+      auto TI = BB->getTerminator();
+      auto B = CHIs.begin();
+      // [PrevIt, PHIIt) form a range of CHIs which have identical VNs.
+      auto PHIIt = std::find_if(CHIs.begin(), CHIs.end(),
+                                [B](CHIArg &A) { return A != *B; });
+      auto PrevIt = CHIs.begin();
+      while (PrevIt != PHIIt) {
+        // Collect values which satisfy safety checks.
+        SmallVector<CHIArg, 2> Safe;
+        // We check for safety first because there might be multiple values in
+        // the same path, some of which are not safe to be hoisted, but overall
+        // each edge has at least one value which can be hoisted, making the
+        // value anticipable along that path.
+        checkSafety(make_range(PrevIt, PHIIt), BB, K, Safe);

-        // At this point it is not safe to extend the current hoisting to
-        // NewHoistPt: save the hoisting list so far.
-        if (std::distance(Start, II) > 1)
-          HPL.push_back({HoistBB, SmallVecInsn(Start, II)});
+        // The list of safe values should be anticipable at TI.
+        if (valueAnticipable(make_range(Safe.begin(), Safe.end()), TI)) {
+          HPL.push_back({BB, SmallVecInsn()});
+          SmallVecInsn &V = HPL.back().second;
+          for (auto B : Safe)
+            V.push_back(B.I);
+        }

-        // Start over from BB.
-        Start = II;
-        if (K != InsKind::Scalar)
-          UD = MSSA->getMemoryAccess(*Start);
-        HoistPt = Insn;
-        HoistBB = BB;
-        NumBBsOnAllPaths = MaxNumberOfBBSInPath;
+        // Check other VNs
+        PrevIt = PHIIt;
+        PHIIt = std::find_if(PrevIt, CHIs.end(),
+                             [PrevIt](CHIArg &A) { return A != *PrevIt; });
+      }
     }
-
-    // Save the last partition.
-    if (std::distance(Start, II) > 1)
-      HPL.push_back({HoistBB, SmallVecInsn(Start, II)});
   }

-  // Initialize HPL from Map.
+  // Compute insertion points for each value which can be fully anticipated at
+  // a dominator. HPL contains all such values.
   void computeInsertionPoints(const VNtoInsns &Map, HoistingPointList &HPL,
                               InsKind K) {
+    // Sort VNs based on their rankings.
+    std::vector<VNType> Ranks;
     for (const auto &Entry : Map) {
-      if (MaxHoistedThreshold != -1 && ++HoistedCtr > MaxHoistedThreshold)
-        return;
+      Ranks.push_back(Entry.first);
+    }
+
+    // TODO: Remove fully-redundant expressions.
+    // Get instruction from the Map, assume that all the Instructions
+    // with same VNs have same rank (this is an approximation).
+    std::sort(Ranks.begin(), Ranks.end(),
+              [this, &Map](const VNType &r1, const VNType &r2) {
+                return (rank(*Map.lookup(r1).begin()) <
+                        rank(*Map.lookup(r2).begin()));
+              });

-      const SmallVecInsn &V = Entry.second;
+    // - Sort VNs according to their rank, and start with the lowest ranked VN
+    // - Take a VN and for each instruction with same VN
+    //   - Find the dominance frontier in the inverse graph (PDF)
+    //   - Insert the chi-node at PDF
+    //   - Remove the chi-nodes with missing entries
+    //   - Remove values from CHI-nodes which do not truly flow out, e.g.,
+    //     modified along the path.
+    // - Collect the remaining values that are still anticipable
+    SmallVector<BasicBlock *, 2> IDFBlocks;
+    ReverseIDFCalculator IDFs(*PDT);
+    OutValuesType OutValue;
+    InValuesType InValue;
+    for (const auto &R : Ranks) {
+      const SmallVecInsn &V = Map.lookup(R);
       if (V.size() < 2)
         continue;
+      const VNType &VN = R;
+      SmallPtrSet<BasicBlock *, 2> VNBlocks;
+      for (auto &I : V) {
+        BasicBlock *BBI = I->getParent();
+        if (!hasEH(BBI))
+          VNBlocks.insert(BBI);
+      }
+      // Compute the Post Dominance Frontiers of each basic block.
+      // The dominance frontier of a block X in the reverse control
+      // graph is the set of blocks upon which X is control dependent.
+      // The following computes the blocks where CHI nodes need to be
+      // inserted for the occurrences of this value number.
+      IDFs.setDefiningBlocks(VNBlocks);
+      IDFs.calculate(IDFBlocks);

-      // Compute the insertion point and the list of expressions to be hoisted.
-      SmallVecInsn InstructionsToHoist;
-      for (auto I : V)
-        // We don't need to check for hoist-barriers here because if
-        // I->getParent() is a barrier then I precedes the barrier.
-        if (!hasEH(I->getParent()))
-          InstructionsToHoist.push_back(I);
-
-      if (!InstructionsToHoist.empty())
-        partitionCandidates(InstructionsToHoist, HPL, K);
+      // Make a map of BB vs instructions to be hoisted.
+      for (unsigned i = 0; i < V.size(); ++i) {
+        InValue[V[i]->getParent()].push_back(std::make_pair(VN, V[i]));
+      }
+      // Insert an empty CHI node for this VN. This is used to factor out
+      // basic blocks where the ANTIC can potentially change.
+      for (auto IDFB : IDFBlocks) { // TODO: Prune out useless CHI insertions.
+        for (unsigned i = 0; i < V.size(); ++i) {
+          CHIArg C = {VN, nullptr, nullptr};
+          // Ignore spurious PDFs.
+          if (DT->properlyDominates(IDFB, V[i]->getParent())) {
+            OutValue[IDFB].push_back(C);
+            DEBUG(dbgs() << "\nInserting a CHI for BB: " << IDFB->getName()
+                         << ", for Insn: " << *V[i]);
+          }
+        }
+      }
     }
+
+    // Insert CHI args at each PDF to iterate on factored graph of
+    // control dependence.
+    insertCHI(InValue, OutValue);
+    // Using the CHI args inserted at each PDF, find fully anticipable values.
+    findHoistableCandidates(OutValue, K, HPL);
   }

   // Return true when all operands of Instr are available at insertion point
@@ -714,7 +856,6 @@ private:
     Instruction *ClonedGep = Gep->clone();
     for (unsigned i = 0, e = Gep->getNumOperands(); i != e; ++i)
       if (Instruction *Op = dyn_cast<Instruction>(Gep->getOperand(i))) {
-
         // Check whether the operand is already available.
         if (DT->dominates(Op->getParent(), HoistPt))
           continue;
@@ -748,6 +889,88 @@
     Repl->replaceUsesOfWith(Gep, ClonedGep);
   }

+  void updateAlignment(Instruction *I, Instruction *Repl) {
+    if (auto *ReplacementLoad = dyn_cast<LoadInst>(Repl)) {
+      ReplacementLoad->setAlignment(
+          std::min(ReplacementLoad->getAlignment(),
+                   cast<LoadInst>(I)->getAlignment()));
+      ++NumLoadsRemoved;
+    } else if (auto *ReplacementStore = dyn_cast<StoreInst>(Repl)) {
+      ReplacementStore->setAlignment(
+          std::min(ReplacementStore->getAlignment(),
+                   cast<StoreInst>(I)->getAlignment()));
+      ++NumStoresRemoved;
+    } else if (auto *ReplacementAlloca = dyn_cast<AllocaInst>(Repl)) {
+      ReplacementAlloca->setAlignment(
+          std::max(ReplacementAlloca->getAlignment(),
+                   cast<AllocaInst>(I)->getAlignment()));
+    } else if (isa<CallInst>(Repl)) {
+      ++NumCallsRemoved;
+    }
+  }
+
+  // Remove all the instructions in Candidates and replace their usage with Repl.
+  // Returns the number of instructions removed.
+ unsigned rauw(const SmallVecInsn &Candidates, Instruction *Repl, + MemoryUseOrDef *NewMemAcc) { + unsigned NR = 0; + for (Instruction *I : Candidates) { + if (I != Repl) { + ++NR; + updateAlignment(I, Repl); + if (NewMemAcc) { + // Update the uses of the old MSSA access with NewMemAcc. + MemoryAccess *OldMA = MSSA->getMemoryAccess(I); + OldMA->replaceAllUsesWith(NewMemAcc); + MSSAUpdater->removeMemoryAccess(OldMA); + } + + Repl->andIRFlags(I); + combineKnownMetadata(Repl, I); + I->replaceAllUsesWith(Repl); + // Also invalidate the Alias Analysis cache. + MD->removeInstruction(I); + I->eraseFromParent(); + } + } + return NR; + } + + // Replace all Memory PHI usage with NewMemAcc. + void raMPHIuw(MemoryUseOrDef *NewMemAcc) { + SmallPtrSet<MemoryPhi *, 4> UsePhis; + for (User *U : NewMemAcc->users()) + if (MemoryPhi *Phi = dyn_cast<MemoryPhi>(U)) + UsePhis.insert(Phi); + + for (MemoryPhi *Phi : UsePhis) { + auto In = Phi->incoming_values(); + if (llvm::all_of(In, [&](Use &U) { return U == NewMemAcc; })) { + Phi->replaceAllUsesWith(NewMemAcc); + MSSAUpdater->removeMemoryAccess(Phi); + } + } + } + + // Remove all other instructions and replace them with Repl. + unsigned removeAndReplace(const SmallVecInsn &Candidates, Instruction *Repl, + BasicBlock *DestBB, bool MoveAccess) { + MemoryUseOrDef *NewMemAcc = MSSA->getMemoryAccess(Repl); + if (MoveAccess && NewMemAcc) { + // The definition of this ld/st will not change: ld/st hoisting is + // legal when the ld/st is not moved past its current definition. + MSSAUpdater->moveToPlace(NewMemAcc, DestBB, MemorySSA::End); + } + + // Replace all other instructions with Repl with memory access NewMemAcc. + unsigned NR = rauw(Candidates, Repl, NewMemAcc); + + // Remove MemorySSA phi nodes with the same arguments. + if (NewMemAcc) + raMPHIuw(NewMemAcc); + return NR; + } + // In the case Repl is a load or a store, we make all their GEPs // available: GEPs are not hoisted by default to avoid the address // computations to be hoisted without the associated load or store. @@ -789,11 +1012,11 @@ private: for (const HoistingPointInfo &HP : HPL) { // Find out whether we already have one of the instructions in HoistPt, // in which case we do not have to move it. - BasicBlock *HoistPt = HP.first; + BasicBlock *DestBB = HP.first; const SmallVecInsn &InstructionsToHoist = HP.second; Instruction *Repl = nullptr; for (Instruction *I : InstructionsToHoist) - if (I->getParent() == HoistPt) + if (I->getParent() == DestBB) // If there are two instructions in HoistPt to be hoisted in place: // update Repl to be the first one, such that we can rename the uses // of the second based on the first. @@ -805,7 +1028,7 @@ private: bool MoveAccess = true; if (Repl) { // Repl is already in HoistPt: it remains in place. - assert(allOperandsAvailable(Repl, HoistPt) && + assert(allOperandsAvailable(Repl, DestBB) && "instruction depends on operands that are not available"); MoveAccess = false; } else { @@ -816,40 +1039,26 @@ private: // We can move Repl in HoistPt only when all operands are available. // The order in which hoistings are done may influence the availability // of operands. - if (!allOperandsAvailable(Repl, HoistPt)) { - + if (!allOperandsAvailable(Repl, DestBB)) { // When HoistingGeps there is nothing more we can do to make the // operands available: just continue. if (HoistingGeps) continue; // When not HoistingGeps we need to copy the GEPs. 
- if (!makeGepOperandsAvailable(Repl, HoistPt, InstructionsToHoist)) + if (!makeGepOperandsAvailable(Repl, DestBB, InstructionsToHoist)) continue; } // Move the instruction at the end of HoistPt. - Instruction *Last = HoistPt->getTerminator(); + Instruction *Last = DestBB->getTerminator(); MD->removeInstruction(Repl); Repl->moveBefore(Last); DFSNumber[Repl] = DFSNumber[Last]++; } - MemoryAccess *NewMemAcc = MSSA->getMemoryAccess(Repl); - - if (MoveAccess) { - if (MemoryUseOrDef *OldMemAcc = - dyn_cast_or_null<MemoryUseOrDef>(NewMemAcc)) { - // The definition of this ld/st will not change: ld/st hoisting is - // legal when the ld/st is not moved past its current definition. - MemoryAccess *Def = OldMemAcc->getDefiningAccess(); - NewMemAcc = - MSSAUpdater->createMemoryAccessInBB(Repl, Def, HoistPt, MemorySSA::End); - OldMemAcc->replaceAllUsesWith(NewMemAcc); - MSSAUpdater->removeMemoryAccess(OldMemAcc); - } - } + NR += removeAndReplace(InstructionsToHoist, Repl, DestBB, MoveAccess); if (isa<LoadInst>(Repl)) ++NL; @@ -859,59 +1068,6 @@ private: ++NC; else // Scalar ++NI; - - // Remove and rename all other instructions. - for (Instruction *I : InstructionsToHoist) - if (I != Repl) { - ++NR; - if (auto *ReplacementLoad = dyn_cast<LoadInst>(Repl)) { - ReplacementLoad->setAlignment( - std::min(ReplacementLoad->getAlignment(), - cast<LoadInst>(I)->getAlignment())); - ++NumLoadsRemoved; - } else if (auto *ReplacementStore = dyn_cast<StoreInst>(Repl)) { - ReplacementStore->setAlignment( - std::min(ReplacementStore->getAlignment(), - cast<StoreInst>(I)->getAlignment())); - ++NumStoresRemoved; - } else if (auto *ReplacementAlloca = dyn_cast<AllocaInst>(Repl)) { - ReplacementAlloca->setAlignment( - std::max(ReplacementAlloca->getAlignment(), - cast<AllocaInst>(I)->getAlignment())); - } else if (isa<CallInst>(Repl)) { - ++NumCallsRemoved; - } - - if (NewMemAcc) { - // Update the uses of the old MSSA access with NewMemAcc. - MemoryAccess *OldMA = MSSA->getMemoryAccess(I); - OldMA->replaceAllUsesWith(NewMemAcc); - MSSAUpdater->removeMemoryAccess(OldMA); - } - - Repl->andIRFlags(I); - combineKnownMetadata(Repl, I); - I->replaceAllUsesWith(Repl); - // Also invalidate the Alias Analysis cache. - MD->removeInstruction(I); - I->eraseFromParent(); - } - - // Remove MemorySSA phi nodes with the same arguments. - if (NewMemAcc) { - SmallPtrSet<MemoryPhi *, 4> UsePhis; - for (User *U : NewMemAcc->users()) - if (MemoryPhi *Phi = dyn_cast<MemoryPhi>(U)) - UsePhis.insert(Phi); - - for (auto *Phi : UsePhis) { - auto In = Phi->incoming_values(); - if (all_of(In, [&](Use &U) { return U == NewMemAcc; })) { - Phi->replaceAllUsesWith(NewMemAcc); - MSSAUpdater->removeMemoryAccess(Phi); - } - } - } } NumHoisted += NL + NS + NC + NI; @@ -935,8 +1091,8 @@ private: // If I1 cannot guarantee progress, subsequent instructions // in BB cannot be hoisted anyways. if (!isGuaranteedToTransferExecutionToSuccessor(&I1)) { - HoistBarrier.insert(BB); - break; + HoistBarrier.insert(BB); + break; } // Only hoist the first instructions in BB up to MaxDepthInBB. Hoisting // deeper may increase the register pressure and compilation time. 
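The two rules enforced by the barrier scan above can be seen in a small source-level example (hypothetical input, not part of this commit): identical expressions below a branch share a value number and can be hoisted into the common dominator, but once a block contains an instruction that is not guaranteed to transfer execution to its successor, the scan breaks and nothing after that point in the block becomes a candidate.

#include <cstdlib>

int g;

// Stands in for any call that is not guaranteed to transfer execution
// to its successor (it may exit), making its block a hoist barrier.
void may_exit(int x) {
  if (x < 0)
    std::exit(1);
}

int f(int a, int b, bool c) {
  int r;
  if (c) {
    r = a * b;   // same value number as the multiply below
  } else {
    may_exit(g); // barrier: candidate collection stops here for this block
    r = a * b;   // below the barrier, so never collected as a candidate
  }
  return r + g;  // without the barrier, a * b could be hoisted above the if
}

int main() { return f(2, 3, true) == 6 ? 0 : 1; }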
@@ -954,7 +1110,8 @@ private: else if (auto *Call = dyn_cast<CallInst>(&I1)) { if (auto *Intr = dyn_cast<IntrinsicInst>(Call)) { if (isa<DbgInfoIntrinsic>(Intr) || - Intr->getIntrinsicID() == Intrinsic::assume) + Intr->getIntrinsicID() == Intrinsic::assume || + Intr->getIntrinsicID() == Intrinsic::sideeffect) continue; } if (Call->mayHaveSideEffects()) @@ -996,16 +1153,18 @@ public: if (skipFunction(F)) return false; auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(); auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); auto &MD = getAnalysis<MemoryDependenceWrapperPass>().getMemDep(); auto &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA(); - GVNHoist G(&DT, &AA, &MD, &MSSA); + GVNHoist G(&DT, &PDT, &AA, &MD, &MSSA); return G.run(F); } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<PostDominatorTreeWrapperPass>(); AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<MemoryDependenceWrapperPass>(); AU.addRequired<MemorySSAWrapperPass>(); @@ -1014,14 +1173,16 @@ public: AU.addPreserved<GlobalsAAWrapperPass>(); } }; -} // namespace + +} // end namespace llvm PreservedAnalyses GVNHoistPass::run(Function &F, FunctionAnalysisManager &AM) { DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F); + PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F); AliasAnalysis &AA = AM.getResult<AAManager>(F); MemoryDependenceResults &MD = AM.getResult<MemoryDependenceAnalysis>(F); MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA(); - GVNHoist G(&DT, &AA, &MD, &MSSA); + GVNHoist G(&DT, &PDT, &AA, &MD, &MSSA); if (!G.run(F)) return PreservedAnalyses::all(); @@ -1033,6 +1194,7 @@ PreservedAnalyses GVNHoistPass::run(Function &F, FunctionAnalysisManager &AM) { } char GVNHoistLegacyPass::ID = 0; + INITIALIZE_PASS_BEGIN(GVNHoistLegacyPass, "gvn-hoist", "Early GVN Hoisting of Expressions", false, false) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass) diff --git a/lib/Transforms/Scalar/GVNSink.cpp b/lib/Transforms/Scalar/GVNSink.cpp index 5fd2dfc118b4..814a62cd7d65 100644 --- a/lib/Transforms/Scalar/GVNSink.cpp +++ b/lib/Transforms/Scalar/GVNSink.cpp @@ -1,4 +1,4 @@ -//===- GVNSink.cpp - sink expressions into successors -------------------===// +//===- GVNSink.cpp - sink expressions into successors ---------------------===// // // The LLVM Compiler Infrastructure // @@ -31,33 +31,54 @@ /// replace %a1 with %c1, will it contribute in an equivalent way to all /// successive instructions?". The PostValueTable class in GVN provides this /// mapping. 
-/// +// //===----------------------------------------------------------------------===// +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/Hashing.h" +#include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/PostOrderIterator.h" -#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Analysis/MemorySSA.h" -#include "llvm/Analysis/PostDominators.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Verifier.h" -#include "llvm/Support/MathExtras.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/ArrayRecycler.h" +#include "llvm/Support/AtomicOrdering.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Scalar/GVNExpression.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" -#include <unordered_set> +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <iterator> +#include <utility> + using namespace llvm; #define DEBUG_TYPE "gvn-sink" @@ -72,8 +93,8 @@ LLVM_DUMP_METHOD void Expression::dump() const { dbgs() << "\n"; } -} -} +} // end namespace GVNExpression +} // end namespace llvm namespace { @@ -97,7 +118,7 @@ static bool isMemoryInst(const Instruction *I) { /// list returned by operator*. class LockstepReverseIterator { ArrayRef<BasicBlock *> Blocks; - SmallPtrSet<BasicBlock *, 4> ActiveBlocks; + SmallSetVector<BasicBlock *, 4> ActiveBlocks; SmallVector<Instruction *, 4> Insts; bool Fail; @@ -115,7 +136,7 @@ public: for (BasicBlock *BB : Blocks) { if (BB->size() <= 1) { // Block wasn't big enough - only contained a terminator. - ActiveBlocks.erase(BB); + ActiveBlocks.remove(BB); continue; } Insts.push_back(BB->getTerminator()->getPrevNode()); @@ -126,13 +147,20 @@ public: bool isValid() const { return !Fail; } ArrayRef<Instruction *> operator*() const { return Insts; } - SmallPtrSet<BasicBlock *, 4> &getActiveBlocks() { return ActiveBlocks; } - void restrictToBlocks(SmallPtrSetImpl<BasicBlock *> &Blocks) { + // Note: This needs to return a SmallSetVector as the elements of + // ActiveBlocks will be later copied to Blocks using std::copy. The + // resultant order of elements in Blocks needs to be deterministic. + // Using SmallPtrSet instead causes non-deterministic order while + // copying. And we cannot simply sort Blocks as they need to match the + // corresponding Values. 
+ SmallSetVector<BasicBlock *, 4> &getActiveBlocks() { return ActiveBlocks; } + + void restrictToBlocks(SmallSetVector<BasicBlock *, 4> &Blocks) { for (auto II = Insts.begin(); II != Insts.end();) { if (std::find(Blocks.begin(), Blocks.end(), (*II)->getParent()) == Blocks.end()) { - ActiveBlocks.erase((*II)->getParent()); + ActiveBlocks.remove((*II)->getParent()); II = Insts.erase(II); } else { ++II; @@ -146,7 +174,7 @@ public: SmallVector<Instruction *, 4> NewInsts; for (auto *Inst : Insts) { if (Inst == &Inst->getParent()->front()) - ActiveBlocks.erase(Inst->getParent()); + ActiveBlocks.remove(Inst->getParent()); else NewInsts.push_back(Inst->getPrevNode()); } @@ -180,14 +208,14 @@ struct SinkingInstructionCandidate { NumExtraPHIs) // PHIs are expensive, so make sure they're worth it. - SplitEdgeCost; } + bool operator>(const SinkingInstructionCandidate &Other) const { return Cost > Other.Cost; } }; #ifndef NDEBUG -llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, - const SinkingInstructionCandidate &C) { +raw_ostream &operator<<(raw_ostream &OS, const SinkingInstructionCandidate &C) { OS << "<Candidate Cost=" << C.Cost << " #Blocks=" << C.NumBlocks << " #Insts=" << C.NumInstructions << " #PHIs=" << C.NumPHIs << ">"; return OS; @@ -204,17 +232,20 @@ class ModelledPHI { SmallVector<BasicBlock *, 4> Blocks; public: - ModelledPHI() {} + ModelledPHI() = default; + ModelledPHI(const PHINode *PN) { + // BasicBlock comes first so we sort by basic block pointer order, then by value pointer order. + SmallVector<std::pair<BasicBlock *, Value *>, 4> Ops; for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) - Blocks.push_back(PN->getIncomingBlock(I)); - std::sort(Blocks.begin(), Blocks.end()); - - // This assumes the PHI is already well-formed and there aren't conflicting - // incoming values for the same block. - for (auto *B : Blocks) - Values.push_back(PN->getIncomingValueForBlock(B)); + Ops.push_back({PN->getIncomingBlock(I), PN->getIncomingValue(I)}); + std::sort(Ops.begin(), Ops.end()); + for (auto &P : Ops) { + Blocks.push_back(P.first); + Values.push_back(P.second); + } } + /// Create a dummy ModelledPHI that will compare unequal to any other ModelledPHI /// without the same ID. /// \note This is specifically for DenseMapInfo - do not use this! @@ -241,7 +272,7 @@ public: /// Restrict the PHI's contents down to only \c NewBlocks. /// \c NewBlocks must be a subset of \c this->Blocks. 
- void restrictToBlocks(const SmallPtrSetImpl<BasicBlock *> &NewBlocks) { + void restrictToBlocks(const SmallSetVector<BasicBlock *, 4> &NewBlocks) { auto BI = Blocks.begin(); auto VI = Values.begin(); while (BI != Blocks.end()) { @@ -261,19 +292,23 @@ public: ArrayRef<Value *> getValues() const { return Values; } bool areAllIncomingValuesSame() const { - return all_of(Values, [&](Value *V) { return V == Values[0]; }); + return llvm::all_of(Values, [&](Value *V) { return V == Values[0]; }); } + bool areAllIncomingValuesSameType() const { - return all_of( + return llvm::all_of( Values, [&](Value *V) { return V->getType() == Values[0]->getType(); }); } + bool areAnyIncomingValuesConstant() const { - return any_of(Values, [&](Value *V) { return isa<Constant>(V); }); + return llvm::any_of(Values, [&](Value *V) { return isa<Constant>(V); }); } + // Hash functor unsigned hash() const { return (unsigned)hash_combine_range(Values.begin(), Values.end()); } + bool operator==(const ModelledPHI &Other) const { return Values == Other.Values && Blocks == Other.Blocks; } @@ -284,17 +319,20 @@ template <typename ModelledPHI> struct DenseMapInfo { static ModelledPHI Dummy = ModelledPHI::createDummy(0); return Dummy; } + static inline ModelledPHI &getTombstoneKey() { static ModelledPHI Dummy = ModelledPHI::createDummy(1); return Dummy; } + static unsigned getHashValue(const ModelledPHI &V) { return V.hash(); } + static bool isEqual(const ModelledPHI &LHS, const ModelledPHI &RHS) { return LHS == RHS; } }; -typedef DenseSet<ModelledPHI, DenseMapInfo<ModelledPHI>> ModelledPHISet; +using ModelledPHISet = DenseSet<ModelledPHI, DenseMapInfo<ModelledPHI>>; //===----------------------------------------------------------------------===// // ValueTable @@ -325,10 +363,11 @@ public: op_push_back(U.getUser()); std::sort(op_begin(), op_end()); } + void setMemoryUseOrder(unsigned MUO) { MemoryUseOrder = MUO; } void setVolatile(bool V) { Volatile = V; } - virtual hash_code getHashValue() const { + hash_code getHashValue() const override { return hash_combine(GVNExpression::BasicExpression::getHashValue(), MemoryUseOrder, Volatile); } @@ -348,7 +387,7 @@ class ValueTable { DenseMap<size_t, uint32_t> HashNumbering; BumpPtrAllocator Allocator; ArrayRecycler<Value *> Recycler; - uint32_t nextValueNumber; + uint32_t nextValueNumber = 1; /// Create an expression for I based on its opcode and its uses. If I /// touches or reads memory, the expression is also based upon its memory @@ -378,6 +417,8 @@ class ValueTable { } public: + ValueTable() = default; + /// Returns the value number for the specified value, assigning /// it a new number if it did not have one before. uint32_t lookupOrAdd(Value *V) { @@ -483,8 +524,6 @@ public: nextValueNumber = 1; } - ValueTable() : nextValueNumber(1) {} - /// \c Inst uses or touches memory. Return an ID describing the memory state /// at \c Inst such that if getMemoryUseOrder(I1) == getMemoryUseOrder(I2), /// the exact same memory operations happen after I1 and I2. 
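The ModelledPHI normalization that the hunks above rely on boils down to sorting the incoming (block, value) pairs so that equality and hashing are independent of the order in which predecessors are listed. A standalone sketch of just that invariant (toy types, not the LLVM classes):

#include <algorithm>
#include <cassert>
#include <utility>
#include <vector>

struct Block {};
struct Value {};

struct ModelledPhi {
  // Canonical order: by block pointer, then by value pointer.
  std::vector<std::pair<const Block *, const Value *>> Ops;

  explicit ModelledPhi(std::vector<std::pair<const Block *, const Value *>> In)
      : Ops(std::move(In)) {
    std::sort(Ops.begin(), Ops.end());
  }
  bool operator==(const ModelledPhi &O) const { return Ops == O.Ops; }
};

int main() {
  Block B1, B2;
  Value V1, V2;
  ModelledPhi A({{&B1, &V1}, {&B2, &V2}});
  ModelledPhi B({{&B2, &V2}, {&B1, &V1}}); // same PHI, reversed operand order
  assert(A == B); // compares equal after canonicalization
}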
@@ -519,7 +558,8 @@ public: class GVNSink { public: - GVNSink() : VN() {} + GVNSink() = default; + bool run(Function &F) { DEBUG(dbgs() << "GVNSink: running on function @" << F.getName() << "\n"); @@ -576,8 +616,9 @@ private: void foldPointlessPHINodes(BasicBlock *BB) { auto I = BB->begin(); while (PHINode *PN = dyn_cast<PHINode>(I++)) { - if (!all_of(PN->incoming_values(), - [&](const Value *V) { return V == PN->getIncomingValue(0); })) + if (!llvm::all_of(PN->incoming_values(), [&](const Value *V) { + return V == PN->getIncomingValue(0); + })) continue; if (PN->getIncomingValue(0) != PN) PN->replaceAllUsesWith(PN->getIncomingValue(0)); @@ -624,7 +665,7 @@ Optional<SinkingInstructionCandidate> GVNSink::analyzeInstructionForSinking( SmallVector<Instruction *, 4> NewInsts; for (auto *I : Insts) { if (VN.lookup(I) != VNumToSink) - ActivePreds.erase(I->getParent()); + ActivePreds.remove(I->getParent()); else NewInsts.push_back(I); } @@ -794,7 +835,7 @@ void GVNSink::sinkLastInstruction(ArrayRef<BasicBlock *> Blocks, SmallVector<Value *, 4> NewOperands; for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O) { - bool NeedPHI = any_of(Insts, [&I0, O](const Instruction *I) { + bool NeedPHI = llvm::any_of(Insts, [&I0, O](const Instruction *I) { return I->getOperand(O) != I0->getOperand(O); }); if (!NeedPHI) { @@ -860,7 +901,8 @@ public: AU.addPreserved<GlobalsAAWrapperPass>(); } }; -} // namespace + +} // end anonymous namespace PreservedAnalyses GVNSinkPass::run(Function &F, FunctionAnalysisManager &AM) { GVNSink G; @@ -873,6 +915,7 @@ PreservedAnalyses GVNSinkPass::run(Function &F, FunctionAnalysisManager &AM) { } char GVNSinkLegacyPass::ID = 0; + INITIALIZE_PASS_BEGIN(GVNSinkLegacyPass, "gvn-sink", "Early GVN sinking of Expressions", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) diff --git a/lib/Transforms/Scalar/GuardWidening.cpp b/lib/Transforms/Scalar/GuardWidening.cpp index fb7c6e15758d..c4aeccb85ca7 100644 --- a/lib/Transforms/Scalar/GuardWidening.cpp +++ b/lib/Transforms/Scalar/GuardWidening.cpp @@ -664,6 +664,7 @@ PreservedAnalyses GuardWideningPass::run(Function &F, return PA; } +#ifndef NDEBUG StringRef GuardWideningImpl::scoreTypeToString(WideningScore WS) { switch (WS) { case WS_IllegalOrNegative: @@ -678,6 +679,7 @@ StringRef GuardWideningImpl::scoreTypeToString(WideningScore WS) { llvm_unreachable("Fully covered switch above!"); } +#endif char GuardWideningLegacyPass::ID = 0; diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index 10782963177c..74d6014d3e3d 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -25,27 +25,54 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/IndVarSimplify.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" #include 
"llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/BasicBlock.h" -#include "llvm/IR/CFG.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" @@ -53,6 +80,10 @@ #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" +#include <cassert> +#include <cstdint> +#include <utility> + using namespace llvm; #define DEBUG_TYPE "indvars" @@ -91,6 +122,7 @@ DisableLFTR("disable-lftr", cl::Hidden, cl::init(false), cl::desc("Disable Linear Function Test Replace optimization")); namespace { + struct RewritePhi; class IndVarSimplify { @@ -131,7 +163,8 @@ public: bool run(Loop *L); }; -} + +} // end anonymous namespace /// Return true if the SCEV expansion generated by the rewriter can replace the /// original value. SCEV guarantees that it produces the same value, but the way @@ -251,7 +284,6 @@ static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) { /// is converted into /// for(int i = 0; i < 10000; ++i) /// bar((double)i); -/// void IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) { unsigned IncomingEdge = L->contains(PN->getIncomingBlock(0)); unsigned BackEdge = IncomingEdge^1; @@ -305,7 +337,6 @@ void IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) { L->contains(TheBr->getSuccessor(1)))) return; - // If it isn't a comparison with an integer-as-fp (the exit value), we can't // transform it. ConstantFP *ExitValueVal = dyn_cast<ConstantFP>(Compare->getOperand(1)); @@ -373,7 +404,6 @@ void IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) { // transform the IV. if (Leftover != 0 && int32_t(ExitValue+IncValue) < ExitValue) return; - } else { // If we have a negative stride, we require the init to be greater than the // exit value. @@ -452,7 +482,6 @@ void IndVarSimplify::rewriteNonIntegerIVs(Loop *L) { // First step. Check to see if there are any floating-point recurrences. // If there are, change them into integer recurrences, permitting analysis by // the SCEV routines. - // BasicBlock *Header = L->getHeader(); SmallVector<WeakTrackingVH, 8> PHIs; @@ -472,18 +501,26 @@ void IndVarSimplify::rewriteNonIntegerIVs(Loop *L) { } namespace { + // Collect information about PHI nodes which can be transformed in // rewriteLoopExitValues. struct RewritePhi { PHINode *PN; - unsigned Ith; // Ith incoming value. - Value *Val; // Exit value after expansion. - bool HighCost; // High Cost when expansion. 
+ + // Ith incoming value. + unsigned Ith; + + // Exit value after expansion. + Value *Val; + + // High Cost when expansion. + bool HighCost; RewritePhi(PHINode *P, unsigned I, Value *V, bool H) : PN(P), Ith(I), Val(V), HighCost(H) {} }; -} + +} // end anonymous namespace Value *IndVarSimplify::expandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S, Loop *L, Instruction *InsertPt, @@ -747,7 +784,6 @@ void IndVarSimplify::rewriteFirstIterationLoopExitValues(Loop *L) { /// aggressively. bool IndVarSimplify::canLoopBeDeleted( Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet) { - BasicBlock *Preheader = L->getLoopPreheader(); // If there is no preheader, the loop will not be deleted. if (!Preheader) @@ -790,7 +826,9 @@ bool IndVarSimplify::canLoopBeDeleted( } for (auto *BB : L->blocks()) - if (any_of(*BB, [](Instruction &I) { return I.mayHaveSideEffects(); })) + if (llvm::any_of(*BB, [](Instruction &I) { + return I.mayHaveSideEffects(); + })) return false; return true; @@ -801,15 +839,21 @@ bool IndVarSimplify::canLoopBeDeleted( //===----------------------------------------------------------------------===// namespace { + // Collect information about induction variables that are used by sign/zero // extend operations. This information is recorded by CollectExtend and provides // the input to WidenIV. struct WideIVInfo { PHINode *NarrowIV = nullptr; - Type *WidestNativeType = nullptr; // Widest integer type created [sz]ext - bool IsSigned = false; // Was a sext user seen before a zext? + + // Widest integer type created [sz]ext + Type *WidestNativeType = nullptr; + + // Was a sext user seen before a zext? + bool IsSigned = false; }; -} + +} // end anonymous namespace /// Update information about the induction variable that is extended by this /// sign or zero extend operation. This is used to determine the final width of @@ -885,7 +929,6 @@ struct NarrowIVDefUse { /// creating any new induction variables. To do this, it creates a new phi of /// the wider type and redirects all users, either removing extends or inserting /// truncs whenever we stop propagating the type. -/// class WidenIV { // Parameters PHINode *OrigPhi; @@ -902,22 +945,24 @@ class WidenIV { bool HasGuards; // Result - PHINode *WidePhi; - Instruction *WideInc; - const SCEV *WideIncExpr; + PHINode *WidePhi = nullptr; + Instruction *WideInc = nullptr; + const SCEV *WideIncExpr = nullptr; SmallVectorImpl<WeakTrackingVH> &DeadInsts; SmallPtrSet<Instruction *,16> Widened; SmallVector<NarrowIVDefUse, 8> NarrowIVUsers; enum ExtendKind { ZeroExtended, SignExtended, Unknown }; + // A map tracking the kind of extension used to widen each narrow IV // and narrow IV user. // Key: pointer to a narrow IV or IV user. // Value: the kind of extension used to widen this Instruction. DenseMap<AssertingVH<Instruction>, ExtendKind> ExtendKindMap; - typedef std::pair<AssertingVH<Value>, AssertingVH<Instruction>> DefUserPair; + using DefUserPair = std::pair<AssertingVH<Value>, AssertingVH<Instruction>>; + // A map with control-dependent ranges for post increment IV uses. The key is // a pair of IV def and a use of this def denoting the context. 
The value is // a ConstantRange representing possible values of the def at the given @@ -935,6 +980,7 @@ class WidenIV { void calculatePostIncRanges(PHINode *OrigPhi); void calculatePostIncRange(Instruction *NarrowDef, Instruction *NarrowUser); + void updatePostIncRangeInfo(Value *Def, Instruction *UseI, ConstantRange R) { DefUserPair Key(Def, UseI); auto It = PostIncRangeInfos.find(Key); @@ -950,8 +996,7 @@ public: bool HasGuards) : OrigPhi(WI.NarrowIV), WideType(WI.WidestNativeType), LI(LInfo), L(LI->getLoopFor(OrigPhi->getParent())), SE(SEv), DT(DTree), - HasGuards(HasGuards), WidePhi(nullptr), WideInc(nullptr), - WideIncExpr(nullptr), DeadInsts(DI) { + HasGuards(HasGuards), DeadInsts(DI) { assert(L->getHeader() == OrigPhi->getParent() && "Phi must be an IV"); ExtendKindMap[OrigPhi] = WI.IsSigned ? SignExtended : ZeroExtended; } @@ -969,7 +1014,7 @@ protected: ExtendKind getExtendKind(Instruction *I); - typedef std::pair<const SCEVAddRecExpr *, ExtendKind> WidenedRecTy; + using WidenedRecTy = std::pair<const SCEVAddRecExpr *, ExtendKind>; WidenedRecTy getWideRecurrence(NarrowIVDefUse DU); @@ -984,7 +1029,8 @@ protected: void pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef); }; -} // anonymous namespace + +} // end anonymous namespace /// Perform a quick domtree based check for loop invariance assuming that V is /// used within the loop. LoopInfo::isLoopInvariant() seems gratuitous for this @@ -1182,7 +1228,6 @@ const SCEV *WidenIV::getSCEVByOpCode(const SCEV *LHS, const SCEV *RHS, /// operands is an AddRec for this loop, return the AddRec and the kind of /// extension used. WidenIV::WidenedRecTy WidenIV::getExtendedOperandRecurrence(NarrowIVDefUse DU) { - // Handle the common case of add<nsw/nuw> const unsigned OpCode = DU.NarrowUse->getOpcode(); // Only Add/Sub/Mul instructions supported yet. @@ -1310,7 +1355,7 @@ bool WidenIV::widenLoopCompare(NarrowIVDefUse DU) { Value *Op = Cmp->getOperand(Cmp->getOperand(0) == DU.NarrowDef ? 1 : 0); unsigned CastWidth = SE->getTypeSizeInBits(Op->getType()); unsigned IVWidth = SE->getTypeSizeInBits(WideType); - assert (CastWidth <= IVWidth && "Unexpected width while widening compare."); + assert(CastWidth <= IVWidth && "Unexpected width while widening compare."); // Widen the compare instruction. IRBuilder<> Builder( @@ -1461,7 +1506,6 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { } /// Add eligible users of NarrowDef to NarrowIVUsers. -/// void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) { const SCEV *NarrowSCEV = SE->getSCEV(NarrowDef); bool NonNegativeDef = @@ -1494,7 +1538,6 @@ void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) { /// /// It would be simpler to delete uses as they are processed, but we must avoid /// invalidating SCEV expressions. -/// PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) { // Is this phi an induction variable? const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(OrigPhi)); @@ -1581,6 +1624,15 @@ PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) { if (DU.NarrowDef->use_empty()) DeadInsts.emplace_back(DU.NarrowDef); } + + // Attach any debug information to the new PHI. Since OrigPhi and WidePHI + // evaluate the same recurrence, we can just copy the debug info over. 
+ SmallVector<DbgValueInst *, 1> DbgValues; + llvm::findDbgValues(DbgValues, OrigPhi); + auto *MDPhi = MetadataAsValue::get(WidePhi->getContext(), + ValueAsMetadata::get(WidePhi)); + for (auto &DbgValue : DbgValues) + DbgValue->setOperand(0, MDPhi); return WidePhi; } @@ -1696,12 +1748,12 @@ void WidenIV::calculatePostIncRanges(PHINode *OrigPhi) { // Live IV Reduction - Minimize IVs live across the loop. //===----------------------------------------------------------------------===// - //===----------------------------------------------------------------------===// // Simplification of IV users based on SCEV evaluation. //===----------------------------------------------------------------------===// namespace { + class IndVarSimplifyVisitor : public IVVisitor { ScalarEvolution *SE; const TargetTransformInfo *TTI; @@ -1721,14 +1773,14 @@ public: // Implement the interface used by simplifyUsersOfIV. void visitCast(CastInst *Cast) override { visitIVCast(Cast, WI, SE, TTI); } }; -} + +} // end anonymous namespace /// Iteratively perform simplification on a worklist of IV users. Each /// successive simplification may push more users which may themselves be /// candidates for simplification. /// /// Sign/Zero extend elimination is interleaved with IV simplification. -/// void IndVarSimplify::simplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LoopInfo *LI) { @@ -1759,7 +1811,8 @@ void IndVarSimplify::simplifyAndExtend(Loop *L, // Information about sign/zero extensions of CurrIV. IndVarSimplifyVisitor Visitor(CurrIV, SE, TTI, DT); - Changed |= simplifyUsersOfIV(CurrIV, SE, DT, LI, DeadInsts, &Visitor); + Changed |= + simplifyUsersOfIV(CurrIV, SE, DT, LI, DeadInsts, Rewriter, &Visitor); if (Visitor.WI.WidestNativeType) { WideIVs.push_back(Visitor.WI); @@ -2501,8 +2554,10 @@ PreservedAnalyses IndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &AM, } namespace { + struct IndVarSimplifyLegacyPass : public LoopPass { static char ID; // Pass identification, replacement for typeid + IndVarSimplifyLegacyPass() : LoopPass(ID) { initializeIndVarSimplifyLegacyPassPass(*PassRegistry::getPassRegistry()); } @@ -2529,9 +2584,11 @@ struct IndVarSimplifyLegacyPass : public LoopPass { getLoopAnalysisUsage(AU); } }; -} + +} // end anonymous namespace char IndVarSimplifyLegacyPass::ID = 0; + INITIALIZE_PASS_BEGIN(IndVarSimplifyLegacyPass, "indvars", "Induction Variable Simplification", false, false) INITIALIZE_PASS_DEPENDENCY(LoopPass) diff --git a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp index 99b4458ea0fa..5c4d55bfbb2b 100644 --- a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp +++ b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp @@ -1,4 +1,4 @@ -//===-- InductiveRangeCheckElimination.cpp - ------------------------------===// +//===- InductiveRangeCheckElimination.cpp - -------------------------------===// // // The LLVM Compiler Infrastructure // @@ -6,6 +6,7 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// +// // The InductiveRangeCheckElimination pass splits a loop's iteration space into // three disjoint ranges. It does that in a way such that the loop running in // the middle loop provably does not need range checks. 
As an example, it will @@ -39,30 +40,61 @@ // throw_out_of_bounds(); // } // } +// //===----------------------------------------------------------------------===// +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" #include "llvm/Pass.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include <algorithm> +#include <cassert> +#include <iterator> +#include <limits> +#include <utility> +#include <vector> using namespace llvm; +using namespace llvm::PatternMatch; static cl::opt<unsigned> LoopSizeCutoff("irce-loop-size-cutoff", cl::Hidden, cl::init(64)); @@ -79,6 +111,9 @@ static cl::opt<int> MaxExitProbReciprocal("irce-max-exit-prob-reciprocal", static cl::opt<bool> SkipProfitabilityChecks("irce-skip-profitability-checks", cl::Hidden, cl::init(false)); +static cl::opt<bool> AllowUnsignedLatchCondition("irce-allow-unsigned-latch", + cl::Hidden, cl::init(true)); + static const char *ClonedLoopTag = "irce.loop.clone"; #define DEBUG_TYPE "irce" @@ -114,15 +149,16 @@ class InductiveRangeCheck { static StringRef rangeCheckKindToStr(RangeCheckKind); - const SCEV *Offset = nullptr; - const SCEV *Scale = nullptr; - Value *Length = nullptr; + const SCEV *Begin = nullptr; + const SCEV *Step = nullptr; + const SCEV *End = nullptr; Use *CheckUse = nullptr; RangeCheckKind Kind = RANGE_CHECK_UNKNOWN; + bool IsSigned = true; static RangeCheckKind parseRangeCheckICmp(Loop *L, ICmpInst *ICI, ScalarEvolution &SE, Value *&Index, - Value *&Length); + Value *&Length, bool &IsSigned); static void extractRangeChecksFromCond(Loop *L, ScalarEvolution &SE, Use &ConditionUse, @@ -130,20 +166,21 @@ class InductiveRangeCheck { SmallPtrSetImpl<Value *> &Visited); public: - const SCEV *getOffset() const { return Offset; } - const SCEV *getScale() const { return Scale; } - Value *getLength() const { return Length; } + const SCEV *getBegin() const { return Begin; } + const SCEV *getStep() const { return Step; } + const SCEV *getEnd() const { return End; } + bool isSigned() const { return IsSigned; } void print(raw_ostream &OS) const { OS << "InductiveRangeCheck:\n"; OS << " Kind: " << 
rangeCheckKindToStr(Kind) << "\n"; - OS << " Offset: "; - Offset->print(OS); - OS << " Scale: "; - Scale->print(OS); - OS << " Length: "; - if (Length) - Length->print(OS); + OS << " Begin: "; + Begin->print(OS); + OS << " Step: "; + Step->print(OS); + OS << " End: "; + if (End) + End->print(OS); else OS << "(null)"; OS << "\n CheckUse: "; @@ -173,6 +210,14 @@ public: Type *getType() const { return Begin->getType(); } const SCEV *getBegin() const { return Begin; } const SCEV *getEnd() const { return End; } + bool isEmpty(ScalarEvolution &SE, bool IsSigned) const { + if (Begin == End) + return true; + if (IsSigned) + return SE.isKnownPredicate(ICmpInst::ICMP_SGE, Begin, End); + else + return SE.isKnownPredicate(ICmpInst::ICMP_UGE, Begin, End); + } }; /// This is the value the condition of the branch needs to evaluate to for the @@ -183,7 +228,8 @@ public: /// check is redundant and can be constant-folded away. The induction /// variable is not required to be the canonical {0,+,1} induction variable. Optional<Range> computeSafeIterationSpace(ScalarEvolution &SE, - const SCEVAddRecExpr *IndVar) const; + const SCEVAddRecExpr *IndVar, + bool IsLatchSigned) const; /// Parse out a set of inductive range checks from \p BI and append them to \p /// Checks. @@ -199,6 +245,7 @@ public: class InductiveRangeCheckElimination : public LoopPass { public: static char ID; + InductiveRangeCheckElimination() : LoopPass(ID) { initializeInductiveRangeCheckEliminationPass( *PassRegistry::getPassRegistry()); @@ -212,8 +259,9 @@ public: bool runOnLoop(Loop *L, LPPassManager &LPM) override; }; +} // end anonymous namespace + char InductiveRangeCheckElimination::ID = 0; -} INITIALIZE_PASS_BEGIN(InductiveRangeCheckElimination, "irce", "Inductive range check elimination", false, false) @@ -247,12 +295,10 @@ StringRef InductiveRangeCheck::rangeCheckKindToStr( /// range checked, and set `Length` to the upper limit `Index` is being range /// checked with if (and only if) the range check type is stronger or equal to /// RANGE_CHECK_UPPER. 
-/// InductiveRangeCheck::RangeCheckKind InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI, ScalarEvolution &SE, Value *&Index, - Value *&Length) { - + Value *&Length, bool &IsSigned) { auto IsNonNegativeAndNotLoopVarying = [&SE, L](Value *V) { const SCEV *S = SE.getSCEV(V); if (isa<SCEVCouldNotCompute>(S)) @@ -262,8 +308,6 @@ InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI, SE.isKnownNonNegative(S); }; - using namespace llvm::PatternMatch; - ICmpInst::Predicate Pred = ICI->getPredicate(); Value *LHS = ICI->getOperand(0); Value *RHS = ICI->getOperand(1); @@ -276,6 +320,7 @@ InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI, std::swap(LHS, RHS); LLVM_FALLTHROUGH; case ICmpInst::ICMP_SGE: + IsSigned = true; if (match(RHS, m_ConstantInt<0>())) { Index = LHS; return RANGE_CHECK_LOWER; @@ -286,6 +331,7 @@ InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI, std::swap(LHS, RHS); LLVM_FALLTHROUGH; case ICmpInst::ICMP_SGT: + IsSigned = true; if (match(RHS, m_ConstantInt<-1>())) { Index = LHS; return RANGE_CHECK_LOWER; @@ -302,6 +348,7 @@ InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI, std::swap(LHS, RHS); LLVM_FALLTHROUGH; case ICmpInst::ICMP_UGT: + IsSigned = false; if (IsNonNegativeAndNotLoopVarying(LHS)) { Index = RHS; Length = LHS; @@ -317,42 +364,16 @@ void InductiveRangeCheck::extractRangeChecksFromCond( Loop *L, ScalarEvolution &SE, Use &ConditionUse, SmallVectorImpl<InductiveRangeCheck> &Checks, SmallPtrSetImpl<Value *> &Visited) { - using namespace llvm::PatternMatch; - Value *Condition = ConditionUse.get(); if (!Visited.insert(Condition).second) return; + // TODO: Do the same for OR, XOR, NOT etc? if (match(Condition, m_And(m_Value(), m_Value()))) { - SmallVector<InductiveRangeCheck, 8> SubChecks; extractRangeChecksFromCond(L, SE, cast<User>(Condition)->getOperandUse(0), - SubChecks, Visited); + Checks, Visited); extractRangeChecksFromCond(L, SE, cast<User>(Condition)->getOperandUse(1), - SubChecks, Visited); - - if (SubChecks.size() == 2) { - // Handle a special case where we know how to merge two checks separately - // checking the upper and lower bounds into a full range check. - const auto &RChkA = SubChecks[0]; - const auto &RChkB = SubChecks[1]; - if ((RChkA.Length == RChkB.Length || !RChkA.Length || !RChkB.Length) && - RChkA.Offset == RChkB.Offset && RChkA.Scale == RChkB.Scale) { - - // If RChkA.Kind == RChkB.Kind then we just found two identical checks. - // But if one of them is a RANGE_CHECK_LOWER and the other is a - // RANGE_CHECK_UPPER (only possibility if they're different) then - // together they form a RANGE_CHECK_BOTH. - SubChecks[0].Kind = - (InductiveRangeCheck::RangeCheckKind)(RChkA.Kind | RChkB.Kind); - SubChecks[0].Length = RChkA.Length ? RChkA.Length : RChkB.Length; - SubChecks[0].CheckUse = &ConditionUse; - - // We updated one of the checks in place, now erase the other. 
- SubChecks.pop_back(); - } - } - - Checks.insert(Checks.end(), SubChecks.begin(), SubChecks.end()); + Checks, Visited); return; } @@ -361,7 +382,8 @@ void InductiveRangeCheck::extractRangeChecksFromCond( return; Value *Length = nullptr, *Index; - auto RCKind = parseRangeCheckICmp(L, ICI, SE, Index, Length); + bool IsSigned; + auto RCKind = parseRangeCheckICmp(L, ICI, SE, Index, Length, IsSigned); if (RCKind == InductiveRangeCheck::RANGE_CHECK_UNKNOWN) return; @@ -373,18 +395,18 @@ void InductiveRangeCheck::extractRangeChecksFromCond( return; InductiveRangeCheck IRC; - IRC.Length = Length; - IRC.Offset = IndexAddRec->getStart(); - IRC.Scale = IndexAddRec->getStepRecurrence(SE); + IRC.End = Length ? SE.getSCEV(Length) : nullptr; + IRC.Begin = IndexAddRec->getStart(); + IRC.Step = IndexAddRec->getStepRecurrence(SE); IRC.CheckUse = &ConditionUse; IRC.Kind = RCKind; + IRC.IsSigned = IsSigned; Checks.push_back(IRC); } void InductiveRangeCheck::extractRangeChecksFromBranch( BranchInst *BI, Loop *L, ScalarEvolution &SE, BranchProbabilityInfo &BPI, SmallVectorImpl<InductiveRangeCheck> &Checks) { - if (BI->isUnconditional() || BI->getParent() == L->getLoopLatch()) return; @@ -435,16 +457,16 @@ namespace { // kinds of loops we can deal with -- ones that have a single latch that is also // an exiting block *and* have a canonical induction variable. struct LoopStructure { - const char *Tag; + const char *Tag = ""; - BasicBlock *Header; - BasicBlock *Latch; + BasicBlock *Header = nullptr; + BasicBlock *Latch = nullptr; // `Latch's terminator instruction is `LatchBr', and it's `LatchBrExitIdx'th // successor is `LatchExit', the exit block of the loop. - BranchInst *LatchBr; - BasicBlock *LatchExit; - unsigned LatchBrExitIdx; + BranchInst *LatchBr = nullptr; + BasicBlock *LatchExit = nullptr; + unsigned LatchBrExitIdx = std::numeric_limits<unsigned>::max(); // The loop represented by this instance of LoopStructure is semantically // equivalent to: @@ -452,18 +474,17 @@ struct LoopStructure { // intN_ty inc = IndVarIncreasing ? 1 : -1; // pred_ty predicate = IndVarIncreasing ? ICMP_SLT : ICMP_SGT; // - // for (intN_ty iv = IndVarStart; predicate(iv, LoopExitAt); iv = IndVarNext) + // for (intN_ty iv = IndVarStart; predicate(iv, LoopExitAt); iv = IndVarBase) // ... body ... - Value *IndVarNext; - Value *IndVarStart; - Value *LoopExitAt; - bool IndVarIncreasing; + Value *IndVarBase = nullptr; + Value *IndVarStart = nullptr; + Value *IndVarStep = nullptr; + Value *LoopExitAt = nullptr; + bool IndVarIncreasing = false; + bool IsSignedPredicate = true; - LoopStructure() - : Tag(""), Header(nullptr), Latch(nullptr), LatchBr(nullptr), - LatchExit(nullptr), LatchBrExitIdx(-1), IndVarNext(nullptr), - IndVarStart(nullptr), LoopExitAt(nullptr), IndVarIncreasing(false) {} + LoopStructure() = default; template <typename M> LoopStructure map(M Map) const { LoopStructure Result; @@ -473,10 +494,12 @@ struct LoopStructure { Result.LatchBr = cast<BranchInst>(Map(LatchBr)); Result.LatchExit = cast<BasicBlock>(Map(LatchExit)); Result.LatchBrExitIdx = LatchBrExitIdx; - Result.IndVarNext = Map(IndVarNext); + Result.IndVarBase = Map(IndVarBase); Result.IndVarStart = Map(IndVarStart); + Result.IndVarStep = Map(IndVarStep); Result.LoopExitAt = Map(LoopExitAt); Result.IndVarIncreasing = IndVarIncreasing; + Result.IsSignedPredicate = IsSignedPredicate; return Result; } @@ -494,7 +517,6 @@ struct LoopStructure { /// loops to run any remaining iterations. 
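/// As a hedged sketch (names invented here, an increasing induction variable
/// assumed), the result of constraining
///   for (iv = Start; iv < Stop; ++iv) body_with_checks(iv);
/// is roughly
///   for (iv = Start; iv < Begin; ++iv) body_with_checks(iv);  // pre loop
///   for (; iv < End; ++iv)             body_no_checks(iv);    // main loop
///   for (; iv < Stop; ++iv)            body_with_checks(iv);  // post loop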
The pre loop runs any iterations in /// which the induction variable is < Begin, and the post loop runs any /// iterations in which the induction variable is >= End. -/// class LoopConstrainer { // The representation of a clone of the original loop we started out with. struct ClonedLoop { @@ -511,13 +533,12 @@ class LoopConstrainer { // Result of rewriting the range of a loop. See changeIterationSpaceEnd for // more details on what these fields mean. struct RewrittenRangeInfo { - BasicBlock *PseudoExit; - BasicBlock *ExitSelector; + BasicBlock *PseudoExit = nullptr; + BasicBlock *ExitSelector = nullptr; std::vector<PHINode *> PHIValuesAtPseudoExit; - PHINode *IndVarEnd; + PHINode *IndVarEnd = nullptr; - RewrittenRangeInfo() - : PseudoExit(nullptr), ExitSelector(nullptr), IndVarEnd(nullptr) {} + RewrittenRangeInfo() = default; }; // Calculated subranges we restrict the iteration space of the main loop to. @@ -541,14 +562,12 @@ class LoopConstrainer { // Compute a safe set of limits for the main loop to run in -- effectively the // intersection of `Range' and the iteration space of the original loop. // Return None if unable to compute the set of subranges. - // - Optional<SubRanges> calculateSubRanges() const; + Optional<SubRanges> calculateSubRanges(bool IsSignedPredicate) const; // Clone `OriginalLoop' and return the result in CLResult. The IR after // running `cloneLoop' is well formed except for the PHI nodes in CLResult -- // the PHI nodes say that there is an incoming edge from `OriginalPreheader` // but there is no such edge. - // void cloneLoop(ClonedLoop &CLResult, const char *Tag) const; // Create the appropriate loop structure needed to describe a cloned copy of @@ -577,7 +596,6 @@ class LoopConstrainer { // After changeIterationSpaceEnd, `Preheader' is no longer a legitimate // preheader because it is made to branch to the loop header only // conditionally. - // RewrittenRangeInfo changeIterationSpaceEnd(const LoopStructure &LS, BasicBlock *Preheader, Value *ExitLoopAt, @@ -585,7 +603,6 @@ class LoopConstrainer { // The loop denoted by `LS' has `OldPreheader' as its preheader. This // function creates a new preheader for `LS' and returns it. - // BasicBlock *createPreheader(const LoopStructure &LS, BasicBlock *OldPreheader, const char *Tag) const; @@ -613,12 +630,13 @@ class LoopConstrainer { // Information about the original loop we started out with. Loop &OriginalLoop; - const SCEV *LatchTakenCount; - BasicBlock *OriginalPreheader; + + const SCEV *LatchTakenCount = nullptr; + BasicBlock *OriginalPreheader = nullptr; // The preheader of the main loop. This may or may not be different from // `OriginalPreheader'. - BasicBlock *MainLoopPreheader; + BasicBlock *MainLoopPreheader = nullptr; // The range we need to run the main loop in. InductiveRangeCheck::Range Range; @@ -632,15 +650,14 @@ public: const LoopStructure &LS, ScalarEvolution &SE, DominatorTree &DT, InductiveRangeCheck::Range R) : F(*L.getHeader()->getParent()), Ctx(L.getHeader()->getContext()), - SE(SE), DT(DT), LPM(LPM), LI(LI), OriginalLoop(L), - LatchTakenCount(nullptr), OriginalPreheader(nullptr), - MainLoopPreheader(nullptr), Range(R), MainLoopStructure(LS) {} + SE(SE), DT(DT), LPM(LPM), LI(LI), OriginalLoop(L), Range(R), + MainLoopStructure(LS) {} // Entry point for the algorithm. Returns true on success. 
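  // A hedged usage sketch (the exact constructor argument order is elided
  // here; see the constructor above): build a LoopConstrainer from the
  // parsed LoopStructure and the computed safe range, then call run():
  //   LoopConstrainer LC(/* loop, analyses, LS, safe range */);
  //   bool Changed = LC.run();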
bool run(); }; -} +} // end anonymous namespace void LoopConstrainer::replacePHIBlock(PHINode *PN, BasicBlock *Block, BasicBlock *ReplaceBy) { @@ -649,22 +666,55 @@ void LoopConstrainer::replacePHIBlock(PHINode *PN, BasicBlock *Block, PN->setIncomingBlock(i, ReplaceBy); } -static bool CanBeSMax(ScalarEvolution &SE, const SCEV *S) { - APInt SMax = - APInt::getSignedMaxValue(cast<IntegerType>(S->getType())->getBitWidth()); - return SE.getSignedRange(S).contains(SMax) && - SE.getUnsignedRange(S).contains(SMax); +static bool CanBeMax(ScalarEvolution &SE, const SCEV *S, bool Signed) { + APInt Max = Signed ? + APInt::getSignedMaxValue(cast<IntegerType>(S->getType())->getBitWidth()) : + APInt::getMaxValue(cast<IntegerType>(S->getType())->getBitWidth()); + return SE.getSignedRange(S).contains(Max) && + SE.getUnsignedRange(S).contains(Max); +} + +static bool SumCanReachMax(ScalarEvolution &SE, const SCEV *S1, const SCEV *S2, + bool Signed) { + // S1 < INT_MAX - S2 ===> S1 + S2 < INT_MAX. + assert(SE.isKnownNonNegative(S2) && + "We expected the 2nd arg to be non-negative!"); + const SCEV *Max = SE.getConstant( + Signed ? APInt::getSignedMaxValue( + cast<IntegerType>(S1->getType())->getBitWidth()) + : APInt::getMaxValue( + cast<IntegerType>(S1->getType())->getBitWidth())); + const SCEV *CapForS1 = SE.getMinusSCEV(Max, S2); + return !SE.isKnownPredicate(Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, + S1, CapForS1); +} + +static bool CanBeMin(ScalarEvolution &SE, const SCEV *S, bool Signed) { + APInt Min = Signed ? + APInt::getSignedMinValue(cast<IntegerType>(S->getType())->getBitWidth()) : + APInt::getMinValue(cast<IntegerType>(S->getType())->getBitWidth()); + return SE.getSignedRange(S).contains(Min) && + SE.getUnsignedRange(S).contains(Min); } -static bool CanBeSMin(ScalarEvolution &SE, const SCEV *S) { - APInt SMin = - APInt::getSignedMinValue(cast<IntegerType>(S->getType())->getBitWidth()); - return SE.getSignedRange(S).contains(SMin) && - SE.getUnsignedRange(S).contains(SMin); +static bool SumCanReachMin(ScalarEvolution &SE, const SCEV *S1, const SCEV *S2, + bool Signed) { + // S1 > INT_MIN - S2 ===> S1 + S2 > INT_MIN. + assert(SE.isKnownNonPositive(S2) && + "We expected the 2nd arg to be non-positive!"); + const SCEV *Min = SE.getConstant( + Signed ? APInt::getSignedMinValue( + cast<IntegerType>(S1->getType())->getBitWidth()) + : APInt::getMinValue( + cast<IntegerType>(S1->getType())->getBitWidth())); + const SCEV *CapForS1 = SE.getMinusSCEV(Min, S2); + return !SE.isKnownPredicate(Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT, + S1, CapForS1); } Optional<LoopStructure> -LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BPI, +LoopStructure::parseLoopStructure(ScalarEvolution &SE, + BranchProbabilityInfo &BPI, Loop &L, const char *&FailureReason) { if (!L.isLoopSimplifyForm()) { FailureReason = "loop not in LoopSimplify form"; return None; } @@ -766,7 +816,11 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP return AR->getNoWrapFlags(SCEV::FlagNSW) != SCEV::FlagAnyWrap; }; - auto IsInductionVar = [&](const SCEVAddRecExpr *AR, bool &IsIncreasing) { + // Here we check whether the suggested AddRec is an induction variable that + // can be handled (i.e. with known constant step), and if so, calculate its + // step and identify whether it is increasing or decreasing.
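+  // For example (hedged, not from this patch): an AddRec {0,+,2} yields
+  // StepCI == 2 and IsIncreasing == true, {n,+,-1} yields StepCI == -1 and
+  // IsIncreasing == false, and a symbolic step such as {0,+,%m} is rejected
+  // because the step is not a SCEVConstant.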
+ auto IsInductionVar = [&](const SCEVAddRecExpr *AR, bool &IsIncreasing, + ConstantInt *&StepCI) { if (!AR->isAffine()) return false; @@ -778,11 +832,10 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP if (const SCEVConstant *StepExpr = dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE))) { - ConstantInt *StepCI = StepExpr->getValue(); - if (StepCI->isOne() || StepCI->isMinusOne()) { - IsIncreasing = StepCI->isOne(); - return true; - } + StepCI = StepExpr->getValue(); + assert(!StepCI->isZero() && "Zero step?"); + IsIncreasing = !StepCI->isNegative(); + return true; } return false; @@ -791,59 +844,87 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP // `ICI` is interpreted as taking the backedge if the *next* value of the // induction variable satisfies some constraint. - const SCEVAddRecExpr *IndVarNext = cast<SCEVAddRecExpr>(LeftSCEV); + const SCEVAddRecExpr *IndVarBase = cast<SCEVAddRecExpr>(LeftSCEV); bool IsIncreasing = false; - if (!IsInductionVar(IndVarNext, IsIncreasing)) { + bool IsSignedPredicate = true; + ConstantInt *StepCI; + if (!IsInductionVar(IndVarBase, IsIncreasing, StepCI)) { FailureReason = "LHS in icmp not induction variable"; return None; } - const SCEV *StartNext = IndVarNext->getStart(); - const SCEV *Addend = SE.getNegativeSCEV(IndVarNext->getStepRecurrence(SE)); + const SCEV *StartNext = IndVarBase->getStart(); + const SCEV *Addend = SE.getNegativeSCEV(IndVarBase->getStepRecurrence(SE)); const SCEV *IndVarStart = SE.getAddExpr(StartNext, Addend); + const SCEV *Step = SE.getSCEV(StepCI); ConstantInt *One = ConstantInt::get(IndVarTy, 1); - // TODO: generalize the predicates here to also match their unsigned variants. if (IsIncreasing) { bool DecreasedRightValueByOne = false; - // Try to turn eq/ne predicates to those we can work with. - if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1) - // while (++i != len) { while (++i < len) { - // ... ---> ... - // } } - Pred = ICmpInst::ICMP_SLT; - else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 && - !CanBeSMin(SE, RightSCEV)) { - // while (true) { while (true) { - // if (++i == len) ---> if (++i > len - 1) - // break; break; - // ... ... - // } } - Pred = ICmpInst::ICMP_SGT; - RightSCEV = SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType())); - DecreasedRightValueByOne = true; + if (StepCI->isOne()) { + // Try to turn eq/ne predicates to those we can work with. + if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1) + // while (++i != len) { while (++i < len) { + // ... ---> ... + // } } + // If both parts are known non-negative, it is profitable to use + // unsigned comparison in increasing loop. This allows us to make the + // comparison check against "RightSCEV + 1" more optimistic. + if (SE.isKnownNonNegative(IndVarStart) && + SE.isKnownNonNegative(RightSCEV)) + Pred = ICmpInst::ICMP_ULT; + else + Pred = ICmpInst::ICMP_SLT; + else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 && + !CanBeMin(SE, RightSCEV, /* IsSignedPredicate */ true)) { + // while (true) { while (true) { + // if (++i == len) ---> if (++i > len - 1) + // break; break; + // ... ... + // } } + // TODO: Insert ICMP_UGT if both are non-negative? 
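+          // Worked instance (hedged): with i8 and len == -128, rewriting to
+          // `++i > len - 1` would wrap len - 1 around to +127; this is why
+          // CanBeMin must rule out RightSCEV == SINT_MIN before the rewrite.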
+ Pred = ICmpInst::ICMP_SGT; + RightSCEV = SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType())); + DecreasedRightValueByOne = true; + } } + bool LTPred = (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_ULT); + bool GTPred = (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_UGT); bool FoundExpectedPred = - (Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 1) || - (Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 0); + (LTPred && LatchBrExitIdx == 1) || (GTPred && LatchBrExitIdx == 0); if (!FoundExpectedPred) { FailureReason = "expected icmp slt semantically, found something else"; return None; } + IsSignedPredicate = + Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGT; + + if (!IsSignedPredicate && !AllowUnsignedLatchCondition) { + FailureReason = "unsigned latch conditions are explicitly prohibited"; + return None; + } + + // The predicate that we need to check that the induction variable lies + // within bounds. + ICmpInst::Predicate BoundPred = + IsSignedPredicate ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT; + if (LatchBrExitIdx == 0) { - if (CanBeSMax(SE, RightSCEV)) { + const SCEV *StepMinusOne = SE.getMinusSCEV(Step, + SE.getOne(Step->getType())); + if (SumCanReachMax(SE, RightSCEV, StepMinusOne, IsSignedPredicate)) { // TODO: this restriction is easily removable -- we just have to // remember that the icmp was an slt and not an sle. - FailureReason = "limit may overflow when coercing sle to slt"; + FailureReason = "limit may overflow when coercing le to lt"; return None; } if (!SE.isLoopEntryGuardedByCond( - &L, CmpInst::ICMP_SLT, IndVarStart, - SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType())))) { + &L, BoundPred, IndVarStart, + SE.getAddExpr(RightSCEV, Step))) { FailureReason = "Induction variable start not bounded by upper limit"; return None; } @@ -855,8 +936,7 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP RightValue = B.CreateAdd(RightValue, One); } } else { - if (!SE.isLoopEntryGuardedByCond(&L, CmpInst::ICMP_SLT, IndVarStart, - RightSCEV)) { + if (!SE.isLoopEntryGuardedByCond(&L, BoundPred, IndVarStart, RightSCEV)) { FailureReason = "Induction variable start not bounded by upper limit"; return None; } @@ -865,43 +945,65 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP } } else { bool IncreasedRightValueByOne = false; - // Try to turn eq/ne predicates to those we can work with. - if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1) - // while (--i != len) { while (--i > len) { - // ... ---> ... - // } } - Pred = ICmpInst::ICMP_SGT; - else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 && - !CanBeSMax(SE, RightSCEV)) { - // while (true) { while (true) { - // if (--i == len) ---> if (--i < len + 1) - // break; break; - // ... ... - // } } - Pred = ICmpInst::ICMP_SLT; - RightSCEV = SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType())); - IncreasedRightValueByOne = true; + if (StepCI->isMinusOne()) { + // Try to turn eq/ne predicates to those we can work with. + if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1) + // while (--i != len) { while (--i > len) { + // ... ---> ... + // } } + // We intentionally don't turn the predicate into UGT even if we know + // that both operands are non-negative, because it will only pessimize + // our check against "RightSCEV - 1". 
+ Pred = ICmpInst::ICMP_SGT; + else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 && + !CanBeMax(SE, RightSCEV, /* IsSignedPredicate */ true)) { + // while (true) { while (true) { + // if (--i == len) ---> if (--i < len + 1) + // break; break; + // ... ... + // } } + // TODO: Insert ICMP_ULT if both are non-negative? + Pred = ICmpInst::ICMP_SLT; + RightSCEV = SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType())); + IncreasedRightValueByOne = true; + } } + bool LTPred = (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_ULT); + bool GTPred = (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_UGT); + bool FoundExpectedPred = - (Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 1) || - (Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 0); + (GTPred && LatchBrExitIdx == 1) || (LTPred && LatchBrExitIdx == 0); if (!FoundExpectedPred) { FailureReason = "expected icmp sgt semantically, found something else"; return None; } + IsSignedPredicate = + Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGT; + + if (!IsSignedPredicate && !AllowUnsignedLatchCondition) { + FailureReason = "unsigned latch conditions are explicitly prohibited"; + return None; + } + + // The predicate that we need to check that the induction variable lies + // within bounds. + ICmpInst::Predicate BoundPred = + IsSignedPredicate ? CmpInst::ICMP_SGT : CmpInst::ICMP_UGT; + if (LatchBrExitIdx == 0) { - if (CanBeSMin(SE, RightSCEV)) { + const SCEV *StepPlusOne = SE.getAddExpr(Step, SE.getOne(Step->getType())); + if (SumCanReachMin(SE, RightSCEV, StepPlusOne, IsSignedPredicate)) { // TODO: this restriction is easily removable -- we just have to // remember that the icmp was an sgt and not an sge. - FailureReason = "limit may overflow when coercing sge to sgt"; + FailureReason = "limit may overflow when coercing ge to gt"; return None; } if (!SE.isLoopEntryGuardedByCond( - &L, CmpInst::ICMP_SGT, IndVarStart, + &L, BoundPred, IndVarStart, SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType())))) { FailureReason = "Induction variable start not bounded by lower limit"; return None; @@ -914,8 +1016,7 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP RightValue = B.CreateSub(RightValue, One); } } else { - if (!SE.isLoopEntryGuardedByCond(&L, CmpInst::ICMP_SGT, IndVarStart, - RightSCEV)) { + if (!SE.isLoopEntryGuardedByCond(&L, BoundPred, IndVarStart, RightSCEV)) { FailureReason = "Induction variable start not bounded by lower limit"; return None; } @@ -923,7 +1024,6 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP "Right value can be increased only for LatchBrExitIdx == 0!"); } } - BasicBlock *LatchExit = LatchBr->getSuccessor(LatchBrExitIdx); assert(SE.getLoopDisposition(LatchCount, &L) == @@ -946,9 +1046,11 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP Result.LatchExit = LatchExit; Result.LatchBrExitIdx = LatchBrExitIdx; Result.IndVarStart = IndVarStartV; - Result.IndVarNext = LeftValue; + Result.IndVarStep = StepCI; + Result.IndVarBase = LeftValue; Result.IndVarIncreasing = IsIncreasing; Result.LoopExitAt = RightValue; + Result.IsSignedPredicate = IsSignedPredicate; FailureReason = nullptr; @@ -956,7 +1058,7 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP } Optional<LoopConstrainer::SubRanges> -LoopConstrainer::calculateSubRanges() const { +LoopConstrainer::calculateSubRanges(bool IsSignedPredicate) const { IntegerType *Ty = cast<IntegerType>(LatchTakenCount->getType()); if 
(Range.getType() != Ty) @@ -999,26 +1101,31 @@ LoopConstrainer::calculateSubRanges() const { // that case, `Clamp` will always return `Smallest` and // [`Result.LowLimit`, `Result.HighLimit`) = [`Smallest`, `Smallest`) // will be an empty range. Returning an empty range is always safe. - // Smallest = SE.getAddExpr(End, One); Greatest = SE.getAddExpr(Start, One); GreatestSeen = Start; } - auto Clamp = [this, Smallest, Greatest](const SCEV *S) { - return SE.getSMaxExpr(Smallest, SE.getSMinExpr(Greatest, S)); + auto Clamp = [this, Smallest, Greatest, IsSignedPredicate](const SCEV *S) { + return IsSignedPredicate + ? SE.getSMaxExpr(Smallest, SE.getSMinExpr(Greatest, S)) + : SE.getUMaxExpr(Smallest, SE.getUMinExpr(Greatest, S)); }; - // In some cases we can prove that we don't need a pre or post loop + // In some cases we can prove that we don't need a pre or post loop. + ICmpInst::Predicate PredLE = + IsSignedPredicate ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; + ICmpInst::Predicate PredLT = + IsSignedPredicate ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; bool ProvablyNoPreloop = - SE.isKnownPredicate(ICmpInst::ICMP_SLE, Range.getBegin(), Smallest); + SE.isKnownPredicate(PredLE, Range.getBegin(), Smallest); if (!ProvablyNoPreloop) Result.LowLimit = Clamp(Range.getBegin()); bool ProvablyNoPostLoop = - SE.isKnownPredicate(ICmpInst::ICMP_SLT, GreatestSeen, Range.getEnd()); + SE.isKnownPredicate(PredLT, GreatestSeen, Range.getEnd()); if (!ProvablyNoPostLoop) Result.HighLimit = Clamp(Range.getEnd()); @@ -1082,7 +1189,6 @@ void LoopConstrainer::cloneLoop(LoopConstrainer::ClonedLoop &Result, LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd( const LoopStructure &LS, BasicBlock *Preheader, Value *ExitSubloopAt, BasicBlock *ContinuationBlock) const { - // We start with a loop with a single latch: // // +--------------------+ @@ -1153,7 +1259,6 @@ LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd( // | original exit <----+ // | | // +--------------------+ - // RewrittenRangeInfo RRI; @@ -1165,22 +1270,35 @@ LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd( BranchInst *PreheaderJump = cast<BranchInst>(Preheader->getTerminator()); bool Increasing = LS.IndVarIncreasing; + bool IsSignedPredicate = LS.IsSignedPredicate; IRBuilder<> B(PreheaderJump); // EnterLoopCond - is it okay to start executing this `LS'? - Value *EnterLoopCond = Increasing - ? B.CreateICmpSLT(LS.IndVarStart, ExitSubloopAt) - : B.CreateICmpSGT(LS.IndVarStart, ExitSubloopAt); + Value *EnterLoopCond = nullptr; + if (Increasing) + EnterLoopCond = IsSignedPredicate + ? B.CreateICmpSLT(LS.IndVarStart, ExitSubloopAt) + : B.CreateICmpULT(LS.IndVarStart, ExitSubloopAt); + else + EnterLoopCond = IsSignedPredicate + ? B.CreateICmpSGT(LS.IndVarStart, ExitSubloopAt) + : B.CreateICmpUGT(LS.IndVarStart, ExitSubloopAt); B.CreateCondBr(EnterLoopCond, LS.Header, RRI.PseudoExit); PreheaderJump->eraseFromParent(); LS.LatchBr->setSuccessor(LS.LatchBrExitIdx, RRI.ExitSelector); B.SetInsertPoint(LS.LatchBr); - Value *TakeBackedgeLoopCond = - Increasing ? B.CreateICmpSLT(LS.IndVarNext, ExitSubloopAt) - : B.CreateICmpSGT(LS.IndVarNext, ExitSubloopAt); + Value *TakeBackedgeLoopCond = nullptr; + if (Increasing) + TakeBackedgeLoopCond = IsSignedPredicate + ? B.CreateICmpSLT(LS.IndVarBase, ExitSubloopAt) + : B.CreateICmpULT(LS.IndVarBase, ExitSubloopAt); + else + TakeBackedgeLoopCond = IsSignedPredicate + ? 
B.CreateICmpSGT(LS.IndVarBase, ExitSubloopAt) + : B.CreateICmpUGT(LS.IndVarBase, ExitSubloopAt); Value *CondForBranch = LS.LatchBrExitIdx == 1 ? TakeBackedgeLoopCond : B.CreateNot(TakeBackedgeLoopCond); @@ -1192,9 +1310,15 @@ LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd( // IterationsLeft - are there any more iterations left, given the original // upper bound on the induction variable? If not, we branch to the "real" // exit. - Value *IterationsLeft = Increasing - ? B.CreateICmpSLT(LS.IndVarNext, LS.LoopExitAt) - : B.CreateICmpSGT(LS.IndVarNext, LS.LoopExitAt); + Value *IterationsLeft = nullptr; + if (Increasing) + IterationsLeft = IsSignedPredicate + ? B.CreateICmpSLT(LS.IndVarBase, LS.LoopExitAt) + : B.CreateICmpULT(LS.IndVarBase, LS.LoopExitAt); + else + IterationsLeft = IsSignedPredicate + ? B.CreateICmpSGT(LS.IndVarBase, LS.LoopExitAt) + : B.CreateICmpUGT(LS.IndVarBase, LS.LoopExitAt); B.CreateCondBr(IterationsLeft, RRI.PseudoExit, LS.LatchExit); BranchInst *BranchToContinuation = @@ -1217,10 +1341,10 @@ LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd( RRI.PHIValuesAtPseudoExit.push_back(NewPHI); } - RRI.IndVarEnd = PHINode::Create(LS.IndVarNext->getType(), 2, "indvar.end", + RRI.IndVarEnd = PHINode::Create(LS.IndVarBase->getType(), 2, "indvar.end", BranchToContinuation); RRI.IndVarEnd->addIncoming(LS.IndVarStart, Preheader); - RRI.IndVarEnd->addIncoming(LS.IndVarNext, RRI.ExitSelector); + RRI.IndVarEnd->addIncoming(LS.IndVarBase, RRI.ExitSelector); // The latch exit now has a branch from `RRI.ExitSelector' instead of // `LS.Latch'. The PHI nodes need to be updated to reflect that. @@ -1237,7 +1361,6 @@ LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd( void LoopConstrainer::rewriteIncomingValuesForPHIs( LoopStructure &LS, BasicBlock *ContinuationBlock, const LoopConstrainer::RewrittenRangeInfo &RRI) const { - unsigned PHIIndex = 0; for (Instruction &I : *LS.Header) { auto *PN = dyn_cast<PHINode>(&I); @@ -1255,7 +1378,6 @@ void LoopConstrainer::rewriteIncomingValuesForPHIs( BasicBlock *LoopConstrainer::createPreheader(const LoopStructure &LS, BasicBlock *OldPreheader, const char *Tag) const { - BasicBlock *Preheader = BasicBlock::Create(Ctx, Tag, &F, LS.Header); BranchInst::Create(LS.Header, Preheader); @@ -1282,7 +1404,7 @@ void LoopConstrainer::addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs) { Loop *LoopConstrainer::createClonedLoopStructure(Loop *Original, Loop *Parent, ValueToValueMapTy &VM) { - Loop &New = *new Loop(); + Loop &New = *LI.AllocateLoop(); if (Parent) Parent->addChildLoop(&New); else @@ -1311,7 +1433,8 @@ bool LoopConstrainer::run() { OriginalPreheader = Preheader; MainLoopPreheader = Preheader; - Optional<SubRanges> MaybeSR = calculateSubRanges(); + bool IsSignedPredicate = MainLoopStructure.IsSignedPredicate; + Optional<SubRanges> MaybeSR = calculateSubRanges(IsSignedPredicate); if (!MaybeSR.hasValue()) { DEBUG(dbgs() << "irce: could not compute subranges\n"); return false; @@ -1320,7 +1443,7 @@ bool LoopConstrainer::run() { SubRanges SR = MaybeSR.getValue(); bool Increasing = MainLoopStructure.IndVarIncreasing; IntegerType *IVTy = - cast<IntegerType>(MainLoopStructure.IndVarNext->getType()); + cast<IntegerType>(MainLoopStructure.IndVarBase->getType()); SCEVExpander Expander(SE, F.getParent()->getDataLayout(), "irce"); Instruction *InsertPt = OriginalPreheader->getTerminator(); @@ -1345,7 +1468,7 @@ bool LoopConstrainer::run() { if (Increasing) ExitPreLoopAtSCEV = 
*SR.LowLimit; else { - if (CanBeSMin(SE, *SR.HighLimit)) { + if (CanBeMin(SE, *SR.HighLimit, IsSignedPredicate)) { DEBUG(dbgs() << "irce: could not prove no-overflow when computing " << "preloop exit limit. HighLimit = " << *(*SR.HighLimit) << "\n"); @@ -1354,6 +1477,13 @@ bool LoopConstrainer::run() { ExitPreLoopAtSCEV = SE.getAddExpr(*SR.HighLimit, MinusOneS); } + if (!isSafeToExpandAt(ExitPreLoopAtSCEV, InsertPt, SE)) { + DEBUG(dbgs() << "irce: could not prove that it is safe to expand the" + << " preloop exit limit " << *ExitPreLoopAtSCEV + << " at block " << InsertPt->getParent()->getName() << "\n"); + return false; + } + ExitPreLoopAt = Expander.expandCodeFor(ExitPreLoopAtSCEV, IVTy, InsertPt); ExitPreLoopAt->setName("exit.preloop.at"); } @@ -1364,7 +1494,7 @@ bool LoopConstrainer::run() { if (Increasing) ExitMainLoopAtSCEV = *SR.HighLimit; else { - if (CanBeSMin(SE, *SR.LowLimit)) { + if (CanBeMin(SE, *SR.LowLimit, IsSignedPredicate)) { DEBUG(dbgs() << "irce: could not prove no-overflow when computing " << "mainloop exit limit. LowLimit = " << *(*SR.LowLimit) << "\n"); @@ -1373,6 +1503,13 @@ bool LoopConstrainer::run() { ExitMainLoopAtSCEV = SE.getAddExpr(*SR.LowLimit, MinusOneS); } + if (!isSafeToExpandAt(ExitMainLoopAtSCEV, InsertPt, SE)) { + DEBUG(dbgs() << "irce: could not prove that it is safe to expand the" + << " main loop exit limit " << *ExitMainLoopAtSCEV + << " at block " << InsertPt->getParent()->getName() << "\n"); + return false; + } + ExitMainLoopAt = Expander.expandCodeFor(ExitMainLoopAtSCEV, IVTy, InsertPt); ExitMainLoopAt->setName("exit.mainloop.at"); } @@ -1463,34 +1600,27 @@ bool LoopConstrainer::run() { /// range, returns None. Optional<InductiveRangeCheck::Range> InductiveRangeCheck::computeSafeIterationSpace( - ScalarEvolution &SE, const SCEVAddRecExpr *IndVar) const { + ScalarEvolution &SE, const SCEVAddRecExpr *IndVar, + bool IsLatchSigned) const { // IndVar is of the form "A + B * I" (where "I" is the canonical induction // variable, that may or may not exist as a real llvm::Value in the loop) and // this inductive range check is a range check on the "C + D * I" ("C" is - // getOffset() and "D" is getScale()). We rewrite the value being range + // getBegin() and "D" is getStep()). We rewrite the value being range // checked to "M + N * IndVar" where "N" = "D * B^(-1)" and "M" = "C - NA". - // Currently we support this only for "B" = "D" = { 1 or -1 }, but the code - // can be generalized as needed. // // The actual inequalities we solve are of the form // // 0 <= M + 1 * IndVar < L given L >= 0 (i.e. N == 1) // - // The inequality is satisfied by -M <= IndVar < (L - M) [^1]. All additions - // and subtractions are twos-complement wrapping and comparisons are signed. - // - // Proof: - // - // If there exists IndVar such that -M <= IndVar < (L - M) then it follows - // that -M <= (-M + L) [== Eq. 1]. Since L >= 0, if (-M + L) sign-overflows - // then (-M + L) < (-M). Hence by [Eq. 1], (-M + L) could not have - // overflown. - // - // This means IndVar = t + (-M) for t in [0, L). Hence (IndVar + M) = t. - // Hence 0 <= (IndVar + M) < L - - // [^1]: Note that the solution does _not_ apply if L < 0; consider values M = - // 127, IndVar = 126 and L = -2 in an i8 world. + // Here L stands for upper limit of the safe iteration space. + // The inequality is satisfied by (0 - M) <= IndVar < (L - M). 
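+  // Worked instance (hedged): for a check on `i + 5` against `len` with
+  // IndVar = i, we get M = 5, so the raw solution is -5 <= i < len - 5
+  // before the clamping described below is applied.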
To avoid + overflows when calculating (0 - M) and (L - M), we limit the calculations + to the borders of the IV's iteration space, depending on whether that space + is signed or unsigned. For example, if IndVar is unsigned, (0 - M) overflows for any M > 0. + If we figured out that "anything greater than (-M) is safe", we strengthen + this to "everything greater than 0 is safe", assuming that values between + -M and 0 just do not exist in the unsigned iteration space, and we don't want + to deal with overflown values. if (!IndVar->isAffine()) return None; @@ -1499,42 +1629,89 @@ InductiveRangeCheck::computeSafeIterationSpace( const SCEVConstant *B = dyn_cast<SCEVConstant>(IndVar->getStepRecurrence(SE)); if (!B) return None; + assert(!B->isZero() && "Recurrence with zero step?"); - const SCEV *C = getOffset(); - const SCEVConstant *D = dyn_cast<SCEVConstant>(getScale()); + const SCEV *C = getBegin(); + const SCEVConstant *D = dyn_cast<SCEVConstant>(getStep()); if (D != B) return None; - ConstantInt *ConstD = D->getValue(); - if (!(ConstD->isMinusOne() || ConstD->isOne())) - return None; + assert(!D->getValue()->isZero() && "Recurrence with zero step?"); + unsigned BitWidth = cast<IntegerType>(IndVar->getType())->getBitWidth(); + const SCEV *SIntMax = SE.getConstant(APInt::getSignedMaxValue(BitWidth)); + // Subtract Y from X so that the result does not cross the border of the IV + // iteration space. Mathematically, it is equivalent to: + // + // ClampedSubtract(X, Y) = min(max(X - Y, INT_MIN), INT_MAX). [1] + // + // In [1], 'X - Y' is a mathematical subtraction (the result is not bounded + // by any bit width). But after we take min/max, the result is + // guaranteed to be within [INT_MIN, INT_MAX]. + // + // In [1], INT_MAX and INT_MIN are the signed or unsigned max/min values, + // depending on the type of latch condition that defines the IV iteration + // space. + auto ClampedSubtract = [&](const SCEV *X, const SCEV *Y) { + assert(SE.isKnownNonNegative(X) && + "We can only subtract from values in [0; SINT_MAX]!"); + if (IsLatchSigned) { + // X is a number from the signed range, Y is interpreted as signed. + // Even if Y is SINT_MAX, (X - Y) does not reach SINT_MIN. So the only + // thing we should care about is that we didn't cross SINT_MAX. + // So, if Y is positive, we subtract Y safely. + // Rule 1: Y > 0 ---> Y. + // If 0 <= -Y <= (SINT_MAX - X), we subtract Y safely. + // Rule 2: Y >=s (X - SINT_MAX) ---> Y. + // If 0 <= (SINT_MAX - X) < -Y, we can only subtract (X - SINT_MAX). + // Rule 3: Y <s (X - SINT_MAX) ---> (X - SINT_MAX). + // It gives us smax(Y, X - SINT_MAX) to subtract in all cases. + const SCEV *XMinusSIntMax = SE.getMinusSCEV(X, SIntMax); + return SE.getMinusSCEV(X, SE.getSMaxExpr(Y, XMinusSIntMax), + SCEV::FlagNSW); + } else + // X is a number from the unsigned range, Y is interpreted as signed. + // Even if Y is SINT_MIN, (X - Y) does not reach UINT_MAX. So the only + // thing we should care about is that we didn't cross zero. + // So, if Y is negative, we subtract Y safely. + // Rule 1: Y <s 0 ---> Y. + // If 0 <= Y <= X, we subtract Y safely. + // Rule 2: Y <=s X ---> Y. + // If 0 <= X < Y, we should stop at 0 and can only subtract X. + // Rule 3: Y >s X ---> X. + // It gives us smin(X, Y) to subtract in all cases.
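+      // Numeric check (hedged): X = 3, Y = 10 in this unsigned case gives
+      // smin(3, 10) = 3, so we subtract only 3 and the result stops at 0
+      // instead of wrapping around UINT_MAX.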
+ return SE.getMinusSCEV(X, SE.getSMinExpr(X, Y), SCEV::FlagNUW); + }; const SCEV *M = SE.getMinusSCEV(C, A); - - const SCEV *Begin = SE.getNegativeSCEV(M); - const SCEV *UpperLimit = nullptr; + const SCEV *Zero = SE.getZero(M->getType()); + const SCEV *Begin = ClampedSubtract(Zero, M); + const SCEV *L = nullptr; // We strengthen "0 <= I" to "0 <= I < INT_SMAX" and "I < L" to "0 <= I < L". // We can potentially do much better here. - if (Value *V = getLength()) { - UpperLimit = SE.getSCEV(V); - } else { + if (const SCEV *EndLimit = getEnd()) + L = EndLimit; + else { assert(Kind == InductiveRangeCheck::RANGE_CHECK_LOWER && "invariant!"); - unsigned BitWidth = cast<IntegerType>(IndVar->getType())->getBitWidth(); - UpperLimit = SE.getConstant(APInt::getSignedMaxValue(BitWidth)); + L = SIntMax; } - - const SCEV *End = SE.getMinusSCEV(UpperLimit, M); + const SCEV *End = ClampedSubtract(L, M); return InductiveRangeCheck::Range(Begin, End); } static Optional<InductiveRangeCheck::Range> -IntersectRange(ScalarEvolution &SE, - const Optional<InductiveRangeCheck::Range> &R1, - const InductiveRangeCheck::Range &R2) { +IntersectSignedRange(ScalarEvolution &SE, + const Optional<InductiveRangeCheck::Range> &R1, + const InductiveRangeCheck::Range &R2) { + if (R2.isEmpty(SE, /* IsSigned */ true)) + return None; if (!R1.hasValue()) return R2; auto &R1Value = R1.getValue(); + // We never return empty ranges from this function, and R1 is supposed to be + // a result of intersection. Thus, R1 is never empty. + assert(!R1Value.isEmpty(SE, /* IsSigned */ true) && + "We should never have empty R1!"); // TODO: we could widen the smaller range and have this work; but for now we // bail out to keep things simple. @@ -1544,7 +1721,40 @@ IntersectRange(ScalarEvolution &SE, const SCEV *NewBegin = SE.getSMaxExpr(R1Value.getBegin(), R2.getBegin()); const SCEV *NewEnd = SE.getSMinExpr(R1Value.getEnd(), R2.getEnd()); - return InductiveRangeCheck::Range(NewBegin, NewEnd); + // If the resulting range is empty, just return None. + auto Ret = InductiveRangeCheck::Range(NewBegin, NewEnd); + if (Ret.isEmpty(SE, /* IsSigned */ true)) + return None; + return Ret; +} + +static Optional<InductiveRangeCheck::Range> +IntersectUnsignedRange(ScalarEvolution &SE, + const Optional<InductiveRangeCheck::Range> &R1, + const InductiveRangeCheck::Range &R2) { + if (R2.isEmpty(SE, /* IsSigned */ false)) + return None; + if (!R1.hasValue()) + return R2; + auto &R1Value = R1.getValue(); + // We never return empty ranges from this function, and R1 is supposed to be + // a result of intersection. Thus, R1 is never empty. + assert(!R1Value.isEmpty(SE, /* IsSigned */ false) && + "We should never have empty R1!"); + + // TODO: we could widen the smaller range and have this work; but for now we + // bail out to keep things simple. + if (R1Value.getType() != R2.getType()) + return None; + + const SCEV *NewBegin = SE.getUMaxExpr(R1Value.getBegin(), R2.getBegin()); + const SCEV *NewEnd = SE.getUMinExpr(R1Value.getEnd(), R2.getEnd()); + + // If the resulting range is empty, just return None.
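+  // E.g. (hedged): intersecting [0, 10) with [5, 20) yields [umax(0, 5),
+  // umin(10, 20)) = [5, 10), while disjoint inputs such as [0, 5) and
+  // [7, 9) produce [7, 5), which isEmpty flags, so None is returned.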
+ auto Ret = InductiveRangeCheck::Range(NewBegin, NewEnd); + if (Ret.isEmpty(SE, /* IsSigned */ false)) + return None; + return Ret; } bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) { @@ -1598,24 +1808,31 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) { return false; } LoopStructure LS = MaybeLoopStructure.getValue(); - bool Increasing = LS.IndVarIncreasing; - const SCEV *MinusOne = - SE.getConstant(LS.IndVarNext->getType(), Increasing ? -1 : 1, true); const SCEVAddRecExpr *IndVar = - cast<SCEVAddRecExpr>(SE.getAddExpr(SE.getSCEV(LS.IndVarNext), MinusOne)); + cast<SCEVAddRecExpr>(SE.getMinusSCEV(SE.getSCEV(LS.IndVarBase), SE.getSCEV(LS.IndVarStep))); Optional<InductiveRangeCheck::Range> SafeIterRange; Instruction *ExprInsertPt = Preheader->getTerminator(); SmallVector<InductiveRangeCheck, 4> RangeChecksToEliminate; + // Basing on the type of latch predicate, we interpret the IV iteration range + // as signed or unsigned range. We use different min/max functions (signed or + // unsigned) when intersecting this range with safe iteration ranges implied + // by range checks. + auto IntersectRange = + LS.IsSignedPredicate ? IntersectSignedRange : IntersectUnsignedRange; IRBuilder<> B(ExprInsertPt); for (InductiveRangeCheck &IRC : RangeChecks) { - auto Result = IRC.computeSafeIterationSpace(SE, IndVar); + auto Result = IRC.computeSafeIterationSpace(SE, IndVar, + LS.IsSignedPredicate); if (Result.hasValue()) { auto MaybeSafeIterRange = IntersectRange(SE, SafeIterRange, Result.getValue()); if (MaybeSafeIterRange.hasValue()) { + assert( + !MaybeSafeIterRange.getValue().isEmpty(SE, LS.IsSignedPredicate) && + "We should never return empty ranges!"); RangeChecksToEliminate.push_back(IRC); SafeIterRange = MaybeSafeIterRange.getValue(); } diff --git a/lib/Transforms/Scalar/InferAddressSpaces.cpp b/lib/Transforms/Scalar/InferAddressSpaces.cpp index 89b28f0aeee6..7d66c0f73821 100644 --- a/lib/Transforms/Scalar/InferAddressSpaces.cpp +++ b/lib/Transforms/Scalar/InferAddressSpaces.cpp @@ -1,4 +1,4 @@ -//===-- NVPTXInferAddressSpace.cpp - ---------------------*- C++ -*-===// +//===- InferAddressSpace.cpp - --------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -89,26 +89,54 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ValueMapper.h" +#include <cassert> +#include 
<iterator> +#include <limits> +#include <utility> +#include <vector> #define DEBUG_TYPE "infer-address-spaces" using namespace llvm; +static const unsigned UninitializedAddressSpace = + std::numeric_limits<unsigned>::max(); + namespace { -static const unsigned UninitializedAddressSpace = ~0u; using ValueToAddrSpaceMapTy = DenseMap<const Value *, unsigned>; @@ -146,10 +174,9 @@ private: // Changes the flat address expressions in function F to point to specific // address spaces if InferredAddrSpace says so. Postorder is the postorder of // all flat expressions in the use-def graph of function F. - bool - rewriteWithNewAddressSpaces(ArrayRef<WeakTrackingVH> Postorder, - const ValueToAddrSpaceMapTy &InferredAddrSpace, - Function *F) const; + bool rewriteWithNewAddressSpaces( + const TargetTransformInfo &TTI, ArrayRef<WeakTrackingVH> Postorder, + const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) const; void appendsFlatAddressExpressionToPostorderStack( Value *V, std::vector<std::pair<Value *, bool>> &PostorderStack, @@ -170,13 +197,16 @@ private: SmallVectorImpl<const Use *> *UndefUsesToFix) const; unsigned joinAddressSpaces(unsigned AS1, unsigned AS2) const; }; + } // end anonymous namespace char InferAddressSpaces::ID = 0; namespace llvm { + void initializeInferAddressSpacesPass(PassRegistry &); -} + +} // end namespace llvm INITIALIZE_PASS(InferAddressSpaces, DEBUG_TYPE, "Infer address spaces", false, false) @@ -454,11 +484,10 @@ static Value *cloneInstructionWithNewAddressSpace( NewGEP->setIsInBounds(GEP->isInBounds()); return NewGEP; } - case Instruction::Select: { + case Instruction::Select: assert(I->getType()->isPointerTy()); return SelectInst::Create(I->getOperand(0), NewPointerOperands[1], NewPointerOperands[2], "", nullptr, I); - } default: llvm_unreachable("Unexpected opcode"); } @@ -600,7 +629,7 @@ bool InferAddressSpaces::runOnFunction(Function &F) { // Changes the address spaces of the flat address expressions who are inferred // to point to a specific address space. - return rewriteWithNewAddressSpaces(Postorder, InferredAddrSpace, &F); + return rewriteWithNewAddressSpaces(TTI, Postorder, InferredAddrSpace, &F); } // Constants need to be tracked through RAUW to handle cases with nested @@ -708,24 +737,32 @@ Optional<unsigned> InferAddressSpaces::updateAddressSpace( /// \p returns true if \p U is the pointer operand of a memory instruction with /// a single pointer operand that can have its address space changed by simply -/// mutating the use to a new value. -static bool isSimplePointerUseValidToReplace(Use &U) { +/// mutating the use to a new value. If the memory instruction is volatile, +/// return true only if the target allows the memory instruction to be volatile +/// in the new address space. 
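+/// For instance (a hedged example, not from this change): if the target
+/// reports via hasVolatileVariant that a volatile load is legal in the
+/// inferred address space, a `load volatile` whose pointer operand is
+/// rewritten into that space can now be accepted instead of rejected.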
+static bool isSimplePointerUseValidToReplace(const TargetTransformInfo &TTI, + Use &U, unsigned AddrSpace) { User *Inst = U.getUser(); unsigned OpNo = U.getOperandNo(); + bool VolatileIsAllowed = false; + if (auto *I = dyn_cast<Instruction>(Inst)) + VolatileIsAllowed = TTI.hasVolatileVariant(I, AddrSpace); if (auto *LI = dyn_cast<LoadInst>(Inst)) - return OpNo == LoadInst::getPointerOperandIndex() && !LI->isVolatile(); + return OpNo == LoadInst::getPointerOperandIndex() && + (VolatileIsAllowed || !LI->isVolatile()); if (auto *SI = dyn_cast<StoreInst>(Inst)) - return OpNo == StoreInst::getPointerOperandIndex() && !SI->isVolatile(); + return OpNo == StoreInst::getPointerOperandIndex() && + (VolatileIsAllowed || !SI->isVolatile()); if (auto *RMW = dyn_cast<AtomicRMWInst>(Inst)) - return OpNo == AtomicRMWInst::getPointerOperandIndex() && !RMW->isVolatile(); + return OpNo == AtomicRMWInst::getPointerOperandIndex() && + (VolatileIsAllowed || !RMW->isVolatile()); - if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) { + if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) return OpNo == AtomicCmpXchgInst::getPointerOperandIndex() && - !CmpX->isVolatile(); - } + (VolatileIsAllowed || !CmpX->isVolatile()); return false; } @@ -818,7 +855,7 @@ static Value::use_iterator skipToNextUser(Value::use_iterator I, } bool InferAddressSpaces::rewriteWithNewAddressSpaces( - ArrayRef<WeakTrackingVH> Postorder, + const TargetTransformInfo &TTI, ArrayRef<WeakTrackingVH> Postorder, const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) const { // For each address expression to be modified, creates a clone of it with its // pointer operands converted to the new address space. Since the pointer @@ -878,7 +915,8 @@ bool InferAddressSpaces::rewriteWithNewAddressSpaces( // to the next instruction. I = skipToNextUser(I, E); - if (isSimplePointerUseValidToReplace(U)) { + if (isSimplePointerUseValidToReplace( + TTI, U, V->getType()->getPointerAddressSpace())) { // If V is used as the pointer operand of a compatible memory operation, // sets the pointer operand to NewV. This replacement does not change // the element type, so the resultant load/store is still valid. 
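      // A hedged IR-level illustration (address space numbers invented):
      //   %v = load float, float addrspace(4)* %p    ; flat address space
      // becomes, once %p is inferred to live in address space 3,
      //   %v = load float, float addrspace(3)* %p.new
      // Only the pointer operand is mutated; the loaded type stays float.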
@@ -933,6 +971,11 @@ bool InferAddressSpaces::rewriteWithNewAddressSpaces( if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(CurUser)) { unsigned NewAS = NewV->getType()->getPointerAddressSpace(); if (ASC->getDestAddressSpace() == NewAS) { + if (ASC->getType()->getPointerElementType() != + NewV->getType()->getPointerElementType()) { + NewV = CastInst::Create(Instruction::BitCast, NewV, + ASC->getType(), "", ASC); + } ASC->replaceAllUsesWith(NewV); DeadInstructions.push_back(ASC); continue; diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index dc9143bebc45..6b0377e0ecb3 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -14,25 +14,50 @@ #include "llvm/Transforms/Scalar/JumpThreading.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/BlockFrequencyInfoImpl.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LazyValueInfo.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/ConstantRange.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" #include "llvm/Pass.h" +#include "llvm/Support/BlockFrequency.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -41,8 +66,15 @@ #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" +#include "llvm/Transforms/Utils/ValueMapper.h" #include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <iterator> #include <memory> +#include <utility> + using namespace llvm; using namespace jumpthreading; @@ -70,6 +102,7 @@ static cl::opt<bool> PrintLVIAfterJumpThreading( cl::Hidden); namespace { + /// This pass performs 'jump threading', which looks at blocks that have /// multiple predecessors and multiple successors. If one or more of the /// predecessors of the block can be proven to always jump to one of the @@ -85,12 +118,12 @@ namespace { /// /// In this case, the unconditional branch at the end of the first if can be /// revectored to the false side of the second if. 
- /// class JumpThreading : public FunctionPass { JumpThreadingPass Impl; public: static char ID; // Pass identification + JumpThreading(int T = -1) : FunctionPass(ID), Impl(T) { initializeJumpThreadingPass(*PassRegistry::getPassRegistry()); } @@ -108,9 +141,11 @@ namespace { void releaseMemory() override { Impl.releaseMemory(); } }; -} + +} // end anonymous namespace char JumpThreading::ID = 0; + INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading", "Jump Threading", false, false) INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass) @@ -120,14 +155,125 @@ INITIALIZE_PASS_END(JumpThreading, "jump-threading", "Jump Threading", false, false) // Public interface to the Jump Threading pass -FunctionPass *llvm::createJumpThreadingPass(int Threshold) { return new JumpThreading(Threshold); } +FunctionPass *llvm::createJumpThreadingPass(int Threshold) { + return new JumpThreading(Threshold); +} JumpThreadingPass::JumpThreadingPass(int T) { BBDupThreshold = (T == -1) ? BBDuplicateThreshold : unsigned(T); } -/// runOnFunction - Top level algorithm. -/// +// Update branch probability information according to conditional +// branch probability. This is usually made possible for cloned branches +// in inline instances by the context-specific profile in the caller. +// For instance, +// +// [Block PredBB] +// [Branch PredBr] +// if (t) { +// Block A; +// } else { +// Block B; +// } +// +// [Block BB] +// cond = PN([true, %A], [..., %B]); // PHI node +// [Branch CondBr] +// if (cond) { +// ... // P(cond == true) = 1% +// } +// +// Here we know that when block A is taken, cond must be true, which means +// P(cond == true | A) = 1 +// +// Given that P(cond == true) = P(cond == true | A) * P(A) + +// P(cond == true | B) * P(B) +// we get: +// P(cond == true) = P(A) + P(cond == true | B) * P(B) +// +// which gives us: +// P(A) is at most P(cond == true), i.e. +// P(t == true) <= P(cond == true) +// +// In other words, if we know P(cond == true) is unlikely, we know +// that P(t == true) is also unlikely. +// +static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) { + BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator()); + if (!CondBr) + return; + + BranchProbability BP; + uint64_t TrueWeight, FalseWeight; + if (!CondBr->extractProfMetadata(TrueWeight, FalseWeight)) + return; + + // Returns the outgoing edge of the dominating predecessor block + // that leads to the PhiNode's incoming block: + auto GetPredOutEdge = + [](BasicBlock *IncomingBB, + BasicBlock *PhiBB) -> std::pair<BasicBlock *, BasicBlock *> { + auto *PredBB = IncomingBB; + auto *SuccBB = PhiBB; + while (true) { + BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator()); + if (PredBr && PredBr->isConditional()) + return {PredBB, SuccBB}; + auto *SinglePredBB = PredBB->getSinglePredecessor(); + if (!SinglePredBB) + return {nullptr, nullptr}; + SuccBB = PredBB; + PredBB = SinglePredBB; + } + }; + + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + Value *PhiOpnd = PN->getIncomingValue(i); + ConstantInt *CI = dyn_cast<ConstantInt>(PhiOpnd); + + if (!CI || !CI->getType()->isIntegerTy(1)) + continue; + + BP = (CI->isOne() ? 
BranchProbability::getBranchProbability( TrueWeight, TrueWeight + FalseWeight) : BranchProbability::getBranchProbability( FalseWeight, TrueWeight + FalseWeight)); + + auto PredOutEdge = GetPredOutEdge(PN->getIncomingBlock(i), BB); + if (!PredOutEdge.first) + return; + + BasicBlock *PredBB = PredOutEdge.first; + BranchInst *PredBr = cast<BranchInst>(PredBB->getTerminator()); + + uint64_t PredTrueWeight, PredFalseWeight; + // FIXME: We currently only set the profile data when it is missing. + // With PGO, this can be used to refine even existing profile data with + // context information. This needs to be done after more performance + // testing. + if (PredBr->extractProfMetadata(PredTrueWeight, PredFalseWeight)) + continue; + + // We cannot infer anything useful when BP >= 50%, because BP is the + // upper bound probability value. + if (BP >= BranchProbability(50, 100)) + continue; + + SmallVector<uint32_t, 2> Weights; + if (PredBr->getSuccessor(0) == PredOutEdge.second) { + Weights.push_back(BP.getNumerator()); + Weights.push_back(BP.getCompl().getNumerator()); + } else { + Weights.push_back(BP.getCompl().getNumerator()); + Weights.push_back(BP.getNumerator()); + } + PredBr->setMetadata(LLVMContext::MD_prof, + MDBuilder(PredBr->getParent()->getContext()) + .createBranchWeights(Weights)); + } +} + +/// runOnFunction - Top-level algorithm. bool JumpThreading::runOnFunction(Function &F) { if (skipFunction(F)) return false; @@ -155,7 +301,6 @@ bool JumpThreading::runOnFunction(Function &F) { PreservedAnalyses JumpThreadingPass::run(Function &F, FunctionAnalysisManager &AM) { - auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); auto &LVI = AM.getResult<LazyValueAnalysis>(F); auto &AA = AM.getResult<AAManager>(F); @@ -184,7 +329,6 @@ bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_, bool HasProfileData_, std::unique_ptr<BlockFrequencyInfo> BFI_, std::unique_ptr<BranchProbabilityInfo> BPI_) { - DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n"); TLI = TLI_; LVI = LVI_; @@ -384,7 +528,6 @@ static unsigned getJumpThreadDuplicationCost(BasicBlock *BB, /// within the loop (forming a nested loop). This simple analysis is not rich /// enough to track all of these properties and keep it up-to-date as the CFG /// mutates, so we don't allow any of these transformations. -/// void JumpThreadingPass::FindLoopHeaders(Function &F) { SmallVector<std::pair<const BasicBlock*,const BasicBlock*>, 32> Edges; FindFunctionBackedges(F, Edges); @@ -418,7 +561,6 @@ static Constant *getKnownConstant(Value *Val, ConstantPreference Preference) { /// BB in the result vector. /// /// This returns true if there were any known values. -/// bool JumpThreadingPass::ComputeValueKnownInPredecessors( Value *V, BasicBlock *BB, PredValueInfo &Result, ConstantPreference Preference, Instruction *CxtI) { @@ -507,8 +649,6 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors( return true; } - PredValueInfoTy LHSVals, RHSVals; - // Handle some boolean conditions.
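  // For example (hedged): given `%c = and i1 %a, %b`, a predecessor edge on
  // which %a is known to be false lets us record %c == false for that edge
  // without evaluating %b at all.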
if (I->getType()->getPrimitiveSizeInBits() == 1) { assert(Preference == WantInteger && "One-bit non-integer type?"); @@ -516,6 +656,8 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors( // X & false -> false if (I->getOpcode() == Instruction::Or || I->getOpcode() == Instruction::And) { + PredValueInfoTy LHSVals, RHSVals; + ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals, WantInteger, CxtI); ComputeValueKnownInPredecessors(I->getOperand(1), BB, RHSVals, @@ -655,6 +797,7 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors( // x as a live-in. { using namespace PatternMatch; + Value *AddLHS; ConstantInt *AddConst; if (isa<ConstantInt>(CmpConst) && @@ -751,14 +894,11 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors( return !Result.empty(); } - - /// GetBestDestForBranchOnUndef - If we determine that the specified block ends /// in an undefined jump, decide which block is best to revector to. /// /// Since we can pick an arbitrary destination, we pick the successor with the /// fewest predecessors. This should reduce the in-degree of the others. -/// static unsigned GetBestDestForJumpOnUndef(BasicBlock *BB) { TerminatorInst *BBTerm = BB->getTerminator(); unsigned MinSucc = 0; @@ -979,7 +1119,6 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { // for loads that are used by a switch or by the condition for the branch. If // we see one, check to see if it's partially redundant. If so, insert a PHI // which can then be used to thread the values. - // Value *SimplifyValue = CondInst; if (CmpInst *CondCmp = dyn_cast<CmpInst>(SimplifyValue)) if (isa<Constant>(CondCmp->getOperand(1))) @@ -991,10 +1130,14 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { if (SimplifyPartiallyRedundantLoad(LI)) return true; + // Before threading, try to propagate profile data backwards: + if (PHINode *PN = dyn_cast<PHINode>(CondInst)) + if (PN->getParent() == BB && isa<BranchInst>(BB->getTerminator())) + updatePredecessorProfileMetadata(PN, BB); + // Handle a variety of cases where we are branching on something derived from // a PHI node in the current block. If we can prove that any predecessors // compute a predictable value based on a PHI node, thread those predecessors. - // if (ProcessThreadableEdges(CondInst, BB, Preference, Terminator)) return true; @@ -1036,9 +1179,9 @@ bool JumpThreadingPass::ProcessImpliedCondition(BasicBlock *BB) { if (PBI->getSuccessor(0) != CurrentBB && PBI->getSuccessor(1) != CurrentBB) return false; - bool FalseDest = PBI->getSuccessor(1) == CurrentBB; + bool CondIsTrue = PBI->getSuccessor(0) == CurrentBB; Optional<bool> Implication = - isImpliedCondition(PBI->getCondition(), Cond, DL, FalseDest); + isImpliedCondition(PBI->getCondition(), Cond, DL, CondIsTrue); if (Implication) { BI->getSuccessor(*Implication ? 1 : 0)->removePredecessor(BB); BranchInst::Create(BI->getSuccessor(*Implication ? 0 : 1), BI); @@ -1124,7 +1267,9 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) { LI->getAAMetadata(AATags); SmallPtrSet<BasicBlock*, 8> PredsScanned; - typedef SmallVector<std::pair<BasicBlock*, Value*>, 8> AvailablePredsTy; + + using AvailablePredsTy = SmallVector<std::pair<BasicBlock *, Value *>, 8>; + AvailablePredsTy AvailablePreds; BasicBlock *OneUnavailablePred = nullptr; SmallVector<LoadInst*, 8> CSELoads; @@ -1283,8 +1428,8 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) { /// the list. 
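/// Illustrative example (not from this patch): given PredToDestList =
/// {(P1, D1), (P2, D1), (P3, D2)}, D1 is returned, since threading toward
/// the most popular destination untangles the largest number of
/// predecessors in one step.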
static BasicBlock * FindMostPopularDest(BasicBlock *BB, - const SmallVectorImpl<std::pair<BasicBlock*, - BasicBlock*> > &PredToDestList) { + const SmallVectorImpl<std::pair<BasicBlock *, + BasicBlock *>> &PredToDestList) { assert(!PredToDestList.empty()); // Determine popularity. If there are multiple possible destinations, we @@ -1502,7 +1647,6 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, /// ProcessBranchOnPHI - We have an otherwise unthreadable conditional branch on /// a PHI node in the current block. See if there are any simplifications we /// can do based on inputs to the phi node. -/// bool JumpThreadingPass::ProcessBranchOnPHI(PHINode *PN) { BasicBlock *BB = PN->getParent(); @@ -1532,7 +1676,6 @@ bool JumpThreadingPass::ProcessBranchOnPHI(PHINode *PN) { /// ProcessBranchOnXOR - We have an otherwise unthreadable conditional branch on /// a xor instruction in the current block. See if there are any /// simplifications we can do based on inputs to the xor. -/// bool JumpThreadingPass::ProcessBranchOnXOR(BinaryOperator *BO) { BasicBlock *BB = BO->getParent(); @@ -1637,7 +1780,6 @@ bool JumpThreadingPass::ProcessBranchOnXOR(BinaryOperator *BO) { return DuplicateCondBranchOnPHIIntoPred(BB, BlocksToFoldInto); } - /// AddPHINodeEntriesForMappedBlock - We're adding 'NewPred' as a new /// predecessor to the PHIBB block. If it has PHI nodes, add entries for /// NewPred using the entries from OldPred (suitably mapped). @@ -1677,10 +1819,15 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, // If threading this would thread across a loop header, don't thread the edge. // See the comments above FindLoopHeaders for justifications and caveats. - if (LoopHeaders.count(BB)) { - DEBUG(dbgs() << " Not threading across loop header BB '" << BB->getName() - << "' to dest BB '" << SuccBB->getName() - << "' - it might create an irreducible loop!\n"); + if (LoopHeaders.count(BB) || LoopHeaders.count(SuccBB)) { + DEBUG({ + bool BBIsHeader = LoopHeaders.count(BB); + bool SuccIsHeader = LoopHeaders.count(SuccBB); + dbgs() << " Not threading across " + << (BBIsHeader ? "loop header BB '" : "block BB '") << BB->getName() + << "' to dest " << (SuccIsHeader ? "loop header BB '" : "block BB '") + << SuccBB->getName() << "' - it might create an irreducible loop!\n"; + }); return false; } @@ -1795,7 +1942,6 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, DEBUG(dbgs() << "\n"); } - // Ok, NewBB is good to go. Update the terminator of PredBB to jump to // NewBB instead of BB. This eliminates predecessors from BB, which requires // us to simplify any PHI nodes in BB. @@ -2194,7 +2340,7 @@ bool JumpThreadingPass::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) { /// %p = phi [0, %bb1], [1, %bb2], [0, %bb3], [1, %bb4], ... /// %c = cmp %p, 0 /// %s = select %c, trueval, falseval -// +/// /// And expand the select into a branch structure. This later enables /// jump-threading over bb in this pass. /// @@ -2280,6 +2426,7 @@ bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) { /// guard is then threaded to one of them. bool JumpThreadingPass::ProcessGuards(BasicBlock *BB) { using namespace PatternMatch; + // We only want to deal with two predecessors. BasicBlock *Pred1, *Pred2; auto PI = pred_begin(BB), PE = pred_end(BB); @@ -2331,8 +2478,7 @@ bool JumpThreadingPass::ThreadGuard(BasicBlock *BB, IntrinsicInst *Guard, TrueDestIsSafe = true; else { // False dest is safe if !BranchCond => GuardCond. 
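      // Illustrative example (not from this patch): with BranchCond (x == 0)
      // and GuardCond (x != 0), knowing BranchCond is false on this edge
      // yields exactly x != 0, so the implication holds and the guard can be
      // threaded to the false destination.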
- Impl = - isImpliedCondition(BranchCond, GuardCond, DL, /* InvertAPred */ true); + Impl = isImpliedCondition(BranchCond, GuardCond, DL, /* LHSIsTrue */ false); if (Impl && *Impl) FalseDestIsSafe = true; } diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index 37b9c4b1094e..4ea935793b80 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -42,7 +42,8 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/MemoryBuiltins.h" -#include "llvm/Analysis/OptimizationDiagnosticInfo.h" +#include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -62,6 +63,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -88,15 +90,15 @@ static cl::opt<uint32_t> MaxNumUsesTraversed( "invariance in loop using invariant start (default = 8)")); static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI); -static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop, - const LoopSafetyInfo *SafetyInfo); +static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop, + const LoopSafetyInfo *SafetyInfo, + TargetTransformInfo *TTI, bool &FreeInLoop); static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo, OptimizationRemarkEmitter *ORE); -static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT, - const Loop *CurLoop, AliasSetTracker *CurAST, - const LoopSafetyInfo *SafetyInfo, - OptimizationRemarkEmitter *ORE); +static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, + const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo, + OptimizationRemarkEmitter *ORE, bool FreeInLoop); static bool isSafeToExecuteUnconditionally(Instruction &Inst, const DominatorTree *DT, const Loop *CurLoop, @@ -114,7 +116,8 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN, namespace { struct LoopInvariantCodeMotion { bool runOnLoop(Loop *L, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT, - TargetLibraryInfo *TLI, ScalarEvolution *SE, + TargetLibraryInfo *TLI, TargetTransformInfo *TTI, + ScalarEvolution *SE, MemorySSA *MSSA, OptimizationRemarkEmitter *ORE, bool DeleteAST); DenseMap<Loop *, AliasSetTracker *> &getLoopToAliasSetMap() { @@ -146,6 +149,9 @@ struct LegacyLICMPass : public LoopPass { } auto *SE = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); + MemorySSA *MSSA = EnableMSSALoopDependency + ? (&getAnalysis<MemorySSAWrapperPass>().getMSSA()) + : nullptr; // For the old PM, we can't use OptimizationRemarkEmitter as an analysis // pass. Function analyses need to be preserved across loop transformations // but ORE cannot be preserved (see comment before the pass definition). @@ -155,7 +161,9 @@ struct LegacyLICMPass : public LoopPass { &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(), &getAnalysis<DominatorTreeWrapperPass>().getDomTree(), &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(), - SE ? &SE->getSE() : nullptr, &ORE, false); + &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( + *L->getHeader()->getParent()), + SE ? 
&SE->getSE() : nullptr, MSSA, &ORE, false); } /// This transformation requires natural loop information & requires that @@ -164,6 +172,9 @@ struct LegacyLICMPass : public LoopPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired<TargetLibraryInfoWrapperPass>(); + if (EnableMSSALoopDependency) + AU.addRequired<MemorySSAWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); getLoopAnalysisUsage(AU); } @@ -189,7 +200,7 @@ private: /// Simple Analysis hook. Delete loop L from alias set map. void deleteAnalysisLoop(Loop *L) override; }; -} +} // namespace PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &) { @@ -204,7 +215,8 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM, "cached at a higher level"); LoopInvariantCodeMotion LICM; - if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, &AR.TLI, &AR.SE, ORE, true)) + if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, &AR.TLI, &AR.TTI, &AR.SE, + AR.MSSA, ORE, true)) return PreservedAnalyses::all(); auto PA = getLoopPassPreservedAnalyses(); @@ -217,6 +229,8 @@ INITIALIZE_PASS_BEGIN(LegacyLICMPass, "licm", "Loop Invariant Code Motion", false, false) INITIALIZE_PASS_DEPENDENCY(LoopPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) INITIALIZE_PASS_END(LegacyLICMPass, "licm", "Loop Invariant Code Motion", false, false) @@ -228,12 +242,10 @@ Pass *llvm::createLICMPass() { return new LegacyLICMPass(); } /// We should delete AST for inner loops in the new pass manager to avoid /// memory leak. /// -bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA, - LoopInfo *LI, DominatorTree *DT, - TargetLibraryInfo *TLI, - ScalarEvolution *SE, - OptimizationRemarkEmitter *ORE, - bool DeleteAST) { +bool LoopInvariantCodeMotion::runOnLoop( + Loop *L, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT, + TargetLibraryInfo *TLI, TargetTransformInfo *TTI, ScalarEvolution *SE, + MemorySSA *MSSA, OptimizationRemarkEmitter *ORE, bool DeleteAST) { bool Changed = false; assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form."); @@ -258,7 +270,7 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA, // instructions, we perform another pass to hoist them out of the loop. // if (L->hasDedicatedExits()) - Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L, + Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, TTI, L, CurAST, &SafetyInfo, ORE); if (Preheader) Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L, @@ -292,10 +304,26 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA, bool Promoted = false; // Loop over all of the alias sets in the tracker object. - for (AliasSet &AS : *CurAST) - Promoted |= - promoteLoopAccessesToScalars(AS, ExitBlocks, InsertPts, PIC, LI, DT, - TLI, L, CurAST, &SafetyInfo, ORE); + for (AliasSet &AS : *CurAST) { + // We can promote this alias set if it has a store, if it is a "Must" + // alias set, if the pointer is loop invariant, and if we are not + // eliminating any volatile loads or stores. 
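    // Illustrative example (not from this patch): in
    //   for (i = 0; i != n; ++i) *p += a[i];
    // the alias set of p is stored to, must-alias, loop invariant, and
    // non-volatile, so the checks below admit it and *p can be kept in a
    // register for the whole loop.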
+ if (AS.isForwardingAliasSet() || !AS.isMod() || !AS.isMustAlias() || + AS.isVolatile() || !L->isLoopInvariant(AS.begin()->getValue())) + continue; + + assert( + !AS.empty() && + "Must alias set should have at least one pointer element in it!"); + + SmallSetVector<Value *, 8> PointerMustAliases; + for (const auto &ASI : AS) + PointerMustAliases.insert(ASI.getValue()); + + Promoted |= promoteLoopAccessesToScalars(PointerMustAliases, ExitBlocks, + InsertPts, PIC, LI, DT, TLI, L, + CurAST, &SafetyInfo, ORE); + } // Once we have promoted values across the loop body we have to // recursively reform LCSSA as any nested loop may now have values defined @@ -335,7 +363,8 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA, /// definitions, allowing us to sink a loop body in one pass without iteration. /// bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, - DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop, + DominatorTree *DT, TargetLibraryInfo *TLI, + TargetTransformInfo *TTI, Loop *CurLoop, AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo, OptimizationRemarkEmitter *ORE) { @@ -344,46 +373,50 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, CurLoop != nullptr && CurAST != nullptr && SafetyInfo != nullptr && "Unexpected input to sinkRegion"); - BasicBlock *BB = N->getBlock(); - // If this subregion is not in the top level loop at all, exit. - if (!CurLoop->contains(BB)) - return false; + // We want to visit children before parents. We will enque all the parents + // before their children in the worklist and process the worklist in reverse + // order. + SmallVector<DomTreeNode *, 16> Worklist = collectChildrenInLoop(N, CurLoop); - // We are processing blocks in reverse dfo, so process children first. bool Changed = false; - const std::vector<DomTreeNode *> &Children = N->getChildren(); - for (DomTreeNode *Child : Children) - Changed |= - sinkRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo, ORE); - - // Only need to process the contents of this block if it is not part of a - // subloop (which would already have been processed). - if (inSubLoop(BB, CurLoop, LI)) - return Changed; + for (DomTreeNode *DTN : reverse(Worklist)) { + BasicBlock *BB = DTN->getBlock(); + // Only need to process the contents of this block if it is not part of a + // subloop (which would already have been processed). + if (inSubLoop(BB, CurLoop, LI)) + continue; - for (BasicBlock::iterator II = BB->end(); II != BB->begin();) { - Instruction &I = *--II; + for (BasicBlock::iterator II = BB->end(); II != BB->begin();) { + Instruction &I = *--II; - // If the instruction is dead, we would try to sink it because it isn't used - // in the loop, instead, just delete it. - if (isInstructionTriviallyDead(&I, TLI)) { - DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n'); - ++II; - CurAST->deleteValue(&I); - I.eraseFromParent(); - Changed = true; - continue; - } + // If the instruction is dead, we would try to sink it because it isn't + // used in the loop, instead, just delete it. + if (isInstructionTriviallyDead(&I, TLI)) { + DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n'); + ++II; + CurAST->deleteValue(&I); + I.eraseFromParent(); + Changed = true; + continue; + } - // Check to see if we can sink this instruction to the exit blocks - // of the loop. We can do this if the all users of the instruction are - // outside of the loop. In this case, it doesn't even matter if the - // operands of the instruction are loop invariant. 
- // - if (isNotUsedInLoop(I, CurLoop, SafetyInfo) && - canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE)) { - ++II; - Changed |= sink(I, LI, DT, CurLoop, CurAST, SafetyInfo, ORE); + // Check to see if we can sink this instruction to the exit blocks + // of the loop. We can do this if the all users of the instruction are + // outside of the loop. In this case, it doesn't even matter if the + // operands of the instruction are loop invariant. + // + bool FreeInLoop = false; + if (isNotUsedOrFreeInLoop(I, CurLoop, SafetyInfo, TTI, FreeInLoop) && + canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE)) { + if (sink(I, LI, DT, CurLoop, SafetyInfo, ORE, FreeInLoop)) { + if (!FreeInLoop) { + ++II; + CurAST->deleteValue(&I); + I.eraseFromParent(); + } + Changed = true; + } + } } } return Changed; @@ -403,73 +436,70 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, CurLoop != nullptr && CurAST != nullptr && SafetyInfo != nullptr && "Unexpected input to hoistRegion"); - BasicBlock *BB = N->getBlock(); - - // If this subregion is not in the top level loop at all, exit. - if (!CurLoop->contains(BB)) - return false; + // We want to visit parents before children. We will enque all the parents + // before their children in the worklist and process the worklist in order. + SmallVector<DomTreeNode *, 16> Worklist = collectChildrenInLoop(N, CurLoop); - // Only need to process the contents of this block if it is not part of a - // subloop (which would already have been processed). bool Changed = false; - if (!inSubLoop(BB, CurLoop, LI)) - for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) { - Instruction &I = *II++; - // Try constant folding this instruction. If all the operands are - // constants, it is technically hoistable, but it would be better to just - // fold it. - if (Constant *C = ConstantFoldInstruction( - &I, I.getModule()->getDataLayout(), TLI)) { - DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C << '\n'); - CurAST->copyValue(&I, C); - I.replaceAllUsesWith(C); - if (isInstructionTriviallyDead(&I, TLI)) { - CurAST->deleteValue(&I); - I.eraseFromParent(); + for (DomTreeNode *DTN : Worklist) { + BasicBlock *BB = DTN->getBlock(); + // Only need to process the contents of this block if it is not part of a + // subloop (which would already have been processed). + if (!inSubLoop(BB, CurLoop, LI)) + for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) { + Instruction &I = *II++; + // Try constant folding this instruction. If all the operands are + // constants, it is technically hoistable, but it would be better to + // just fold it. + if (Constant *C = ConstantFoldInstruction( + &I, I.getModule()->getDataLayout(), TLI)) { + DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C << '\n'); + CurAST->copyValue(&I, C); + I.replaceAllUsesWith(C); + if (isInstructionTriviallyDead(&I, TLI)) { + CurAST->deleteValue(&I); + I.eraseFromParent(); + } + Changed = true; + continue; } - Changed = true; - continue; - } - // Attempt to remove floating point division out of the loop by converting - // it to a reciprocal multiplication. 
- if (I.getOpcode() == Instruction::FDiv && - CurLoop->isLoopInvariant(I.getOperand(1)) && - I.hasAllowReciprocal()) { - auto Divisor = I.getOperand(1); - auto One = llvm::ConstantFP::get(Divisor->getType(), 1.0); - auto ReciprocalDivisor = BinaryOperator::CreateFDiv(One, Divisor); - ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags()); - ReciprocalDivisor->insertBefore(&I); + // Attempt to remove floating point division out of the loop by + // converting it to a reciprocal multiplication. + if (I.getOpcode() == Instruction::FDiv && + CurLoop->isLoopInvariant(I.getOperand(1)) && + I.hasAllowReciprocal()) { + auto Divisor = I.getOperand(1); + auto One = llvm::ConstantFP::get(Divisor->getType(), 1.0); + auto ReciprocalDivisor = BinaryOperator::CreateFDiv(One, Divisor); + ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags()); + ReciprocalDivisor->insertBefore(&I); - auto Product = BinaryOperator::CreateFMul(I.getOperand(0), - ReciprocalDivisor); - Product->setFastMathFlags(I.getFastMathFlags()); - Product->insertAfter(&I); - I.replaceAllUsesWith(Product); - I.eraseFromParent(); + auto Product = + BinaryOperator::CreateFMul(I.getOperand(0), ReciprocalDivisor); + Product->setFastMathFlags(I.getFastMathFlags()); + Product->insertAfter(&I); + I.replaceAllUsesWith(Product); + I.eraseFromParent(); - hoist(*ReciprocalDivisor, DT, CurLoop, SafetyInfo, ORE); - Changed = true; - continue; - } + hoist(*ReciprocalDivisor, DT, CurLoop, SafetyInfo, ORE); + Changed = true; + continue; + } - // Try hoisting the instruction out to the preheader. We can only do this - // if all of the operands of the instruction are loop invariant and if it - // is safe to hoist the instruction. - // - if (CurLoop->hasLoopInvariantOperands(&I) && - canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE) && - isSafeToExecuteUnconditionally( - I, DT, CurLoop, SafetyInfo, ORE, - CurLoop->getLoopPreheader()->getTerminator())) - Changed |= hoist(I, DT, CurLoop, SafetyInfo, ORE); - } + // Try hoisting the instruction out to the preheader. We can only do + // this if all of the operands of the instruction are loop invariant and + // if it is safe to hoist the instruction. + // + if (CurLoop->hasLoopInvariantOperands(&I) && + canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE) && + isSafeToExecuteUnconditionally( + I, DT, CurLoop, SafetyInfo, ORE, + CurLoop->getLoopPreheader()->getTerminator())) + Changed |= hoist(I, DT, CurLoop, SafetyInfo, ORE); + } + } - const std::vector<DomTreeNode *> &Children = N->getChildren(); - for (DomTreeNode *Child : Children) - Changed |= - hoistRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo, ORE); return Changed; } @@ -492,7 +522,8 @@ void llvm::computeLoopSafetyInfo(LoopSafetyInfo *SafetyInfo, Loop *CurLoop) { // Iterate over loop instructions and compute safety info. // Skip header as it has been computed and stored in HeaderMayThrow. // The first block in loopinfo.Blocks is guaranteed to be the header. - assert(Header == *CurLoop->getBlocks().begin() && "First block must be header"); + assert(Header == *CurLoop->getBlocks().begin() && + "First block must be header"); for (Loop::block_iterator BB = std::next(CurLoop->block_begin()), BBE = CurLoop->block_end(); (BB != BBE) && !SafetyInfo->MayThrow; ++BB) @@ -510,9 +541,9 @@ void llvm::computeLoopSafetyInfo(LoopSafetyInfo *SafetyInfo, Loop *CurLoop) { } // Return true if LI is invariant within scope of the loop. 
LI is invariant if -// CurLoop is dominated by an invariant.start representing the same memory location -// and size as the memory location LI loads from, and also the invariant.start -// has no uses. +// CurLoop is dominated by an invariant.start representing the same memory +// location and size as the memory location LI loads from, and also the +// invariant.start has no uses. static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT, Loop *CurLoop) { Value *Addr = LI->getOperand(0); @@ -566,10 +597,13 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, Loop *CurLoop, AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo, OptimizationRemarkEmitter *ORE) { + // SafetyInfo is nullptr if we are checking for sinking from preheader to + // loop body. + const bool SinkingToLoopBody = !SafetyInfo; // Loads have extra constraints we have to verify before we can hoist them. if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { if (!LI->isUnordered()) - return false; // Don't hoist volatile/atomic loads! + return false; // Don't sink/hoist volatile or ordered atomic loads! // Loads from constant memory are always safe to move, even if they end up // in the same alias set as something that ends up being modified. @@ -578,6 +612,9 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, if (LI->getMetadata(LLVMContext::MD_invariant_load)) return true; + if (LI->isAtomic() && SinkingToLoopBody) + return false; // Don't sink unordered atomic loads to loop body. + // This checks for an invariant.start dominating the load. if (isLoadInvariantInLoop(LI, DT, CurLoop)) return true; @@ -595,10 +632,12 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, // Check loop-invariant address because this may also be a sinkable load // whose address is not necessarily loop-invariant. if (ORE && Invalidated && CurLoop->isLoopInvariant(LI->getPointerOperand())) - ORE->emit(OptimizationRemarkMissed( - DEBUG_TYPE, "LoadWithLoopInvariantAddressInvalidated", LI) - << "failed to move load with loop-invariant address " - "because the loop may invalidate its value"); + ORE->emit([&]() { + return OptimizationRemarkMissed( + DEBUG_TYPE, "LoadWithLoopInvariantAddressInvalidated", LI) + << "failed to move load with loop-invariant address " + "because the loop may invalidate its value"; + }); return !Invalidated; } else if (CallInst *CI = dyn_cast<CallInst>(&I)) { @@ -653,9 +692,9 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, !isa<InsertValueInst>(I)) return false; - // SafetyInfo is nullptr if we are checking for sinking from preheader to - // loop body. It will be always safe as there is no speculative execution. - if (!SafetyInfo) + // If we are checking for sinking from preheader to loop body it will be + // always safe as there is no speculative execution. + if (SinkingToLoopBody) return true; // TODO: Plumb the context instruction through to make hoisting and sinking @@ -677,13 +716,40 @@ static bool isTriviallyReplacablePHI(const PHINode &PN, const Instruction &I) { return true; } +/// Return true if the instruction is free in the loop. 
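/// Illustrative example (not from this patch): a GEP whose TTI cost is
/// TCC_Free and whose only in-loop users are loads and stores in its own
/// block will fold into their addressing modes, so duplicating it costs
/// nothing even if one copy stays behind in the loop.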
+static bool isFreeInLoop(const Instruction &I, const Loop *CurLoop,
+                         const TargetTransformInfo *TTI) {
+
+  if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+    if (TTI->getUserCost(GEP) != TargetTransformInfo::TCC_Free)
+      return false;
+    // For a GEP, we cannot simply use getUserCost because currently it
+    // optimistically assumes that a GEP will fold into addressing mode
+    // regardless of its users.
+    const BasicBlock *BB = GEP->getParent();
+    for (const User *U : GEP->users()) {
+      const Instruction *UI = cast<Instruction>(U);
+      if (CurLoop->contains(UI) &&
+          (BB != UI->getParent() ||
+           (!isa<StoreInst>(UI) && !isa<LoadInst>(UI))))
+        return false;
+    }
+    return true;
+  } else
+    return TTI->getUserCost(&I) == TargetTransformInfo::TCC_Free;
+}
+
 /// Return true if the only users of this instruction are outside of
 /// the loop. If this is true, we can sink the instruction to the exit
 /// blocks of the loop.
 ///
-static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
-                            const LoopSafetyInfo *SafetyInfo) {
+/// We also return true if the instruction could be folded away in lowering.
+/// (e.g., a GEP can be folded into a load as an addressing mode in the loop).
+static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop,
+                                  const LoopSafetyInfo *SafetyInfo,
+                                  TargetTransformInfo *TTI, bool &FreeInLoop) {
   const auto &BlockColors = SafetyInfo->BlockColors;
+  bool IsFree = isFreeInLoop(I, CurLoop, TTI);
   for (const User *U : I.users()) {
     const Instruction *UI = cast<Instruction>(U);
     if (const PHINode *PN = dyn_cast<PHINode>(UI)) {
@@ -698,30 +764,15 @@ static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
       if (!BlockColors.empty() &&
           BlockColors.find(const_cast<BasicBlock *>(BB))->second.size() != 1)
         return false;
-
-      // A PHI node where all of the incoming values are this instruction are
-      // special -- they can just be RAUW'ed with the instruction and thus
-      // don't require a use in the predecessor. This is a particular important
-      // special case because it is the pattern found in LCSSA form.
-      if (isTriviallyReplacablePHI(*PN, I)) {
-        if (CurLoop->contains(PN))
-          return false;
-        else
-          continue;
-      }
-
-      // Otherwise, PHI node uses occur in predecessor blocks if the incoming
-      // values. Check for such a use being inside the loop.
-    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
-      if (PN->getIncomingValue(i) == &I)
-        if (CurLoop->contains(PN->getIncomingBlock(i)))
-          return false;
-
-      continue;
     }

-    if (CurLoop->contains(UI))
+    if (CurLoop->contains(UI)) {
+      if (IsFree) {
+        FreeInLoop = true;
+        continue;
+      }
       return false;
+    }
   }
   return true;
 }
@@ -787,77 +838,189 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
   return New;
 }

+static Instruction *sinkThroughTriviallyReplacablePHI(
+    PHINode *TPN, Instruction *I, LoopInfo *LI,
+    SmallDenseMap<BasicBlock *, Instruction *, 32> &SunkCopies,
+    const LoopSafetyInfo *SafetyInfo, const Loop *CurLoop) {
+  assert(isTriviallyReplacablePHI(*TPN, *I) &&
+         "Expect only trivially replacable PHI");
+  BasicBlock *ExitBlock = TPN->getParent();
+  Instruction *New;
+  auto It = SunkCopies.find(ExitBlock);
+  if (It != SunkCopies.end())
+    New = It->second;
+  else
+    New = SunkCopies[ExitBlock] =
+        CloneInstructionInExitBlock(*I, *ExitBlock, *TPN, LI, SafetyInfo);
+  return New;
+}
+
+static bool canSplitPredecessors(PHINode *PN) {
+  BasicBlock *BB = PN->getParent();
+  if (!BB->canSplitPredecessors())
+    return false;
+  for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+    BasicBlock *BBPred = *PI;
+    if (isa<IndirectBrInst>(BBPred->getTerminator()))
+      return false;
+  }
+  return true;
+}
+
+static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
+                                        LoopInfo *LI, const Loop *CurLoop) {
+#ifndef NDEBUG
+  SmallVector<BasicBlock *, 32> ExitBlocks;
+  CurLoop->getUniqueExitBlocks(ExitBlocks);
+  SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
+                                             ExitBlocks.end());
+#endif
+  BasicBlock *ExitBB = PN->getParent();
+  assert(ExitBlockSet.count(ExitBB) && "Expect the PHI is in an exit block.");
+
+  // Split predecessors of the loop exit so that instructions in the loop are
+  // exposed to exit blocks through trivially replacable PHIs while keeping the
+  // loop in the canonical form where each predecessor of each exit block is
+  // contained within the loop. For example, this will convert the loop below
+  // from
+  //
+  // LB1:
+  //   %v1 =
+  //   br %LE, %LB2
+  // LB2:
+  //   %v2 =
+  //   br %LE, %LB1
+  // LE:
+  //   %p = phi [%v1, %LB1], [%v2, %LB2] <-- non-trivially replacable
+  //
+  // to
+  //
+  // LB1:
+  //   %v1 =
+  //   br %LE.split, %LB2
+  // LB2:
+  //   %v2 =
+  //   br %LE.split2, %LB1
+  // LE.split:
+  //   %p1 = phi [%v1, %LB1] <-- trivially replacable
+  //   br %LE
+  // LE.split2:
+  //   %p2 = phi [%v2, %LB2] <-- trivially replacable
+  //   br %LE
+  // LE:
+  //   %p = phi [%p1, %LE.split], [%p2, %LE.split2]
+  //
+  SmallSetVector<BasicBlock *, 8> PredBBs(pred_begin(ExitBB), pred_end(ExitBB));
+  while (!PredBBs.empty()) {
+    BasicBlock *PredBB = *PredBBs.begin();
+    assert(CurLoop->contains(PredBB) &&
+           "Expect all predecessors are in the loop");
+    if (PN->getBasicBlockIndex(PredBB) >= 0)
+      SplitBlockPredecessors(ExitBB, PredBB, ".split.loop.exit", DT, LI, true);
+    PredBBs.remove(PredBB);
+  }
+}
+
 /// When an instruction is found to only be used outside of the loop, this
 /// function moves it to the exit blocks and patches up SSA form as needed.
 /// This method is guaranteed to remove the original instruction from its
 /// position, and may either delete it or move it to outside of the loop.
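/// Illustrative example (not from this patch): a value whose only user is
/// an LCSSA PHI in an exit block is cloned into that exit block and the PHI
/// is RAUW'ed with the clone; a PHI that is not trivially replacable is
/// first made so by splitting the relevant exit predecessors, as in the
/// LB1/LB2/LE diagram above.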
 ///
-static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
-                 const Loop *CurLoop, AliasSetTracker *CurAST,
-                 const LoopSafetyInfo *SafetyInfo,
-                 OptimizationRemarkEmitter *ORE) {
+static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
+                 const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo,
+                 OptimizationRemarkEmitter *ORE, bool FreeInLoop) {
   DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
-  ORE->emit(OptimizationRemark(DEBUG_TYPE, "InstSunk", &I)
-            << "sinking " << ore::NV("Inst", &I));
+  ORE->emit([&]() {
+    return OptimizationRemark(DEBUG_TYPE, "InstSunk", &I)
+           << "sinking " << ore::NV("Inst", &I);
+  });
   bool Changed = false;
   if (isa<LoadInst>(I))
     ++NumMovedLoads;
   else if (isa<CallInst>(I))
     ++NumMovedCalls;
   ++NumSunk;
-  Changed = true;

-#ifndef NDEBUG
-  SmallVector<BasicBlock *, 32> ExitBlocks;
-  CurLoop->getUniqueExitBlocks(ExitBlocks);
-  SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
-                                             ExitBlocks.end());
-#endif
+  // Iterate over users to be ready for actual sinking. Replace users in
+  // unreachable blocks with undef and make all user PHIs trivially replacable.
+  SmallPtrSet<Instruction *, 8> VisitedUsers;
+  for (Value::user_iterator UI = I.user_begin(), UE = I.user_end(); UI != UE;) {
+    auto *User = cast<Instruction>(*UI);
+    Use &U = UI.getUse();
+    ++UI;

-  // Clones of this instruction. Don't create more than one per exit block!
-  SmallDenseMap<BasicBlock *, Instruction *, 32> SunkCopies;
+    if (VisitedUsers.count(User) || CurLoop->contains(User))
+      continue;

-  // If this instruction is only used outside of the loop, then all users are
-  // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of
-  // the instruction.
-  while (!I.use_empty()) {
-    Value::user_iterator UI = I.user_begin();
-    auto *User = cast<Instruction>(*UI);
     if (!DT->isReachableFromEntry(User->getParent())) {
-      User->replaceUsesOfWith(&I, UndefValue::get(I.getType()));
+      U = UndefValue::get(I.getType());
+      Changed = true;
       continue;
     }
+
     // The user must be a PHI node.
     PHINode *PN = cast<PHINode>(User);

     // Surprisingly, instructions can be used outside of loops without any
     // exits. This can only happen in PHI nodes if the incoming block is
     // unreachable.
-    Use &U = UI.getUse();
     BasicBlock *BB = PN->getIncomingBlock(U);
     if (!DT->isReachableFromEntry(BB)) {
       U = UndefValue::get(I.getType());
+      Changed = true;
       continue;
     }

-    BasicBlock *ExitBlock = PN->getParent();
-    assert(ExitBlockSet.count(ExitBlock) &&
-           "The LCSSA PHI is not in an exit block!");
+    VisitedUsers.insert(PN);
+    if (isTriviallyReplacablePHI(*PN, I))
+      continue;

-    Instruction *New;
-    auto It = SunkCopies.find(ExitBlock);
-    if (It != SunkCopies.end())
-      New = It->second;
-    else
-      New = SunkCopies[ExitBlock] =
-          CloneInstructionInExitBlock(I, *ExitBlock, *PN, LI, SafetyInfo);
+    if (!canSplitPredecessors(PN))
+      return Changed;
+
+    // Split predecessors of the PHI so that we can make users trivially
+    // replacable.
+    splitPredecessorsOfLoopExit(PN, DT, LI, CurLoop);
+
+    // Should rebuild the iterators, as they may be invalidated by
+    // splitPredecessorsOfLoopExit().
+    UI = I.user_begin();
+    UE = I.user_end();
+  }
+
+  if (VisitedUsers.empty())
+    return Changed;
+
+#ifndef NDEBUG
+  SmallVector<BasicBlock *, 32> ExitBlocks;
+  CurLoop->getUniqueExitBlocks(ExitBlocks);
+  SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
+                                             ExitBlocks.end());
+#endif
+
+  // Clones of this instruction. Don't create more than one per exit block!
+ SmallDenseMap<BasicBlock *, Instruction *, 32> SunkCopies; + + // If this instruction is only used outside of the loop, then all users are + // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of + // the instruction. + SmallSetVector<User*, 8> Users(I.user_begin(), I.user_end()); + for (auto *UI : Users) { + auto *User = cast<Instruction>(UI); + + if (CurLoop->contains(User)) + continue; + PHINode *PN = cast<PHINode>(User); + assert(ExitBlockSet.count(PN->getParent()) && + "The LCSSA PHI is not in an exit block!"); + // The PHI must be trivially replacable. + Instruction *New = sinkThroughTriviallyReplacablePHI(PN, &I, LI, SunkCopies, + SafetyInfo, CurLoop); PN->replaceAllUsesWith(New); PN->eraseFromParent(); + Changed = true; } - - CurAST->deleteValue(&I); - I.eraseFromParent(); return Changed; } @@ -870,8 +1033,10 @@ static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, auto *Preheader = CurLoop->getLoopPreheader(); DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": " << I << "\n"); - ORE->emit(OptimizationRemark(DEBUG_TYPE, "Hoisted", &I) - << "hoisting " << ore::NV("Inst", &I)); + ORE->emit([&]() { + return OptimizationRemark(DEBUG_TYPE, "Hoisted", &I) << "hoisting " + << ore::NV("Inst", &I); + }); // Metadata can be dependent on conditions we are hoisting above. // Conservatively strip all metadata on the instruction unless we were @@ -921,10 +1086,12 @@ static bool isSafeToExecuteUnconditionally(Instruction &Inst, if (!GuaranteedToExecute) { auto *LI = dyn_cast<LoadInst>(&Inst); if (LI && CurLoop->isLoopInvariant(LI->getPointerOperand())) - ORE->emit(OptimizationRemarkMissed( - DEBUG_TYPE, "LoadWithLoopInvariantAddressCondExecuted", LI) - << "failed to hoist load with loop-invariant address " - "because load is conditionally executed"); + ORE->emit([&]() { + return OptimizationRemarkMissed( + DEBUG_TYPE, "LoadWithLoopInvariantAddressCondExecuted", LI) + << "failed to hoist load with loop-invariant address " + "because load is conditionally executed"; + }); } return GuaranteedToExecute; @@ -933,7 +1100,7 @@ static bool isSafeToExecuteUnconditionally(Instruction &Inst, namespace { class LoopPromoter : public LoadAndStorePromoter { Value *SomePtr; // Designated pointer to store to. 
- SmallPtrSetImpl<Value *> &PointerMustAliases; + const SmallSetVector<Value *, 8> &PointerMustAliases; SmallVectorImpl<BasicBlock *> &LoopExitBlocks; SmallVectorImpl<Instruction *> &LoopInsertPts; PredIteratorCache &PredCache; @@ -961,7 +1128,7 @@ class LoopPromoter : public LoadAndStorePromoter { public: LoopPromoter(Value *SP, ArrayRef<const Instruction *> Insts, SSAUpdater &S, - SmallPtrSetImpl<Value *> &PMA, + const SmallSetVector<Value *, 8> &PMA, SmallVectorImpl<BasicBlock *> &LEB, SmallVectorImpl<Instruction *> &LIP, PredIteratorCache &PIC, AliasSetTracker &ast, LoopInfo &li, DebugLoc dl, int alignment, @@ -969,7 +1136,7 @@ public: : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA), LoopExitBlocks(LEB), LoopInsertPts(LIP), PredCache(PIC), AST(ast), LI(li), DL(std::move(dl)), Alignment(alignment), - UnorderedAtomic(UnorderedAtomic),AATags(AATags) {} + UnorderedAtomic(UnorderedAtomic), AATags(AATags) {} bool isInstInList(Instruction *I, const SmallVectorImpl<Instruction *> &) const override { @@ -1008,7 +1175,31 @@ public: } void instructionDeleted(Instruction *I) const override { AST.deleteValue(I); } }; -} // end anon namespace + + +/// Return true iff we can prove that a caller of this function can not inspect +/// the contents of the provided object in a well defined program. +bool isKnownNonEscaping(Value *Object, const TargetLibraryInfo *TLI) { + if (isa<AllocaInst>(Object)) + // Since the alloca goes out of scope, we know the caller can't retain a + // reference to it and be well defined. Thus, we don't need to check for + // capture. + return true; + + // For all other objects we need to know that the caller can't possibly + // have gotten a reference to the object. There are two components of + // that: + // 1) Object can't be escaped by this function. This is what + // PointerMayBeCaptured checks. + // 2) Object can't have been captured at definition site. For this, we + // need to know the return value is noalias. At the moment, we use a + // weaker condition and handle only AllocLikeFunctions (which are + // known to be noalias). TODO + return isAllocLikeFn(Object, TLI) && + !PointerMayBeCaptured(Object, true, true); +} + +} // namespace /// Try to promote memory values to scalars by sinking stores out of the /// loop and moving loads to before the loop. We do this by looping over @@ -1016,7 +1207,8 @@ public: /// loop invariant. /// bool llvm::promoteLoopAccessesToScalars( - AliasSet &AS, SmallVectorImpl<BasicBlock *> &ExitBlocks, + const SmallSetVector<Value *, 8> &PointerMustAliases, + SmallVectorImpl<BasicBlock *> &ExitBlocks, SmallVectorImpl<Instruction *> &InsertPts, PredIteratorCache &PIC, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, Loop *CurLoop, AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo, @@ -1026,17 +1218,7 @@ bool llvm::promoteLoopAccessesToScalars( CurAST != nullptr && SafetyInfo != nullptr && "Unexpected Input to promoteLoopAccessesToScalars"); - // We can promote this alias set if it has a store, if it is a "Must" alias - // set, if the pointer is loop invariant, and if we are not eliminating any - // volatile loads or stores. 
- if (AS.isForwardingAliasSet() || !AS.isMod() || !AS.isMustAlias() || - AS.isVolatile() || !CurLoop->isLoopInvariant(AS.begin()->getValue())) - return false; - - assert(!AS.empty() && - "Must alias set should have at least one pointer element in it!"); - - Value *SomePtr = AS.begin()->getValue(); + Value *SomePtr = *PointerMustAliases.begin(); BasicBlock *Preheader = CurLoop->getLoopPreheader(); // It isn't safe to promote a load/store from the loop if the load/store is @@ -1065,8 +1247,8 @@ bool llvm::promoteLoopAccessesToScalars( // is safe (i.e. proving dereferenceability on all paths through the loop). We // can use any access within the alias set to prove dereferenceability, // since they're all must alias. - // - // There are two ways establish (p2): + // + // There are two ways establish (p2): // a) Prove the location is thread-local. In this case the memory model // requirement does not apply, and stores are safe to insert. // b) Prove a store dominates every exit block. In this case, if an exit @@ -1080,55 +1262,36 @@ bool llvm::promoteLoopAccessesToScalars( bool SafeToInsertStore = false; SmallVector<Instruction *, 64> LoopUses; - SmallPtrSet<Value *, 4> PointerMustAliases; // We start with an alignment of one and try to find instructions that allow // us to prove better alignment. unsigned Alignment = 1; // Keep track of which types of access we see - bool SawUnorderedAtomic = false; + bool SawUnorderedAtomic = false; bool SawNotAtomic = false; AAMDNodes AATags; const DataLayout &MDL = Preheader->getModule()->getDataLayout(); - // Do we know this object does not escape ? - bool IsKnownNonEscapingObject = false; + bool IsKnownThreadLocalObject = false; if (SafetyInfo->MayThrow) { // If a loop can throw, we have to insert a store along each unwind edge. // That said, we can't actually make the unwind edge explicit. Therefore, - // we have to prove that the store is dead along the unwind edge. - // - // If the underlying object is not an alloca, nor a pointer that does not - // escape, then we can not effectively prove that the store is dead along - // the unwind edge. i.e. the caller of this function could have ways to - // access the pointed object. + // we have to prove that the store is dead along the unwind edge. We do + // this by proving that the caller can't have a reference to the object + // after return and thus can't possibly load from the object. Value *Object = GetUnderlyingObject(SomePtr, MDL); - // If this is a base pointer we do not understand, simply bail. - // We only handle alloca and return value from alloc-like fn right now. - if (!isa<AllocaInst>(Object)) { - if (!isAllocLikeFn(Object, TLI)) - return false; - // If this is an alloc like fn. There are more constraints we need to verify. - // More specifically, we must make sure that the pointer can not escape. - // - // NOTE: PointerMayBeCaptured is not enough as the pointer may have escaped - // even though its not captured by the enclosing function. Standard allocation - // functions like malloc, calloc, and operator new return values which can - // be assumed not to have previously escaped. - if (PointerMayBeCaptured(Object, true, true)) - return false; - IsKnownNonEscapingObject = true; - } + if (!isKnownNonEscaping(Object, TLI)) + return false; + // Subtlety: Alloca's aren't visible to callers, but *are* potentially + // visible to other threads if captured and used during their lifetimes. 
+ IsKnownThreadLocalObject = !isa<AllocaInst>(Object); } // Check that all of the pointers in the alias set have the same type. We // cannot (yet) promote a memory location that is loaded and stored in // different sizes. While we are at it, collect alignment and AA info. - for (const auto &ASI : AS) { - Value *ASIV = ASI.getValue(); - PointerMustAliases.insert(ASIV); - + for (Value *ASIV : PointerMustAliases) { // Check that all of the pointers in the alias set have the same type. We // cannot (yet) promote a memory location that is loaded and stored in // different sizes. @@ -1147,7 +1310,7 @@ bool llvm::promoteLoopAccessesToScalars( assert(!Load->isVolatile() && "AST broken"); if (!Load->isUnordered()) return false; - + SawUnorderedAtomic |= Load->isAtomic(); SawNotAtomic |= !Load->isAtomic(); @@ -1234,14 +1397,13 @@ bool llvm::promoteLoopAccessesToScalars( // stores along paths which originally didn't have them without violating the // memory model. if (!SafeToInsertStore) { - // If this is a known non-escaping object, it is safe to insert the stores. - if (IsKnownNonEscapingObject) + if (IsKnownThreadLocalObject) SafeToInsertStore = true; else { Value *Object = GetUnderlyingObject(SomePtr, MDL); SafeToInsertStore = - (isAllocLikeFn(Object, TLI) || isa<AllocaInst>(Object)) && - !PointerMayBeCaptured(Object, true, true); + (isAllocLikeFn(Object, TLI) || isa<AllocaInst>(Object)) && + !PointerMayBeCaptured(Object, true, true); } } @@ -1252,9 +1414,11 @@ bool llvm::promoteLoopAccessesToScalars( // Otherwise, this is safe to promote, lets do it! DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " << *SomePtr << '\n'); - ORE->emit( - OptimizationRemark(DEBUG_TYPE, "PromoteLoopAccessesToScalar", LoopUses[0]) - << "Moving accesses to memory location out of the loop"); + ORE->emit([&]() { + return OptimizationRemark(DEBUG_TYPE, "PromoteLoopAccessesToScalar", + LoopUses[0]) + << "Moving accesses to memory location out of the loop"; + }); ++NumPromoted; // Grab a debug location for the inserted loads/stores; given that the @@ -1333,7 +1497,7 @@ LoopInvariantCodeMotion::collectAliasInfoForLoop(Loop *L, LoopInfo *LI, auto mergeLoop = [&](Loop *L) { // Loop over the body of this loop, looking for calls, invokes, and stores. for (BasicBlock *BB : L->blocks()) - CurAST->add(*BB); // Incorporate the specified basic block + CurAST->add(*BB); // Incorporate the specified basic block }; // Add everything from the sub loops that are no longer directly available. 
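Before the next file, a word on what the promotion machinery above buys: a must-aliased, loop-invariant, non-volatile location is turned into a register for the duration of the loop, with one load in the preheader and one store per exit. A minimal self-contained C++ sketch of that source-level effect (the example and all names in it are invented for illustration, not code from this commit):

#include <cstdio>

// Before promotion: *p is loaded and stored on every iteration.
void accumulate_before(int *p, const int *a, int n) {
  for (int i = 0; i < n; ++i)
    *p += a[i]; // load, add, store each time around the loop
}

// After promotion: one load in the "preheader", one store in the "exit".
void accumulate_after(int *p, const int *a, int n) {
  int t = *p; // load hoisted out of the loop
  for (int i = 0; i < n; ++i)
    t += a[i]; // the loop body now works on a register
  *p = t; // store sunk to the loop exit
}

int main() {
  int a[] = {1, 2, 3, 4};
  int x = 0, y = 0;
  accumulate_before(&x, a, 4);
  accumulate_after(&y, a, 4);
  std::printf("%d %d\n", x, y); // both print 10
  return 0;
}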
diff --git a/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/lib/Transforms/Scalar/LoopDataPrefetch.cpp index d09af32a99fd..7f7c6de76450 100644 --- a/lib/Transforms/Scalar/LoopDataPrefetch.cpp +++ b/lib/Transforms/Scalar/LoopDataPrefetch.cpp @@ -18,25 +18,20 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" -#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/OptimizationDiagnosticInfo.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ValueMapper.h" using namespace llvm; @@ -120,9 +115,7 @@ public: AU.addPreserved<LoopInfoWrapperPass>(); AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); AU.addRequired<ScalarEvolutionWrapperPass>(); - // FIXME: For some reason, preserving SE here breaks LSR (even if - // this pass changes nothing). - // AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); } @@ -329,8 +322,10 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { ++NumPrefetches; DEBUG(dbgs() << " Access: " << *PtrValue << ", SCEV: " << *LSCEV << "\n"); - ORE->emit(OptimizationRemark(DEBUG_TYPE, "Prefetched", MemI) - << "prefetched memory access"); + ORE->emit([&]() { + return OptimizationRemark(DEBUG_TYPE, "Prefetched", MemI) + << "prefetched memory access"; + }); MadeChange = true; } diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp index ac4dd44a0e90..82604a8842bf 100644 --- a/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/lib/Transforms/Scalar/LoopDeletion.cpp @@ -30,20 +30,12 @@ using namespace llvm; STATISTIC(NumDeleted, "Number of loops deleted"); -/// This function deletes dead loops. The caller of this function needs to -/// guarantee that the loop is infact dead. Here we handle two kinds of dead -/// loop. The first kind (\p isLoopDead) is where only invariant values from -/// within the loop are used outside of it. The second kind (\p -/// isLoopNeverExecuted) is where the loop is provably never executed. We can -/// always remove never executed loops since they will not cause any difference -/// to program behaviour. -/// -/// This also updates the relevant analysis information in \p DT, \p SE, and \p -/// LI. It also updates the loop PM if an updater struct is provided. -// TODO: This function will be used by loop-simplifyCFG as well. So, move this -// to LoopUtils.cpp -static void deleteDeadLoop(Loop *L, DominatorTree &DT, ScalarEvolution &SE, - LoopInfo &LI, LPMUpdater *Updater = nullptr); +enum class LoopDeletionResult { + Unmodified, + Modified, + Deleted, +}; + /// Determines if a loop is dead. 
/// /// This assumes that we've already checked for unique exit and exiting blocks, @@ -144,8 +136,8 @@ static bool isLoopNeverExecuted(Loop *L) { /// \returns true if any changes were made. This may mutate the loop even if it /// is unable to delete it due to hoisting trivially loop invariant /// instructions out of the loop. -static bool deleteLoopIfDead(Loop *L, DominatorTree &DT, ScalarEvolution &SE, - LoopInfo &LI, LPMUpdater *Updater = nullptr) { +static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT, + ScalarEvolution &SE, LoopInfo &LI) { assert(L->isLCSSAForm(DT) && "Expected LCSSA!"); // We can only remove the loop if there is a preheader that we can branch from @@ -155,13 +147,13 @@ static bool deleteLoopIfDead(Loop *L, DominatorTree &DT, ScalarEvolution &SE, if (!Preheader || !L->hasDedicatedExits()) { DEBUG(dbgs() << "Deletion requires Loop with preheader and dedicated exits.\n"); - return false; + return LoopDeletionResult::Unmodified; } // We can't remove loops that contain subloops. If the subloops were dead, // they would already have been removed in earlier executions of this pass. if (L->begin() != L->end()) { DEBUG(dbgs() << "Loop contains subloops.\n"); - return false; + return LoopDeletionResult::Unmodified; } @@ -176,9 +168,9 @@ static bool deleteLoopIfDead(Loop *L, DominatorTree &DT, ScalarEvolution &SE, P->setIncomingValue(i, UndefValue::get(P->getType())); BI++; } - deleteDeadLoop(L, DT, SE, LI, Updater); + deleteDeadLoop(L, &DT, &SE, &LI); ++NumDeleted; - return true; + return LoopDeletionResult::Deleted; } // The remaining checks below are for a loop being dead because all statements @@ -192,13 +184,14 @@ static bool deleteLoopIfDead(Loop *L, DominatorTree &DT, ScalarEvolution &SE, // a loop invariant manner. if (!ExitBlock) { DEBUG(dbgs() << "Deletion requires single exit block\n"); - return false; + return LoopDeletionResult::Unmodified; } // Finally, we have to check that the loop really is dead. bool Changed = false; if (!isLoopDead(L, SE, ExitingBlocks, ExitBlock, Changed, Preheader)) { DEBUG(dbgs() << "Loop is not invariant, cannot delete.\n"); - return Changed; + return Changed ? LoopDeletionResult::Modified + : LoopDeletionResult::Unmodified; } // Don't remove loops for which we can't solve the trip count. @@ -206,114 +199,15 @@ static bool deleteLoopIfDead(Loop *L, DominatorTree &DT, ScalarEvolution &SE, const SCEV *S = SE.getMaxBackedgeTakenCount(L); if (isa<SCEVCouldNotCompute>(S)) { DEBUG(dbgs() << "Could not compute SCEV MaxBackedgeTakenCount.\n"); - return Changed; + return Changed ? LoopDeletionResult::Modified + : LoopDeletionResult::Unmodified; } DEBUG(dbgs() << "Loop is invariant, delete it!"); - deleteDeadLoop(L, DT, SE, LI, Updater); + deleteDeadLoop(L, &DT, &SE, &LI); ++NumDeleted; - return true; -} - -static void deleteDeadLoop(Loop *L, DominatorTree &DT, ScalarEvolution &SE, - LoopInfo &LI, LPMUpdater *Updater) { - assert(L->isLCSSAForm(DT) && "Expected LCSSA!"); - auto *Preheader = L->getLoopPreheader(); - assert(Preheader && "Preheader should exist!"); - - // Now that we know the removal is safe, remove the loop by changing the - // branch from the preheader to go to the single exit block. - // - // Because we're deleting a large chunk of code at once, the sequence in which - // we remove things is very important to avoid invalidation issues. - - // If we have an LPM updater, tell it about the loop being removed. - if (Updater) - Updater->markLoopAsDeleted(*L); - - // Tell ScalarEvolution that the loop is deleted. 
Do this before - // deleting the loop so that ScalarEvolution can look at the loop - // to determine what it needs to clean up. - SE.forgetLoop(L); - - auto *ExitBlock = L->getUniqueExitBlock(); - assert(ExitBlock && "Should have a unique exit block!"); - - assert(L->hasDedicatedExits() && "Loop should have dedicated exits!"); - - // Connect the preheader directly to the exit block. - // Even when the loop is never executed, we cannot remove the edge from the - // source block to the exit block. Consider the case where the unexecuted loop - // branches back to an outer loop. If we deleted the loop and removed the edge - // coming to this inner loop, this will break the outer loop structure (by - // deleting the backedge of the outer loop). If the outer loop is indeed a - // non-loop, it will be deleted in a future iteration of loop deletion pass. - Preheader->getTerminator()->replaceUsesOfWith(L->getHeader(), ExitBlock); - - // Rewrite phis in the exit block to get their inputs from the Preheader - // instead of the exiting block. - BasicBlock::iterator BI = ExitBlock->begin(); - while (PHINode *P = dyn_cast<PHINode>(BI)) { - // Set the zero'th element of Phi to be from the preheader and remove all - // other incoming values. Given the loop has dedicated exits, all other - // incoming values must be from the exiting blocks. - int PredIndex = 0; - P->setIncomingBlock(PredIndex, Preheader); - // Removes all incoming values from all other exiting blocks (including - // duplicate values from an exiting block). - // Nuke all entries except the zero'th entry which is the preheader entry. - // NOTE! We need to remove Incoming Values in the reverse order as done - // below, to keep the indices valid for deletion (removeIncomingValues - // updates getNumIncomingValues and shifts all values down into the operand - // being deleted). - for (unsigned i = 0, e = P->getNumIncomingValues() - 1; i != e; ++i) - P->removeIncomingValue(e-i, false); - - assert((P->getNumIncomingValues() == 1 && - P->getIncomingBlock(PredIndex) == Preheader) && - "Should have exactly one value and that's from the preheader!"); - ++BI; - } - - // Update the dominator tree and remove the instructions and blocks that will - // be deleted from the reference counting scheme. - SmallVector<DomTreeNode*, 8> ChildNodes; - for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end(); - LI != LE; ++LI) { - // Move all of the block's children to be children of the Preheader, which - // allows us to remove the domtree entry for the block. - ChildNodes.insert(ChildNodes.begin(), DT[*LI]->begin(), DT[*LI]->end()); - for (DomTreeNode *ChildNode : ChildNodes) { - DT.changeImmediateDominator(ChildNode, DT[Preheader]); - } - - ChildNodes.clear(); - DT.eraseNode(*LI); - - // Remove the block from the reference counting scheme, so that we can - // delete it freely later. - (*LI)->dropAllReferences(); - } - - // Erase the instructions and the blocks without having to worry - // about ordering because we already dropped the references. - // NOTE: This iteration is safe because erasing the block does not remove its - // entry from the loop's block list. We do that in the next section. - for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end(); - LI != LE; ++LI) - (*LI)->eraseFromParent(); - - // Finally, the blocks from loopinfo. This has to happen late because - // otherwise our loop iterators won't work. 
- - SmallPtrSet<BasicBlock *, 8> blocks; - blocks.insert(L->block_begin(), L->block_end()); - for (BasicBlock *BB : blocks) - LI.removeBlock(BB); - - // The last step is to update LoopInfo now that we've eliminated this loop. - LI.markAsRemoved(L); + return LoopDeletionResult::Deleted; } PreservedAnalyses LoopDeletionPass::run(Loop &L, LoopAnalysisManager &AM, @@ -322,9 +216,14 @@ PreservedAnalyses LoopDeletionPass::run(Loop &L, LoopAnalysisManager &AM, DEBUG(dbgs() << "Analyzing Loop for deletion: "); DEBUG(L.dump()); - if (!deleteLoopIfDead(&L, AR.DT, AR.SE, AR.LI, &Updater)) + std::string LoopName = L.getName(); + auto Result = deleteLoopIfDead(&L, AR.DT, AR.SE, AR.LI); + if (Result == LoopDeletionResult::Unmodified) return PreservedAnalyses::all(); + if (Result == LoopDeletionResult::Deleted) + Updater.markLoopAsDeleted(L, LoopName); + return getLoopPassPreservedAnalyses(); } @@ -354,7 +253,7 @@ INITIALIZE_PASS_END(LoopDeletionLegacyPass, "loop-deletion", Pass *llvm::createLoopDeletionPass() { return new LoopDeletionLegacyPass(); } -bool LoopDeletionLegacyPass::runOnLoop(Loop *L, LPPassManager &) { +bool LoopDeletionLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) { if (skipLoop(L)) return false; DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); @@ -363,5 +262,11 @@ bool LoopDeletionLegacyPass::runOnLoop(Loop *L, LPPassManager &) { DEBUG(dbgs() << "Analyzing Loop for deletion: "); DEBUG(L->dump()); - return deleteLoopIfDead(L, DT, SE, LI); + + LoopDeletionResult Result = deleteLoopIfDead(L, DT, SE, LI); + + if (Result == LoopDeletionResult::Deleted) + LPM.markLoopAsDeleted(*L); + + return Result != LoopDeletionResult::Unmodified; } diff --git a/lib/Transforms/Scalar/LoopDistribute.cpp b/lib/Transforms/Scalar/LoopDistribute.cpp index 3624bba10345..0d7e3db901cb 100644 --- a/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/lib/Transforms/Scalar/LoopDistribute.cpp @@ -23,32 +23,61 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/LoopDistribute.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/EquivalenceClasses.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/OptimizationDiagnosticInfo.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/Value.h" #include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include 
"llvm/Transforms/Scalar/LoopPassManager.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/LoopVersioning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include <cassert> +#include <functional> #include <list> +#include <tuple> +#include <utility> + +using namespace llvm; #define LDIST_NAME "loop-distribute" #define DEBUG_TYPE LDIST_NAME -using namespace llvm; - static cl::opt<bool> LDistVerify("loop-distribute-verify", cl::Hidden, cl::desc("Turn on DominatorTree and LoopInfo verification " @@ -81,14 +110,15 @@ static cl::opt<bool> EnableLoopDistribute( STATISTIC(NumLoopsDistributed, "Number of loops distributed"); namespace { + /// \brief Maintains the set of instructions of the loop for a partition before /// cloning. After cloning, it hosts the new loop. class InstPartition { - typedef SmallPtrSet<Instruction *, 8> InstructionSet; + using InstructionSet = SmallPtrSet<Instruction *, 8>; public: InstPartition(Instruction *I, Loop *L, bool DepCycle = false) - : DepCycle(DepCycle), OrigLoop(L), ClonedLoop(nullptr) { + : DepCycle(DepCycle), OrigLoop(L) { Set.insert(I); } @@ -220,7 +250,7 @@ private: /// \brief The cloned loop. If this partition is mapped to the original loop, /// this is null. - Loop *ClonedLoop; + Loop *ClonedLoop = nullptr; /// \brief The blocks of ClonedLoop including the preheader. If this /// partition is mapped to the original loop, this is empty. @@ -235,7 +265,7 @@ private: /// \brief Holds the set of Partitions. It populates them, merges them and then /// clones the loops. class InstPartitionContainer { - typedef DenseMap<Instruction *, int> InstToPartitionIdT; + using InstToPartitionIdT = DenseMap<Instruction *, int>; public: InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT) @@ -308,8 +338,8 @@ public: /// /// Return if any partitions were merged. bool mergeToAvoidDuplicatedLoads() { - typedef DenseMap<Instruction *, InstPartition *> LoadToPartitionT; - typedef EquivalenceClasses<InstPartition *> ToBeMergedT; + using LoadToPartitionT = DenseMap<Instruction *, InstPartition *>; + using ToBeMergedT = EquivalenceClasses<InstPartition *>; LoadToPartitionT LoadToPartition; ToBeMergedT ToBeMerged; @@ -511,7 +541,7 @@ public: } private: - typedef std::list<InstPartition> PartitionContainerT; + using PartitionContainerT = std::list<InstPartition>; /// \brief List of partitions. PartitionContainerT PartitionContainer; @@ -552,17 +582,17 @@ private: /// By traversing the memory instructions in program order and accumulating this /// number, we know whether any unsafe dependence crosses over a program point. 
class MemoryInstructionDependences { - typedef MemoryDepChecker::Dependence Dependence; + using Dependence = MemoryDepChecker::Dependence; public: struct Entry { Instruction *Inst; - unsigned NumUnsafeDependencesStartOrEnd; + unsigned NumUnsafeDependencesStartOrEnd = 0; - Entry(Instruction *Inst) : Inst(Inst), NumUnsafeDependencesStartOrEnd(0) {} + Entry(Instruction *Inst) : Inst(Inst) {} }; - typedef SmallVector<Entry, 8> AccessesType; + using AccessesType = SmallVector<Entry, 8>; AccessesType::const_iterator begin() const { return Accesses.begin(); } AccessesType::const_iterator end() const { return Accesses.end(); } @@ -594,7 +624,7 @@ class LoopDistributeForLoop { public: LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT, ScalarEvolution *SE, OptimizationRemarkEmitter *ORE) - : L(L), F(F), LI(LI), LAI(nullptr), DT(DT), SE(SE), ORE(ORE) { + : L(L), F(F), LI(LI), DT(DT), SE(SE), ORE(ORE) { setForced(); } @@ -755,9 +785,11 @@ public: ++NumLoopsDistributed; // Report the success. - ORE->emit(OptimizationRemark(LDIST_NAME, "Distribute", L->getStartLoc(), - L->getHeader()) - << "distributed loop"); + ORE->emit([&]() { + return OptimizationRemark(LDIST_NAME, "Distribute", L->getStartLoc(), + L->getHeader()) + << "distributed loop"; + }); return true; } @@ -769,11 +801,13 @@ public: DEBUG(dbgs() << "Skipping; " << Message << "\n"); // With Rpass-missed report that distribution failed. - ORE->emit( - OptimizationRemarkMissed(LDIST_NAME, "NotDistributed", L->getStartLoc(), - L->getHeader()) - << "loop not distributed: use -Rpass-analysis=loop-distribute for more " - "info"); + ORE->emit([&]() { + return OptimizationRemarkMissed(LDIST_NAME, "NotDistributed", + L->getStartLoc(), L->getHeader()) + << "loop not distributed: use -Rpass-analysis=loop-distribute for " + "more " + "info"; + }); // With Rpass-analysis report why. This is on by default if distribution // was requested explicitly. @@ -857,7 +891,7 @@ private: // Analyses used. LoopInfo *LI; - const LoopAccessInfo *LAI; + const LoopAccessInfo *LAI = nullptr; DominatorTree *DT; ScalarEvolution *SE; OptimizationRemarkEmitter *ORE; @@ -871,6 +905,8 @@ private: Optional<bool> IsForced; }; +} // end anonymous namespace + /// Shared implementation between new and old PMs. static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT, ScalarEvolution *SE, OptimizationRemarkEmitter *ORE, @@ -901,9 +937,13 @@ static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT, return Changed; } +namespace { + /// \brief The pass class. class LoopDistributeLegacy : public FunctionPass { public: + static char ID; + LoopDistributeLegacy() : FunctionPass(ID) { // The default is set by the caller. 
initializeLoopDistributeLegacyPass(*PassRegistry::getPassRegistry()); @@ -934,10 +974,9 @@ public: AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); } - - static char ID; }; -} // anonymous namespace + +} // end anonymous namespace PreservedAnalyses LoopDistributePass::run(Function &F, FunctionAnalysisManager &AM) { @@ -956,7 +995,7 @@ PreservedAnalyses LoopDistributePass::run(Function &F, auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); std::function<const LoopAccessInfo &(Loop &)> GetLAA = [&](Loop &L) -> const LoopAccessInfo & { - LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI}; + LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, nullptr}; return LAM.getResult<LoopAccessAnalysis>(L, AR); }; @@ -971,6 +1010,7 @@ PreservedAnalyses LoopDistributePass::run(Function &F, } char LoopDistributeLegacy::ID; + static const char ldist_name[] = "Loop Distribution"; INITIALIZE_PASS_BEGIN(LoopDistributeLegacy, LDIST_NAME, ldist_name, false, @@ -982,6 +1022,4 @@ INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) INITIALIZE_PASS_END(LoopDistributeLegacy, LDIST_NAME, ldist_name, false, false) -namespace llvm { -FunctionPass *createLoopDistributePass() { return new LoopDistributeLegacy(); } -} +FunctionPass *llvm::createLoopDistributePass() { return new LoopDistributeLegacy(); } diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 4a6a35c0ab1b..21551f0a0825 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -1,4 +1,4 @@ -//===-- LoopIdiomRecognize.cpp - Loop idiom recognition -------------------===// +//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===// // // The LLVM Compiler Infrastructure // @@ -38,32 +38,64 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/LoopIdiomRecognize.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/BasicAliasAnalysis.h" -#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" +#include "llvm/Analysis/MemoryLocation.h" +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include 
"llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <utility> +#include <vector> + using namespace llvm; #define DEBUG_TYPE "loop-idiom" @@ -80,7 +112,7 @@ static cl::opt<bool> UseLIRCodeSizeHeurs( namespace { class LoopIdiomRecognize { - Loop *CurLoop; + Loop *CurLoop = nullptr; AliasAnalysis *AA; DominatorTree *DT; LoopInfo *LI; @@ -96,20 +128,21 @@ public: TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, const DataLayout *DL) - : CurLoop(nullptr), AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), - DL(DL) {} + : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL) {} bool runOnLoop(Loop *L); private: - typedef SmallVector<StoreInst *, 8> StoreList; - typedef MapVector<Value *, StoreList> StoreListMap; + using StoreList = SmallVector<StoreInst *, 8>; + using StoreListMap = MapVector<Value *, StoreList>; + StoreListMap StoreRefsForMemset; StoreListMap StoreRefsForMemsetPattern; StoreList StoreRefsForMemcpy; bool HasMemset; bool HasMemsetPattern; bool HasMemcpy; + /// Return code for isLegalStore() enum LegalStoreKind { None = 0, @@ -164,6 +197,7 @@ private: class LoopIdiomRecognizeLegacyPass : public LoopPass { public: static char ID; + explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) { initializeLoopIdiomRecognizeLegacyPassPass( *PassRegistry::getPassRegistry()); @@ -190,14 +224,16 @@ public: /// This transformation requires natural loop information & requires that /// loop preheaders be inserted into the CFG. - /// void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); getLoopAnalysisUsage(AU); } }; -} // End anonymous namespace. 
+ +} // end anonymous namespace + +char LoopIdiomRecognizeLegacyPass::ID = 0; PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, @@ -211,7 +247,6 @@ PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM, return getLoopPassPreservedAnalyses(); } -char LoopIdiomRecognizeLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom", "Recognize loop idioms", false, false) INITIALIZE_PASS_DEPENDENCY(LoopPass) @@ -299,13 +334,6 @@ bool LoopIdiomRecognize::runOnCountableLoop() { return MadeChange; } -static unsigned getStoreSizeInBytes(StoreInst *SI, const DataLayout *DL) { - uint64_t SizeInBits = DL->getTypeSizeInBits(SI->getValueOperand()->getType()); - assert(((SizeInBits & 7) || (SizeInBits >> 32) == 0) && - "Don't overflow unsigned."); - return (unsigned)SizeInBits >> 3; -} - static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) { const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1)); return ConstStride->getAPInt(); @@ -354,7 +382,6 @@ static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) { LoopIdiomRecognize::LegalStoreKind LoopIdiomRecognize::isLegalStore(StoreInst *SI) { - // Don't touch volatile stores. if (SI->isVolatile()) return LegalStoreKind::None; @@ -424,7 +451,7 @@ LoopIdiomRecognize::isLegalStore(StoreInst *SI) { // Check to see if the stride matches the size of the store. If so, then we // know that every byte is touched in the loop. APInt Stride = getStoreStride(StoreEv); - unsigned StoreSize = getStoreSizeInBytes(SI, DL); + unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType()); if (StoreSize != Stride && StoreSize != -Stride) return LegalStoreKind::None; @@ -563,7 +590,7 @@ bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEVAddRecExpr *FirstStoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr)); APInt FirstStride = getStoreStride(FirstStoreEv); - unsigned FirstStoreSize = getStoreSizeInBytes(SL[i], DL); + unsigned FirstStoreSize = DL->getTypeStoreSize(SL[i]->getValueOperand()->getType()); // See if we can optimize just this store in isolation. if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) { @@ -656,7 +683,7 @@ bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL, break; AdjacentStores.insert(I); - StoreSize += getStoreSizeInBytes(I, DL); + StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType()); // Move to the next value in the chain. I = ConsecutiveChain[I]; } @@ -761,7 +788,8 @@ mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, ++BI) for (Instruction &I : **BI) if (IgnoredStores.count(&I) == 0 && - (AA.getModRefInfo(&I, StoreLoc) & Access)) + isModOrRefSet( + intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access))) return true; return false; @@ -780,6 +808,41 @@ static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount, return SE->getMinusSCEV(Start, Index); } +/// Compute the number of bytes as a SCEV from the backedge taken count. +/// +/// This also maps the SCEV into the provided type and tries to handle the +/// computation in a way that will fold cleanly. +static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr, + unsigned StoreSize, Loop *CurLoop, + const DataLayout *DL, ScalarEvolution *SE) { + const SCEV *NumBytesS; + // The # stored bytes is (BECount+1)*Size. Expand the trip count out to + // pointer size if it isn't already. 
+  //
+  // If we're going to need to zero extend the BE count, check if we can add
+  // one to it prior to zero extending without overflow. Provided this is safe,
+  // it allows better simplification of the +1.
+  if (DL->getTypeSizeInBits(BECount->getType()) <
+          DL->getTypeSizeInBits(IntPtr) &&
+      SE->isLoopEntryGuardedByCond(
+          CurLoop, ICmpInst::ICMP_NE, BECount,
+          SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
+    NumBytesS = SE->getZeroExtendExpr(
+        SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
+        IntPtr);
+  } else {
+    NumBytesS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
+                               SE->getOne(IntPtr), SCEV::FlagNUW);
+  }
+
+  // And scale it based on the store size.
+  if (StoreSize != 1) {
+    NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
+                               SCEV::FlagNUW);
+  }
+  return NumBytesS;
+}
+
 /// processLoopStridedStore - We see a strided store of some value. If we can
 /// transform this into a memset or memset_pattern in the loop preheader, do so.
 bool LoopIdiomRecognize::processLoopStridedStore(
@@ -824,8 +887,8 @@ bool LoopIdiomRecognize::processLoopStridedStore(
   // base pointer and checking the region.
   Value *BasePtr =
       Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
-  if (mayLoopAccessLocation(BasePtr, MRI_ModRef, CurLoop, BECount, StoreSize,
-                            *AA, Stores)) {
+  if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
+                            StoreSize, *AA, Stores)) {
     Expander.clear();
     // If we generated new code for the base pointer, clean up.
     RecursivelyDeleteTriviallyDeadInstructions(BasePtr, TLI);
@@ -837,16 +900,8 @@ bool LoopIdiomRecognize::processLoopStridedStore(
 
   // Okay, everything looks good, insert the memset.
 
-  // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
-  // pointer size if it isn't already.
-  BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr);
-  const SCEV *NumBytesS =
-      SE->getAddExpr(BECount, SE->getOne(IntPtr), SCEV::FlagNUW);
-  if (StoreSize != 1) {
-    NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
-                               SCEV::FlagNUW);
-  }
+  const SCEV *NumBytesS =
+      getNumBytes(BECount, IntPtr, StoreSize, CurLoop, DL, SE);
 
   // TODO: ideally we should still be able to generate memset if SCEV expander
   // is taught to generate the dependencies at the latest point.
@@ -903,7 +958,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
   Value *StorePtr = SI->getPointerOperand();
   const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
   APInt Stride = getStoreStride(StoreEv);
-  unsigned StoreSize = getStoreSizeInBytes(SI, DL);
+  unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
   bool NegStride = StoreSize == -Stride;
 
   // The store must be feeding a non-volatile load.
@@ -942,7 +997,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
 
   SmallPtrSet<Instruction *, 1> Stores;
   Stores.insert(SI);
-  if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount,
+  if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                             StoreSize, *AA, Stores)) {
     Expander.clear();
     // If we generated new code for the base pointer, clean up.
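One remark on the getNumBytes helper introduced above: the isLoopEntryGuardedByCond check is what makes folding the +1 into the zero-extend sound, because +1 and zext do not commute when the narrow backedge-taken count can be all-ones. A small self-contained illustration of the failure case (hypothetical values, not from this patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      // Model an i32 backedge-taken count widened to an i64 byte count.
      uint32_t BECount = 0xFFFFFFFFu; // i.e. -1 in i32: the problematic value
      uint64_t AddThenZext = (uint64_t)(uint32_t)(BECount + 1); // zext(BECount + 1) == 0
      uint64_t ZextThenAdd = (uint64_t)BECount + 1;             // zext(BECount) + 1 == 2^32
      assert(AddThenZext != ZextThenAdd);
      // So the +1 may only be folded inside the zero-extend once SCEV proves
      // BECount != -1 on loop entry; otherwise extend first, then add one.
      return 0;
    }
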
@@ -962,8 +1017,8 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
   Value *LoadBasePtr = Expander.expandCodeFor(
       LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
 
-  if (mayLoopAccessLocation(LoadBasePtr, MRI_Mod, CurLoop, BECount, StoreSize,
-                            *AA, Stores)) {
+  if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
+                            StoreSize, *AA, Stores)) {
     Expander.clear();
     // If we generated new code for the base pointer, clean up.
     RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI);
@@ -976,16 +1031,8 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
 
   // Okay, everything is safe, we can transform this!
 
-  // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
-  // pointer size if it isn't already.
-  BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy);
-  const SCEV *NumBytesS =
-      SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW);
-
-  if (StoreSize != 1)
-    NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize),
-                               SCEV::FlagNUW);
+  const SCEV *NumBytesS =
+      getNumBytes(BECount, IntPtrTy, StoreSize, CurLoop, DL, SE);
 
   Value *NumBytes =
       Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator());
@@ -1010,16 +1057,12 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
     if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
       return false;
 
+    // Create the call.
+    // Note that unordered atomic loads/stores are *required* by the spec to
+    // have an alignment but non-atomic loads/stores may not.
     NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
-        StoreBasePtr, LoadBasePtr, NumBytes, StoreSize);
-
-    // Propagate alignment info onto the pointer args. Note that unordered
-    // atomic loads/stores are *required* by the spec to have an alignment
-    // but non-atomic loads/stores may not.
- NewCall->addParamAttr(0, Attribute::getWithAlignment(NewCall->getContext(), - SI->getAlignment())); - NewCall->addParamAttr(1, Attribute::getWithAlignment(NewCall->getContext(), - LI->getAlignment())); + StoreBasePtr, SI->getAlignment(), LoadBasePtr, LI->getAlignment(), + NumBytes, StoreSize); } NewCall->setDebugLoc(SI->getDebugLoc()); @@ -1273,9 +1316,9 @@ static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX, // step 2: detect instructions corresponding to "x.next = x >> 1" if (!DefX || DefX->getOpcode() != Instruction::AShr) return false; - if (ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1))) - if (!Shft || !Shft->isOne()) - return false; + ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1)); + if (!Shft || !Shft->isOne()) + return false; VarX = DefX->getOperand(0); // step 3: Check the recurrence of variable X @@ -1469,7 +1512,7 @@ static CallInst *createCTLZIntrinsic(IRBuilder<> &IRBuilder, Value *Val, /// PhiX = PHI [InitX, DefX] /// CntInst = CntPhi + 1 /// DefX = PhiX >> 1 -// LOOP_BODY +/// LOOP_BODY /// Br: loop if (DefX != 0) /// Use(CntPhi) or Use(CntInst) /// diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp index af095560cc02..40d468a084d4 100644 --- a/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -12,22 +12,33 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/LoopInstSimplify.h" +#include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/Support/Debug.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/User.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" +#include <algorithm> +#include <utility> + using namespace llvm; #define DEBUG_TYPE "loop-instsimplify" @@ -45,7 +56,7 @@ static bool SimplifyLoopInst(Loop *L, DominatorTree *DT, LoopInfo *LI, // The bit we are stealing from the pointer represents whether this basic // block is the header of a subloop, in which case we only process its phis. 
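For the typedef change just below: PointerIntPair packs the subloop-header flag into a spare low bit of the aligned BasicBlock pointer, so a worklist item stays pointer-sized. A brief usage sketch under that reading (helper name hypothetical):

    #include "llvm/ADT/PointerIntPair.h"
    #include "llvm/IR/BasicBlock.h"

    // One worklist slot: a block plus one bit saying whether the block is a
    // subloop header (in which case only its PHIs are visited).
    using WorklistItem = llvm::PointerIntPair<llvm::BasicBlock *, 1>;

    static WorklistItem makeItem(llvm::BasicBlock *BB, bool IsSubloopHeader) {
      WorklistItem Item(BB, IsSubloopHeader);
      // Item.getPointer() recovers BB; Item.getInt() recovers the flag.
      return Item;
    }
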
- typedef PointerIntPair<BasicBlock *, 1> WorklistItem; + using WorklistItem = PointerIntPair<BasicBlock *, 1>; SmallVector<WorklistItem, 16> VisitStack; SmallPtrSet<BasicBlock *, 32> Visited; @@ -151,9 +162,11 @@ static bool SimplifyLoopInst(Loop *L, DominatorTree *DT, LoopInfo *LI, } namespace { + class LoopInstSimplifyLegacyPass : public LoopPass { public: static char ID; // Pass ID, replacement for typeid + LoopInstSimplifyLegacyPass() : LoopPass(ID) { initializeLoopInstSimplifyLegacyPassPass(*PassRegistry::getPassRegistry()); } @@ -181,7 +194,8 @@ public: getLoopAnalysisUsage(AU); } }; -} + +} // end anonymous namespace PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, @@ -195,6 +209,7 @@ PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM, } char LoopInstSimplifyLegacyPass::ID = 0; + INITIALIZE_PASS_BEGIN(LoopInstSimplifyLegacyPass, "loop-instsimplify", "Simplify instructions in loops", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) diff --git a/lib/Transforms/Scalar/LoopInterchange.cpp b/lib/Transforms/Scalar/LoopInterchange.cpp index 2e0d8e0374c0..4f8dafef230a 100644 --- a/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/lib/Transforms/Scalar/LoopInterchange.cpp @@ -1,4 +1,4 @@ -//===- LoopInterchange.cpp - Loop interchange pass------------------------===// +//===- LoopInterchange.cpp - Loop interchange pass-------------------------===// // // The LLVM Compiler Infrastructure // @@ -13,33 +13,38 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/BlockFrequencyInfo.h" -#include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/DependenceAnalysis.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/LoopIterator.h" -#include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/OptimizationDiagnosticInfo.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstIterator.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Module.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" #include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/LoopUtils.h" +#include <cassert> +#include <utility> +#include <vector> using namespace llvm; @@ -51,10 +56,12 @@ static cl::opt<int> LoopInterchangeCostThreshold( namespace { -typedef SmallVector<Loop *, 8> LoopVector; +using LoopVector = SmallVector<Loop *, 8>; // TODO: Check if we can use a sparse matrix here. 
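Before the CharMatrix change below, a concrete picture of the dependency matrix may help: each row models one dependence and each column one loop level (outermost first), using the direction characters that isProfitableForVectorization inspects later in this file. The instance here is illustrative only:

    #include <vector>

    using CharMatrix = std::vector<std::vector<char>>;

    // Two dependences over a 2-deep nest. Per the profitability check later
    // in this file, '=' in the outer column plus 'S' (scalar) or 'I'
    // (independent at that level) in the inner column reads as
    // interchange-friendly for vectorization.
    static const CharMatrix ExampleDepMatrix = {
        {'=', 'S'},
        {'=', 'I'},
    };
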
-typedef std::vector<std::vector<char>> CharMatrix; +using CharMatrix = std::vector<std::vector<char>>; + +} // end anonymous namespace // Maximum number of dependencies that can be handled in the dependency matrix. static const unsigned MaxMemInstrCount = 100; @@ -62,14 +69,11 @@ static const unsigned MaxMemInstrCount = 100; // Maximum loop depth supported. static const unsigned MaxLoopNestDepth = 10; -struct LoopInterchange; - #ifdef DUMP_DEP_MATRICIES -void printDepMatrix(CharMatrix &DepMatrix) { - for (auto I = DepMatrix.begin(), E = DepMatrix.end(); I != E; ++I) { - std::vector<char> Vec = *I; - for (auto II = Vec.begin(), EE = Vec.end(); II != EE; ++II) - DEBUG(dbgs() << *II << " "); +static void printDepMatrix(CharMatrix &DepMatrix) { + for (auto &Row : DepMatrix) { + for (auto D : Row) + DEBUG(dbgs() << D << " "); DEBUG(dbgs() << "\n"); } } @@ -77,25 +81,24 @@ void printDepMatrix(CharMatrix &DepMatrix) { static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, Loop *L, DependenceInfo *DI) { - typedef SmallVector<Value *, 16> ValueVector; + using ValueVector = SmallVector<Value *, 16>; + ValueVector MemInstr; // For each block. - for (Loop::block_iterator BB = L->block_begin(), BE = L->block_end(); - BB != BE; ++BB) { + for (BasicBlock *BB : L->blocks()) { // Scan the BB and collect legal loads and stores. - for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); I != E; - ++I) { + for (Instruction &I : *BB) { if (!isa<Instruction>(I)) return false; - if (LoadInst *Ld = dyn_cast<LoadInst>(I)) { + if (auto *Ld = dyn_cast<LoadInst>(&I)) { if (!Ld->isSimple()) return false; - MemInstr.push_back(&*I); - } else if (StoreInst *St = dyn_cast<StoreInst>(I)) { + MemInstr.push_back(&I); + } else if (auto *St = dyn_cast<StoreInst>(&I)) { if (!St->isSimple()) return false; - MemInstr.push_back(&*I); + MemInstr.push_back(&I); } } } @@ -171,7 +174,7 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, } // We don't have a DepMatrix to check legality return false. - if (DepMatrix.size() == 0) + if (DepMatrix.empty()) return false; return true; } @@ -216,7 +219,6 @@ static bool containsNoDependence(CharMatrix &DepMatrix, unsigned Row, static bool validDepInterchange(CharMatrix &DepMatrix, unsigned Row, unsigned OuterLoopId, char InnerDep, char OuterDep) { - if (isOuterMostDepPositive(DepMatrix, Row, OuterLoopId)) return false; @@ -255,7 +257,6 @@ static bool validDepInterchange(CharMatrix &DepMatrix, unsigned Row, static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, unsigned InnerLoopId, unsigned OuterLoopId) { - unsigned NumRows = DepMatrix.size(); // For each row check if it is valid to interchange. for (unsigned Row = 0; Row < NumRows; ++Row) { @@ -270,7 +271,6 @@ static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, } static void populateWorklist(Loop &L, SmallVector<LoopVector, 8> &V) { - DEBUG(dbgs() << "Calling populateWorklist on Func: " << L.getHeader()->getParent()->getName() << " Loop: %" << L.getHeader()->getName() << '\n'); @@ -320,6 +320,8 @@ static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) { return nullptr; } +namespace { + /// LoopInterchangeLegality checks if it is legal to interchange the loop. 
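As a reminder of what the legality class below protects, here is the transformation itself in plain C++ (a textbook illustration, not taken from this patch): interchanging a tightly nested pair turns a strided innermost access into a unit-stride one.

    // Illustrative only. No dependence forbids swapping these two loops.
    constexpr int N = 64, M = 64;
    int A[N][M];

    void before() {
      for (int j = 0; j < M; ++j)   // outer
        for (int i = 0; i < N; ++i) // inner: A[i][j] strides by M ints
          A[i][j] += 1;
    }

    void after() {                  // the interchanged nest
      for (int i = 0; i < N; ++i)
        for (int j = 0; j < M; ++j) // inner: A[i][j] is now consecutive
          A[i][j] += 1;
    }
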
class LoopInterchangeLegality { public: @@ -327,11 +329,12 @@ public: LoopInfo *LI, DominatorTree *DT, bool PreserveLCSSA, OptimizationRemarkEmitter *ORE) : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), - PreserveLCSSA(PreserveLCSSA), ORE(ORE), InnerLoopHasReduction(false) {} + PreserveLCSSA(PreserveLCSSA), ORE(ORE) {} /// Check if the loops can be interchanged. bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix); + /// Check if the loop structure is understood. We do not handle triangular /// loops for now. bool isLoopStructureUnderstood(PHINode *InnerInductionVar); @@ -348,6 +351,7 @@ private: bool findInductionAndReductions(Loop *L, SmallVector<PHINode *, 8> &Inductions, SmallVector<PHINode *, 8> &Reductions); + Loop *OuterLoop; Loop *InnerLoop; @@ -355,10 +359,11 @@ private: LoopInfo *LI; DominatorTree *DT; bool PreserveLCSSA; + /// Interface to emit optimization remarks. OptimizationRemarkEmitter *ORE; - bool InnerLoopHasReduction; + bool InnerLoopHasReduction = false; }; /// LoopInterchangeProfitability checks if it is profitable to interchange the @@ -381,6 +386,7 @@ private: /// Scev analysis. ScalarEvolution *SE; + /// Interface to emit optimization remarks. OptimizationRemarkEmitter *ORE; }; @@ -415,6 +421,7 @@ private: /// Scev analysis. ScalarEvolution *SE; + LoopInfo *LI; DominatorTree *DT; BasicBlock *LoopExit; @@ -424,16 +431,16 @@ private: // Main LoopInterchange Pass. struct LoopInterchange : public FunctionPass { static char ID; - ScalarEvolution *SE; - LoopInfo *LI; - DependenceInfo *DI; - DominatorTree *DT; + ScalarEvolution *SE = nullptr; + LoopInfo *LI = nullptr; + DependenceInfo *DI = nullptr; + DominatorTree *DT = nullptr; bool PreserveLCSSA; + /// Interface to emit optimization remarks. 
OptimizationRemarkEmitter *ORE; - LoopInterchange() - : FunctionPass(ID), SE(nullptr), LI(nullptr), DI(nullptr), DT(nullptr) { + LoopInterchange() : FunctionPass(ID) { initializeLoopInterchangePass(*PassRegistry::getPassRegistry()); } @@ -501,7 +508,6 @@ struct LoopInterchange : public FunctionPass { } bool processLoopList(LoopVector LoopList, Function &F) { - bool Changed = false; unsigned LoopNestDepth = LoopList.size(); if (LoopNestDepth < 2) { @@ -580,7 +586,6 @@ struct LoopInterchange : public FunctionPass { bool processLoop(LoopVector LoopList, unsigned InnerLoopId, unsigned OuterLoopId, BasicBlock *LoopNestExit, std::vector<std::vector<char>> &DependencyMatrix) { - DEBUG(dbgs() << "Processing Inner Loop Id = " << InnerLoopId << " and OuterLoopId = " << OuterLoopId << "\n"); Loop *InnerLoop = LoopList[InnerLoopId]; @@ -599,10 +604,12 @@ struct LoopInterchange : public FunctionPass { return false; } - ORE->emit(OptimizationRemark(DEBUG_TYPE, "Interchanged", - InnerLoop->getStartLoc(), - InnerLoop->getHeader()) - << "Loop interchanged with enclosing loop."); + ORE->emit([&]() { + return OptimizationRemark(DEBUG_TYPE, "Interchanged", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Loop interchanged with enclosing loop."; + }); LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LoopNestExit, LIL.hasInnerLoopReduction()); @@ -612,9 +619,10 @@ struct LoopInterchange : public FunctionPass { } }; -} // end of namespace +} // end anonymous namespace + bool LoopInterchangeLegality::areAllUsesReductions(Instruction *Ins, Loop *L) { - return none_of(Ins->users(), [=](User *U) -> bool { + return llvm::none_of(Ins->users(), [=](User *U) -> bool { auto *UserIns = dyn_cast<PHINode>(U); RecurrenceDescriptor RD; return !UserIns || !RecurrenceDescriptor::isReductionPHI(UserIns, L, RD); @@ -664,11 +672,9 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) { if (!OuterLoopHeaderBI) return false; - for (unsigned i = 0, e = OuterLoopHeaderBI->getNumSuccessors(); i < e; ++i) { - if (OuterLoopHeaderBI->getSuccessor(i) != InnerLoopPreHeader && - OuterLoopHeaderBI->getSuccessor(i) != OuterLoopLatch) + for (BasicBlock *Succ : OuterLoopHeaderBI->successors()) + if (Succ != InnerLoopPreHeader && Succ != OuterLoopLatch) return false; - } DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n"); // We do not have any basic block in between now make sure the outer header @@ -682,10 +688,8 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) { return true; } - bool LoopInterchangeLegality::isLoopStructureUnderstood( PHINode *InnerInduction) { - unsigned Num = InnerInduction->getNumOperands(); BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader(); for (unsigned i = 0; i < Num; ++i) { @@ -750,12 +754,12 @@ static bool containsSafePHI(BasicBlock *Block, bool isOuterLoopExitBlock) { static BasicBlock *getLoopLatchExitBlock(BasicBlock *LatchBlock, BasicBlock *LoopHeader) { if (BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator())) { - unsigned Num = BI->getNumSuccessors(); - assert(Num == 2); - for (unsigned i = 0; i < Num; ++i) { - if (BI->getSuccessor(i) == LoopHeader) + assert(BI->getNumSuccessors() == 2 && + "Branch leaving loop latch must have 2 successors"); + for (BasicBlock *Succ : BI->successors()) { + if (Succ == LoopHeader) continue; - return BI->getSuccessor(i); + return Succ; } } return nullptr; @@ -764,7 +768,6 @@ static BasicBlock *getLoopLatchExitBlock(BasicBlock *LatchBlock, // This 
function indicates the current limitations in the transform as a result
// of which we do not proceed.
bool LoopInterchangeLegality::currentLimitations() {
-
  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
  BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
  BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
@@ -777,12 +780,13 @@ bool LoopInterchangeLegality::currentLimitations() {
   if (!findInductionAndReductions(InnerLoop, Inductions, Reductions)) {
     DEBUG(dbgs() << "Only inner loops with induction or reduction PHI nodes "
                  << "are supported currently.\n");
-    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
-                                       "UnsupportedPHIInner",
-                                       InnerLoop->getStartLoc(),
-                                       InnerLoop->getHeader())
-              << "Only inner loops with induction or reduction PHI nodes can be"
-                 " interchanged currently.");
+    ORE->emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
+                                      InnerLoop->getStartLoc(),
+                                      InnerLoop->getHeader())
+             << "Only inner loops with induction or reduction PHI nodes can be"
+                " interchanged currently.";
+    });
     return true;
   }
@@ -790,12 +794,13 @@ bool LoopInterchangeLegality::currentLimitations() {
   if (Inductions.size() != 1) {
     DEBUG(dbgs() << "We currently only support loops with 1 induction variable."
                  << "Failed to interchange due to current limitation\n");
-    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
-                                       "MultiInductionInner",
-                                       InnerLoop->getStartLoc(),
-                                       InnerLoop->getHeader())
-              << "Only inner loops with 1 induction variable can be "
-                 "interchanged currently.");
+    ORE->emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "MultiInductionInner",
+                                      InnerLoop->getStartLoc(),
+                                      InnerLoop->getHeader())
+             << "Only inner loops with 1 induction variable can be "
+                "interchanged currently.";
+    });
     return true;
   }
   if (Reductions.size() > 0)
@@ -806,12 +811,13 @@ bool LoopInterchangeLegality::currentLimitations() {
   if (!findInductionAndReductions(OuterLoop, Inductions, Reductions)) {
     DEBUG(dbgs() << "Only outer loops with induction or reduction PHI nodes "
                  << "are supported currently.\n");
-    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
-                                       "UnsupportedPHIOuter",
-                                       OuterLoop->getStartLoc(),
-                                       OuterLoop->getHeader())
-              << "Only outer loops with induction or reduction PHI nodes can be"
-                 " interchanged currently.");
+    ORE->emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
+                                      OuterLoop->getStartLoc(),
+                                      OuterLoop->getHeader())
+             << "Only outer loops with induction or reduction PHI nodes can be"
+                " interchanged currently.";
+    });
     return true;
   }
@@ -820,35 +826,38 @@ bool LoopInterchangeLegality::currentLimitations() {
   if (!Reductions.empty()) {
     DEBUG(dbgs() << "Outer loops with reductions are not supported "
                  << "currently.\n");
-    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
-                                       "ReductionsOuter",
-                                       OuterLoop->getStartLoc(),
-                                       OuterLoop->getHeader())
-              << "Outer loops with reductions cannot be interchanged "
-                 "currently.");
+    ORE->emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "ReductionsOuter",
+                                      OuterLoop->getStartLoc(),
+                                      OuterLoop->getHeader())
+             << "Outer loops with reductions cannot be interchanged "
+                "currently.";
+    });
     return true;
   }
 
   // TODO: Currently we handle only loops with 1 induction variable.
   if (Inductions.size() != 1) {
     DEBUG(dbgs() << "Loops with more than 1 induction variables are not "
                  << "supported currently.\n");
-    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
-                                       "MultiIndutionOuter",
-                                       OuterLoop->getStartLoc(),
-                                       OuterLoop->getHeader())
-              << "Only outer loops with 1 induction variable can be "
-                 "interchanged currently.");
+    ORE->emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "MultiIndutionOuter",
+                                      OuterLoop->getStartLoc(),
+                                      OuterLoop->getHeader())
+             << "Only outer loops with 1 induction variable can be "
+                "interchanged currently.";
+    });
     return true;
   }
 
   // TODO: Triangular loops are not handled for now.
   if (!isLoopStructureUnderstood(InnerInductionVar)) {
     DEBUG(dbgs() << "Loop structure not understood by pass\n");
-    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
-                                       "UnsupportedStructureInner",
-                                       InnerLoop->getStartLoc(),
-                                       InnerLoop->getHeader())
-              << "Inner loop structure not understood currently.");
+    ORE->emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
+                                      InnerLoop->getStartLoc(),
+                                      InnerLoop->getHeader())
+             << "Inner loop structure not understood currently.";
+    });
     return true;
   }
@@ -857,24 +866,26 @@ bool LoopInterchangeLegality::currentLimitations() {
       getLoopLatchExitBlock(OuterLoopLatch, OuterLoopHeader);
   if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, true)) {
     DEBUG(dbgs() << "Can only handle LCSSA PHIs in outer loops currently.\n");
-    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
-                                       "NoLCSSAPHIOuter",
-                                       OuterLoop->getStartLoc(),
-                                       OuterLoop->getHeader())
-              << "Only outer loops with LCSSA PHIs can be interchanged "
-                 "currently.");
+    ORE->emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "NoLCSSAPHIOuter",
+                                      OuterLoop->getStartLoc(),
+                                      OuterLoop->getHeader())
+             << "Only outer loops with LCSSA PHIs can be interchanged "
+                "currently.";
+    });
     return true;
   }
 
   LoopExitBlock = getLoopLatchExitBlock(InnerLoopLatch, InnerLoopHeader);
   if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, false)) {
     DEBUG(dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n");
-    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
-                                       "NoLCSSAPHIOuterInner",
-                                       InnerLoop->getStartLoc(),
-                                       InnerLoop->getHeader())
-              << "Only inner loops with LCSSA PHIs can be interchanged "
-                 "currently.");
+    ORE->emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "NoLCSSAPHIOuterInner",
+                                      InnerLoop->getStartLoc(),
+                                      InnerLoop->getHeader())
+             << "Only inner loops with LCSSA PHIs can be interchanged "
+                "currently.";
+    });
     return true;
   }
@@ -899,11 +910,12 @@ bool LoopInterchangeLegality::currentLimitations() {
   if (!InnerIndexVarInc) {
     DEBUG(dbgs() << "Did not find an instruction to increment the induction "
                  << "variable.\n");
-    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
-                                       "NoIncrementInInner",
-                                       InnerLoop->getStartLoc(),
-                                       InnerLoop->getHeader())
-              << "The inner loop does not increment the induction variable.");
+    ORE->emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "NoIncrementInInner",
+                                      InnerLoop->getStartLoc(),
+                                      InnerLoop->getHeader())
+             << "The inner loop does not increment the induction variable.";
+    });
     return true;
   }
@@ -912,8 +924,9 @@ bool LoopInterchangeLegality::currentLimitations() {
   // instruction.
bool FoundInduction = false; - for (const Instruction &I : reverse(*InnerLoopLatch)) { - if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I)) + for (const Instruction &I : llvm::reverse(*InnerLoopLatch)) { + if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I) || + isa<ZExtInst>(I)) continue; // We found an instruction. If this is not induction variable then it is not @@ -921,12 +934,13 @@ bool LoopInterchangeLegality::currentLimitations() { if (!I.isIdenticalTo(InnerIndexVarInc)) { DEBUG(dbgs() << "Found unsupported instructions between induction " << "variable increment and branch.\n"); - ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, - "UnsupportedInsBetweenInduction", - InnerLoop->getStartLoc(), - InnerLoop->getHeader()) - << "Found unsupported instruction between induction variable " - "increment and branch."); + ORE->emit([&]() { + return OptimizationRemarkMissed( + DEBUG_TYPE, "UnsupportedInsBetweenInduction", + InnerLoop->getStartLoc(), InnerLoop->getHeader()) + << "Found unsupported instruction between induction variable " + "increment and branch."; + }); return true; } @@ -937,11 +951,12 @@ bool LoopInterchangeLegality::currentLimitations() { // current limitation. if (!FoundInduction) { DEBUG(dbgs() << "Did not find the induction variable.\n"); - ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, - "NoIndutionVariable", - InnerLoop->getStartLoc(), - InnerLoop->getHeader()) - << "Did not find the induction variable."); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "NoIndutionVariable", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Did not find the induction variable."; + }); return true; } return false; @@ -950,19 +965,31 @@ bool LoopInterchangeLegality::currentLimitations() { bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) { - if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) { DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId << " and OuterLoopId = " << OuterLoopId << " due to dependence\n"); - ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, - "Dependence", - InnerLoop->getStartLoc(), - InnerLoop->getHeader()) - << "Cannot interchange loops due to dependences."); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Cannot interchange loops due to dependences."; + }); return false; } + // Check if outer and inner loop contain legal instructions only. + for (auto *BB : OuterLoop->blocks()) + for (Instruction &I : *BB) + if (CallInst *CI = dyn_cast<CallInst>(&I)) { + // readnone functions do not prevent interchanging. + if (CI->doesNotReadMemory()) + continue; + DEBUG(dbgs() << "Loops with call instructions cannot be interchanged " + << "safely."); + return false; + } + // Create unique Preheaders if we already do not have one. BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader(); BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); @@ -995,12 +1022,13 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, // Check if the loops are tightly nested. 
if (!tightlyNested(OuterLoop, InnerLoop)) { DEBUG(dbgs() << "Loops not tightly nested\n"); - ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, - "NotTightlyNested", - InnerLoop->getStartLoc(), - InnerLoop->getHeader()) - << "Cannot interchange loops because they are not tightly " - "nested."); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Cannot interchange loops because they are not tightly " + "nested."; + }); return false; } @@ -1010,9 +1038,8 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, int LoopInterchangeProfitability::getInstrOrderCost() { unsigned GoodOrder, BadOrder; BadOrder = GoodOrder = 0; - for (auto BI = InnerLoop->block_begin(), BE = InnerLoop->block_end(); - BI != BE; ++BI) { - for (Instruction &Ins : **BI) { + for (BasicBlock *BB : InnerLoop->blocks()) { + for (Instruction &Ins : *BB) { if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) { unsigned NumOp = GEP->getNumOperands(); bool FoundInnerInduction = false; @@ -1064,12 +1091,11 @@ static bool isProfitableForVectorization(unsigned InnerLoopId, // TODO: Improve this heuristic to catch more cases. // If the inner loop is loop independent or doesn't carry any dependency it is // profitable to move this to outer position. - unsigned Row = DepMatrix.size(); - for (unsigned i = 0; i < Row; ++i) { - if (DepMatrix[i][InnerLoopId] != 'S' && DepMatrix[i][InnerLoopId] != 'I') + for (auto &Row : DepMatrix) { + if (Row[InnerLoopId] != 'S' && Row[InnerLoopId] != 'I') return false; // TODO: We need to improve this heuristic. - if (DepMatrix[i][OuterLoopId] != '=') + if (Row[OuterLoopId] != '=') return false; } // If outer loop has dependence and inner loop is loop independent then it is @@ -1080,7 +1106,6 @@ static bool isProfitableForVectorization(unsigned InnerLoopId, bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) { - // TODO: Add better profitability checks. 
// e.g.
// 1) Construct dependency matrix and move the one with no loop carried dep
@@ -1099,14 +1124,15 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
   if (isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix))
     return true;
 
-  ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
-                                     "InterchangeNotProfitable",
-                                     InnerLoop->getStartLoc(),
-                                     InnerLoop->getHeader())
-            << "Interchanging loops is too costly (cost="
-            << ore::NV("Cost", Cost) << ", threshold="
-            << ore::NV("Threshold", LoopInterchangeCostThreshold) <<
-            ") and it does not improve parallelism.");
+  ORE->emit([&]() {
+    return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
+                                    InnerLoop->getStartLoc(),
+                                    InnerLoop->getHeader())
+           << "Interchanging loops is too costly (cost="
+           << ore::NV("Cost", Cost) << ", threshold="
+           << ore::NV("Threshold", LoopInterchangeCostThreshold)
+           << ") and it does not improve parallelism.";
+  });
   return false;
 }
@@ -1145,7 +1171,7 @@ bool LoopInterchangeTransform::transform() {
   bool Transformed = false;
   Instruction *InnerIndexVar;
 
-  if (InnerLoop->getSubLoops().size() == 0) {
+  if (InnerLoop->getSubLoops().empty()) {
     BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
     DEBUG(dbgs() << "Calling Split Inner Loop\n");
     PHINode *InductionPHI = getInductionVariable(InnerLoop, SE);
@@ -1159,7 +1185,11 @@ bool LoopInterchangeTransform::transform() {
     else
       InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0));
 
-    //
+    // Ensure that InductionPHI is the first Phi node as required by
+    // splitInnerLoopHeader
+    if (&InductionPHI->getParent()->front() != InductionPHI)
+      InductionPHI->moveBefore(&InductionPHI->getParent()->front());
+
     // Split at the place where the induction variable is
     // incremented/decremented.
     // TODO: This splitting logic may not work always. Fix this.
@@ -1188,13 +1218,12 @@ void LoopInterchangeTransform::splitInnerLoopLatch(Instruction *Inc) {
 }
 
 void LoopInterchangeTransform::splitInnerLoopHeader() {
-
   // Split the inner loop header out. Here make sure that the reduction PHI's
   // stay in the innerloop body.
   BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
   BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
   if (InnerLoopHasReduction) {
-    // FIXME: Check if the induction PHI will always be the first PHI.
+    // Note: The induction PHI must be the first PHI for this to work
     BasicBlock *New = InnerLoopHeader->splitBasicBlock(
         ++(InnerLoopHeader->begin()), InnerLoopHeader->getName() + ".split");
     if (LI)
@@ -1244,7 +1273,6 @@ void LoopInterchangeTransform::updateIncomingBlock(BasicBlock *CurrBlock,
 }
 
 bool LoopInterchangeTransform::adjustLoopBranches() {
-
   DEBUG(dbgs() << "adjustLoopBranches called\n");
   // Adjust the loop preheader
   BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
@@ -1352,8 +1380,8 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
   return true;
 }
 
-void LoopInterchangeTransform::adjustLoopPreheaders() {
 
+void LoopInterchangeTransform::adjustLoopPreheaders() {
   // We have interchanged the preheaders so we need to interchange the data in
   // the preheader as well.
   // This is because the content of the inner preheader was previously executed
@@ -1373,7 +1401,6 @@ void LoopInterchangeTransform::adjustLoopPreheaders() {
 }
 
 bool LoopInterchangeTransform::adjustLoopLinks() {
-
   // Adjust all branches in the inner and outer loop.
bool Changed = adjustLoopBranches(); if (Changed) @@ -1382,6 +1409,7 @@ bool LoopInterchangeTransform::adjustLoopLinks() { } char LoopInterchange::ID = 0; + INITIALIZE_PASS_BEGIN(LoopInterchange, "loop-interchange", "Interchanges loops for cache reuse", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) diff --git a/lib/Transforms/Scalar/LoopLoadElimination.cpp b/lib/Transforms/Scalar/LoopLoadElimination.cpp index 20b37c4b70e6..dfa5ec1f354d 100644 --- a/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -28,22 +28,29 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/LoopVersioning.h" #include <algorithm> @@ -53,11 +60,11 @@ #include <tuple> #include <utility> +using namespace llvm; + #define LLE_OPTION "loop-load-elim" #define DEBUG_TYPE LLE_OPTION -using namespace llvm; - static cl::opt<unsigned> CheckPerElim( "runtime-check-per-loop-load-elim", cl::Hidden, cl::desc("Max number of memchecks allowed per eliminated load on average"), @@ -127,10 +134,12 @@ struct StoreToLoadForwardingCandidate { #endif }; +} // end anonymous namespace + /// \brief Check if the store dominates all latches, so as long as there is no /// intervening store this value will be loaded in the next iteration. -bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L, - DominatorTree *DT) { +static bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L, + DominatorTree *DT) { SmallVector<BasicBlock *, 8> Latches; L->getLoopLatches(Latches); return llvm::all_of(Latches, [&](const BasicBlock *Latch) { @@ -143,6 +152,8 @@ static bool isLoadConditional(LoadInst *Load, Loop *L) { return Load->getParent() != L->getHeader(); } +namespace { + /// \brief The per-loop class that does most of the work. class LoadEliminationForLoop { public: @@ -241,8 +252,8 @@ public: std::forward_list<StoreToLoadForwardingCandidate> &Candidates) { // If Store is nullptr it means that we have multiple stores forwarding to // this store. - typedef DenseMap<LoadInst *, const StoreToLoadForwardingCandidate *> - LoadToSingleCandT; + using LoadToSingleCandT = + DenseMap<LoadInst *, const StoreToLoadForwardingCandidate *>; LoadToSingleCandT LoadToSingleCand; for (const auto &Cand : Candidates) { @@ -393,7 +404,6 @@ public: void propagateStoredValueToLoadUsers(const StoreToLoadForwardingCandidate &Cand, SCEVExpander &SEE) { - // // loop: // %x = load %gep_i // = ... 
%x @@ -431,6 +441,7 @@ public: bool processLoop() { DEBUG(dbgs() << "\nIn \"" << L->getHeader()->getParent()->getName() << "\" checking " << *L << "\n"); + // Look for store-to-load forwarding cases across the // backedge. E.g.: // @@ -558,6 +569,8 @@ private: PredicatedScalarEvolution PSE; }; +} // end anonymous namespace + static bool eliminateLoadsAcrossLoops(Function &F, LoopInfo &LI, DominatorTree &DT, function_ref<const LoopAccessInfo &(Loop &)> GetLAI) { @@ -584,10 +597,14 @@ eliminateLoadsAcrossLoops(Function &F, LoopInfo &LI, DominatorTree &DT, return Changed; } +namespace { + /// \brief The pass. Most of the work is delegated to the per-loop /// LoadEliminationForLoop class. class LoopLoadElimination : public FunctionPass { public: + static char ID; + LoopLoadElimination() : FunctionPass(ID) { initializeLoopLoadEliminationPass(*PassRegistry::getPassRegistry()); } @@ -616,13 +633,12 @@ public: AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); } - - static char ID; }; } // end anonymous namespace char LoopLoadElimination::ID; + static const char LLE_name[] = "Loop Load Elimination"; INITIALIZE_PASS_BEGIN(LoopLoadElimination, LLE_OPTION, LLE_name, false, false) @@ -633,9 +649,7 @@ INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_END(LoopLoadElimination, LLE_OPTION, LLE_name, false, false) -namespace llvm { - -FunctionPass *createLoopLoadEliminationPass() { +FunctionPass *llvm::createLoopLoadEliminationPass() { return new LoopLoadElimination(); } @@ -652,7 +666,8 @@ PreservedAnalyses LoopLoadEliminationPass::run(Function &F, auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); bool Changed = eliminateLoadsAcrossLoops( F, LI, DT, [&](Loop &L) -> const LoopAccessInfo & { - LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI}; + LoopStandardAnalysisResults AR = {AA, AC, DT, LI, + SE, TLI, TTI, nullptr}; return LAM.getResult<LoopAccessAnalysis>(L, AR); }); @@ -662,5 +677,3 @@ PreservedAnalyses LoopLoadEliminationPass::run(Function &F, PreservedAnalyses PA; return PA; } - -} // end namespace llvm diff --git a/lib/Transforms/Scalar/LoopPredication.cpp b/lib/Transforms/Scalar/LoopPredication.cpp index 9b12ba180444..2e4c7b19e476 100644 --- a/lib/Transforms/Scalar/LoopPredication.cpp +++ b/lib/Transforms/Scalar/LoopPredication.cpp @@ -34,6 +34,143 @@ // else // deoptimize // +// It's tempting to rely on SCEV here, but it has proven to be problematic. +// Generally the facts SCEV provides about the increment step of add +// recurrences are true if the backedge of the loop is taken, which implicitly +// assumes that the guard doesn't fail. Using these facts to optimize the +// guard results in a circular logic where the guard is optimized under the +// assumption that it never fails. +// +// For example, in the loop below the induction variable will be marked as nuw +// basing on the guard. Basing on nuw the guard predicate will be considered +// monotonic. Given a monotonic condition it's tempting to replace the induction +// variable in the condition with its value on the last iteration. But this +// transformation is not correct, e.g. e = 4, b = 5 breaks the loop. +// +// for (int i = b; i != e; i++) +// guard(i u< len) +// +// One of the ways to reason about this problem is to use an inductive proof +// approach. 
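A quick way to see the trap described above is to run the counterexample: with `b = 5, e = 4` the `i != e` exit condition lets the induction variable wrap, so checking the guard only at the apparent last-iteration value wrongly reports the loop safe. A standalone sketch (plain C++, not part of the patch; `len` is chosen arbitrarily, and plain `<` on `uint32_t` stands in for `u<`):

```cpp
#include <cstdint>
#include <cstdio>

// Models guard(i u< len) inside: for (i = b; i != e; i++).
// With b = 5, e = 4 the IV walks 5, 6, ..., UINT32_MAX, 0, ..., 3 and
// visits values far above len, so the guard must fail.
int main() {
  const uint32_t b = 5, e = 4, len = 100;

  // Faithful execution: does any iteration violate the guard?
  bool guardFails = false;
  for (uint32_t i = b; i != e; ++i)
    if (!(i < len)) { guardFails = true; break; }

  // The incorrect "monotonic" rewrite: test only the value on the
  // (apparent) last iteration, e - 1.
  bool naiveWidenedCheck = (e - 1) < len;

  std::printf("guard actually fails: %s\n", guardFails ? "yes" : "no");
  std::printf("naive widened check passes: %s\n",
              naiveWidenedCheck ? "yes" : "no"); // wrongly says "safe"
}
```

The inductive formulation that rules this out follows.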
Given the loop: +// +// if (B(0)) { +// do { +// I = PHI(0, I.INC) +// I.INC = I + Step +// guard(G(I)); +// } while (B(I)); +// } +// +// where B(x) and G(x) are predicates that map integers to booleans, we want a +// loop invariant expression M such the following program has the same semantics +// as the above: +// +// if (B(0)) { +// do { +// I = PHI(0, I.INC) +// I.INC = I + Step +// guard(G(0) && M); +// } while (B(I)); +// } +// +// One solution for M is M = forall X . (G(X) && B(X)) => G(X + Step) +// +// Informal proof that the transformation above is correct: +// +// By the definition of guards we can rewrite the guard condition to: +// G(I) && G(0) && M +// +// Let's prove that for each iteration of the loop: +// G(0) && M => G(I) +// And the condition above can be simplified to G(Start) && M. +// +// Induction base. +// G(0) && M => G(0) +// +// Induction step. Assuming G(0) && M => G(I) on the subsequent +// iteration: +// +// B(I) is true because it's the backedge condition. +// G(I) is true because the backedge is guarded by this condition. +// +// So M = forall X . (G(X) && B(X)) => G(X + Step) implies G(I + Step). +// +// Note that we can use anything stronger than M, i.e. any condition which +// implies M. +// +// When S = 1 (i.e. forward iterating loop), the transformation is supported +// when: +// * The loop has a single latch with the condition of the form: +// B(X) = latchStart + X <pred> latchLimit, +// where <pred> is u<, u<=, s<, or s<=. +// * The guard condition is of the form +// G(X) = guardStart + X u< guardLimit +// +// For the ult latch comparison case M is: +// forall X . guardStart + X u< guardLimit && latchStart + X <u latchLimit => +// guardStart + X + 1 u< guardLimit +// +// The only way the antecedent can be true and the consequent can be false is +// if +// X == guardLimit - 1 - guardStart +// (and guardLimit is non-zero, but we won't use this latter fact). +// If X == guardLimit - 1 - guardStart then the second half of the antecedent is +// latchStart + guardLimit - 1 - guardStart u< latchLimit +// and its negation is +// latchStart + guardLimit - 1 - guardStart u>= latchLimit +// +// In other words, if +// latchLimit u<= latchStart + guardLimit - 1 - guardStart +// then: +// (the ranges below are written in ConstantRange notation, where [A, B) is the +// set for (I = A; I != B; I++ /*maywrap*/) yield(I);) +// +// forall X . guardStart + X u< guardLimit && +// latchStart + X u< latchLimit => +// guardStart + X + 1 u< guardLimit +// == forall X . guardStart + X u< guardLimit && +// latchStart + X u< latchStart + guardLimit - 1 - guardStart => +// guardStart + X + 1 u< guardLimit +// == forall X . (guardStart + X) in [0, guardLimit) && +// (latchStart + X) in [0, latchStart + guardLimit - 1 - guardStart) => +// (guardStart + X + 1) in [0, guardLimit) +// == forall X . 
X in [-guardStart, guardLimit - guardStart) && +// X in [-latchStart, guardLimit - 1 - guardStart) => +// X in [-guardStart - 1, guardLimit - guardStart - 1) +// == true +// +// So the widened condition is: +// guardStart u< guardLimit && +// latchStart + guardLimit - 1 - guardStart u>= latchLimit +// Similarly for ule condition the widened condition is: +// guardStart u< guardLimit && +// latchStart + guardLimit - 1 - guardStart u> latchLimit +// For slt condition the widened condition is: +// guardStart u< guardLimit && +// latchStart + guardLimit - 1 - guardStart s>= latchLimit +// For sle condition the widened condition is: +// guardStart u< guardLimit && +// latchStart + guardLimit - 1 - guardStart s> latchLimit +// +// When S = -1 (i.e. reverse iterating loop), the transformation is supported +// when: +// * The loop has a single latch with the condition of the form: +// B(X) = X <pred> latchLimit, where <pred> is u> or s>. +// * The guard condition is of the form +// G(X) = X - 1 u< guardLimit +// +// For the ugt latch comparison case M is: +// forall X. X-1 u< guardLimit and X u> latchLimit => X-2 u< guardLimit +// +// The only way the antecedent can be true and the consequent can be false is if +// X == 1. +// If X == 1 then the second half of the antecedent is +// 1 u> latchLimit, and its negation is latchLimit u>= 1. +// +// So the widened condition is: +// guardStart u< guardLimit && latchLimit u>= 1. +// Similarly for sgt condition the widened condition is: +// guardStart u< guardLimit && latchLimit s>= 1. //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/LoopPredication.h" @@ -56,6 +193,11 @@ using namespace llvm; +static cl::opt<bool> EnableIVTruncation("loop-predication-enable-iv-truncation", + cl::Hidden, cl::init(true)); + +static cl::opt<bool> EnableCountDownLoop("loop-predication-enable-count-down-loop", + cl::Hidden, cl::init(true)); namespace { class LoopPredication { /// Represents an induction variable check: @@ -68,6 +210,10 @@ class LoopPredication { const SCEV *Limit) : Pred(Pred), IV(IV), Limit(Limit) {} LoopICmp() {} + void dump() { + dbgs() << "LoopICmp Pred = " << Pred << ", IV = " << *IV + << ", Limit = " << *Limit << "\n"; + } }; ScalarEvolution *SE; @@ -75,17 +221,51 @@ class LoopPredication { Loop *L; const DataLayout *DL; BasicBlock *Preheader; + LoopICmp LatchCheck; - Optional<LoopICmp> parseLoopICmp(ICmpInst *ICI); + bool isSupportedStep(const SCEV* Step); + Optional<LoopICmp> parseLoopICmp(ICmpInst *ICI) { + return parseLoopICmp(ICI->getPredicate(), ICI->getOperand(0), + ICI->getOperand(1)); + } + Optional<LoopICmp> parseLoopICmp(ICmpInst::Predicate Pred, Value *LHS, + Value *RHS); + + Optional<LoopICmp> parseLoopLatchICmp(); + bool CanExpand(const SCEV* S); Value *expandCheck(SCEVExpander &Expander, IRBuilder<> &Builder, ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, Instruction *InsertAt); Optional<Value *> widenICmpRangeCheck(ICmpInst *ICI, SCEVExpander &Expander, IRBuilder<> &Builder); + Optional<Value *> widenICmpRangeCheckIncrementingLoop(LoopICmp LatchCheck, + LoopICmp RangeCheck, + SCEVExpander &Expander, + IRBuilder<> &Builder); + Optional<Value *> widenICmpRangeCheckDecrementingLoop(LoopICmp LatchCheck, + LoopICmp RangeCheck, + SCEVExpander &Expander, + IRBuilder<> &Builder); bool widenGuardConditions(IntrinsicInst *II, SCEVExpander &Expander); + // When the IV type is wider than the range operand type, we can still do loop + // predication, by generating SCEVs for 
the range and latch that are of the + // same type. We achieve this by generating a SCEV truncate expression for the + // latch IV. This is done iff truncation of the IV is a safe operation, + // without loss of information. + // Another way to achieve this is by generating a wider type SCEV for the + // range check operand, however, this needs a more involved check that + // operands do not overflow. This can lead to loss of information when the + // range operand is of the form: add i32 %offset, %iv. We need to prove that + // sext(x + y) is same as sext(x) + sext(y). + // This function returns true if we can safely represent the IV type in + // the RangeCheckType without loss of information. + bool isSafeToTruncateWideIVType(Type *RangeCheckType); + // Return the loopLatchCheck corresponding to the RangeCheckType if safe to do + // so. + Optional<LoopICmp> generateLoopLatchCheck(Type *RangeCheckType); public: LoopPredication(ScalarEvolution *SE) : SE(SE){}; bool runOnLoop(Loop *L); @@ -135,11 +315,8 @@ PreservedAnalyses LoopPredicationPass::run(Loop &L, LoopAnalysisManager &AM, } Optional<LoopPredication::LoopICmp> -LoopPredication::parseLoopICmp(ICmpInst *ICI) { - ICmpInst::Predicate Pred = ICI->getPredicate(); - - Value *LHS = ICI->getOperand(0); - Value *RHS = ICI->getOperand(1); +LoopPredication::parseLoopICmp(ICmpInst::Predicate Pred, Value *LHS, + Value *RHS) { const SCEV *LHSS = SE->getSCEV(LHS); if (isa<SCEVCouldNotCompute>(LHSS)) return None; @@ -165,13 +342,146 @@ Value *LoopPredication::expandCheck(SCEVExpander &Expander, IRBuilder<> &Builder, ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, Instruction *InsertAt) { + // TODO: we can check isLoopEntryGuardedByCond before emitting the check + Type *Ty = LHS->getType(); assert(Ty == RHS->getType() && "expandCheck operands have different types?"); + + if (SE->isLoopEntryGuardedByCond(L, Pred, LHS, RHS)) + return Builder.getTrue(); + Value *LHSV = Expander.expandCodeFor(LHS, Ty, InsertAt); Value *RHSV = Expander.expandCodeFor(RHS, Ty, InsertAt); return Builder.CreateICmp(Pred, LHSV, RHSV); } +Optional<LoopPredication::LoopICmp> +LoopPredication::generateLoopLatchCheck(Type *RangeCheckType) { + + auto *LatchType = LatchCheck.IV->getType(); + if (RangeCheckType == LatchType) + return LatchCheck; + // For now, bail out if latch type is narrower than range type. + if (DL->getTypeSizeInBits(LatchType) < DL->getTypeSizeInBits(RangeCheckType)) + return None; + if (!isSafeToTruncateWideIVType(RangeCheckType)) + return None; + // We can now safely identify the truncated version of the IV and limit for + // RangeCheckType. 
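An aside on the comment above: the reason widening the range-check operand would need the stronger overflow proof is that sign extension does not distribute over a wrapping add. A minimal demonstration (plain C++, not part of the patch; `sext32` and `add32` are illustrative helpers):

```cpp
#include <cstdint>
#include <cstdio>

// sext of a 32-bit value to 64 bits.
static int64_t sext32(int32_t v) { return static_cast<int64_t>(v); }

// 32-bit wrapping add (made well-defined via unsigned arithmetic).
static int32_t add32(int32_t a, int32_t b) {
  return static_cast<int32_t>(static_cast<uint32_t>(a) +
                              static_cast<uint32_t>(b));
}

int main() {
  int32_t x = INT32_MAX, y = 1; // the narrow add wraps
  std::printf("sext(x + y)       = %lld\n",
              static_cast<long long>(sext32(add32(x, y)))); // -2147483648
  std::printf("sext(x) + sext(y) = %lld\n",
              static_cast<long long>(sext32(x) + sext32(y))); // 2147483648
}
```

Truncating the latch IV instead, as the function body resumed below does, sidesteps this proof obligation entirely.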
+ LoopICmp NewLatchCheck; + NewLatchCheck.Pred = LatchCheck.Pred; + NewLatchCheck.IV = dyn_cast<SCEVAddRecExpr>( + SE->getTruncateExpr(LatchCheck.IV, RangeCheckType)); + if (!NewLatchCheck.IV) + return None; + NewLatchCheck.Limit = SE->getTruncateExpr(LatchCheck.Limit, RangeCheckType); + DEBUG(dbgs() << "IV of type: " << *LatchType + << "can be represented as range check type:" << *RangeCheckType + << "\n"); + DEBUG(dbgs() << "LatchCheck.IV: " << *NewLatchCheck.IV << "\n"); + DEBUG(dbgs() << "LatchCheck.Limit: " << *NewLatchCheck.Limit << "\n"); + return NewLatchCheck; +} + +bool LoopPredication::isSupportedStep(const SCEV* Step) { + return Step->isOne() || (Step->isAllOnesValue() && EnableCountDownLoop); +} + +bool LoopPredication::CanExpand(const SCEV* S) { + return SE->isLoopInvariant(S, L) && isSafeToExpand(S, *SE); +} + +Optional<Value *> LoopPredication::widenICmpRangeCheckIncrementingLoop( + LoopPredication::LoopICmp LatchCheck, LoopPredication::LoopICmp RangeCheck, + SCEVExpander &Expander, IRBuilder<> &Builder) { + auto *Ty = RangeCheck.IV->getType(); + // Generate the widened condition for the forward loop: + // guardStart u< guardLimit && + // latchLimit <pred> guardLimit - 1 - guardStart + latchStart + // where <pred> depends on the latch condition predicate. See the file + // header comment for the reasoning. + // guardLimit - guardStart + latchStart - 1 + const SCEV *GuardStart = RangeCheck.IV->getStart(); + const SCEV *GuardLimit = RangeCheck.Limit; + const SCEV *LatchStart = LatchCheck.IV->getStart(); + const SCEV *LatchLimit = LatchCheck.Limit; + + // guardLimit - guardStart + latchStart - 1 + const SCEV *RHS = + SE->getAddExpr(SE->getMinusSCEV(GuardLimit, GuardStart), + SE->getMinusSCEV(LatchStart, SE->getOne(Ty))); + if (!CanExpand(GuardStart) || !CanExpand(GuardLimit) || + !CanExpand(LatchLimit) || !CanExpand(RHS)) { + DEBUG(dbgs() << "Can't expand limit check!\n"); + return None; + } + ICmpInst::Predicate LimitCheckPred; + switch (LatchCheck.Pred) { + case ICmpInst::ICMP_ULT: + LimitCheckPred = ICmpInst::ICMP_ULE; + break; + case ICmpInst::ICMP_ULE: + LimitCheckPred = ICmpInst::ICMP_ULT; + break; + case ICmpInst::ICMP_SLT: + LimitCheckPred = ICmpInst::ICMP_SLE; + break; + case ICmpInst::ICMP_SLE: + LimitCheckPred = ICmpInst::ICMP_SLT; + break; + default: + llvm_unreachable("Unsupported loop latch!"); + } + + DEBUG(dbgs() << "LHS: " << *LatchLimit << "\n"); + DEBUG(dbgs() << "RHS: " << *RHS << "\n"); + DEBUG(dbgs() << "Pred: " << LimitCheckPred << "\n"); + + Instruction *InsertAt = Preheader->getTerminator(); + auto *LimitCheck = + expandCheck(Expander, Builder, LimitCheckPred, LatchLimit, RHS, InsertAt); + auto *FirstIterationCheck = expandCheck(Expander, Builder, RangeCheck.Pred, + GuardStart, GuardLimit, InsertAt); + return Builder.CreateAnd(FirstIterationCheck, LimitCheck); +} + +Optional<Value *> LoopPredication::widenICmpRangeCheckDecrementingLoop( + LoopPredication::LoopICmp LatchCheck, LoopPredication::LoopICmp RangeCheck, + SCEVExpander &Expander, IRBuilder<> &Builder) { + auto *Ty = RangeCheck.IV->getType(); + const SCEV *GuardStart = RangeCheck.IV->getStart(); + const SCEV *GuardLimit = RangeCheck.Limit; + const SCEV *LatchLimit = LatchCheck.Limit; + if (!CanExpand(GuardStart) || !CanExpand(GuardLimit) || + !CanExpand(LatchLimit)) { + DEBUG(dbgs() << "Can't expand limit check!\n"); + return None; + } + // The decrement of the latch check IV should be the same as the + // rangeCheckIV. 
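The predicate table derived in the file header comment can also be spot-checked mechanically. A throwaway verifier (plain C++, not part of the patch) runs the ult case exhaustively over a toy 4-bit unsigned domain standing in for iN wraparound arithmetic and finds no counterexamples:

```cpp
#include <cstdio>

// Exhaustive check of the "ult latch" widening from the file header,
// over 4-bit unsigned arithmetic (wraparound mod 16) instead of iN.
// Claim: if   latchLimit u<= latchStart + guardLimit - 1 - guardStart
// then forall X:
//   guardStart+X u< guardLimit && latchStart+X u< latchLimit
//     => guardStart+X+1 u< guardLimit
int main() {
  const unsigned M = 16; // toy word size: 4 bits
  auto add = [&](unsigned a, unsigned b) { return (a + b) % M; };
  auto sub = [&](unsigned a, unsigned b) { return (a + M - b) % M; };

  unsigned violations = 0;
  for (unsigned gs = 0; gs < M; ++gs)
    for (unsigned gl = 0; gl < M; ++gl)
      for (unsigned ls = 0; ls < M; ++ls)
        for (unsigned ll = 0; ll < M; ++ll) {
          // The widened limit check:
          //   latchLimit u<= latchStart + guardLimit - 1 - guardStart
          if (!(ll <= add(ls, sub(sub(gl, 1), gs))))
            continue;
          for (unsigned X = 0; X < M; ++X) {
            bool antecedent = add(gs, X) < gl && add(ls, X) < ll;
            bool consequent = add(add(gs, X), 1) < gl;
            if (antecedent && !consequent)
              ++violations;
          }
        }
  std::printf("violations: %u\n", violations); // prints 0
}
```

The same harness can be adapted to the ule/slt/sle rows by swapping the comparison in the filter.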
+ auto *PostDecLatchCheckIV = LatchCheck.IV->getPostIncExpr(*SE); + if (RangeCheck.IV != PostDecLatchCheckIV) { + DEBUG(dbgs() << "Not the same. PostDecLatchCheckIV: " + << *PostDecLatchCheckIV + << " and RangeCheckIV: " << *RangeCheck.IV << "\n"); + return None; + } + + // Generate the widened condition for CountDownLoop: + // guardStart u< guardLimit && + // latchLimit <pred> 1. + // See the header comment for reasoning of the checks. + Instruction *InsertAt = Preheader->getTerminator(); + auto LimitCheckPred = ICmpInst::isSigned(LatchCheck.Pred) + ? ICmpInst::ICMP_SGE + : ICmpInst::ICMP_UGE; + auto *FirstIterationCheck = expandCheck(Expander, Builder, ICmpInst::ICMP_ULT, + GuardStart, GuardLimit, InsertAt); + auto *LimitCheck = expandCheck(Expander, Builder, LimitCheckPred, LatchLimit, + SE->getOne(Ty), InsertAt); + return Builder.CreateAnd(FirstIterationCheck, LimitCheck); +} + /// If ICI can be widened to a loop invariant condition emits the loop /// invariant condition in the loop preheader and return it, otherwise /// returns None. @@ -181,51 +491,62 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI, DEBUG(dbgs() << "Analyzing ICmpInst condition:\n"); DEBUG(ICI->dump()); + // parseLoopStructure guarantees that the latch condition is: + // ++i <pred> latchLimit, where <pred> is u<, u<=, s<, or s<=. + // We are looking for the range checks of the form: + // i u< guardLimit auto RangeCheck = parseLoopICmp(ICI); if (!RangeCheck) { DEBUG(dbgs() << "Failed to parse the loop latch condition!\n"); return None; } - - ICmpInst::Predicate Pred = RangeCheck->Pred; - const SCEVAddRecExpr *IndexAR = RangeCheck->IV; - const SCEV *RHSS = RangeCheck->Limit; - - auto CanExpand = [this](const SCEV *S) { - return SE->isLoopInvariant(S, L) && isSafeToExpand(S, *SE); - }; - if (!CanExpand(RHSS)) + DEBUG(dbgs() << "Guard check:\n"); + DEBUG(RangeCheck->dump()); + if (RangeCheck->Pred != ICmpInst::ICMP_ULT) { + DEBUG(dbgs() << "Unsupported range check predicate(" << RangeCheck->Pred + << ")!\n"); return None; - - DEBUG(dbgs() << "IndexAR: "); - DEBUG(IndexAR->dump()); - - bool IsIncreasing = false; - if (!SE->isMonotonicPredicate(IndexAR, Pred, IsIncreasing)) + } + auto *RangeCheckIV = RangeCheck->IV; + if (!RangeCheckIV->isAffine()) { + DEBUG(dbgs() << "Range check IV is not affine!\n"); return None; - - // If the predicate is increasing the condition can change from false to true - // as the loop progresses, in this case take the value on the first iteration - // for the widened check. Otherwise the condition can change from true to - // false as the loop progresses, so take the value on the last iteration. - const SCEV *NewLHSS = IsIncreasing - ? IndexAR->getStart() - : SE->getSCEVAtScope(IndexAR, L->getParentLoop()); - if (NewLHSS == IndexAR) { - DEBUG(dbgs() << "Can't compute NewLHSS!\n"); + } + auto *Step = RangeCheckIV->getStepRecurrence(*SE); + // We cannot just compare with latch IV step because the latch and range IVs + // may have different types. + if (!isSupportedStep(Step)) { + DEBUG(dbgs() << "Range check and latch have IVs different steps!\n"); return None; } - - DEBUG(dbgs() << "NewLHSS: "); - DEBUG(NewLHSS->dump()); - - if (!CanExpand(NewLHSS)) + auto *Ty = RangeCheckIV->getType(); + auto CurrLatchCheckOpt = generateLoopLatchCheck(Ty); + if (!CurrLatchCheckOpt) { + DEBUG(dbgs() << "Failed to generate a loop latch check " + "corresponding to range type: " + << *Ty << "\n"); return None; + } - DEBUG(dbgs() << "NewLHSS is loop invariant and safe to expand. 
Expand!\n"); + LoopICmp CurrLatchCheck = *CurrLatchCheckOpt; + // At this point, the range and latch step should have the same type, but need + // not have the same value (we support both 1 and -1 steps). + assert(Step->getType() == + CurrLatchCheck.IV->getStepRecurrence(*SE)->getType() && + "Range and latch steps should be of same type!"); + if (Step != CurrLatchCheck.IV->getStepRecurrence(*SE)) { + DEBUG(dbgs() << "Range and latch have different step values!\n"); + return None; + } - Instruction *InsertAt = Preheader->getTerminator(); - return expandCheck(Expander, Builder, Pred, NewLHSS, RHSS, InsertAt); + if (Step->isOne()) + return widenICmpRangeCheckIncrementingLoop(CurrLatchCheck, *RangeCheck, + Expander, Builder); + else { + assert(Step->isAllOnesValue() && "Step should be -1!"); + return widenICmpRangeCheckDecrementingLoop(CurrLatchCheck, *RangeCheck, + Expander, Builder); + } } bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard, @@ -288,6 +609,97 @@ bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard, return true; } +Optional<LoopPredication::LoopICmp> LoopPredication::parseLoopLatchICmp() { + using namespace PatternMatch; + + BasicBlock *LoopLatch = L->getLoopLatch(); + if (!LoopLatch) { + DEBUG(dbgs() << "The loop doesn't have a single latch!\n"); + return None; + } + + ICmpInst::Predicate Pred; + Value *LHS, *RHS; + BasicBlock *TrueDest, *FalseDest; + + if (!match(LoopLatch->getTerminator(), + m_Br(m_ICmp(Pred, m_Value(LHS), m_Value(RHS)), TrueDest, + FalseDest))) { + DEBUG(dbgs() << "Failed to match the latch terminator!\n"); + return None; + } + assert((TrueDest == L->getHeader() || FalseDest == L->getHeader()) && + "One of the latch's destinations must be the header"); + if (TrueDest != L->getHeader()) + Pred = ICmpInst::getInversePredicate(Pred); + + auto Result = parseLoopICmp(Pred, LHS, RHS); + if (!Result) { + DEBUG(dbgs() << "Failed to parse the loop latch condition!\n"); + return None; + } + + // Check affine first, so if it's not we don't try to compute the step + // recurrence. + if (!Result->IV->isAffine()) { + DEBUG(dbgs() << "The induction variable is not affine!\n"); + return None; + } + + auto *Step = Result->IV->getStepRecurrence(*SE); + if (!isSupportedStep(Step)) { + DEBUG(dbgs() << "Unsupported loop stride(" << *Step << ")!\n"); + return None; + } + + auto IsUnsupportedPredicate = [](const SCEV *Step, ICmpInst::Predicate Pred) { + if (Step->isOne()) { + return Pred != ICmpInst::ICMP_ULT && Pred != ICmpInst::ICMP_SLT && + Pred != ICmpInst::ICMP_ULE && Pred != ICmpInst::ICMP_SLE; + } else { + assert(Step->isAllOnesValue() && "Step should be -1!"); + return Pred != ICmpInst::ICMP_UGT && Pred != ICmpInst::ICMP_SGT; + } + }; + + if (IsUnsupportedPredicate(Step, Result->Pred)) { + DEBUG(dbgs() << "Unsupported loop latch predicate(" << Result->Pred + << ")!\n"); + return None; + } + return Result; +} + +// Returns true if its safe to truncate the IV to RangeCheckType. +bool LoopPredication::isSafeToTruncateWideIVType(Type *RangeCheckType) { + if (!EnableIVTruncation) + return false; + assert(DL->getTypeSizeInBits(LatchCheck.IV->getType()) > + DL->getTypeSizeInBits(RangeCheckType) && + "Expected latch check IV type to be larger than range check operand " + "type!"); + // The start and end values of the IV should be known. This is to guarantee + // that truncating the wide type will not lose information. 
+ auto *Limit = dyn_cast<SCEVConstant>(LatchCheck.Limit); + auto *Start = dyn_cast<SCEVConstant>(LatchCheck.IV->getStart()); + if (!Limit || !Start) + return false; + // This check makes sure that the IV does not change sign during loop + // iterations. Consider latchType = i64, LatchStart = 5, Pred = ICMP_SGE, + // LatchEnd = 2, rangeCheckType = i32. If it's not a monotonic predicate, the + // IV wraps around, and the truncation of the IV would lose the range of + // iterations between 2^32 and 2^64. + bool Increasing; + if (!SE->isMonotonicPredicate(LatchCheck.IV, LatchCheck.Pred, Increasing)) + return false; + // The active bits should be less than the bits in the RangeCheckType. This + // guarantees that truncating the latch check to RangeCheckType is a safe + // operation. + auto RangeCheckTypeBitSize = DL->getTypeSizeInBits(RangeCheckType); + return Start->getAPInt().getActiveBits() < RangeCheckTypeBitSize && + Limit->getAPInt().getActiveBits() < RangeCheckTypeBitSize; +} + bool LoopPredication::runOnLoop(Loop *Loop) { L = Loop; @@ -308,6 +720,14 @@ bool LoopPredication::runOnLoop(Loop *Loop) { if (!Preheader) return false; + auto LatchCheckOpt = parseLoopLatchICmp(); + if (!LatchCheckOpt) + return false; + LatchCheck = *LatchCheckOpt; + + DEBUG(dbgs() << "Latch check:\n"); + DEBUG(LatchCheck.dump()); + // Collect all the guards into a vector and process later, so as not // to invalidate the instruction iterator. SmallVector<IntrinsicInst *, 4> Guards; diff --git a/lib/Transforms/Scalar/LoopRerollPass.cpp b/lib/Transforms/Scalar/LoopRerollPass.cpp index fc0216e76a5b..d1a54b877950 100644 --- a/lib/Transforms/Scalar/LoopRerollPass.cpp +++ b/lib/Transforms/Scalar/LoopRerollPass.cpp @@ -1,4 +1,4 @@ -//===-- LoopReroll.cpp - Loop rerolling pass ------------------------------===// +//===- LoopReroll.cpp - Loop rerolling pass -------------------------------===// // // The LLVM Compiler Infrastructure // @@ -11,22 +11,42 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/APInt.h" #include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -34,6 +54,13 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" +#include 
<cassert> +#include <cstddef> +#include <cstdint> +#include <cstdlib> +#include <iterator> +#include <map> +#include <utility> using namespace llvm; @@ -127,6 +154,7 @@ NumToleratedFailedMatches("reroll-num-tolerated-failed-matches", cl::init(400), // br %cmp, header, exit namespace { + enum IterationLimits { /// The maximum number of iterations that we'll try and reroll. IL_MaxRerollIterations = 32, @@ -139,6 +167,7 @@ namespace { class LoopReroll : public LoopPass { public: static char ID; // Pass ID, replacement for typeid + LoopReroll() : LoopPass(ID) { initializeLoopRerollPass(*PassRegistry::getPassRegistry()); } @@ -158,11 +187,12 @@ namespace { DominatorTree *DT; bool PreserveLCSSA; - typedef SmallVector<Instruction *, 16> SmallInstructionVector; - typedef SmallSet<Instruction *, 16> SmallInstructionSet; + using SmallInstructionVector = SmallVector<Instruction *, 16>; + using SmallInstructionSet = SmallSet<Instruction *, 16>; // Map between induction variable and its increment DenseMap<Instruction *, int64_t> IVToIncMap; + // For loop with multiple induction variable, remember the one used only to // control the loop. Instruction *LoopControlIV; @@ -171,8 +201,7 @@ namespace { // representing a reduction. Only the last value may be used outside the // loop. struct SimpleLoopReduction { - SimpleLoopReduction(Instruction *P, Loop *L) - : Valid(false), Instructions(1, P) { + SimpleLoopReduction(Instruction *P, Loop *L) : Instructions(1, P) { assert(isa<PHINode>(P) && "First reduction instruction must be a PHI"); add(L); } @@ -204,8 +233,8 @@ namespace { return Instructions.size()-1; } - typedef SmallInstructionVector::iterator iterator; - typedef SmallInstructionVector::const_iterator const_iterator; + using iterator = SmallInstructionVector::iterator; + using const_iterator = SmallInstructionVector::const_iterator; iterator begin() { assert(Valid && "Using invalid reduction"); @@ -221,7 +250,7 @@ namespace { const_iterator end() const { return Instructions.end(); } protected: - bool Valid; + bool Valid = false; SmallInstructionVector Instructions; void add(Loop *L); @@ -230,7 +259,7 @@ namespace { // The set of all reductions, and state tracking of possible reductions // during loop instruction processing. struct ReductionTracker { - typedef SmallVector<SimpleLoopReduction, 16> SmallReductionVector; + using SmallReductionVector = SmallVector<SimpleLoopReduction, 16>; // Add a new possible reduction. void addSLR(SimpleLoopReduction &SLR) { PossibleReds.push_back(SLR); } @@ -342,6 +371,7 @@ namespace { struct DAGRootSet { Instruction *BaseInst; SmallInstructionVector Roots; + // The instructions between IV and BaseInst (but not including BaseInst). SmallInstructionSet SubsumedInsts; }; @@ -361,15 +391,17 @@ namespace { /// Stage 1: Find all the DAG roots for the induction variable. bool findRoots(); + /// Stage 2: Validate if the found roots are valid. bool validate(ReductionTracker &Reductions); + /// Stage 3: Assuming validate() returned true, perform the /// replacement. /// @param IterCount The maximum iteration count of L. void replace(const SCEV *IterCount); protected: - typedef MapVector<Instruction*, BitVector> UsesTy; + using UsesTy = MapVector<Instruction *, BitVector>; void findRootsRecursive(Instruction *IVU, SmallInstructionSet SubsumedInsts); @@ -412,22 +444,29 @@ namespace { // The loop induction variable. Instruction *IV; + // Loop step amount. 
int64_t Inc; + // Loop reroll count; if Inc == 1, this records the scaling applied // to the indvar: a[i*2+0] = ...; a[i*2+1] = ... ; // If Inc is not 1, Scale = Inc. uint64_t Scale; + // The roots themselves. SmallVector<DAGRootSet,16> RootSets; + // All increment instructions for IV. SmallInstructionVector LoopIncs; + // Map of all instructions in the loop (in order) to the iterations // they are used in (or specially, IL_All for instructions // used in the loop increment mechanism). UsesTy Uses; + // Map between induction variable and its increment DenseMap<Instruction *, int64_t> &IVToIncMap; + Instruction *LoopControlIV; }; @@ -446,9 +485,11 @@ namespace { bool reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount, ReductionTracker &Reductions); }; -} + +} // end anonymous namespace char LoopReroll::ID = 0; + INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false) INITIALIZE_PASS_DEPENDENCY(LoopPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) @@ -1069,7 +1110,6 @@ bool LoopReroll::DAGRootTracker::collectUsedInstructions(SmallInstructionSet &Po } return true; - } /// Get the next instruction in "In" that is a member of set Val. @@ -1124,7 +1164,7 @@ static bool isIgnorableInst(const Instruction *I) { switch (II->getIntrinsicID()) { default: return false; - case llvm::Intrinsic::annotation: + case Intrinsic::annotation: case Intrinsic::ptr_annotation: case Intrinsic::var_annotation: // TODO: the following intrinsics may also be whitelisted: @@ -1407,8 +1447,8 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) { BaseIt = nextInstr(0, Uses, Visited); RootIt = nextInstr(Iter, Uses, Visited); } - assert (BaseIt == Uses.end() && RootIt == Uses.end() && - "Mismatched set sizes!"); + assert(BaseIt == Uses.end() && RootIt == Uses.end() && + "Mismatched set sizes!"); } DEBUG(dbgs() << "LRR: Matched all iteration increments for " << diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp index 3506ac343d59..a91f53ba663f 100644 --- a/lib/Transforms/Scalar/LoopRotation.cpp +++ b/lib/Transforms/Scalar/LoopRotation.cpp @@ -25,6 +25,7 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IntrinsicInst.h" @@ -141,37 +142,29 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, // Replace MetadataAsValue(ValueAsMetadata(OrigHeaderVal)) uses in debug // intrinsics. - LLVMContext &C = OrigHeader->getContext(); - if (auto *VAM = ValueAsMetadata::getIfExists(OrigHeaderVal)) { - if (auto *MAV = MetadataAsValue::getIfExists(C, VAM)) { - for (auto UI = MAV->use_begin(), E = MAV->use_end(); UI != E;) { - // Grab the use before incrementing the iterator. Otherwise, altering - // the Use will invalidate the iterator. - Use &U = *UI++; - DbgInfoIntrinsic *UserInst = dyn_cast<DbgInfoIntrinsic>(U.getUser()); - if (!UserInst) - continue; - - // The original users in the OrigHeader are already using the original - // definitions. - BasicBlock *UserBB = UserInst->getParent(); - if (UserBB == OrigHeader) - continue; + SmallVector<DbgValueInst *, 1> DbgValues; + llvm::findDbgValues(DbgValues, OrigHeaderVal); + for (auto &DbgValue : DbgValues) { + // The original users in the OrigHeader are already using the original + // definitions. 
+ BasicBlock *UserBB = DbgValue->getParent(); + if (UserBB == OrigHeader) + continue; - // Users in the OrigPreHeader need to use the value to which the - // original definitions are mapped and anything else can be handled by - // the SSAUpdater. To avoid adding PHINodes, check if the value is - // available in UserBB, if not substitute undef. - Value *NewVal; - if (UserBB == OrigPreheader) - NewVal = OrigPreHeaderVal; - else if (SSA.HasValueForBlock(UserBB)) - NewVal = SSA.GetValueInMiddleOfBlock(UserBB); - else - NewVal = UndefValue::get(OrigHeaderVal->getType()); - U = MetadataAsValue::get(C, ValueAsMetadata::get(NewVal)); - } - } + // Users in the OrigPreHeader need to use the value to which the + // original definitions are mapped and anything else can be handled by + // the SSAUpdater. To avoid adding PHINodes, check if the value is + // available in UserBB, if not substitute undef. + Value *NewVal; + if (UserBB == OrigPreheader) + NewVal = OrigPreHeaderVal; + else if (SSA.HasValueForBlock(UserBB)) + NewVal = SSA.GetValueInMiddleOfBlock(UserBB); + else + NewVal = UndefValue::get(OrigHeaderVal->getType()); + DbgValue->setOperand(0, + MetadataAsValue::get(OrigHeaderVal->getContext(), + ValueAsMetadata::get(NewVal))); } } } @@ -315,6 +308,22 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // For the rest of the instructions, either hoist to the OrigPreheader if // possible or create a clone in the OldPreHeader if not. TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator(); + + // Record all debug intrinsics preceding LoopEntryBranch to avoid duplication. + using DbgIntrinsicHash = + std::pair<std::pair<Value *, DILocalVariable *>, DIExpression *>; + auto makeHash = [](DbgInfoIntrinsic *D) -> DbgIntrinsicHash { + return {{D->getVariableLocation(), D->getVariable()}, D->getExpression()}; + }; + SmallDenseSet<DbgIntrinsicHash, 8> DbgIntrinsics; + for (auto I = std::next(OrigPreheader->rbegin()), E = OrigPreheader->rend(); + I != E; ++I) { + if (auto *DII = dyn_cast<DbgInfoIntrinsic>(&*I)) + DbgIntrinsics.insert(makeHash(DII)); + else + break; + } + while (I != E) { Instruction *Inst = &*I++; @@ -338,6 +347,13 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { RemapInstruction(C, ValueMap, RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); + // Avoid inserting the same intrinsic twice. + if (auto *DII = dyn_cast<DbgInfoIntrinsic>(C)) + if (DbgIntrinsics.count(makeHash(DII))) { + C->deleteValue(); + continue; + } + // With the operands remapped, see if the instruction constant folds or is // otherwise simplifyable. This commonly occurs because the entry from PHI // nodes allows icmps and other instructions to fold. @@ -395,6 +411,17 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { L->moveToHeader(NewHeader); assert(L->getHeader() == NewHeader && "Latch block is our new header"); + // Inform DT about changes to the CFG. + if (DT) { + // The OrigPreheader branches to the NewHeader and Exit now. Then, inform + // the DT about the removed edge to the OrigHeader (that got removed). + SmallVector<DominatorTree::UpdateType, 3> Updates; + Updates.push_back({DominatorTree::Insert, OrigPreheader, Exit}); + Updates.push_back({DominatorTree::Insert, OrigPreheader, NewHeader}); + Updates.push_back({DominatorTree::Delete, OrigPreheader, OrigHeader}); + DT->applyUpdates(Updates); + } + // At this point, we've finished our major CFG changes. 
As part of cloning // the loop into the preheader we've simplified instructions and the // duplicated conditional branch may now be branching on a constant. If it is @@ -408,26 +435,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) != NewHeader) { // The conditional branch can't be folded, handle the general case. - // Update DominatorTree to reflect the CFG change we just made. Then split - // edges as necessary to preserve LoopSimplify form. - if (DT) { - // Everything that was dominated by the old loop header is now dominated - // by the original loop preheader. Conceptually the header was merged - // into the preheader, even though we reuse the actual block as a new - // loop latch. - DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader); - SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(), - OrigHeaderNode->end()); - DomTreeNode *OrigPreheaderNode = DT->getNode(OrigPreheader); - for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I) - DT->changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode); - - assert(DT->getNode(Exit)->getIDom() == OrigPreheaderNode); - assert(DT->getNode(NewHeader)->getIDom() == OrigPreheaderNode); - - // Update OrigHeader to be dominated by the new header block. - DT->changeImmediateDominator(OrigHeader, OrigLatch); - } + // Split edges as necessary to preserve LoopSimplify form. // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and // thus is not a preheader anymore. @@ -467,52 +475,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { PHBI->eraseFromParent(); // With our CFG finalized, update DomTree if it is available. - if (DT) { - // Update OrigHeader to be dominated by the new header block. - DT->changeImmediateDominator(NewHeader, OrigPreheader); - DT->changeImmediateDominator(OrigHeader, OrigLatch); - - // Brute force incremental dominator tree update. Call - // findNearestCommonDominator on all CFG predecessors of each child of the - // original header. - DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader); - SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(), - OrigHeaderNode->end()); - bool Changed; - do { - Changed = false; - for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I) { - DomTreeNode *Node = HeaderChildren[I]; - BasicBlock *BB = Node->getBlock(); - - BasicBlock *NearestDom = nullptr; - for (BasicBlock *Pred : predecessors(BB)) { - // Consider only reachable basic blocks. - if (!DT->getNode(Pred)) - continue; - - if (!NearestDom) { - NearestDom = Pred; - continue; - } - - NearestDom = DT->findNearestCommonDominator(NearestDom, Pred); - assert(NearestDom && "No NearestCommonDominator found"); - } - - assert(NearestDom && "Nearest dominator not found"); - - // Remember if this changes the DomTree. - if (Node->getIDom()->getBlock() != NearestDom) { - DT->changeImmediateDominator(BB, NearestDom); - Changed = true; - } - } - - // If the dominator changed, this may have an effect on other - // predecessors, continue until we reach a fixpoint. 
- } while (Changed); - } + if (DT) DT->deleteEdge(OrigPreheader, Exit); } assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation"); @@ -671,7 +634,7 @@ bool LoopRotate::processLoop(Loop *L) { if ((MadeChange || SimplifiedLatch) && LoopMD) L->setLoopID(LoopMD); - return MadeChange; + return MadeChange || SimplifiedLatch; } LoopRotatePass::LoopRotatePass(bool EnableHeaderDuplication) diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 3638da118cb7..953854c8b7b7 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -65,7 +65,9 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/IVUsers.h" +#include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" @@ -80,13 +82,18 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/IR/OperandTraits.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" @@ -98,7 +105,6 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include <algorithm> @@ -107,8 +113,8 @@ #include <cstdint> #include <cstdlib> #include <iterator> +#include <limits> #include <map> -#include <tuple> #include <utility> using namespace llvm; @@ -131,7 +137,7 @@ static cl::opt<bool> EnablePhiElim( // The flag adds instruction count to solutions cost comparision. static cl::opt<bool> InsnsCost( - "lsr-insns-cost", cl::Hidden, cl::init(false), + "lsr-insns-cost", cl::Hidden, cl::init(true), cl::desc("Add instruction count to a LSR cost model")); // Flag to choose how to narrow complex lsr solution @@ -160,15 +166,14 @@ namespace { struct MemAccessTy { /// Used in situations where the accessed memory type is unknown. - static const unsigned UnknownAddressSpace = ~0u; + static const unsigned UnknownAddressSpace = + std::numeric_limits<unsigned>::max(); - Type *MemTy; - unsigned AddrSpace; + Type *MemTy = nullptr; + unsigned AddrSpace = UnknownAddressSpace; - MemAccessTy() : MemTy(nullptr), AddrSpace(UnknownAddressSpace) {} - - MemAccessTy(Type *Ty, unsigned AS) : - MemTy(Ty), AddrSpace(AS) {} + MemAccessTy() = default; + MemAccessTy(Type *Ty, unsigned AS) : MemTy(Ty), AddrSpace(AS) {} bool operator==(MemAccessTy Other) const { return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace; @@ -195,11 +200,11 @@ public: } // end anonymous namespace +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void RegSortData::print(raw_ostream &OS) const { OS << "[NumUses=" << UsedByIndices.count() << ']'; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void RegSortData::dump() const { print(errs()); errs() << '\n'; } @@ -209,7 +214,7 @@ namespace { /// Map register candidates to information about how they are used. 
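A note on the LoopRotation hunks above: the brute-force fixpoint walk over the old header's dominator children is replaced by the incremental DominatorTree update API, and the whole pattern reduces to a handful of lines. A condensed sketch (assuming an in-tree build; the APIs are exactly those used in the patch, while `rewireForRotation` is an illustrative wrapper, not a function in the tree):

```cpp
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"
using namespace llvm;

// After rewiring OrigPreheader -> {NewHeader, Exit} and removing its old
// edge to OrigHeader, tell the DominatorTree what changed instead of
// recomputing immediate dominators by hand.
static void rewireForRotation(DominatorTree &DT, BasicBlock *OrigPreheader,
                              BasicBlock *OrigHeader, BasicBlock *NewHeader,
                              BasicBlock *Exit) {
  SmallVector<DominatorTree::UpdateType, 3> Updates;
  Updates.push_back({DominatorTree::Insert, OrigPreheader, Exit});
  Updates.push_back({DominatorTree::Insert, OrigPreheader, NewHeader});
  Updates.push_back({DominatorTree::Delete, OrigPreheader, OrigHeader});
  DT.applyUpdates(Updates); // batched, order-insensitive incremental update
}
```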
class RegUseTracker { - typedef DenseMap<const SCEV *, RegSortData> RegUsesTy; + using RegUsesTy = DenseMap<const SCEV *, RegSortData>; RegUsesTy RegUsesMap; SmallVector<const SCEV *, 16> RegSequence; @@ -225,8 +230,9 @@ public: void clear(); - typedef SmallVectorImpl<const SCEV *>::iterator iterator; - typedef SmallVectorImpl<const SCEV *>::const_iterator const_iterator; + using iterator = SmallVectorImpl<const SCEV *>::iterator; + using const_iterator = SmallVectorImpl<const SCEV *>::const_iterator; + iterator begin() { return RegSequence.begin(); } iterator end() { return RegSequence.end(); } const_iterator begin() const { return RegSequence.begin(); } @@ -299,16 +305,16 @@ namespace { /// satisfying a use. It may include broken-out immediates and scaled registers. struct Formula { /// Global base address used for complex addressing. - GlobalValue *BaseGV; + GlobalValue *BaseGV = nullptr; /// Base offset for complex addressing. - int64_t BaseOffset; + int64_t BaseOffset = 0; /// Whether any complex addressing has a base register. - bool HasBaseReg; + bool HasBaseReg = false; /// The scale of any complex addressing. - int64_t Scale; + int64_t Scale = 0; /// The list of "base" registers for this use. When this is non-empty. The /// canonical representation of a formula is @@ -328,16 +334,14 @@ struct Formula { /// The 'scaled' register for this use. This should be non-null when Scale is /// not zero. - const SCEV *ScaledReg; + const SCEV *ScaledReg = nullptr; /// An additional constant offset which added near the use. This requires a /// temporary register, but the offset itself can live in an add immediate /// field rather than a register. - int64_t UnfoldedOffset; + int64_t UnfoldedOffset = 0; - Formula() - : BaseGV(nullptr), BaseOffset(0), HasBaseReg(false), Scale(0), - ScaledReg(nullptr), UnfoldedOffset(0) {} + Formula() = default; void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE); @@ -562,6 +566,7 @@ bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx, return false; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void Formula::print(raw_ostream &OS) const { bool First = true; if (BaseGV) { @@ -598,7 +603,6 @@ void Formula::print(raw_ostream &OS) const { } } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void Formula::dump() const { print(errs()); errs() << '\n'; } @@ -773,7 +777,8 @@ static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) { /// Returns true if the specified instruction is using the specified value as an /// address. -static bool isAddressUse(Instruction *Inst, Value *OperandVal) { +static bool isAddressUse(const TargetTransformInfo &TTI, + Instruction *Inst, Value *OperandVal) { bool isAddress = isa<LoadInst>(Inst); if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { if (SI->getPointerOperand() == OperandVal) @@ -782,11 +787,24 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) { // Addressing modes can also be folded into prefetches and a variety // of intrinsics. 
switch (II->getIntrinsicID()) { - default: break; - case Intrinsic::prefetch: - if (II->getArgOperand(0) == OperandVal) + case Intrinsic::memset: + case Intrinsic::prefetch: + if (II->getArgOperand(0) == OperandVal) + isAddress = true; + break; + case Intrinsic::memmove: + case Intrinsic::memcpy: + if (II->getArgOperand(0) == OperandVal || + II->getArgOperand(1) == OperandVal) + isAddress = true; + break; + default: { + MemIntrinsicInfo IntrInfo; + if (TTI.getTgtMemIntrinsic(II, IntrInfo)) { + if (IntrInfo.PtrVal == OperandVal) isAddress = true; - break; + } + } } } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) { if (RMW->getPointerOperand() == OperandVal) @@ -799,7 +817,8 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) { } /// Return the type of the memory being accessed. -static MemAccessTy getAccessType(const Instruction *Inst) { +static MemAccessTy getAccessType(const TargetTransformInfo &TTI, + Instruction *Inst) { MemAccessTy AccessTy(Inst->getType(), MemAccessTy::UnknownAddressSpace); if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) { AccessTy.MemTy = SI->getOperand(0)->getType(); @@ -810,6 +829,21 @@ static MemAccessTy getAccessType(const Instruction *Inst) { AccessTy.AddrSpace = RMW->getPointerAddressSpace(); } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) { AccessTy.AddrSpace = CmpX->getPointerAddressSpace(); + } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { + switch (II->getIntrinsicID()) { + case Intrinsic::prefetch: + AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace(); + break; + default: { + MemIntrinsicInfo IntrInfo; + if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) { + AccessTy.AddrSpace + = IntrInfo.PtrVal->getType()->getPointerAddressSpace(); + } + + break; + } + } } // All pointers have the same requirements, so canonicalize them to an @@ -948,6 +982,7 @@ class LSRUse; /// accurate cost model. static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F); + // Get the cost of the scaling factor used in F for LU. static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F, @@ -1013,30 +1048,30 @@ private: ScalarEvolution &SE, DominatorTree &DT, SmallPtrSetImpl<const SCEV *> *LoserRegs); }; - + /// An operand value in an instruction which is to be replaced with some /// equivalent, possibly strength-reduced, replacement. struct LSRFixup { /// The instruction which will be updated. - Instruction *UserInst; + Instruction *UserInst = nullptr; /// The operand of the instruction which will be replaced. The operand may be /// used more than once; every instance will be replaced. - Value *OperandValToReplace; + Value *OperandValToReplace = nullptr; /// If this user is to use the post-incremented value of an induction - /// variable, this variable is non-null and holds the loop associated with the + /// variable, this set is non-empty and holds the loops associated with the /// induction variable. PostIncLoopSet PostIncLoops; /// A constant offset to be added to the LSRUse expression. This allows /// multiple fixups to share the same LSRUse with different offsets, for /// example in an unrolled loop. 
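Condensing the isAddressUse change above: the generic memory intrinsics are special-cased, and everything else now defers to the target through getTgtMemIntrinsic, so target intrinsics with a pointer operand become addressing-mode candidates too. A sketch (APIs as used in the patch; the free function is an illustrative repackaging, not code from the tree):

```cpp
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
using namespace llvm;

// Does intrinsic II use V as a memory address? Generic intrinsics are
// special-cased; anything else is answered by the target via
// getTgtMemIntrinsic, exactly as the patch does in isAddressUse.
static bool intrinsicUsesAsAddress(const TargetTransformInfo &TTI,
                                   IntrinsicInst *II, Value *V) {
  switch (II->getIntrinsicID()) {
  case Intrinsic::memset:
  case Intrinsic::prefetch:
    return II->getArgOperand(0) == V;
  case Intrinsic::memmove:
  case Intrinsic::memcpy:
    return II->getArgOperand(0) == V || II->getArgOperand(1) == V;
  default: {
    MemIntrinsicInfo IntrInfo;
    return TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal == V;
  }
  }
}
```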
- int64_t Offset; + int64_t Offset = 0; - bool isUseFullyOutsideLoop(const Loop *L) const; + LSRFixup() = default; - LSRFixup(); + bool isUseFullyOutsideLoop(const Loop *L) const; void print(raw_ostream &OS) const; void dump() const; @@ -1086,7 +1121,7 @@ public: // TODO: Add a generic icmp too? }; - typedef PointerIntPair<const SCEV *, 2, KindType> SCEVUseKindPair; + using SCEVUseKindPair = PointerIntPair<const SCEV *, 2, KindType>; KindType Kind; MemAccessTy AccessTy; @@ -1095,25 +1130,25 @@ public: SmallVector<LSRFixup, 8> Fixups; /// Keep track of the min and max offsets of the fixups. - int64_t MinOffset; - int64_t MaxOffset; + int64_t MinOffset = std::numeric_limits<int64_t>::max(); + int64_t MaxOffset = std::numeric_limits<int64_t>::min(); /// This records whether all of the fixups using this LSRUse are outside of /// the loop, in which case some special-case heuristics may be used. - bool AllFixupsOutsideLoop; + bool AllFixupsOutsideLoop = true; /// RigidFormula is set to true to guarantee that this use will be associated /// with a single formula--the one that initially matched. Some SCEV /// expressions cannot be expanded. This allows LSR to consider the registers /// used by those expressions without the need to expand them later after /// changing the formula. - bool RigidFormula; + bool RigidFormula = false; /// This records the widest use type for any fixup using this /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max /// fixup widths to be equivalent, because the narrower one may be relying on /// the implicit truncation to truncate away bogus bits. - Type *WidestFixupType; + Type *WidestFixupType = nullptr; /// A list of ways to build a value that can satisfy this user. After the /// list is populated, one of these is selected heuristically and used to @@ -1123,10 +1158,7 @@ public: /// The set of register candidates used by all formulae in this LSRUse. SmallPtrSet<const SCEV *, 4> Regs; - LSRUse(KindType K, MemAccessTy AT) - : Kind(K), AccessTy(AT), MinOffset(INT64_MAX), MaxOffset(INT64_MIN), - AllFixupsOutsideLoop(true), RigidFormula(false), - WidestFixupType(nullptr) {} + LSRUse(KindType K, MemAccessTy AT) : Kind(K), AccessTy(AT) {} LSRFixup &getNewFixup() { Fixups.push_back(LSRFixup()); @@ -1140,7 +1172,7 @@ public: if (f.Offset < MinOffset) MinOffset = f.Offset; } - + bool HasFormulaWithSameRegs(const Formula &F) const; float getNotSelectedProbability(const SCEV *Reg) const; bool InsertFormula(const Formula &F, const Loop &L); @@ -1153,6 +1185,12 @@ public: } // end anonymous namespace +static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, + LSRUse::KindType Kind, MemAccessTy AccessTy, + GlobalValue *BaseGV, int64_t BaseOffset, + bool HasBaseReg, int64_t Scale, + Instruction *Fixup = nullptr); + /// Tally up interesting quantities from the given register. void Cost::RateRegister(const SCEV *Reg, SmallPtrSetImpl<const SCEV *> &Regs, @@ -1280,8 +1318,9 @@ void Cost::RateFormula(const TargetTransformInfo &TTI, // Check with target if this offset with this instruction is // specifically not supported. - if ((isa<LoadInst>(Fixup.UserInst) || isa<StoreInst>(Fixup.UserInst)) && - !TTI.isFoldableMemAccessOffset(Fixup.UserInst, Offset)) + if (LU.Kind == LSRUse::Address && Offset != 0 && + !isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV, + Offset, F.HasBaseReg, F.Scale, Fixup.UserInst)) C.NumBaseAdds++; } @@ -1325,14 +1364,14 @@ void Cost::RateFormula(const TargetTransformInfo &TTI, /// Set this cost to a losing value. 
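The new in-class initializers for MinOffset/MaxOffset in the hunk above encode a small idiom: start the running bounds at the opposite extremes so the first fixup establishes both. The idiom in isolation (plain C++; the struct is illustrative, mirroring LSRUse::addFixup):

```cpp
#include <cstdint>
#include <cstdio>
#include <limits>

// Start the bounds at the opposite extremes so the first observed
// offset becomes both minimum and maximum.
struct OffsetRange {
  int64_t MinOffset = std::numeric_limits<int64_t>::max();
  int64_t MaxOffset = std::numeric_limits<int64_t>::min();

  void addOffset(int64_t O) {
    if (O > MaxOffset) MaxOffset = O;
    if (O < MinOffset) MinOffset = O;
  }
};

int main() {
  OffsetRange R;
  for (int64_t O : {16, -8, 4})
    R.addOffset(O);
  std::printf("[%lld, %lld]\n", (long long)R.MinOffset,
              (long long)R.MaxOffset); // [-8, 16]
}
```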
void Cost::Lose() { - C.Insns = ~0u; - C.NumRegs = ~0u; - C.AddRecCost = ~0u; - C.NumIVMuls = ~0u; - C.NumBaseAdds = ~0u; - C.ImmCost = ~0u; - C.SetupCost = ~0u; - C.ScaleCost = ~0u; + C.Insns = std::numeric_limits<unsigned>::max(); + C.NumRegs = std::numeric_limits<unsigned>::max(); + C.AddRecCost = std::numeric_limits<unsigned>::max(); + C.NumIVMuls = std::numeric_limits<unsigned>::max(); + C.NumBaseAdds = std::numeric_limits<unsigned>::max(); + C.ImmCost = std::numeric_limits<unsigned>::max(); + C.SetupCost = std::numeric_limits<unsigned>::max(); + C.ScaleCost = std::numeric_limits<unsigned>::max(); } /// Choose the lower cost. @@ -1343,6 +1382,7 @@ bool Cost::isLess(Cost &Other, const TargetTransformInfo &TTI) { return TTI.isLSRCostLess(C, Other.C); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void Cost::print(raw_ostream &OS) const { if (InsnsCost) OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s "); @@ -1363,16 +1403,11 @@ void Cost::print(raw_ostream &OS) const { OS << ", plus " << C.SetupCost << " setup cost"; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void Cost::dump() const { print(errs()); errs() << '\n'; } #endif -LSRFixup::LSRFixup() - : UserInst(nullptr), OperandValToReplace(nullptr), - Offset(0) {} - /// Test whether this fixup always uses its value outside of the given loop. bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const { // PHI nodes use their value in their incoming blocks. @@ -1387,6 +1422,7 @@ bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const { return !L->contains(UserInst); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void LSRFixup::print(raw_ostream &OS) const { OS << "UserInst="; // Store is common and interesting enough to be worth special-casing. @@ -1410,7 +1446,6 @@ void LSRFixup::print(raw_ostream &OS) const { OS << ", Offset=" << Offset; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void LSRFixup::dump() const { print(errs()); errs() << '\n'; } @@ -1493,6 +1528,7 @@ void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) { RegUses.dropRegister(S, LUIdx); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void LSRUse::print(raw_ostream &OS) const { OS << "LSR Use: Kind="; switch (Kind) { @@ -1526,7 +1562,6 @@ void LSRUse::print(raw_ostream &OS) const { OS << ", widest fixup type: " << *WidestFixupType; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void LSRUse::dump() const { print(errs()); errs() << '\n'; } @@ -1535,11 +1570,12 @@ LLVM_DUMP_METHOD void LSRUse::dump() const { static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, int64_t BaseOffset, - bool HasBaseReg, int64_t Scale) { + bool HasBaseReg, int64_t Scale, + Instruction *Fixup/*= nullptr*/) { switch (Kind) { case LSRUse::Address: return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, BaseOffset, - HasBaseReg, Scale, AccessTy.AddrSpace); + HasBaseReg, Scale, AccessTy.AddrSpace, Fixup); case LSRUse::ICmpZero: // There's not even a target hook for querying whether it would be legal to @@ -1564,7 +1600,8 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset // Offs is the ICmp immediate. if (Scale == 0) - // The cast does the right thing with INT64_MIN. + // The cast does the right thing with + // std::numeric_limits<int64_t>::min(). 
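The reworded comment above is worth unpacking: negating std::numeric_limits<int64_t>::min() directly is signed-overflow undefined behavior, while the detour through uint64_t is well-defined modulo 2^64 and maps the value back to itself. A tiny standalone check (not part of the patch):

```cpp
#include <cstdint>
#include <cstdio>
#include <limits>

int main() {
  int64_t BaseOffset = std::numeric_limits<int64_t>::min();
  // -BaseOffset would be UB; the cast makes the negation well-defined
  // modulo 2^64, and INT64_MIN maps back to itself.
  BaseOffset = -(uint64_t)BaseOffset;
  std::printf("%lld\n", (long long)BaseOffset); // -9223372036854775808
}
```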
BaseOffset = -(uint64_t)BaseOffset; return TTI.isLegalICmpImmediate(BaseOffset); } @@ -1645,6 +1682,16 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F) { + // Target may want to look at the user instructions. + if (LU.Kind == LSRUse::Address && TTI.LSRWithInstrQueries()) { + for (const LSRFixup &Fixup : LU.Fixups) + if (!isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV, + (F.BaseOffset + Fixup.Offset), F.HasBaseReg, + F.Scale, Fixup.UserInst)) + return false; + return true; + } + return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale); @@ -1752,22 +1799,21 @@ struct IVInc { Value* IVOperand; const SCEV *IncExpr; - IVInc(Instruction *U, Value *O, const SCEV *E): - UserInst(U), IVOperand(O), IncExpr(E) {} + IVInc(Instruction *U, Value *O, const SCEV *E) + : UserInst(U), IVOperand(O), IncExpr(E) {} }; // The list of IV increments in program order. We typically add the head of a // chain without finding subsequent links. struct IVChain { - SmallVector<IVInc,1> Incs; - const SCEV *ExprBase; - - IVChain() : ExprBase(nullptr) {} + SmallVector<IVInc, 1> Incs; + const SCEV *ExprBase = nullptr; + IVChain() = default; IVChain(const IVInc &Head, const SCEV *Base) - : Incs(1, Head), ExprBase(Base) {} + : Incs(1, Head), ExprBase(Base) {} - typedef SmallVectorImpl<IVInc>::const_iterator const_iterator; + using const_iterator = SmallVectorImpl<IVInc>::const_iterator; // Return the first increment in the chain. const_iterator begin() const { @@ -1809,13 +1855,13 @@ class LSRInstance { LoopInfo &LI; const TargetTransformInfo &TTI; Loop *const L; - bool Changed; + bool Changed = false; /// This is the insert position that the current loop's induction variable /// increment should be placed. In simple loops, this is the latch block's /// terminator. But in more complicated cases, this is a position which will /// dominate all the in-loop post-increment users. - Instruction *IVIncInsertPos; + Instruction *IVIncInsertPos = nullptr; /// Interesting factors between use strides. /// @@ -1861,7 +1907,7 @@ class LSRInstance { void CollectFixupsAndInitialFormulae(); // Support for sharing of LSRUses between LSRFixups. - typedef DenseMap<LSRUse::SCEVUseKindPair, size_t> UseMapTy; + using UseMapTy = DenseMap<LSRUse::SCEVUseKindPair, size_t>; UseMapTy UseMap; bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, @@ -2002,6 +2048,14 @@ void LSRInstance::OptimizeShadowIV() { if (!PH) continue; if (PH->getNumIncomingValues() != 2) continue; + // If the calculation in integers overflows, the result in FP type will + // differ. So we only can do this transformation if we are guaranteed to not + // deal with overflowing values + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(PH)); + if (!AR) continue; + if (IsSigned && !AR->hasNoSignedWrap()) continue; + if (!IsSigned && !AR->hasNoUnsignedWrap()) continue; + Type *SrcTy = PH->getType(); int Mantissa = DestTy->getFPMantissaWidth(); if (Mantissa == -1) continue; @@ -2094,7 +2148,7 @@ bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) { /// unfortunately this can come up even for loops where the user didn't use /// a C do-while loop. 
For example, seemingly well-behaved top-test loops /// will commonly be lowered like this: -// +/// /// if (n > 0) { /// i = 0; /// do { @@ -2128,7 +2182,6 @@ bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) { /// This function solves this problem by detecting this type of loop and /// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting /// the instructions for the maximum computation. -/// ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) { // Check that the loop matches the pattern we're looking for. if (Cond->getPredicate() != CmpInst::ICMP_EQ && @@ -2268,7 +2321,6 @@ LSRInstance::OptimizeLoopTermCond() { // Otherwise treat this as a rotated loop. for (BasicBlock *ExitingBlock : ExitingBlocks) { - // Get the terminating condition for the loop if possible. If we // can, we want to change it to use a post-incremented version of its // induction variable, to allow coalescing the live ranges for the IV into @@ -2333,7 +2385,7 @@ LSRInstance::OptimizeLoopTermCond() { C->getValue().isMinSignedValue()) goto decline_post_inc; // Check for possible scaled-address reuse. - MemAccessTy AccessTy = getAccessType(UI->getUser()); + MemAccessTy AccessTy = getAccessType(TTI, UI->getUser()); int64_t Scale = C->getSExtValue(); if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr, /*BaseOffset=*/0, @@ -2941,7 +2993,7 @@ void LSRInstance::CollectChains() { // consider leaf IV Users. This effectively rediscovers a portion of // IVUsers analysis but in program order this time. if (SE.isSCEVable(I.getType()) && !isa<SCEVUnknown>(SE.getSCEV(&I))) - continue; + continue; // Remove this instruction from any NearUsers set it may be in. for (unsigned ChainIdx = 0, NChains = IVChainVec.size(); @@ -3003,13 +3055,13 @@ void LSRInstance::FinalizeChain(IVChain &Chain) { static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, Value *Operand, const TargetTransformInfo &TTI) { const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr); - if (!IncConst || !isAddressUse(UserInst, Operand)) + if (!IncConst || !isAddressUse(TTI, UserInst, Operand)) return false; if (IncConst->getAPInt().getMinSignedBits() > 64) return false; - MemAccessTy AccessTy = getAccessType(UserInst); + MemAccessTy AccessTy = getAccessType(TTI, UserInst); int64_t IncOffset = IncConst->getValue()->getSExtValue(); if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr, IncOffset, /*HaseBaseReg=*/false)) @@ -3136,14 +3188,14 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { LSRUse::KindType Kind = LSRUse::Basic; MemAccessTy AccessTy; - if (isAddressUse(UserInst, U.getOperandValToReplace())) { + if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) { Kind = LSRUse::Address; - AccessTy = getAccessType(UserInst); + AccessTy = getAccessType(TTI, UserInst); } const SCEV *S = IU.getExpr(U); PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops(); - + // Equality (== and !=) ICmps are special. We can rewrite (i == N) as // (N - i == 0), and this allows (N - i) to be the expression that we work // with rather than just N or i, so we can consider the register @@ -3432,7 +3484,6 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx, for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(), JE = AddOps.end(); J != JE; ++J) { - // Loop-variant "unknown" values are uninteresting; we won't be able to // do anything meaningful with them. 
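As a reading aid for the OptimizeMax description above, here is the loop shape in question sketched as plain C++ (the helper body() is hypothetical, not from the patch):

void body(int);

// Guarded do-while as the midend sees it: the backedge test compares
// against bound = smax(n, 1), because the do-while body runs at least
// once.
void lowered(int n) {
  if (n > 0) {
    int bound = n > 1 ? n : 1; // smax(n, 1)
    int i = 0;
    do { body(i); ++i; } while (i != bound);
  }
}

// What OptimizeMax rewrites it into: the ICMP_NE test against the max
// becomes ICMP_SLT against n, and the max computation goes dead.
void rewritten(int n) {
  if (n > 0) {
    int i = 0;
    do { body(i); ++i; } while (i < n);
  }
}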
if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L)) @@ -3654,12 +3705,18 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, // Don't do this if there is more than one offset. if (LU.MinOffset != LU.MaxOffset) return; + // Check if transformation is valid. It is illegal to multiply pointer. + if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy()) + return; + for (const SCEV *BaseReg : Base.BaseRegs) + if (BaseReg->getType()->isPointerTy()) + return; assert(!Base.BaseGV && "ICmpZero use is not legal!"); // Check each interesting stride. for (int64_t Factor : Factors) { // Check that the multiplication doesn't overflow. - if (Base.BaseOffset == INT64_MIN && Factor == -1) + if (Base.BaseOffset == std::numeric_limits<int64_t>::min() && Factor == -1) continue; int64_t NewBaseOffset = (uint64_t)Base.BaseOffset * Factor; if (NewBaseOffset / Factor != Base.BaseOffset) @@ -3671,7 +3728,7 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, // Check that multiplying with the use offset doesn't overflow. int64_t Offset = LU.MinOffset; - if (Offset == INT64_MIN && Factor == -1) + if (Offset == std::numeric_limits<int64_t>::min() && Factor == -1) continue; Offset = (uint64_t)Offset * Factor; if (Offset / Factor != LU.MinOffset) @@ -3709,7 +3766,8 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, // Check that multiplying with the unfolded offset doesn't overflow. if (F.UnfoldedOffset != 0) { - if (F.UnfoldedOffset == INT64_MIN && Factor == -1) + if (F.UnfoldedOffset == std::numeric_limits<int64_t>::min() && + Factor == -1) continue; F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor; if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset) @@ -3833,7 +3891,7 @@ struct WorkItem { const SCEV *OrigReg; WorkItem(size_t LI, int64_t I, const SCEV *R) - : LUIdx(LI), Imm(I), OrigReg(R) {} + : LUIdx(LI), Imm(I), OrigReg(R) {} void print(raw_ostream &OS) const; void dump() const; @@ -3841,12 +3899,12 @@ struct WorkItem { } // end anonymous namespace +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void WorkItem::print(raw_ostream &OS) const { OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx << " , add offset " << Imm; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void WorkItem::dump() const { print(errs()); errs() << '\n'; } @@ -3856,7 +3914,8 @@ LLVM_DUMP_METHOD void WorkItem::dump() const { /// opportunities between them. void LSRInstance::GenerateCrossUseConstantOffsets() { // Group the registers by their value without any added constant offset. - typedef std::map<int64_t, const SCEV *> ImmMapTy; + using ImmMapTy = std::map<int64_t, const SCEV *>; + DenseMap<const SCEV *, ImmMapTy> Map; DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap; SmallVector<const SCEV *, 8> Sequence; @@ -4060,8 +4119,9 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { // Collect the best formula for each unique set of shared registers. This // is reset for each use. - typedef DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo> - BestFormulaeTy; + using BestFormulaeTy = + DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo>; + BestFormulaeTy BestFormulae; for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { @@ -4148,7 +4208,7 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { } // This is a rough guess that seems to work fairly well. 
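The overflow screens in GenerateICmpZeroScales all follow one idiom: multiply in uint64_t, then check that division recovers the original operand. A compact sketch of that check, with the INT64_MIN * -1 special case handled first (the function name is ours):

#include <cstdint>
#include <limits>

// Multiply in uint64_t (well defined), then verify the product
// round-trips through division. INT64_MIN * -1 must be rejected up
// front, since that division would itself overflow. Returns true when
// Offset * Factor does not fit in int64_t.
bool mulOverflows(int64_t Offset, int64_t Factor, int64_t &Result) {
  if (Offset == std::numeric_limits<int64_t>::min() && Factor == -1)
    return true;
  Result = (uint64_t)Offset * (uint64_t)Factor;
  return Factor != 0 && Result / Factor != Offset;
}

GCC and Clang also offer __builtin_mul_overflow for the same test; the code here spells the check out portably instead.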
-static const size_t ComplexityLimit = UINT16_MAX; +static const size_t ComplexityLimit = std::numeric_limits<uint16_t>::max(); /// Estimate the worst-case number of solutions the solver might have to /// consider. It almost never considers this many solutions because it prune the @@ -4267,7 +4327,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { LUThatHas->pushFixup(Fixup); DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n'); } - + // Delete formulae from the new use which are no longer legal. bool Any = false; for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) { @@ -4332,7 +4392,8 @@ void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() { "from the Formulae with the same Scale and ScaledReg.\n"); // Map the "Scale * ScaledReg" pair to the best formula of current LSRUse. - typedef DenseMap<std::pair<const SCEV *, int64_t>, size_t> BestFormulaeTy; + using BestFormulaeTy = DenseMap<std::pair<const SCEV *, int64_t>, size_t>; + BestFormulaeTy BestFormulae; #ifndef NDEBUG bool ChangedFormulae = false; @@ -4454,7 +4515,6 @@ void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() { /// Use3: /// reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted /// reg(c) + reg({b,+,1}) 1 + 2/3 - void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() { if (EstimateSearchSpaceComplexity() < ComplexityLimit) return; @@ -4549,7 +4609,6 @@ void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() { print_uses(dbgs())); } - /// Pick a register which seems likely to be profitable, and then in any use /// which has any reference to that register, delete all formulae which do not /// reference that register. @@ -5196,8 +5255,7 @@ void LSRInstance::ImplementSolution( LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, const TargetTransformInfo &TTI) - : IU(IU), SE(SE), DT(DT), LI(LI), TTI(TTI), L(L), Changed(false), - IVIncInsertPos(nullptr) { + : IU(IU), SE(SE), DT(DT), LI(LI), TTI(TTI), L(L) { // If LoopSimplify form is not available, stay out of trouble. 
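FilterOutUndesirableDedicatedRegisters and the new NarrowSearchSpaceByFilterFormulaWithSameScaledReg both hinge on a map from a key to the best candidate seen so far; a generic sketch of that filtering pattern (stand-in types, not the LSR classes):

#include <cstddef>
#include <map>
#include <vector>

// Keep, for each key, only the best candidate; everything displaced is
// dropped. In LSR the key is e.g. the (ScaledReg, Scale) pair and the
// candidates are formulae compared by cost.
template <typename Key, typename Candidate, typename Better>
std::vector<Candidate> filterBest(const std::vector<Candidate> &Cands,
                                  Key (*keyOf)(const Candidate &),
                                  Better better) {
  std::map<Key, size_t> BestIdx;
  std::vector<Candidate> Kept;
  for (const Candidate &C : Cands) {
    auto [It, IsNew] = BestIdx.try_emplace(keyOf(C), Kept.size());
    if (IsNew)
      Kept.push_back(C);
    else if (better(C, Kept[It->second]))
      Kept[It->second] = C; // displace the previous best for this key
  }
  return Kept;
}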
if (!L->isLoopSimplifyForm()) return; @@ -5302,6 +5360,7 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, ImplementSolution(Solution); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void LSRInstance::print_factors_and_types(raw_ostream &OS) const { if (Factors.empty() && Types.empty()) return; @@ -5352,7 +5411,6 @@ void LSRInstance::print(raw_ostream &OS) const { print_uses(OS); } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void LSRInstance::dump() const { print(errs()); errs() << '\n'; } @@ -5448,6 +5506,7 @@ PreservedAnalyses LoopStrengthReducePass::run(Loop &L, LoopAnalysisManager &AM, } char LoopStrengthReduce::ID = 0; + INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce", "Loop Strength Reduction", false, false) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index 530a68424d5c..7b1d6446a24a 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -1,4 +1,4 @@ -//===-- LoopUnroll.cpp - Loop unroller pass -------------------------------===// +//===- LoopUnroll.cpp - Loop unroller pass --------------------------------===// // // The LLVM Compiler Infrastructure // @@ -13,29 +13,55 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/LoopUnrollPass.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" -#include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LoopAnalysisManager.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/LoopUnrollAnalyzer.h" -#include "llvm/Analysis/OptimizationDiagnosticInfo.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/IR/DataLayout.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" -#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Metadata.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" +#include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/UnrollLoop.h" -#include <climits> +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <limits> +#include <string> +#include <tuple> #include <utility> using namespace llvm; @@ -79,6 +105,10 @@ static cl::opt<unsigned> UnrollFullMaxCount( 
cl::desc( "Set the max unroll count for full unrolling, for testing purposes")); +static cl::opt<unsigned> UnrollPeelCount( + "unroll-peel-count", cl::Hidden, + cl::desc("Set the unroll peeling count, for testing purposes")); + static cl::opt<bool> UnrollAllowPartial("unroll-allow-partial", cl::Hidden, cl::desc("Allows loops to be partially unrolled until " @@ -114,6 +144,10 @@ static cl::opt<bool> cl::desc("Allows loops to be peeled when the dynamic " "trip count is known to be low.")); +static cl::opt<bool> UnrollUnrollRemainder( + "unroll-remainder", cl::Hidden, + cl::desc("Allow the loop remainder to be unrolled.")); + // This option isn't ever intended to be enabled, it serves to allow // experiments to check the assumptions about when this kind of revisit is // necessary. @@ -126,7 +160,7 @@ static cl::opt<bool> UnrollRevisitChildLoops( /// A magic value for use with the Threshold parameter to indicate /// that the loop unroll should be performed regardless of how much /// code expansion would result. -static const unsigned NoThreshold = UINT_MAX; +static const unsigned NoThreshold = std::numeric_limits<unsigned>::max(); /// Gather the various unrolling parameters based on the defaults, compiler /// flags, TTI overrides and user specified parameters. @@ -134,7 +168,7 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel, Optional<unsigned> UserThreshold, Optional<unsigned> UserCount, Optional<bool> UserAllowPartial, Optional<bool> UserRuntime, - Optional<bool> UserUpperBound) { + Optional<bool> UserUpperBound, Optional<bool> UserAllowPeeling) { TargetTransformInfo::UnrollingPreferences UP; // Set up the defaults @@ -146,12 +180,13 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( UP.Count = 0; UP.PeelCount = 0; UP.DefaultUnrollRuntimeCount = 8; - UP.MaxCount = UINT_MAX; - UP.FullUnrollMaxCount = UINT_MAX; + UP.MaxCount = std::numeric_limits<unsigned>::max(); + UP.FullUnrollMaxCount = std::numeric_limits<unsigned>::max(); UP.BEInsns = 2; UP.Partial = false; UP.Runtime = false; UP.AllowRemainder = true; + UP.UnrollRemainder = false; UP.AllowExpensiveTripCount = false; UP.Force = false; UP.UpperBound = false; @@ -177,6 +212,8 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( UP.MaxCount = UnrollMaxCount; if (UnrollFullMaxCount.getNumOccurrences() > 0) UP.FullUnrollMaxCount = UnrollFullMaxCount; + if (UnrollPeelCount.getNumOccurrences() > 0) + UP.PeelCount = UnrollPeelCount; if (UnrollAllowPartial.getNumOccurrences() > 0) UP.Partial = UnrollAllowPartial; if (UnrollAllowRemainder.getNumOccurrences() > 0) @@ -187,6 +224,8 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( UP.UpperBound = false; if (UnrollAllowPeeling.getNumOccurrences() > 0) UP.AllowPeeling = UnrollAllowPeeling; + if (UnrollUnrollRemainder.getNumOccurrences() > 0) + UP.UnrollRemainder = UnrollUnrollRemainder; // Apply user values provided by argument if (UserThreshold.hasValue()) { @@ -201,11 +240,14 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( UP.Runtime = *UserRuntime; if (UserUpperBound.hasValue()) UP.UpperBound = *UserUpperBound; + if (UserAllowPeeling.hasValue()) + UP.AllowPeeling = *UserAllowPeeling; return UP; } namespace { + /// A struct to densely store the state of an instruction after unrolling at /// each iteration. 
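gatherUnrollingPreferences layers its sources of truth: built-in defaults, then target hooks, then command-line flags that were actually given, then explicit per-caller overrides such as the new UserAllowPeeling. A condensed sketch of that precedence chain, using std::optional as a stand-in for llvm::Optional:

#include <optional>

struct Prefs { unsigned Threshold = 150; bool AllowPeeling = true; };

// Later layers win: defaults, then the target hook, then a flag that was
// actually passed on the command line, then the per-call-site request.
Prefs gather(std::optional<unsigned> FlagThreshold,
             std::optional<unsigned> UserThreshold,
             std::optional<bool> UserAllowPeeling) {
  Prefs P;                 // 1. baked-in defaults
  // 2. target hook would adjust P here (TTI.getUnrollingPreferences).
  if (FlagThreshold)       // 3. flag, only if it occurred
    P.Threshold = *FlagThreshold;
  if (UserThreshold)       // 4. explicit user request wins last
    P.Threshold = *UserThreshold;
  if (UserAllowPeeling)
    P.AllowPeeling = *UserAllowPeeling;
  return P;
}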
/// @@ -221,25 +263,27 @@ struct UnrolledInstState { /// Hashing and equality testing for a set of the instruction states. struct UnrolledInstStateKeyInfo { - typedef DenseMapInfo<Instruction *> PtrInfo; - typedef DenseMapInfo<std::pair<Instruction *, int>> PairInfo; + using PtrInfo = DenseMapInfo<Instruction *>; + using PairInfo = DenseMapInfo<std::pair<Instruction *, int>>; + static inline UnrolledInstState getEmptyKey() { return {PtrInfo::getEmptyKey(), 0, 0, 0}; } + static inline UnrolledInstState getTombstoneKey() { return {PtrInfo::getTombstoneKey(), 0, 0, 0}; } + static inline unsigned getHashValue(const UnrolledInstState &S) { return PairInfo::getHashValue({S.I, S.Iteration}); } + static inline bool isEqual(const UnrolledInstState &LHS, const UnrolledInstState &RHS) { return PairInfo::isEqual({LHS.I, LHS.Iteration}, {RHS.I, RHS.Iteration}); } }; -} -namespace { struct EstimatedUnrollCost { /// \brief The estimated cost after unrolling. unsigned UnrolledCost; @@ -248,7 +292,8 @@ struct EstimatedUnrollCost { /// rolled form. unsigned RolledDynamicCost; }; -} + +} // end anonymous namespace /// \brief Figure out if the loop is worth full unrolling. /// @@ -270,7 +315,8 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, // We want to be able to scale offsets by the trip count and add more offsets // to them without checking for overflows, and we already don't want to // analyze *massive* trip counts, so we force the max to be reasonably small. - assert(UnrollMaxIterationsCountToAnalyze < (INT_MAX / 2) && + assert(UnrollMaxIterationsCountToAnalyze < + (unsigned)(std::numeric_limits<int>::max() / 2) && "The unroll iterations max is too large!"); // Only analyze inner loops. We can't properly estimate cost of nested loops @@ -633,43 +679,6 @@ static unsigned UnrollCountPragmaValue(const Loop *L) { return 0; } -// Remove existing unroll metadata and add unroll disable metadata to -// indicate the loop has already been unrolled. This prevents a loop -// from being unrolled more than is directed by a pragma if the loop -// unrolling pass is run more than once (which it generally is). -static void SetLoopAlreadyUnrolled(Loop *L) { - MDNode *LoopID = L->getLoopID(); - // First remove any existing loop unrolling metadata. - SmallVector<Metadata *, 4> MDs; - // Reserve first location for self reference to the LoopID metadata node. - MDs.push_back(nullptr); - - if (LoopID) { - for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { - bool IsUnrollMetadata = false; - MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); - if (MD) { - const MDString *S = dyn_cast<MDString>(MD->getOperand(0)); - IsUnrollMetadata = S && S->getString().startswith("llvm.loop.unroll."); - } - if (!IsUnrollMetadata) - MDs.push_back(LoopID->getOperand(i)); - } - } - - // Add unroll(disable) metadata to disable future unrolling. - LLVMContext &Context = L->getHeader()->getContext(); - SmallVector<Metadata *, 1> DisableOperands; - DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.disable")); - MDNode *DisableNode = MDNode::get(Context, DisableOperands); - MDs.push_back(DisableNode); - - MDNode *NewLoopID = MDNode::get(Context, MDs); - // Set operand 0 to refer to the loop id itself. - NewLoopID->replaceOperandWith(0, NewLoopID); - L->setLoopID(NewLoopID); -} - // Computes the boosting factor for complete unrolling. // If fully unrolling the loop would save a lot of RolledDynamicCost, it would // be beneficial to fully unroll the loop even if unrolledcost is large. 
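UnrolledInstStateKeyInfo hashes and compares only the (instruction, iteration) pair and deliberately ignores the rest of the state; the same idea in standard C++, in case the DenseMapInfo plumbing obscures it (types here are stand-ins):

#include <cstddef>
#include <functional>
#include <unordered_map>

struct InstState {
  const void *I;   // stand-in for Instruction *
  int Iteration;
  unsigned Cost;   // payload; not part of the key
};

// Hash and equality look only at (I, Iteration), so two states for the
// same instruction and unrolled iteration collapse to one table entry.
struct InstStateKey {
  size_t operator()(const InstState &S) const {
    return std::hash<const void *>()(S.I) ^
           (std::hash<int>()(S.Iteration) * 0x9e3779b97f4a7c15ULL);
  }
  bool operator()(const InstState &A, const InstState &B) const {
    return A.I == B.I && A.Iteration == B.Iteration;
  }
};

using StateSet = std::unordered_map<InstState, bool, InstStateKey, InstStateKey>;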
We @@ -677,7 +686,7 @@ static void SetLoopAlreadyUnrolled(Loop *L) { // the unroll threshold. static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost, unsigned MaxPercentThresholdBoost) { - if (Cost.RolledDynamicCost >= UINT_MAX / 100) + if (Cost.RolledDynamicCost >= std::numeric_limits<unsigned>::max() / 100) return 100; else if (Cost.UnrolledCost != 0) // The boosting factor is RolledDynamicCost / UnrolledCost @@ -826,11 +835,14 @@ static bool computeUnrollCount( } if (UP.Count < 2) { if (PragmaEnableUnroll) - ORE->emit( - OptimizationRemarkMissed(DEBUG_TYPE, "UnrollAsDirectedTooLarge", - L->getStartLoc(), L->getHeader()) - << "Unable to unroll loop as directed by unroll(enable) pragma " - "because unrolled size is too large."); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, + "UnrollAsDirectedTooLarge", + L->getStartLoc(), L->getHeader()) + << "Unable to unroll loop as directed by unroll(enable) " + "pragma " + "because unrolled size is too large."; + }); UP.Count = 0; } } else { @@ -840,22 +852,27 @@ static bool computeUnrollCount( UP.Count = UP.MaxCount; if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount && UP.Count != TripCount) - ORE->emit( - OptimizationRemarkMissed(DEBUG_TYPE, "FullUnrollAsDirectedTooLarge", - L->getStartLoc(), L->getHeader()) - << "Unable to fully unroll loop as directed by unroll pragma because " - "unrolled size is too large."); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, + "FullUnrollAsDirectedTooLarge", + L->getStartLoc(), L->getHeader()) + << "Unable to fully unroll loop as directed by unroll pragma " + "because " + "unrolled size is too large."; + }); return ExplicitUnroll; } assert(TripCount == 0 && "All cases when TripCount is constant should be covered here."); if (PragmaFullUnroll) - ORE->emit( - OptimizationRemarkMissed(DEBUG_TYPE, - "CantFullUnrollAsDirectedRuntimeTripCount", - L->getStartLoc(), L->getHeader()) - << "Unable to fully unroll loop as directed by unroll(full) pragma " - "because loop has a runtime trip count."); + ORE->emit([&]() { + return OptimizationRemarkMissed( + DEBUG_TYPE, "CantFullUnrollAsDirectedRuntimeTripCount", + L->getStartLoc(), L->getHeader()) + << "Unable to fully unroll loop as directed by unroll(full) " + "pragma " + "because loop has a runtime trip count."; + }); // 6th priority is runtime unrolling. // Don't unroll a runtime trip count loop when it is disabled. @@ -904,19 +921,23 @@ static bool computeUnrollCount( "multiple, " << TripMultiple << ". Reducing unroll count from " << OrigCount << " to " << UP.Count << ".\n"); + using namespace ore; + if (PragmaCount > 0 && !UP.AllowRemainder) - ORE->emit( - OptimizationRemarkMissed(DEBUG_TYPE, - "DifferentUnrollCountFromDirected", - L->getStartLoc(), L->getHeader()) - << "Unable to unroll loop the number of times directed by " - "unroll_count pragma because remainder loop is restricted " - "(that could architecture specific or because the loop " - "contains a convergent instruction) and so must have an unroll " - "count that divides the loop trip multiple of " - << NV("TripMultiple", TripMultiple) << ". 
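Many remarks in this file are being rewrapped in ORE->emit([&]() { ... }); the point is laziness. A minimal model of that API shape (Emitter is a stand-in, not the real OptimizationRemarkEmitter):

#include <iostream>
#include <string>

// Accepting a callable means the (potentially expensive) remark text is
// only built when remarks are actually enabled and will be read.
struct Emitter {
  bool Enabled = false;
  template <typename MakeRemark> void emit(MakeRemark &&Make) {
    if (Enabled)
      std::cout << Make() << '\n'; // pay for string building only here
  }
};

int main() {
  Emitter ORE;
  int TripCount = 7;
  ORE.emit([&] {
    return "unable to unroll; trip count " + std::to_string(TripCount);
  });
  return 0;
}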
Unrolling instead " - << NV("UnrollCount", UP.Count) << " time(s)."); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, + "DifferentUnrollCountFromDirected", + L->getStartLoc(), L->getHeader()) + << "Unable to unroll loop the number of times directed by " + "unroll_count pragma because remainder loop is restricted " + "(that could architecture specific or because the loop " + "contains a convergent instruction) and so must have an " + "unroll " + "count that divides the loop trip multiple of " + << NV("TripMultiple", TripMultiple) << ". Unrolling instead " + << NV("UnrollCount", UP.Count) << " time(s)."; + }); } if (UP.Count > UP.MaxCount) @@ -927,23 +948,21 @@ static bool computeUnrollCount( return ExplicitUnroll; } -static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, - ScalarEvolution &SE, const TargetTransformInfo &TTI, - AssumptionCache &AC, OptimizationRemarkEmitter &ORE, - bool PreserveLCSSA, int OptLevel, - Optional<unsigned> ProvidedCount, - Optional<unsigned> ProvidedThreshold, - Optional<bool> ProvidedAllowPartial, - Optional<bool> ProvidedRuntime, - Optional<bool> ProvidedUpperBound) { +static LoopUnrollResult tryToUnrollLoop( + Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, + const TargetTransformInfo &TTI, AssumptionCache &AC, + OptimizationRemarkEmitter &ORE, bool PreserveLCSSA, int OptLevel, + Optional<unsigned> ProvidedCount, Optional<unsigned> ProvidedThreshold, + Optional<bool> ProvidedAllowPartial, Optional<bool> ProvidedRuntime, + Optional<bool> ProvidedUpperBound, Optional<bool> ProvidedAllowPeeling) { DEBUG(dbgs() << "Loop Unroll: F[" << L->getHeader()->getParent()->getName() << "] Loop %" << L->getHeader()->getName() << "\n"); - if (HasUnrollDisablePragma(L)) - return false; - if (!L->isLoopSimplifyForm()) { + if (HasUnrollDisablePragma(L)) + return LoopUnrollResult::Unmodified; + if (!L->isLoopSimplifyForm()) { DEBUG( dbgs() << " Not unrolling loop which is not in loop-simplify form.\n"); - return false; + return LoopUnrollResult::Unmodified; } unsigned NumInlineCandidates; @@ -951,21 +970,22 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, bool Convergent; TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences( L, SE, TTI, OptLevel, ProvidedThreshold, ProvidedCount, - ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound); + ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound, + ProvidedAllowPeeling); // Exit early if unrolling is disabled. if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0)) - return false; + return LoopUnrollResult::Unmodified; unsigned LoopSize = ApproximateLoopSize( L, NumInlineCandidates, NotDuplicatable, Convergent, TTI, &AC, UP.BEInsns); DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); if (NotDuplicatable) { DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable" << " instructions.\n"); - return false; + return LoopUnrollResult::Unmodified; } if (NumInlineCandidates != 0) { DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n"); - return false; + return LoopUnrollResult::Unmodified; } // Find trip count and trip multiple if count is not available @@ -1024,41 +1044,35 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, computeUnrollCount(L, TTI, DT, LI, SE, &ORE, TripCount, MaxTripCount, TripMultiple, LoopSize, UP, UseUpperBound); if (!UP.Count) - return false; + return LoopUnrollResult::Unmodified; // Unroll factor (Count) must be less or equal to TripCount. 
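tryToUnrollLoop now reports a LoopUnrollResult instead of a bool, so callers can tell partial from full unrolling apart; the essential shape (PartiallyUnrolled is the enum's third state, recalled from the LLVM headers rather than shown in this hunk):

// The bool return collapsed two interesting outcomes. The enum keeps
// them apart: the pass manager must special-case FullyUnrolled, whose
// Loop object is about to disappear.
enum class LoopUnrollResult { Unmodified, PartiallyUnrolled, FullyUnrolled };

// Callers that only care whether anything changed still get a cheap test.
inline bool changed(LoopUnrollResult R) {
  return R != LoopUnrollResult::Unmodified;
}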
if (TripCount && UP.Count > TripCount) UP.Count = TripCount; // Unroll the loop. - if (!UnrollLoop(L, UP.Count, TripCount, UP.Force, UP.Runtime, - UP.AllowExpensiveTripCount, UseUpperBound, MaxOrZero, - TripMultiple, UP.PeelCount, LI, &SE, &DT, &AC, &ORE, - PreserveLCSSA)) - return false; + LoopUnrollResult UnrollResult = UnrollLoop( + L, UP.Count, TripCount, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount, + UseUpperBound, MaxOrZero, TripMultiple, UP.PeelCount, UP.UnrollRemainder, + LI, &SE, &DT, &AC, &ORE, PreserveLCSSA); + if (UnrollResult == LoopUnrollResult::Unmodified) + return LoopUnrollResult::Unmodified; // If loop has an unroll count pragma or unrolled by explicitly set count // mark loop as unrolled to prevent unrolling beyond that requested. // If the loop was peeled, we already "used up" the profile information // we had, so we don't want to unroll or peel again. - if (IsCountSetExplicitly || UP.PeelCount) - SetLoopAlreadyUnrolled(L); + if (UnrollResult != LoopUnrollResult::FullyUnrolled && + (IsCountSetExplicitly || UP.PeelCount)) + L->setLoopAlreadyUnrolled(); - return true; + return UnrollResult; } namespace { + class LoopUnroll : public LoopPass { public: static char ID; // Pass ID, replacement for typeid - LoopUnroll(int OptLevel = 2, Optional<unsigned> Threshold = None, - Optional<unsigned> Count = None, - Optional<bool> AllowPartial = None, Optional<bool> Runtime = None, - Optional<bool> UpperBound = None) - : LoopPass(ID), OptLevel(OptLevel), ProvidedCount(std::move(Count)), - ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial), - ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound) { - initializeLoopUnrollPass(*PassRegistry::getPassRegistry()); - } int OptLevel; Optional<unsigned> ProvidedCount; @@ -1066,8 +1080,21 @@ public: Optional<bool> ProvidedAllowPartial; Optional<bool> ProvidedRuntime; Optional<bool> ProvidedUpperBound; + Optional<bool> ProvidedAllowPeeling; - bool runOnLoop(Loop *L, LPPassManager &) override { + LoopUnroll(int OptLevel = 2, Optional<unsigned> Threshold = None, + Optional<unsigned> Count = None, + Optional<bool> AllowPartial = None, Optional<bool> Runtime = None, + Optional<bool> UpperBound = None, + Optional<bool> AllowPeeling = None) + : LoopPass(ID), OptLevel(OptLevel), ProvidedCount(std::move(Count)), + ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial), + ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound), + ProvidedAllowPeeling(AllowPeeling) { + initializeLoopUnrollPass(*PassRegistry::getPassRegistry()); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM) override { if (skipLoop(L)) return false; @@ -1085,15 +1112,19 @@ public: OptimizationRemarkEmitter ORE(&F); bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); - return tryToUnrollLoop(L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA, OptLevel, - ProvidedCount, ProvidedThreshold, - ProvidedAllowPartial, ProvidedRuntime, - ProvidedUpperBound); + LoopUnrollResult Result = tryToUnrollLoop( + L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA, OptLevel, ProvidedCount, + ProvidedThreshold, ProvidedAllowPartial, ProvidedRuntime, + ProvidedUpperBound, ProvidedAllowPeeling); + + if (Result == LoopUnrollResult::FullyUnrolled) + LPM.markLoopAsDeleted(*L); + + return Result != LoopUnrollResult::Unmodified; } /// This transformation requires natural loop information & requires that /// loop preheaders be inserted into the CFG... 
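The new markLoopAsDeleted call below is a correctness requirement of the legacy pass manager, not just bookkeeping; a toy model of the contract (stand-in types, not the LLVM classes):

#include <iostream>

struct Loop { const char *Name; };
struct LPPassManager {
  void markLoopAsDeleted(Loop &L) {
    std::cout << "dropping deleted loop " << L.Name << " from the queue\n";
  }
};

// When unrolling removes the loop entirely, the pass must say so before
// returning, or the manager would keep a dangling Loop * on its worklist.
bool runOnLoopSketch(Loop *L, LPPassManager &LPM, bool FullyUnrolled) {
  if (FullyUnrolled)
    LPM.markLoopAsDeleted(*L);
  return true;
}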
- /// void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<TargetTransformInfoWrapperPass>(); @@ -1102,9 +1133,11 @@ public: getLoopAnalysisUsage(AU); } }; -} + +} // end anonymous namespace char LoopUnroll::ID = 0; + INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(LoopPass) @@ -1112,8 +1145,8 @@ INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false) Pass *llvm::createLoopUnrollPass(int OptLevel, int Threshold, int Count, - int AllowPartial, int Runtime, - int UpperBound) { + int AllowPartial, int Runtime, int UpperBound, + int AllowPeeling) { // TODO: It would make more sense for this function to take the optionals // directly, but that's dangerous since it would silently break out of tree // callers. @@ -1122,16 +1155,17 @@ Pass *llvm::createLoopUnrollPass(int OptLevel, int Threshold, int Count, Count == -1 ? None : Optional<unsigned>(Count), AllowPartial == -1 ? None : Optional<bool>(AllowPartial), Runtime == -1 ? None : Optional<bool>(Runtime), - UpperBound == -1 ? None : Optional<bool>(UpperBound)); + UpperBound == -1 ? None : Optional<bool>(UpperBound), + AllowPeeling == -1 ? None : Optional<bool>(AllowPeeling)); } Pass *llvm::createSimpleLoopUnrollPass(int OptLevel) { - return llvm::createLoopUnrollPass(OptLevel, -1, -1, 0, 0, 0); + return createLoopUnrollPass(OptLevel, -1, -1, 0, 0, 0, 0); } -PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM, - LoopStandardAnalysisResults &AR, - LPMUpdater &Updater) { +PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &Updater) { const auto &FAM = AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager(); Function *F = L.getHeader()->getParent(); @@ -1139,8 +1173,9 @@ PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM, auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F); // FIXME: This should probably be optional rather than required. if (!ORE) - report_fatal_error("LoopUnrollPass: OptimizationRemarkEmitterAnalysis not " - "cached at a higher level"); + report_fatal_error( + "LoopFullUnrollPass: OptimizationRemarkEmitterAnalysis not " + "cached at a higher level"); // Keep track of the previous loop structure so we can identify new loops // created by unrolling. @@ -1151,17 +1186,14 @@ PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM, else OldLoops.insert(AR.LI.begin(), AR.LI.end()); - // The API here is quite complex to call, but there are only two interesting - // states we support: partial and full (or "simple") unrolling. However, to - // enable these things we actually pass "None" in for the optional to avoid - // providing an explicit choice. 
- Optional<bool> AllowPartialParam, RuntimeParam, UpperBoundParam; - if (!AllowPartialUnrolling) - AllowPartialParam = RuntimeParam = UpperBoundParam = false; - bool Changed = tryToUnrollLoop( - &L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE, - /*PreserveLCSSA*/ true, OptLevel, /*Count*/ None, - /*Threshold*/ None, AllowPartialParam, RuntimeParam, UpperBoundParam); + std::string LoopName = L.getName(); + + bool Changed = + tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE, + /*PreserveLCSSA*/ true, OptLevel, /*Count*/ None, + /*Threshold*/ None, /*AllowPartial*/ false, + /*Runtime*/ false, /*UpperBound*/ false, + /*AllowPeeling*/ false) != LoopUnrollResult::Unmodified; if (!Changed) return PreservedAnalyses::all(); @@ -1172,17 +1204,13 @@ PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM, #endif // Unrolling can do several things to introduce new loops into a loop nest: - // - Partial unrolling clones child loops within the current loop. If it - // uses a remainder, then it can also create any number of sibling loops. // - Full unrolling clones child loops within the current loop but then // removes the current loop making all of the children appear to be new // sibling loops. - // - Loop peeling can directly introduce new sibling loops by peeling one - // iteration. // - // When a new loop appears as a sibling loop, either from peeling an - // iteration or fully unrolling, its nesting structure has fundamentally - // changed and we want to revisit it to reflect that. + // When a new loop appears as a sibling loop after fully unrolling, + // its nesting structure has fundamentally changed and we want to revisit + // it to reflect that. // // When unrolling has removed the current loop, we need to tell the // infrastructure that it is gone. @@ -1209,13 +1237,11 @@ PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM, Updater.addSiblingLoops(SibLoops); if (!IsCurrentLoopValid) { - Updater.markLoopAsDeleted(L); + Updater.markLoopAsDeleted(L, LoopName); } else { // We can only walk child loops if the current loop remained valid. if (UnrollRevisitChildLoops) { - // Walk *all* of the child loops. This is a highly speculative mode - // anyways so look for any simplifications that arose from partial - // unrolling or peeling off of iterations. + // Walk *all* of the child loops. SmallVector<Loop *, 4> ChildLoops(L.begin(), L.end()); Updater.addChildLoops(ChildLoops); } @@ -1223,3 +1249,105 @@ PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM, return getLoopPassPreservedAnalyses(); } + +template <typename RangeT> +static SmallVector<Loop *, 8> appendLoopsToWorklist(RangeT &&Loops) { + SmallVector<Loop *, 8> Worklist; + // We use an internal worklist to build up the preorder traversal without + // recursion. 
+ SmallVector<Loop *, 4> PreOrderLoops, PreOrderWorklist; + + for (Loop *RootL : Loops) { + assert(PreOrderLoops.empty() && "Must start with an empty preorder walk."); + assert(PreOrderWorklist.empty() && + "Must start with an empty preorder walk worklist."); + PreOrderWorklist.push_back(RootL); + do { + Loop *L = PreOrderWorklist.pop_back_val(); + PreOrderWorklist.append(L->begin(), L->end()); + PreOrderLoops.push_back(L); + } while (!PreOrderWorklist.empty()); + + Worklist.append(PreOrderLoops.begin(), PreOrderLoops.end()); + PreOrderLoops.clear(); + } + return Worklist; +} + +PreservedAnalyses LoopUnrollPass::run(Function &F, + FunctionAnalysisManager &AM) { + auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); + auto &LI = AM.getResult<LoopAnalysis>(F); + auto &TTI = AM.getResult<TargetIRAnalysis>(F); + auto &DT = AM.getResult<DominatorTreeAnalysis>(F); + auto &AC = AM.getResult<AssumptionAnalysis>(F); + auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); + + LoopAnalysisManager *LAM = nullptr; + if (auto *LAMProxy = AM.getCachedResult<LoopAnalysisManagerFunctionProxy>(F)) + LAM = &LAMProxy->getManager(); + + const ModuleAnalysisManager &MAM = + AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager(); + ProfileSummaryInfo *PSI = + MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); + + bool Changed = false; + + // The unroller requires loops to be in simplified form, and also needs LCSSA. + // Since simplification may add new inner loops, it has to run before the + // legality and profitability checks. This means running the loop unroller + // will simplify all loops, regardless of whether anything end up being + // unrolled. + for (auto &L : LI) { + Changed |= simplifyLoop(L, &DT, &LI, &SE, &AC, false /* PreserveLCSSA */); + Changed |= formLCSSARecursively(*L, DT, &LI, &SE); + } + + SmallVector<Loop *, 8> Worklist = appendLoopsToWorklist(LI); + + while (!Worklist.empty()) { + // Because the LoopInfo stores the loops in RPO, we walk the worklist + // from back to front so that we work forward across the CFG, which + // for unrolling is only needed to get optimization remarks emitted in + // a forward order. + Loop &L = *Worklist.pop_back_val(); +#ifndef NDEBUG + Loop *ParentL = L.getParentLoop(); +#endif + + // The API here is quite complex to call, but there are only two interesting + // states we support: partial and full (or "simple") unrolling. However, to + // enable these things we actually pass "None" in for the optional to avoid + // providing an explicit choice. + Optional<bool> AllowPartialParam, RuntimeParam, UpperBoundParam, + AllowPeeling; + // Check if the profile summary indicates that the profiled application + // has a huge working set size, in which case we disable peeling to avoid + // bloating it further. + if (PSI && PSI->hasHugeWorkingSetSize()) + AllowPeeling = false; + std::string LoopName = L.getName(); + LoopUnrollResult Result = + tryToUnrollLoop(&L, DT, &LI, SE, TTI, AC, ORE, + /*PreserveLCSSA*/ true, OptLevel, /*Count*/ None, + /*Threshold*/ None, AllowPartialParam, RuntimeParam, + UpperBoundParam, AllowPeeling); + Changed |= Result != LoopUnrollResult::Unmodified; + + // The parent must not be damaged by unrolling! +#ifndef NDEBUG + if (Result != LoopUnrollResult::Unmodified && ParentL) + ParentL->verifyLoop(); +#endif + + // Clear any cached analysis results for L if we removed it completely. 
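The preorder walk above generalizes to any tree; a freestanding version of the same explicit-stack idiom (siblings come out in reverse declaration order, which is fine for a worklist):

#include <vector>

struct Node { std::vector<Node *> Children; };

// An explicit stack yields a preorder traversal without recursion, so
// arbitrarily deep nests cannot overflow the call stack.
std::vector<Node *> preorder(Node *Root) {
  std::vector<Node *> Order;
  std::vector<Node *> Stack{Root};
  while (!Stack.empty()) {
    Node *N = Stack.back();
    Stack.pop_back();
    Order.push_back(N);
    Stack.insert(Stack.end(), N->Children.begin(), N->Children.end());
  }
  return Order;
}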
+ if (LAM && Result == LoopUnrollResult::FullyUnrolled) + LAM->clear(L, LoopName); + } + + if (!Changed) + return PreservedAnalyses::all(); + + return getLoopPassPreservedAnalyses(); +} diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index d0c96fa627a4..bd468338a1d0 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -1,4 +1,4 @@ -//===-- LoopUnswitch.cpp - Hoist loop-invariant conditionals in loop ------===// +//===- LoopUnswitch.cpp - Hoist loop-invariant conditionals in loop -------===// // // The LLVM Compiler Infrastructure // @@ -26,30 +26,40 @@ // //===----------------------------------------------------------------------===// -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/BlockFrequencyInfo.h" -#include "llvm/Analysis/BlockFrequencyInfoImpl.h" -#include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/DivergenceAnalysis.h" -#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" -#include "llvm/Support/BranchProbability.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -58,9 +68,15 @@ #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/ValueMapper.h" #include <algorithm> +#include <cassert> #include <map> #include <set> +#include <tuple> +#include <utility> +#include <vector> + using namespace llvm; #define DEBUG_TYPE "loop-unswitch" @@ -82,11 +98,9 @@ Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"), namespace { class LUAnalysisCache { - - typedef DenseMap<const SwitchInst*, SmallPtrSet<const Value *, 8> > - UnswitchedValsMap; - - typedef UnswitchedValsMap::iterator UnswitchedValsIt; + using UnswitchedValsMap = + DenseMap<const SwitchInst *, SmallPtrSet<const Value *, 8>>; + using UnswitchedValsIt = UnswitchedValsMap::iterator; struct LoopProperties { unsigned CanBeUnswitchedCount; @@ -97,12 +111,12 @@ namespace { // Here we use std::map instead of DenseMap, since we need to keep valid // LoopProperties pointer for current loop for better performance. 
- typedef std::map<const Loop*, LoopProperties> LoopPropsMap; - typedef LoopPropsMap::iterator LoopPropsMapIt; + using LoopPropsMap = std::map<const Loop *, LoopProperties>; + using LoopPropsMapIt = LoopPropsMap::iterator; LoopPropsMap LoopsProperties; - UnswitchedValsMap *CurLoopInstructions; - LoopProperties *CurrentLoopProperties; + UnswitchedValsMap *CurLoopInstructions = nullptr; + LoopProperties *CurrentLoopProperties = nullptr; // A loop unswitching with an estimated cost above this threshold // is not performed. MaxSize is turned into unswitching quota for @@ -121,9 +135,7 @@ namespace { unsigned MaxSize; public: - LUAnalysisCache() - : CurLoopInstructions(nullptr), CurrentLoopProperties(nullptr), - MaxSize(Threshold) {} + LUAnalysisCache() : MaxSize(Threshold) {} // Analyze loop. Check its size, calculate is it possible to unswitch // it. Returns true if we can unswitch this loop. @@ -164,12 +176,12 @@ namespace { LUAnalysisCache BranchesInfo; bool OptimizeForSize; - bool redoLoop; + bool redoLoop = false; - Loop *currentLoop; - DominatorTree *DT; - BasicBlock *loopHeader; - BasicBlock *loopPreheader; + Loop *currentLoop = nullptr; + DominatorTree *DT = nullptr; + BasicBlock *loopHeader = nullptr; + BasicBlock *loopPreheader = nullptr; bool SanitizeMemory; LoopSafetyInfo SafetyInfo; @@ -185,16 +197,17 @@ namespace { public: static char ID; // Pass ID, replacement for typeid - explicit LoopUnswitch(bool Os = false, bool hasBranchDivergence = false) : - LoopPass(ID), OptimizeForSize(Os), redoLoop(false), - currentLoop(nullptr), DT(nullptr), loopHeader(nullptr), - loopPreheader(nullptr), hasBranchDivergence(hasBranchDivergence) { + + explicit LoopUnswitch(bool Os = false, bool hasBranchDivergence = false) + : LoopPass(ID), OptimizeForSize(Os), + hasBranchDivergence(hasBranchDivergence) { initializeLoopUnswitchPass(*PassRegistry::getPassRegistry()); - } + } bool runOnLoop(Loop *L, LPPassManager &LPM) override; bool processCurrentLoop(); bool isUnreachableDueToPreviousUnswitching(BasicBlock *); + /// This transformation requires natural loop information & requires that /// loop preheaders be inserted into the CFG. /// @@ -207,7 +220,6 @@ namespace { } private: - void releaseMemory() override { BranchesInfo.forgetLoop(currentLoop); } @@ -237,7 +249,7 @@ namespace { void EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val, BasicBlock *TrueDest, BasicBlock *FalseDest, - Instruction *InsertPt, + BranchInst *OldBranch, TerminatorInst *TI); void SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L); @@ -247,13 +259,13 @@ namespace { Value *SimplifyInstructionWithNotEqual(Instruction *Inst, Value *Invariant, Constant *Val); }; -} + +} // end anonymous namespace // Analyze loop. Check its size, calculate is it possible to unswitch // it. Returns true if we can unswitch this loop. bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI, AssumptionCache *AC) { - LoopPropsMapIt PropsIt; bool Inserted; std::tie(PropsIt, Inserted) = @@ -302,7 +314,6 @@ bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI, // Clean all data related to given loop. void LUAnalysisCache::forgetLoop(const Loop *L) { - LoopPropsMapIt LIt = LoopsProperties.find(L); if (LIt != LoopsProperties.end()) { @@ -337,7 +348,6 @@ bool LUAnalysisCache::CostAllowsUnswitching() { // Note, that new loop data is stored inside the VMap. 
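The comment above about std::map versus DenseMap is about pointer stability; a short demonstration (toy key type) of why the cached LoopProperties pointer survives only with a node-based map:

#include <map>

// DenseMap stores key/value pairs inline in a flat table and may move
// them when it grows, so a held LoopProperties * can dangle after an
// insert. std::map allocates stable nodes: later inserts never move
// existing elements, which is what CurrentLoopProperties needs.
struct LoopProperties { unsigned CanBeUnswitchedCount = 0; };

int main() {
  std::map<int, LoopProperties> Props;
  LoopProperties *Cur = &Props[1]; // cache the current loop's entry
  for (int K = 2; K < 1000; ++K)
    Props[K];                      // node-based: Cur stays valid
  Cur->CanBeUnswitchedCount++;
  return 0;
}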
void LUAnalysisCache::cloneData(const Loop *NewLoop, const Loop *OldLoop, const ValueToValueMapTy &VMap) { - LoopProperties &NewLoopProps = LoopsProperties[NewLoop]; LoopProperties &OldLoopProps = *CurrentLoopProperties; UnswitchedValsMap &Insts = OldLoopProps.UnswitchedVals; @@ -367,6 +377,7 @@ void LUAnalysisCache::cloneData(const Loop *NewLoop, const Loop *OldLoop, } char LoopUnswitch::ID = 0; + INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) @@ -518,9 +529,6 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) { Changed |= processCurrentLoop(); } while(redoLoop); - // FIXME: Reconstruct dom info, because it is not preserved properly. - if (Changed) - DT->recalculate(*F); return Changed; } @@ -553,6 +561,48 @@ bool LoopUnswitch::isUnreachableDueToPreviousUnswitching(BasicBlock *BB) { return false; } +/// FIXME: Remove this workaround when freeze related patches are done. +/// LoopUnswitch and Equality propagation in GVN have discrepancy about +/// whether branch on undef/poison has undefine behavior. Here it is to +/// rule out some common cases that we found such discrepancy already +/// causing problems. Detail could be found in PR31652. Note if the +/// func returns true, it is unsafe. But if it is false, it doesn't mean +/// it is necessarily safe. +static bool EqualityPropUnSafe(Value &LoopCond) { + ICmpInst *CI = dyn_cast<ICmpInst>(&LoopCond); + if (!CI || !CI->isEquality()) + return false; + + Value *LHS = CI->getOperand(0); + Value *RHS = CI->getOperand(1); + if (isa<UndefValue>(LHS) || isa<UndefValue>(RHS)) + return true; + + auto hasUndefInPHI = [](PHINode &PN) { + for (Value *Opd : PN.incoming_values()) { + if (isa<UndefValue>(Opd)) + return true; + } + return false; + }; + PHINode *LPHI = dyn_cast<PHINode>(LHS); + PHINode *RPHI = dyn_cast<PHINode>(RHS); + if ((LPHI && hasUndefInPHI(*LPHI)) || (RPHI && hasUndefInPHI(*RPHI))) + return true; + + auto hasUndefInSelect = [](SelectInst &SI) { + if (isa<UndefValue>(SI.getTrueValue()) || + isa<UndefValue>(SI.getFalseValue())) + return true; + return false; + }; + SelectInst *LSI = dyn_cast<SelectInst>(LHS); + SelectInst *RSI = dyn_cast<SelectInst>(RHS); + if ((LSI && hasUndefInSelect(*LSI)) || (RSI && hasUndefInSelect(*RSI))) + return true; + return false; +} + /// Do actual work and unswitch loop if possible and profitable. bool LoopUnswitch::processCurrentLoop() { bool Changed = false; @@ -666,7 +716,7 @@ bool LoopUnswitch::processCurrentLoop() { // unswitch on it if we desire. Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), currentLoop, Changed).first; - if (LoopCond && + if (LoopCond && !EqualityPropUnSafe(*LoopCond) && UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context), TI)) { ++NumBranches; return true; @@ -831,7 +881,7 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val, /// mapping the blocks with the specified map. static Loop *CloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM, LoopInfo *LI, LPPassManager *LPM) { - Loop &New = *new Loop(); + Loop &New = *LI->AllocateLoop(); if (PL) PL->addChildLoop(&New); else @@ -852,31 +902,59 @@ static Loop *CloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM, } /// Emit a conditional branch on two values if LIC == Val, branch to TrueDst, -/// otherwise branch to FalseDest. Insert the code immediately before InsertPt. +/// otherwise branch to FalseDest. Insert the code immediately before OldBranch +/// and remove (but not erase!) 
it from the function. void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val, BasicBlock *TrueDest, BasicBlock *FalseDest, - Instruction *InsertPt, + BranchInst *OldBranch, TerminatorInst *TI) { + assert(OldBranch->isUnconditional() && "Preheader is not split correctly"); // Insert a conditional branch on LIC to the two preheaders. The original // code is the true version and the new code is the false version. Value *BranchVal = LIC; bool Swapped = false; if (!isa<ConstantInt>(Val) || Val->getType() != Type::getInt1Ty(LIC->getContext())) - BranchVal = new ICmpInst(InsertPt, ICmpInst::ICMP_EQ, LIC, Val); + BranchVal = new ICmpInst(OldBranch, ICmpInst::ICMP_EQ, LIC, Val); else if (Val != ConstantInt::getTrue(Val->getContext())) { // We want to enter the new loop when the condition is true. std::swap(TrueDest, FalseDest); Swapped = true; } + // Old branch will be removed, so save its parent and successor to update the + // DomTree. + auto *OldBranchSucc = OldBranch->getSuccessor(0); + auto *OldBranchParent = OldBranch->getParent(); + // Insert the new branch. BranchInst *BI = - IRBuilder<>(InsertPt).CreateCondBr(BranchVal, TrueDest, FalseDest, TI); + IRBuilder<>(OldBranch).CreateCondBr(BranchVal, TrueDest, FalseDest, TI); if (Swapped) BI->swapProfMetadata(); + // Remove the old branch so there is only one branch at the end. This is + // needed to perform DomTree's internal DFS walk on the function's CFG. + OldBranch->removeFromParent(); + + // Inform the DT about the new branch. + if (DT) { + // First, add both successors. + SmallVector<DominatorTree::UpdateType, 3> Updates; + if (TrueDest != OldBranchParent) + Updates.push_back({DominatorTree::Insert, OldBranchParent, TrueDest}); + if (FalseDest != OldBranchParent) + Updates.push_back({DominatorTree::Insert, OldBranchParent, FalseDest}); + // If both of the new successors are different from the old one, inform the + // DT that the edge was deleted. + if (OldBranchSucc != TrueDest && OldBranchSucc != FalseDest) { + Updates.push_back({DominatorTree::Delete, OldBranchParent, OldBranchSucc}); + } + + DT->applyUpdates(Updates); + } + // If either edge is critical, split it. This helps preserve LoopSimplify // form for enclosing loops. auto Options = CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA(); @@ -916,10 +994,14 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val, // Okay, now we have a position to branch from and a position to branch to, // insert the new conditional branch. - EmitPreheaderBranchOnCondition(Cond, Val, NewExit, NewPH, - loopPreheader->getTerminator(), TI); - LPM->deleteSimpleAnalysisValue(loopPreheader->getTerminator(), L); - loopPreheader->getTerminator()->eraseFromParent(); + auto *OldBranch = dyn_cast<BranchInst>(loopPreheader->getTerminator()); + assert(OldBranch && "Failed to split the preheader"); + EmitPreheaderBranchOnCondition(Cond, Val, NewExit, NewPH, OldBranch, TI); + LPM->deleteSimpleAnalysisValue(OldBranch, L); + + // EmitPreheaderBranchOnCondition removed the OldBranch from the function. + // Delete it, as it is no longer needed. + delete OldBranch; // We need to reprocess this loop, it could be unswitched again. redoLoop = true; @@ -1035,6 +1117,9 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) { if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin())) return false; // Can't handle this. 
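The DominatorTree::applyUpdates sequence above is the incremental-update idiom this commit moves LoopUnswitch onto, replacing the old whole-function recalculation. Distilled into a helper (our name, same LLVM API as the hunk):

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"
using namespace llvm;

// Collect edge insertions and deletions, then hand them to the tree in
// one applyUpdates() call. Batching lets the tree apply them in a valid
// order, and it is far cheaper than recalculate()-ing the whole tree.
static void retargetBranchInDT(DominatorTree &DT, BasicBlock *From,
                               BasicBlock *OldSucc, BasicBlock *NewSucc) {
  SmallVector<DominatorTree::UpdateType, 2> Updates;
  Updates.push_back({DominatorTree::Insert, From, NewSucc});
  Updates.push_back({DominatorTree::Delete, From, OldSucc});
  DT.applyUpdates(Updates);
}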
+ if (EqualityPropUnSafe(*LoopCond)) + return false; + UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, LoopExitBB, CurrentTerm); ++NumBranches; @@ -1231,7 +1316,10 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, EmitPreheaderBranchOnCondition(LIC, Val, NewBlocks[0], LoopBlocks[0], OldBR, TI); LPM->deleteSimpleAnalysisValue(OldBR, L); - OldBR->eraseFromParent(); + + // The OldBr was replaced by a new one and removed (but not erased) by + // EmitPreheaderBranchOnCondition. It is no longer needed, so delete it. + delete OldBR; LoopProcessWorklist.push_back(NewLoop); redoLoop = true; diff --git a/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/lib/Transforms/Scalar/LoopVersioningLICM.cpp index c23d891b6504..53b25e688e82 100644 --- a/lib/Transforms/Scalar/LoopVersioningLICM.cpp +++ b/lib/Transforms/Scalar/LoopVersioningLICM.cpp @@ -1,4 +1,4 @@ -//===----------- LoopVersioningLICM.cpp - LICM Loop Versioning ------------===// +//===- LoopVersioningLICM.cpp - LICM Loop Versioning ----------------------===// // // The LLVM Compiler Infrastructure // @@ -60,41 +60,41 @@ // //===----------------------------------------------------------------------===// -#include "llvm/ADT/MapVector.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" -#include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpander.h" -#include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/Analysis/VectorUtils.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" -#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" -#include "llvm/IR/PatternMatch.h" -#include "llvm/IR/PredIteratorCache.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/LoopVersioning.h" -#include "llvm/Transforms/Utils/ValueMapper.h" +#include <cassert> +#include <memory> + +using namespace llvm; #define DEBUG_TYPE "loop-versioning-licm" -static const char *LICMVersioningMetaData = "llvm.loop.licm_versioning.disable"; -using namespace llvm; +static const char *LICMVersioningMetaData = "llvm.loop.licm_versioning.disable"; /// Threshold minimum allowed percentage for possible /// invariant instructions in a loop. 
@@ -143,9 +143,16 @@ void llvm::addStringMetadataToLoop(Loop *TheLoop, const char *MDString, } namespace { + struct LoopVersioningLICM : public LoopPass { static char ID; + LoopVersioningLICM() + : LoopPass(ID), LoopDepthThreshold(LVLoopDepthThreshold), + InvariantThreshold(LVInvarThreshold) { + initializeLoopVersioningLICMPass(*PassRegistry::getPassRegistry()); + } + bool runOnLoop(Loop *L, LPPassManager &LPM) override; void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -161,13 +168,6 @@ struct LoopVersioningLICM : public LoopPass { AU.addPreserved<GlobalsAAWrapperPass>(); } - LoopVersioningLICM() - : LoopPass(ID), AA(nullptr), SE(nullptr), LAA(nullptr), LAI(nullptr), - CurLoop(nullptr), LoopDepthThreshold(LVLoopDepthThreshold), - InvariantThreshold(LVInvarThreshold), LoadAndStoreCounter(0), - InvariantCounter(0), IsReadOnlyLoop(true) { - initializeLoopVersioningLICMPass(*PassRegistry::getPassRegistry()); - } StringRef getPassName() const override { return "Loop Versioning for LICM"; } void reset() { @@ -191,30 +191,49 @@ struct LoopVersioningLICM : public LoopPass { }; private: - AliasAnalysis *AA; // Current AliasAnalysis information - ScalarEvolution *SE; // Current ScalarEvolution - LoopAccessLegacyAnalysis *LAA; // Current LoopAccessAnalysis - const LoopAccessInfo *LAI; // Current Loop's LoopAccessInfo + // Current AliasAnalysis information + AliasAnalysis *AA = nullptr; + + // Current ScalarEvolution + ScalarEvolution *SE = nullptr; + + // Current LoopAccessAnalysis + LoopAccessLegacyAnalysis *LAA = nullptr; + + // Current Loop's LoopAccessInfo + const LoopAccessInfo *LAI = nullptr; + + // The current loop we are working on. + Loop *CurLoop = nullptr; + + // AliasSet information for the current loop. + std::unique_ptr<AliasSetTracker> CurAST; - Loop *CurLoop; // The current loop we are working on. - std::unique_ptr<AliasSetTracker> - CurAST; // AliasSet information for the current loop. + // Maximum loop nest threshold + unsigned LoopDepthThreshold; - unsigned LoopDepthThreshold; // Maximum loop nest threshold - float InvariantThreshold; // Minimum invariant threshold - unsigned LoadAndStoreCounter; // Counter to track num of load & store - unsigned InvariantCounter; // Counter to track num of invariant - bool IsReadOnlyLoop; // Read only loop marker. + // Minimum invariant threshold + float InvariantThreshold; + + // Counter to track num of load & store + unsigned LoadAndStoreCounter = 0; + + // Counter to track num of invariant + unsigned InvariantCounter = 0; + + // Read only loop marker. + bool IsReadOnlyLoop = true; bool isLegalForVersioning(); bool legalLoopStructure(); bool legalLoopInstructions(); bool legalLoopMemoryAccesses(); bool isLoopAlreadyVisited(); - void setNoAliasToLoop(Loop *); - bool instructionSafeForVersioning(Instruction *); + void setNoAliasToLoop(Loop *VerLoop); + bool instructionSafeForVersioning(Instruction *I); }; -} + +} // end anonymous namespace /// \brief Check loop structure and confirms it's good for LoopVersioningLICM. bool LoopVersioningLICM::legalLoopStructure() { @@ -225,7 +244,7 @@ bool LoopVersioningLICM::legalLoopStructure() { return false; } // Loop should be innermost loop, if not return false. 
- if (CurLoop->getSubLoops().size()) { + if (!CurLoop->getSubLoops().empty()) { DEBUG(dbgs() << " loop is not innermost\n"); return false; } @@ -562,6 +581,7 @@ bool LoopVersioningLICM::runOnLoop(Loop *L, LPPassManager &LPM) { } char LoopVersioningLICM::ID = 0; + INITIALIZE_PASS_BEGIN(LoopVersioningLICM, "loop-versioning-licm", "Loop Versioning For LICM", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) diff --git a/lib/Transforms/Scalar/LowerAtomic.cpp b/lib/Transforms/Scalar/LowerAtomic.cpp index 6f77c5bd0d07..c165c5ece95c 100644 --- a/lib/Transforms/Scalar/LowerAtomic.cpp +++ b/lib/Transforms/Scalar/LowerAtomic.cpp @@ -15,7 +15,6 @@ #include "llvm/Transforms/Scalar/LowerAtomic.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" using namespace llvm; diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 7896396f0898..9c870b42a747 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -14,10 +14,12 @@ #include "llvm/Transforms/Scalar/MemCpyOptimizer.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/None.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/iterator_range.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" @@ -25,6 +27,8 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Argument.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" @@ -41,6 +45,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" @@ -54,6 +59,7 @@ #include <algorithm> #include <cassert> #include <cstdint> +#include <utility> using namespace llvm; @@ -225,15 +231,18 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const { namespace { class MemsetRanges { + using range_iterator = SmallVectorImpl<MemsetRange>::iterator; + /// A sorted list of the memset ranges. SmallVector<MemsetRange, 8> Ranges; - typedef SmallVectorImpl<MemsetRange>::iterator range_iterator; + const DataLayout &DL; public: MemsetRanges(const DataLayout &DL) : DL(DL) {} - typedef SmallVectorImpl<MemsetRange>::const_iterator const_iterator; + using const_iterator = SmallVectorImpl<MemsetRange>::const_iterator; + const_iterator begin() const { return Ranges.begin(); } const_iterator end() const { return Ranges.end(); } bool empty() const { return Ranges.empty(); } @@ -259,7 +268,6 @@ public: void addRange(int64_t Start, int64_t Size, Value *Ptr, unsigned Alignment, Instruction *Inst); - }; } // end anonymous namespace @@ -356,10 +364,10 @@ private: } }; -char MemCpyOptLegacyPass::ID = 0; - } // end anonymous namespace +char MemCpyOptLegacyPass::ID = 0; + /// The public interface to this file... FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOptLegacyPass(); } @@ -450,7 +458,6 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, // emit memset's for anything big enough to be worthwhile. 
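For a concrete picture of what the MemsetRanges bookkeeping above collects, consider this illustrative source pattern (invented for exposition, not taken from the patch):

```cpp
// Illustration only: adjacent stores that MemsetRanges coalesces. Each store
// is fed to addRange() with its (Start, Size); contiguous ranges merge, and
// a profitable merged range is then lowered to a single memset.
void clearHeader(char *P) {
  P[0] = 0; // range [0, 1)
  P[1] = 0; // extends it to [0, 2)
  P[2] = 0; // extends it to [0, 3)
  P[3] = 0; // extends it to [0, 4)  ->  memset(P, 0, 4)
}
```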
Instruction *AMemSet = nullptr; for (const MemsetRange &Range : Ranges) { - if (Range.TheStores.size() == 1) continue; // If it is profitable to lower this range to memset, do so now. @@ -511,7 +518,7 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P, const LoadInst *LI) { // If the store alias this position, early bail out. MemoryLocation StoreLoc = MemoryLocation::get(SI); - if (AA.getModRefInfo(P, StoreLoc) != MRI_NoModRef) + if (isModOrRefSet(AA.getModRefInfo(P, StoreLoc))) return false; // Keep track of the arguments of all instruction we plan to lift @@ -535,20 +542,20 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P, for (auto I = --SI->getIterator(), E = P->getIterator(); I != E; --I) { auto *C = &*I; - bool MayAlias = AA.getModRefInfo(C) != MRI_NoModRef; + bool MayAlias = isModOrRefSet(AA.getModRefInfo(C, None)); bool NeedLift = false; if (Args.erase(C)) NeedLift = true; else if (MayAlias) { NeedLift = llvm::any_of(MemLocs, [C, &AA](const MemoryLocation &ML) { - return AA.getModRefInfo(C, ML); + return isModOrRefSet(AA.getModRefInfo(C, ML)); }); if (!NeedLift) NeedLift = llvm::any_of(CallSites, [C, &AA](const ImmutableCallSite &CS) { - return AA.getModRefInfo(C, CS); + return isModOrRefSet(AA.getModRefInfo(C, CS)); }); } @@ -558,18 +565,18 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P, if (MayAlias) { // Since LI is implicitly moved downwards past the lifted instructions, // none of them may modify its source. - if (AA.getModRefInfo(C, LoadLoc) & MRI_Mod) + if (isModSet(AA.getModRefInfo(C, LoadLoc))) return false; else if (auto CS = ImmutableCallSite(C)) { // If we can't lift this before P, it's game over. - if (AA.getModRefInfo(P, CS) != MRI_NoModRef) + if (isModOrRefSet(AA.getModRefInfo(P, CS))) return false; CallSites.push_back(CS); } else if (isa<LoadInst>(C) || isa<StoreInst>(C) || isa<VAArgInst>(C)) { // If we can't lift this before P, it's game over. auto ML = MemoryLocation::get(C); - if (AA.getModRefInfo(P, ML) != MRI_NoModRef) + if (isModOrRefSet(AA.getModRefInfo(P, ML))) return false; MemLocs.push_back(ML); @@ -624,7 +631,7 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { // of at the store position. Instruction *P = SI; for (auto &I : make_range(++LI->getIterator(), SI->getIterator())) { - if (AA.getModRefInfo(&I, LoadLoc) & MRI_Mod) { + if (isModSet(AA.getModRefInfo(&I, LoadLoc))) { P = &I; break; } @@ -695,7 +702,7 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { MemoryLocation StoreLoc = MemoryLocation::get(SI); for (BasicBlock::iterator I = --SI->getIterator(), E = C->getIterator(); I != E; --I) { - if (AA.getModRefInfo(&*I, StoreLoc) != MRI_NoModRef) { + if (isModOrRefSet(AA.getModRefInfo(&*I, StoreLoc))) { C = nullptr; break; } @@ -927,9 +934,9 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest, AliasAnalysis &AA = LookupAliasAnalysis(); ModRefInfo MR = AA.getModRefInfo(C, cpyDest, srcSize); // If necessary, perform additional analysis. 
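The recurring substitution in the MemCpyOpt hunks above and below is a move from comparing ModRefInfo values directly to querying them through predicate helpers. A minimal sketch of the idiom, assuming an `AliasAnalysis &AA`, an `Instruction *I`, and a `MemoryLocation Loc` are in scope:

```cpp
// Before: ModRefInfo treated as a raw enum / bitmask.
//   if (AA.getModRefInfo(I, Loc) != MRI_NoModRef) ... // any access to Loc
//   if (AA.getModRefInfo(I, Loc) & MRI_Mod) ...       // any write to Loc
// After: the same queries go through predicate helpers.
const ModRefInfo MR = AA.getModRefInfo(I, Loc);
if (isModOrRefSet(MR)) { /* I may read or write Loc */ }
if (isModSet(MR))      { /* I may write Loc */ }
```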
-  if (MR != MRI_NoModRef)
+  if (isModOrRefSet(MR))
     MR = AA.callCapturesBefore(C, cpyDest, srcSize, &DT);
-  if (MR != MRI_NoModRef)
+  if (isModOrRefSet(MR))
     return false;
 
   // We can't create address space casts here because we don't know if they're
diff --git a/lib/Transforms/Scalar/MergeICmps.cpp b/lib/Transforms/Scalar/MergeICmps.cpp
new file mode 100644
index 000000000000..9869a3fb96fa
--- /dev/null
+++ b/lib/Transforms/Scalar/MergeICmps.cpp
@@ -0,0 +1,650 @@
+//===- MergeICmps.cpp - Optimize chains of integer comparisons ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass turns chains of integer comparisons into memcmp (the memcmp is
+// later typically inlined as a chain of efficient hardware comparisons). This
+// typically benefits C++ member or non-member operator==().
+//
+// The basic idea is to replace a larger chain of integer comparisons loaded
+// from contiguous memory locations with a smaller chain of such integer
+// comparisons. The benefits are twofold:
+//  - There are fewer jumps, and therefore fewer opportunities for
+//    mispredictions and I-cache misses.
+//  - Code size is smaller, both because jumps are removed and because the
+//    encoding of a 2*n byte compare is smaller than that of two n-byte
+//    compares.
+
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <numeric>
+#include <utility>
+#include <vector>
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
+
+using namespace llvm;
+
+namespace {
+
+#define DEBUG_TYPE "mergeicmps"
+
+// A BCE atom.
+struct BCEAtom {
+  BCEAtom() : GEP(nullptr), LoadI(nullptr), Offset() {}
+
+  const Value *Base() const { return GEP ? GEP->getPointerOperand() : nullptr; }
+
+  bool operator<(const BCEAtom &O) const {
+    assert(Base() && "invalid atom");
+    assert(O.Base() && "invalid atom");
+    // Just ordering by (Base(), Offset) is sufficient. However, this would
+    // make the ordering depend on the addresses of the base values, which
+    // are not reproducible from run to run. To guarantee stability, we use
+    // the names of the values if they exist; we sort by:
+    // (Base.getName(), Base(), Offset).
+    const int NameCmp = Base()->getName().compare(O.Base()->getName());
+    if (NameCmp == 0) {
+      if (Base() == O.Base()) {
+        return Offset.slt(O.Offset);
+      }
+      return Base() < O.Base();
+    }
+    return NameCmp < 0;
+  }
+
+  GetElementPtrInst *GEP;
+  LoadInst *LoadI;
+  APInt Offset;
+};
+
+// If this value is a load from a constant offset w.r.t. a base address, and
+// there are no other users of the load or address, returns the base address
+// and the offset.
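To make the new pass concrete, here is an illustrative source-level example (the struct and operator are invented; they do not come from the pass or its tests). Two 4-byte compares over adjacent offsets against the same two bases satisfy the `IsContiguous()` condition defined further down (offset 0 + 32/8 == offset 4), so the chain can be fused:

```cpp
// Illustration only: a comparison chain that MergeICmps can turn into memcmp.
struct Point {
  int x; // offset 0, 4 bytes
  int y; // offset 4, 4 bytes
};

bool operator==(const Point &A, const Point &B) {
  // Compiled naively this is a chain of two icmp-eq blocks; after the pass
  // it behaves like: memcmp(&A, &B, 8) == 0.
  return A.x == B.x && A.y == B.y;
}
```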
+BCEAtom visitICmpLoadOperand(Value *const Val) {
+  BCEAtom Result;
+  if (auto *const LoadI = dyn_cast<LoadInst>(Val)) {
+    DEBUG(dbgs() << "load\n");
+    if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) {
+      DEBUG(dbgs() << "used outside of block\n");
+      return {};
+    }
+    if (LoadI->isVolatile()) {
+      DEBUG(dbgs() << "volatile\n");
+      return {};
+    }
+    Value *const Addr = LoadI->getOperand(0);
+    if (auto *const GEP = dyn_cast<GetElementPtrInst>(Addr)) {
+      DEBUG(dbgs() << "GEP\n");
+      if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) {
+        DEBUG(dbgs() << "used outside of block\n");
+        return {};
+      }
+      const auto &DL = GEP->getModule()->getDataLayout();
+      if (!isDereferenceablePointer(GEP, DL)) {
+        DEBUG(dbgs() << "not dereferenceable\n");
+        // We need to make sure that we can do the comparisons in any order,
+        // so we require memory to be unconditionally dereferenceable.
+        return {};
+      }
+      Result.Offset = APInt(DL.getPointerTypeSizeInBits(GEP->getType()), 0);
+      if (GEP->accumulateConstantOffset(DL, Result.Offset)) {
+        Result.GEP = GEP;
+        Result.LoadI = LoadI;
+      }
+    }
+  }
+  return Result;
+}
+
+// A basic block with a comparison between two BCE atoms.
+// Note: the terminology is misleading: the comparison is symmetric, so there
+// is no real {l/r}hs. What we want, though, is to have the same base on the
+// left (resp. right), so that we can detect consecutive loads. To ensure this
+// we put the smallest atom on the left.
+class BCECmpBlock {
+ public:
+  BCECmpBlock() {}
+
+  BCECmpBlock(BCEAtom L, BCEAtom R, int SizeBits)
+      : Lhs_(L), Rhs_(R), SizeBits_(SizeBits) {
+    if (Rhs_ < Lhs_) std::swap(Rhs_, Lhs_);
+  }
+
+  bool IsValid() const {
+    return Lhs_.Base() != nullptr && Rhs_.Base() != nullptr;
+  }
+
+  // Assert that the block is consistent: if valid, it should also have
+  // non-null members besides Lhs_ and Rhs_.
+  void AssertConsistent() const {
+    if (IsValid()) {
+      assert(BB);
+      assert(CmpI);
+      assert(BranchI);
+    }
+  }
+
+  const BCEAtom &Lhs() const { return Lhs_; }
+  const BCEAtom &Rhs() const { return Rhs_; }
+  int SizeBits() const { return SizeBits_; }
+
+  // Returns true if the block does other work besides the comparison.
+  bool doesOtherWork() const;
+
+  // The basic block where this comparison happens.
+  BasicBlock *BB = nullptr;
+  // The ICMP for this comparison.
+  ICmpInst *CmpI = nullptr;
+  // The terminating branch.
+  BranchInst *BranchI = nullptr;
+
+ private:
+  BCEAtom Lhs_;
+  BCEAtom Rhs_;
+  int SizeBits_ = 0;
+};
+
+bool BCECmpBlock::doesOtherWork() const {
+  AssertConsistent();
+  // TODO(courbet): Can we allow some other things? This is very conservative.
+  // We might be able to get away with anything that does not have any side
+  // effects outside of the basic block.
+  // Note: The GEPs and/or loads are not necessarily in the same block.
+  for (const Instruction &Inst : *BB) {
+    if (const auto *const GEP = dyn_cast<GetElementPtrInst>(&Inst)) {
+      if (!(Lhs_.GEP == GEP || Rhs_.GEP == GEP)) return true;
+    } else if (const auto *const L = dyn_cast<LoadInst>(&Inst)) {
+      if (!(Lhs_.LoadI == L || Rhs_.LoadI == L)) return true;
+    } else if (const auto *const C = dyn_cast<ICmpInst>(&Inst)) {
+      if (C != CmpI) return true;
+    } else if (const auto *const Br = dyn_cast<BranchInst>(&Inst)) {
+      if (Br != BranchI) return true;
+    } else {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Visit the given comparison. If this is a comparison between two valid
+// BCE atoms, returns the comparison.
+BCECmpBlock visitICmp(const ICmpInst *const CmpI,
+                      const ICmpInst::Predicate ExpectedPredicate) {
+  if (CmpI->getPredicate() == ExpectedPredicate) {
+    DEBUG(dbgs() << "cmp "
+                 << (ExpectedPredicate == ICmpInst::ICMP_EQ ? "eq" : "ne")
+                 << "\n");
+    auto Lhs = visitICmpLoadOperand(CmpI->getOperand(0));
+    if (!Lhs.Base()) return {};
+    auto Rhs = visitICmpLoadOperand(CmpI->getOperand(1));
+    if (!Rhs.Base()) return {};
+    return BCECmpBlock(std::move(Lhs), std::move(Rhs),
+                       CmpI->getOperand(0)->getType()->getScalarSizeInBits());
+  }
+  return {};
+}
+
+// Visit the given comparison block. If this is a comparison between two valid
+// BCE atoms, returns the comparison.
+BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block,
+                          const BasicBlock *const PhiBlock) {
+  if (Block->empty()) return {};
+  auto *const BranchI = dyn_cast<BranchInst>(Block->getTerminator());
+  if (!BranchI) return {};
+  DEBUG(dbgs() << "branch\n");
+  if (BranchI->isUnconditional()) {
+    // In this case, we expect an incoming value which is the result of the
+    // comparison. This is the last link in the chain of comparisons (note
+    // that this does not mean that this is the last incoming value, blocks
+    // can be reordered).
+    auto *const CmpI = dyn_cast<ICmpInst>(Val);
+    if (!CmpI) return {};
+    DEBUG(dbgs() << "icmp\n");
+    auto Result = visitICmp(CmpI, ICmpInst::ICMP_EQ);
+    Result.CmpI = CmpI;
+    Result.BranchI = BranchI;
+    return Result;
+  } else {
+    // In this case, we expect a constant incoming value (the comparison is
+    // chained).
+    const auto *const Const = dyn_cast<ConstantInt>(Val);
+    DEBUG(dbgs() << "const\n");
+    if (!Const->isZero()) return {};
+    DEBUG(dbgs() << "false\n");
+    auto *const CmpI = dyn_cast<ICmpInst>(BranchI->getCondition());
+    if (!CmpI) return {};
+    DEBUG(dbgs() << "icmp\n");
+    assert(BranchI->getNumSuccessors() == 2 && "expecting a cond branch");
+    BasicBlock *const FalseBlock = BranchI->getSuccessor(1);
+    auto Result = visitICmp(
+        CmpI, FalseBlock == PhiBlock ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE);
+    Result.CmpI = CmpI;
+    Result.BranchI = BranchI;
+    return Result;
+  }
+  return {};
+}
+
+// A chain of comparisons.
+class BCECmpChain {
+ public:
+  BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi);
+
+  int size() const { return Comparisons_.size(); }
+
+#ifdef MERGEICMPS_DOT_ON
+  void dump() const;
+#endif  // MERGEICMPS_DOT_ON
+
+  bool simplify(const TargetLibraryInfo *const TLI);
+
+ private:
+  static bool IsContiguous(const BCECmpBlock &First,
+                           const BCECmpBlock &Second) {
+    return First.Lhs().Base() == Second.Lhs().Base() &&
+           First.Rhs().Base() == Second.Rhs().Base() &&
+           First.Lhs().Offset + First.SizeBits() / 8 == Second.Lhs().Offset &&
+           First.Rhs().Offset + First.SizeBits() / 8 == Second.Rhs().Offset;
+  }
+
+  // Merges the given comparison blocks into one memcmp block and updates
+  // branches. Comparisons are assumed to be contiguous. If NextBBInChain is
+  // null, the merged block will link to the phi block.
+  static void mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
+                               BasicBlock *const NextBBInChain, PHINode &Phi,
+                               const TargetLibraryInfo *const TLI);
+
+  PHINode &Phi_;
+  std::vector<BCECmpBlock> Comparisons_;
+  // The original entry block (before sorting).
+  BasicBlock *EntryBlock_;
+};
+
+BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi)
+    : Phi_(Phi) {
+  // Now look inside blocks to check for BCE comparisons.
+  std::vector<BCECmpBlock> Comparisons;
+  for (BasicBlock *Block : Blocks) {
+    BCECmpBlock Comparison = visitCmpBlock(Phi.getIncomingValueForBlock(Block),
+                                           Block, Phi.getParent());
+    Comparison.BB = Block;
+    if (!Comparison.IsValid()) {
+      DEBUG(dbgs() << "skip: not a valid BCECmpBlock\n");
+      return;
+    }
+    if (Comparison.doesOtherWork()) {
+      DEBUG(dbgs() << "block does extra work besides compare\n");
+      if (Comparisons.empty()) {  // First block.
+        // TODO(courbet): The first block can do other things, and we should
+        // split them apart in a separate block before the comparison chain.
+        // Right now we just discard it and make the chain shorter.
+        DEBUG(dbgs()
+              << "ignoring first block that does extra work besides compare\n");
+        continue;
+      }
+      // TODO(courbet): Right now we abort the whole chain. We could be
+      // merging only the blocks that don't do other work and resume the
+      // chain from there. For example:
+      //  if (a[0] == b[0]) {  // bb1
+      //    if (a[1] == b[1]) {  // bb2
+      //      some_value = 3; //bb3
+      //      if (a[2] == b[2]) { //bb3
+      //        do a ton of stuff  //bb4
+      //      }
+      //    }
+      //  }
+      //
+      // This is:
+      //
+      //  bb1 --eq--> bb2 --eq--> bb3* -eq--> bb4 --+
+      //   \            \           \          \
+      //    ne           ne          ne         \
+      //     \            \           \          v
+      //      +------------+-----------+----------> bb_phi
+      //
+      // We can only merge the first two comparisons, because bb3* does
+      // "other work" (setting some_value to 3).
+      // We could still merge bb1 and bb2 though.
+      return;
+    }
+    DEBUG(dbgs() << "*Found cmp of " << Comparison.SizeBits()
+                 << " bits between " << Comparison.Lhs().Base() << " + "
+                 << Comparison.Lhs().Offset << " and "
+                 << Comparison.Rhs().Base() << " + " << Comparison.Rhs().Offset
+                 << "\n");
+    DEBUG(dbgs() << "\n");
+    Comparisons.push_back(Comparison);
+  }
+  EntryBlock_ = Comparisons[0].BB;
+  Comparisons_ = std::move(Comparisons);
+#ifdef MERGEICMPS_DOT_ON
+  errs() << "BEFORE REORDERING:\n\n";
+  dump();
+#endif  // MERGEICMPS_DOT_ON
+  // Reorder blocks by LHS. We can do that without changing the
+  // semantics because we are only accessing dereferenceable memory.
+  std::sort(Comparisons_.begin(), Comparisons_.end(),
+            [](const BCECmpBlock &a, const BCECmpBlock &b) {
+              return a.Lhs() < b.Lhs();
+            });
+#ifdef MERGEICMPS_DOT_ON
+  errs() << "AFTER REORDERING:\n\n";
+  dump();
+#endif  // MERGEICMPS_DOT_ON
+}
+
+#ifdef MERGEICMPS_DOT_ON
+void BCECmpChain::dump() const {
+  errs() << "digraph dag {\n";
+  errs() << " graph [bgcolor=transparent];\n";
+  errs() << " node [color=black,style=filled,fillcolor=lightyellow];\n";
+  errs() << " edge [color=black];\n";
+  for (size_t I = 0; I < Comparisons_.size(); ++I) {
+    const auto &Comparison = Comparisons_[I];
+    errs() << " \"" << I << "\" [label=\"%"
+           << Comparison.Lhs().Base()->getName() << " + "
+           << Comparison.Lhs().Offset << " == %"
+           << Comparison.Rhs().Base()->getName() << " + "
+           << Comparison.Rhs().Offset << " (" << (Comparison.SizeBits() / 8)
+           << " bytes)\"];\n";
+    const Value *const Val = Phi_.getIncomingValueForBlock(Comparison.BB);
+    if (I > 0) errs() << " \"" << (I - 1) << "\" -> \"" << I << "\";\n";
+    errs() << " \"" << I << "\" -> \"Phi\" [label=\"" << *Val << "\"];\n";
+  }
+  errs() << " \"Phi\" [label=\"Phi\"];\n";
+  errs() << "}\n\n";
}
+#endif  // MERGEICMPS_DOT_ON
+
+bool BCECmpChain::simplify(const TargetLibraryInfo *const TLI) {
+  // First pass to check if there is at least one merge. If not, we don't do
+  // anything and we keep analysis passes intact.
+ { + bool AtLeastOneMerged = false; + for (size_t I = 1; I < Comparisons_.size(); ++I) { + if (IsContiguous(Comparisons_[I - 1], Comparisons_[I])) { + AtLeastOneMerged = true; + break; + } + } + if (!AtLeastOneMerged) return false; + } + + // Remove phi references to comparison blocks, they will be rebuilt as we + // merge the blocks. + for (const auto &Comparison : Comparisons_) { + Phi_.removeIncomingValue(Comparison.BB, false); + } + + // Point the predecessors of the chain to the first comparison block (which is + // the new entry point). + if (EntryBlock_ != Comparisons_[0].BB) + EntryBlock_->replaceAllUsesWith(Comparisons_[0].BB); + + // Effectively merge blocks. + int NumMerged = 1; + for (size_t I = 1; I < Comparisons_.size(); ++I) { + if (IsContiguous(Comparisons_[I - 1], Comparisons_[I])) { + ++NumMerged; + } else { + // Merge all previous comparisons and start a new merge block. + mergeComparisons( + makeArrayRef(Comparisons_).slice(I - NumMerged, NumMerged), + Comparisons_[I].BB, Phi_, TLI); + NumMerged = 1; + } + } + mergeComparisons(makeArrayRef(Comparisons_) + .slice(Comparisons_.size() - NumMerged, NumMerged), + nullptr, Phi_, TLI); + + return true; +} + +void BCECmpChain::mergeComparisons(ArrayRef<BCECmpBlock> Comparisons, + BasicBlock *const NextBBInChain, + PHINode &Phi, + const TargetLibraryInfo *const TLI) { + assert(!Comparisons.empty()); + const auto &FirstComparison = *Comparisons.begin(); + BasicBlock *const BB = FirstComparison.BB; + LLVMContext &Context = BB->getContext(); + + if (Comparisons.size() >= 2) { + DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons\n"); + const auto TotalSize = + std::accumulate(Comparisons.begin(), Comparisons.end(), 0, + [](int Size, const BCECmpBlock &C) { + return Size + C.SizeBits(); + }) / + 8; + + // Incoming edges do not need to be updated, and both GEPs are already + // computing the right address, we just need to: + // - replace the two loads and the icmp with the memcmp + // - update the branch + // - update the incoming values in the phi. + FirstComparison.BranchI->eraseFromParent(); + FirstComparison.CmpI->eraseFromParent(); + FirstComparison.Lhs().LoadI->eraseFromParent(); + FirstComparison.Rhs().LoadI->eraseFromParent(); + + IRBuilder<> Builder(BB); + const auto &DL = Phi.getModule()->getDataLayout(); + Value *const MemCmpCall = emitMemCmp( + FirstComparison.Lhs().GEP, FirstComparison.Rhs().GEP, ConstantInt::get(DL.getIntPtrType(Context), TotalSize), + Builder, DL, TLI); + Value *const MemCmpIsZero = Builder.CreateICmpEQ( + MemCmpCall, ConstantInt::get(Type::getInt32Ty(Context), 0)); + + // Add a branch to the next basic block in the chain. + if (NextBBInChain) { + Builder.CreateCondBr(MemCmpIsZero, NextBBInChain, Phi.getParent()); + Phi.addIncoming(ConstantInt::getFalse(Context), BB); + } else { + Builder.CreateBr(Phi.getParent()); + Phi.addIncoming(MemCmpIsZero, BB); + } + + // Delete merged blocks. + for (size_t I = 1; I < Comparisons.size(); ++I) { + BasicBlock *CBB = Comparisons[I].BB; + CBB->replaceAllUsesWith(BB); + CBB->eraseFromParent(); + } + } else { + assert(Comparisons.size() == 1); + // There are no blocks to merge, but we still need to update the branches. + DEBUG(dbgs() << "Only one comparison, updating branches\n"); + if (NextBBInChain) { + if (FirstComparison.BranchI->isConditional()) { + DEBUG(dbgs() << "conditional -> conditional\n"); + // Just update the "true" target, the "false" target should already be + // the phi block. 
+      assert(FirstComparison.BranchI->getSuccessor(1) == Phi.getParent());
+      FirstComparison.BranchI->setSuccessor(0, NextBBInChain);
+      Phi.addIncoming(ConstantInt::getFalse(Context), BB);
+    } else {
+      DEBUG(dbgs() << "unconditional -> conditional\n");
+      // Replace the unconditional branch by a conditional one.
+      FirstComparison.BranchI->eraseFromParent();
+      IRBuilder<> Builder(BB);
+      Builder.CreateCondBr(FirstComparison.CmpI, NextBBInChain,
+                           Phi.getParent());
+      Phi.addIncoming(FirstComparison.CmpI, BB);
+    }
+  } else {
+    if (FirstComparison.BranchI->isConditional()) {
+      DEBUG(dbgs() << "conditional -> unconditional\n");
+      // Replace the conditional branch by an unconditional one.
+      FirstComparison.BranchI->eraseFromParent();
+      IRBuilder<> Builder(BB);
+      Builder.CreateBr(Phi.getParent());
+      Phi.addIncoming(FirstComparison.CmpI, BB);
+    } else {
+      DEBUG(dbgs() << "unconditional -> unconditional\n");
+      Phi.addIncoming(FirstComparison.CmpI, BB);
+    }
+  }
+}
+
+std::vector<BasicBlock *> getOrderedBlocks(PHINode &Phi,
+                                           BasicBlock *const LastBlock,
+                                           int NumBlocks) {
+  // Walk up from the last block to find other blocks.
+  std::vector<BasicBlock *> Blocks(NumBlocks);
+  BasicBlock *CurBlock = LastBlock;
+  for (int BlockIndex = NumBlocks - 1; BlockIndex > 0; --BlockIndex) {
+    if (CurBlock->hasAddressTaken()) {
+      // Somebody is jumping to the block through an address; all bets are
+      // off.
+      DEBUG(dbgs() << "skip: block " << BlockIndex
+                   << " has its address taken\n");
+      return {};
+    }
+    Blocks[BlockIndex] = CurBlock;
+    auto *SinglePredecessor = CurBlock->getSinglePredecessor();
+    if (!SinglePredecessor) {
+      // The block has two or more predecessors.
+      DEBUG(dbgs() << "skip: block " << BlockIndex
+                   << " has two or more predecessors\n");
+      return {};
+    }
+    if (Phi.getBasicBlockIndex(SinglePredecessor) < 0) {
+      // The block does not link back to the phi.
+      DEBUG(dbgs() << "skip: block " << BlockIndex
+                   << " does not link back to the phi\n");
+      return {};
+    }
+    CurBlock = SinglePredecessor;
+  }
+  Blocks[0] = CurBlock;
+  return Blocks;
+}
+
+bool processPhi(PHINode &Phi, const TargetLibraryInfo *const TLI) {
+  DEBUG(dbgs() << "processPhi()\n");
+  if (Phi.getNumIncomingValues() <= 1) {
+    DEBUG(dbgs() << "skip: only one incoming value in phi\n");
+    return false;
+  }
+  // We are looking for something that has the following structure:
+  //  bb1 --eq--> bb2 --eq--> bb3 --eq--> bb4 --+
+  //   \            \           \          \
+  //    ne           ne          ne         \
+  //     \            \           \          v
+  //      +------------+-----------+----------> bb_phi
+  //
+  //  - The last basic block (bb4 here) must branch unconditionally to bb_phi.
+  //    It's the only block that contributes a non-constant value to the Phi.
+  //  - All other blocks (bb1, bb2, bb3) must have exactly two successors, one
+  //    of them being the phi block.
+  //  - All intermediate blocks (bb2, bb3) must have only one predecessor.
+  //  - Blocks cannot do other work besides the comparison; see doesOtherWork().
+
+  // The blocks are not necessarily ordered in the phi, so we start from the
+  // last block and reconstruct the order.
+  BasicBlock *LastBlock = nullptr;
+  for (unsigned I = 0; I < Phi.getNumIncomingValues(); ++I) {
+    if (isa<ConstantInt>(Phi.getIncomingValue(I))) continue;
+    if (LastBlock) {
+      // There are several non-constant values.
+      DEBUG(dbgs() << "skip: several non-constant values\n");
+      return false;
+    }
+    LastBlock = Phi.getIncomingBlock(I);
+  }
+  if (!LastBlock) {
+    // There is no non-constant block.
+ DEBUG(dbgs() << "skip: no non-constant block\n"); + return false; + } + if (LastBlock->getSingleSuccessor() != Phi.getParent()) { + DEBUG(dbgs() << "skip: last block non-phi successor\n"); + return false; + } + + const auto Blocks = + getOrderedBlocks(Phi, LastBlock, Phi.getNumIncomingValues()); + if (Blocks.empty()) return false; + BCECmpChain CmpChain(Blocks, Phi); + + if (CmpChain.size() < 2) { + DEBUG(dbgs() << "skip: only one compare block\n"); + return false; + } + + return CmpChain.simplify(TLI); +} + +class MergeICmps : public FunctionPass { + public: + static char ID; + + MergeICmps() : FunctionPass(ID) { + initializeMergeICmpsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) return false; + const auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + auto PA = runImpl(F, &TLI, &TTI); + return !PA.areAllPreserved(); + } + + private: + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + } + + PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI); +}; + +PreservedAnalyses MergeICmps::runImpl(Function &F, const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI) { + DEBUG(dbgs() << "MergeICmpsPass: " << F.getName() << "\n"); + + // We only try merging comparisons if the target wants to expand memcmp later. + // The rationale is to avoid turning small chains into memcmp calls. + if (!TTI->enableMemCmpExpansion(true)) return PreservedAnalyses::all(); + + bool MadeChange = false; + + for (auto BBIt = ++F.begin(); BBIt != F.end(); ++BBIt) { + // A Phi operation is always first in a basic block. 
+ if (auto *const Phi = dyn_cast<PHINode>(&*BBIt->begin())) + MadeChange |= processPhi(*Phi, TLI); + } + + if (MadeChange) return PreservedAnalyses::none(); + return PreservedAnalyses::all(); +} + +} // namespace + +char MergeICmps::ID = 0; +INITIALIZE_PASS_BEGIN(MergeICmps, "mergeicmps", + "Merge contiguous icmps into a memcmp", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END(MergeICmps, "mergeicmps", + "Merge contiguous icmps into a memcmp", false, false) + +Pass *llvm::createMergeICmpsPass() { return new MergeICmps(); } diff --git a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp index 6727cf0179c1..f2f615cb9b0f 100644 --- a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp +++ b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp @@ -80,11 +80,9 @@ #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Loads.h" -#include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Metadata.h" -#include "llvm/IR/PatternMatch.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" @@ -195,7 +193,7 @@ bool MergedLoadStoreMotion::isStoreSinkBarrierInRange(const Instruction &Start, make_range(Start.getIterator(), End.getIterator())) if (Inst.mayThrow()) return true; - return AA->canInstructionRangeModRef(Start, End, Loc, MRI_ModRef); + return AA->canInstructionRangeModRef(Start, End, Loc, ModRefInfo::ModRef); } /// diff --git a/lib/Transforms/Scalar/NaryReassociate.cpp b/lib/Transforms/Scalar/NaryReassociate.cpp index d0bfe3603897..b026c8d692c3 100644 --- a/lib/Transforms/Scalar/NaryReassociate.cpp +++ b/lib/Transforms/Scalar/NaryReassociate.cpp @@ -77,19 +77,45 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/NaryReassociate.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" +#include <cassert> +#include <cstdint> + using namespace llvm; using namespace PatternMatch; #define DEBUG_TYPE "nary-reassociate" namespace { + class NaryReassociateLegacyPass : public FunctionPass { public: static char ID; @@ -101,6 +127,7 @@ public: bool doInitialization(Module &M) override { return false; } + bool runOnFunction(Function &F) override; void getAnalysisUsage(AnalysisUsage 
&AU) const override { @@ -118,9 +145,11 @@ public: private: NaryReassociatePass Impl; }; -} // anonymous namespace + +} // end anonymous namespace char NaryReassociateLegacyPass::ID = 0; + INITIALIZE_PASS_BEGIN(NaryReassociateLegacyPass, "nary-reassociate", "Nary reassociation", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp index 8ac10348eb77..9ebf2d769356 100644 --- a/lib/Transforms/Scalar/NewGVN.cpp +++ b/lib/Transforms/Scalar/NewGVN.cpp @@ -1,4 +1,4 @@ -//===---- NewGVN.cpp - Global Value Numbering Pass --------------*- C++ -*-===// +//===- NewGVN.cpp - Global Value Numbering Pass ---------------------------===// // // The LLVM Compiler Infrastructure // @@ -6,6 +6,7 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// +// /// \file /// This file implements the new LLVM's Global Value Numbering pass. /// GVN partitions values computed by a function into congruence classes. @@ -48,59 +49,81 @@ /// published algorithms are O(Instructions). Instead, we use a technique that /// is O(number of operations with the same value number), enabling us to skip /// trying to eliminate things that have unique value numbers. +// //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/NewGVN.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/Hashing.h" -#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/PostOrderIterator.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/SparseBitVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/TinyPtrVector.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/CFG.h" #include "llvm/Analysis/CFGPrinter.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/MemoryBuiltins.h" -#include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" -#include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Metadata.h" -#include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" #include "llvm/Support/Allocator.h" +#include "llvm/Support/ArrayRecycler.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/DebugCounter.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/PointerLikeTypeTraits.h" 
+#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVNExpression.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PredicateInfo.h" #include "llvm/Transforms/Utils/VNCoercion.h" -#include <numeric> -#include <unordered_map> +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <iterator> +#include <map> +#include <memory> +#include <set> +#include <string> +#include <tuple> #include <utility> #include <vector> + using namespace llvm; -using namespace PatternMatch; using namespace llvm::GVNExpression; using namespace llvm::VNCoercion; + #define DEBUG_TYPE "newgvn" STATISTIC(NumGVNInstrDeleted, "Number of instructions deleted"); @@ -118,15 +141,19 @@ STATISTIC(NumGVNPHIOfOpsCreated, "Number of PHI of ops created"); STATISTIC(NumGVNPHIOfOpsEliminations, "Number of things eliminated using PHI of ops"); DEBUG_COUNTER(VNCounter, "newgvn-vn", - "Controls which instructions are value numbered") + "Controls which instructions are value numbered"); DEBUG_COUNTER(PHIOfOpsCounter, "newgvn-phi", - "Controls which instructions we create phi of ops for") + "Controls which instructions we create phi of ops for"); // Currently store defining access refinement is too slow due to basicaa being // egregiously slow. This flag lets us keep it working while we work on this // issue. static cl::opt<bool> EnableStoreRefinement("enable-store-refinement", cl::init(false), cl::Hidden); +/// Currently, the generation "phi of ops" can result in correctness issues. +static cl::opt<bool> EnablePhiOfOps("enable-phi-of-ops", cl::init(true), + cl::Hidden); + //===----------------------------------------------------------------------===// // GVN Pass //===----------------------------------------------------------------------===// @@ -134,6 +161,7 @@ static cl::opt<bool> EnableStoreRefinement("enable-store-refinement", // Anchor methods. namespace llvm { namespace GVNExpression { + Expression::~Expression() = default; BasicExpression::~BasicExpression() = default; CallExpression::~CallExpression() = default; @@ -141,8 +169,11 @@ LoadExpression::~LoadExpression() = default; StoreExpression::~StoreExpression() = default; AggregateValueExpression::~AggregateValueExpression() = default; PHIExpression::~PHIExpression() = default; -} -} + +} // end namespace GVNExpression +} // end namespace llvm + +namespace { // Tarjan's SCC finding algorithm with Nuutila's improvements // SCCIterator is actually fairly complex for the simple thing we want. @@ -153,7 +184,6 @@ PHIExpression::~PHIExpression() = default; // instructions, // not generic values (arguments, etc). struct TarjanSCC { - TarjanSCC() : Components(1) {} void Start(const Instruction *Start) { @@ -208,15 +238,19 @@ private: Stack.push_back(I); } } + unsigned int DFSNum = 1; SmallPtrSet<const Value *, 8> InComponent; DenseMap<const Value *, unsigned int> Root; SmallVector<const Value *, 8> Stack; + // Store the components as vector of ptr sets, because we need the topo order // of SCC's, but not individual member order SmallVector<SmallPtrSet<const Value *, 8>, 8> Components; + DenseMap<const Value *, unsigned> ValueToComponent; }; + // Congruence classes represent the set of expressions/instructions // that are all the same *during some scope in the function*. 
// That is, because of the way we perform equality propagation, and @@ -265,7 +299,9 @@ public: explicit CongruenceClass(unsigned ID) : ID(ID) {} CongruenceClass(unsigned ID, Value *Leader, const Expression *E) : ID(ID), RepLeader(Leader), DefiningExpr(E) {} + unsigned getID() const { return ID; } + // True if this class has no members left. This is mainly used for assertion // purposes, and for skipping empty classes. bool isDead() const { @@ -273,6 +309,7 @@ public: // perspective, it's really dead. return empty() && memory_empty(); } + // Leader functions Value *getLeader() const { return RepLeader; } void setLeader(Value *Leader) { RepLeader = Leader; } @@ -280,7 +317,6 @@ public: return NextLeader; } void resetNextLeader() { NextLeader = {nullptr, ~0}; } - void addPossibleNextLeader(std::pair<Value *, unsigned int> LeaderPair) { if (LeaderPair.second < NextLeader.second) NextLeader = LeaderPair; @@ -315,6 +351,7 @@ public: iterator_range<MemoryMemberSet::const_iterator> memory() const { return make_range(memory_begin(), memory_end()); } + void memory_insert(const MemoryMemberType *M) { MemoryMembers.insert(M); } void memory_erase(const MemoryMemberType *M) { MemoryMembers.erase(M); } @@ -354,34 +391,48 @@ public: private: unsigned ID; + // Representative leader. Value *RepLeader = nullptr; + // The most dominating leader after our current leader, because the member set // is not sorted and is expensive to keep sorted all the time. std::pair<Value *, unsigned int> NextLeader = {nullptr, ~0U}; + // If this is represented by a store, the value of the store. Value *RepStoredValue = nullptr; + // If this class contains MemoryDefs or MemoryPhis, this is the leading memory // access. const MemoryAccess *RepMemoryAccess = nullptr; + // Defining Expression. const Expression *DefiningExpr = nullptr; + // Actual members of this class. MemberSet Members; + // This is the set of MemoryPhis that exist in the class. MemoryDefs and // MemoryUses have real instructions representing them, so we only need to // track MemoryPhis here. MemoryMemberSet MemoryMembers; + // Number of stores in this congruence class. // This is used so we can detect store equivalence changes properly. int StoreCount = 0; }; +} // end anonymous namespace + namespace llvm { + struct ExactEqualsExpression { const Expression &E; + explicit ExactEqualsExpression(const Expression &E) : E(E) {} + hash_code getComputedHash() const { return E.getComputedHash(); } + bool operator==(const Expression &Other) const { return E.exactlyEquals(Other); } @@ -393,17 +444,21 @@ template <> struct DenseMapInfo<const Expression *> { Val <<= PointerLikeTypeTraits<const Expression *>::NumLowBitsAvailable; return reinterpret_cast<const Expression *>(Val); } + static const Expression *getTombstoneKey() { auto Val = static_cast<uintptr_t>(~1U); Val <<= PointerLikeTypeTraits<const Expression *>::NumLowBitsAvailable; return reinterpret_cast<const Expression *>(Val); } + static unsigned getHashValue(const Expression *E) { return E->getComputedHash(); } + static unsigned getHashValue(const ExactEqualsExpression &E) { return E.getComputedHash(); } + static bool isEqual(const ExactEqualsExpression &LHS, const Expression *RHS) { if (RHS == getTombstoneKey() || RHS == getEmptyKey()) return false; @@ -425,9 +480,11 @@ template <> struct DenseMapInfo<const Expression *> { return *LHS == *RHS; } }; + } // end namespace llvm namespace { + class NewGVN { Function &F; DominatorTree *DT; @@ -464,16 +521,22 @@ class NewGVN { // Value Mappings. 
DenseMap<Value *, CongruenceClass *> ValueToClass; DenseMap<Value *, const Expression *> ValueToExpression; + // Value PHI handling, used to make equivalence between phi(op, op) and // op(phi, phi). // These mappings just store various data that would normally be part of the // IR. - DenseSet<const Instruction *> PHINodeUses; + SmallPtrSet<const Instruction *, 8> PHINodeUses; + + DenseMap<const Value *, bool> OpSafeForPHIOfOps; + // Map a temporary instruction we created to a parent block. DenseMap<const Value *, BasicBlock *> TempToBlock; - // Map between the temporary phis we created and the real instructions they - // are known equivalent to. + + // Map between the already in-program instructions and the temporary phis we + // created that they are known equivalent to. DenseMap<const Value *, PHINode *> RealToTemp; + // In order to know when we should re-process instructions that have // phi-of-ops, we track the set of expressions that they needed as // leaders. When we discover new leaders for those expressions, we process the @@ -485,19 +548,32 @@ class NewGVN { mutable DenseMap<const Value *, SmallPtrSet<Value *, 2>> AdditionalUsers; DenseMap<const Expression *, SmallPtrSet<Instruction *, 2>> ExpressionToPhiOfOps; - // Map from basic block to the temporary operations we created - DenseMap<const BasicBlock *, SmallVector<PHINode *, 8>> PHIOfOpsPHIs; + // Map from temporary operation to MemoryAccess. DenseMap<const Instruction *, MemoryUseOrDef *> TempToMemory; + // Set of all temporary instructions we created. + // Note: This will include instructions that were just created during value + // numbering. The way to test if something is using them is to check + // RealToTemp. DenseSet<Instruction *> AllTempInstructions; + // This is the set of instructions to revisit on a reachability change. At + // the end of the main iteration loop it will contain at least all the phi of + // ops instructions that will be changed to phis, as well as regular phis. + // During the iteration loop, it may contain other things, such as phi of ops + // instructions that used edge reachability to reach a result, and so need to + // be revisited when the edge changes, independent of whether the phi they + // depended on changes. + DenseMap<BasicBlock *, SparseBitVector<>> RevisitOnReachabilityChange; + // Mapping from predicate info we used to the instructions we used it with. // In order to correctly ensure propagation, we must keep track of what // comparisons we used, so that when the values of the comparisons change, we // propagate the information to the places we used the comparison. mutable DenseMap<const Value *, SmallPtrSet<Instruction *, 2>> PredicateToUsers; + // the same reasoning as PredicateToUsers. When we skip MemoryAccesses for // stores, we no longer can rely solely on the def-use chains of MemorySSA. mutable DenseMap<const MemoryAccess *, SmallPtrSet<MemoryAccess *, 2>> @@ -525,6 +601,7 @@ class NewGVN { enum InstCycleState { ICS_Unknown, ICS_CycleFree, ICS_Cycle }; mutable DenseMap<const Instruction *, InstCycleState> InstCycleState; + // Expression to class mapping. 
using ExpressionClassMap = DenseMap<const Expression *, CongruenceClass *>; ExpressionClassMap ExpressionToClass; @@ -581,6 +658,7 @@ public: : F(F), DT(DT), TLI(TLI), AA(AA), MSSA(MSSA), DL(DL), PredInfo(make_unique<PredicateInfo>(F, *DT, *AC)), SQ(DL, TLI, DT, AC) { } + bool runGVN(); private: @@ -588,7 +666,13 @@ private: const Expression *createExpression(Instruction *) const; const Expression *createBinaryExpression(unsigned, Type *, Value *, Value *, Instruction *) const; - PHIExpression *createPHIExpression(Instruction *, bool &HasBackEdge, + + // Our canonical form for phi arguments is a pair of incoming value, incoming + // basic block. + using ValPair = std::pair<Value *, BasicBlock *>; + + PHIExpression *createPHIExpression(ArrayRef<ValPair>, const Instruction *, + BasicBlock *, bool &HasBackEdge, bool &OriginalOpsConstant) const; const DeadExpression *createDeadExpression() const; const VariableExpression *createVariableExpression(Value *) const; @@ -617,6 +701,7 @@ private: CC->setMemoryLeader(MA); return CC; } + CongruenceClass *ensureLeaderOfMemoryClass(MemoryAccess *MA) { auto *CC = getMemoryClass(MA); if (CC->getMemoryLeader() != MA) @@ -630,10 +715,21 @@ private: ValueToClass[Member] = CClass; return CClass; } + void initializeCongruenceClasses(Function &F); - const Expression *makePossiblePhiOfOps(Instruction *, + const Expression *makePossiblePHIOfOps(Instruction *, SmallPtrSetImpl<Value *> &); + Value *findLeaderForInst(Instruction *ValueOp, + SmallPtrSetImpl<Value *> &Visited, + MemoryAccess *MemAccess, Instruction *OrigInst, + BasicBlock *PredBB); + bool OpIsSafeForPHIOfOpsHelper(Value *V, const BasicBlock *PHIBlock, + SmallPtrSetImpl<const Value *> &Visited, + SmallVectorImpl<Instruction *> &Worklist); + bool OpIsSafeForPHIOfOps(Value *Op, const BasicBlock *PHIBlock, + SmallPtrSetImpl<const Value *> &); void addPhiOfOps(PHINode *Op, BasicBlock *BB, Instruction *ExistingValue); + void removePhiOfOps(Instruction *I, PHINode *PHITemp); // Value number an Instruction or MemoryPhi. void valueNumberMemoryPhi(MemoryPhi *); @@ -650,7 +746,10 @@ private: const Expression *performSymbolicLoadEvaluation(Instruction *) const; const Expression *performSymbolicStoreEvaluation(Instruction *) const; const Expression *performSymbolicCallEvaluation(Instruction *) const; - const Expression *performSymbolicPHIEvaluation(Instruction *) const; + void sortPHIOps(MutableArrayRef<ValPair> Ops) const; + const Expression *performSymbolicPHIEvaluation(ArrayRef<ValPair>, + Instruction *I, + BasicBlock *PHIBlock) const; const Expression *performSymbolicAggrValueEvaluation(Instruction *) const; const Expression *performSymbolicCmpEvaluation(Instruction *) const; const Expression *performSymbolicPredicateInfoEvaluation(Instruction *) const; @@ -658,6 +757,7 @@ private: // Congruence finding. 
bool someEquivalentDominates(const Instruction *, const Instruction *) const; Value *lookupOperandLeader(Value *) const; + CongruenceClass *getClassForExpression(const Expression *E) const; void performCongruenceFinding(Instruction *, const Expression *); void moveValueToNewCongruenceClass(Instruction *, const Expression *, CongruenceClass *, CongruenceClass *); @@ -692,10 +792,11 @@ private: void replaceInstruction(Instruction *, Value *); void markInstructionForDeletion(Instruction *); void deleteInstructionsInBlock(BasicBlock *); - Value *findPhiOfOpsLeader(const Expression *E, const BasicBlock *BB) const; + Value *findPHIOfOpsLeader(const Expression *, const Instruction *, + const BasicBlock *) const; // New instruction creation. - void handleNewInstruction(Instruction *){}; + void handleNewInstruction(Instruction *) {} // Various instruction touch utilities template <typename Map, typename KeyType, typename Func> @@ -731,6 +832,7 @@ private: MemoryAccess *getDefiningAccess(const MemoryAccess *) const; MemoryPhi *getMemoryAccess(const BasicBlock *) const; template <class T, class Range> T *getMinDFSOfRange(const Range &) const; + unsigned InstrToDFSNum(const Value *V) const { assert(isa<Instruction>(V) && "This should not be used for MemoryAccesses"); return InstrDFS.lookup(V); @@ -739,7 +841,9 @@ private: unsigned InstrToDFSNum(const MemoryAccess *MA) const { return MemoryToDFSNum(MA); } + Value *InstrFromDFSNum(unsigned DFSNum) { return DFSToInstr[DFSNum]; } + // Given a MemoryAccess, return the relevant instruction DFS number. Note: // This deliberately takes a value so it can be used with Use's, which will // auto-convert to Value's but not to MemoryAccess's. @@ -750,12 +854,15 @@ private: ? InstrToDFSNum(cast<MemoryUseOrDef>(MA)->getMemoryInst()) : InstrDFS.lookup(MA); } + bool isCycleFree(const Instruction *) const; bool isBackedge(BasicBlock *From, BasicBlock *To) const; + // Debug counter info. When verifying, we have to reset the value numbering // debug counter to the same state it started in to get the same results. std::pair<int, int> StartingVNCounter; }; + } // end anonymous namespace template <typename T> @@ -781,11 +888,9 @@ bool StoreExpression::equals(const Expression &Other) const { // Determine if the edge From->To is a backedge bool NewGVN::isBackedge(BasicBlock *From, BasicBlock *To) const { - if (From == To) - return true; - auto *FromDTN = DT->getNode(From); - auto *ToDTN = DT->getNode(To); - return RPOOrdering.lookup(FromDTN) >= RPOOrdering.lookup(ToDTN); + return From == To || + RPOOrdering.lookup(DT->getNode(From)) >= + RPOOrdering.lookup(DT->getNode(To)); } #ifndef NDEBUG @@ -830,51 +935,77 @@ void NewGVN::deleteExpression(const Expression *E) const { const_cast<BasicExpression *>(BE)->deallocateOperands(ArgRecycler); ExpressionAllocator.Deallocate(E); } -PHIExpression *NewGVN::createPHIExpression(Instruction *I, bool &HasBackedge, + +// If V is a predicateinfo copy, get the thing it is a copy of. +static Value *getCopyOf(const Value *V) { + if (auto *II = dyn_cast<IntrinsicInst>(V)) + if (II->getIntrinsicID() == Intrinsic::ssa_copy) + return II->getOperand(0); + return nullptr; +} + +// Return true if V is really PN, even accounting for predicateinfo copies. +static bool isCopyOfPHI(const Value *V, const PHINode *PN) { + return V == PN || getCopyOf(V) == PN; +} + +static bool isCopyOfAPHI(const Value *V) { + auto *CO = getCopyOf(V); + return CO && isa<PHINode>(CO); +} + +// Sort PHI Operands into a canonical order. What we use here is an RPO +// order. 
The BlockInstRange numbers are generated in an RPO walk of the basic +// blocks. +void NewGVN::sortPHIOps(MutableArrayRef<ValPair> Ops) const { + std::sort(Ops.begin(), Ops.end(), [&](const ValPair &P1, const ValPair &P2) { + return BlockInstRange.lookup(P1.second).first < + BlockInstRange.lookup(P2.second).first; + }); +} + +// Return true if V is a value that will always be available (IE can +// be placed anywhere) in the function. We don't do globals here +// because they are often worse to put in place. +static bool alwaysAvailable(Value *V) { + return isa<Constant>(V) || isa<Argument>(V); +} + +// Create a PHIExpression from an array of {incoming edge, value} pairs. I is +// the original instruction we are creating a PHIExpression for (but may not be +// a phi node). We require, as an invariant, that all the PHIOperands in the +// same block are sorted the same way. sortPHIOps will sort them into a +// canonical order. +PHIExpression *NewGVN::createPHIExpression(ArrayRef<ValPair> PHIOperands, + const Instruction *I, + BasicBlock *PHIBlock, + bool &HasBackedge, bool &OriginalOpsConstant) const { - BasicBlock *PHIBlock = getBlockForValue(I); - auto *PN = cast<PHINode>(I); - auto *E = - new (ExpressionAllocator) PHIExpression(PN->getNumOperands(), PHIBlock); + unsigned NumOps = PHIOperands.size(); + auto *E = new (ExpressionAllocator) PHIExpression(NumOps, PHIBlock); E->allocateOperands(ArgRecycler, ExpressionAllocator); - E->setType(I->getType()); - E->setOpcode(I->getOpcode()); - - // NewGVN assumes the operands of a PHI node are in a consistent order across - // PHIs. LLVM doesn't seem to always guarantee this. While we need to fix - // this in LLVM at some point we don't want GVN to find wrong congruences. - // Therefore, here we sort uses in predecessor order. - // We're sorting the values by pointer. In theory this might be cause of - // non-determinism, but here we don't rely on the ordering for anything - // significant, e.g. we don't create new instructions based on it so we're - // fine. - SmallVector<const Use *, 4> PHIOperands; - for (const Use &U : PN->operands()) - PHIOperands.push_back(&U); - std::sort(PHIOperands.begin(), PHIOperands.end(), - [&](const Use *U1, const Use *U2) { - return PN->getIncomingBlock(*U1) < PN->getIncomingBlock(*U2); - }); + E->setType(PHIOperands.begin()->first->getType()); + E->setOpcode(Instruction::PHI); // Filter out unreachable phi operands. - auto Filtered = make_filter_range(PHIOperands, [&](const Use *U) { - if (*U == PN) - return false; - if (!ReachableEdges.count({PN->getIncomingBlock(*U), PHIBlock})) + auto Filtered = make_filter_range(PHIOperands, [&](const ValPair &P) { + auto *BB = P.second; + if (auto *PHIOp = dyn_cast<PHINode>(I)) + if (isCopyOfPHI(P.first, PHIOp)) + return false; + if (!ReachableEdges.count({BB, PHIBlock})) return false; // Things in TOPClass are equivalent to everything. 
- if (ValueToClass.lookup(*U) == TOPClass) + if (ValueToClass.lookup(P.first) == TOPClass) return false; - return lookupOperandLeader(*U) != PN; + OriginalOpsConstant = OriginalOpsConstant && isa<Constant>(P.first); + HasBackedge = HasBackedge || isBackedge(BB, PHIBlock); + return lookupOperandLeader(P.first) != I; }); std::transform(Filtered.begin(), Filtered.end(), op_inserter(E), - [&](const Use *U) -> Value * { - auto *BB = PN->getIncomingBlock(*U); - HasBackedge = HasBackedge || isBackedge(BB, PHIBlock); - OriginalOpsConstant = - OriginalOpsConstant && isa<Constant>(*U); - return lookupOperandLeader(*U); + [&](const ValPair &P) -> Value * { + return lookupOperandLeader(P.first); }); return E; } @@ -929,8 +1060,6 @@ const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T, // Take a Value returned by simplification of Expression E/Instruction // I, and see if it resulted in a simpler expression. If so, return // that expression. -// TODO: Once finished, this should not take an Instruction, we only -// use it for printing. const Expression *NewGVN::checkSimplificationResults(Expression *E, Instruction *I, Value *V) const { @@ -954,25 +1083,37 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E, } CongruenceClass *CC = ValueToClass.lookup(V); - if (CC && CC->getDefiningExpr()) { - // If we simplified to something else, we need to communicate - // that we're users of the value we simplified to. - if (I != V) { + if (CC) { + if (CC->getLeader() && CC->getLeader() != I) { // Don't add temporary instructions to the user lists. if (!AllTempInstructions.count(I)) addAdditionalUsers(V, I); + return createVariableOrConstant(CC->getLeader()); } + if (CC->getDefiningExpr()) { + // If we simplified to something else, we need to communicate + // that we're users of the value we simplified to. + if (I != V) { + // Don't add temporary instructions to the user lists. + if (!AllTempInstructions.count(I)) + addAdditionalUsers(V, I); + } - if (I) - DEBUG(dbgs() << "Simplified " << *I << " to " - << " expression " << *CC->getDefiningExpr() << "\n"); - NumGVNOpsSimplified++; - deleteExpression(E); - return CC->getDefiningExpr(); + if (I) + DEBUG(dbgs() << "Simplified " << *I << " to " + << " expression " << *CC->getDefiningExpr() << "\n"); + NumGVNOpsSimplified++; + deleteExpression(E); + return CC->getDefiningExpr(); + } } + return nullptr; } +// Create a value expression from the instruction I, replacing operands with +// their leaders. + const Expression *NewGVN::createExpression(Instruction *I) const { auto *E = new (ExpressionAllocator) BasicExpression(I->getNumOperands()); @@ -987,15 +1128,7 @@ const Expression *NewGVN::createExpression(Instruction *I) const { if (shouldSwapOperands(E->getOperand(0), E->getOperand(1))) E->swapOperands(0, 1); } - - // Perform simplificaiton - // TODO: Right now we only check to see if we get a constant result. - // We may get a less than constant, but still better, result for - // some operations. - // IE - // add 0, x -> x - // and x, x -> x - // We should handle this by simply rewriting the expression. + // Perform simplification. if (auto *CI = dyn_cast<CmpInst>(I)) { // Sort the operand value numbers so x<y and y>x get the same value // number. 
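The comment ending the hunk above is the whole trick behind commutative canonicalization in NewGVN; a hedged source-level illustration (the function is invented for exposition):

```cpp
// Illustration only: with cmp operands sorted and the predicate swapped to
// match, both compares map to the same expression, land in one congruence
// class, and the second becomes redundant.
bool sameOrder(int X, int Y) {
  bool A = X < Y;
  bool B = Y > X; // swapped operands + swapped predicate = same value number
  return A == B;  // provably true once A and B are congruent
}
```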
@@ -1016,7 +1149,7 @@ const Expression *NewGVN::createExpression(Instruction *I) const { return SimplifiedE; } else if (isa<SelectInst>(I)) { if (isa<Constant>(E->getOperand(0)) || - E->getOperand(0) == E->getOperand(1)) { + E->getOperand(1) == E->getOperand(2)) { assert(E->getOperand(1)->getType() == I->getOperand(1)->getType() && E->getOperand(2)->getType() == I->getOperand(2)->getType()); Value *V = SimplifySelectInst(E->getOperand(0), E->getOperand(1), @@ -1121,7 +1254,7 @@ NewGVN::createCallExpression(CallInst *CI, const MemoryAccess *MA) const { bool NewGVN::someEquivalentDominates(const Instruction *Inst, const Instruction *U) const { auto *CC = ValueToClass.lookup(Inst); - // This must be an instruction because we are only called from phi nodes + // This must be an instruction because we are only called from phi nodes // in the case that the value it needs to check against is an instruction. // The most likely candiates for dominance are the leader and the next leader. @@ -1139,6 +1272,8 @@ bool NewGVN::someEquivalentDominates(const Instruction *Inst, // any of these siblings. if (!CC) return false; + if (alwaysAvailable(CC->getLeader())) + return true; if (DT->dominates(cast<Instruction>(CC->getLeader()), U)) return true; if (CC->getNextLeader().first && @@ -1229,9 +1364,9 @@ const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) const { if (EnableStoreRefinement) StoreRHS = MSSAWalker->getClobberingMemoryAccess(StoreAccess); // If we bypassed the use-def chains, make sure we add a use. + StoreRHS = lookupMemoryLeader(StoreRHS); if (StoreRHS != StoreAccess->getDefiningAccess()) addMemoryUsers(StoreRHS, StoreAccess); - StoreRHS = lookupMemoryLeader(StoreRHS); // If we are defined by ourselves, use the live on entry def. if (StoreRHS == StoreAccess) StoreRHS = MSSA->getLiveOnEntryDef(); @@ -1278,7 +1413,7 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr, if (auto *DepSI = dyn_cast<StoreInst>(DepInst)) { // Can't forward from non-atomic to atomic without violating memory model. // Also don't need to coerce if they are the same type, we will just - // propogate.. + // propagate. if (LI->isAtomic() > DepSI->isAtomic() || LoadType == DepSI->getValueOperand()->getType()) return nullptr; @@ -1292,14 +1427,13 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr, getConstantStoreValueForLoad(C, Offset, LoadType, DL)); } } - - } else if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInst)) { + } else if (auto *DepLI = dyn_cast<LoadInst>(DepInst)) { // Can't forward from non-atomic to atomic without violating memory model. if (LI->isAtomic() > DepLI->isAtomic()) return nullptr; int Offset = analyzeLoadFromClobberingLoad(LoadType, LoadPtr, DepLI, DL); if (Offset >= 0) { - // We can coerce a constant load into a load + // We can coerce a constant load into a load. 
if (auto *C = dyn_cast<Constant>(lookupOperandLeader(DepLI))) if (auto *PossibleConstant = getConstantLoadValueForLoad(C, Offset, LoadType, DL)) { @@ -1308,8 +1442,7 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr, return createConstantExpression(PossibleConstant); } } - - } else if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInst)) { + } else if (auto *DepMI = dyn_cast<MemIntrinsic>(DepInst)) { int Offset = analyzeLoadFromClobberingMemInst(LoadType, LoadPtr, DepMI, DL); if (Offset >= 0) { if (auto *PossibleConstant = @@ -1381,9 +1514,13 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const { } } - const Expression *E = createLoadExpression(LI->getType(), LoadAddressLeader, - LI, DefiningAccess); - return E; + const auto *LE = createLoadExpression(LI->getType(), LoadAddressLeader, LI, + DefiningAccess); + // If our MemoryLeader is not our defining access, add a use to the + // MemoryLeader, so that we get reprocessed when it changes. + if (LE->getMemoryLeader() != DefiningAccess) + addMemoryUsers(LE->getMemoryLeader(), OriginalAccess); + return LE; } const Expression * @@ -1402,7 +1539,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const { auto *Cond = PWC->Condition; // If this a copy of the condition, it must be either true or false depending - // on the predicate info type and edge + // on the predicate info type and edge. if (CopyOf == Cond) { // We should not need to add predicate users because the predicate info is // already a use of this operand. @@ -1438,7 +1575,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const { Value *FirstOp = lookupOperandLeader(Cmp->getOperand(0)); Value *SecondOp = lookupOperandLeader(Cmp->getOperand(1)); bool SwappedOps = false; - // Sort the ops + // Sort the ops. if (shouldSwapOperands(FirstOp, SecondOp)) { std::swap(FirstOp, SecondOp); SwappedOps = true; @@ -1464,7 +1601,8 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const { if ((PBranch->TrueEdge && Predicate == CmpInst::ICMP_EQ) || (!PBranch->TrueEdge && Predicate == CmpInst::ICMP_NE)) { addPredicateUsers(PI, I); - addAdditionalUsers(Cmp->getOperand(0), I); + addAdditionalUsers(SwappedOps ? Cmp->getOperand(1) : Cmp->getOperand(0), + I); return createVariableOrConstant(FirstOp); } // Handle the special case of floating point. @@ -1472,7 +1610,8 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const { (!PBranch->TrueEdge && Predicate == CmpInst::FCMP_UNE)) && isa<ConstantFP>(FirstOp) && !cast<ConstantFP>(FirstOp)->isZero()) { addPredicateUsers(PI, I); - addAdditionalUsers(Cmp->getOperand(0), I); + addAdditionalUsers(SwappedOps ? Cmp->getOperand(1) : Cmp->getOperand(0), + I); return createConstantExpression(cast<Constant>(FirstOp)); } } @@ -1502,7 +1641,6 @@ const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) const { // Retrieve the memory class for a given MemoryAccess. CongruenceClass *NewGVN::getMemoryClass(const MemoryAccess *MA) const { - auto *Result = MemoryAccessToClass.lookup(MA); assert(Result && "Should have found memory class"); return Result; @@ -1571,8 +1709,9 @@ bool NewGVN::isCycleFree(const Instruction *I) const { if (SCC.size() == 1) InstCycleState.insert({I, ICS_CycleFree}); else { - bool AllPhis = - llvm::all_of(SCC, [](const Value *V) { return isa<PHINode>(V); }); + bool AllPhis = llvm::all_of(SCC, [](const Value *V) { + return isa<PHINode>(V) || isCopyOfAPHI(V); + }); ICS = AllPhis ? 
ICS_CycleFree : ICS_Cycle; for (auto *Member : SCC) if (auto *MemberPhi = dyn_cast<PHINode>(Member)) @@ -1584,17 +1723,20 @@ bool NewGVN::isCycleFree(const Instruction *I) const { return true; } -// Evaluate PHI nodes symbolically, and create an expression result. -const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const { +// Evaluate PHI nodes symbolically and create an expression result. +const Expression * +NewGVN::performSymbolicPHIEvaluation(ArrayRef<ValPair> PHIOps, + Instruction *I, + BasicBlock *PHIBlock) const { // True if one of the incoming phi edges is a backedge. bool HasBackedge = false; // All constant tracks the state of whether all the *original* phi operands // This is really shorthand for "this phi cannot cycle due to forward // change in value of the phi is guaranteed not to later change the value of // the phi. IE it can't be v = phi(undef, v+1) - bool AllConstant = true; - auto *E = - cast<PHIExpression>(createPHIExpression(I, HasBackedge, AllConstant)); + bool OriginalOpsConstant = true; + auto *E = cast<PHIExpression>(createPHIExpression( + PHIOps, I, PHIBlock, HasBackedge, OriginalOpsConstant)); // We match the semantics of SimplifyPhiNode from InstructionSimplify here. // See if all arguments are the same. // We track if any were undef because they need special handling. @@ -1620,14 +1762,10 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const { deleteExpression(E); return createDeadExpression(); } - unsigned NumOps = 0; Value *AllSameValue = *(Filtered.begin()); ++Filtered.begin(); // Can't use std::equal here, sadly, because filter.begin moves. - if (llvm::all_of(Filtered, [&](Value *Arg) { - ++NumOps; - return Arg == AllSameValue; - })) { + if (llvm::all_of(Filtered, [&](Value *Arg) { return Arg == AllSameValue; })) { // In LLVM's non-standard representation of phi nodes, it's possible to have // phi nodes with cycles (IE dependent on other phis that are .... dependent // on the original phi node), especially in weird CFG's where some arguments @@ -1642,9 +1780,8 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const { // multivalued phi, and we need to know if it's cycle free in order to // evaluate whether we can ignore the undef. The other parts of this are // just shortcuts. If there is no backedge, or all operands are - // constants, or all operands are ignored but the undef, it also must be - // cycle free. - if (!AllConstant && HasBackedge && NumOps > 0 && + // constants, it also must be cycle free. + if (HasBackedge && !OriginalOpsConstant && !isa<UndefValue>(AllSameValue) && !isCycleFree(I)) return E; @@ -1708,8 +1845,11 @@ NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) const { return createAggregateValueExpression(I); } + const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const { - auto *CI = dyn_cast<CmpInst>(I); + assert(isa<CmpInst>(I) && "Expected a cmp instruction."); + + auto *CI = cast<CmpInst>(I); // See if our operands are equal to those of a previous predicate, and if so, // if it implies true or false. auto Op0 = lookupOperandLeader(CI->getOperand(0)); @@ -1720,7 +1860,7 @@ const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const { OurPredicate = CI->getSwappedPredicate(); } - // Avoid processing the same info twice + // Avoid processing the same info twice. const PredicateBase *LastPredInfo = nullptr; // See if we know something about the comparison itself, like it is the target // of an assume. 
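// --- Editorial sketch (annotation, not part of the patch) -------------------
// The "all arguments are the same" case that performSymbolicPHIEvaluation
// handles above, in hypothetical IR terms: once operands on unreachable edges
// and self-references are filtered out, a phi whose remaining operands share
// one leader collapses to that leader.
//
//   %p = phi i32 [ %x, %bb1 ], [ %x, %bb2 ], [ %p, %backedge ]
//   ; filters to {%x, %x}; value-numbers to %x, so uses of %p become
//   ; congruent with uses of %x.
//
// The undef/backedge caveat in the surrounding comments is why this collapse
// is additionally gated on isCycleFree(I) when a backedge is present.
// --- End sketch --------------------------------------------------------------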
@@ -1754,7 +1894,7 @@ const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const { // %operands are considered users of the icmp. // *Currently* we only check one level of comparisons back, and only mark one - // level back as touched when changes appen . If you modify this code to look + // level back as touched when changes happen. If you modify this code to look // back farther through comparisons, you *must* mark the appropriate // comparisons as users in PredicateInfo.cpp, or you will cause bugs. See if // we know something just from the operands themselves @@ -1767,10 +1907,15 @@ const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const { if (PI == LastPredInfo) continue; LastPredInfo = PI; - - // TODO: Along the false edge, we may know more things too, like icmp of + // In phi of ops cases, we may have predicate info that we are evaluating + // in a different context. + if (!DT->dominates(PBranch->To, getBlockForValue(I))) + continue; + // TODO: Along the false edge, we may know more things too, like + // icmp of // same operands is false. - // TODO: We only handle actual comparison conditions below, not and/or. + // TODO: We only handle actual comparison conditions below, not + // and/or. auto *BranchCond = dyn_cast<CmpInst>(PBranch->Condition); if (!BranchCond) continue; @@ -1798,7 +1943,6 @@ const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const { return createConstantExpression( ConstantInt::getFalse(CI->getType())); } - } else { // Just handle the ne and eq cases, where if we have the same // operands, we may know something. @@ -1822,14 +1966,6 @@ const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const { return createExpression(I); } -// Return true if V is a value that will always be available (IE can -// be placed anywhere) in the function. We don't do globals here -// because they are often worse to put in place. -// TODO: Separate cost from availability -static bool alwaysAvailable(Value *V) { - return isa<Constant>(V) || isa<Argument>(V); -} - // Substitute and symbolize the value before value numbering. const Expression * NewGVN::performSymbolicEvaluation(Value *V, @@ -1849,9 +1985,15 @@ NewGVN::performSymbolicEvaluation(Value *V, case Instruction::InsertValue: E = performSymbolicAggrValueEvaluation(I); break; - case Instruction::PHI: - E = performSymbolicPHIEvaluation(I); - break; + case Instruction::PHI: { + SmallVector<ValPair, 3> Ops; + auto *PN = cast<PHINode>(I); + for (unsigned i = 0; i < PN->getNumOperands(); ++i) + Ops.push_back({PN->getIncomingValue(i), PN->getIncomingBlock(i)}); + // Sort to ensure the invariant createPHIExpression requires is met. 
+ sortPHIOps(Ops);
+ E = performSymbolicPHIEvaluation(Ops, I, getBlockForValue(I));
+ } break;
case Instruction::Call:
E = performSymbolicCallEvaluation(I);
break;
@@ -1861,13 +2003,13 @@ NewGVN::performSymbolicEvaluation(Value *V,
case Instruction::Load:
E = performSymbolicLoadEvaluation(I);
break;
- case Instruction::BitCast: {
+ case Instruction::BitCast:
E = createExpression(I);
- } break;
+ break;
case Instruction::ICmp:
- case Instruction::FCmp: {
+ case Instruction::FCmp:
E = performSymbolicCmpEvaluation(I);
- } break;
+ break;
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
@@ -2017,7 +2159,7 @@ T *NewGVN::getMinDFSOfRange(const Range &R) const {
const MemoryAccess *NewGVN::getNextMemoryLeader(CongruenceClass *CC) const {
// TODO: If this ends up to slow, we can maintain a next memory leader like we
// do for regular leaders.
- // Make sure there will be a leader to find
+ // Make sure there will be a leader to find.
assert(!CC->definesNoMemory() && "Can't get next leader if there is none");
if (CC->getStoreCount() > 0) {
if (auto *NL = dyn_cast_or_null<StoreInst>(CC->getNextLeader().first))
@@ -2194,7 +2336,7 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
// For a given expression, mark the phi of ops instructions that could have
// changed as a result.
void NewGVN::markPhiOfOpsChanged(const Expression *E) {
- touchAndErase(ExpressionToPhiOfOps, ExactEqualsExpression(*E));
+ touchAndErase(ExpressionToPhiOfOps, E);
}
// Perform congruence finding on a given value numbering expression.
@@ -2315,14 +2457,11 @@ void NewGVN::updateReachableEdge(BasicBlock *From, BasicBlock *To) {
if (MemoryAccess *MemPhi = getMemoryAccess(To))
TouchedInstructions.set(InstrToDFSNum(MemPhi));
- auto BI = To->begin();
- while (isa<PHINode>(BI)) {
- TouchedInstructions.set(InstrToDFSNum(&*BI));
- ++BI;
- }
- for_each_found(PHIOfOpsPHIs, To, [&](const PHINode *I) {
- TouchedInstructions.set(InstrToDFSNum(I));
- });
+ // FIXME: We should just add a union op on a Bitvector and
+ // SparseBitVector. We can do it word by word faster than we are doing it
+ // here.
+ for (auto InstNum : RevisitOnReachabilityChange[To])
+ TouchedInstructions.set(InstNum);
}
}
}
@@ -2419,24 +2558,146 @@ void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) {
}
}
+// Remove the PHI of Ops PHI for I.
+void NewGVN::removePhiOfOps(Instruction *I, PHINode *PHITemp) {
+ InstrDFS.erase(PHITemp);
+ // It's still a temp instruction. We keep it in the array so it gets erased.
+ // However, it's no longer used by I, or in the block.
+ TempToBlock.erase(PHITemp);
+ RealToTemp.erase(I);
+ // We don't remove the users from the phi node uses. This wastes a little
+ // time, but such is life. We could use two sets to track which were there
+ // at the start of NewGVN, and which were added, but right now the cost of
+ // tracking is more than the cost of checking for more phi of ops.
+}
+
+// Add PHI Op in BB as a PHI of operations version of ExistingValue.
void NewGVN::addPhiOfOps(PHINode *Op, BasicBlock *BB,
Instruction *ExistingValue) {
InstrDFS[Op] = InstrToDFSNum(ExistingValue);
AllTempInstructions.insert(Op);
- PHIOfOpsPHIs[BB].push_back(Op);
TempToBlock[Op] = BB;
RealToTemp[ExistingValue] = Op;
+ // Add all users to phi node use, as they are now uses of the phi of ops phis
+ // and may themselves be phi of ops.
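// --- Editorial sketch (annotation, not part of the patch) -------------------
// Re the FIXME in updateReachableEdge above: a dense-bitvector |= sparse-
// bitvector union can proceed one machine word at a time instead of one bit
// at a time. A minimal illustration with plain 64-bit words, not LLVM's
// BitVector / SparseBitVector API:
#include <cstddef>
#include <cstdint>
#include <map>
#include <vector>
// Dense |= Sparse, where Sparse maps a word index to a 64-bit mask.
void unionWordsSketch(std::vector<uint64_t> &Dense,
                      const std::map<std::size_t, uint64_t> &Sparse) {
  for (const auto &KV : Sparse)
    if (KV.first < Dense.size())
      Dense[KV.first] |= KV.second; // one OR covers up to 64 set bits
}
// --- End sketch --------------------------------------------------------------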
+ for (auto *U : ExistingValue->users())
+ if (auto *UI = dyn_cast<Instruction>(U))
+ PHINodeUses.insert(UI);
}
static bool okayForPHIOfOps(const Instruction *I) {
+ if (!EnablePhiOfOps)
+ return false;
return isa<BinaryOperator>(I) || isa<SelectInst>(I) || isa<CmpInst>(I) ||
isa<LoadInst>(I);
}
+bool NewGVN::OpIsSafeForPHIOfOpsHelper(
+ Value *V, const BasicBlock *PHIBlock,
+ SmallPtrSetImpl<const Value *> &Visited,
+ SmallVectorImpl<Instruction *> &Worklist) {
+
+ if (!isa<Instruction>(V))
+ return true;
+ auto OISIt = OpSafeForPHIOfOps.find(V);
+ if (OISIt != OpSafeForPHIOfOps.end())
+ return OISIt->second;
+
+ // Keep walking until we either dominate the phi block, or hit a phi, or run
+ // out of things to check.
+ if (DT->properlyDominates(getBlockForValue(V), PHIBlock)) {
+ OpSafeForPHIOfOps.insert({V, true});
+ return true;
+ }
+ // PHI in the same block.
+ if (isa<PHINode>(V) && getBlockForValue(V) == PHIBlock) {
+ OpSafeForPHIOfOps.insert({V, false});
+ return false;
+ }
+
+ auto *OrigI = cast<Instruction>(V);
+ for (auto *Op : OrigI->operand_values()) {
+ if (!isa<Instruction>(Op))
+ continue;
+ // Stop now if we find an unsafe operand.
+ auto OISIt = OpSafeForPHIOfOps.find(OrigI);
+ if (OISIt != OpSafeForPHIOfOps.end()) {
+ if (!OISIt->second) {
+ OpSafeForPHIOfOps.insert({V, false});
+ return false;
+ }
+ continue;
+ }
+ if (!Visited.insert(Op).second)
+ continue;
+ Worklist.push_back(cast<Instruction>(Op));
+ }
+ return true;
+}
+
+// Return true if this operand will be safe to use for phi of ops.
+//
+// The reason some operands are unsafe is that we are not trying to recursively
+// translate everything back through phi nodes. We actually expect some lookups
+// of expressions to fail. In particular, a lookup where the expression cannot
+// exist in the predecessor. This is true even if the expression, as shown, can
+// be determined to be constant.
+bool NewGVN::OpIsSafeForPHIOfOps(Value *V, const BasicBlock *PHIBlock,
+ SmallPtrSetImpl<const Value *> &Visited) {
+ SmallVector<Instruction *, 4> Worklist;
+ if (!OpIsSafeForPHIOfOpsHelper(V, PHIBlock, Visited, Worklist))
+ return false;
+ while (!Worklist.empty()) {
+ auto *I = Worklist.pop_back_val();
+ if (!OpIsSafeForPHIOfOpsHelper(I, PHIBlock, Visited, Worklist))
+ return false;
+ }
+ OpSafeForPHIOfOps.insert({V, true});
+ return true;
+}
+
+// Try to find a leader for instruction TransInst, which is a phi-translated
+// version of something in our original program. Visited is used to ensure we
+// don't infinite loop during translations of cycles. OrigInst is the
+// instruction in the original program, and PredBB is the predecessor we
+// translated it through.
+Value *NewGVN::findLeaderForInst(Instruction *TransInst,
+ SmallPtrSetImpl<Value *> &Visited,
+ MemoryAccess *MemAccess, Instruction *OrigInst,
+ BasicBlock *PredBB) {
+ unsigned IDFSNum = InstrToDFSNum(OrigInst);
+ // Make sure it's marked as a temporary instruction.
+ AllTempInstructions.insert(TransInst);
+ // and make sure anything that tries to add its DFS number is
+ // redirected to the instruction we are making a phi of ops
+ // for.
+ TempToBlock.insert({TransInst, PredBB}); + InstrDFS.insert({TransInst, IDFSNum}); + + const Expression *E = performSymbolicEvaluation(TransInst, Visited); + InstrDFS.erase(TransInst); + AllTempInstructions.erase(TransInst); + TempToBlock.erase(TransInst); + if (MemAccess) + TempToMemory.erase(TransInst); + if (!E) + return nullptr; + auto *FoundVal = findPHIOfOpsLeader(E, OrigInst, PredBB); + if (!FoundVal) { + ExpressionToPhiOfOps[E].insert(OrigInst); + DEBUG(dbgs() << "Cannot find phi of ops operand for " << *TransInst + << " in block " << getBlockName(PredBB) << "\n"); + return nullptr; + } + if (auto *SI = dyn_cast<StoreInst>(FoundVal)) + FoundVal = SI->getValueOperand(); + return FoundVal; +} + // When we see an instruction that is an op of phis, generate the equivalent phi // of ops form. const Expression * -NewGVN::makePossiblePhiOfOps(Instruction *I, +NewGVN::makePossiblePHIOfOps(Instruction *I, SmallPtrSetImpl<Value *> &Visited) { if (!okayForPHIOfOps(I)) return nullptr; @@ -2450,7 +2711,6 @@ NewGVN::makePossiblePhiOfOps(Instruction *I, if (!isCycleFree(I)) return nullptr; - unsigned IDFSNum = InstrToDFSNum(I); SmallPtrSet<const Value *, 8> ProcessedPHIs; // TODO: We don't do phi translation on memory accesses because it's // complicated. For a load, we'd need to be able to simulate a new memoryuse, @@ -2463,81 +2723,94 @@ NewGVN::makePossiblePhiOfOps(Instruction *I, MemAccess->getDefiningAccess()->getBlock() == I->getParent()) return nullptr; + SmallPtrSet<const Value *, 10> VisitedOps; // Convert op of phis to phi of ops - for (auto &Op : I->operands()) { - // TODO: We can't handle expressions that must be recursively translated - // IE - // a = phi (b, c) - // f = use a - // g = f + phi of something - // To properly make a phi of ops for g, we'd have to properly translate and - // use the instruction for f. We should add this by splitting out the - // instruction creation we do below. - if (isa<Instruction>(Op) && PHINodeUses.count(cast<Instruction>(Op))) - return nullptr; - if (!isa<PHINode>(Op)) - continue; + for (auto *Op : I->operand_values()) { + if (!isa<PHINode>(Op)) { + auto *ValuePHI = RealToTemp.lookup(Op); + if (!ValuePHI) + continue; + DEBUG(dbgs() << "Found possible dependent phi of ops\n"); + Op = ValuePHI; + } auto *OpPHI = cast<PHINode>(Op); // No point in doing this for one-operand phis. if (OpPHI->getNumOperands() == 1) continue; if (!DebugCounter::shouldExecute(PHIOfOpsCounter)) return nullptr; - SmallVector<std::pair<Value *, BasicBlock *>, 4> Ops; + SmallVector<ValPair, 4> Ops; + SmallPtrSet<Value *, 4> Deps; auto *PHIBlock = getBlockForValue(OpPHI); - for (auto PredBB : OpPHI->blocks()) { + RevisitOnReachabilityChange[PHIBlock].reset(InstrToDFSNum(I)); + for (unsigned PredNum = 0; PredNum < OpPHI->getNumOperands(); ++PredNum) { + auto *PredBB = OpPHI->getIncomingBlock(PredNum); Value *FoundVal = nullptr; // We could just skip unreachable edges entirely but it's tricky to do // with rewriting existing phi nodes. if (ReachableEdges.count({PredBB, PHIBlock})) { - // Clone the instruction, create an expression from it, and see if we - // have a leader. + // Clone the instruction, create an expression from it that is + // translated back into the predecessor, and see if we have a leader. 
Instruction *ValueOp = I->clone();
if (MemAccess)
TempToMemory.insert({ValueOp, MemAccess});
-
+ bool SafeForPHIOfOps = true;
+ VisitedOps.clear();
for (auto &Op : ValueOp->operands()) {
- Op = Op->DoPHITranslation(PHIBlock, PredBB);
- // When this operand changes, it could change whether there is a
- // leader for us or not.
- addAdditionalUsers(Op, I);
+ auto *OrigOp = &*Op;
+ // When these operands change, it could change whether there is a
+ // leader for us or not, so we have to add additional users.
+ if (isa<PHINode>(Op)) {
+ Op = Op->DoPHITranslation(PHIBlock, PredBB);
+ if (Op != OrigOp && Op != I)
+ Deps.insert(Op);
+ } else if (auto *ValuePHI = RealToTemp.lookup(Op)) {
+ if (getBlockForValue(ValuePHI) == PHIBlock)
+ Op = ValuePHI->getIncomingValueForBlock(PredBB);
+ }
+ // If we phi-translated the op, it must be safe.
+ SafeForPHIOfOps =
+ SafeForPHIOfOps &&
+ (Op != OrigOp || OpIsSafeForPHIOfOps(Op, PHIBlock, VisitedOps));
}
- // Make sure it's marked as a temporary instruction.
- AllTempInstructions.insert(ValueOp);
- // and make sure anything that tries to add it's DFS number is
- // redirected to the instruction we are making a phi of ops
- // for.
- InstrDFS.insert({ValueOp, IDFSNum});
- const Expression *E = performSymbolicEvaluation(ValueOp, Visited);
- InstrDFS.erase(ValueOp);
- AllTempInstructions.erase(ValueOp);
+ // FIXME: For those things that are not safe we could generate
+ // expressions all the way down, and see if this comes out to a
+ // constant. For anything where that is true, and unsafe, we should
+ // have made a phi-of-ops (or value numbered it equivalent to something)
+ // for the pieces already.
+ FoundVal = !SafeForPHIOfOps ? nullptr
+ : findLeaderForInst(ValueOp, Visited,
+ MemAccess, I, PredBB);
ValueOp->deleteValue();
- if (MemAccess)
- TempToMemory.erase(ValueOp);
- if (!E)
+ if (!FoundVal)
return nullptr;
- FoundVal = findPhiOfOpsLeader(E, PredBB);
- if (!FoundVal) {
- ExpressionToPhiOfOps[E].insert(I);
- return nullptr;
- }
- if (auto *SI = dyn_cast<StoreInst>(FoundVal))
- FoundVal = SI->getValueOperand();
} else {
DEBUG(dbgs() << "Skipping phi of ops operand for incoming block "
<< getBlockName(PredBB)
<< " because the block is unreachable\n");
FoundVal = UndefValue::get(I->getType());
+ RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I));
}
Ops.push_back({FoundVal, PredBB});
DEBUG(dbgs() << "Found phi of ops operand " << *FoundVal << " in "
<< getBlockName(PredBB) << "\n");
}
+ for (auto Dep : Deps)
+ addAdditionalUsers(Dep, I);
+ sortPHIOps(Ops);
+ auto *E = performSymbolicPHIEvaluation(Ops, I, PHIBlock);
+ if (isa<ConstantExpression>(E) || isa<VariableExpression>(E)) {
+ DEBUG(dbgs()
+ << "Not creating real PHI of ops because it simplified to existing "
+ "value or constant\n");
+ return E;
+ }
auto *ValuePHI = RealToTemp.lookup(I);
bool NewPHI = false;
if (!ValuePHI) {
- ValuePHI = PHINode::Create(I->getType(), OpPHI->getNumOperands());
+ ValuePHI =
+ PHINode::Create(I->getType(), OpPHI->getNumOperands(), "phiofops");
addPhiOfOps(ValuePHI, PHIBlock, I);
NewPHI = true;
NumGVNPHIOfOpsCreated++;
@@ -2553,10 +2826,11 @@ NewGVN::makePossiblePhiOfOps(Instruction *I,
++i;
}
}
-
+ RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I));
DEBUG(dbgs() << "Created phi of ops " << *ValuePHI << " for " << *I << "\n");
- return performSymbolicEvaluation(ValuePHI, Visited);
+
+ return E;
}
return nullptr;
}
@@ -2602,8 +2876,11 @@ void NewGVN::initializeCongruenceClasses(Function &F) {
if (MD && isa<StoreInst>(MD->getMemoryInst()))
TOPClass->incStoreCount(); } + + // FIXME: This is trying to discover which instructions are uses of phi + // nodes. We should move this into one of the myriad of places that walk + // all the operands already. for (auto &I : *BB) { - // TODO: Move to helper if (isa<PHINode>(&I)) for (auto *U : I.users()) if (auto *UInst = dyn_cast<Instruction>(U)) @@ -2661,7 +2938,8 @@ void NewGVN::cleanupTables() { ExpressionToPhiOfOps.clear(); TempToBlock.clear(); TempToMemory.clear(); - PHIOfOpsPHIs.clear(); + PHINodeUses.clear(); + OpSafeForPHIOfOps.clear(); ReachableBlocks.clear(); ReachableEdges.clear(); #ifndef NDEBUG @@ -2675,6 +2953,7 @@ void NewGVN::cleanupTables() { MemoryAccessToClass.clear(); PredicateToUsers.clear(); MemoryToUsers.clear(); + RevisitOnReachabilityChange.clear(); } // Assign local DFS number mapping to instructions, and leave space for Value @@ -2698,6 +2977,8 @@ std::pair<unsigned, unsigned> NewGVN::assignDFSNumbers(BasicBlock *B, markInstructionForDeletion(&I); continue; } + if (isa<PHINode>(&I)) + RevisitOnReachabilityChange[B].set(End); InstrDFS[&I] = End++; DFSToInstr.emplace_back(&I); } @@ -2719,6 +3000,7 @@ void NewGVN::updateProcessedCount(const Value *V) { } #endif } + // Evaluate MemoryPhi nodes symbolically, just like PHI nodes void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) { // If all the arguments are the same, the MemoryPhi has the same value as the @@ -2787,11 +3069,15 @@ void NewGVN::valueNumberInstruction(Instruction *I) { // Make a phi of ops if necessary if (Symbolized && !isa<ConstantExpression>(Symbolized) && !isa<VariableExpression>(Symbolized) && PHINodeUses.count(I)) { - auto *PHIE = makePossiblePhiOfOps(I, Visited); - if (PHIE) + auto *PHIE = makePossiblePHIOfOps(I, Visited); + // If we created a phi of ops, use it. + // If we couldn't create one, make sure we don't leave one lying around + if (PHIE) { Symbolized = PHIE; + } else if (auto *Op = RealToTemp.lookup(I)) { + removePhiOfOps(I, Op); + } } - } else { // Mark the instruction as unused so we don't value number it again. InstrDFS[I] = 0; @@ -2905,7 +3191,7 @@ void NewGVN::verifyMemoryCongruency() const { // so we don't process them. if (auto *MemPHI = dyn_cast<MemoryPhi>(Pair.first)) { for (auto &U : MemPHI->incoming_values()) { - if (Instruction *I = dyn_cast<Instruction>(U.get())) { + if (auto *I = dyn_cast<Instruction>(&*U)) { if (!isInstructionTriviallyDead(I)) return true; } @@ -3200,11 +3486,13 @@ struct NewGVN::ValueDFS { int DFSIn = 0; int DFSOut = 0; int LocalNum = 0; + // Only one of Def and U will be set. // The bool in the Def tells us whether the Def is the stored value of a // store. PointerIntPair<Value *, 1, bool> Def; Use *U = nullptr; + bool operator<(const ValueDFS &Other) const { // It's not enough that any given field be less than - we have sets // of fields that need to be evaluated together to give a proper ordering. 
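// --- Editorial sketch (annotation, not part of the patch) -------------------
// The DFSIn/DFSOut fields above encode dominator-tree DFS intervals: block A
// dominates block B exactly when A's interval encloses B's. This is also what
// ValueDFSStack::isInScope (next hunk) checks before reusing a leader during
// elimination. A simplified, hypothetical illustration:
struct DFSIntervalSketch {
  int In, Out;
};
// True if Dominator's interval encloses Candidate's, i.e. every path to
// Candidate passes through Dominator.
bool enclosesSketch(DFSIntervalSketch Dominator, DFSIntervalSketch Candidate) {
  return Dominator.In <= Candidate.In && Dominator.Out >= Candidate.Out;
}
// --- End sketch --------------------------------------------------------------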
@@ -3439,7 +3727,6 @@ void NewGVN::markInstructionForDeletion(Instruction *I) { } void NewGVN::replaceInstruction(Instruction *I, Value *V) { - DEBUG(dbgs() << "Replacing " << *I << " with " << *V << "\n"); patchAndReplaceAllUsesWith(I, V); // We save the actual erasing to avoid invalidating memory @@ -3460,7 +3747,9 @@ public: ValueStack.emplace_back(V); DFSStack.emplace_back(DFSIn, DFSOut); } + bool empty() const { return DFSStack.empty(); } + bool isInScope(int DFSIn, int DFSOut) const { if (empty()) return false; @@ -3484,19 +3773,33 @@ private: SmallVector<Value *, 8> ValueStack; SmallVector<std::pair<int, int>, 8> DFSStack; }; + +} // end anonymous namespace + +// Given an expression, get the congruence class for it. +CongruenceClass *NewGVN::getClassForExpression(const Expression *E) const { + if (auto *VE = dyn_cast<VariableExpression>(E)) + return ValueToClass.lookup(VE->getVariableValue()); + else if (isa<DeadExpression>(E)) + return TOPClass; + return ExpressionToClass.lookup(E); } // Given a value and a basic block we are trying to see if it is available in, // see if the value has a leader available in that block. -Value *NewGVN::findPhiOfOpsLeader(const Expression *E, +Value *NewGVN::findPHIOfOpsLeader(const Expression *E, + const Instruction *OrigInst, const BasicBlock *BB) const { // It would already be constant if we could make it constant if (auto *CE = dyn_cast<ConstantExpression>(E)) return CE->getConstantValue(); - if (auto *VE = dyn_cast<VariableExpression>(E)) - return VE->getVariableValue(); + if (auto *VE = dyn_cast<VariableExpression>(E)) { + auto *V = VE->getVariableValue(); + if (alwaysAvailable(V) || DT->dominates(getBlockForValue(V), BB)) + return VE->getVariableValue(); + } - auto *CC = ExpressionToClass.lookup(E); + auto *CC = getClassForExpression(E); if (!CC) return nullptr; if (alwaysAvailable(CC->getLeader())) @@ -3504,15 +3807,13 @@ Value *NewGVN::findPhiOfOpsLeader(const Expression *E, for (auto Member : *CC) { auto *MemberInst = dyn_cast<Instruction>(Member); + if (MemberInst == OrigInst) + continue; // Anything that isn't an instruction is always available. if (!MemberInst) return Member; - // If we are looking for something in the same block as the member, it must - // be a leader because this function is looking for operands for a phi node. - if (MemberInst->getParent() == BB || - DT->dominates(MemberInst->getParent(), BB)) { + if (DT->dominates(getBlockForValue(MemberInst), BB)) return Member; - } } return nullptr; } @@ -3549,36 +3850,39 @@ bool NewGVN::eliminateInstructions(Function &F) { // Go through all of our phi nodes, and kill the arguments associated with // unreachable edges. 
- auto ReplaceUnreachablePHIArgs = [&](PHINode &PHI, BasicBlock *BB) { - for (auto &Operand : PHI.incoming_values()) - if (!ReachableEdges.count({PHI.getIncomingBlock(Operand), BB})) { + auto ReplaceUnreachablePHIArgs = [&](PHINode *PHI, BasicBlock *BB) { + for (auto &Operand : PHI->incoming_values()) + if (!ReachableEdges.count({PHI->getIncomingBlock(Operand), BB})) { DEBUG(dbgs() << "Replacing incoming value of " << PHI << " for block " - << getBlockName(PHI.getIncomingBlock(Operand)) + << getBlockName(PHI->getIncomingBlock(Operand)) << " with undef due to it being unreachable\n"); - Operand.set(UndefValue::get(PHI.getType())); + Operand.set(UndefValue::get(PHI->getType())); } }; - SmallPtrSet<BasicBlock *, 8> BlocksWithPhis; - for (auto &B : F) - if ((!B.empty() && isa<PHINode>(*B.begin())) || - (PHIOfOpsPHIs.find(&B) != PHIOfOpsPHIs.end())) - BlocksWithPhis.insert(&B); + // Replace unreachable phi arguments. + // At this point, RevisitOnReachabilityChange only contains: + // + // 1. PHIs + // 2. Temporaries that will convert to PHIs + // 3. Operations that are affected by an unreachable edge but do not fit into + // 1 or 2 (rare). + // So it is a slight overshoot of what we want. We could make it exact by + // using two SparseBitVectors per block. DenseMap<const BasicBlock *, unsigned> ReachablePredCount; - for (auto KV : ReachableEdges) + for (auto &KV : ReachableEdges) ReachablePredCount[KV.getEnd()]++; - for (auto *BB : BlocksWithPhis) - // TODO: It would be faster to use getNumIncomingBlocks() on a phi node in - // the block and subtract the pred count, but it's more complicated. - if (ReachablePredCount.lookup(BB) != - unsigned(std::distance(pred_begin(BB), pred_end(BB)))) { - for (auto II = BB->begin(); isa<PHINode>(II); ++II) { - auto &PHI = cast<PHINode>(*II); + for (auto &BBPair : RevisitOnReachabilityChange) { + for (auto InstNum : BBPair.second) { + auto *Inst = InstrFromDFSNum(InstNum); + auto *PHI = dyn_cast<PHINode>(Inst); + PHI = PHI ? PHI : dyn_cast_or_null<PHINode>(RealToTemp.lookup(Inst)); + if (!PHI) + continue; + auto *BB = BBPair.first; + if (ReachablePredCount.lookup(BB) != PHI->getNumIncomingValues()) ReplaceUnreachablePHIArgs(PHI, BB); - } - for_each_found(PHIOfOpsPHIs, BB, [&](PHINode *PHI) { - ReplaceUnreachablePHIArgs(*PHI, BB); - }); } + } // Map to store the use counts DenseMap<const Value *, unsigned int> UseCounts; @@ -3631,7 +3935,7 @@ bool NewGVN::eliminateInstructions(Function &F) { CC->swap(MembersLeft); } else { // If this is a singleton, we can skip it. - if (CC->size() != 1 || RealToTemp.lookup(Leader)) { + if (CC->size() != 1 || RealToTemp.count(Leader)) { // This is a stack because equality replacement/etc may place // constants in the middle of the member list, and we want to use // those constant values in preference to the current leader, over @@ -3873,12 +4177,16 @@ bool NewGVN::shouldSwapOperands(const Value *A, const Value *B) const { } namespace { + class NewGVNLegacyPass : public FunctionPass { public: - static char ID; // Pass identification, replacement for typeid. + // Pass identification, replacement for typeid. 
+ static char ID; + NewGVNLegacyPass() : FunctionPass(ID) { initializeNewGVNLegacyPassPass(*PassRegistry::getPassRegistry()); } + bool runOnFunction(Function &F) override; private: @@ -3892,7 +4200,8 @@ private: AU.addPreserved<GlobalsAAWrapperPass>(); } }; -} // namespace + +} // end anonymous namespace bool NewGVNLegacyPass::runOnFunction(Function &F) { if (skipFunction(F)) @@ -3906,6 +4215,8 @@ bool NewGVNLegacyPass::runOnFunction(Function &F) { .runGVN(); } +char NewGVNLegacyPass::ID = 0; + INITIALIZE_PASS_BEGIN(NewGVNLegacyPass, "newgvn", "Global Value Numbering", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) @@ -3917,8 +4228,6 @@ INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_END(NewGVNLegacyPass, "newgvn", "Global Value Numbering", false, false) -char NewGVNLegacyPass::ID = 0; - // createGVNPass - The public interface to this file. FunctionPass *llvm::createNewGVNPass() { return new NewGVNLegacyPass(); } diff --git a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp index 1bfecea2f61e..1748815c5941 100644 --- a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp +++ b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp @@ -26,16 +26,13 @@ using namespace llvm; static bool optimizeSQRT(CallInst *Call, Function *CalledFunc, - BasicBlock &CurrBB, Function::iterator &BB) { + BasicBlock &CurrBB, Function::iterator &BB, + const TargetTransformInfo *TTI) { // There is no need to change the IR, since backend will emit sqrt // instruction if the call has already been marked read-only. if (Call->onlyReadsMemory()) return false; - // The call must have the expected result type. - if (!Call->getType()->isFloatingPointTy()) - return false; - // Do the following transformation: // // (before) @@ -43,7 +40,7 @@ static bool optimizeSQRT(CallInst *Call, Function *CalledFunc, // // (after) // v0 = sqrt_noreadmem(src) # native sqrt instruction. - // if (v0 is a NaN) + // [if (v0 is a NaN) || if (src < 0)] // v1 = sqrt(src) # library call. // dst = phi(v0, v1) // @@ -52,7 +49,8 @@ static bool optimizeSQRT(CallInst *Call, Function *CalledFunc, // Create phi and replace all uses. BasicBlock *JoinBB = llvm::SplitBlock(&CurrBB, Call->getNextNode()); IRBuilder<> Builder(JoinBB, JoinBB->begin()); - PHINode *Phi = Builder.CreatePHI(Call->getType(), 2); + Type *Ty = Call->getType(); + PHINode *Phi = Builder.CreatePHI(Ty, 2); Call->replaceAllUsesWith(Phi); // Create basic block LibCallBB and insert a call to library function sqrt. @@ -69,7 +67,10 @@ static bool optimizeSQRT(CallInst *Call, Function *CalledFunc, Call->addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone); CurrBB.getTerminator()->eraseFromParent(); Builder.SetInsertPoint(&CurrBB); - Value *FCmp = Builder.CreateFCmpOEQ(Call, Call); + Value *FCmp = TTI->isFCmpOrdCheaperThanFCmpZero(Ty) + ? Builder.CreateFCmpORD(Call, Call) + : Builder.CreateFCmpOGE(Call->getOperand(0), + ConstantFP::get(Ty, 0.0)); Builder.CreateCondBr(FCmp, JoinBB, LibCallBB); // Add phi operands. @@ -96,18 +97,21 @@ static bool runPartiallyInlineLibCalls(Function &F, TargetLibraryInfo *TLI, if (!Call || !(CalledFunc = Call->getCalledFunction())) continue; + if (Call->isNoBuiltin()) + continue; + // Skip if function either has local linkage or is not a known library // function. 
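// --- Editorial sketch (annotation, not part of the patch) -------------------
// The control flow optimizeSQRT emits, written as straight-line C-style
// pseudocode. hw_sqrt is a hypothetical stand-in for the native, errno-free
// sqrt instruction; which guard is emitted depends on
// TTI->isFCmpOrdCheaperThanFCmpZero:
//
//   double partially_inlined_sqrt(double src) {
//     double v0 = hw_sqrt(src);      // fast path, marked ReadNone
//     // Guard A: ordered self-compare, true unless v0 is NaN.
//     // Guard B: src >= 0.0, true whenever the fast result is valid.
//     if (!(v0 == v0))               // or: if (!(src >= 0.0))
//       v0 = sqrt(src);              // slow path: libm call sets errno
//     return v0;                     // the phi of the two values
//   }
// --- End sketch --------------------------------------------------------------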
LibFunc LF; - if (CalledFunc->hasLocalLinkage() || !CalledFunc->hasName() || - !TLI->getLibFunc(CalledFunc->getName(), LF)) + if (CalledFunc->hasLocalLinkage() || + !TLI->getLibFunc(*CalledFunc, LF) || !TLI->has(LF)) continue; switch (LF) { case LibFunc_sqrtf: case LibFunc_sqrt: if (TTI->haveFastSqrt(Call->getType()) && - optimizeSQRT(Call, CalledFunc, *CurrBB, BB)) + optimizeSQRT(Call, CalledFunc, *CurrBB, BB, TTI)) break; continue; default: diff --git a/lib/Transforms/Scalar/PlaceSafepoints.cpp b/lib/Transforms/Scalar/PlaceSafepoints.cpp index e47b636348e3..2d0cb6fbf211 100644 --- a/lib/Transforms/Scalar/PlaceSafepoints.cpp +++ b/lib/Transforms/Scalar/PlaceSafepoints.cpp @@ -54,6 +54,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" @@ -113,6 +114,7 @@ struct PlaceBackedgeSafepointsImpl : public FunctionPass { ScalarEvolution *SE = nullptr; DominatorTree *DT = nullptr; LoopInfo *LI = nullptr; + TargetLibraryInfo *TLI = nullptr; PlaceBackedgeSafepointsImpl(bool CallSafepoints = false) : FunctionPass(ID), CallSafepointsEnabled(CallSafepoints) { @@ -131,6 +133,7 @@ struct PlaceBackedgeSafepointsImpl : public FunctionPass { SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); for (Loop *I : *LI) { runOnLoopAndSubLoops(I); } @@ -141,6 +144,7 @@ struct PlaceBackedgeSafepointsImpl : public FunctionPass { AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); // We no longer modify the IR at all in this pass. Thus all // analysis are preserved. AU.setPreservesAll(); @@ -165,6 +169,7 @@ struct PlaceSafepoints : public FunctionPass { // We modify the graph wholesale (inlining, block insertion, etc). We // preserve nothing at the moment. We could potentially preserve dom tree // if that was worth doing + AU.addRequired<TargetLibraryInfoWrapperPass>(); } }; } @@ -174,10 +179,11 @@ struct PlaceSafepoints : public FunctionPass { // callers job. static void InsertSafepointPoll(Instruction *InsertBefore, - std::vector<CallSite> &ParsePointsNeeded /*rval*/); + std::vector<CallSite> &ParsePointsNeeded /*rval*/, + const TargetLibraryInfo &TLI); -static bool needsStatepoint(const CallSite &CS) { - if (callsGCLeafFunction(CS)) +static bool needsStatepoint(const CallSite &CS, const TargetLibraryInfo &TLI) { + if (callsGCLeafFunction(CS, TLI)) return false; if (CS.isCall()) { CallInst *call = cast<CallInst>(CS.getInstruction()); @@ -194,7 +200,8 @@ static bool needsStatepoint(const CallSite &CS) { /// answer; i.e. false is always valid. static bool containsUnconditionalCallSafepoint(Loop *L, BasicBlock *Header, BasicBlock *Pred, - DominatorTree &DT) { + DominatorTree &DT, + const TargetLibraryInfo &TLI) { // In general, we're looking for any cut of the graph which ensures // there's a call safepoint along every edge between Header and Pred. // For the moment, we look only for the 'cuts' that consist of a single call @@ -217,7 +224,7 @@ static bool containsUnconditionalCallSafepoint(Loop *L, BasicBlock *Header, // unconditional poll. 
In practice, this is only a theoretical concern // since we don't have any methods with conditional-only safepoint // polls. - if (needsStatepoint(CS)) + if (needsStatepoint(CS, TLI)) return true; } @@ -321,7 +328,7 @@ bool PlaceBackedgeSafepointsImpl::runOnLoop(Loop *L) { continue; } if (CallSafepointsEnabled && - containsUnconditionalCallSafepoint(L, Header, Pred, *DT)) { + containsUnconditionalCallSafepoint(L, Header, Pred, *DT, *TLI)) { // Note: This is only semantically legal since we won't do any further // IPO or inlining before the actual call insertion.. If we hadn't, we // might latter loose this call safepoint. @@ -472,6 +479,9 @@ bool PlaceSafepoints::runOnFunction(Function &F) { if (!shouldRewriteFunction(F)) return false; + const TargetLibraryInfo &TLI = + getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + bool Modified = false; // In various bits below, we rely on the fact that uses are reachable from @@ -578,7 +588,7 @@ bool PlaceSafepoints::runOnFunction(Function &F) { // safepoint polls themselves. for (Instruction *PollLocation : PollsNeeded) { std::vector<CallSite> RuntimeCalls; - InsertSafepointPoll(PollLocation, RuntimeCalls); + InsertSafepointPoll(PollLocation, RuntimeCalls, TLI); ParsePointNeeded.insert(ParsePointNeeded.end(), RuntimeCalls.begin(), RuntimeCalls.end()); } @@ -610,7 +620,8 @@ INITIALIZE_PASS_END(PlaceSafepoints, "place-safepoints", "Place Safepoints", static void InsertSafepointPoll(Instruction *InsertBefore, - std::vector<CallSite> &ParsePointsNeeded /*rval*/) { + std::vector<CallSite> &ParsePointsNeeded /*rval*/, + const TargetLibraryInfo &TLI) { BasicBlock *OrigBB = InsertBefore->getParent(); Module *M = InsertBefore->getModule(); assert(M && "must be part of a module"); @@ -669,7 +680,7 @@ InsertSafepointPoll(Instruction *InsertBefore, assert(ParsePointsNeeded.empty()); for (auto *CI : Calls) { // No safepoint needed or wanted - if (!needsStatepoint(CI)) + if (!needsStatepoint(CI, TLI)) continue; // These are likely runtime calls. 
Should we assert that via calling diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index e235e5eb1a06..88dcaf0f8a36 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -21,28 +21,45 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/Reassociate.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PostOrderIterator.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Operator.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include <algorithm> +#include <cassert> +#include <utility> + using namespace llvm; using namespace reassociate; @@ -54,7 +71,6 @@ STATISTIC(NumFactor , "Number of multiplies factored"); #ifndef NDEBUG /// Print out the expression identified in the Ops list. -/// static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) { Module *M = I->getModule(); dbgs() << Instruction::getOpcodeName(I->getOpcode()) << " " @@ -128,38 +144,37 @@ XorOpnd::XorOpnd(Value *V) { /// Return true if V is an instruction of the specified opcode and if it /// only has one use. static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) { - if (V->hasOneUse() && isa<Instruction>(V) && - cast<Instruction>(V)->getOpcode() == Opcode && - (!isa<FPMathOperator>(V) || - cast<Instruction>(V)->hasUnsafeAlgebra())) - return cast<BinaryOperator>(V); + auto *I = dyn_cast<Instruction>(V); + if (I && I->hasOneUse() && I->getOpcode() == Opcode) + if (!isa<FPMathOperator>(I) || I->isFast()) + return cast<BinaryOperator>(I); return nullptr; } static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode1, unsigned Opcode2) { - if (V->hasOneUse() && isa<Instruction>(V) && - (cast<Instruction>(V)->getOpcode() == Opcode1 || - cast<Instruction>(V)->getOpcode() == Opcode2) && - (!isa<FPMathOperator>(V) || - cast<Instruction>(V)->hasUnsafeAlgebra())) - return cast<BinaryOperator>(V); + auto *I = dyn_cast<Instruction>(V); + if (I && I->hasOneUse() && + (I->getOpcode() == Opcode1 || I->getOpcode() == Opcode2)) + if (!isa<FPMathOperator>(I) || I->isFast()) + return cast<BinaryOperator>(I); return nullptr; } void ReassociatePass::BuildRankMap(Function &F, ReversePostOrderTraversal<Function*> &RPOT) { - unsigned i = 2; + unsigned Rank = 2; // Assign distinct ranks to function arguments. 
- for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) { - ValueRankMap[&*I] = ++i; - DEBUG(dbgs() << "Calculated Rank[" << I->getName() << "] = " << i << "\n"); + for (auto &Arg : F.args()) { + ValueRankMap[&Arg] = ++Rank; + DEBUG(dbgs() << "Calculated Rank[" << Arg.getName() << "] = " << Rank + << "\n"); } // Traverse basic blocks in ReversePostOrder for (BasicBlock *BB : RPOT) { - unsigned BBRank = RankMap[BB] = ++i << 16; + unsigned BBRank = RankMap[BB] = ++Rank << 16; // Walk the basic block, adding precomputed ranks for any instructions that // we cannot move. This ensures that the ranks for these instructions are @@ -207,13 +222,9 @@ void ReassociatePass::canonicalizeOperands(Instruction *I) { Value *LHS = I->getOperand(0); Value *RHS = I->getOperand(1); - unsigned LHSRank = getRank(LHS); - unsigned RHSRank = getRank(RHS); - - if (isa<Constant>(RHS)) + if (LHS == RHS || isa<Constant>(RHS)) return; - - if (isa<Constant>(LHS) || RHSRank < LHSRank) + if (isa<Constant>(LHS) || getRank(RHS) < getRank(LHS)) cast<BinaryOperator>(I)->swapOperands(); } @@ -357,7 +368,7 @@ static void IncorporateWeight(APInt &LHS, const APInt &RHS, unsigned Opcode) { } } -typedef std::pair<Value*, APInt> RepeatedValue; +using RepeatedValue = std::pair<Value*, APInt>; /// Given an associative binary expression, return the leaf /// nodes in Ops along with their weights (how many times the leaf occurs). The @@ -432,7 +443,6 @@ typedef std::pair<Value*, APInt> RepeatedValue; /// that have all uses inside the expression (i.e. only used by non-leaf nodes /// of the expression) if it can turn them into binary operators of the right /// type and thus make the expression bigger. - static bool LinearizeExprTree(BinaryOperator *I, SmallVectorImpl<RepeatedValue> &Ops) { DEBUG(dbgs() << "LINEARIZE: " << *I << '\n'); @@ -470,12 +480,12 @@ static bool LinearizeExprTree(BinaryOperator *I, // Leaves - Keeps track of the set of putative leaves as well as the number of // paths to each leaf seen so far. - typedef DenseMap<Value*, APInt> LeafMap; + using LeafMap = DenseMap<Value *, APInt>; LeafMap Leaves; // Leaf -> Total weight so far. - SmallVector<Value*, 8> LeafOrder; // Ensure deterministic leaf output order. + SmallVector<Value *, 8> LeafOrder; // Ensure deterministic leaf output order. #ifndef NDEBUG - SmallPtrSet<Value*, 8> Visited; // For sanity checking the iteration scheme. + SmallPtrSet<Value *, 8> Visited; // For sanity checking the iteration scheme. #endif while (!Worklist.empty()) { std::pair<BinaryOperator*, APInt> P = Worklist.pop_back_val(); @@ -554,7 +564,7 @@ static bool LinearizeExprTree(BinaryOperator *I, assert((!isa<Instruction>(Op) || cast<Instruction>(Op)->getOpcode() != Opcode || (isa<FPMathOperator>(Op) && - !cast<Instruction>(Op)->hasUnsafeAlgebra())) && + !cast<Instruction>(Op)->isFast())) && "Should have been handled above!"); assert(Op->hasOneUse() && "Has uses outside the expression tree!"); @@ -773,7 +783,7 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I, break; ExpressionChanged->moveBefore(I); ExpressionChanged = cast<BinaryOperator>(*ExpressionChanged->user_begin()); - } while (1); + } while (true); // Throw away any left over nodes from the original expression. for (unsigned i = 0, e = NodesToRewrite.size(); i != e; ++i) @@ -789,13 +799,9 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I, /// additional opportunities have been exposed. 
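// --- Editorial sketch (annotation, not part of the patch) -------------------
// The rank scheme from BuildRankMap/canonicalizeOperands above, in simplified
// terms: constants effectively rank lowest, arguments get small distinct
// ranks, and instructions inherit block ranks that grow in RPO (BBRank is the
// block counter shifted left by 16). canonicalizeOperands then swaps a
// commutative op's operands so the lower-ranked value sits on the right:
//
//   %t = add i32 5, %x   ; constant on the left
//   ; canonicalizes to: %t = add i32 %x, 5
//
// so later matching can treat "x + 5" and "5 + x" identically.
// --- End sketch --------------------------------------------------------------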
static Value *NegateValue(Value *V, Instruction *BI, SetVector<AssertingVH<Instruction>> &ToRedo) { - if (Constant *C = dyn_cast<Constant>(V)) { - if (C->getType()->isFPOrFPVectorTy()) { - return ConstantExpr::getFNeg(C); - } - return ConstantExpr::getNeg(C); - } - + if (auto *C = dyn_cast<Constant>(V)) + return C->getType()->isFPOrFPVectorTy() ? ConstantExpr::getFNeg(C) : + ConstantExpr::getNeg(C); // We are trying to expose opportunity for reassociation. One of the things // that we want to do to achieve this is to push a negation as deep into an @@ -913,7 +919,6 @@ BreakUpSubtract(Instruction *Sub, SetVector<AssertingVH<Instruction>> &ToRedo) { // // Calculate the negative value of Operand 1 of the sub instruction, // and set it as the RHS of the add instruction we just made. - // Value *NegVal = NegateValue(Sub->getOperand(1), Sub, ToRedo); BinaryOperator *New = CreateAdd(Sub->getOperand(0), NegVal, "", Sub, Sub); Sub->setOperand(0, Constant::getNullValue(Sub->getType())); // Drop use of op. @@ -990,7 +995,7 @@ static Value *EmitAddTreeOfValues(Instruction *I, Value *V1 = Ops.back(); Ops.pop_back(); Value *V2 = EmitAddTreeOfValues(I, Ops); - return CreateAdd(V2, V1, "tmp", I, I); + return CreateAdd(V2, V1, "reass.add", I, I); } /// If V is an expression tree that is a multiplication sequence, @@ -1157,7 +1162,6 @@ static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd, // If it was successful, true is returned, and the "R" and "C" is returned // via "Res" and "ConstOpnd", respectively; otherwise, false is returned, // and both "Res" and "ConstOpnd" remain unchanged. -// bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, APInt &ConstOpnd, Value *&Res) { // Xor-Rule 1: (x | c1) ^ c2 = (x | c1) ^ (c1 ^ c1) ^ c2 @@ -1183,7 +1187,6 @@ bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, RedoInsts.insert(T); return true; } - // Helper function of OptimizeXor(). It tries to simplify // "Opnd1 ^ Opnd2 ^ ConstOpnd" into "R ^ C", where C would be 0, and R is a @@ -1230,7 +1233,6 @@ bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, Res = createAndInstr(I, X, C3); ConstOpnd ^= C1; - } else if (Opnd1->isOrExpr()) { // Xor-Rule 3: (x | c1) ^ (x | c2) = (x & c3) ^ c3 where c3 = c1 ^ c2 // @@ -1349,7 +1351,6 @@ Value *ReassociatePass::OptimizeXor(Instruction *I, // step 3.2: When previous and current operands share the same symbolic // value, try to simplify "PrevOpnd ^ CurrOpnd ^ ConstOpnd" - // if (CombineXorOpnd(I, CurrOpnd, PrevOpnd, ConstOpnd, CV)) { // Remove previous operand PrevOpnd->Invalidate(); @@ -1601,7 +1602,7 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I, RedoInsts.insert(VI); // Create the multiply. - Instruction *V2 = CreateMul(V, MaxOccVal, "tmp", I, I); + Instruction *V2 = CreateMul(V, MaxOccVal, "reass.mul", I, I); // Rerun associate on the multiply in case the inner expression turned into // a multiply. We want to make sure that we keep things in canonical form. @@ -2012,8 +2013,8 @@ void ReassociatePass::OptimizeInst(Instruction *I) { if (I->isCommutative()) canonicalizeOperands(I); - // Don't optimize floating point instructions that don't have unsafe algebra. - if (I->getType()->isFPOrFPVectorTy() && !I->hasUnsafeAlgebra()) + // Don't optimize floating-point instructions unless they are 'fast'. + if (I->getType()->isFPOrFPVectorTy() && !I->isFast()) return; // Do not reassociate boolean (i1) expressions. 
We want to preserve the @@ -2140,7 +2141,8 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) { DEBUG(dbgs() << "Reassoc to scalar: " << *V << '\n'); I->replaceAllUsesWith(V); if (Instruction *VI = dyn_cast<Instruction>(V)) - VI->setDebugLoc(I->getDebugLoc()); + if (I->getDebugLoc()) + VI->setDebugLoc(I->getDebugLoc()); RedoInsts.insert(I); ++NumAnnihil; return; @@ -2183,11 +2185,104 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) { return; } + if (Ops.size() > 2 && Ops.size() <= GlobalReassociateLimit) { + // Find the pair with the highest count in the pairmap and move it to the + // back of the list so that it can later be CSE'd. + // example: + // a*b*c*d*e + // if c*e is the most "popular" pair, we can express this as + // (((c*e)*d)*b)*a + unsigned Max = 1; + unsigned BestRank = 0; + std::pair<unsigned, unsigned> BestPair; + unsigned Idx = I->getOpcode() - Instruction::BinaryOpsBegin; + for (unsigned i = 0; i < Ops.size() - 1; ++i) + for (unsigned j = i + 1; j < Ops.size(); ++j) { + unsigned Score = 0; + Value *Op0 = Ops[i].Op; + Value *Op1 = Ops[j].Op; + if (std::less<Value *>()(Op1, Op0)) + std::swap(Op0, Op1); + auto it = PairMap[Idx].find({Op0, Op1}); + if (it != PairMap[Idx].end()) + Score += it->second; + + unsigned MaxRank = std::max(Ops[i].Rank, Ops[j].Rank); + if (Score > Max || (Score == Max && MaxRank < BestRank)) { + BestPair = {i, j}; + Max = Score; + BestRank = MaxRank; + } + } + if (Max > 1) { + auto Op0 = Ops[BestPair.first]; + auto Op1 = Ops[BestPair.second]; + Ops.erase(&Ops[BestPair.second]); + Ops.erase(&Ops[BestPair.first]); + Ops.push_back(Op0); + Ops.push_back(Op1); + } + } // Now that we ordered and optimized the expressions, splat them back into // the expression tree, removing any unneeded nodes. RewriteExprTree(I, Ops); } +void +ReassociatePass::BuildPairMap(ReversePostOrderTraversal<Function *> &RPOT) { + // Make a "pairmap" of how often each operand pair occurs. + for (BasicBlock *BI : RPOT) { + for (Instruction &I : *BI) { + if (!I.isAssociative()) + continue; + + // Ignore nodes that aren't at the root of trees. + if (I.hasOneUse() && I.user_back()->getOpcode() == I.getOpcode()) + continue; + + // Collect all operands in a single reassociable expression. + // Since Reassociate has already been run once, we can assume things + // are already canonical according to Reassociation's regime. + SmallVector<Value *, 8> Worklist = { I.getOperand(0), I.getOperand(1) }; + SmallVector<Value *, 8> Ops; + while (!Worklist.empty() && Ops.size() <= GlobalReassociateLimit) { + Value *Op = Worklist.pop_back_val(); + Instruction *OpI = dyn_cast<Instruction>(Op); + if (!OpI || OpI->getOpcode() != I.getOpcode() || !OpI->hasOneUse()) { + Ops.push_back(Op); + continue; + } + // Be paranoid about self-referencing expressions in unreachable code. + if (OpI->getOperand(0) != OpI) + Worklist.push_back(OpI->getOperand(0)); + if (OpI->getOperand(1) != OpI) + Worklist.push_back(OpI->getOperand(1)); + } + // Skip extremely long expressions. + if (Ops.size() > GlobalReassociateLimit) + continue; + + // Add all pairwise combinations of operands to the pair map. + unsigned BinaryIdx = I.getOpcode() - Instruction::BinaryOpsBegin; + SmallSet<std::pair<Value *, Value*>, 32> Visited; + for (unsigned i = 0; i < Ops.size() - 1; ++i) { + for (unsigned j = i + 1; j < Ops.size(); ++j) { + // Canonicalize operand orderings. 
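// --- Editorial sketch (annotation, not part of the patch) -------------------
// The pair-counting idea behind BuildPairMap and the BestPair search above, as
// a free-standing sketch with simplified types (string operand names instead
// of Value pointers, and without the Visited dedup the patch performs):
#include <algorithm>
#include <cstddef>
#include <map>
#include <string>
#include <utility>
#include <vector>
using PairCountSketch = std::map<std::pair<std::string, std::string>, unsigned>;
void countPairsSketch(const std::vector<std::string> &Ops, PairCountSketch &PM) {
  for (std::size_t i = 0; i + 1 < Ops.size(); ++i)
    for (std::size_t j = i + 1; j < Ops.size(); ++j) {
      auto P = std::minmax(Ops[i], Ops[j]); // canonical order, as in the patch
      ++PM[{P.first, P.second}];            // popular pairs get high counts
    }
}
// --- End sketch --------------------------------------------------------------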
+ Value *Op0 = Ops[i]; + Value *Op1 = Ops[j]; + if (std::less<Value *>()(Op1, Op0)) + std::swap(Op0, Op1); + if (!Visited.insert({Op0, Op1}).second) + continue; + auto res = PairMap[BinaryIdx].insert({{Op0, Op1}, 1}); + if (!res.second) + ++res.first->second; + } + } + } + } +} + PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) { // Get the function's basic blocks in Reverse Post Order. This order is used by // BuildRankMap to pre-calculate ranks correctly. It also excludes dead basic @@ -2198,8 +2293,20 @@ PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) { // Calculate the rank map for F. BuildRankMap(F, RPOT); + // Build the pair map before running reassociate. + // Technically this would be more accurate if we did it after one round + // of reassociation, but in practice it doesn't seem to help much on + // real-world code, so don't waste the compile time running reassociate + // twice. + // If a user wants, they could explicitly run reassociate twice in their + // pass pipeline for further potential gains. + // It might also be possible to update the pair map during runtime, but the + // overhead of that may be large if there are many reassociable chains. + BuildPairMap(RPOT); + MadeChange = false; - // Traverse the same blocks that was analysed by BuildRankMap. + + // Traverse the same blocks that were analysed by BuildRankMap. for (BasicBlock *BI : RPOT) { assert(RankMap.count(&*BI) && "BB should be ranked."); // Optimize every instruction in the basic block. @@ -2238,9 +2345,11 @@ PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) { } } - // We are done with the rank map. + // We are done with the rank map and pair map. RankMap.clear(); ValueRankMap.clear(); + for (auto &Entry : PairMap) + Entry.clear(); if (MadeChange) { PreservedAnalyses PA; @@ -2253,10 +2362,13 @@ PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) { } namespace { + class ReassociateLegacyPass : public FunctionPass { ReassociatePass Impl; + public: static char ID; // Pass identification, replacement for typeid + ReassociateLegacyPass() : FunctionPass(ID) { initializeReassociateLegacyPassPass(*PassRegistry::getPassRegistry()); } @@ -2275,9 +2387,11 @@ namespace { AU.addPreserved<GlobalsAAWrapperPass>(); } }; -} + +} // end anonymous namespace char ReassociateLegacyPass::ID = 0; + INITIALIZE_PASS(ReassociateLegacyPass, "reassociate", "Reassociate expressions", false, false) diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index f19d45329d23..3b45cfa482e6 100644 --- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -12,36 +12,69 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Scalar/RewriteStatepointsForGC.h" + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/MapVector.h" -#include "llvm/ADT/SetOperations.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" -#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" -#include "llvm/Analysis/CFG.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallSite.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/Statepoint.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" #include "llvm/IR/Value.h" -#include "llvm/IR/Verifier.h" +#include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <iterator> +#include <set> +#include <string> +#include <utility> +#include <vector> #define DEBUG_TYPE "rewrite-statepoints-for-gc" @@ -52,6 +85,7 @@ static cl::opt<bool> PrintLiveSet("spp-print-liveset", cl::Hidden, cl::init(false)); static cl::opt<bool> PrintLiveSetSize("spp-print-liveset-size", cl::Hidden, cl::init(false)); + // Print out the base pointers for debugging static cl::opt<bool> PrintBasePointers("spp-print-base-pointers", cl::Hidden, cl::init(false)); @@ -67,6 +101,7 @@ static bool ClobberNonLive = true; #else static bool ClobberNonLive = false; #endif + static cl::opt<bool, true> ClobberNonLiveOverride("rs4gc-clobber-non-live", cl::location(ClobberNonLive), cl::Hidden); @@ -75,27 +110,96 @@ static cl::opt<bool> AllowStatepointWithNoDeoptInfo("rs4gc-allow-statepoint-with-no-deopt-info", cl::Hidden, cl::init(true)); +/// The IR fed into RewriteStatepointsForGC may have had attributes and +/// metadata implying dereferenceability that are no longer valid/correct after +/// RewriteStatepointsForGC has run. This is because semantically, after +/// RewriteStatepointsForGC runs, all calls to gc.statepoint "free" the entire +/// heap. stripNonValidData (conservatively) restores +/// correctness by erasing all attributes in the module that externally imply +/// dereferenceability. Similar reasoning also applies to the noalias +/// attributes and metadata. gc.statepoint can touch the entire heap including +/// noalias objects. +/// Apart from attributes and metadata, we also remove instructions that imply +/// constant physical memory: llvm.invariant.start. +static void stripNonValidData(Module &M); + +static bool shouldRewriteStatepointsIn(Function &F); + +PreservedAnalyses RewriteStatepointsForGC::run(Module &M, + ModuleAnalysisManager &AM) { + bool Changed = false; + auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + for (Function &F : M) { + // Nothing to do for declarations. + if (F.isDeclaration() || F.empty()) + continue; + + // Policy choice says not to rewrite - the most common reason is that we're + // compiling code without a GCStrategy. 
+ if (!shouldRewriteStatepointsIn(F)) + continue; + + auto &DT = FAM.getResult<DominatorTreeAnalysis>(F); + auto &TTI = FAM.getResult<TargetIRAnalysis>(F); + auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F); + Changed |= runOnFunction(F, DT, TTI, TLI); + } + if (!Changed) + return PreservedAnalyses::all(); + + // stripNonValidData asserts that shouldRewriteStatepointsIn + // returns true for at least one function in the module. Since at least + // one function changed, we know that the precondition is satisfied. + stripNonValidData(M); + + PreservedAnalyses PA; + PA.preserve<TargetIRAnalysis>(); + PA.preserve<TargetLibraryAnalysis>(); + return PA; +} + namespace { -struct RewriteStatepointsForGC : public ModulePass { + +class RewriteStatepointsForGCLegacyPass : public ModulePass { + RewriteStatepointsForGC Impl; + +public: static char ID; // Pass identification, replacement for typeid - RewriteStatepointsForGC() : ModulePass(ID) { - initializeRewriteStatepointsForGCPass(*PassRegistry::getPassRegistry()); + RewriteStatepointsForGCLegacyPass() : ModulePass(ID), Impl() { + initializeRewriteStatepointsForGCLegacyPassPass( + *PassRegistry::getPassRegistry()); } - bool runOnFunction(Function &F); + bool runOnModule(Module &M) override { bool Changed = false; - for (Function &F : M) - Changed |= runOnFunction(F); + const TargetLibraryInfo &TLI = + getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + for (Function &F : M) { + // Nothing to do for declarations. + if (F.isDeclaration() || F.empty()) + continue; + + // Policy choice says not to rewrite - the most common reason is that + // we're compiling code without a GCStrategy. + if (!shouldRewriteStatepointsIn(F)) + continue; - if (Changed) { - // stripNonValidAttributesAndMetadata asserts that shouldRewriteStatepointsIn - // returns true for at least one function in the module. Since at least - // one function changed, we know that the precondition is satisfied. - stripNonValidAttributesAndMetadata(M); + TargetTransformInfo &TTI = + getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + auto &DT = getAnalysis<DominatorTreeWrapperPass>(F).getDomTree(); + + Changed |= Impl.runOnFunction(F, DT, TTI, TLI); } - return Changed; + if (!Changed) + return false; + + // stripNonValidData asserts that shouldRewriteStatepointsIn + // returns true for at least one function in the module. Since at least + // one function changed, we know that the precondition is satisfied. + stripNonValidData(M); + return true; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -103,46 +207,33 @@ struct RewriteStatepointsForGC : public ModulePass { // else. We could in theory preserve a lot more analyses here. AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } - - /// The IR fed into RewriteStatepointsForGC may have had attributes and - /// metadata implying dereferenceability that are no longer valid/correct after - /// RewriteStatepointsForGC has run. This is because semantically, after - /// RewriteStatepointsForGC runs, all calls to gc.statepoint "free" the entire - /// heap. stripNonValidAttributesAndMetadata (conservatively) restores - /// correctness by erasing all attributes in the module that externally imply - /// dereferenceability. Similar reasoning also applies to the noalias - /// attributes and metadata. gc.statepoint can touch the entire heap including - /// noalias objects. 
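The rationale in the comment above is easiest to see with a moving collector in mind: any raw address, and any fact attached to it, can be invalidated by a statepoint. A toy simulation of that hazard, with a two-semispace array standing in for the GC heap (illustrative only, no LLVM API involved):

```cpp
#include <cstdio>
#include <cstring>

// A moving collector relocates objects at safepoints, so a raw address
// cached across one goes stale. Two fixed buffers act as semispaces here;
// everything is a stand-in, not LLVM code.
static char Heap[2][16];
static int Live = 0; // index of the semispace currently holding the object

static char *safepoint(char *Obj) {     // stands in for a gc.statepoint
  std::memcpy(Heap[1 - Live], Obj, 16); // the collector moves the object
  Live = 1 - Live;
  return Heap[Live];                    // the "gc.relocate" result
}

int main() {
  std::strcpy(Heap[0], "payload");
  char *Cached = Heap[Live];       // fact established before the call
  char *Reloc = safepoint(Cached); // object moves; new address returned
  std::printf("cached == relocated? %d\n", Cached == Reloc); // 0: stale
  std::printf("through relocated: %s\n", Reloc);             // "payload"
}
```

This is exactly why only the gc.relocate result may be used after the call, and why dereferenceability and noalias facts established before it must be dropped.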
- void stripNonValidAttributesAndMetadata(Module &M); - - // Helpers for stripNonValidAttributesAndMetadata - void stripNonValidAttributesAndMetadataFromBody(Function &F); - void stripNonValidAttributesFromPrototype(Function &F); - // Certain metadata on instructions are invalid after running RS4GC. - // Optimizations that run after RS4GC can incorrectly use this metadata to - // optimize functions. We drop such metadata on the instruction. - void stripInvalidMetadataFromInstruction(Instruction &I); }; -} // namespace -char RewriteStatepointsForGC::ID = 0; +} // end anonymous namespace -ModulePass *llvm::createRewriteStatepointsForGCPass() { - return new RewriteStatepointsForGC(); +char RewriteStatepointsForGCLegacyPass::ID = 0; + +ModulePass *llvm::createRewriteStatepointsForGCLegacyPass() { + return new RewriteStatepointsForGCLegacyPass(); } -INITIALIZE_PASS_BEGIN(RewriteStatepointsForGC, "rewrite-statepoints-for-gc", +INITIALIZE_PASS_BEGIN(RewriteStatepointsForGCLegacyPass, + "rewrite-statepoints-for-gc", "Make relocations explicit at statepoints", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_END(RewriteStatepointsForGC, "rewrite-statepoints-for-gc", +INITIALIZE_PASS_END(RewriteStatepointsForGCLegacyPass, + "rewrite-statepoints-for-gc", "Make relocations explicit at statepoints", false, false) namespace { + struct GCPtrLivenessData { /// Values defined in this block. MapVector<BasicBlock *, SetVector<Value *>> KillSet; + /// Values used in this block (and thus live); does not include values /// killed within this block. MapVector<BasicBlock *, SetVector<Value *>> LiveSet; @@ -166,10 +257,10 @@ struct GCPtrLivenessData { // Generally, after the execution of a full findBasePointer call, only the // base relation will remain. Internally, we add a mixture of the two // types, then update all the second type to the first type -typedef MapVector<Value *, Value *> DefiningValueMapTy; -typedef SetVector<Value *> StatepointLiveSetTy; -typedef MapVector<AssertingVH<Instruction>, AssertingVH<Value>> - RematerializedValueMapTy; +using DefiningValueMapTy = MapVector<Value *, Value *>; +using StatepointLiveSetTy = SetVector<Value *>; +using RematerializedValueMapTy = + MapVector<AssertingVH<Instruction>, AssertingVH<Value>>; struct PartiallyConstructedSafepointRecord { /// The set of values known to be live across this safepoint @@ -191,7 +282,8 @@ struct PartiallyConstructedSafepointRecord { /// Maps rematerialized copy to its original value. RematerializedValueMapTy RematerializedValues; }; -} + +} // end anonymous namespace static ArrayRef<Use> GetDeoptBundleOperands(ImmutableCallSite CS) { Optional<OperandBundleUse> DeoptBundle = @@ -254,7 +346,7 @@ static bool containsGCPtrType(Type *Ty) { if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) return containsGCPtrType(AT->getElementType()); if (StructType *ST = dyn_cast<StructType>(Ty)) - return any_of(ST->subtypes(), containsGCPtrType); + return llvm::any_of(ST->subtypes(), containsGCPtrType); return false; } @@ -299,7 +391,9 @@ analyzeParsePointLiveness(DominatorTree &DT, } static bool isKnownBaseResult(Value *V); + namespace { + /// A single base defining value - An immediate base defining value for an /// instruction 'Def' is an input to 'Def' whose base is also a base of 'Def'.
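KillSet and LiveSet above are the classic def/use inputs to backward liveness: a value is live into a block if it is used there, or live out of it and not defined there. A self-contained sketch of that fixed-point computation, with made-up block and value names (the pass performs the analogous computation over SetVectors of gc pointers):

```cpp
#include <cstdio>
#include <set>
#include <string>
#include <vector>

// Backward liveness: LiveIn is Use plus (LiveOut minus Kill); LiveOut is the
// union of the successors' LiveIn sets.
struct Block {
  std::set<std::string> Use, Kill; // LiveSet / KillSet roles in the pass
  std::vector<int> Succs;
  std::set<std::string> LiveIn, LiveOut;
};

int main() {
  // B0 defines p; B1 uses p and defines q; B2 uses q.
  std::vector<Block> CFG(3);
  CFG[0].Kill = {"p"};
  CFG[0].Succs = {1};
  CFG[1].Use = {"p"};
  CFG[1].Kill = {"q"};
  CFG[1].Succs = {2};
  CFG[2].Use = {"q"};

  for (bool Changed = true; Changed;) { // iterate to a fixed point
    Changed = false;
    for (int i = 2; i >= 0; --i) {      // reverse order converges fast here
      Block &B = CFG[i];
      std::set<std::string> Out;
      for (int S : B.Succs)
        Out.insert(CFG[S].LiveIn.begin(), CFG[S].LiveIn.end());
      std::set<std::string> In = B.Use;
      for (const std::string &V : Out)
        if (!B.Kill.count(V))
          In.insert(V);
      Changed |= (In != B.LiveIn) || (Out != B.LiveOut);
      B.LiveIn = std::move(In);
      B.LiveOut = std::move(Out);
    }
  }
  // Expect: B0 live-in 0 values, B1 live-in 1 (p), B2 live-in 1 (q).
  for (int i = 0; i < 3; ++i)
    std::printf("B%d live-in: %zu\n", i, CFG[i].LiveIn.size());
}
```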
/// For instructions which have multiple pointer [vector] inputs or that @@ -311,9 +405,11 @@ namespace { struct BaseDefiningValueResult { /// Contains the value which is the base defining value. Value * const BDV; + /// True if the base defining value is also known to be an actual base /// pointer. const bool IsKnownBase; + BaseDefiningValueResult(Value *BDV, bool IsKnownBase) : BDV(BDV), IsKnownBase(IsKnownBase) { #ifndef NDEBUG @@ -324,7 +420,8 @@ struct BaseDefiningValueResult { #endif } }; -} + +} // end anonymous namespace static BaseDefiningValueResult findBaseDefiningValue(Value *I); @@ -374,6 +471,11 @@ findBaseDefiningValueOfVector(Value *I) { if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) return findBaseDefiningValue(GEP->getPointerOperand()); + // If the pointer comes through a bitcast of a vector of pointers to + // a vector of another type of pointer, then look through the bitcast + if (auto *BC = dyn_cast<BitCastInst>(I)) + return findBaseDefiningValue(BC->getOperand(0)); + // A PHI or Select is a base defining value. The outer findBasePointer // algorithm is responsible for constructing a base value for this BDV. assert((isa<SelectInst>(I) || isa<PHINode>(I)) && @@ -429,7 +531,6 @@ static BaseDefiningValueResult findBaseDefiningValue(Value *I) { if (isa<LoadInst>(I)) // The value loaded is an gc base itself return BaseDefiningValueResult(I, true); - if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) // The base of this GEP is the base @@ -442,12 +543,11 @@ static BaseDefiningValueResult findBaseDefiningValue(Value *I) { break; case Intrinsic::experimental_gc_statepoint: llvm_unreachable("statepoints don't produce pointers"); - case Intrinsic::experimental_gc_relocate: { + case Intrinsic::experimental_gc_relocate: // Rerunning safepoint insertion after safepoints are already // inserted is not supported. It could probably be made to work, // but why are you doing this? There's no good reason. llvm_unreachable("repeat safepoint insertion is not supported"); - } case Intrinsic::gcroot: // Currently, this mechanism hasn't been extended to work with gcroot. // There's no reason it couldn't be, but I haven't thought about the @@ -551,6 +651,7 @@ static bool isKnownBaseResult(Value *V) { } namespace { + /// Models the state of a single base defining value in the findBasePointer /// algorithm for determining where a new instruction is needed to propagate /// the base of this BDV. @@ -558,7 +659,7 @@ class BDVState { public: enum Status { Unknown, Base, Conflict }; - BDVState() : Status(Unknown), BaseValue(nullptr) {} + BDVState() : BaseValue(nullptr) {} explicit BDVState(Status Status, Value *BaseValue = nullptr) : Status(Status), BaseValue(BaseValue) { @@ -597,16 +698,17 @@ public: case Conflict: OS << "C"; break; - }; + } OS << " (" << getBaseValue() << " - " << (getBaseValue() ? getBaseValue()->getName() : "nullptr") << "): "; } private: - Status Status; + Status Status = Unknown; AssertingVH<Value> BaseValue; // Non-null only if Status == Base. 
}; -} + +} // end anonymous namespace #ifndef NDEBUG static raw_ostream &operator<<(raw_ostream &OS, const BDVState &State) { @@ -1169,7 +1271,7 @@ static void CreateGCRelocates(ArrayRef<Value *> LiveVariables, return; auto FindIndex = [](ArrayRef<Value *> LiveVec, Value *Val) { - auto ValIt = find(LiveVec, Val); + auto ValIt = llvm::find(LiveVec, Val); assert(ValIt != LiveVec.end() && "Val not found in LiveVec!"); size_t Index = std::distance(LiveVec.begin(), ValIt); assert(Index < LiveVec.size() && "Bug in std::find?"); @@ -1229,7 +1331,7 @@ class DeferredReplacement { AssertingVH<Instruction> New; bool IsDeoptimize = false; - DeferredReplacement() {} + DeferredReplacement() = default; public: static DeferredReplacement createRAUW(Instruction *Old, Instruction *New) { @@ -1286,7 +1388,8 @@ public: OldI->eraseFromParent(); } }; -} + +} // end anonymous namespace static StringRef getDeoptLowering(CallSite CS) { const char *DeoptLowering = "deopt-lowering"; @@ -1304,7 +1407,6 @@ static StringRef getDeoptLowering(CallSite CS) { return "live-through"; } - static void makeStatepointExplicitImpl(const CallSite CS, /* to replace */ const SmallVectorImpl<Value *> &BasePtrs, @@ -1528,7 +1630,6 @@ static void insertRelocationStores(iterator_range<Value::user_iterator> GCRelocs, DenseMap<Value *, Value *> &AllocaMap, DenseSet<Value *> &VisitedLiveValues) { - for (User *U : GCRelocs) { GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(U); if (!Relocate) @@ -1564,7 +1665,6 @@ static void insertRematerializationStores( const RematerializedValueMapTy &RematerializedValues, DenseMap<Value *, Value *> &AllocaMap, DenseSet<Value *> &VisitedLiveValues) { - for (auto RematerializedValuePair: RematerializedValues) { Instruction *RematerializedValue = RematerializedValuePair.first; Value *OriginalValue = RematerializedValuePair.second; @@ -1830,7 +1930,6 @@ static void findLiveReferences( static Value* findRematerializableChainToBasePointer( SmallVectorImpl<Instruction*> &ChainToBase, Value *CurrentValue) { - if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(CurrentValue)) { ChainToBase.push_back(GEP); return findRematerializableChainToBasePointer(ChainToBase, @@ -1886,7 +1985,6 @@ chainToBasePointerCost(SmallVectorImpl<Instruction*> &Chain, } static bool AreEquivalentPhiNodes(PHINode &OrigRootPhi, PHINode &AlternateRootPhi) { - unsigned PhiNum = OrigRootPhi.getNumIncomingValues(); if (PhiNum != AlternateRootPhi.getNumIncomingValues() || OrigRootPhi.getParent() != AlternateRootPhi.getParent()) @@ -1910,7 +2008,6 @@ static bool AreEquivalentPhiNodes(PHINode &OrigRootPhi, PHINode &AlternateRootPh return false; } return true; - } // From the statepoint live set pick values that are cheaper to recompute then @@ -2297,8 +2394,7 @@ static void RemoveNonValidAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH, AH.setAttributes(AH.getAttributes().removeAttributes(Ctx, Index, R)); } -void -RewriteStatepointsForGC::stripNonValidAttributesFromPrototype(Function &F) { +static void stripNonValidAttributesFromPrototype(Function &F) { LLVMContext &Ctx = F.getContext(); for (Argument &A : F.args()) @@ -2310,8 +2406,10 @@ RewriteStatepointsForGC::stripNonValidAttributesFromPrototype(Function &F) { RemoveNonValidAttrAtIndex(Ctx, F, AttributeList::ReturnIndex); } -void RewriteStatepointsForGC::stripInvalidMetadataFromInstruction(Instruction &I) { - +/// Certain metadata on instructions are invalid after running RS4GC. +/// Optimizations that run after RS4GC can incorrectly use this metadata to +/// optimize functions. 
We drop such metadata on the instruction. +static void stripInvalidMetadataFromInstruction(Instruction &I) { if (!isa<LoadInst>(I) && !isa<StoreInst>(I)) return; // These are the attributes that are still valid on loads and stores after @@ -2337,18 +2435,32 @@ void RewriteStatepointsForGC::stripInvalidMetadataFromInstruction(Instruction &I // Drops all metadata on the instruction other than ValidMetadataAfterRS4GC. I.dropUnknownNonDebugMetadata(ValidMetadataAfterRS4GC); - } -void RewriteStatepointsForGC::stripNonValidAttributesAndMetadataFromBody(Function &F) { +static void stripNonValidDataFromBody(Function &F) { if (F.empty()) return; LLVMContext &Ctx = F.getContext(); MDBuilder Builder(Ctx); + // Set of invariantstart instructions that we need to remove. + // Use this to avoid invalidating the instruction iterator. + SmallVector<IntrinsicInst*, 12> InvariantStartInstructions; for (Instruction &I : instructions(F)) { + // invariant.start on memory location implies that the referenced memory + // location is constant and unchanging. This is no longer true after + // RewriteStatepointsForGC runs because there can be calls to gc.statepoint + // which frees the entire heap and the presence of invariant.start allows + // the optimizer to sink the load of a memory location past a statepoint, + // which is incorrect. + if (auto *II = dyn_cast<IntrinsicInst>(&I)) + if (II->getIntrinsicID() == Intrinsic::invariant_start) { + InvariantStartInstructions.push_back(II); + continue; + } + if (const MDNode *MD = I.getMetadata(LLVMContext::MD_tbaa)) { assert(MD->getNumOperands() < 5 && "unrecognized metadata shape!"); bool IsImmutableTBAA = @@ -2378,6 +2490,12 @@ void RewriteStatepointsForGC::stripNonValidAttributesAndMetadataFromBody(Functio RemoveNonValidAttrAtIndex(Ctx, CS, AttributeList::ReturnIndex); } } + + // Delete the invariant.start instructions and RAUW undef. + for (auto *II : InvariantStartInstructions) { + II->replaceAllUsesWith(UndefValue::get(II->getType())); + II->eraseFromParent(); + } } /// Returns true if this function should be rewritten by this pass. The main @@ -2394,35 +2512,28 @@ static bool shouldRewriteStatepointsIn(Function &F) { return false; } -void RewriteStatepointsForGC::stripNonValidAttributesAndMetadata(Module &M) { +static void stripNonValidData(Module &M) { #ifndef NDEBUG - assert(any_of(M, shouldRewriteStatepointsIn) && "precondition!"); + assert(llvm::any_of(M, shouldRewriteStatepointsIn) && "precondition!"); #endif for (Function &F : M) stripNonValidAttributesFromPrototype(F); for (Function &F : M) - stripNonValidAttributesAndMetadataFromBody(F); + stripNonValidDataFromBody(F); } -bool RewriteStatepointsForGC::runOnFunction(Function &F) { - // Nothing to do for declarations. - if (F.isDeclaration() || F.empty()) - return false; - - // Policy choice says not to rewrite - the most common reason is that we're - // compiling code without a GCStrategy. 
- if (!shouldRewriteStatepointsIn(F)) - return false; - - DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(F).getDomTree(); - TargetTransformInfo &TTI = - getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); +bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT, + TargetTransformInfo &TTI, + const TargetLibraryInfo &TLI) { + assert(!F.isDeclaration() && !F.empty() && + "need function body to rewrite statepoints in"); + assert(shouldRewriteStatepointsIn(F) && "mismatch in rewrite decision"); - auto NeedsRewrite = [](Instruction &I) { + auto NeedsRewrite = [&TLI](Instruction &I) { if (ImmutableCallSite CS = ImmutableCallSite(&I)) - return !callsGCLeafFunction(CS) && !isStatepoint(CS); + return !callsGCLeafFunction(CS, TLI) && !isStatepoint(CS); return false; }; @@ -2662,7 +2773,6 @@ static void computeLiveInValues(DominatorTree &DT, Function &F, static void findLiveSetAtInst(Instruction *Inst, GCPtrLivenessData &Data, StatepointLiveSetTy &Out) { - BasicBlock *BB = Inst->getParent(); // Note: The copy is intentional and required diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 4822cf7cce0f..e5866b4718da 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -18,30 +18,49 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/IPO/SCCP.h" +#include "llvm/Transforms/Scalar/SCCP.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/PointerIntPair.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ValueLattice.h" +#include "llvm/Analysis/ValueLatticeUtils.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallSite.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/InstVisitor.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" #include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Scalar/SCCP.h" #include "llvm/Transforms/Utils/Local.h" -#include <algorithm> +#include <cassert> +#include <utility> +#include <vector> + using namespace llvm; #define DEBUG_TYPE "sccp" @@ -52,8 +71,11 @@ STATISTIC(NumDeadBlocks , "Number of basic blocks unreachable"); STATISTIC(IPNumInstRemoved, "Number of instructions removed by IPSCCP"); STATISTIC(IPNumArgsElimed ,"Number of arguments constant propagated by IPSCCP"); STATISTIC(IPNumGlobalConst, "Number of globals found to be constant by IPSCCP"); +STATISTIC(IPNumRangeInfoUsed, "Number of times constant range info was used by " + "IPSCCP"); namespace { + /// LatticeVal class - This class represents the different lattice values that /// an LLVM value may occupy. It is a simple class with value semantics.
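To fix intuition for this lattice before the class body continues: values start unknown, may be lowered to one constant, and collapse to overdefined when two different constants meet. A minimal model of that transition logic (a sketch only; the real class also carries a forcedconstant state used during undef resolution):

```cpp
#include <cstdio>

// Three of LatticeVal's states and the merge rule between them. The solver
// only ever moves values down this lattice, which bounds its running time.
struct Lattice {
  enum State { Unknown, Constant, Overdefined } S = Unknown;
  int C = 0; // meaningful only when S == Constant

  // Returns true when the state changed (users would be re-enqueued).
  bool mergeConstant(int V) {
    if (S == Overdefined)
      return false; // already at the bottom; nothing to do
    if (S == Unknown) {
      S = Constant;
      C = V;
      return true;
    }
    if (C == V)
      return false; // merging the same constant: no change
    S = Overdefined; // two different constants conflict
    return true;
  }
};

int main() {
  Lattice L;
  bool Ch = L.mergeConstant(7);
  std::printf("changed=%d state=%d\n", Ch, L.S); // 1, state=1 (Constant)
  Ch = L.mergeConstant(7);
  std::printf("changed=%d state=%d\n", Ch, L.S); // 0, state=1
  Ch = L.mergeConstant(9);
  std::printf("changed=%d state=%d\n", Ch, L.S); // 1, state=2 (Overdefined)
}
```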
/// @@ -88,9 +110,11 @@ public: LatticeVal() : Val(nullptr, unknown) {} bool isUnknown() const { return getLatticeValue() == unknown; } + bool isConstant() const { return getLatticeValue() == constant || getLatticeValue() == forcedconstant; } + bool isOverdefined() const { return getLatticeValue() == overdefined; } Constant *getConstant() const { @@ -153,11 +177,15 @@ public: Val.setInt(forcedconstant); Val.setPointer(V); } -}; -} // end anonymous namespace. - -namespace { + ValueLatticeElement toValueLattice() const { + if (isOverdefined()) + return ValueLatticeElement::getOverdefined(); + if (isConstant()) + return ValueLatticeElement::get(getConstant()); + return ValueLatticeElement(); + } +}; //===----------------------------------------------------------------------===// // @@ -167,37 +195,38 @@ namespace { class SCCPSolver : public InstVisitor<SCCPSolver> { const DataLayout &DL; const TargetLibraryInfo *TLI; - SmallPtrSet<BasicBlock*, 8> BBExecutable; // The BBs that are executable. - DenseMap<Value*, LatticeVal> ValueState; // The state each value is in. + SmallPtrSet<BasicBlock *, 8> BBExecutable; // The BBs that are executable. + DenseMap<Value *, LatticeVal> ValueState; // The state each value is in. + // The state each parameter is in. + DenseMap<Value *, ValueLatticeElement> ParamState; /// StructValueState - This maintains ValueState for values that have /// StructType, for example for formal arguments, calls, insertelement, etc. - /// - DenseMap<std::pair<Value*, unsigned>, LatticeVal> StructValueState; + DenseMap<std::pair<Value *, unsigned>, LatticeVal> StructValueState; /// GlobalValue - If we are tracking any values for the contents of a global /// variable, we keep a mapping from the constant accessor to the element of /// the global, to the currently known value. If the value becomes /// overdefined, its entry is simply removed from this map. - DenseMap<GlobalVariable*, LatticeVal> TrackedGlobals; + DenseMap<GlobalVariable *, LatticeVal> TrackedGlobals; /// TrackedRetVals - If we are tracking arguments into and the return /// value out of a function, it will have an entry in this map, indicating /// what the known return value for the function is. - DenseMap<Function*, LatticeVal> TrackedRetVals; + DenseMap<Function *, LatticeVal> TrackedRetVals; /// TrackedMultipleRetVals - Same as TrackedRetVals, but used for functions /// that return multiple values. - DenseMap<std::pair<Function*, unsigned>, LatticeVal> TrackedMultipleRetVals; + DenseMap<std::pair<Function *, unsigned>, LatticeVal> TrackedMultipleRetVals; /// MRVFunctionsTracked - Each function in TrackedMultipleRetVals is /// represented here for efficient lookup. - SmallPtrSet<Function*, 16> MRVFunctionsTracked; + SmallPtrSet<Function *, 16> MRVFunctionsTracked; /// TrackingIncomingArguments - This is the set of functions whose /// arguments we make optimistic assumptions about and try to prove as /// constants. - SmallPtrSet<Function*, 16> TrackingIncomingArguments; + SmallPtrSet<Function *, 16> TrackingIncomingArguments; /// The reason for two worklists is that overdefined is the lowest state /// on the lattice, and moving things to overdefined as fast as possible @@ -206,16 +235,17 @@ class SCCPSolver : public InstVisitor<SCCPSolver> { /// By having a separate worklist, we accomplish this because everything /// possibly overdefined will become overdefined at the soonest possible /// point.
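The scheduling policy that comment describes can be modeled in a few lines: always drain the overdefined worklist before touching the ordinary one, so nothing is re-evaluated at an intermediate lattice state it is about to leave anyway. A sketch with strings standing in for Value pointers:

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Two worklists; the overdefined one always has priority.
int main() {
  std::vector<std::string> OverdefinedWL = {"x", "y"};
  std::vector<std::string> InstWL = {"a", "b", "c"};

  while (!OverdefinedWL.empty() || !InstWL.empty()) {
    while (!OverdefinedWL.empty()) {
      std::string V = OverdefinedWL.back();
      OverdefinedWL.pop_back();
      std::printf("revisit users of overdefined %s\n", V.c_str());
    }
    if (!InstWL.empty()) {
      std::string V = InstWL.back();
      InstWL.pop_back();
      std::printf("revisit users of updated %s\n", V.c_str());
      if (V == "b") // pretend one of b's users just went overdefined
        OverdefinedWL.push_back("b.user");
    }
  }
}
```

Note how "b.user" is processed before "a": anything that hits bottom jumps the queue, which is exactly the effect the comment above is after.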
- SmallVector<Value*, 64> OverdefinedInstWorkList; - SmallVector<Value*, 64> InstWorkList; - + SmallVector<Value *, 64> OverdefinedInstWorkList; + SmallVector<Value *, 64> InstWorkList; - SmallVector<BasicBlock*, 64> BBWorkList; // The BasicBlock work list + // The BasicBlock work list + SmallVector<BasicBlock *, 64> BBWorkList; /// KnownFeasibleEdges - Entries in this set are edges which have already had /// PHI nodes retriggered. - typedef std::pair<BasicBlock*, BasicBlock*> Edge; + using Edge = std::pair<BasicBlock *, BasicBlock *>; DenseSet<Edge> KnownFeasibleEdges; + public: SCCPSolver(const DataLayout &DL, const TargetLibraryInfo *tli) : DL(DL), TLI(tli) {} @@ -263,8 +293,13 @@ public: TrackingIncomingArguments.insert(F); } + /// Returns true if the given function is in the solver's set of + /// argument-tracked functions. + bool isArgumentTrackedFunction(Function *F) { + return TrackingIncomingArguments.count(F); + } + /// Solve - Solve for constants and executable blocks. - /// void Solve(); /// ResolvedUndefsIn - While solving the dataflow for a function, we assume @@ -290,14 +325,23 @@ public: return StructValues; } - LatticeVal getLatticeValueFor(Value *V) const { - DenseMap<Value*, LatticeVal>::const_iterator I = ValueState.find(V); - assert(I != ValueState.end() && "V is not in valuemap!"); - return I->second; + ValueLatticeElement getLatticeValueFor(Value *V) { + assert(!V->getType()->isStructTy() && + "Should use getStructLatticeValueFor"); + std::pair<DenseMap<Value*, ValueLatticeElement>::iterator, bool> + PI = ParamState.insert(std::make_pair(V, ValueLatticeElement())); + ValueLatticeElement &LV = PI.first->second; + if (PI.second) { + DenseMap<Value*, LatticeVal>::const_iterator I = ValueState.find(V); + assert(I != ValueState.end() && + "V not found in ValueState nor Paramstate map!"); + LV = I->second.toValueLattice(); + } + + return LV; } /// getTrackedRetVals - Get the inferred return value map. - /// const DenseMap<Function*, LatticeVal> &getTrackedRetVals() { return TrackedRetVals; } @@ -349,7 +393,6 @@ private: // markConstant - Make a value be marked as "constant". If the value // is not already a constant, add it to the instruction work list so that // the users of the instruction are updated later. - // void markConstant(LatticeVal &IV, Value *V, Constant *C) { if (!IV.markConstant(C)) return; DEBUG(dbgs() << "markConstant: " << *C << ": " << *V << '\n'); @@ -369,7 +412,6 @@ private: pushToWorkList(IV, V); } - // markOverdefined - Make a value be marked as "overdefined". If the // value is not already overdefined, add it to the overdefined instruction // work list so that the users of the instruction are updated later. @@ -402,7 +444,6 @@ private: mergeInValue(ValueState[V], V, MergeWithV); } - /// getValueState - Return the LatticeVal object that corresponds to the /// value. This function handles the case when the value hasn't been seen yet /// by properly seeding constants etc. @@ -426,6 +467,18 @@ private: return LV; } + ValueLatticeElement &getParamState(Value *V) { + assert(!V->getType()->isStructTy() && "Should use getStructValueState"); + + std::pair<DenseMap<Value*, ValueLatticeElement>::iterator, bool> + PI = ParamState.insert(std::make_pair(V, ValueLatticeElement())); + ValueLatticeElement &LV = PI.first->second; + if (PI.second) + LV = getValueState(V).toValueLattice(); + + return LV; + } + /// getStructValueState - Return the LatticeVal object that corresponds to the /// value/field pair. 
This function handles the case when the value hasn't /// been seen yet by properly seeding constants etc. @@ -457,7 +510,6 @@ private: return LV; } - /// markEdgeExecutable - Mark a basic block as executable, adding it to the BB /// work list if it is not already executable. void markEdgeExecutable(BasicBlock *Source, BasicBlock *Dest) { @@ -480,18 +532,15 @@ private: // getFeasibleSuccessors - Return a vector of booleans to indicate which // successors are reachable from a given terminator instruction. - // void getFeasibleSuccessors(TerminatorInst &TI, SmallVectorImpl<bool> &Succs); // isEdgeFeasible - Return true if the control flow edge from the 'From' basic // block to the 'To' basic block is currently feasible. - // bool isEdgeFeasible(BasicBlock *From, BasicBlock *To); // OperandChangedState - This method is invoked on all of the users of an // instruction that was just changed state somehow. Based on this // information, we need to update the specified user of this instruction. - // void OperandChangedState(Instruction *I) { if (BBExecutable.count(I->getParent())) // Inst is executable? visit(*I); @@ -506,6 +555,7 @@ private: void visitPHINode(PHINode &I); // Terminators + void visitReturnInst(ReturnInst &I); void visitTerminatorInst(TerminatorInst &TI); @@ -515,26 +565,32 @@ private: void visitCmpInst(CmpInst &I); void visitExtractValueInst(ExtractValueInst &EVI); void visitInsertValueInst(InsertValueInst &IVI); + void visitCatchSwitchInst(CatchSwitchInst &CPI) { markOverdefined(&CPI); visitTerminatorInst(CPI); } // Instructions that cannot be folded away. + void visitStoreInst (StoreInst &I); void visitLoadInst (LoadInst &I); void visitGetElementPtrInst(GetElementPtrInst &I); + void visitCallInst (CallInst &I) { visitCallSite(&I); } + void visitInvokeInst (InvokeInst &II) { visitCallSite(&II); visitTerminatorInst(II); } + void visitCallSite (CallSite CS); void visitResumeInst (TerminatorInst &I) { /*returns void*/ } void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ } void visitFenceInst (FenceInst &I) { /*returns void*/ } + void visitInstruction(Instruction &I) { // All the instructions we don't do any special handling for just // go to overdefined. @@ -545,10 +601,8 @@ private: } // end anonymous namespace - // getFeasibleSuccessors - Return a vector of booleans to indicate which // successors are reachable from a given terminator instruction. -// void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI, SmallVectorImpl<bool> &Succs) { Succs.resize(TI.getNumSuccessors()); @@ -631,10 +685,8 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI, llvm_unreachable("SCCP: Don't know how to handle this terminator!"); } - // isEdgeFeasible - Return true if the control flow edge from the 'From' basic // block to the 'To' basic block is currently feasible. -// bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { assert(BBExecutable.count(To) && "Dest should always be alive!"); @@ -710,7 +762,6 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { // destination executable // 7. If a conditional branch has a value that is overdefined, make all // successors executable. -// void SCCPSolver::visitPHINode(PHINode &PN) { // If this PN returns a struct, just mark the result overdefined. // TODO: We could do a lot better than this if code actually uses this. @@ -730,7 +781,6 @@ void SCCPSolver::visitPHINode(PHINode &PN) { // constant, and they agree with each other, the PHI becomes the identical // constant. 
If they are constant and don't agree, the PHI is overdefined. // If there are no executable operands, the PHI remains unknown. - // Constant *OperandVal = nullptr; for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) { LatticeVal IV = getValueState(PN.getIncomingValue(i)); @@ -761,7 +811,6 @@ void SCCPSolver::visitPHINode(PHINode &PN) { // arguments that agree with each other(and OperandVal is the constant) or // OperandVal is null because there are no defined incoming arguments. If // this is the case, the PHI remains unknown. - // if (OperandVal) markConstant(&PN, OperandVal); // Acquire operand value } @@ -789,7 +838,6 @@ void SCCPSolver::visitReturnInst(ReturnInst &I) { for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) mergeInValue(TrackedMultipleRetVals[std::make_pair(F, i)], F, getStructValueState(ResultOp, i)); - } } @@ -820,7 +868,6 @@ void SCCPSolver::visitCastInst(CastInst &I) { } } - void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) { // If this returns a struct, mark all elements over defined, we don't track // structs in structs. @@ -969,7 +1016,6 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) { } } - markOverdefined(&I); } @@ -998,7 +1044,6 @@ void SCCPSolver::visitCmpInst(CmpInst &I) { // Handle getelementptr instructions. If all operands are constants then we // can turn this into a getelementptr ConstantExpr. -// void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) { if (ValueState[&I].isOverdefined()) return; @@ -1044,7 +1089,6 @@ void SCCPSolver::visitStoreInst(StoreInst &SI) { TrackedGlobals.erase(I); // No need to keep tracking this! } - // Handle load instructions. If the operand is a constant pointer to a constant // global, we can replace the load with the loaded constant value! void SCCPSolver::visitLoadInst(LoadInst &I) { @@ -1108,7 +1152,6 @@ CallOverdefined: // a declaration, maybe we can constant fold it. if (F && F->isDeclaration() && !I->getType()->isStructTy() && canConstantFoldCallTo(CS, F)) { - SmallVector<Constant*, 8> Operands; for (CallSite::arg_iterator AI = CS.arg_begin(), E = CS.arg_end(); AI != E; ++AI) { @@ -1162,6 +1205,9 @@ CallOverdefined: mergeInValue(getStructValueState(&*AI, i), &*AI, CallArg); } } else { + // Most other parts of the Solver still only use the simpler value + // lattice, so we propagate changes for parameters to both lattices. + getParamState(&*AI).mergeIn(getValueState(*CAI).toValueLattice(), DL); mergeInValue(&*AI, getValueState(*CAI)); } } @@ -1360,7 +1406,6 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // undef & X -> 0. X could be zero. markForcedConstant(&I, Constant::getNullValue(ITy)); return true; - case Instruction::Or: // Both operands undef -> undef if (Op0LV.isUnknown() && Op1LV.isUnknown()) @@ -1368,7 +1413,6 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // undef | X -> -1. X could be -1. markForcedConstant(&I, Constant::getAllOnesValue(ITy)); return true; - case Instruction::Xor: // undef ^ undef -> 0; strictly speaking, this is not strictly // necessary, but we try to be nice to people who expect this @@ -1379,7 +1423,6 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { } // undef ^ X -> undef break; - case Instruction::SDiv: case Instruction::UDiv: case Instruction::SRem: @@ -1397,7 +1440,6 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // undef % X -> 0. X could be 1. markForcedConstant(&I, Constant::getNullValue(ITy)); return true; - case Instruction::AShr: // X >>a undef -> undef. 
if (Op1LV.isUnknown()) break; @@ -1464,7 +1506,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { markOverdefined(&I); return true; case Instruction::Call: - case Instruction::Invoke: { + case Instruction::Invoke: // There are two reasons a call can have an undef result // 1. It could be tracked. // 2. It could be constant-foldable. @@ -1478,7 +1520,6 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // we do not know what return values are valid. markOverdefined(&I); return true; - } default: // If we don't know what should happen here, conservatively mark it // overdefined. @@ -1557,11 +1598,56 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { return false; } +static bool tryToReplaceWithConstantRange(SCCPSolver &Solver, Value *V) { + bool Changed = false; + + // Currently we only use range information for integer values. + if (!V->getType()->isIntegerTy()) + return false; + + const ValueLatticeElement &IV = Solver.getLatticeValueFor(V); + if (!IV.isConstantRange()) + return false; + + for (auto UI = V->uses().begin(), E = V->uses().end(); UI != E;) { + const Use &U = *UI++; + auto *Icmp = dyn_cast<ICmpInst>(U.getUser()); + if (!Icmp || !Solver.isBlockExecutable(Icmp->getParent())) + continue; + + auto getIcmpLatticeValue = [&](Value *Op) { + if (auto *C = dyn_cast<Constant>(Op)) + return ValueLatticeElement::get(C); + return Solver.getLatticeValueFor(Op); + }; + + ValueLatticeElement A = getIcmpLatticeValue(Icmp->getOperand(0)); + ValueLatticeElement B = getIcmpLatticeValue(Icmp->getOperand(1)); + + Constant *C = nullptr; + if (A.satisfiesPredicate(Icmp->getPredicate(), B)) + C = ConstantInt::getTrue(Icmp->getType()); + else if (A.satisfiesPredicate(Icmp->getInversePredicate(), B)) + C = ConstantInt::getFalse(Icmp->getType()); + + if (C) { + Icmp->replaceAllUsesWith(C); + DEBUG(dbgs() << "Replacing " << *Icmp << " with " << *C + << ", because of range information " << A << " " << B + << "\n"); + Icmp->eraseFromParent(); + Changed = true; + } + } + return Changed; +} + static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) { Constant *Const = nullptr; if (V->getType()->isStructTy()) { std::vector<LatticeVal> IVs = Solver.getStructLatticeValueFor(V); - if (any_of(IVs, [](const LatticeVal &LV) { return LV.isOverdefined(); })) + if (llvm::any_of(IVs, + [](const LatticeVal &LV) { return LV.isOverdefined(); })) return false; std::vector<Constant *> ConstVals; auto *ST = dyn_cast<StructType>(V->getType()); @@ -1573,10 +1659,19 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) { } Const = ConstantStruct::get(ST, ConstVals); } else { - LatticeVal IV = Solver.getLatticeValueFor(V); + const ValueLatticeElement &IV = Solver.getLatticeValueFor(V); if (IV.isOverdefined()) return false; - Const = IV.isConstant() ? IV.getConstant() : UndefValue::get(V->getType()); + + if (IV.isConstantRange()) { + if (IV.getConstantRange().isSingleElement()) + Const = + ConstantInt::get(V->getType(), IV.asConstantInteger().getValue()); + else + return false; + } else + Const = + IV.isConstant() ? IV.getConstant() : UndefValue::get(V->getType()); } assert(Const && "Constant is nullptr here!"); DEBUG(dbgs() << " Constant: " << *Const << " = " << *V << '\n'); @@ -1588,7 +1683,6 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) { // runSCCP() - Run the Sparse Conditional Constant Propagation algorithm, // and return true if the function was modified. 
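The icmp folding performed by tryToReplaceWithConstantRange above reduces to an interval argument. A simplified model for the unsigned less-than case, with a hypothetical Range type standing in for LLVM's ConstantRange (which additionally handles wrapped ranges and every predicate):

```cpp
#include <cstdint>
#include <cstdio>

// If the operand ranges admit only one outcome, the compare folds to a
// constant, mirroring the satisfiesPredicate checks above.
struct Range {          // non-empty, half-open: values in [Lo, Hi)
  uint64_t Lo, Hi;
  uint64_t min() const { return Lo; }
  uint64_t max() const { return Hi - 1; }
};

enum Fold { AlwaysFalse, AlwaysTrue, Unknown };

static Fold foldULT(Range A, Range B) { // can "A u< B" be decided?
  if (A.max() < B.min())
    return AlwaysTrue;  // the largest A is still below the smallest B
  if (A.min() >= B.max())
    return AlwaysFalse; // no A value is below any B value
  return Unknown;
}

int main() {
  std::printf("%d\n", foldULT({0, 10}, {10, 20}));  // 1: always true
  std::printf("%d\n", foldULT({50, 60}, {10, 51})); // 0: always false
  std::printf("%d\n", foldULT({0, 10}, {5, 20}));   // 2: unknown
}
```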
-// static bool runSCCP(Function &F, const DataLayout &DL, const TargetLibraryInfo *TLI) { DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n"); @@ -1628,7 +1722,6 @@ static bool runSCCP(Function &F, const DataLayout &DL, // Iterate over all of the instructions in a function, replacing them with // constants if we have found them to be of constant values. - // for (BasicBlock::iterator BI = BB.begin(), E = BB.end(); BI != E;) { Instruction *Inst = &*BI++; if (Inst->getType()->isVoidTy() || isa<TerminatorInst>(Inst)) @@ -1659,6 +1752,7 @@ PreservedAnalyses SCCPPass::run(Function &F, FunctionAnalysisManager &AM) { } namespace { + //===--------------------------------------------------------------------===// // /// SCCP Class - This class uses the SCCPSolver to implement a per-function @@ -1666,18 +1760,20 @@ namespace { /// class SCCPLegacyPass : public FunctionPass { public: + // Pass identification, replacement for typeid + static char ID; + + SCCPLegacyPass() : FunctionPass(ID) { + initializeSCCPLegacyPassPass(*PassRegistry::getPassRegistry()); + } + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); } - static char ID; // Pass identification, replacement for typeid - SCCPLegacyPass() : FunctionPass(ID) { - initializeSCCPLegacyPassPass(*PassRegistry::getPassRegistry()); - } // runOnFunction - Run the Sparse Conditional Constant Propagation // algorithm, and return true if the function was modified. - // bool runOnFunction(Function &F) override { if (skipFunction(F)) return false; @@ -1687,9 +1783,11 @@ public: return runSCCP(F, DL, TLI); } }; + } // end anonymous namespace char SCCPLegacyPass::ID = 0; + INITIALIZE_PASS_BEGIN(SCCPLegacyPass, "sccp", "Sparse Conditional Constant Propagation", false, false) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) @@ -1699,38 +1797,11 @@ INITIALIZE_PASS_END(SCCPLegacyPass, "sccp", // createSCCPPass - This is the public interface to this file. FunctionPass *llvm::createSCCPPass() { return new SCCPLegacyPass(); } -static bool AddressIsTaken(const GlobalValue *GV) { - // Delete any dead constantexpr klingons. - GV->removeDeadConstantUsers(); - - for (const Use &U : GV->uses()) { - const User *UR = U.getUser(); - if (const auto *SI = dyn_cast<StoreInst>(UR)) { - if (SI->getOperand(0) == GV || SI->isVolatile()) - return true; // Storing addr of GV. - } else if (isa<InvokeInst>(UR) || isa<CallInst>(UR)) { - // Make sure we are calling the function, not passing the address. - ImmutableCallSite CS(cast<Instruction>(UR)); - if (!CS.isCallee(&U)) - return true; - } else if (const auto *LI = dyn_cast<LoadInst>(UR)) { - if (LI->isVolatile()) - return true; - } else if (isa<BlockAddress>(UR)) { - // blockaddress doesn't take the address of the function, it takes addr - // of label. - } else { - return true; - } - } - return false; -} - static void findReturnsToZap(Function &F, - SmallPtrSet<Function *, 32> &AddressTakenFunctions, - SmallVector<ReturnInst *, 8> &ReturnsToZap) { + SmallVector<ReturnInst *, 8> &ReturnsToZap, + SCCPSolver &Solver) { // We can only do this if we know that nothing else can call the function. 
- if (!F.hasLocalLinkage() || AddressTakenFunctions.count(&F)) + if (!Solver.isArgumentTrackedFunction(&F)) return; for (BasicBlock &BB : F) @@ -1743,39 +1814,22 @@ static bool runIPSCCP(Module &M, const DataLayout &DL, const TargetLibraryInfo *TLI) { SCCPSolver Solver(DL, TLI); - // AddressTakenFunctions - This set keeps track of the address-taken functions - // that are in the input. As IPSCCP runs through and simplifies code, - // functions that were address taken can end up losing their - // address-taken-ness. Because of this, we keep track of their addresses from - // the first pass so we can use them for the later simplification pass. - SmallPtrSet<Function*, 32> AddressTakenFunctions; - // Loop over all functions, marking arguments to those with their addresses // taken or that are external as overdefined. - // for (Function &F : M) { if (F.isDeclaration()) continue; - // If this is an exact definition of this function, then we can propagate - // information about its result into callsites of it. - // Don't touch naked functions. They may contain asm returning a - // value we don't see, so we may end up interprocedurally propagating - // the return value incorrectly. - if (F.hasExactDefinition() && !F.hasFnAttribute(Attribute::Naked)) + // Determine if we can track the function's return values. If so, add the + // function to the solver's set of return-tracked functions. + if (canTrackReturnsInterprocedurally(&F)) Solver.AddTrackedFunction(&F); - // If this function only has direct calls that we can see, we can track its - // arguments and return value aggressively, and can assume it is not called - // unless we see evidence to the contrary. - if (F.hasLocalLinkage()) { - if (F.hasAddressTaken()) { - AddressTakenFunctions.insert(&F); - } - else { - Solver.AddArgumentTrackedFunction(&F); - continue; - } + // Determine if we can track the function's arguments. If so, add the + // function to the solver's set of argument-tracked functions. + if (canTrackArgumentsInterprocedurally(&F)) { + Solver.AddArgumentTrackedFunction(&F); + continue; } // Assume the function is called. @@ -1786,13 +1840,14 @@ static bool runIPSCCP(Module &M, const DataLayout &DL, Solver.markOverdefined(&AI); } - // Loop over global variables. We inform the solver about any internal global - // variables that do not have their 'addresses taken'. If they don't have - // their addresses taken, we can propagate constants through them. - for (GlobalVariable &G : M.globals()) - if (!G.isConstant() && G.hasLocalLinkage() && - G.hasDefinitiveInitializer() && !AddressIsTaken(&G)) + // Determine if we can track any of the module's global variables. If so, add + // the global variables we can track to the solver's set of tracked global + // variables. + for (GlobalVariable &G : M.globals()) { + G.removeDeadConstantUsers(); + if (canTrackGlobalVariableInterprocedurally(&G)) Solver.TrackValueOfGlobalVariable(&G); + } // Solve for constants. bool ResolvedUndefs = true; @@ -1809,7 +1864,6 @@ static bool runIPSCCP(Module &M, const DataLayout &DL, // Iterate over all of the instructions in the module, replacing them with // constants if we have found them to be of constant values. 
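The "Solve for constants" loop above follows the same driver pattern as runSCCP: solve to a fixed point, let ResolvedUndefsIn commit to a meaning for surviving undefs, and re-solve if that seeded new facts. The skeleton, with stubs standing in for the real solver (illustrative only):

```cpp
#include <cstdio>

// Solve, resolve leftover undefs, and re-solve until nothing changes.
static int Pending = 2; // pretend two rounds of undef resolution are needed

static void solve() { std::printf("solve to a fixed point\n"); }

static bool resolvedUndefs() { // true if it committed any undef to a value
  if (Pending == 0)
    return false;
  --Pending;
  std::printf("resolved undefs; facts changed, solving again\n");
  return true;
}

int main() {
  bool ResolvedUndefs = true;
  while (ResolvedUndefs) {
    solve();
    ResolvedUndefs = resolvedUndefs();
  }
}
```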
- // SmallVector<BasicBlock*, 512> BlocksToErase; for (Function &F : M) { @@ -1818,9 +1872,15 @@ static bool runIPSCCP(Module &M, const DataLayout &DL, if (Solver.isBlockExecutable(&F.front())) for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end(); AI != E; - ++AI) - if (!AI->use_empty() && tryToReplaceWithConstant(Solver, &*AI)) + ++AI) { + if (!AI->use_empty() && tryToReplaceWithConstant(Solver, &*AI)) { ++IPNumArgsElimed; + continue; + } + + if (!AI->use_empty() && tryToReplaceWithConstantRange(Solver, &*AI)) + ++IPNumRangeInfoUsed; + } for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { if (!Solver.isBlockExecutable(&*BB)) { @@ -1897,7 +1957,7 @@ static bool runIPSCCP(Module &M, const DataLayout &DL, Function *F = I.first; if (I.second.isOverdefined() || F->getReturnType()->isVoidTy()) continue; - findReturnsToZap(*F, AddressTakenFunctions, ReturnsToZap); + findReturnsToZap(*F, ReturnsToZap, Solver); } for (const auto &F : Solver.getMRVFunctionsTracked()) { @@ -1905,7 +1965,7 @@ static bool runIPSCCP(Module &M, const DataLayout &DL, "The return type should be a struct"); StructType *STy = cast<StructType>(F->getReturnType()); if (Solver.isStructLatticeConstant(F, STy)) - findReturnsToZap(*F, AddressTakenFunctions, ReturnsToZap); + findReturnsToZap(*F, ReturnsToZap, Solver); } // Zap all returns which we've identified as zap to change. @@ -1943,6 +2003,7 @@ PreservedAnalyses IPSCCPPass::run(Module &M, ModuleAnalysisManager &AM) { } namespace { + //===--------------------------------------------------------------------===// // /// IPSCCP Class - This class implements interprocedural Sparse Conditional @@ -1969,9 +2030,11 @@ public: AU.addRequired<TargetLibraryInfoWrapperPass>(); } }; + } // end anonymous namespace char IPSCCPLegacyPass::ID = 0; + INITIALIZE_PASS_BEGIN(IPSCCPLegacyPass, "ipsccp", "Interprocedural Sparse Conditional Constant Propagation", false, false) diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index b9cee5b2ba95..bfe3754f0769 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -24,28 +24,54 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/SROA.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallBitVector.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include "llvm/ADT/iterator.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/PtrUseVisitor.h" -#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/ConstantFolder.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DIBuilder.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/GlobalAlias.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include 
"llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" #include "llvm/Pass.h" -#include "llvm/Support/Chrono.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" @@ -55,6 +81,17 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" +#include <algorithm> +#include <cassert> +#include <chrono> +#include <cstddef> +#include <cstdint> +#include <cstring> +#include <iterator> +#include <string> +#include <tuple> +#include <utility> +#include <vector> #ifndef NDEBUG // We only use this for a debug check. @@ -87,11 +124,18 @@ static cl::opt<bool> SROARandomShuffleSlices("sroa-random-shuffle-slices", static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false), cl::Hidden); +/// Hidden option to allow more aggressive splitting. +static cl::opt<bool> +SROASplitNonWholeAllocaSlices("sroa-split-nonwhole-alloca-slices", + cl::init(false), cl::Hidden); + namespace { + /// \brief A custom IRBuilder inserter which prefixes all names, but only in /// Assert builds. class IRBuilderPrefixedInserter : public IRBuilderDefaultInserter { std::string Prefix; + const Twine getNameWithPrefix(const Twine &Name) const { return Name.isTriviallyEmpty() ? Name : Prefix + Name; } @@ -107,11 +151,9 @@ protected: } }; -/// \brief Provide a typedef for IRBuilder that drops names in release builds. -using IRBuilderTy = llvm::IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>; -} +/// \brief Provide a type for IRBuilder that drops names in release builds. +using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>; -namespace { /// \brief A used slice of an alloca. /// /// This structure represents a slice of an alloca used by some instruction. It @@ -120,17 +162,18 @@ namespace { /// or not when forming partitions of the alloca. class Slice { /// \brief The beginning offset of the range. - uint64_t BeginOffset; + uint64_t BeginOffset = 0; /// \brief The ending offset, not included in the range. - uint64_t EndOffset; + uint64_t EndOffset = 0; /// \brief Storage for both the use of this slice and whether it can be /// split. PointerIntPair<Use *, 1, bool> UseAndIsSplittable; public: - Slice() : BeginOffset(), EndOffset() {} + Slice() = default; + Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable) : BeginOffset(BeginOffset), EndOffset(EndOffset), UseAndIsSplittable(U, IsSplittable) {} @@ -180,12 +223,15 @@ public: } bool operator!=(const Slice &RHS) const { return !operator==(RHS); } }; + } // end anonymous namespace namespace llvm { + template <typename T> struct isPodLike; template <> struct isPodLike<Slice> { static const bool value = true; }; -} + +} // end namespace llvm /// \brief Representation of the alloca slices. /// @@ -207,13 +253,15 @@ public: /// \brief Support for iterating over the slices. 
/// @{ - typedef SmallVectorImpl<Slice>::iterator iterator; - typedef iterator_range<iterator> range; + using iterator = SmallVectorImpl<Slice>::iterator; + using range = iterator_range<iterator>; + iterator begin() { return Slices.begin(); } iterator end() { return Slices.end(); } - typedef SmallVectorImpl<Slice>::const_iterator const_iterator; - typedef iterator_range<const_iterator> const_range; + using const_iterator = SmallVectorImpl<Slice>::const_iterator; + using const_range = iterator_range<const_iterator>; + const_iterator begin() const { return Slices.begin(); } const_iterator end() const { return Slices.end(); } /// @} @@ -264,6 +312,7 @@ public: private: template <typename DerivedT, typename RetT = void> class BuilderBase; class SliceBuilder; + friend class AllocaSlices::SliceBuilder; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -320,7 +369,7 @@ private: friend class AllocaSlices; friend class AllocaSlices::partition_iterator; - typedef AllocaSlices::iterator iterator; + using iterator = AllocaSlices::iterator; /// \brief The beginning and ending offsets of the alloca for this /// partition. @@ -403,12 +452,12 @@ class AllocaSlices::partition_iterator /// \brief We also need to keep track of the maximum split end offset seen. /// FIXME: Do we really? - uint64_t MaxSplitSliceEndOffset; + uint64_t MaxSplitSliceEndOffset = 0; /// \brief Sets the partition to be empty at given iterator, and sets the /// end iterator. partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE) - : P(SI), SE(SE), MaxSplitSliceEndOffset(0) { + : P(SI), SE(SE) { // If not already at the end, advance our state to form the initial // partition. if (SI != SE) @@ -432,19 +481,21 @@ class AllocaSlices::partition_iterator // Remove the uses which have ended in the prior partition. This // cannot change the max split slice end because we just checked that // the prior partition ended prior to that max. 
- P.SplitTails.erase( - remove_if(P.SplitTails, - [&](Slice *S) { return S->endOffset() <= P.EndOffset; }), - P.SplitTails.end()); - assert(any_of(P.SplitTails, - [&](Slice *S) { - return S->endOffset() == MaxSplitSliceEndOffset; - }) && + P.SplitTails.erase(llvm::remove_if(P.SplitTails, + [&](Slice *S) { + return S->endOffset() <= + P.EndOffset; + }), + P.SplitTails.end()); + assert(llvm::any_of(P.SplitTails, + [&](Slice *S) { + return S->endOffset() == MaxSplitSliceEndOffset; + }) && "Could not find the current max split slice offset!"); - assert(all_of(P.SplitTails, - [&](Slice *S) { - return S->endOffset() <= MaxSplitSliceEndOffset; - }) && + assert(llvm::all_of(P.SplitTails, + [&](Slice *S) { + return S->endOffset() <= MaxSplitSliceEndOffset; + }) && "Max split slice end offset is not actually the max!"); } } @@ -608,7 +659,8 @@ static Value *foldPHINodeOrSelectInst(Instruction &I) { class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> { friend class PtrUseVisitor<SliceBuilder>; friend class InstVisitor<SliceBuilder>; - typedef PtrUseVisitor<SliceBuilder> Base; + + using Base = PtrUseVisitor<SliceBuilder>; const uint64_t AllocSize; AllocaSlices &AS; @@ -996,8 +1048,9 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI) return; } - Slices.erase(remove_if(Slices, [](const Slice &S) { return S.isDead(); }), - Slices.end()); + Slices.erase( + llvm::remove_if(Slices, [](const Slice &S) { return S.isDead(); }), + Slices.end()); #ifndef NDEBUG if (SROARandomShuffleSlices) { @@ -1820,11 +1873,12 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { // do that until all the backends are known to produce good code for all // integer vector types. if (!HaveCommonEltTy) { - CandidateTys.erase(remove_if(CandidateTys, - [](VectorType *VTy) { - return !VTy->getElementType()->isIntegerTy(); - }), - CandidateTys.end()); + CandidateTys.erase( + llvm::remove_if(CandidateTys, + [](VectorType *VTy) { + return !VTy->getElementType()->isIntegerTy(); + }), + CandidateTys.end()); // If there were no integer vector types, give up. if (CandidateTys.empty()) @@ -2151,8 +2205,9 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V, class llvm::sroa::AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> { // Befriend the base class so it can delegate to private visit methods. - friend class llvm::InstVisitor<AllocaSliceRewriter, bool>; - typedef llvm::InstVisitor<AllocaSliceRewriter, bool> Base; + friend class InstVisitor<AllocaSliceRewriter, bool>; + + using Base = InstVisitor<AllocaSliceRewriter, bool>; const DataLayout &DL; AllocaSlices &AS; @@ -2182,16 +2237,18 @@ class llvm::sroa::AllocaSliceRewriter // The original offset of the slice currently being rewritten relative to // the original alloca. - uint64_t BeginOffset, EndOffset; + uint64_t BeginOffset = 0; + uint64_t EndOffset = 0; + // The new offsets of the slice currently being rewritten relative to the // original alloca. uint64_t NewBeginOffset, NewEndOffset; uint64_t SliceSize; - bool IsSplittable; - bool IsSplit; - Use *OldUse; - Instruction *OldPtr; + bool IsSplittable = false; + bool IsSplit = false; + Use *OldUse = nullptr; + Instruction *OldPtr = nullptr; // Track post-rewrite users which are PHI nodes and Selects. SmallSetVector<PHINode *, 8> &PHIUsers; @@ -2221,8 +2278,7 @@ public: VecTy(PromotableVecTy), ElementTy(VecTy ? VecTy->getElementType() : nullptr), ElementSize(VecTy ? 
DL.getTypeSizeInBits(ElementTy) / 8 : 0),
-        BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(),
-        OldPtr(), PHIUsers(PHIUsers), SelectUsers(SelectUsers),
+        PHIUsers(PHIUsers), SelectUsers(SelectUsers),
         IRB(NewAI.getContext(), ConstantFolder()) {
     if (VecTy) {
       assert((DL.getTypeSizeInBits(ElementTy) % 8) == 0 &&
@@ -2987,6 +3043,7 @@ private:
 };
 
 namespace {
+
 /// \brief Visitor to rewrite aggregate loads and stores as scalar.
 ///
 /// This pass aggressively rewrites all aggregate loads and stores on
@@ -2994,7 +3051,7 @@ namespace {
 /// with scalar loads and stores.
 class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
   // Befriend the base class so it can delegate to private visit methods.
-  friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>;
+  friend class InstVisitor<AggLoadStoreRewriter, bool>;
 
   /// Queue of pointer uses to analyze and potentially rewrite.
   SmallVector<Use *, 8> Queue;
@@ -3037,12 +3094,15 @@ private:
 protected:
   /// The builder used to form new instructions.
   IRBuilderTy IRB;
+
   /// The indices to be used with insert- or extractvalue to select the
   /// appropriate value within the aggregate.
   SmallVector<unsigned, 4> Indices;
+
   /// The indices to a GEP instruction which will move Ptr to the correct slot
   /// within the aggregate.
   SmallVector<Value *, 4> GEPIndices;
+
   /// The base pointer of the original op, used as a base for GEPing the
   /// split operations.
   Value *Ptr;
@@ -3193,7 +3253,8 @@ private:
     return false;
   }
 };
-}
+
+} // end anonymous namespace
 
 /// \brief Strip aggregate type wrapping.
 ///
@@ -3485,58 +3546,60 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
   // match relative to their starting offset. We have to verify this prior to
   // any rewriting.
   Stores.erase(
-      remove_if(Stores,
-                [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
-                  // Lookup the load we are storing in our map of split
-                  // offsets.
-                  auto *LI = cast<LoadInst>(SI->getValueOperand());
-                  // If it was completely unsplittable, then we're done,
-                  // and this store can't be pre-split.
-                  if (UnsplittableLoads.count(LI))
-                    return true;
+      llvm::remove_if(Stores,
+                      [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
+                        // Lookup the load we are storing in our map of split
+                        // offsets.
+                        auto *LI = cast<LoadInst>(SI->getValueOperand());
+                        // If it was completely unsplittable, then we're done,
+                        // and this store can't be pre-split.
+                        if (UnsplittableLoads.count(LI))
+                          return true;
 
-                  auto LoadOffsetsI = SplitOffsetsMap.find(LI);
-                  if (LoadOffsetsI == SplitOffsetsMap.end())
-                    return false; // Unrelated loads are definitely safe.
-                  auto &LoadOffsets = LoadOffsetsI->second;
+                        auto LoadOffsetsI = SplitOffsetsMap.find(LI);
+                        if (LoadOffsetsI == SplitOffsetsMap.end())
+                          return false; // Unrelated loads are definitely safe.
+                        auto &LoadOffsets = LoadOffsetsI->second;
 
-                  // Now lookup the store's offsets.
-                  auto &StoreOffsets = SplitOffsetsMap[SI];
+                        // Now lookup the store's offsets.
+                        auto &StoreOffsets = SplitOffsetsMap[SI];
 
-                  // If the relative offsets of each split in the load and
-                  // store match exactly, then we can split them and we
-                  // don't need to remove them here.
+                        if (LoadOffsets.Splits == StoreOffsets.Splits)
+                          return false;
 
-                  DEBUG(dbgs() << "    Mismatched splits for load and store:\n"
-                               << "      " << *LI << "\n"
-                               << "      " << *SI << "\n");
+                        DEBUG(dbgs()
+                              << "    Mismatched splits for load and store:\n"
+                              << "      " << *LI << "\n"
+                              << "      " << *SI << "\n");
 
-                  // We've found a store and load that we need to split
-                  // with mismatched relative splits. Just give up on them
-                  // and remove both instructions from our list of
-                  // candidates.
-                  UnsplittableLoads.insert(LI);
-                  return true;
-                }),
+                        // We've found a store and load that we need to split
+                        // with mismatched relative splits. Just give up on them
+                        // and remove both instructions from our list of
+                        // candidates.
+                        UnsplittableLoads.insert(LI);
+                        return true;
+                      }),
       Stores.end());
   // Now we have to go *back* through all the stores, because a later store may
   // have caused an earlier store's load to become unsplittable and if it is
   // unsplittable for the later store, then we can't rely on it being split in
   // the earlier store either.
-  Stores.erase(remove_if(Stores,
-                         [&UnsplittableLoads](StoreInst *SI) {
-                           auto *LI = cast<LoadInst>(SI->getValueOperand());
-                           return UnsplittableLoads.count(LI);
-                         }),
+  Stores.erase(llvm::remove_if(Stores,
+                               [&UnsplittableLoads](StoreInst *SI) {
+                                 auto *LI =
+                                     cast<LoadInst>(SI->getValueOperand());
+                                 return UnsplittableLoads.count(LI);
+                               }),
                Stores.end());
   // Once we've established all the loads that can't be split for some reason,
   // filter out any that made it into our list.
-  Loads.erase(remove_if(Loads,
-                        [&UnsplittableLoads](LoadInst *LI) {
-                          return UnsplittableLoads.count(LI);
-                        }),
+  Loads.erase(llvm::remove_if(Loads,
+                              [&UnsplittableLoads](LoadInst *LI) {
+                                return UnsplittableLoads.count(LI);
+                              }),
               Loads.end());
 
   // If no loads or stores are left, there is no pre-splitting to be done for
@@ -3804,7 +3867,8 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
   }
 
   // Remove the killed slices that have been pre-split.
-  AS.erase(remove_if(AS, [](const Slice &S) { return S.isDead(); }), AS.end());
+  AS.erase(llvm::remove_if(AS, [](const Slice &S) { return S.isDead(); }),
+           AS.end());
 
   // Insert our new slices. This will sort and merge them into the sorted
   // sequence.
@@ -3819,7 +3883,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
   // Finally, don't try to promote any allocas that now require re-splitting.
   // They have already been added to the worklist above.
   PromotableAllocas.erase(
-      remove_if(
+      llvm::remove_if(
           PromotableAllocas,
           [&](AllocaInst *AI) { return ResplitPromotableAllocas.count(AI); }),
       PromotableAllocas.end());
@@ -3989,27 +4053,58 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
   // First try to pre-split loads and stores.
   Changed |= presplitLoadsAndStores(AI, AS);
 
-  // Now that we have identified any pre-splitting opportunities, mark any
-  // splittable (non-whole-alloca) loads and stores as unsplittable. If we fail
-  // to split these during pre-splitting, we want to force them to be
-  // rewritten into a partition.
+  // Now that we have identified any pre-splitting opportunities,
+  // mark loads and stores unsplittable except for the following case.
+  // We leave a slice splittable if all other slices are disjoint or fully
+  // included in the slice, such as whole-alloca loads and stores.
+  // If we fail to split these during pre-splitting, we want to force them
+  // to be rewritten into a partition.
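+  // A hedged illustration (hypothetical slice layout, not from this
+  // change's tests): a 16-byte alloca accessed only by i64 loads and
+  // stores over [0,8) and [8,16) leaves byte offsets 0, 8, and 16
+  // splittable, so an [0,8) slice stays splittable even though it does
+  // not span the whole alloca; a slice crossing offset 8 would clear
+  // that bit and force the [0,8) and [8,16) slices to be made
+  // unsplittable below instead.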
   bool IsSorted = true;
-  for (Slice &S : AS) {
-    if (!S.isSplittable())
-      continue;
-    // FIXME: We currently leave whole-alloca splittable loads and stores. This
-    // used to be the only splittable loads and stores and we need to be
-    // confident that the above handling of splittable loads and stores is
-    // completely sufficient before we forcibly disable the remaining handling.
-    if (S.beginOffset() == 0 &&
-        S.endOffset() >= DL.getTypeAllocSize(AI.getAllocatedType()))
-      continue;
-    if (isa<LoadInst>(S.getUse()->getUser()) ||
-        isa<StoreInst>(S.getUse()->getUser())) {
-      S.makeUnsplittable();
-      IsSorted = false;
+
+  uint64_t AllocaSize = DL.getTypeAllocSize(AI.getAllocatedType());
+  const uint64_t MaxBitVectorSize = 1024;
+  if (SROASplitNonWholeAllocaSlices && AllocaSize <= MaxBitVectorSize) {
+    // If a byte boundary is included in any load or store, a slice starting or
+    // ending at the boundary is not splittable.
+    SmallBitVector SplittableOffset(AllocaSize + 1, true);
+    for (Slice &S : AS)
+      for (unsigned O = S.beginOffset() + 1;
+           O < S.endOffset() && O < AllocaSize; O++)
+        SplittableOffset.reset(O);
+
+    for (Slice &S : AS) {
+      if (!S.isSplittable())
+        continue;
+
+      if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) &&
+          (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()]))
+        continue;
+
+      if (isa<LoadInst>(S.getUse()->getUser()) ||
+          isa<StoreInst>(S.getUse()->getUser())) {
+        S.makeUnsplittable();
+        IsSorted = false;
+      }
     }
   }
+  else {
+    // We only allow whole-alloca splittable loads and stores
+    // for a large alloca to avoid creating too large a BitVector.
+    for (Slice &S : AS) {
+      if (!S.isSplittable())
+        continue;
+
+      if (S.beginOffset() == 0 && S.endOffset() >= AllocaSize)
+        continue;
+
+      if (isa<LoadInst>(S.getUse()->getUser()) ||
+          isa<StoreInst>(S.getUse()->getUser())) {
+        S.makeUnsplittable();
+        IsSorted = false;
+      }
+    }
+  }
+
   if (!IsSorted)
     std::sort(AS.begin(), AS.end());
 
@@ -4044,9 +4139,11 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
 
   // Migrate debug information from the old alloca to the new alloca(s)
   // and the individual partitions.
-  if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(&AI)) {
-    auto *Var = DbgDecl->getVariable();
-    auto *Expr = DbgDecl->getExpression();
+  TinyPtrVector<DbgInfoIntrinsic *> DbgDeclares = FindDbgAddrUses(&AI);
+  if (!DbgDeclares.empty()) {
+    auto *Var = DbgDeclares.front()->getVariable();
+    auto *Expr = DbgDeclares.front()->getExpression();
+    auto VarSize = Var->getSizeInBits();
     DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
     uint64_t AllocaSize = DL.getTypeSizeInBits(AI.getAllocatedType());
     for (auto Fragment : Fragments) {
@@ -4062,21 +4159,43 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
       uint64_t Size = Fragment.Size;
       if (ExprFragment) {
         uint64_t AbsEnd =
-          ExprFragment->OffsetInBits + ExprFragment->SizeInBits;
+            ExprFragment->OffsetInBits + ExprFragment->SizeInBits;
         if (Start >= AbsEnd)
           // No need to describe a SROAed padding.
           continue;
         Size = std::min(Size, AbsEnd - Start);
       }
-      FragmentExpr = DIB.createFragmentExpression(Start, Size);
+      // The new, smaller fragment is stenciled out from the old fragment.
+      if (auto OrigFragment = FragmentExpr->getFragmentInfo()) {
+        assert(Start >= OrigFragment->OffsetInBits &&
+               "new fragment is outside of original fragment");
+        Start -= OrigFragment->OffsetInBits;
+      }
+
+      // The alloca may be larger than the variable.
+ if (VarSize) { + if (Size > *VarSize) + Size = *VarSize; + if (Size == 0 || Start + Size > *VarSize) + continue; + } + + // Avoid creating a fragment expression that covers the entire variable. + if (!VarSize || *VarSize != Size) { + if (auto E = + DIExpression::createFragmentExpression(Expr, Start, Size)) + FragmentExpr = *E; + else + continue; + } } - // Remove any existing dbg.declare intrinsic describing the same alloca. - if (DbgDeclareInst *OldDDI = FindAllocaDbgDeclare(Fragment.Alloca)) - OldDDI->eraseFromParent(); + // Remove any existing intrinsics describing the same alloca. + for (DbgInfoIntrinsic *OldDII : FindDbgAddrUses(Fragment.Alloca)) + OldDII->eraseFromParent(); DIB.insertDeclare(Fragment.Alloca, Var, FragmentExpr, - DbgDecl->getDebugLoc(), &AI); + DbgDeclares.front()->getDebugLoc(), &AI); } } return Changed; @@ -4175,12 +4294,22 @@ bool SROA::runOnAlloca(AllocaInst &AI) { /// /// We also record the alloca instructions deleted here so that they aren't /// subsequently handed to mem2reg to promote. -void SROA::deleteDeadInstructions( +bool SROA::deleteDeadInstructions( SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) { + bool Changed = false; while (!DeadInsts.empty()) { Instruction *I = DeadInsts.pop_back_val(); DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n"); + // If the instruction is an alloca, find the possible dbg.declare connected + // to it, and remove it too. We must do this before calling RAUW or we will + // not be able to find it. + if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) { + DeletedAllocas.insert(AI); + for (DbgInfoIntrinsic *OldDII : FindDbgAddrUses(AI)) + OldDII->eraseFromParent(); + } + I->replaceAllUsesWith(UndefValue::get(I->getType())); for (Use &Operand : I->operands()) @@ -4191,15 +4320,11 @@ void SROA::deleteDeadInstructions( DeadInsts.insert(U); } - if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) { - DeletedAllocas.insert(AI); - if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(AI)) - DbgDecl->eraseFromParent(); - } - ++NumDeleted; I->eraseFromParent(); + Changed = true; } + return Changed; } /// \brief Promote the allocas, using the best available technique. @@ -4241,7 +4366,7 @@ PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT, do { while (!Worklist.empty()) { Changed |= runOnAlloca(*Worklist.pop_back_val()); - deleteDeadInstructions(DeletedAllocas); + Changed |= deleteDeadInstructions(DeletedAllocas); // Remove the deleted allocas from various lists so that we don't try to // continue processing them. 
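One ordering constraint in the rewritten deleteDeadInstructions above is easy to miss: the debug intrinsics are erased before replaceAllUsesWith runs, because FindDbgAddrUses reaches them through the alloca's uses and would find nothing once those uses have been rewritten to undef. A minimal sketch of just that ordering, reusing only calls that already appear in this diff (a sketch, not a drop-in replacement for the pass code):

    // Sketch: erase the debug users of a dead alloca *before* RAUW.
    // Assumes the usual LLVM headers plus the one declaring FindDbgAddrUses.
    static void eraseDeadAllocaWithDbgUsers(llvm::AllocaInst *AI) {
      // Must happen first; after RAUW these intrinsics no longer use AI.
      for (llvm::DbgInfoIntrinsic *DII : llvm::FindDbgAddrUses(AI))
        DII->eraseFromParent();
      AI->replaceAllUsesWith(llvm::UndefValue::get(AI->getType()));
      AI->eraseFromParent();
    }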
@@ -4249,7 +4374,7 @@ PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT, auto IsInSet = [&](AllocaInst *AI) { return DeletedAllocas.count(AI); }; Worklist.remove_if(IsInSet); PostPromotionWorklist.remove_if(IsInSet); - PromotableAllocas.erase(remove_if(PromotableAllocas, IsInSet), + PromotableAllocas.erase(llvm::remove_if(PromotableAllocas, IsInSet), PromotableAllocas.end()); DeletedAllocas.clear(); } @@ -4284,9 +4409,12 @@ class llvm::sroa::SROALegacyPass : public FunctionPass { SROA Impl; public: + static char ID; + SROALegacyPass() : FunctionPass(ID) { initializeSROALegacyPassPass(*PassRegistry::getPassRegistry()); } + bool runOnFunction(Function &F) override { if (skipFunction(F)) return false; @@ -4296,6 +4424,7 @@ public: getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F)); return !PA.areAllPreserved(); } + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DominatorTreeWrapperPass>(); @@ -4304,7 +4433,6 @@ public: } StringRef getPassName() const override { return "SROA"; } - static char ID; }; char SROALegacyPass::ID = 0; diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index ce6f93eb0c15..3b99ddff2e06 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -35,11 +35,13 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeADCELegacyPassPass(Registry); initializeBDCELegacyPassPass(Registry); initializeAlignmentFromAssumptionsPass(Registry); + initializeCallSiteSplittingLegacyPassPass(Registry); initializeConstantHoistingLegacyPassPass(Registry); initializeConstantPropagationPass(Registry); initializeCorrelatedValuePropagationPass(Registry); initializeDCELegacyPassPass(Registry); initializeDeadInstEliminationPass(Registry); + initializeDivRemPairsLegacyPassPass(Registry); initializeScalarizerPass(Registry); initializeDSELegacyPassPass(Registry); initializeGuardWideningLegacyPassPass(Registry); @@ -73,17 +75,17 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeLowerExpectIntrinsicPass(Registry); initializeLowerGuardIntrinsicLegacyPassPass(Registry); initializeMemCpyOptLegacyPassPass(Registry); + initializeMergeICmpsPass(Registry); initializeMergedLoadStoreMotionLegacyPassPass(Registry); initializeNaryReassociateLegacyPassPass(Registry); initializePartiallyInlineLibCallsLegacyPassPass(Registry); initializeReassociateLegacyPassPass(Registry); initializeRegToMemPass(Registry); - initializeRewriteStatepointsForGCPass(Registry); + initializeRewriteStatepointsForGCLegacyPassPass(Registry); initializeSCCPLegacyPassPass(Registry); initializeIPSCCPLegacyPassPass(Registry); initializeSROALegacyPassPass(Registry); initializeCFGSimplifyPassPass(Registry); - initializeLateCFGSimplifyPassPass(Registry); initializeStructurizeCFGPass(Registry); initializeSimpleLoopUnswitchLegacyPassPass(Registry); initializeSinkingLegacyPassPass(Registry); @@ -98,6 +100,8 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeLoopLoadEliminationPass(Registry); initializeLoopSimplifyCFGLegacyPassPass(Registry); initializeLoopVersioningPassPass(Registry); + initializeEntryExitInstrumenterPass(Registry); + initializePostInlineEntryExitInstrumenterPass(Registry); } void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) { @@ -117,11 +121,7 @@ void LLVMAddAlignmentFromAssumptionsPass(LLVMPassManagerRef PM) { } void LLVMAddCFGSimplificationPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createCFGSimplificationPass()); -} 
- -void LLVMAddLateCFGSimplificationPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createLateCFGSimplificationPass()); + unwrap(PM)->add(createCFGSimplificationPass(1, false, false, true)); } void LLVMAddDeadStoreEliminationPass(LLVMPassManagerRef PM) { diff --git a/lib/Transforms/Scalar/Scalarizer.cpp b/lib/Transforms/Scalar/Scalarizer.cpp index d11855f2f3a9..34ed126155be 100644 --- a/lib/Transforms/Scalar/Scalarizer.cpp +++ b/lib/Transforms/Scalar/Scalarizer.cpp @@ -1,4 +1,4 @@ -//===--- Scalarizer.cpp - Scalarize vector operations ---------------------===// +//===- Scalarizer.cpp - Scalarize vector operations -----------------------===// // // The LLVM Compiler Infrastructure // @@ -14,36 +14,59 @@ // //===----------------------------------------------------------------------===// -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Twine.h" #include "llvm/Analysis/VectorUtils.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" #include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/Options.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include <cassert> +#include <cstdint> +#include <iterator> +#include <map> +#include <utility> using namespace llvm; #define DEBUG_TYPE "scalarizer" namespace { + // Used to store the scattered form of a vector. -typedef SmallVector<Value *, 8> ValueVector; +using ValueVector = SmallVector<Value *, 8>; // Used to map a vector Value to its scattered form. We use std::map // because we want iterators to persist across insertion and because the // values are relatively large. -typedef std::map<Value *, ValueVector> ScatterMap; +using ScatterMap = std::map<Value *, ValueVector>; // Lists Instructions that have been replaced with scalar implementations, // along with a pointer to their scattered forms. -typedef SmallVector<std::pair<Instruction *, ValueVector *>, 16> GatherList; +using GatherList = SmallVector<std::pair<Instruction *, ValueVector *>, 16>; // Provides a very limited vector-like interface for lazily accessing one // component of a scattered vector or vector pointer. class Scatterer { public: - Scatterer() {} + Scatterer() = default; // Scatter V into Size components. If new instructions are needed, // insert them before BBI in BB. If Cache is nonnull, use it to cache @@ -71,10 +94,12 @@ private: // called Name that compares X and Y in the same way as FCI. struct FCmpSplitter { FCmpSplitter(FCmpInst &fci) : FCI(fci) {} + Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1, const Twine &Name) const { return Builder.CreateFCmp(FCI.getPredicate(), Op0, Op1, Name); } + FCmpInst &FCI; }; @@ -82,10 +107,12 @@ struct FCmpSplitter { // called Name that compares X and Y in the same way as ICI. 
struct ICmpSplitter { ICmpSplitter(ICmpInst &ici) : ICI(ici) {} + Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1, const Twine &Name) const { return Builder.CreateICmp(ICI.getPredicate(), Op0, Op1, Name); } + ICmpInst &ICI; }; @@ -93,16 +120,18 @@ struct ICmpSplitter { // a binary operator like BO called Name with operands X and Y. struct BinarySplitter { BinarySplitter(BinaryOperator &bo) : BO(bo) {} + Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1, const Twine &Name) const { return Builder.CreateBinOp(BO.getOpcode(), Op0, Op1, Name); } + BinaryOperator &BO; }; // Information about a load or store that we're scalarizing. struct VectorLayout { - VectorLayout() : VecTy(nullptr), ElemTy(nullptr), VecAlign(0), ElemSize(0) {} + VectorLayout() = default; // Return the alignment of element I. uint64_t getElemAlign(unsigned I) { @@ -110,16 +139,16 @@ struct VectorLayout { } // The type of the vector. - VectorType *VecTy; + VectorType *VecTy = nullptr; // The type of each element. - Type *ElemTy; + Type *ElemTy = nullptr; // The alignment of the vector. - uint64_t VecAlign; + uint64_t VecAlign = 0; // The size of each element. - uint64_t ElemSize; + uint64_t ElemSize = 0; }; class Scalarizer : public FunctionPass, @@ -127,8 +156,7 @@ class Scalarizer : public FunctionPass, public: static char ID; - Scalarizer() : - FunctionPass(ID) { + Scalarizer() : FunctionPass(ID) { initializeScalarizerPass(*PassRegistry::getPassRegistry()); } @@ -137,19 +165,19 @@ public: // InstVisitor methods. They return true if the instruction was scalarized, // false if nothing changed. - bool visitInstruction(Instruction &) { return false; } + bool visitInstruction(Instruction &I) { return false; } bool visitSelectInst(SelectInst &SI); - bool visitICmpInst(ICmpInst &); - bool visitFCmpInst(FCmpInst &); - bool visitBinaryOperator(BinaryOperator &); - bool visitGetElementPtrInst(GetElementPtrInst &); - bool visitCastInst(CastInst &); - bool visitBitCastInst(BitCastInst &); - bool visitShuffleVectorInst(ShuffleVectorInst &); - bool visitPHINode(PHINode &); - bool visitLoadInst(LoadInst &); - bool visitStoreInst(StoreInst &); - bool visitCallInst(CallInst &I); + bool visitICmpInst(ICmpInst &ICI); + bool visitFCmpInst(FCmpInst &FCI); + bool visitBinaryOperator(BinaryOperator &BO); + bool visitGetElementPtrInst(GetElementPtrInst &GEPI); + bool visitCastInst(CastInst &CI); + bool visitBitCastInst(BitCastInst &BCI); + bool visitShuffleVectorInst(ShuffleVectorInst &SVI); + bool visitPHINode(PHINode &PHI); + bool visitLoadInst(LoadInst &LI); + bool visitStoreInst(StoreInst &SI); + bool visitCallInst(CallInst &ICI); static void registerOptions() { // This is disabled by default because having separate loads and stores @@ -162,11 +190,12 @@ public: } private: - Scatterer scatter(Instruction *, Value *); - void gather(Instruction *, const ValueVector &); + Scatterer scatter(Instruction *Point, Value *V); + void gather(Instruction *Op, const ValueVector &CV); bool canTransferMetadata(unsigned Kind); - void transferMetadata(Instruction *, const ValueVector &); - bool getVectorLayout(Type *, unsigned, VectorLayout &, const DataLayout &); + void transferMetadata(Instruction *Op, const ValueVector &CV); + bool getVectorLayout(Type *Ty, unsigned Alignment, VectorLayout &Layout, + const DataLayout &DL); bool finish(); template<typename T> bool splitBinary(Instruction &, const T &); @@ -179,9 +208,10 @@ private: bool ScalarizeLoadStore; }; -char Scalarizer::ID = 0; } // end anonymous namespace +char 
Scalarizer::ID = 0;
+
 INITIALIZE_PASS_WITH_OPTIONS(Scalarizer, "scalarizer",
                              "Scalarize vector operations", false, false)
 
@@ -222,7 +252,7 @@ Value *Scatterer::operator[](unsigned I) {
   // Search through a chain of InsertElementInsts looking for element I.
   // Record other elements in the cache.  The new V is still suitable
   // for all uncached indices.
-  for (;;) {
+  while (true) {
     InsertElementInst *Insert = dyn_cast<InsertElementInst>(V);
     if (!Insert)
       break;
diff --git a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 84675f41cdd5..209821ff21d7 100644
--- a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -1,4 +1,4 @@
-//===-- SeparateConstOffsetFromGEP.cpp - ------------------------*- C++ -*-===//
+//===- SeparateConstOffsetFromGEP.cpp -------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -156,27 +156,44 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
 #include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
+#include <cstdint>
+#include <string>
 
 using namespace llvm;
 using namespace llvm::PatternMatch;
@@ -185,6 +202,7 @@ static cl::opt<bool> DisableSeparateConstOffsetFromGEP(
     "disable-separate-const-offset-from-gep", cl::init(false),
     cl::desc("Do not separate the constant offset from a GEP instruction"),
     cl::Hidden);
+
 // Setting this flag may emit false positives when the input module already
 // contains dead instructions. Therefore, we set it only in unit tests that are
 // free of dead code.
@@ -219,6 +237,7 @@ public:
   /// garbage-collect unused instructions in UserChain.
   static Value *Extract(Value *Idx, GetElementPtrInst *GEP,
                         User *&UserChainTail, const DominatorTree *DT);
+
   /// Looks for a constant offset from the given GEP index without extracting
   /// it. It returns the numeric value of the extracted constant offset (0 if
   /// failed). The meaning of the arguments is the same as in Extract.
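To make the Extract/Find contract concrete, here is a small self-contained sketch in plain C++ (Node and findConstOffset are invented toy names, not part of this pass): it walks an add-expression the way find and findInEitherOperand walk a GEP index, returning one constant summand C so the index can be read as V' + C, with 0 doubling as the failure value exactly as documented above.

    #include <cstdint>

    // Toy stand-in for a GEP index expression: a constant, an opaque
    // symbol, or the sum of two sub-expressions.
    struct Node {
      enum Kind { Const, Symbol, Add } K;
      int64_t Val = 0;                           // valid when K == Const
      const Node *LHS = nullptr, *RHS = nullptr; // valid when K == Add
    };

    // Return a constant summand C such that the expression reads V' + C,
    // or 0 when nothing can be pulled out (mirroring the "0 if failed"
    // convention above).
    int64_t findConstOffset(const Node &N) {
      switch (N.K) {
      case Node::Const:  return N.Val;
      case Node::Symbol: return 0;
      case Node::Add:    // try either operand, like findInEitherOperand
        if (int64_t C = findConstOffset(*N.LHS))
          return C;
        return findConstOffset(*N.RHS);
      }
      return 0;
    }

For a + (b + 5) this returns 5; the real extractor additionally records the visited users in UserChain so the index can later be rebuilt without the constant.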
@@ -229,6 +248,7 @@ private:
   ConstantOffsetExtractor(Instruction *InsertionPt, const DominatorTree *DT)
       : IP(InsertionPt), DL(InsertionPt->getModule()->getDataLayout()), DT(DT) {
   }
+
  /// Searches the expression that computes V for a non-zero constant C s.t.
  /// V can be reassociated into the form V' + C. If the search is
  /// successful, returns C and updates UserChain as a def-use chain from C to
  /// V; otherwise, UserChain is empty.
  ///
  /// \p V            The given expression
  /// \p SignExtended Whether V will be sign-extended in the computation of the
  ///                 GEP index
  /// \p ZeroExtended Whether V will be zero-extended in the computation of the
  ///                 GEP index
  /// \p NonNegative  Whether V is guaranteed to be non-negative. For example,
  ///                 an index of an inbounds GEP is guaranteed to be
  ///                 non-negative. Leveraging this, we can better split
  ///                 inbounds GEPs.
  APInt find(Value *V, bool SignExtended, bool ZeroExtended, bool NonNegative);
+
  /// A helper function to look into both operands of a binary operator.
  APInt findInEitherOperand(BinaryOperator *BO, bool SignExtended,
                            bool ZeroExtended);
+
  /// After finding the constant offset C from the GEP index I, we build a new
  /// index I' s.t. I' + C = I. This function builds and returns the new
  /// index I' according to UserChain produced by function "find".
  ///
  /// The building conceptually takes two steps:
  /// 1) iteratively distribute s/zext towards the leaves of the expression tree
  ///    that computes I
  /// 2) reassociate the expression tree to the form I' + C.
  ///
  /// For example, to extract the 5 from sext(a + (b + 5)), we first distribute
  /// sext to a, b and 5 so that we have
  ///   sext(a) + (sext(b) + 5).
  /// Then, we reassociate it to
  ///   (sext(a) + sext(b)) + 5.
  /// Given this form, we know I' is sext(a) + sext(b).
  Value *rebuildWithoutConstOffset();
+
  /// After the first step of rebuilding the GEP index without the constant
  /// offset, distribute s/zext to the operands of all operators in UserChain.
  /// e.g., zext(sext(a + (b + 5)) (assuming no overflow) =>
  ///   zext(sext(a)) + (zext(sext(b)) + zext(sext(5))).
  ///
  /// The function also updates UserChain to point to new subexpressions after
  /// distributing. e.g., the old UserChain of the above example is
  ///   5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)),
  /// and the new UserChain is
  ///   zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) ->
  ///     zext(sext(a)) + (zext(sext(b)) + zext(sext(5))
  ///
  /// \p ChainIndex The index to UserChain. UserChain[ChainIndex] is the
  ///               sub-expression to be distributed. ChainIndex is initially
  ///               UserChain.size() - 1, and is decremented during
  ///               the recursion.
  Value *distributeExtsAndCloneChain(unsigned ChainIndex);
+
  /// Reassociates the GEP index to the form I' + C and returns I'.
  Value *removeConstOffset(unsigned ChainIndex);
+
  /// A helper function to apply ExtInsts, a list of s/zext, to value V.
  /// e.g., if ExtInsts = [sext i32 to i64, zext i16 to i32], this function
  /// returns "sext i32 (zext i16 V to i32) to i64".
@@ -303,10 +328,14 @@ private:
  ///
  /// This path helps to rebuild the new GEP index.
  SmallVector<User *, 8> UserChain;
+
  /// A data structure used in rebuildWithoutConstOffset. Contains all
  /// sext/zext instructions along UserChain.
  SmallVector<CastInst *, 16> ExtInsts;
-  Instruction *IP;  /// Insertion position of cloned instructions.
+
+  /// Insertion position of cloned instructions.
+  Instruction *IP;
+
  const DataLayout &DL;
  const DominatorTree *DT;
};
@@ -317,9 +346,10 @@ private:
class SeparateConstOffsetFromGEP : public FunctionPass {
public:
  static char ID;
+
  SeparateConstOffsetFromGEP(const TargetMachine *TM = nullptr,
                             bool LowerGEP = false)
-      : FunctionPass(ID), DL(nullptr), DT(nullptr), TM(TM), LowerGEP(LowerGEP) {
+      : FunctionPass(ID), TM(TM), LowerGEP(LowerGEP) {
    initializeSeparateConstOffsetFromGEPPass(*PassRegistry::getPassRegistry());
  }
@@ -336,12 +366,14 @@ public:
    DL = &M.getDataLayout();
    return false;
  }
+
  bool runOnFunction(Function &F) override;
private:
  /// Tries to split the given GEP into a variadic base and a constant offset,
  /// and returns true if the splitting succeeds.
  bool splitGEP(GetElementPtrInst *GEP);
+
  /// Lower a GEP with multiple indices into multiple GEPs with a single index.
  /// Function splitGEP already split the original GEP into a variadic part and
  /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
  /// variadic part into a set of GEPs with a single index and applies
  /// AccumulativeByteOffset to it.
  ///
  /// \p Variadic              The variadic part of the original GEP.
  /// \p AccumulativeByteOffset The constant offset.
  void lowerToSingleIndexGEPs(GetElementPtrInst *Variadic,
                              int64_t AccumulativeByteOffset);
+
  /// Lower a GEP with multiple indices into ptrtoint+arithmetics+inttoptr form.
  /// Function splitGEP already split the original GEP into a variadic part and
  /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
@@ -360,12 +393,14 @@ private:
  /// \p AccumulativeByteOffset The constant offset.
  void lowerToArithmetics(GetElementPtrInst *Variadic,
                          int64_t AccumulativeByteOffset);
+
  /// Finds the constant offset within each index and accumulates them. If
  /// LowerGEP is true, it finds in indices of both sequential and structure
  /// types, otherwise it only finds in sequential indices. The output
  /// NeedsExtraction indicates whether we successfully found a non-zero
  /// constant offset.
  int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction);
+
  /// Canonicalize array indices to pointer-size integers. This helps to
  /// simplify the logic of splitting a GEP. For example, if a + b is a
  /// pointer-size integer, we have
  ///   gep base, a + b = gep (gep base, a), b
  /// However, this equality may not hold if the size of a + b is smaller than
  /// the pointer size, because LLVM conceptually sign-extends GEP indices to
  /// pointer size before computing the address. This canonicalization is very
  /// likely already done in clang and instcombine. Therefore, the program will
  /// probably remain the same.
  ///
  /// Verified in @i32_add in split-gep.ll
  bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP);
+
  /// Optimize sext(a)+sext(b) to sext(a+b) when a+b can't sign overflow.
  /// SeparateConstOffsetFromGEP distributes a sext to leaves before extracting
  /// the constant offset. After extraction, it becomes desirable to reunite the
  /// distributed sexts. For example,
  ///
  ///                              &a[sext(i +nsw (j +nsw 5)]
  ///   => distribute              &a[sext(i) +nsw (sext(j) +nsw 5)]
  ///   => constant extraction     &a[sext(i) + sext(j)] + 5
  ///   => reunion                 &a[sext(i +nsw j)] + 5
  bool reuniteExts(Function &F);
+
  /// A helper that reunites sexts in an instruction.
  bool reuniteExts(Instruction *I);
+
  /// Find the closest dominator of <Dominatee> that is equivalent to <Key>.
  Instruction *findClosestMatchingDominator(const SCEV *Key,
                                            Instruction *Dominatee);
@@ -401,27 +439,33 @@ private:
  void verifyNoDeadCode(Function &F);

  bool hasMoreThanOneUseInLoop(Value *v, Loop *L);
+
  // Swap the index operands of two GEPs.
  void swapGEPOperand(GetElementPtrInst *First, GetElementPtrInst *Second);
+
  // Check if it is safe to swap the operands of two GEPs.
  bool isLegalToSwapOperand(GetElementPtrInst *First, GetElementPtrInst *Second,
                            Loop *CurLoop);

-  const DataLayout *DL;
-  DominatorTree *DT;
+  const DataLayout *DL = nullptr;
+  DominatorTree *DT = nullptr;
  ScalarEvolution *SE;
  const TargetMachine *TM;

  LoopInfo *LI;
  TargetLibraryInfo *TLI;
+
  /// Whether to lower a GEP with multiple indices into arithmetic operations or
  /// multiple GEPs with a single index.
bool LowerGEP; + DenseMap<const SCEV *, SmallVector<Instruction *, 2>> DominatingExprs; }; -} // anonymous namespace + +} // end anonymous namespace char SeparateConstOffsetFromGEP::ID = 0; + INITIALIZE_PASS_BEGIN( SeparateConstOffsetFromGEP, "separate-const-offset-from-gep", "Split GEPs to a variadic base and a constant offset for better CSE", false, diff --git a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index aaab5857e0f1..3d0fca0bc3a5 100644 --- a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" @@ -28,6 +29,7 @@ #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" #include "llvm/Pass.h" @@ -36,11 +38,15 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GenericDomTree.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/ValueMapper.h" #include <algorithm> #include <cassert> #include <iterator> +#include <numeric> #include <utility> #define DEBUG_TYPE "simple-loop-unswitch" @@ -51,6 +57,15 @@ STATISTIC(NumBranches, "Number of branches unswitched"); STATISTIC(NumSwitches, "Number of switches unswitched"); STATISTIC(NumTrivial, "Number of unswitches that are trivial"); +static cl::opt<bool> EnableNonTrivialUnswitch( + "enable-nontrivial-unswitch", cl::init(false), cl::Hidden, + cl::desc("Forcibly enables non-trivial loop unswitching rather than " + "following the configuration passed into the pass.")); + +static cl::opt<int> + UnswitchThreshold("unswitch-threshold", cl::init(50), cl::Hidden, + cl::desc("The cost threshold for unswitching a loop.")); + static void replaceLoopUsesWithConstant(Loop &L, Value &LIC, Constant &Replacement) { assert(!isa<Constant>(LIC) && "Why are we unswitching on a constant?"); @@ -68,24 +83,95 @@ static void replaceLoopUsesWithConstant(Loop &L, Value &LIC, } } -/// Update the dominator tree after removing one exiting predecessor of a loop -/// exit block. -static void updateLoopExitIDom(BasicBlock *LoopExitBB, Loop &L, - DominatorTree &DT) { - assert(pred_begin(LoopExitBB) != pred_end(LoopExitBB) && - "Cannot have empty predecessors of the loop exit block if we split " - "off a block to unswitch!"); +/// Update the IDom for a basic block whose predecessor set has changed. +/// +/// This routine is designed to work when the domtree update is relatively +/// localized by leveraging a known common dominator, often a loop header. +/// +/// FIXME: Should consider hand-rolling a slightly more efficient non-DFS +/// approach here as we can do that easily by persisting the candidate IDom's +/// dominating set between each predecessor. +/// +/// FIXME: Longer term, many uses of this can be replaced by an incremental +/// domtree update strategy that starts from a known dominating block and +/// rebuilds that subtree. 
+static bool updateIDomWithKnownCommonDominator(BasicBlock *BB, + BasicBlock *KnownDominatingBB, + DominatorTree &DT) { + assert(pred_begin(BB) != pred_end(BB) && + "This routine does not handle unreachable blocks!"); + + BasicBlock *OrigIDom = DT[BB]->getIDom()->getBlock(); + + BasicBlock *IDom = *pred_begin(BB); + assert(DT.dominates(KnownDominatingBB, IDom) && + "Bad known dominating block!"); - BasicBlock *IDom = *pred_begin(LoopExitBB); // Walk all of the other predecessors finding the nearest common dominator // until all predecessors are covered or we reach the loop header. The loop // header necessarily dominates all loop exit blocks in loop simplified form // so we can early-exit the moment we hit that block. - for (auto PI = std::next(pred_begin(LoopExitBB)), PE = pred_end(LoopExitBB); - PI != PE && IDom != L.getHeader(); ++PI) + for (auto PI = std::next(pred_begin(BB)), PE = pred_end(BB); + PI != PE && IDom != KnownDominatingBB; ++PI) { + assert(DT.dominates(KnownDominatingBB, *PI) && + "Bad known dominating block!"); IDom = DT.findNearestCommonDominator(IDom, *PI); + } + + if (IDom == OrigIDom) + return false; + + DT.changeImmediateDominator(BB, IDom); + return true; +} + +// Note that we don't currently use the IDFCalculator here for two reasons: +// 1) It computes dominator tree levels for the entire function on each run +// of 'compute'. While this isn't terrible, given that we expect to update +// relatively small subtrees of the domtree, it isn't necessarily the right +// tradeoff. +// 2) The interface doesn't fit this usage well. It doesn't operate in +// append-only, and builds several sets that we don't need. +// +// FIXME: Neither of these issues are a big deal and could be addressed with +// some amount of refactoring of IDFCalculator. That would allow us to share +// the core logic here (which is solving the same core problem). +static void appendDomFrontier(DomTreeNode *Node, + SmallSetVector<BasicBlock *, 4> &Worklist, + SmallVectorImpl<DomTreeNode *> &DomNodes, + SmallPtrSetImpl<BasicBlock *> &DomSet) { + assert(DomNodes.empty() && "Must start with no dominator nodes."); + assert(DomSet.empty() && "Must start with an empty dominator set."); + + // First flatten this subtree into sequence of nodes by doing a pre-order + // walk. + DomNodes.push_back(Node); + // We intentionally re-evaluate the size as each node can add new children. + // Because this is a tree walk, this cannot add any duplicates. + for (int i = 0; i < (int)DomNodes.size(); ++i) + DomNodes.insert(DomNodes.end(), DomNodes[i]->begin(), DomNodes[i]->end()); + + // Now create a set of the basic blocks so we can quickly test for + // dominated successors. We could in theory use the DFS numbers of the + // dominator tree for this, but we want this to remain predictably fast + // even while we mutate the dominator tree in ways that would invalidate + // the DFS numbering. + for (DomTreeNode *InnerN : DomNodes) + DomSet.insert(InnerN->getBlock()); - DT.changeImmediateDominator(LoopExitBB, IDom); + // Now re-walk the nodes, appending every successor of every node that isn't + // in the set. Note that we don't append the node itself, even though if it + // is a successor it does not strictly dominate itself and thus it would be + // part of the dominance frontier. The reason we don't append it is that + // the node passed in came *from* the worklist and so it has already been + // processed. 
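+  // (A hypothetical shape for intuition: if this subtree covers {A, B}
+  //  and B also branches to C outside that set, C is appended below as
+  //  a frontier block whose idom may need to move up.)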
+ for (DomTreeNode *InnerN : DomNodes) + for (BasicBlock *SuccBB : successors(InnerN->getBlock())) + if (!DomSet.count(SuccBB)) + Worklist.insert(SuccBB); + + DomNodes.clear(); + DomSet.clear(); } /// Update the dominator tree after unswitching a particular former exit block. @@ -127,58 +213,14 @@ static void updateDTAfterUnswitch(BasicBlock *UnswitchedBB, BasicBlock *OldPH, // dominator frontier to see if it additionally should move up the dominator // tree. This lambda appends the dominator frontier for a node on the // worklist. - // - // Note that we don't currently use the IDFCalculator here for two reasons: - // 1) It computes dominator tree levels for the entire function on each run - // of 'compute'. While this isn't terrible, given that we expect to update - // relatively small subtrees of the domtree, it isn't necessarily the right - // tradeoff. - // 2) The interface doesn't fit this usage well. It doesn't operate in - // append-only, and builds several sets that we don't need. - // - // FIXME: Neither of these issues are a big deal and could be addressed with - // some amount of refactoring of IDFCalculator. That would allow us to share - // the core logic here (which is solving the same core problem). SmallSetVector<BasicBlock *, 4> Worklist; + + // Scratch data structures reused by domfrontier finding. SmallVector<DomTreeNode *, 4> DomNodes; SmallPtrSet<BasicBlock *, 4> DomSet; - auto AppendDomFrontier = [&](DomTreeNode *Node) { - assert(DomNodes.empty() && "Must start with no dominator nodes."); - assert(DomSet.empty() && "Must start with an empty dominator set."); - - // First flatten this subtree into sequence of nodes by doing a pre-order - // walk. - DomNodes.push_back(Node); - // We intentionally re-evaluate the size as each node can add new children. - // Because this is a tree walk, this cannot add any duplicates. - for (int i = 0; i < (int)DomNodes.size(); ++i) - DomNodes.insert(DomNodes.end(), DomNodes[i]->begin(), DomNodes[i]->end()); - - // Now create a set of the basic blocks so we can quickly test for - // dominated successors. We could in theory use the DFS numbers of the - // dominator tree for this, but we want this to remain predictably fast - // even while we mutate the dominator tree in ways that would invalidate - // the DFS numbering. - for (DomTreeNode *InnerN : DomNodes) - DomSet.insert(InnerN->getBlock()); - - // Now re-walk the nodes, appending every successor of every node that isn't - // in the set. Note that we don't append the node itself, even though if it - // is a successor it does not strictly dominate itself and thus it would be - // part of the dominance frontier. The reason we don't append it is that - // the node passed in came *from* the worklist and so it has already been - // processed. - for (DomTreeNode *InnerN : DomNodes) - for (BasicBlock *SuccBB : successors(InnerN->getBlock())) - if (!DomSet.count(SuccBB)) - Worklist.insert(SuccBB); - - DomNodes.clear(); - DomSet.clear(); - }; // Append the initial dom frontier nodes. - AppendDomFrontier(UnswitchedNode); + appendDomFrontier(UnswitchedNode, Worklist, DomNodes, DomSet); // Walk the worklist. We grow the list in the loop and so must recompute size. for (int i = 0; i < (int)Worklist.size(); ++i) { @@ -197,7 +239,7 @@ static void updateDTAfterUnswitch(BasicBlock *UnswitchedBB, BasicBlock *OldPH, DT.changeImmediateDominator(Node, OldPHNode); // Now add this node's dominator frontier to the worklist as well. 
- AppendDomFrontier(Node); + appendDomFrontier(Node, Worklist, DomNodes, DomSet); } } @@ -395,7 +437,7 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, // one of the predecessors for the loop exit block and may need to update its // idom. if (UnswitchedBB != LoopExitBB) - updateLoopExitIDom(LoopExitBB, L, DT); + updateIDomWithKnownCommonDominator(LoopExitBB, L.getHeader(), DT); // Since this is an i1 condition we can also trivially replace uses of it // within the loop with a constant. @@ -540,7 +582,7 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI); rewritePHINodesForExitAndUnswitchedBlocks(*DefaultExitBB, *SplitBB, *ParentBB, *OldPH); - updateLoopExitIDom(DefaultExitBB, L, DT); + updateIDomWithKnownCommonDominator(DefaultExitBB, L.getHeader(), DT); DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB; } } @@ -567,7 +609,7 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, SplitExitBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI); rewritePHINodesForExitAndUnswitchedBlocks(*ExitBB, *SplitExitBB, *ParentBB, *OldPH); - updateLoopExitIDom(ExitBB, L, DT); + updateIDomWithKnownCommonDominator(ExitBB, L.getHeader(), DT); } // Update the case pair to point to the split block. CasePair.second = SplitExitBB; @@ -708,15 +750,1172 @@ static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT, return Changed; } +/// Build the cloned blocks for an unswitched copy of the given loop. +/// +/// The cloned blocks are inserted before the loop preheader (`LoopPH`) and +/// after the split block (`SplitBB`) that will be used to select between the +/// cloned and original loop. +/// +/// This routine handles cloning all of the necessary loop blocks and exit +/// blocks including rewriting their instructions and the relevant PHI nodes. +/// It skips loop and exit blocks that are not necessary based on the provided +/// set. It also correctly creates the unconditional branch in the cloned +/// unswitched parent block to only point at the unswitched successor. +/// +/// This does not handle most of the necessary updates to `LoopInfo`. Only exit +/// block splitting is correctly reflected in `LoopInfo`, essentially all of +/// the cloned blocks (and their loops) are left without full `LoopInfo` +/// updates. This also doesn't fully update `DominatorTree`. It adds the cloned +/// blocks to them but doesn't create the cloned `DominatorTree` structure and +/// instead the caller must recompute an accurate DT. It *does* correctly +/// update the `AssumptionCache` provided in `AC`. +static BasicBlock *buildClonedLoopBlocks( + Loop &L, BasicBlock *LoopPH, BasicBlock *SplitBB, + ArrayRef<BasicBlock *> ExitBlocks, BasicBlock *ParentBB, + BasicBlock *UnswitchedSuccBB, BasicBlock *ContinueSuccBB, + const SmallPtrSetImpl<BasicBlock *> &SkippedLoopAndExitBlocks, + ValueToValueMapTy &VMap, AssumptionCache &AC, DominatorTree &DT, + LoopInfo &LI) { + SmallVector<BasicBlock *, 4> NewBlocks; + NewBlocks.reserve(L.getNumBlocks() + ExitBlocks.size()); + + // We will need to clone a bunch of blocks, wrap up the clone operation in + // a helper. + auto CloneBlock = [&](BasicBlock *OldBB) { + // Clone the basic block and insert it before the new preheader. + BasicBlock *NewBB = CloneBasicBlock(OldBB, VMap, ".us", OldBB->getParent()); + NewBB->moveBefore(LoopPH); + + // Record this block and the mapping. 
+    NewBlocks.push_back(NewBB);
+    VMap[OldBB] = NewBB;
+
+    // Add the block to the domtree. We'll move it to the correct position
+    // below.
+    DT.addNewBlock(NewBB, SplitBB);
+
+    return NewBB;
+  };
+
+  // First, clone the preheader.
+  auto *ClonedPH = CloneBlock(LoopPH);
+
+  // Then clone all the loop blocks, skipping the ones that aren't necessary.
+  for (auto *LoopBB : L.blocks())
+    if (!SkippedLoopAndExitBlocks.count(LoopBB))
+      CloneBlock(LoopBB);
+
+  // Split all the loop exit edges so that when we clone the exit blocks, if
+  // any of the exit blocks are *also* a preheader for some other loop, we
+  // don't create multiple predecessors entering the loop header.
+  for (auto *ExitBB : ExitBlocks) {
+    if (SkippedLoopAndExitBlocks.count(ExitBB))
+      continue;
+
+    // When we are going to clone an exit, we don't need to clone all the
+    // instructions in the exit block and we want to ensure we have an easy
+    // place to merge the CFG, so split the exit first. This is always safe to
+    // do because there cannot be any non-loop predecessors of a loop exit in
+    // loop simplified form.
+    auto *MergeBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI);
+
+    // Rearrange the names to make it easier to write test cases by having the
+    // exit block carry the suffix rather than the merge block carrying the
+    // suffix.
+    MergeBB->takeName(ExitBB);
+    ExitBB->setName(Twine(MergeBB->getName()) + ".split");
+
+    // Now clone the original exit block.
+    auto *ClonedExitBB = CloneBlock(ExitBB);
+    assert(ClonedExitBB->getTerminator()->getNumSuccessors() == 1 &&
+           "Exit block should have been split to have one successor!");
+    assert(ClonedExitBB->getTerminator()->getSuccessor(0) == MergeBB &&
+           "Cloned exit block has the wrong successor!");
+
+    // Move the merge block's idom to be the split point as one exit is
+    // dominated by one header, and the other by another, so we know the split
+    // point dominates both. While the dominator tree isn't fully accurate, we
+    // want sub-trees within the original loop to correctly reflect
+    // dominance within that original loop (at least), and that requires moving
+    // the merge block out of that subtree.
+    // FIXME: This is very brittle as we essentially have a partial contract on
+    // the dominator tree. We really need to instead update it and keep it
+    // valid or stop relying on it.
+    DT.changeImmediateDominator(MergeBB, SplitBB);
+
+    // Remap any cloned instructions and create a merge phi node for them.
+    for (auto ZippedInsts : llvm::zip_first(
+             llvm::make_range(ExitBB->begin(), std::prev(ExitBB->end())),
+             llvm::make_range(ClonedExitBB->begin(),
+                              std::prev(ClonedExitBB->end())))) {
+      Instruction &I = std::get<0>(ZippedInsts);
+      Instruction &ClonedI = std::get<1>(ZippedInsts);
+
+      // The only instructions in the exit block should be PHI nodes and
+      // potentially a landing pad.
+      assert(
+          (isa<PHINode>(I) || isa<LandingPadInst>(I) || isa<CatchPadInst>(I)) &&
+          "Bad instruction in exit block!");
+      // We should have a value map between the instruction and its clone.
+      assert(VMap.lookup(&I) == &ClonedI && "Mismatch in the value map!");
+
+      auto *MergePN =
+          PHINode::Create(I.getType(), /*NumReservedValues*/ 2, ".us-phi",
+                          &*MergeBB->getFirstInsertionPt());
+      I.replaceAllUsesWith(MergePN);
+      MergePN->addIncoming(&I, ExitBB);
+      MergePN->addIncoming(&ClonedI, ClonedExitBB);
+    }
+  }
+
+  // Rewrite the instructions in the cloned blocks to refer to their cloned
+  // counterparts rather than the originals. We have to do this as a second
+  // pass so that we have everything available. Also, we have inserted new
+  // instructions which may
+  // include assume intrinsics, so we update the assumption cache while
+  // processing this.
+  for (auto *ClonedBB : NewBlocks)
+    for (Instruction &I : *ClonedBB) {
+      RemapInstruction(&I, VMap,
+                       RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+      if (auto *II = dyn_cast<IntrinsicInst>(&I))
+        if (II->getIntrinsicID() == Intrinsic::assume)
+          AC.registerAssumption(II);
+    }
+
+  // Remove the cloned parent as a predecessor of the cloned continue successor
+  // if we did in fact clone it.
+  auto *ClonedParentBB = cast<BasicBlock>(VMap.lookup(ParentBB));
+  if (auto *ClonedContinueSuccBB =
+          cast_or_null<BasicBlock>(VMap.lookup(ContinueSuccBB)))
+    ClonedContinueSuccBB->removePredecessor(ClonedParentBB,
+                                            /*DontDeleteUselessPHIs*/ true);
+  // Replace the cloned branch with an unconditional branch to the cloned
+  // unswitched successor.
+  auto *ClonedSuccBB = cast<BasicBlock>(VMap.lookup(UnswitchedSuccBB));
+  ClonedParentBB->getTerminator()->eraseFromParent();
+  BranchInst::Create(ClonedSuccBB, ClonedParentBB);
+
+  // Update any PHI nodes in the cloned successors of the skipped blocks to not
+  // have spurious incoming values.
+  for (auto *LoopBB : L.blocks())
+    if (SkippedLoopAndExitBlocks.count(LoopBB))
+      for (auto *SuccBB : successors(LoopBB))
+        if (auto *ClonedSuccBB = cast_or_null<BasicBlock>(VMap.lookup(SuccBB)))
+          for (PHINode &PN : ClonedSuccBB->phis())
+            PN.removeIncomingValue(LoopBB, /*DeletePHIIfEmpty*/ false);
+
+  return ClonedPH;
+}
+
+/// Recursively clone the specified loop and all of its children.
+///
+/// The target parent loop for the clone should be provided, or can be null if
+/// the clone is a top-level loop. While cloning, all the blocks are mapped
+/// with the provided value map. The entire original loop must be present in
+/// the value map. The cloned loop is returned.
+static Loop *cloneLoopNest(Loop &OrigRootL, Loop *RootParentL,
+                           const ValueToValueMapTy &VMap, LoopInfo &LI) {
+  auto AddClonedBlocksToLoop = [&](Loop &OrigL, Loop &ClonedL) {
+    assert(ClonedL.getBlocks().empty() && "Must start with an empty loop!");
+    ClonedL.reserveBlocks(OrigL.getNumBlocks());
+    for (auto *BB : OrigL.blocks()) {
+      auto *ClonedBB = cast<BasicBlock>(VMap.lookup(BB));
+      ClonedL.addBlockEntry(ClonedBB);
+      if (LI.getLoopFor(BB) == &OrigL) {
+        assert(!LI.getLoopFor(ClonedBB) &&
+               "Should not have an existing loop for this block!");
+        LI.changeLoopFor(ClonedBB, &ClonedL);
+      }
+    }
+  };
+
+  // We specially handle the first loop because it may get cloned into
+  // a different parent and because we most commonly are cloning leaf loops.
+  Loop *ClonedRootL = LI.AllocateLoop();
+  if (RootParentL)
+    RootParentL->addChildLoop(ClonedRootL);
+  else
+    LI.addTopLevelLoop(ClonedRootL);
+  AddClonedBlocksToLoop(OrigRootL, *ClonedRootL);
+
+  if (OrigRootL.empty())
+    return ClonedRootL;
+
+  // If we have a nest, we can quickly clone the entire loop nest using an
+  // iterative approach because it is a tree. We keep the cloned parent in the
+  // data structure to avoid repeatedly querying through a map to find it.
+  SmallVector<std::pair<Loop *, Loop *>, 16> LoopsToClone;
+  // Build up the loops to clone in reverse order as we'll clone them from the
+  // back.
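+  // (Illustration with a hypothetical nest Root{A{B}, C}: pushing the
+  //  children reversed leaves [C, A] on the stack, so pop_back_val
+  //  visits A, then A's child B, then C, i.e. a pre-order walk of the
+  //  original nest.)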
+ for (Loop *ChildL : llvm::reverse(OrigRootL)) + LoopsToClone.push_back({ClonedRootL, ChildL}); + do { + Loop *ClonedParentL, *L; + std::tie(ClonedParentL, L) = LoopsToClone.pop_back_val(); + Loop *ClonedL = LI.AllocateLoop(); + ClonedParentL->addChildLoop(ClonedL); + AddClonedBlocksToLoop(*L, *ClonedL); + for (Loop *ChildL : llvm::reverse(*L)) + LoopsToClone.push_back({ClonedL, ChildL}); + } while (!LoopsToClone.empty()); + + return ClonedRootL; +} + +/// Build the cloned loops of an original loop from unswitching. +/// +/// Because unswitching simplifies the CFG of the loop, this isn't a trivial +/// operation. We need to re-verify that there even is a loop (as the backedge +/// may not have been cloned), and even if there are remaining backedges the +/// backedge set may be different. However, we know that each child loop is +/// undisturbed, we only need to find where to place each child loop within +/// either any parent loop or within a cloned version of the original loop. +/// +/// Because child loops may end up cloned outside of any cloned version of the +/// original loop, multiple cloned sibling loops may be created. All of them +/// are returned so that the newly introduced loop nest roots can be +/// identified. +static Loop *buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks, + const ValueToValueMapTy &VMap, LoopInfo &LI, + SmallVectorImpl<Loop *> &NonChildClonedLoops) { + Loop *ClonedL = nullptr; + + auto *OrigPH = OrigL.getLoopPreheader(); + auto *OrigHeader = OrigL.getHeader(); + + auto *ClonedPH = cast<BasicBlock>(VMap.lookup(OrigPH)); + auto *ClonedHeader = cast<BasicBlock>(VMap.lookup(OrigHeader)); + + // We need to know the loops of the cloned exit blocks to even compute the + // accurate parent loop. If we only clone exits to some parent of the + // original parent, we want to clone into that outer loop. We also keep track + // of the loops that our cloned exit blocks participate in. + Loop *ParentL = nullptr; + SmallVector<BasicBlock *, 4> ClonedExitsInLoops; + SmallDenseMap<BasicBlock *, Loop *, 16> ExitLoopMap; + ClonedExitsInLoops.reserve(ExitBlocks.size()); + for (auto *ExitBB : ExitBlocks) + if (auto *ClonedExitBB = cast_or_null<BasicBlock>(VMap.lookup(ExitBB))) + if (Loop *ExitL = LI.getLoopFor(ExitBB)) { + ExitLoopMap[ClonedExitBB] = ExitL; + ClonedExitsInLoops.push_back(ClonedExitBB); + if (!ParentL || (ParentL != ExitL && ParentL->contains(ExitL))) + ParentL = ExitL; + } + assert((!ParentL || ParentL == OrigL.getParentLoop() || + ParentL->contains(OrigL.getParentLoop())) && + "The computed parent loop should always contain (or be) the parent of " + "the original loop."); + + // We build the set of blocks dominated by the cloned header from the set of + // cloned blocks out of the original loop. While not all of these will + // necessarily be in the cloned loop, it is enough to establish that they + // aren't in unreachable cycles, etc. + SmallSetVector<BasicBlock *, 16> ClonedLoopBlocks; + for (auto *BB : OrigL.blocks()) + if (auto *ClonedBB = cast_or_null<BasicBlock>(VMap.lookup(BB))) + ClonedLoopBlocks.insert(ClonedBB); + + // Rebuild the set of blocks that will end up in the cloned loop. We may have + // skipped cloning some region of this loop which can in turn skip some of + // the backedges so we have to rebuild the blocks in the loop based on the + // backedges that remain after cloning. 
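The rebuild that follows is the classic backward walk that defines a natural loop: seed with the header's in-loop predecessors (the backedge sources) and chase predecessor edges until the set closes. A minimal standalone sketch of that walk (a hypothetical helper, not part of this commit; it assumes loop-simplified form, where the preheader is the only non-loop predecessor of the header):

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
using namespace llvm;

// Collect the blocks that still form a natural loop around Header by
// walking backwards from the remaining backedges. Returns an empty set
// when no backedge survives (the "loop" is no longer a loop).
static SmallPtrSet<BasicBlock *, 16>
blocksReachingBackedges(BasicBlock *Header, BasicBlock *Preheader) {
  SmallPtrSet<BasicBlock *, 16> Blocks;
  SmallVector<BasicBlock *, 16> Worklist;
  // Seed with the backedge sources: every header predecessor other than
  // the preheader.
  for (BasicBlock *Pred : predecessors(Header))
    if (Pred != Preheader && Blocks.insert(Pred).second)
      Worklist.push_back(Pred);
  if (Blocks.empty())
    return Blocks;
  // The header bounds the walk: once inserted, we never step past it.
  Blocks.insert(Header);
  while (!Worklist.empty()) {
    BasicBlock *BB = Worklist.pop_back_val();
    for (BasicBlock *Pred : predecessors(BB))
      if (Pred != Preheader && Blocks.insert(Pred).second)
        Worklist.push_back(Pred);
  }
  return Blocks;
}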
+ SmallVector<BasicBlock *, 16> Worklist; + SmallPtrSet<BasicBlock *, 16> BlocksInClonedLoop; + for (auto *Pred : predecessors(ClonedHeader)) { + // The only possible non-loop header predecessor is the preheader because + // we know we cloned the loop in simplified form. + if (Pred == ClonedPH) + continue; + + // Because the loop was in simplified form, the only non-loop predecessor + // should be the preheader. + assert(ClonedLoopBlocks.count(Pred) && "Found a predecessor of the loop " + "header other than the preheader " + "that is not part of the loop!"); + + // Insert this block into the loop set and on the first visit (and if it + // isn't the header we're currently walking) put it into the worklist to + // recurse through. + if (BlocksInClonedLoop.insert(Pred).second && Pred != ClonedHeader) + Worklist.push_back(Pred); + } + + // If we had any backedges then there *is* a cloned loop. Put the header into + // the loop set and then walk the worklist backwards to find all the blocks + // that remain within the loop after cloning. + if (!BlocksInClonedLoop.empty()) { + BlocksInClonedLoop.insert(ClonedHeader); + + while (!Worklist.empty()) { + BasicBlock *BB = Worklist.pop_back_val(); + assert(BlocksInClonedLoop.count(BB) && + "Didn't put block into the loop set!"); + + // Insert any predecessors that are in the possible set into the cloned + // set, and if the insert is successful, add them to the worklist. Note + // that we filter on the blocks that are definitely reachable via the + // backedge to the loop header so we may prune out dead code within the + // cloned loop. + for (auto *Pred : predecessors(BB)) + if (ClonedLoopBlocks.count(Pred) && + BlocksInClonedLoop.insert(Pred).second) + Worklist.push_back(Pred); + } + + ClonedL = LI.AllocateLoop(); + if (ParentL) { + ParentL->addBasicBlockToLoop(ClonedPH, LI); + ParentL->addChildLoop(ClonedL); + } else { + LI.addTopLevelLoop(ClonedL); + } + + ClonedL->reserveBlocks(BlocksInClonedLoop.size()); + // We don't want to just add the cloned loop blocks based on how we + // discovered them. The original order of blocks was carefully built in + // a way that doesn't rely on predecessor ordering. Rather than re-invent + // that logic, we just re-walk the original blocks (and those of the child + // loops) and filter them as we add them into the cloned loop. + for (auto *BB : OrigL.blocks()) { + auto *ClonedBB = cast_or_null<BasicBlock>(VMap.lookup(BB)); + if (!ClonedBB || !BlocksInClonedLoop.count(ClonedBB)) + continue; + + // Directly add the blocks that are only in this loop. + if (LI.getLoopFor(BB) == &OrigL) { + ClonedL->addBasicBlockToLoop(ClonedBB, LI); + continue; + } + + // We want to manually add it to this loop and parents. + // Registering it with LoopInfo will happen when we clone the top + // loop for this block. + for (Loop *PL = ClonedL; PL; PL = PL->getParentLoop()) + PL->addBlockEntry(ClonedBB); + } + + // Now add each child loop whose header remains within the cloned loop. All + // of the blocks within the loop must satisfy the same constraints as the + // header so once we pass the header checks we can just clone the entire + // child loop nest. + for (Loop *ChildL : OrigL) { + auto *ClonedChildHeader = + cast_or_null<BasicBlock>(VMap.lookup(ChildL->getHeader())); + if (!ClonedChildHeader || !BlocksInClonedLoop.count(ClonedChildHeader)) + continue; + +#ifndef NDEBUG + // We should never have a cloned child loop header but fail to have + // all of the blocks for that child loop. 
+      for (auto *ChildLoopBB : ChildL->blocks())
+        assert(BlocksInClonedLoop.count(
+                   cast<BasicBlock>(VMap.lookup(ChildLoopBB))) &&
+               "Child cloned loop has a header within the cloned outer "
+               "loop but not all of its blocks!");
+#endif
+
+      cloneLoopNest(*ChildL, ClonedL, VMap, LI);
+    }
+  }
+
+  // Now that we've handled all the components of the original loop that were
+  // cloned into a new loop, we still need to handle anything from the original
+  // loop that wasn't in a cloned loop.
+
+  // Figure out what blocks are left to place within any loop nest containing
+  // the unswitched loop. If we never formed a loop, the cloned PH is one of
+  // them.
+  SmallPtrSet<BasicBlock *, 16> UnloopedBlockSet;
+  if (BlocksInClonedLoop.empty())
+    UnloopedBlockSet.insert(ClonedPH);
+  for (auto *ClonedBB : ClonedLoopBlocks)
+    if (!BlocksInClonedLoop.count(ClonedBB))
+      UnloopedBlockSet.insert(ClonedBB);
+
+  // Copy the cloned exits and sort them in ascending loop depth; we'll work
+  // backwards across these to process them inside out. The order shouldn't
+  // matter as we're just trying to build up the map from inside-out; we use
+  // the map in a more stably ordered way below.
+  auto OrderedClonedExitsInLoops = ClonedExitsInLoops;
+  std::sort(OrderedClonedExitsInLoops.begin(), OrderedClonedExitsInLoops.end(),
+            [&](BasicBlock *LHS, BasicBlock *RHS) {
+              return ExitLoopMap.lookup(LHS)->getLoopDepth() <
+                     ExitLoopMap.lookup(RHS)->getLoopDepth();
+            });
+
+  // Populate the existing ExitLoopMap with everything reachable from each
+  // exit, starting from the innermost exit.
+  while (!UnloopedBlockSet.empty() && !OrderedClonedExitsInLoops.empty()) {
+    assert(Worklist.empty() && "Didn't clear worklist!");
+
+    BasicBlock *ExitBB = OrderedClonedExitsInLoops.pop_back_val();
+    Loop *ExitL = ExitLoopMap.lookup(ExitBB);
+
+    // Walk the CFG back until we hit the cloned PH adding everything reachable
+    // and in the unlooped set to this exit block's loop.
+    Worklist.push_back(ExitBB);
+    do {
+      BasicBlock *BB = Worklist.pop_back_val();
+      // We can stop recursing at the cloned preheader (if we get there).
+      if (BB == ClonedPH)
+        continue;
+
+      for (BasicBlock *PredBB : predecessors(BB)) {
+        // If this pred has already been moved to our set or is part of some
+        // (inner) loop, no update needed.
+        if (!UnloopedBlockSet.erase(PredBB)) {
+          assert(
+              (BlocksInClonedLoop.count(PredBB) || ExitLoopMap.count(PredBB)) &&
+              "Predecessor not mapped to a loop!");
+          continue;
+        }
+
+        // We just insert into the loop set here. We'll add these blocks to the
+        // exit loop after we build up the set in an order that doesn't rely on
+        // predecessor order (which in turn relies on use list order).
+        bool Inserted = ExitLoopMap.insert({PredBB, ExitL}).second;
+        (void)Inserted;
+        assert(Inserted && "Should only visit an unlooped block once!");
+
+        // And recurse through to its predecessors.
+        Worklist.push_back(PredBB);
+      }
+    } while (!Worklist.empty());
+  }
+
+  // Now that the ExitLoopMap gives us a mapping for all the non-looping cloned
+  // blocks to their outer loops, walk the cloned blocks and the cloned exits
+  // in their original order adding them to the correct loop.
+
+  // We need a stable insertion order. We use the order of the original loop
+  // order and map into the correct parent loop.
+  for (auto *BB : llvm::concat<BasicBlock *const>(
+           makeArrayRef(ClonedPH), ClonedLoopBlocks, ClonedExitsInLoops))
+    if (Loop *OuterL = ExitLoopMap.lookup(BB))
+      OuterL->addBasicBlockToLoop(BB, LI);
+
+#ifndef NDEBUG
+  for (auto &BBAndL : ExitLoopMap) {
+    auto *BB = BBAndL.first;
+    auto *OuterL = BBAndL.second;
+    assert(LI.getLoopFor(BB) == OuterL &&
+           "Failed to put all blocks into outer loops!");
+  }
+#endif
+
+  // Now that all the blocks are placed into the correct containing loop in the
+  // absence of child loops, find all the potentially cloned child loops and
+  // clone them into whatever outer loop we placed their header into.
+  for (Loop *ChildL : OrigL) {
+    auto *ClonedChildHeader =
+        cast_or_null<BasicBlock>(VMap.lookup(ChildL->getHeader()));
+    if (!ClonedChildHeader || BlocksInClonedLoop.count(ClonedChildHeader))
+      continue;
+
+#ifndef NDEBUG
+    for (auto *ChildLoopBB : ChildL->blocks())
+      assert(VMap.count(ChildLoopBB) &&
+             "Cloned a child loop header but not all of that loop's blocks!");
+#endif
+
+    NonChildClonedLoops.push_back(cloneLoopNest(
+        *ChildL, ExitLoopMap.lookup(ClonedChildHeader), VMap, LI));
+  }
+
+  // Return the main cloned loop if any.
+  return ClonedL;
+}
+
+static void deleteDeadBlocksFromLoop(Loop &L, BasicBlock *DeadSubtreeRoot,
+                                     SmallVectorImpl<BasicBlock *> &ExitBlocks,
+                                     DominatorTree &DT, LoopInfo &LI) {
+  // Walk the dominator tree to build up the set of blocks we will delete here.
+  // The order is designed to allow us to always delete bottom-up and avoid any
+  // dangling uses.
+  SmallSetVector<BasicBlock *, 16> DeadBlocks;
+  DeadBlocks.insert(DeadSubtreeRoot);
+  for (int i = 0; i < (int)DeadBlocks.size(); ++i)
+    for (DomTreeNode *ChildN : *DT[DeadBlocks[i]]) {
+      // FIXME: This assert should pass and that means we don't change nearly
+      // as much below! Consider rewriting all of this to avoid deleting
+      // blocks. They are always cloned before being deleted, and so instead
+      // could just be moved.
+      // FIXME: This in turn means that we might actually be more able to
+      // update the domtree.
+      assert((L.contains(ChildN->getBlock()) ||
+              llvm::find(ExitBlocks, ChildN->getBlock()) != ExitBlocks.end()) &&
+             "Should never reach beyond the loop and exits when deleting!");
+      DeadBlocks.insert(ChildN->getBlock());
+    }
+
+  // Filter out the dead blocks from the exit blocks list so that it can be
+  // used in the caller.
+  llvm::erase_if(ExitBlocks,
+                 [&](BasicBlock *BB) { return DeadBlocks.count(BB); });
+
+  // Remove these blocks from their successors.
+  for (auto *BB : DeadBlocks)
+    for (BasicBlock *SuccBB : successors(BB))
+      SuccBB->removePredecessor(BB, /*DontDeleteUselessPHIs*/ true);
+
+  // Walk from this loop up through its parents removing all of the dead blocks.
+  for (Loop *ParentL = &L; ParentL; ParentL = ParentL->getParentLoop()) {
+    for (auto *BB : DeadBlocks)
+      ParentL->getBlocksSet().erase(BB);
+    llvm::erase_if(ParentL->getBlocksVector(),
+                   [&](BasicBlock *BB) { return DeadBlocks.count(BB); });
+  }
+
+  // Now delete the dead child loops. This raw delete will clear them
+  // recursively.
+  llvm::erase_if(L.getSubLoopsVector(), [&](Loop *ChildL) {
+    if (!DeadBlocks.count(ChildL->getHeader()))
+      return false;
+
+    assert(llvm::all_of(ChildL->blocks(),
+                        [&](BasicBlock *ChildBB) {
+                          return DeadBlocks.count(ChildBB);
+                        }) &&
+           "If the child loop header is dead all blocks in the child loop must "
+           "be dead as well!");
+    LI.destroy(ChildL);
+    return true;
+  });
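The llvm::erase_if calls above are LLVM's wrapper around the erase/remove idiom; a minimal usage sketch (an editorial illustration, not from this commit):

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

int main() {
  llvm::SmallVector<int, 8> V{1, 2, 3, 4, 5};
  // Remove matching elements in place, preserving the relative order of
  // the elements that remain.
  llvm::erase_if(V, [](int X) { return X % 2 == 0; });
  // V now holds {1, 3, 5}.
}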
+
+  // Remove the mappings for the dead blocks.
+  for (auto *BB : DeadBlocks)
+    LI.changeLoopFor(BB, nullptr);
+
+  // Drop all the references from these blocks to others to handle cyclic
+  // references as we start deleting the blocks themselves.
+  for (auto *BB : DeadBlocks)
+    BB->dropAllReferences();
+
+  for (auto *BB : llvm::reverse(DeadBlocks)) {
+    DT.eraseNode(BB);
+    BB->eraseFromParent();
+  }
+}
+
+/// Recompute the set of blocks in a loop after unswitching.
+///
+/// This walks from the original header's predecessors to rebuild the loop. We
+/// take advantage of the fact that new blocks can't have been added, and so we
+/// filter by the original loop's blocks. This also handles potentially
+/// unreachable code that we don't want to explore but might be found examining
+/// the predecessors of the header.
+///
+/// If the original loop is no longer a loop, this will return an empty set. If
+/// it remains a loop, all the blocks within it will be added to the set
+/// (including those blocks in inner loops).
+static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L,
+                                                                 LoopInfo &LI) {
+  SmallPtrSet<const BasicBlock *, 16> LoopBlockSet;
+
+  auto *PH = L.getLoopPreheader();
+  auto *Header = L.getHeader();
+
+  // A worklist to use while walking backwards from the header.
+  SmallVector<BasicBlock *, 16> Worklist;
+
+  // First walk the predecessors of the header to find the backedges. This will
+  // form the basis of our walk.
+  for (auto *Pred : predecessors(Header)) {
+    // Skip the preheader.
+    if (Pred == PH)
+      continue;
+
+    // Because the loop was in simplified form, the only non-loop predecessor
+    // is the preheader.
+    assert(L.contains(Pred) && "Found a predecessor of the loop header other "
+                               "than the preheader that is not part of the "
+                               "loop!");
+
+    // Insert this block into the loop set and, on the first visit, if it
+    // isn't the header we're currently walking, put it into the worklist to
+    // recurse through.
+    if (LoopBlockSet.insert(Pred).second && Pred != Header)
+      Worklist.push_back(Pred);
+  }
+
+  // If no backedges were found, we're done.
+  if (LoopBlockSet.empty())
+    return LoopBlockSet;
+
+  // Add the loop header to the set.
+  LoopBlockSet.insert(Header);
+
+  // We found backedges; recurse through them to identify the loop blocks.
+  while (!Worklist.empty()) {
+    BasicBlock *BB = Worklist.pop_back_val();
+    assert(LoopBlockSet.count(BB) && "Didn't put block into the loop set!");
+
+    // Because we know the inner loop structure remains valid we can use the
+    // loop structure to jump immediately across the entire nested loop.
+    // Further, because it is in loop simplified form, we can directly jump
+    // to its preheader afterward.
+    if (Loop *InnerL = LI.getLoopFor(BB))
+      if (InnerL != &L) {
+        assert(L.contains(InnerL) &&
+               "Should not reach a loop *outside* this loop!");
+        // The preheader is the only possible predecessor of the loop so
+        // insert it into the set and check whether it was already handled.
+        auto *InnerPH = InnerL->getLoopPreheader();
+        assert(L.contains(InnerPH) && "Cannot contain an inner loop block "
+                                      "but not contain the inner loop "
+                                      "preheader!");
+        if (!LoopBlockSet.insert(InnerPH).second)
+          // The only way to reach the preheader is through the loop body
+          // itself so if it has been visited the loop is already handled.
+          continue;
+
+        // Insert all of the blocks (other than those already present) into
+        // the loop set.
The only block we expect to already be in the set is + // the one we used to find this loop as we immediately handle the + // others the first time we encounter the loop. + for (auto *InnerBB : InnerL->blocks()) { + if (InnerBB == BB) { + assert(LoopBlockSet.count(InnerBB) && + "Block should already be in the set!"); + continue; + } + + bool Inserted = LoopBlockSet.insert(InnerBB).second; + (void)Inserted; + assert(Inserted && "Should only insert an inner loop once!"); + } + + // Add the preheader to the worklist so we will continue past the + // loop body. + Worklist.push_back(InnerPH); + continue; + } + + // Insert any predecessors that were in the original loop into the new + // set, and if the insert is successful, add them to the worklist. + for (auto *Pred : predecessors(BB)) + if (L.contains(Pred) && LoopBlockSet.insert(Pred).second) + Worklist.push_back(Pred); + } + + // We've found all the blocks participating in the loop, return our completed + // set. + return LoopBlockSet; +} + +/// Rebuild a loop after unswitching removes some subset of blocks and edges. +/// +/// The removal may have removed some child loops entirely but cannot have +/// disturbed any remaining child loops. However, they may need to be hoisted +/// to the parent loop (or to be top-level loops). The original loop may be +/// completely removed. +/// +/// The sibling loops resulting from this update are returned. If the original +/// loop remains a valid loop, it will be the first entry in this list with all +/// of the newly sibling loops following it. +/// +/// Returns true if the loop remains a loop after unswitching, and false if it +/// is no longer a loop after unswitching (and should not continue to be +/// referenced). +static bool rebuildLoopAfterUnswitch(Loop &L, ArrayRef<BasicBlock *> ExitBlocks, + LoopInfo &LI, + SmallVectorImpl<Loop *> &HoistedLoops) { + auto *PH = L.getLoopPreheader(); + + // Compute the actual parent loop from the exit blocks. Because we may have + // pruned some exits the loop may be different from the original parent. + Loop *ParentL = nullptr; + SmallVector<Loop *, 4> ExitLoops; + SmallVector<BasicBlock *, 4> ExitsInLoops; + ExitsInLoops.reserve(ExitBlocks.size()); + for (auto *ExitBB : ExitBlocks) + if (Loop *ExitL = LI.getLoopFor(ExitBB)) { + ExitLoops.push_back(ExitL); + ExitsInLoops.push_back(ExitBB); + if (!ParentL || (ParentL != ExitL && ParentL->contains(ExitL))) + ParentL = ExitL; + } + + // Recompute the blocks participating in this loop. This may be empty if it + // is no longer a loop. + auto LoopBlockSet = recomputeLoopBlockSet(L, LI); + + // If we still have a loop, we need to re-set the loop's parent as the exit + // block set changing may have moved it within the loop nest. Note that this + // can only happen when this loop has a parent as it can only hoist the loop + // *up* the nest. + if (!LoopBlockSet.empty() && L.getParentLoop() != ParentL) { + // Remove this loop's (original) blocks from all of the intervening loops. + for (Loop *IL = L.getParentLoop(); IL != ParentL; + IL = IL->getParentLoop()) { + IL->getBlocksSet().erase(PH); + for (auto *BB : L.blocks()) + IL->getBlocksSet().erase(BB); + llvm::erase_if(IL->getBlocksVector(), [&](BasicBlock *BB) { + return BB == PH || L.contains(BB); + }); + } + + LI.changeLoopFor(PH, ParentL); + L.getParentLoop()->removeChildLoop(&L); + if (ParentL) + ParentL->addChildLoop(&L); + else + LI.addTopLevelLoop(&L); + } + + // Now we update all the blocks which are no longer within the loop. 
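The update below relies on std::stable_partition to split the block vector while keeping the surviving blocks in their original relative order; a self-contained sketch of the idiom (illustrative only, not from this commit):

#include <algorithm>
#include <vector>

int main() {
  std::vector<int> Blocks = {1, 2, 3, 4, 5, 6};
  // Elements satisfying the predicate move to the front, the rest to the
  // back, and both groups keep their original relative order.
  auto SplitI = std::stable_partition(Blocks.begin(), Blocks.end(),
                                      [](int B) { return B % 2 == 0; });
  // Blocks == {2, 4, 6, 1, 3, 5}; [SplitI, end()) is the partitioned-out
  // tail, which can then be erased.
  Blocks.erase(SplitI, Blocks.end());
}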
+  auto &Blocks = L.getBlocksVector();
+  auto BlocksSplitI =
+      LoopBlockSet.empty()
+          ? Blocks.begin()
+          : std::stable_partition(
+                Blocks.begin(), Blocks.end(),
+                [&](BasicBlock *BB) { return LoopBlockSet.count(BB); });
+
+  // Before we erase the list of unlooped blocks, build a set of them.
+  SmallPtrSet<BasicBlock *, 16> UnloopedBlocks(BlocksSplitI, Blocks.end());
+  if (LoopBlockSet.empty())
+    UnloopedBlocks.insert(PH);
+
+  // Now erase these blocks from the loop.
+  for (auto *BB : make_range(BlocksSplitI, Blocks.end()))
+    L.getBlocksSet().erase(BB);
+  Blocks.erase(BlocksSplitI, Blocks.end());
+
+  // Sort the exits in ascending loop depth; we'll work backwards across these
+  // to process them inside out.
+  std::stable_sort(ExitsInLoops.begin(), ExitsInLoops.end(),
+                   [&](BasicBlock *LHS, BasicBlock *RHS) {
+                     return LI.getLoopDepth(LHS) < LI.getLoopDepth(RHS);
+                   });
+
+  // We'll build up a set for each exit loop.
+  SmallPtrSet<BasicBlock *, 16> NewExitLoopBlocks;
+  Loop *PrevExitL = L.getParentLoop(); // The deepest possible exit loop.
+
+  auto RemoveUnloopedBlocksFromLoop =
+      [](Loop &L, SmallPtrSetImpl<BasicBlock *> &UnloopedBlocks) {
+        for (auto *BB : UnloopedBlocks)
+          L.getBlocksSet().erase(BB);
+        llvm::erase_if(L.getBlocksVector(), [&](BasicBlock *BB) {
+          return UnloopedBlocks.count(BB);
+        });
+      };
+
+  SmallVector<BasicBlock *, 16> Worklist;
+  while (!UnloopedBlocks.empty() && !ExitsInLoops.empty()) {
+    assert(Worklist.empty() && "Didn't clear worklist!");
+    assert(NewExitLoopBlocks.empty() && "Didn't clear loop set!");
+
+    // Grab the next exit block, in decreasing loop depth order.
+    BasicBlock *ExitBB = ExitsInLoops.pop_back_val();
+    Loop &ExitL = *LI.getLoopFor(ExitBB);
+    assert(ExitL.contains(&L) && "Exit loop must contain the inner loop!");
+
+    // Erase all of the unlooped blocks from the loops between the previous
+    // exit loop and this exit loop. This works because the ExitsInLoops list
+    // is sorted in increasing order of loop depth and thus we visit loops in
+    // decreasing order of loop depth.
+    for (; PrevExitL != &ExitL; PrevExitL = PrevExitL->getParentLoop())
+      RemoveUnloopedBlocksFromLoop(*PrevExitL, UnloopedBlocks);
+
+    // Walk the CFG back until we hit the preheader adding everything reachable
+    // and in the unlooped set to this exit block's loop.
+    Worklist.push_back(ExitBB);
+    do {
+      BasicBlock *BB = Worklist.pop_back_val();
+      // We can stop recursing at the preheader (if we get there).
+      if (BB == PH)
+        continue;
+
+      for (BasicBlock *PredBB : predecessors(BB)) {
+        // If this pred has already been moved to our set or is part of some
+        // (inner) loop, no update needed.
+        if (!UnloopedBlocks.erase(PredBB)) {
+          assert((NewExitLoopBlocks.count(PredBB) ||
+                  ExitL.contains(LI.getLoopFor(PredBB))) &&
+                 "Predecessor not in a nested loop (or already visited)!");
+          continue;
+        }
+
+        // We just insert into the loop set here. We'll add these blocks to the
+        // exit loop after we build up the set in a deterministic order rather
+        // than the predecessor-influenced visit order.
+        bool Inserted = NewExitLoopBlocks.insert(PredBB).second;
+        (void)Inserted;
+        assert(Inserted && "Should only visit an unlooped block once!");
+
+        // And recurse through to its predecessors.
+        Worklist.push_back(PredBB);
+      }
+    } while (!Worklist.empty());
+
+    // If blocks in this exit loop were directly part of the original loop (as
+    // opposed to a child loop) update the map to point to this exit loop. This
+    // just updates a map and so the fact that the order is unstable is fine.
+ for (auto *BB : NewExitLoopBlocks) + if (Loop *BBL = LI.getLoopFor(BB)) + if (BBL == &L || !L.contains(BBL)) + LI.changeLoopFor(BB, &ExitL); + + // We will remove the remaining unlooped blocks from this loop in the next + // iteration or below. + NewExitLoopBlocks.clear(); + } + + // Any remaining unlooped blocks are no longer part of any loop unless they + // are part of some child loop. + for (; PrevExitL; PrevExitL = PrevExitL->getParentLoop()) + RemoveUnloopedBlocksFromLoop(*PrevExitL, UnloopedBlocks); + for (auto *BB : UnloopedBlocks) + if (Loop *BBL = LI.getLoopFor(BB)) + if (BBL == &L || !L.contains(BBL)) + LI.changeLoopFor(BB, nullptr); + + // Sink all the child loops whose headers are no longer in the loop set to + // the parent (or to be top level loops). We reach into the loop and directly + // update its subloop vector to make this batch update efficient. + auto &SubLoops = L.getSubLoopsVector(); + auto SubLoopsSplitI = + LoopBlockSet.empty() + ? SubLoops.begin() + : std::stable_partition( + SubLoops.begin(), SubLoops.end(), [&](Loop *SubL) { + return LoopBlockSet.count(SubL->getHeader()); + }); + for (auto *HoistedL : make_range(SubLoopsSplitI, SubLoops.end())) { + HoistedLoops.push_back(HoistedL); + HoistedL->setParentLoop(nullptr); + + // To compute the new parent of this hoisted loop we look at where we + // placed the preheader above. We can't lookup the header itself because we + // retained the mapping from the header to the hoisted loop. But the + // preheader and header should have the exact same new parent computed + // based on the set of exit blocks from the original loop as the preheader + // is a predecessor of the header and so reached in the reverse walk. And + // because the loops were all in simplified form the preheader of the + // hoisted loop can't be part of some *other* loop. + if (auto *NewParentL = LI.getLoopFor(HoistedL->getLoopPreheader())) + NewParentL->addChildLoop(HoistedL); + else + LI.addTopLevelLoop(HoistedL); + } + SubLoops.erase(SubLoopsSplitI, SubLoops.end()); + + // Actually delete the loop if nothing remained within it. + if (Blocks.empty()) { + assert(SubLoops.empty() && + "Failed to remove all subloops from the original loop!"); + if (Loop *ParentL = L.getParentLoop()) + ParentL->removeChildLoop(llvm::find(*ParentL, &L)); + else + LI.removeLoop(llvm::find(LI, &L)); + LI.destroy(&L); + return false; + } + + return true; +} + +/// Helper to visit a dominator subtree, invoking a callable on each node. +/// +/// Returning false at any point will stop walking past that node of the tree. +template <typename CallableT> +void visitDomSubTree(DominatorTree &DT, BasicBlock *BB, CallableT Callable) { + SmallVector<DomTreeNode *, 4> DomWorklist; + DomWorklist.push_back(DT[BB]); +#ifndef NDEBUG + SmallPtrSet<DomTreeNode *, 4> Visited; + Visited.insert(DT[BB]); +#endif + do { + DomTreeNode *N = DomWorklist.pop_back_val(); + + // Visit this node. + if (!Callable(N->getBlock())) + continue; + + // Accumulate the child nodes. + for (DomTreeNode *ChildN : *N) { + assert(Visited.insert(ChildN).second && + "Cannot visit a node twice when walking a tree!"); + DomWorklist.push_back(ChildN); + } + } while (!DomWorklist.empty()); +} + +/// Take an invariant branch that has been determined to be safe and worthwhile +/// to unswitch despite being non-trivial to do so and perform the unswitch. +/// +/// This directly updates the CFG to hoist the predicate out of the loop, and +/// clone the necessary parts of the loop to maintain behavior. 
+/// +/// It also updates both dominator tree and loopinfo based on the unswitching. +/// +/// Once unswitching has been performed it runs the provided callback to report +/// the new loops and no-longer valid loops to the caller. +static bool unswitchInvariantBranch( + Loop &L, BranchInst &BI, DominatorTree &DT, LoopInfo &LI, + AssumptionCache &AC, + function_ref<void(bool, ArrayRef<Loop *>)> NonTrivialUnswitchCB) { + assert(BI.isConditional() && "Can only unswitch a conditional branch!"); + assert(L.isLoopInvariant(BI.getCondition()) && + "Can only unswitch an invariant branch condition!"); + + // Constant and BBs tracking the cloned and continuing successor. + const int ClonedSucc = 0; + auto *ParentBB = BI.getParent(); + auto *UnswitchedSuccBB = BI.getSuccessor(ClonedSucc); + auto *ContinueSuccBB = BI.getSuccessor(1 - ClonedSucc); + + assert(UnswitchedSuccBB != ContinueSuccBB && + "Should not unswitch a branch that always goes to the same place!"); + + // The branch should be in this exact loop. Any inner loop's invariant branch + // should be handled by unswitching that inner loop. The caller of this + // routine should filter out any candidates that remain (but were skipped for + // whatever reason). + assert(LI.getLoopFor(ParentBB) == &L && "Branch in an inner loop!"); + + SmallVector<BasicBlock *, 4> ExitBlocks; + L.getUniqueExitBlocks(ExitBlocks); + + // We cannot unswitch if exit blocks contain a cleanuppad instruction as we + // don't know how to split those exit blocks. + // FIXME: We should teach SplitBlock to handle this and remove this + // restriction. + for (auto *ExitBB : ExitBlocks) + if (isa<CleanupPadInst>(ExitBB->getFirstNonPHI())) + return false; + + SmallPtrSet<BasicBlock *, 4> ExitBlockSet(ExitBlocks.begin(), + ExitBlocks.end()); + + // Compute the parent loop now before we start hacking on things. + Loop *ParentL = L.getParentLoop(); + + // Compute the outer-most loop containing one of our exit blocks. This is the + // furthest up our loopnest which can be mutated, which we will use below to + // update things. + Loop *OuterExitL = &L; + for (auto *ExitBB : ExitBlocks) { + Loop *NewOuterExitL = LI.getLoopFor(ExitBB); + if (!NewOuterExitL) { + // We exited the entire nest with this block, so we're done. + OuterExitL = nullptr; + break; + } + if (NewOuterExitL != OuterExitL && NewOuterExitL->contains(OuterExitL)) + OuterExitL = NewOuterExitL; + } + + // If the edge we *aren't* cloning in the unswitch (the continuing edge) + // dominates its target, we can skip cloning the dominated region of the loop + // and its exits. We compute this as a set of nodes to be skipped. + SmallPtrSet<BasicBlock *, 4> SkippedLoopAndExitBlocks; + if (ContinueSuccBB->getUniquePredecessor() || + llvm::all_of(predecessors(ContinueSuccBB), [&](BasicBlock *PredBB) { + return PredBB == ParentBB || DT.dominates(ContinueSuccBB, PredBB); + })) { + visitDomSubTree(DT, ContinueSuccBB, [&](BasicBlock *BB) { + SkippedLoopAndExitBlocks.insert(BB); + return true; + }); + } + // Similarly, if the edge we *are* cloning in the unswitch (the unswitched + // edge) dominates its target, we will end up with dead nodes in the original + // loop and its exits that will need to be deleted. Here, we just retain that + // the property holds and will compute the deleted set later. 
+ bool DeleteUnswitchedSucc = + UnswitchedSuccBB->getUniquePredecessor() || + llvm::all_of(predecessors(UnswitchedSuccBB), [&](BasicBlock *PredBB) { + return PredBB == ParentBB || DT.dominates(UnswitchedSuccBB, PredBB); + }); + + // Split the preheader, so that we know that there is a safe place to insert + // the conditional branch. We will change the preheader to have a conditional + // branch on LoopCond. The original preheader will become the split point + // between the unswitched versions, and we will have a new preheader for the + // original loop. + BasicBlock *SplitBB = L.getLoopPreheader(); + BasicBlock *LoopPH = SplitEdge(SplitBB, L.getHeader(), &DT, &LI); + + // Keep a mapping for the cloned values. + ValueToValueMapTy VMap; + + // Build the cloned blocks from the loop. + auto *ClonedPH = buildClonedLoopBlocks( + L, LoopPH, SplitBB, ExitBlocks, ParentBB, UnswitchedSuccBB, + ContinueSuccBB, SkippedLoopAndExitBlocks, VMap, AC, DT, LI); + + // Build the cloned loop structure itself. This may be substantially + // different from the original structure due to the simplified CFG. This also + // handles inserting all the cloned blocks into the correct loops. + SmallVector<Loop *, 4> NonChildClonedLoops; + Loop *ClonedL = + buildClonedLoops(L, ExitBlocks, VMap, LI, NonChildClonedLoops); + + // Remove the parent as a predecessor of the unswitched successor. + UnswitchedSuccBB->removePredecessor(ParentBB, /*DontDeleteUselessPHIs*/ true); + + // Now splice the branch from the original loop and use it to select between + // the two loops. + SplitBB->getTerminator()->eraseFromParent(); + SplitBB->getInstList().splice(SplitBB->end(), ParentBB->getInstList(), BI); + BI.setSuccessor(ClonedSucc, ClonedPH); + BI.setSuccessor(1 - ClonedSucc, LoopPH); + + // Create a new unconditional branch to the continuing block (as opposed to + // the one cloned). + BranchInst::Create(ContinueSuccBB, ParentBB); + + // Delete anything that was made dead in the original loop due to + // unswitching. + if (DeleteUnswitchedSucc) + deleteDeadBlocksFromLoop(L, UnswitchedSuccBB, ExitBlocks, DT, LI); + + SmallVector<Loop *, 4> HoistedLoops; + bool IsStillLoop = rebuildLoopAfterUnswitch(L, ExitBlocks, LI, HoistedLoops); + + // This will have completely invalidated the dominator tree. We can't easily + // bound how much is invalid because in some cases we will refine the + // predecessor set of exit blocks of the loop which can move large unrelated + // regions of code into a new subtree. + // + // FIXME: Eventually, we should use an incremental update utility that + // leverages the existing information in the dominator tree (and potentially + // the nature of the change) to more efficiently update things. + DT.recalculate(*SplitBB->getParent()); + + // We can change which blocks are exit blocks of all the cloned sibling + // loops, the current loop, and any parent loops which shared exit blocks + // with the current loop. As a consequence, we need to re-form LCSSA for + // them. But we shouldn't need to re-form LCSSA for any child loops. + // FIXME: This could be made more efficient by tracking which exit blocks are + // new, and focusing on them, but that isn't likely to be necessary. + // + // In order to reasonably rebuild LCSSA we need to walk inside-out across the + // loop nest and update every loop that could have had its exits changed. We + // also need to cover any intervening loops. We add all of these loops to + // a list and sort them by loop depth to achieve this without updating + // unnecessary loops. 
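For context on what re-forming LCSSA entails: any value defined inside a loop and used outside it must reach that use through a PHI node in an exit block, so a use of a loop-defined %inc after the loop becomes a use of something like %inc.lcssa = phi i32 [ %inc, %exiting ] placed in the exit. When unswitching changes which blocks are exits, those PHIs must be re-inserted along the new exit edges, which is what the calls to formLCSSA below accomplish.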
+  auto UpdateLCSSA = [&](Loop &UpdateL) {
+#ifndef NDEBUG
+    for (Loop *ChildL : UpdateL)
+      assert(ChildL->isRecursivelyLCSSAForm(DT, LI) &&
+             "Perturbed a child loop's LCSSA form!");
+#endif
+    formLCSSA(UpdateL, DT, &LI, nullptr);
+  };
+
+  // For non-child cloned loops and hoisted loops, we just need to update LCSSA
+  // and we can do it in any order as they don't nest relative to each other.
+  for (Loop *UpdatedL : llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops))
+    UpdateLCSSA(*UpdatedL);
+
+  // If the original loop had exit blocks, walk up through the outermost loop
+  // of those exit blocks to update LCSSA and form updated dedicated exits.
+  if (OuterExitL != &L) {
+    SmallVector<Loop *, 4> OuterLoops;
+    // We start with the cloned loop and the current loop if they are loops and
+    // move toward OuterExitL. Also, if either the cloned loop or the current
+    // loop has become a top-level loop we need to walk all the way out.
+    if (ClonedL) {
+      OuterLoops.push_back(ClonedL);
+      if (!ClonedL->getParentLoop())
+        OuterExitL = nullptr;
+    }
+    if (IsStillLoop) {
+      OuterLoops.push_back(&L);
+      if (!L.getParentLoop())
+        OuterExitL = nullptr;
+    }
+    // Grab all of the enclosing loops now.
+    for (Loop *OuterL = ParentL; OuterL != OuterExitL;
+         OuterL = OuterL->getParentLoop())
+      OuterLoops.push_back(OuterL);
+
+    // Finally, update our list of outer loops. This is nicely ordered to work
+    // inside-out.
+    for (Loop *OuterL : OuterLoops) {
+      // First build LCSSA for this loop so that we can preserve it when
+      // forming dedicated exits. We don't want to perturb some other loop's
+      // LCSSA while doing that CFG edit.
+      UpdateLCSSA(*OuterL);
+
+      // For loops reached by this loop's original exit blocks we may have
+      // introduced new, non-dedicated exits. At least try to re-form dedicated
+      // exits for these loops. This may fail if they couldn't have dedicated
+      // exits to start with.
+      formDedicatedExitBlocks(OuterL, &DT, &LI, /*PreserveLCSSA*/ true);
+    }
+  }
+
+#ifndef NDEBUG
+  // Verify the entire loop structure to catch any incorrect updates before we
+  // progress in the pass pipeline.
+  LI.verify(DT);
+#endif
+
+  // Now that we've unswitched something, make callbacks to report the changes.
+  // For that we need to merge together the updated loops and the cloned loops
+  // and check whether the original loop survived.
+  SmallVector<Loop *, 4> SibLoops;
+  for (Loop *UpdatedL : llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops))
+    if (UpdatedL->getParentLoop() == ParentL)
+      SibLoops.push_back(UpdatedL);
+  NonTrivialUnswitchCB(IsStillLoop, SibLoops);
+
+  ++NumBranches;
+  return true;
+}
+
+/// Recursively compute the cost of a dominator subtree based on the per-block
+/// cost map provided.
+///
+/// The recursive computation is memoized into the provided DT-indexed cost map
+/// to allow querying it for most nodes in the domtree without it becoming
+/// quadratic.
+static int
+computeDomSubtreeCost(DomTreeNode &N,
+                      const SmallDenseMap<BasicBlock *, int, 4> &BBCostMap,
+                      SmallDenseMap<DomTreeNode *, int, 4> &DTCostMap) {
+  // Don't accumulate cost (or recurse through) blocks not in our block cost
+  // map and thus not part of the duplication cost being considered.
+  auto BBCostIt = BBCostMap.find(N.getBlock());
+  if (BBCostIt == BBCostMap.end())
+    return 0;
+
+  // Lookup this node to see if we already computed its cost.
+  auto DTCostIt = DTCostMap.find(&N);
+  if (DTCostIt != DTCostMap.end())
+    return DTCostIt->second;
+
+  // If not, we have to compute it.
+  // We can't use insert above and update because computing the cost may
+  // insert more things into the map.
+  int Cost = std::accumulate(
+      N.begin(), N.end(), BBCostIt->second, [&](int Sum, DomTreeNode *ChildN) {
+        return Sum + computeDomSubtreeCost(*ChildN, BBCostMap, DTCostMap);
+      });
+  bool Inserted = DTCostMap.insert({&N, Cost}).second;
+  (void)Inserted;
+  assert(Inserted && "Should not insert a node while visiting children!");
+  return Cost;
+}
+
 /// Unswitch control flow predicated on loop invariant conditions.
 ///
 /// This first hoists all branches or switches which are trivial (IE, do not
 /// require duplicating any part of the loop) out of the loop body. It then
 /// looks at other loop invariant control flows and tries to unswitch those as
 /// well by cloning the loop if the result is small enough.
-static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI,
-                         AssumptionCache &AC) {
-  assert(L.isLCSSAForm(DT) &&
+static bool
+unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
+             TargetTransformInfo &TTI, bool NonTrivial,
+             function_ref<void(bool, ArrayRef<Loop *>)> NonTrivialUnswitchCB) {
+  assert(L.isRecursivelyLCSSAForm(DT, LI) &&
          "Loops must be in LCSSA form before unswitching.");
 
   bool Changed = false;
@@ -727,7 +1926,136 @@ static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI,
   // Try trivial unswitch first before loop over other basic blocks in the loop.
   Changed |= unswitchAllTrivialConditions(L, DT, LI);
 
-  // FIXME: Add support for non-trivial unswitching by cloning the loop.
+  // If we're not doing non-trivial unswitching, we're done. We both accept
+  // a parameter but also check a local flag that can be used for testing
+  // and debugging.
+  if (!NonTrivial && !EnableNonTrivialUnswitch)
+    return Changed;
+
+  // Collect all remaining invariant branch conditions within this loop (as
+  // opposed to an inner loop which would be handled when visiting that inner
+  // loop).
+  SmallVector<TerminatorInst *, 4> UnswitchCandidates;
+  for (auto *BB : L.blocks())
+    if (LI.getLoopFor(BB) == &L)
+      if (auto *BI = dyn_cast<BranchInst>(BB->getTerminator()))
+        if (BI->isConditional() && L.isLoopInvariant(BI->getCondition()) &&
+            BI->getSuccessor(0) != BI->getSuccessor(1))
+          UnswitchCandidates.push_back(BI);
+
+  // If we didn't find any candidates, we're done.
+  if (UnswitchCandidates.empty())
+    return Changed;
+
+  DEBUG(dbgs() << "Considering " << UnswitchCandidates.size()
+               << " non-trivial loop invariant conditions for unswitching.\n");
+
+  // Unswitching these terminators will require duplicating parts of the loop,
+  // so we need to be able to model that cost. Compute the ephemeral values and
+  // set up a data structure to hold per-BB costs. We cache each block's cost
+  // so that we don't recompute this when considering different subsets of the
+  // loop for duplication during unswitching.
+  SmallPtrSet<const Value *, 4> EphValues;
+  CodeMetrics::collectEphemeralValues(&L, &AC, EphValues);
+  SmallDenseMap<BasicBlock *, int, 4> BBCostMap;
+
+  // Compute the cost of each block, as well as the total loop cost. Also, bail
+  // out if we see instructions which are incompatible with loop unswitching
+  // (convergent, noduplicate, or cross-basic-block tokens).
+  // FIXME: We might be able to safely handle some of these in non-duplicated
+  // regions.
+ int LoopCost = 0; + for (auto *BB : L.blocks()) { + int Cost = 0; + for (auto &I : *BB) { + if (EphValues.count(&I)) + continue; + + if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB)) + return Changed; + if (auto CS = CallSite(&I)) + if (CS.isConvergent() || CS.cannotDuplicate()) + return Changed; + + Cost += TTI.getUserCost(&I); + } + assert(Cost >= 0 && "Must not have negative costs!"); + LoopCost += Cost; + assert(LoopCost >= 0 && "Must not have negative loop costs!"); + BBCostMap[BB] = Cost; + } + DEBUG(dbgs() << " Total loop cost: " << LoopCost << "\n"); + + // Now we find the best candidate by searching for the one with the following + // properties in order: + // + // 1) An unswitching cost below the threshold + // 2) The smallest number of duplicated unswitch candidates (to avoid + // creating redundant subsequent unswitching) + // 3) The smallest cost after unswitching. + // + // We prioritize reducing fanout of unswitch candidates provided the cost + // remains below the threshold because this has a multiplicative effect. + // + // This requires memoizing each dominator subtree to avoid redundant work. + // + // FIXME: Need to actually do the number of candidates part above. + SmallDenseMap<DomTreeNode *, int, 4> DTCostMap; + // Given a terminator which might be unswitched, computes the non-duplicated + // cost for that terminator. + auto ComputeUnswitchedCost = [&](TerminatorInst *TI) { + BasicBlock &BB = *TI->getParent(); + SmallPtrSet<BasicBlock *, 4> Visited; + + int Cost = LoopCost; + for (BasicBlock *SuccBB : successors(&BB)) { + // Don't count successors more than once. + if (!Visited.insert(SuccBB).second) + continue; + + // This successor's domtree will not need to be duplicated after + // unswitching if the edge to the successor dominates it (and thus the + // entire tree). This essentially means there is no other path into this + // subtree and so it will end up live in only one clone of the loop. + if (SuccBB->getUniquePredecessor() || + llvm::all_of(predecessors(SuccBB), [&](BasicBlock *PredBB) { + return PredBB == &BB || DT.dominates(SuccBB, PredBB); + })) { + Cost -= computeDomSubtreeCost(*DT[SuccBB], BBCostMap, DTCostMap); + assert(Cost >= 0 && + "Non-duplicated cost should never exceed total loop cost!"); + } + } + + // Now scale the cost by the number of unique successors minus one. We + // subtract one because there is already at least one copy of the entire + // loop. This is computing the new cost of unswitching a condition. 
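As a worked example of this estimate: with a total LoopCost of 100 and a candidate branch with two distinct successors, where only one successor dominates its whole subtree (say, of cost 30), the non-duplicated cost is 100 - 30 = 70, and scaling by the number of unique successors minus one gives 70 * (2 - 1) = 70 as the cost of the one extra loop copy created by unswitching.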
+ assert(Visited.size() > 1 && + "Cannot unswitch a condition without multiple distinct successors!"); + return Cost * (Visited.size() - 1); + }; + TerminatorInst *BestUnswitchTI = nullptr; + int BestUnswitchCost; + for (TerminatorInst *CandidateTI : UnswitchCandidates) { + int CandidateCost = ComputeUnswitchedCost(CandidateTI); + DEBUG(dbgs() << " Computed cost of " << CandidateCost + << " for unswitch candidate: " << *CandidateTI << "\n"); + if (!BestUnswitchTI || CandidateCost < BestUnswitchCost) { + BestUnswitchTI = CandidateTI; + BestUnswitchCost = CandidateCost; + } + } + + if (BestUnswitchCost < UnswitchThreshold) { + DEBUG(dbgs() << " Trying to unswitch non-trivial (cost = " + << BestUnswitchCost << ") branch: " << *BestUnswitchTI + << "\n"); + Changed |= unswitchInvariantBranch(L, cast<BranchInst>(*BestUnswitchTI), DT, + LI, AC, NonTrivialUnswitchCB); + } else { + DEBUG(dbgs() << "Cannot unswitch, lowest cost found: " << BestUnswitchCost + << "\n"); + } return Changed; } @@ -740,7 +2068,25 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM, DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << L << "\n"); - if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC)) + // Save the current loop name in a variable so that we can report it even + // after it has been deleted. + std::string LoopName = L.getName(); + + auto NonTrivialUnswitchCB = [&L, &U, &LoopName](bool CurrentLoopValid, + ArrayRef<Loop *> NewLoops) { + // If we did a non-trivial unswitch, we have added new (cloned) loops. + U.addSiblingLoops(NewLoops); + + // If the current loop remains valid, we should revisit it to catch any + // other unswitch opportunities. Otherwise, we need to mark it as deleted. + if (CurrentLoopValid) + U.revisitCurrentLoop(); + else + U.markLoopAsDeleted(L, LoopName); + }; + + if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC, AR.TTI, NonTrivial, + NonTrivialUnswitchCB)) return PreservedAnalyses::all(); #ifndef NDEBUG @@ -754,10 +2100,13 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM, namespace { class SimpleLoopUnswitchLegacyPass : public LoopPass { + bool NonTrivial; + public: static char ID; // Pass ID, replacement for typeid - explicit SimpleLoopUnswitchLegacyPass() : LoopPass(ID) { + explicit SimpleLoopUnswitchLegacyPass(bool NonTrivial = false) + : LoopPass(ID), NonTrivial(NonTrivial) { initializeSimpleLoopUnswitchLegacyPassPass( *PassRegistry::getPassRegistry()); } @@ -766,6 +2115,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); getLoopAnalysisUsage(AU); } }; @@ -783,8 +2133,29 @@ bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) { auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + + auto NonTrivialUnswitchCB = [&L, &LPM](bool CurrentLoopValid, + ArrayRef<Loop *> NewLoops) { + // If we did a non-trivial unswitch, we have added new (cloned) loops. + for (auto *NewL : NewLoops) + LPM.addLoop(*NewL); + + // If the current loop remains valid, re-add it to the queue. This is + // a little wasteful as we'll finish processing the current loop as well, + // but it is the best we can do in the old PM. 
+ if (CurrentLoopValid) + LPM.addLoop(*L); + else + LPM.markLoopAsDeleted(*L); + }; + + bool Changed = + unswitchLoop(*L, DT, LI, AC, TTI, NonTrivial, NonTrivialUnswitchCB); - bool Changed = unswitchLoop(*L, DT, LI, AC); + // If anything was unswitched, also clear any cached information about this + // loop. + LPM.deleteSimpleAnalysisLoop(L); #ifndef NDEBUG // Historically this pass has had issues with the dominator tree so verify it @@ -798,11 +2169,13 @@ char SimpleLoopUnswitchLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(SimpleLoopUnswitchLegacyPass, "simple-loop-unswitch", "Simple unswitch loops", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(SimpleLoopUnswitchLegacyPass, "simple-loop-unswitch", "Simple unswitch loops", false, false) -Pass *llvm::createSimpleLoopUnswitchLegacyPass() { - return new SimpleLoopUnswitchLegacyPass(); +Pass *llvm::createSimpleLoopUnswitchLegacyPass(bool NonTrivial) { + return new SimpleLoopUnswitchLegacyPass(NonTrivial); } diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 8754c714c5b2..1522170dc3b9 100644 --- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -45,9 +45,26 @@ using namespace llvm; #define DEBUG_TYPE "simplifycfg" -static cl::opt<unsigned> -UserBonusInstThreshold("bonus-inst-threshold", cl::Hidden, cl::init(1), - cl::desc("Control the number of bonus instructions (default = 1)")); +static cl::opt<unsigned> UserBonusInstThreshold( + "bonus-inst-threshold", cl::Hidden, cl::init(1), + cl::desc("Control the number of bonus instructions (default = 1)")); + +static cl::opt<bool> UserKeepLoops( + "keep-loops", cl::Hidden, cl::init(true), + cl::desc("Preserve canonical loop structure (default = true)")); + +static cl::opt<bool> UserSwitchToLookup( + "switch-to-lookup", cl::Hidden, cl::init(false), + cl::desc("Convert switches to lookup tables (default = false)")); + +static cl::opt<bool> UserForwardSwitchCond( + "forward-switch-cond", cl::Hidden, cl::init(false), + cl::desc("Forward switch condition to phi ops (default = false)")); + +static cl::opt<bool> UserSinkCommonInsts( + "sink-common-insts", cl::Hidden, cl::init(false), + cl::desc("Sink common instructions (default = false)")); + STATISTIC(NumSimpl, "Number of blocks simplified"); @@ -129,9 +146,7 @@ static bool mergeEmptyReturnBlocks(Function &F) { /// Call SimplifyCFG on all the blocks in the function, /// iterating until no more changes are made. static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, - AssumptionCache *AC, - unsigned BonusInstThreshold, - bool LateSimplifyCFG) { + const SimplifyCFGOptions &Options) { bool Changed = false; bool LocalChange = true; @@ -146,7 +161,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, // Loop over all of the basic blocks and remove them if they are unneeded. 
for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) { - if (SimplifyCFG(&*BBIt++, TTI, BonusInstThreshold, AC, &LoopHeaders, LateSimplifyCFG)) { + if (simplifyCFG(&*BBIt++, TTI, Options, &LoopHeaders)) { LocalChange = true; ++NumSimpl; } @@ -157,12 +172,10 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, } static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI, - AssumptionCache *AC, int BonusInstThreshold, - bool LateSimplifyCFG) { + const SimplifyCFGOptions &Options) { bool EverChanged = removeUnreachableBlocks(F); EverChanged |= mergeEmptyReturnBlocks(F); - EverChanged |= iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold, - LateSimplifyCFG); + EverChanged |= iterativelySimplifyCFG(F, TTI, Options); // If neither pass changed anything, we're done. if (!EverChanged) return false; @@ -176,28 +189,37 @@ static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI, return true; do { - EverChanged = iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold, - LateSimplifyCFG); + EverChanged = iterativelySimplifyCFG(F, TTI, Options); EverChanged |= removeUnreachableBlocks(F); } while (EverChanged); return true; } -SimplifyCFGPass::SimplifyCFGPass() - : BonusInstThreshold(UserBonusInstThreshold), - LateSimplifyCFG(true) {} - -SimplifyCFGPass::SimplifyCFGPass(int BonusInstThreshold, bool LateSimplifyCFG) - : BonusInstThreshold(BonusInstThreshold), - LateSimplifyCFG(LateSimplifyCFG) {} +// Command-line settings override compile-time settings. +SimplifyCFGPass::SimplifyCFGPass(const SimplifyCFGOptions &Opts) { + Options.BonusInstThreshold = UserBonusInstThreshold.getNumOccurrences() + ? UserBonusInstThreshold + : Opts.BonusInstThreshold; + Options.ForwardSwitchCondToPhi = UserForwardSwitchCond.getNumOccurrences() + ? UserForwardSwitchCond + : Opts.ForwardSwitchCondToPhi; + Options.ConvertSwitchToLookupTable = UserSwitchToLookup.getNumOccurrences() + ? UserSwitchToLookup + : Opts.ConvertSwitchToLookupTable; + Options.NeedCanonicalLoop = UserKeepLoops.getNumOccurrences() + ? UserKeepLoops + : Opts.NeedCanonicalLoop; + Options.SinkCommonInsts = UserSinkCommonInsts.getNumOccurrences() + ? UserSinkCommonInsts + : Opts.SinkCommonInsts; +} PreservedAnalyses SimplifyCFGPass::run(Function &F, FunctionAnalysisManager &AM) { auto &TTI = AM.getResult<TargetIRAnalysis>(F); - auto &AC = AM.getResult<AssumptionAnalysis>(F); - - if (!simplifyFunctionCFG(F, TTI, &AC, BonusInstThreshold, LateSimplifyCFG)) + Options.AC = &AM.getResult<AssumptionAnalysis>(F); + if (!simplifyFunctionCFG(F, TTI, Options)) return PreservedAnalyses::all(); PreservedAnalyses PA; PA.preserve<GlobalsAA>(); @@ -205,55 +227,54 @@ PreservedAnalyses SimplifyCFGPass::run(Function &F, } namespace { -struct BaseCFGSimplifyPass : public FunctionPass { - unsigned BonusInstThreshold; +struct CFGSimplifyPass : public FunctionPass { + static char ID; + SimplifyCFGOptions Options; std::function<bool(const Function &)> PredicateFtor; - bool LateSimplifyCFG; - BaseCFGSimplifyPass(int T, bool LateSimplifyCFG, - std::function<bool(const Function &)> Ftor, - char &ID) - : FunctionPass(ID), PredicateFtor(std::move(Ftor)), - LateSimplifyCFG(LateSimplifyCFG) { - BonusInstThreshold = (T == -1) ? 
UserBonusInstThreshold : unsigned(T); + CFGSimplifyPass(unsigned Threshold = 1, bool ForwardSwitchCond = false, + bool ConvertSwitch = false, bool KeepLoops = true, + bool SinkCommon = false, + std::function<bool(const Function &)> Ftor = nullptr) + : FunctionPass(ID), PredicateFtor(std::move(Ftor)) { + + initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry()); + + // Check for command-line overrides of options for debug/customization. + Options.BonusInstThreshold = UserBonusInstThreshold.getNumOccurrences() + ? UserBonusInstThreshold + : Threshold; + + Options.ForwardSwitchCondToPhi = UserForwardSwitchCond.getNumOccurrences() + ? UserForwardSwitchCond + : ForwardSwitchCond; + + Options.ConvertSwitchToLookupTable = UserSwitchToLookup.getNumOccurrences() + ? UserSwitchToLookup + : ConvertSwitch; + + Options.NeedCanonicalLoop = + UserKeepLoops.getNumOccurrences() ? UserKeepLoops : KeepLoops; + + Options.SinkCommonInsts = UserSinkCommonInsts.getNumOccurrences() + ? UserSinkCommonInsts + : SinkCommon; } + bool runOnFunction(Function &F) override { if (skipFunction(F) || (PredicateFtor && !PredicateFtor(F))) return false; - AssumptionCache *AC = - &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - const TargetTransformInfo &TTI = - getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); - return simplifyFunctionCFG(F, TTI, AC, BonusInstThreshold, LateSimplifyCFG); + Options.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + return simplifyFunctionCFG(F, TTI, Options); } - void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<TargetTransformInfoWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); } }; - -struct CFGSimplifyPass : public BaseCFGSimplifyPass { - static char ID; // Pass identification, replacement for typeid - - CFGSimplifyPass(int T = -1, - std::function<bool(const Function &)> Ftor = nullptr) - : BaseCFGSimplifyPass(T, false, Ftor, ID) { - initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry()); - } -}; - -struct LateCFGSimplifyPass : public BaseCFGSimplifyPass { - static char ID; // Pass identification, replacement for typeid - - LateCFGSimplifyPass(int T = -1, - std::function<bool(const Function &)> Ftor = nullptr) - : BaseCFGSimplifyPass(T, true, Ftor, ID) { - initializeLateCFGSimplifyPassPass(*PassRegistry::getPassRegistry()); - } -}; } char CFGSimplifyPass::ID = 0; @@ -264,24 +285,12 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false, false) -char LateCFGSimplifyPass::ID = 0; -INITIALIZE_PASS_BEGIN(LateCFGSimplifyPass, "latesimplifycfg", - "Simplify the CFG more aggressively", false, false) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_END(LateCFGSimplifyPass, "latesimplifycfg", - "Simplify the CFG more aggressively", false, false) - // Public interface to the CFGSimplification pass FunctionPass * -llvm::createCFGSimplificationPass(int Threshold, - std::function<bool(const Function &)> Ftor) { - return new CFGSimplifyPass(Threshold, std::move(Ftor)); -} - -// Public interface to the LateCFGSimplification pass -FunctionPass * -llvm::createLateCFGSimplificationPass(int Threshold, +llvm::createCFGSimplificationPass(unsigned Threshold, bool ForwardSwitchCond, + bool ConvertSwitch, bool KeepLoops, + bool SinkCommon, 
std::function<bool(const Function &)> Ftor) {
-  return new LateCFGSimplifyPass(Threshold, std::move(Ftor));
+  return new CFGSimplifyPass(Threshold, ForwardSwitchCond, ConvertSwitch,
+                             KeepLoops, SinkCommon, std::move(Ftor));
 }
diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp
index 5210f165b874..cfb8a062299f 100644
--- a/lib/Transforms/Scalar/Sink.cpp
+++ b/lib/Transforms/Scalar/Sink.cpp
@@ -68,7 +68,7 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis &AA,
   if (LoadInst *L = dyn_cast<LoadInst>(Inst)) {
     MemoryLocation Loc = MemoryLocation::get(L);
     for (Instruction *S : Stores)
-      if (AA.getModRefInfo(S, Loc) & MRI_Mod)
+      if (isModSet(AA.getModRefInfo(S, Loc)))
         return false;
   }
@@ -83,7 +83,7 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis &AA,
       return false;
     for (Instruction *S : Stores)
-      if (AA.getModRefInfo(S, CS) & MRI_Mod)
+      if (isModSet(AA.getModRefInfo(S, CS)))
         return false;
   }
diff --git a/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp b/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
new file mode 100644
index 000000000000..23156d5a4d83
--- /dev/null
+++ b/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
@@ -0,0 +1,811 @@
+//===- SpeculateAroundPHIs.cpp --------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/SpeculateAroundPHIs.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "spec-phis"
+
+STATISTIC(NumPHIsSpeculated, "Number of PHI nodes we speculated around");
+STATISTIC(NumEdgesSplit,
+          "Number of critical edges which were split for speculation");
+STATISTIC(NumSpeculatedInstructions,
+          "Number of instructions we speculated around the PHI nodes");
+STATISTIC(NumNewRedundantInstructions,
+          "Number of new, redundant instructions inserted");
+
+/// Check whether speculating the users of a PHI node around the PHI
+/// will be safe.
+///
+/// This checks both that all of the users are safe and also that all of their
+/// operands are either recursively safe or already available along an incoming
+/// edge to the PHI.
+///
+/// This routine caches both all the safe nodes explored in `PotentialSpecSet`
+/// and the chain of nodes that definitively reach any unsafe node in
+/// `UnsafeSet`. By preserving these between repeated calls to this routine for
+/// PHIs in the same basic block, the exploration here can be reused. However,
+/// these caches must not be reused for PHIs in a different basic block as they
+/// reflect what is available along incoming edges.
+static bool
+isSafeToSpeculatePHIUsers(PHINode &PN, DominatorTree &DT,
+                          SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
+                          SmallPtrSetImpl<Instruction *> &UnsafeSet) {
+  auto *PhiBB = PN.getParent();
+  SmallPtrSet<Instruction *, 4> Visited;
+  SmallVector<std::pair<Instruction *, User::value_op_iterator>, 16> DFSStack;
+
+  // Walk each user of the PHI node.
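The walk below is an explicit-stack depth-first search: each stack entry pairs an instruction with the operand iterator at which to resume once that operand's subtree is finished. A minimal sketch of just that traversal pattern, with a hypothetical isUnsafe() predicate standing in for the real memory-dependence and availability checks:

    #include "llvm/ADT/SmallPtrSet.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Instruction.h"
    #include <tuple>
    #include <utility>
    using namespace llvm;

    bool isUnsafe(Instruction *I); // hypothetical stand-in predicate

    bool allTransitiveOperandsSafe(Instruction *Root) {
      SmallPtrSet<Instruction *, 16> Visited;
      SmallVector<std::pair<Instruction *, User::value_op_iterator>, 16> Stack;
      Visited.insert(Root);
      Stack.push_back({Root, Root->value_op_begin()});
      do {
        Instruction *I;
        User::value_op_iterator OpIt;
        std::tie(I, OpIt) = Stack.pop_back_val();
        while (OpIt != I->value_op_end()) {
          auto *OpI = dyn_cast<Instruction>(*OpIt++);
          // Skip non-instruction operands and nodes already seen.
          if (!OpI || !Visited.insert(OpI).second)
            continue;
          // Check in preorder, before descending into OpI's operands.
          if (isUnsafe(OpI))
            return false;
          Stack.push_back({I, OpIt}); // remember where to resume in I
          I = OpI;
          OpIt = OpI->value_op_begin();
        }
      } while (!Stack.empty());
      return true;
    }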
+ for (Use &U : PN.uses()) { + auto *UI = cast<Instruction>(U.getUser()); + + // Ensure the use post-dominates the PHI node. This ensures that, in the + // absence of unwinding, the use will actually be reached. + // FIXME: We use a blunt hammer of requiring them to be in the same basic + // block. We should consider using actual post-dominance here in the + // future. + if (UI->getParent() != PhiBB) { + DEBUG(dbgs() << " Unsafe: use in a different BB: " << *UI << "\n"); + return false; + } + + // FIXME: This check is much too conservative. We're not going to move these + // instructions onto new dynamic paths through the program unless there is + // a call instruction between the use and the PHI node. And memory isn't + // changing unless there is a store in that same sequence. We should + // probably change this to do at least a limited scan of the intervening + // instructions and allow handling stores in easily proven safe cases. + if (mayBeMemoryDependent(*UI)) { + DEBUG(dbgs() << " Unsafe: can't speculate use: " << *UI << "\n"); + return false; + } + + // Now do a depth-first search of everything these users depend on to make + // sure they are transitively safe. This is a depth-first search, but we + // check nodes in preorder to minimize the amount of checking. + Visited.insert(UI); + DFSStack.push_back({UI, UI->value_op_begin()}); + do { + User::value_op_iterator OpIt; + std::tie(UI, OpIt) = DFSStack.pop_back_val(); + + while (OpIt != UI->value_op_end()) { + auto *OpI = dyn_cast<Instruction>(*OpIt); + // Increment to the next operand for whenever we continue. + ++OpIt; + // No need to visit non-instructions, which can't form dependencies. + if (!OpI) + continue; + + // Now do the main pre-order checks that this operand is a viable + // dependency of something we want to speculate. + + // First do a few checks for instructions that won't require + // speculation at all because they are trivially available on the + // incoming edge (either through dominance or through an incoming value + // to a PHI). + // + // The cases in the current block will be trivially dominated by the + // edge. + auto *ParentBB = OpI->getParent(); + if (ParentBB == PhiBB) { + if (isa<PHINode>(OpI)) { + // We can trivially map through phi nodes in the same block. + continue; + } + } else if (DT.dominates(ParentBB, PhiBB)) { + // Instructions from dominating blocks are already available. + continue; + } + + // Once we know that we're considering speculating the operand, check + // if we've already explored this subgraph and found it to be safe. + if (PotentialSpecSet.count(OpI)) + continue; + + // If we've already explored this subgraph and found it unsafe, bail. + // If when we directly test whether this is safe it fails, bail. + if (UnsafeSet.count(OpI) || ParentBB != PhiBB || + mayBeMemoryDependent(*OpI)) { + DEBUG(dbgs() << " Unsafe: can't speculate transitive use: " << *OpI + << "\n"); + // Record the stack of instructions which reach this node as unsafe + // so we prune subsequent searches. + UnsafeSet.insert(OpI); + for (auto &StackPair : DFSStack) { + Instruction *I = StackPair.first; + UnsafeSet.insert(I); + } + return false; + } + + // Skip any operands we're already recursively checking. + if (!Visited.insert(OpI).second) + continue; + + // Push onto the stack and descend. We can directly continue this + // loop when ascending. + DFSStack.push_back({UI, OpIt}); + UI = OpI; + OpIt = OpI->value_op_begin(); + } + + // This node and all its operands are safe. 
Go ahead and cache that for
+      // reuse later.
+      PotentialSpecSet.insert(UI);
+
+      // Continue with the next node on the stack.
+    } while (!DFSStack.empty());
+  }
+
+#ifndef NDEBUG
+  // Every visited operand should have been marked as safe for speculation at
+  // this point. Verify this and return success.
+  for (auto *I : Visited)
+    assert(PotentialSpecSet.count(I) &&
+           "Failed to mark a visited instruction as safe!");
+#endif
+  return true;
+}
+
+/// Check whether, in isolation, a given PHI node is both safe and profitable
+/// to speculate users around.
+///
+/// This handles checking whether there are any constant operands to a PHI
+/// which could represent a useful speculation candidate, whether the users of
+/// the PHI are safe to speculate including all their transitive dependencies,
+/// and whether after speculation there will be some cost savings (profit) to
+/// folding the operands into the users of the PHI node. Returns true if both
+/// safe and profitable with relevant cost savings updated in the map and with
+/// an update to the `PotentialSpecSet`. Returns false if either safety or
+/// profitability are absent. Some new entries may be made to the
+/// `PotentialSpecSet` even when this routine returns false, but they remain
+/// conservatively correct.
+///
+/// The profitability check here is a local one, but it checks this in an
+/// interesting way. Beyond checking that the total cost of materializing the
+/// constants will be less than the cost of folding them into their users, it
+/// also checks that no one incoming constant will have a higher cost when
+/// folded into its users rather than materialized. This higher cost could
+/// result in a dynamic *path* that is more expensive even when the total cost
+/// is lower. Currently, all of the interesting cases where this optimization
+/// should fire are ones where it is a no-loss operation in this sense. If we
+/// ever want to be more aggressive here, we would need to balance the
+/// different incoming edges' cost by looking at their respective
+/// probabilities.
+static bool isSafeAndProfitableToSpeculateAroundPHI(
+    PHINode &PN, SmallDenseMap<PHINode *, int, 16> &CostSavingsMap,
+    SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
+    SmallPtrSetImpl<Instruction *> &UnsafeSet, DominatorTree &DT,
+    TargetTransformInfo &TTI) {
+  // First see whether there is any cost savings to speculating around this
+  // PHI, and build up a map of the constant inputs to how many times they
+  // occur.
+  bool NonFreeMat = false;
+  struct CostsAndCount {
+    int MatCost = TargetTransformInfo::TCC_Free;
+    int FoldedCost = TargetTransformInfo::TCC_Free;
+    int Count = 0;
+  };
+  SmallDenseMap<ConstantInt *, CostsAndCount, 16> CostsAndCounts;
+  SmallPtrSet<BasicBlock *, 16> IncomingConstantBlocks;
+  for (int i : llvm::seq<int>(0, PN.getNumIncomingValues())) {
+    auto *IncomingC = dyn_cast<ConstantInt>(PN.getIncomingValue(i));
+    if (!IncomingC)
+      continue;
+
+    // Only visit each incoming edge with a constant input once.
+    if (!IncomingConstantBlocks.insert(PN.getIncomingBlock(i)).second)
+      continue;
+
+    auto InsertResult = CostsAndCounts.insert({IncomingC, {}});
+    // Count how many edges share a given incoming constant.
+    ++InsertResult.first->second.Count;
+    // Only compute the cost the first time we see a particular constant.
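That guard relies on the count-always, compute-once idiom: insert() returns an iterator/bool pair whose bool is true only when the key was newly added. Sketched in isolation, with hypothetical key and payload types:

    #include "llvm/ADT/DenseMap.h"

    struct Info {
      int Count = 0;
      int Cost = 0;
    };

    int computeCostOnce(int Key); // hypothetical, assumed expensive

    void record(llvm::DenseMap<int, Info> &Counts, int Key) {
      auto InsertResult = Counts.insert({Key, Info{}}); // {iterator, inserted}
      ++InsertResult.first->second.Count; // counted on every call
      if (InsertResult.second)            // first time we see this key
        InsertResult.first->second.Cost = computeCostOnce(Key);
    }

The loop here takes the inverted early-continue form of the same test.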
+    if (!InsertResult.second)
+      continue;
+
+    int &MatCost = InsertResult.first->second.MatCost;
+    MatCost = TTI.getIntImmCost(IncomingC->getValue(), IncomingC->getType());
+    NonFreeMat |= MatCost != TTI.TCC_Free;
+  }
+  if (!NonFreeMat) {
+    DEBUG(dbgs() << "    Free: " << PN << "\n");
+    // No profit in free materialization.
+    return false;
+  }
+
+  // Now check that the uses of this PHI can actually be speculated,
+  // otherwise we'll still have to materialize the PHI value.
+  if (!isSafeToSpeculatePHIUsers(PN, DT, PotentialSpecSet, UnsafeSet)) {
+    DEBUG(dbgs() << "    Unsafe PHI: " << PN << "\n");
+    return false;
+  }
+
+  // Compute how much (if any) savings are available by speculating around this
+  // PHI.
+  for (Use &U : PN.uses()) {
+    auto *UserI = cast<Instruction>(U.getUser());
+    // Now check whether there is any savings to folding the incoming constants
+    // into this use.
+    unsigned Idx = U.getOperandNo();
+
+    // If we have a binary operator that is commutative, an actual constant
+    // operand would end up on the RHS, so pretend the use of the PHI is on the
+    // RHS.
+    //
+    // Technically, this is a bit weird if *both* operands are PHIs we're
+    // speculating. But if that is the case, giving an "optimistic" cost isn't
+    // a bad thing because after speculation it will constant fold. And
+    // moreover, such cases should likely have been constant folded already by
+    // some other pass, so we shouldn't worry about "modeling" them terribly
+    // accurately here. Similarly, if the other operand is a constant, it still
+    // seems fine to be "optimistic" in our cost modeling, because when the
+    // incoming operand from the PHI node is also a constant, we will end up
+    // constant folding.
+    if (UserI->isBinaryOp() && UserI->isCommutative() && Idx != 1)
+      // Assume we will commute the constant to the RHS to be canonical.
+      Idx = 1;
+
+    // Get the intrinsic ID if this user is an intrinsic.
+    Intrinsic::ID IID = Intrinsic::not_intrinsic;
+    if (auto *UserII = dyn_cast<IntrinsicInst>(UserI))
+      IID = UserII->getIntrinsicID();
+
+    for (auto &IncomingConstantAndCostsAndCount : CostsAndCounts) {
+      ConstantInt *IncomingC = IncomingConstantAndCostsAndCount.first;
+      int MatCost = IncomingConstantAndCostsAndCount.second.MatCost;
+      int &FoldedCost = IncomingConstantAndCostsAndCount.second.FoldedCost;
+      if (IID)
+        FoldedCost += TTI.getIntImmCost(IID, Idx, IncomingC->getValue(),
+                                        IncomingC->getType());
+      else
+        FoldedCost +=
+            TTI.getIntImmCost(UserI->getOpcode(), Idx, IncomingC->getValue(),
+                              IncomingC->getType());
+
+      // If we accumulate more folded cost for this incoming constant than
+      // materialized cost, then we'll regress any edge with this constant so
+      // just bail. We're only interested in cases where folding the incoming
+      // constants is at least break-even on all paths.
+      if (FoldedCost > MatCost) {
+        DEBUG(dbgs() << "  Not profitable to fold imm: " << *IncomingC << "\n"
+                        "    Materializing cost:    " << MatCost << "\n"
+                        "    Accumulated folded cost: " << FoldedCost << "\n");
+        return false;
+      }
+    }
+  }
+
+  // Compute the total cost savings afforded by this PHI node.
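As a purely illustrative instance of the totals computed just below (the real values are target-dependent): if a PHI has two incoming edges carrying one constant, materializing that constant costs TCC_Basic (1) per edge, and folding it into the PHI's single user is TCC_Free (0), then TotalMatCost = 2 * 1 = 2, TotalFoldedCost = 2 * 0 = 0, and a savings of 2 is recorded for the PHI.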
+  int TotalMatCost = TTI.TCC_Free, TotalFoldedCost = TTI.TCC_Free;
+  for (auto IncomingConstantAndCostsAndCount : CostsAndCounts) {
+    int MatCost = IncomingConstantAndCostsAndCount.second.MatCost;
+    int FoldedCost = IncomingConstantAndCostsAndCount.second.FoldedCost;
+    int Count = IncomingConstantAndCostsAndCount.second.Count;
+
+    TotalMatCost += MatCost * Count;
+    TotalFoldedCost += FoldedCost * Count;
+  }
+  assert(TotalFoldedCost <= TotalMatCost && "If each constant's folded cost is "
+                                            "less than its materialized cost, "
+                                            "the sum must be as well.");
+
+  DEBUG(dbgs() << "  Cost savings " << (TotalMatCost - TotalFoldedCost)
+               << ": " << PN << "\n");
+  CostSavingsMap[&PN] = TotalMatCost - TotalFoldedCost;
+  return true;
+}
+
+/// Simple helper to walk all the users of a list of phis depth first, and call
+/// a visit function on each one in post-order.
+///
+/// All of the PHIs should be in the same basic block, and this is primarily
+/// used to make a single depth-first walk across their collective users
+/// without revisiting any subgraphs. Callers should provide a fast, idempotent
+/// callable to test whether a node has been visited and the more important
+/// callable to actually visit a particular node.
+///
+/// Depth-first and postorder here refer to the *operand* graph -- we start
+/// from a collection of users of PHI nodes and walk "up" the operands
+/// depth-first.
+template <typename IsVisitedT, typename VisitT>
+static void visitPHIUsersAndDepsInPostOrder(ArrayRef<PHINode *> PNs,
+                                            IsVisitedT IsVisited,
+                                            VisitT Visit) {
+  SmallVector<std::pair<Instruction *, User::value_op_iterator>, 16> DFSStack;
+  for (auto *PN : PNs)
+    for (Use &U : PN->uses()) {
+      auto *UI = cast<Instruction>(U.getUser());
+      if (IsVisited(UI))
+        // Already visited this user, continue across the roots.
+        continue;
+
+      // Otherwise, walk the operand graph depth-first and visit each
+      // dependency in postorder.
+      DFSStack.push_back({UI, UI->value_op_begin()});
+      do {
+        User::value_op_iterator OpIt;
+        std::tie(UI, OpIt) = DFSStack.pop_back_val();
+        while (OpIt != UI->value_op_end()) {
+          auto *OpI = dyn_cast<Instruction>(*OpIt);
+          // Increment to the next operand for whenever we continue.
+          ++OpIt;
+          // No need to visit non-instructions, which can't form dependencies,
+          // or instructions outside of our potential dependency set that we
+          // were given. Finally, if we've already visited the node, continue
+          // to the next.
+          if (!OpI || IsVisited(OpI))
+            continue;
+
+          // Push onto the stack and descend. We can directly continue this
+          // loop when ascending.
+          DFSStack.push_back({UI, OpIt});
+          UI = OpI;
+          OpIt = OpI->value_op_begin();
+        }
+
+        // Finished visiting children, visit this node.
+        assert(!IsVisited(UI) && "Should not have already visited a node!");
+        Visit(UI);
+      } while (!DFSStack.empty());
+    }
+}
+
+/// Find profitable PHIs to speculate.
+///
+/// For a PHI node to be profitable, we need the cost of speculating its users
+/// (and their dependencies) to not exceed the savings of folding the PHI's
+/// constant operands into the speculated users.
+///
+/// Computing this is surprisingly challenging. Because users of two different
+/// PHI nodes can depend on each other or on common other instructions, it may
+/// be profitable to speculate two PHI nodes together even though neither one
+/// in isolation is profitable. The straightforward way to find all the
+/// profitable PHIs would be to check each combination of PHIs' cost, but this
+/// is exponential in complexity.
+/// +/// Even if we assume that we only care about cases where we can consider each +/// PHI node in isolation (rather than considering cases where none are +/// profitable in isolation but some subset are profitable as a set), we still +/// have a challenge. The obvious way to find all individually profitable PHIs +/// is to iterate until reaching a fixed point, but this will be quadratic in +/// complexity. =/ +/// +/// This code currently uses a linear-to-compute order for a greedy approach. +/// It won't find cases where a set of PHIs must be considered together, but it +/// handles most cases of order dependence without quadratic iteration. The +/// specific order used is the post-order across the operand DAG. When the last +/// user of a PHI is visited in this postorder walk, we check it for +/// profitability. +/// +/// There is an orthogonal extra complexity to all of this: computing the cost +/// itself can easily become a linear computation making everything again (at +/// best) quadratic. Using a postorder over the operand graph makes it +/// particularly easy to avoid this through dynamic programming. As we do the +/// postorder walk, we build the transitive cost of that subgraph. It is also +/// straightforward to then update these costs when we mark a PHI for +/// speculation so that subsequent PHIs don't re-pay the cost of already +/// speculated instructions. +static SmallVector<PHINode *, 16> +findProfitablePHIs(ArrayRef<PHINode *> PNs, + const SmallDenseMap<PHINode *, int, 16> &CostSavingsMap, + const SmallPtrSetImpl<Instruction *> &PotentialSpecSet, + int NumPreds, DominatorTree &DT, TargetTransformInfo &TTI) { + SmallVector<PHINode *, 16> SpecPNs; + + // First, establish a reverse mapping from immediate users of the PHI nodes + // to the nodes themselves, and count how many users each PHI node has in + // a way we can update while processing them. + SmallDenseMap<Instruction *, TinyPtrVector<PHINode *>, 16> UserToPNMap; + SmallDenseMap<PHINode *, int, 16> PNUserCountMap; + SmallPtrSet<Instruction *, 16> UserSet; + for (auto *PN : PNs) { + assert(UserSet.empty() && "Must start with an empty user set!"); + for (Use &U : PN->uses()) + UserSet.insert(cast<Instruction>(U.getUser())); + PNUserCountMap[PN] = UserSet.size(); + for (auto *UI : UserSet) + UserToPNMap.insert({UI, {}}).first->second.push_back(PN); + UserSet.clear(); + } + + // Now do a DFS across the operand graph of the users, computing cost as we + // go and when all costs for a given PHI are known, checking that PHI for + // profitability. + SmallDenseMap<Instruction *, int, 16> SpecCostMap; + visitPHIUsersAndDepsInPostOrder( + PNs, + /*IsVisited*/ + [&](Instruction *I) { + // We consider anything that isn't potentially speculated to be + // "visited" as it is already handled. Similarly, anything that *is* + // potentially speculated but for which we have an entry in our cost + // map, we're done. + return !PotentialSpecSet.count(I) || SpecCostMap.count(I); + }, + /*Visit*/ + [&](Instruction *I) { + // We've fully visited the operands, so sum their cost with this node + // and update the cost map. 
+        int Cost = TTI.TCC_Free;
+        for (Value *OpV : I->operand_values())
+          if (auto *OpI = dyn_cast<Instruction>(OpV)) {
+            auto CostMapIt = SpecCostMap.find(OpI);
+            if (CostMapIt != SpecCostMap.end())
+              Cost += CostMapIt->second;
+          }
+        Cost += TTI.getUserCost(I);
+        bool Inserted = SpecCostMap.insert({I, Cost}).second;
+        (void)Inserted;
+        assert(Inserted && "Must not re-insert a cost during the DFS!");
+
+        // Now check if this node had a corresponding PHI node using it. If so,
+        // we need to decrement the outstanding user count for it.
+        auto UserPNsIt = UserToPNMap.find(I);
+        if (UserPNsIt == UserToPNMap.end())
+          return;
+        auto &UserPNs = UserPNsIt->second;
+        auto UserPNsSplitIt = std::stable_partition(
+            UserPNs.begin(), UserPNs.end(), [&](PHINode *UserPN) {
+              int &PNUserCount = PNUserCountMap.find(UserPN)->second;
+              assert(
+                  PNUserCount > 0 &&
+                  "Should never re-visit a PN after its user count hits zero!");
+              --PNUserCount;
+              return PNUserCount != 0;
+            });
+
+        // FIXME: Rather than one at a time, we should sum the savings as the
+        // cost will be completely shared.
+        SmallVector<Instruction *, 16> SpecWorklist;
+        for (auto *PN : llvm::make_range(UserPNsSplitIt, UserPNs.end())) {
+          int SpecCost = TTI.TCC_Free;
+          for (Use &U : PN->uses())
+            SpecCost +=
+                SpecCostMap.find(cast<Instruction>(U.getUser()))->second;
+          SpecCost *= (NumPreds - 1);
+          // When the user count of a PHI node hits zero, we should check its
+          // profitability. If profitable, we should mark it for speculation
+          // and zero out the cost of everything it depends on.
+          int CostSavings = CostSavingsMap.find(PN)->second;
+          if (SpecCost > CostSavings) {
+            DEBUG(dbgs() << "  Not profitable, speculation cost: " << *PN << "\n"
+                            "    Cost savings: " << CostSavings << "\n"
+                            "    Speculation cost: " << SpecCost << "\n");
+            continue;
+          }
+
+          // We're going to speculate this user-associated PHI. Copy it out and
+          // add its users to the worklist to update their cost.
+          SpecPNs.push_back(PN);
+          for (Use &U : PN->uses()) {
+            auto *UI = cast<Instruction>(U.getUser());
+            auto CostMapIt = SpecCostMap.find(UI);
+            if (CostMapIt->second == 0)
+              continue;
+            // Zero out this cost entry to avoid duplicates.
+            CostMapIt->second = 0;
+            SpecWorklist.push_back(UI);
+          }
+        }
+
+        // Now walk all the operands of the users in the worklist transitively
+        // to zero out all the memoized costs.
+        while (!SpecWorklist.empty()) {
+          Instruction *SpecI = SpecWorklist.pop_back_val();
+          assert(SpecCostMap.find(SpecI)->second == 0 &&
+                 "Didn't zero out a cost!");
+
+          // Walk the operands recursively to zero out their cost as well.
+          for (auto *OpV : SpecI->operand_values()) {
+            auto *OpI = dyn_cast<Instruction>(OpV);
+            if (!OpI)
+              continue;
+            auto CostMapIt = SpecCostMap.find(OpI);
+            if (CostMapIt == SpecCostMap.end() || CostMapIt->second == 0)
+              continue;
+            CostMapIt->second = 0;
+            SpecWorklist.push_back(OpI);
+          }
+        }
+      });
+
+  return SpecPNs;
+}
+
+/// Speculate users around a set of PHI nodes.
+///
+/// This routine does the actual speculation around a set of PHI nodes where we
+/// have determined this to be both safe and profitable.
+///
+/// This routine handles any splitting of critical edges necessary to create
+/// a safe block to speculate into as well as cloning the instructions and
+/// rewriting all uses.
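In its simplest shape (all names here are illustrative, not taken from a real test), speculating %v = add i32 %p, %x around %p = phi i32 [ 7, %a ], [ 11, %b ] clones the add into each predecessor as %v.0 = add i32 7, %x in %a and %v.1 = add i32 11, %x in %b, then replaces any remaining uses of %v with %v.phi = phi i32 [ %v.0, %a ], [ %v.1, %b ]. The numeric and ".phi" suffixes match the renaming the routine below performs.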
+static void speculatePHIs(ArrayRef<PHINode *> SpecPNs,
+                          SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
+                          SmallSetVector<BasicBlock *, 16> &PredSet,
+                          DominatorTree &DT) {
+  DEBUG(dbgs() << "  Speculating around " << SpecPNs.size() << " PHIs!\n");
+  NumPHIsSpeculated += SpecPNs.size();
+
+  // Split any critical edges so that we have a block to hoist into.
+  auto *ParentBB = SpecPNs[0]->getParent();
+  SmallVector<BasicBlock *, 16> SpecPreds;
+  SpecPreds.reserve(PredSet.size());
+  for (auto *PredBB : PredSet) {
+    auto *NewPredBB = SplitCriticalEdge(
+        PredBB, ParentBB,
+        CriticalEdgeSplittingOptions(&DT).setMergeIdenticalEdges());
+    if (NewPredBB) {
+      ++NumEdgesSplit;
+      DEBUG(dbgs() << "  Split critical edge from: " << PredBB->getName()
+                   << "\n");
+      SpecPreds.push_back(NewPredBB);
+    } else {
+      assert(PredBB->getSingleSuccessor() == ParentBB &&
+             "We need a non-critical predecessor to speculate into.");
+      assert(!isa<InvokeInst>(PredBB->getTerminator()) &&
+             "Cannot have a non-critical invoke!");
+
+      // Already non-critical, use existing pred.
+      SpecPreds.push_back(PredBB);
+    }
+  }
+
+  SmallPtrSet<Instruction *, 16> SpecSet;
+  SmallVector<Instruction *, 16> SpecList;
+  visitPHIUsersAndDepsInPostOrder(SpecPNs,
+                                  /*IsVisited*/
+                                  [&](Instruction *I) {
+                                    // This is visited if we don't need to
+                                    // speculate it or we already have
+                                    // speculated it.
+                                    return !PotentialSpecSet.count(I) ||
+                                           SpecSet.count(I);
+                                  },
+                                  /*Visit*/
+                                  [&](Instruction *I) {
+                                    // All operands scheduled, schedule this
+                                    // node.
+                                    SpecSet.insert(I);
+                                    SpecList.push_back(I);
+                                  });
+
+  int NumSpecInsts = SpecList.size() * SpecPreds.size();
+  int NumRedundantInsts = NumSpecInsts - SpecList.size();
+  DEBUG(dbgs() << "  Inserting " << NumSpecInsts << " speculated instructions, "
+               << NumRedundantInsts << " redundancies\n");
+  NumSpeculatedInstructions += NumSpecInsts;
+  NumNewRedundantInstructions += NumRedundantInsts;
+
+  // Each predecessor is numbered by its index in `SpecPreds`, so for each
+  // instruction we speculate, the speculated instruction is stored in that
+  // index of the vector associated with the original instruction. We also
+  // store the incoming values for each predecessor from any PHIs used.
+  SmallDenseMap<Instruction *, SmallVector<Value *, 2>, 16> SpeculatedValueMap;
+
+  // Inject the synthetic mappings to rewrite PHIs to the appropriate incoming
+  // value. This handles both the PHIs we are speculating around and any other
+  // PHIs that happen to be used.
+  for (auto *OrigI : SpecList)
+    for (auto *OpV : OrigI->operand_values()) {
+      auto *OpPN = dyn_cast<PHINode>(OpV);
+      if (!OpPN || OpPN->getParent() != ParentBB)
+        continue;
+
+      auto InsertResult = SpeculatedValueMap.insert({OpPN, {}});
+      if (!InsertResult.second)
+        continue;
+
+      auto &SpeculatedVals = InsertResult.first->second;
+
+      // Populating our structure for mapping is particularly annoying because
+      // finding an incoming value for a particular predecessor block in a PHI
+      // node is a linear time operation! To avoid quadratic behavior, we build
+      // a map for this PHI node's incoming values and then translate it into
+      // the more compact representation used below.
+      SmallDenseMap<BasicBlock *, Value *, 16> IncomingValueMap;
+      for (int i : llvm::seq<int>(0, OpPN->getNumIncomingValues()))
+        IncomingValueMap[OpPN->getIncomingBlock(i)] = OpPN->getIncomingValue(i);
+
+      for (auto *PredBB : SpecPreds)
+        SpeculatedVals.push_back(IncomingValueMap.find(PredBB)->second);
+    }
+
+  // Speculate into each predecessor.
+ for (int PredIdx : llvm::seq<int>(0, SpecPreds.size())) { + auto *PredBB = SpecPreds[PredIdx]; + assert(PredBB->getSingleSuccessor() == ParentBB && + "We need a non-critical predecessor to speculate into."); + + for (auto *OrigI : SpecList) { + auto *NewI = OrigI->clone(); + NewI->setName(Twine(OrigI->getName()) + "." + Twine(PredIdx)); + NewI->insertBefore(PredBB->getTerminator()); + + // Rewrite all the operands to the previously speculated instructions. + // Because we're walking in-order, the defs must precede the uses and we + // should already have these mappings. + for (Use &U : NewI->operands()) { + auto *OpI = dyn_cast<Instruction>(U.get()); + if (!OpI) + continue; + auto MapIt = SpeculatedValueMap.find(OpI); + if (MapIt == SpeculatedValueMap.end()) + continue; + const auto &SpeculatedVals = MapIt->second; + assert(SpeculatedVals[PredIdx] && + "Must have a speculated value for this predecessor!"); + assert(SpeculatedVals[PredIdx]->getType() == OpI->getType() && + "Speculated value has the wrong type!"); + + // Rewrite the use to this predecessor's speculated instruction. + U.set(SpeculatedVals[PredIdx]); + } + + // Commute instructions which now have a constant in the LHS but not the + // RHS. + if (NewI->isBinaryOp() && NewI->isCommutative() && + isa<Constant>(NewI->getOperand(0)) && + !isa<Constant>(NewI->getOperand(1))) + NewI->getOperandUse(0).swap(NewI->getOperandUse(1)); + + SpeculatedValueMap[OrigI].push_back(NewI); + assert(SpeculatedValueMap[OrigI][PredIdx] == NewI && + "Mismatched speculated instruction index!"); + } + } + + // Walk the speculated instruction list and if they have uses, insert a PHI + // for them from the speculated versions, and replace the uses with the PHI. + // Then erase the instructions as they have been fully speculated. The walk + // needs to be in reverse so that we don't think there are users when we'll + // actually eventually remove them later. + IRBuilder<> IRB(SpecPNs[0]); + for (auto *OrigI : llvm::reverse(SpecList)) { + // Check if we need a PHI for any remaining users and if so, insert it. + if (!OrigI->use_empty()) { + auto *SpecIPN = IRB.CreatePHI(OrigI->getType(), SpecPreds.size(), + Twine(OrigI->getName()) + ".phi"); + // Add the incoming values we speculated. + auto &SpeculatedVals = SpeculatedValueMap.find(OrigI)->second; + for (int PredIdx : llvm::seq<int>(0, SpecPreds.size())) + SpecIPN->addIncoming(SpeculatedVals[PredIdx], SpecPreds[PredIdx]); + + // And replace the uses with the PHI node. + OrigI->replaceAllUsesWith(SpecIPN); + } + + // It is important to immediately erase this so that it stops using other + // instructions. This avoids inserting needless PHIs of them. + OrigI->eraseFromParent(); + } + + // All of the uses of the speculated phi nodes should be removed at this + // point, so erase them. + for (auto *SpecPN : SpecPNs) { + assert(SpecPN->use_empty() && "All users should have been speculated!"); + SpecPN->eraseFromParent(); + } +} + +/// Try to speculate around a series of PHIs from a single basic block. +/// +/// This routine checks whether any of these PHIs are profitable to speculate +/// users around. If safe and profitable, it does the speculation. It returns +/// true when at least some speculation occurs. +static bool tryToSpeculatePHIs(SmallVectorImpl<PHINode *> &PNs, + DominatorTree &DT, TargetTransformInfo &TTI) { + DEBUG(dbgs() << "Evaluating phi nodes for speculation:\n"); + + // Savings in cost from speculating around a PHI node. 
+  SmallDenseMap<PHINode *, int, 16> CostSavingsMap;
+
+  // Remember the set of instructions that are candidates for speculation so
+  // that we can quickly walk things within that space. This prunes out
+  // instructions already available along edges, etc.
+  SmallPtrSet<Instruction *, 16> PotentialSpecSet;
+
+  // Remember the set of instructions that are (transitively) unsafe to
+  // speculate into the incoming edges of this basic block. This avoids
+  // recomputing them for each PHI node we check. This set is specific to this
+  // block though as things are pruned out of it based on what is available
+  // along incoming edges.
+  SmallPtrSet<Instruction *, 16> UnsafeSet;
+
+  // For each PHI node in this block, check whether there are immediate folding
+  // opportunities from speculation, and whether that speculation will be
+  // valid. This determines the set of safe PHIs to speculate.
+  PNs.erase(llvm::remove_if(PNs,
+                            [&](PHINode *PN) {
+                              return !isSafeAndProfitableToSpeculateAroundPHI(
+                                  *PN, CostSavingsMap, PotentialSpecSet,
+                                  UnsafeSet, DT, TTI);
+                            }),
+            PNs.end());
+  // If no PHIs were profitable, skip.
+  if (PNs.empty()) {
+    DEBUG(dbgs() << "  No safe and profitable PHIs found!\n");
+    return false;
+  }
+
+  // We need to know how much speculation will cost, which is determined by how
+  // many incoming edges will need a copy of each speculated instruction.
+  SmallSetVector<BasicBlock *, 16> PredSet;
+  for (auto *PredBB : PNs[0]->blocks()) {
+    if (!PredSet.insert(PredBB))
+      continue;
+
+    // We cannot speculate when a predecessor is an indirect branch.
+    // FIXME: We also can't reliably create a non-critical edge block for
+    // speculation if the predecessor is an invoke. This doesn't seem
+    // fundamental and we should probably be splitting critical edges
+    // differently.
+    if (isa<IndirectBrInst>(PredBB->getTerminator()) ||
+        isa<InvokeInst>(PredBB->getTerminator())) {
+      DEBUG(dbgs() << "  Invalid: predecessor terminator: " << PredBB->getName()
+                   << "\n");
+      return false;
+    }
+  }
+  if (PredSet.size() < 2) {
+    DEBUG(dbgs() << "  Unimportant: phi with only one predecessor\n");
+    return false;
+  }
+
+  SmallVector<PHINode *, 16> SpecPNs = findProfitablePHIs(
+      PNs, CostSavingsMap, PotentialSpecSet, PredSet.size(), DT, TTI);
+  if (SpecPNs.empty())
+    // Nothing to do.
+ return false; + + speculatePHIs(SpecPNs, PotentialSpecSet, PredSet, DT); + return true; +} + +PreservedAnalyses SpeculateAroundPHIsPass::run(Function &F, + FunctionAnalysisManager &AM) { + auto &DT = AM.getResult<DominatorTreeAnalysis>(F); + auto &TTI = AM.getResult<TargetIRAnalysis>(F); + + bool Changed = false; + for (auto *BB : ReversePostOrderTraversal<Function *>(&F)) { + SmallVector<PHINode *, 16> PNs; + auto BBI = BB->begin(); + while (auto *PN = dyn_cast<PHINode>(&*BBI)) { + PNs.push_back(PN); + ++BBI; + } + + if (PNs.empty()) + continue; + + Changed |= tryToSpeculatePHIs(PNs, DT, TTI); + } + + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + return PA; +} diff --git a/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp index 8b8d6590aa6a..ce40af1223f6 100644 --- a/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp +++ b/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp @@ -1,4 +1,4 @@ -//===-- StraightLineStrengthReduce.cpp - ------------------------*- C++ -*-===// +//===- StraightLineStrengthReduce.cpp - -----------------------------------===// // // The LLVM Compiler Infrastructure // @@ -55,26 +55,45 @@ // // - When (i' - i) is constant but i and i' are not, we could still perform // SLSR. + +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" +#include <cassert> +#include <cstdint> +#include <limits> #include <list> #include <vector> using namespace llvm; using namespace PatternMatch; -namespace { +static const unsigned UnknownAddressSpace = + std::numeric_limits<unsigned>::max(); -static const unsigned UnknownAddressSpace = ~0u; +namespace { class StraightLineStrengthReduce : public FunctionPass { public: @@ -88,20 +107,22 @@ public: GEP, // &B[..][i * S][..] }; - Candidate() - : CandidateKind(Invalid), Base(nullptr), Index(nullptr), - Stride(nullptr), Ins(nullptr), Basis(nullptr) {} + Candidate() = default; Candidate(Kind CT, const SCEV *B, ConstantInt *Idx, Value *S, Instruction *I) - : CandidateKind(CT), Base(B), Index(Idx), Stride(S), Ins(I), - Basis(nullptr) {} - Kind CandidateKind; - const SCEV *Base; + : CandidateKind(CT), Base(B), Index(Idx), Stride(S), Ins(I) {} + + Kind CandidateKind = Invalid; + + const SCEV *Base = nullptr; + // Note that Index and Stride of a GEP candidate do not necessarily have the // same integer type. In that case, during rewriting, Stride will be // sign-extended or truncated to Index's type. - ConstantInt *Index; - Value *Stride; + ConstantInt *Index = nullptr; + + Value *Stride = nullptr; + // The instruction this candidate corresponds to. 
It helps us to rewrite a
   // candidate with respect to its immediate basis. Note that one instruction
   // can correspond to multiple candidates depending on how you associate the
@@ -116,16 +137,16 @@ public:
   // or
   //
   // <Base: b, Index: 2, Stride: a + 1>
-  Instruction *Ins;
+  Instruction *Ins = nullptr;
+
   // Points to the immediate basis of this candidate, or nullptr if we cannot
   // find any basis for this candidate.
-  Candidate *Basis;
+  Candidate *Basis = nullptr;
 };

 static char ID;

-  StraightLineStrengthReduce()
-      : FunctionPass(ID), DL(nullptr), DT(nullptr), TTI(nullptr) {
+  StraightLineStrengthReduce() : FunctionPass(ID) {
     initializeStraightLineStrengthReducePass(*PassRegistry::getPassRegistry());
   }

@@ -148,46 +169,58 @@ private:
   // Returns true if Basis is a basis for C, i.e., Basis dominates C and they
   // share the same base and stride.
   bool isBasisFor(const Candidate &Basis, const Candidate &C);
+
   // Returns whether the candidate can be folded into an addressing mode.
   bool isFoldable(const Candidate &C, TargetTransformInfo *TTI,
                   const DataLayout *DL);
+
   // Returns true if C is already in a simplest form and not worth being
   // rewritten.
   bool isSimplestForm(const Candidate &C);
+
   // Checks whether I is in a candidate form. If so, adds all the matching forms
   // to Candidates, and tries to find the immediate basis for each of them.
   void allocateCandidatesAndFindBasis(Instruction *I);
+
   // Allocate candidates and find bases for Add instructions.
   void allocateCandidatesAndFindBasisForAdd(Instruction *I);
+
   // Given I = LHS + RHS, factors RHS into i * S and makes (LHS + i * S) a
   // candidate.
   void allocateCandidatesAndFindBasisForAdd(Value *LHS, Value *RHS,
                                             Instruction *I);
   // Allocate candidates and find bases for Mul instructions.
   void allocateCandidatesAndFindBasisForMul(Instruction *I);
+
   // Splits LHS into Base + Index and, if it succeeds, calls
   // allocateCandidatesAndFindBasis.
   void allocateCandidatesAndFindBasisForMul(Value *LHS, Value *RHS,
                                             Instruction *I);
+
   // Allocate candidates and find bases for GetElementPtr instructions.
   void allocateCandidatesAndFindBasisForGEP(GetElementPtrInst *GEP);
+
   // A helper function that scales Idx with ElementSize before invoking
   // allocateCandidatesAndFindBasis.
   void allocateCandidatesAndFindBasisForGEP(const SCEV *B, ConstantInt *Idx,
                                             Value *S, uint64_t ElementSize,
                                             Instruction *I);
+
   // Adds the given form <CT, B, Idx, S> to Candidates, and finds its immediate
   // basis.
   void allocateCandidatesAndFindBasis(Candidate::Kind CT, const SCEV *B,
                                       ConstantInt *Idx, Value *S,
                                       Instruction *I);
+
   // Rewrites candidate C with respect to Basis.
   void rewriteCandidateWithBasis(const Candidate &C, const Candidate &Basis);
+
   // A helper function that factors ArrayIdx to a product of a stride and a
   // constant index, and invokes allocateCandidatesAndFindBasis with the
   // factorings.
   void factorArrayIndex(Value *ArrayIdx, const SCEV *Base, uint64_t ElementSize,
                         GetElementPtrInst *GEP);
+
   // Emit code that computes the "bump" from Basis to C.
If the candidate is a // GEP and the bump is not divisible by the element size of the GEP, this // function sets the BumpWithUglyGEP flag to notify its caller to bump the @@ -196,19 +229,22 @@ private: IRBuilder<> &Builder, const DataLayout *DL, bool &BumpWithUglyGEP); - const DataLayout *DL; - DominatorTree *DT; + const DataLayout *DL = nullptr; + DominatorTree *DT = nullptr; ScalarEvolution *SE; - TargetTransformInfo *TTI; + TargetTransformInfo *TTI = nullptr; std::list<Candidate> Candidates; + // Temporarily holds all instructions that are unlinked (but not deleted) by // rewriteCandidateWithBasis. These instructions will be actually removed // after all rewriting finishes. std::vector<Instruction *> UnlinkedInstructions; }; -} // anonymous namespace + +} // end anonymous namespace char StraightLineStrengthReduce::ID = 0; + INITIALIZE_PASS_BEGIN(StraightLineStrengthReduce, "slsr", "Straight line strength reduction", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) @@ -650,8 +686,8 @@ void StraightLineStrengthReduce::rewriteCandidateWithBasis( else Reduced = Builder.CreateGEP(nullptr, Basis.Ins, Bump); } + break; } - break; default: llvm_unreachable("C.CandidateKind is invalid"); }; diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp index 0cccb415efdb..2972e1cff9a4 100644 --- a/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -1,4 +1,4 @@ -//===-- StructurizeCFG.cpp ------------------------------------------------===// +//===- StructurizeCFG.cpp -------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -7,49 +7,72 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/PostOrderIterator.h" -#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/RegionInfo.h" #include "llvm/Analysis/RegionIterator.h" #include "llvm/Analysis/RegionPass.h" -#include "llvm/IR/Module.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/SSAUpdater.h" +#include <algorithm> +#include <cassert> +#include <utility> using namespace llvm; using namespace llvm::PatternMatch; #define DEBUG_TYPE "structurizecfg" +// The name for newly created blocks. +static const char *const FlowBlockName = "Flow"; + namespace { // Definition of the complex types used in this pass. 
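The typedef-to-using conversions just below are mechanical: the alias declaration reads left to right and, unlike typedef, extends to templates. A small sketch of the difference:

    #include "llvm/ADT/SmallVector.h"

    // Equivalent forms for a plain alias:
    typedef llvm::SmallVector<int, 8> IntVecT;
    using IntVec = llvm::SmallVector<int, 8>;

    // Alias templates have no typedef equivalent:
    template <typename T> using Vec8 = llvm::SmallVector<T, 8>;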
-typedef std::pair<BasicBlock *, Value *> BBValuePair; +using BBValuePair = std::pair<BasicBlock *, Value *>; -typedef SmallVector<RegionNode*, 8> RNVector; -typedef SmallVector<BasicBlock*, 8> BBVector; -typedef SmallVector<BranchInst*, 8> BranchVector; -typedef SmallVector<BBValuePair, 2> BBValueVector; +using RNVector = SmallVector<RegionNode *, 8>; +using BBVector = SmallVector<BasicBlock *, 8>; +using BranchVector = SmallVector<BranchInst *, 8>; +using BBValueVector = SmallVector<BBValuePair, 2>; -typedef SmallPtrSet<BasicBlock *, 8> BBSet; +using BBSet = SmallPtrSet<BasicBlock *, 8>; -typedef MapVector<PHINode *, BBValueVector> PhiMap; -typedef MapVector<BasicBlock *, BBVector> BB2BBVecMap; +using PhiMap = MapVector<PHINode *, BBValueVector>; +using BB2BBVecMap = MapVector<BasicBlock *, BBVector>; -typedef DenseMap<BasicBlock *, PhiMap> BBPhiMap; -typedef DenseMap<BasicBlock *, Value *> BBPredicates; -typedef DenseMap<BasicBlock *, BBPredicates> PredMap; -typedef DenseMap<BasicBlock *, BasicBlock*> BB2BBMap; - -// The name for newly created blocks. -static const char *const FlowBlockName = "Flow"; +using BBPhiMap = DenseMap<BasicBlock *, PhiMap>; +using BBPredicates = DenseMap<BasicBlock *, Value *>; +using PredMap = DenseMap<BasicBlock *, BBPredicates>; +using BB2BBMap = DenseMap<BasicBlock *, BasicBlock *>; /// Finds the nearest common dominator of a set of BasicBlocks. /// @@ -736,7 +759,6 @@ void StructurizeCFG::wireFlow(bool ExitUseAllowed, changeExit(PrevNode, Node->getEntry(), true); } PrevNode = Node; - } else { // Insert extra prefix node (or reuse last one) BasicBlock *Flow = needPrefix(false); diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp index 90c5c243f464..2a1106b41de2 100644 --- a/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -60,6 +60,7 @@ #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/CFG.h" #include "llvm/IR/CallSite.h" @@ -78,7 +79,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" using namespace llvm; #define DEBUG_TYPE "tailcallelim" @@ -177,7 +177,8 @@ struct AllocaDerivedValueTracker { }; } -static bool markTails(Function &F, bool &AllCallsAreTailCalls) { +static bool markTails(Function &F, bool &AllCallsAreTailCalls, + OptimizationRemarkEmitter *ORE) { if (F.callsFunctionThatReturnsTwice()) return false; AllCallsAreTailCalls = true; @@ -228,7 +229,7 @@ static bool markTails(Function &F, bool &AllCallsAreTailCalls) { Escaped = ESCAPED; CallInst *CI = dyn_cast<CallInst>(&I); - if (!CI || CI->isTailCall()) + if (!CI || CI->isTailCall() || isa<DbgInfoIntrinsic>(&I)) continue; bool IsNoTail = CI->isNoTailCall() || CI->hasOperandBundles(); @@ -252,9 +253,11 @@ static bool markTails(Function &F, bool &AllCallsAreTailCalls) { break; } if (SafeToTail) { - emitOptimizationRemark( - F.getContext(), "tailcallelim", F, CI->getDebugLoc(), - "marked this readnone call a tail call candidate"); + using namespace ore; + ORE->emit([&]() { + return OptimizationRemark(DEBUG_TYPE, "tailcall-readnone", CI) + << "marked as tail call candidate (readnone)"; + }); CI->setTailCall(); Modified = true; continue; @@ -299,9 +302,7 @@ static bool 
markTails(Function &F, bool &AllCallsAreTailCalls) { if (Visited[CI->getParent()] != ESCAPED) { // If the escape point was part way through the block, calls after the // escape point wouldn't have been put into DeferredTails. - emitOptimizationRemark(F.getContext(), "tailcallelim", F, - CI->getDebugLoc(), - "marked this call a tail call candidate"); + DEBUG(dbgs() << "Marked as tail call candidate: " << *CI << "\n"); CI->setTailCall(); Modified = true; } else { @@ -330,7 +331,7 @@ static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) { // Writes to memory only matter if they may alias the pointer // being loaded from. const DataLayout &DL = L->getModule()->getDataLayout(); - if ((AA->getModRefInfo(CI, MemoryLocation::get(L)) & MRI_Mod) || + if (isModSet(AA->getModRefInfo(CI, MemoryLocation::get(L))) || !isSafeToLoadUnconditionally(L->getPointerOperand(), L->getAlignment(), DL, L)) return false; @@ -491,7 +492,8 @@ static bool eliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, BasicBlock *&OldEntry, bool &TailCallsAreMarkedTail, SmallVectorImpl<PHINode *> &ArgumentPHIs, - AliasAnalysis *AA) { + AliasAnalysis *AA, + OptimizationRemarkEmitter *ORE) { // If we are introducing accumulator recursion to eliminate operations after // the call instruction that are both associative and commutative, the initial // value for the accumulator is placed in this variable. If this value is set @@ -551,8 +553,11 @@ static bool eliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, BasicBlock *BB = Ret->getParent(); Function *F = BB->getParent(); - emitOptimizationRemark(F->getContext(), "tailcallelim", *F, CI->getDebugLoc(), - "transforming tail recursion to loop"); + using namespace ore; + ORE->emit([&]() { + return OptimizationRemark(DEBUG_TYPE, "tailcall-recursion", CI) + << "transforming tail recursion into loop"; + }); // OK! We can transform this tail call. If this is the first one found, // create the new entry block, allowing us to branch back to the old entry. @@ -666,13 +671,11 @@ static bool eliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, return true; } -static bool foldReturnAndProcessPred(BasicBlock *BB, ReturnInst *Ret, - BasicBlock *&OldEntry, - bool &TailCallsAreMarkedTail, - SmallVectorImpl<PHINode *> &ArgumentPHIs, - bool CannotTailCallElimCallsMarkedTail, - const TargetTransformInfo *TTI, - AliasAnalysis *AA) { +static bool foldReturnAndProcessPred( + BasicBlock *BB, ReturnInst *Ret, BasicBlock *&OldEntry, + bool &TailCallsAreMarkedTail, SmallVectorImpl<PHINode *> &ArgumentPHIs, + bool CannotTailCallElimCallsMarkedTail, const TargetTransformInfo *TTI, + AliasAnalysis *AA, OptimizationRemarkEmitter *ORE) { bool Change = false; // Make sure this block is a trivial return block. 
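For intuition about the accumulator variable described earlier in this hunk, here is a schematic of the transformation in plain C++, not the pass's actual output. Eliminating the multiply that executes after the recursive call in

    int fac(int n) { return n <= 1 ? 1 : n * fac(n - 1); }

amounts to carrying the pending multiplies in an accumulator that starts at the operation's identity value:

    int fac(int n) {
      int Acc = 1;    // identity of the associative, commutative multiply
      for (; n > 1; --n)
        Acc *= n;     // fold the formerly-pending multiply into Acc
      return Acc;
    }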
@@ -708,7 +711,7 @@ static bool foldReturnAndProcessPred(BasicBlock *BB, ReturnInst *Ret,
       BB->eraseFromParent();

       eliminateRecursiveTailCall(CI, RI, OldEntry, TailCallsAreMarkedTail,
-                                 ArgumentPHIs, AA);
+                                 ArgumentPHIs, AA, ORE);
       ++NumRetDuped;
       Change = true;
     }
@@ -722,23 +725,25 @@ static bool processReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry,
                                   SmallVectorImpl<PHINode *> &ArgumentPHIs,
                                   bool CannotTailCallElimCallsMarkedTail,
                                   const TargetTransformInfo *TTI,
-                                  AliasAnalysis *AA) {
+                                  AliasAnalysis *AA,
+                                  OptimizationRemarkEmitter *ORE) {
   CallInst *CI = findTRECandidate(Ret, CannotTailCallElimCallsMarkedTail, TTI);
   if (!CI)
     return false;

   return eliminateRecursiveTailCall(CI, Ret, OldEntry, TailCallsAreMarkedTail,
-                                    ArgumentPHIs, AA);
+                                    ArgumentPHIs, AA, ORE);
 }

 static bool eliminateTailRecursion(Function &F, const TargetTransformInfo *TTI,
-                                   AliasAnalysis *AA) {
+                                   AliasAnalysis *AA,
+                                   OptimizationRemarkEmitter *ORE) {
   if (F.getFnAttribute("disable-tail-calls").getValueAsString() == "true")
     return false;

   bool MadeChange = false;
   bool AllCallsAreTailCalls = false;
-  MadeChange |= markTails(F, AllCallsAreTailCalls);
+  MadeChange |= markTails(F, AllCallsAreTailCalls, ORE);
   if (!AllCallsAreTailCalls)
     return MadeChange;
@@ -765,13 +770,13 @@ static bool eliminateTailRecursion(Function &F, const TargetTransformInfo *TTI,
   for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; /*in loop*/) {
     BasicBlock *BB = &*BBI++; // foldReturnAndProcessPred may delete BB.
     if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) {
-      bool Change =
-          processReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
-                                ArgumentPHIs, !CanTRETailMarkedCall, TTI, AA);
+      bool Change = processReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
+                                          ArgumentPHIs, !CanTRETailMarkedCall,
+                                          TTI, AA, ORE);
       if (!Change && BB->getFirstNonPHIOrDbg() == Ret)
         Change = foldReturnAndProcessPred(BB, Ret, OldEntry,
                                           TailCallsAreMarkedTail, ArgumentPHIs,
-                                          !CanTRETailMarkedCall, TTI, AA);
+                                          !CanTRETailMarkedCall, TTI, AA, ORE);
       MadeChange |= Change;
     }
   }
@@ -802,6 +807,7 @@ struct TailCallElim : public FunctionPass {
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<TargetTransformInfoWrapperPass>();
     AU.addRequired<AAResultsWrapperPass>();
+    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
     AU.addPreserved<GlobalsAAWrapperPass>();
   }

@@ -811,7 +817,8 @@ struct TailCallElim : public FunctionPass {

     return eliminateTailRecursion(
         F, &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F),
-        &getAnalysis<AAResultsWrapperPass>().getAAResults());
+        &getAnalysis<AAResultsWrapperPass>().getAAResults(),
+        &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE());
   }
 };
 }
@@ -820,6 +827,7 @@ char TailCallElim::ID = 0;
 INITIALIZE_PASS_BEGIN(TailCallElim, "tailcallelim", "Tail Call Elimination",
                       false, false)
 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
 INITIALIZE_PASS_END(TailCallElim, "tailcallelim", "Tail Call Elimination",
                     false, false)
@@ -833,8 +841,9 @@ PreservedAnalyses TailCallElimPass::run(Function &F,

   TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
   AliasAnalysis &AA = AM.getResult<AAManager>(F);
+  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

-  bool Changed = eliminateTailRecursion(F, &TTI, &AA);
+  bool Changed = eliminateTailRecursion(F, &TTI, &AA, &ORE);
   if (!Changed)
     return PreservedAnalyses::all();
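A minimal sketch of the remark-emission pattern this change adopts (the helper name is illustrative): emit() takes a callback, so the remark object is only constructed when remarks are actually enabled for the pass.

    #include "llvm/Analysis/OptimizationRemarkEmitter.h"
    #include "llvm/IR/DiagnosticInfo.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    static void noteTailCall(OptimizationRemarkEmitter &ORE, CallInst *CI) {
      ORE.emit([&]() {
        return OptimizationRemark("tailcallelim", "tailcall-recursion", CI)
               << "transforming tail recursion into loop";
      });
    }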
