diff options
Diffstat (limited to 'lib/Transforms/Scalar/LoopUnrollPass.cpp')
-rw-r--r-- | lib/Transforms/Scalar/LoopUnrollPass.cpp | 992 |
1 files changed, 509 insertions, 483 deletions
diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index ecef6dbe24e64..91af4a1922ce1 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -12,13 +12,13 @@ // counts of loops easily. //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SetVector.h" -#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/LoopUnrollAnalyzer.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -31,8 +31,11 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/UnrollLoop.h" #include <climits> +#include <utility> using namespace llvm; @@ -43,40 +46,54 @@ static cl::opt<unsigned> cl::desc("The baseline cost threshold for loop unrolling")); static cl::opt<unsigned> UnrollPercentDynamicCostSavedThreshold( - "unroll-percent-dynamic-cost-saved-threshold", cl::Hidden, + "unroll-percent-dynamic-cost-saved-threshold", cl::init(50), cl::Hidden, cl::desc("The percentage of estimated dynamic cost which must be saved by " "unrolling to allow unrolling up to the max threshold.")); static cl::opt<unsigned> UnrollDynamicCostSavingsDiscount( - "unroll-dynamic-cost-savings-discount", cl::Hidden, + "unroll-dynamic-cost-savings-discount", cl::init(100), cl::Hidden, cl::desc("This is the amount discounted from the total unroll cost when " "the unrolled form has a high dynamic cost savings (triggered by " "the '-unroll-perecent-dynamic-cost-saved-threshold' flag).")); static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze( - "unroll-max-iteration-count-to-analyze", cl::init(0), cl::Hidden, + "unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden, cl::desc("Don't allow loop unrolling to simulate more than this number of" "iterations when checking full unroll profitability")); -static cl::opt<unsigned> -UnrollCount("unroll-count", cl::Hidden, - cl::desc("Use this unroll count for all loops including those with " - "unroll_count pragma values, for testing purposes")); +static cl::opt<unsigned> UnrollCount( + "unroll-count", cl::Hidden, + cl::desc("Use this unroll count for all loops including those with " + "unroll_count pragma values, for testing purposes")); -static cl::opt<bool> -UnrollAllowPartial("unroll-allow-partial", cl::Hidden, - cl::desc("Allows loops to be partially unrolled until " - "-unroll-threshold loop size is reached.")); +static cl::opt<unsigned> UnrollMaxCount( + "unroll-max-count", cl::Hidden, + cl::desc("Set the max unroll count for partial and runtime unrolling, for" + "testing purposes")); + +static cl::opt<unsigned> UnrollFullMaxCount( + "unroll-full-max-count", cl::Hidden, + cl::desc( + "Set the max unroll count for full unrolling, for testing purposes")); static cl::opt<bool> -UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::Hidden, - cl::desc("Unroll loops with run-time trip counts")); + UnrollAllowPartial("unroll-allow-partial", cl::Hidden, + cl::desc("Allows loops to be partially unrolled until " + "-unroll-threshold loop size is reached.")); -static cl::opt<unsigned> -PragmaUnrollThreshold("pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden, - cl::desc("Unrolled size limit for loops with an unroll(full) or " - "unroll_count pragma.")); +static cl::opt<bool> UnrollAllowRemainder( + "unroll-allow-remainder", cl::Hidden, + cl::desc("Allow generation of a loop remainder (extra iterations) " + "when unrolling a loop.")); +static cl::opt<bool> + UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::Hidden, + cl::desc("Unroll loops with run-time trip counts")); + +static cl::opt<unsigned> PragmaUnrollThreshold( + "pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden, + cl::desc("Unrolled size limit for loops with an unroll(full) or " + "unroll_count pragma.")); /// A magic value for use with the Threshold parameter to indicate /// that the loop unroll should be performed regardless of how much @@ -88,26 +105,28 @@ static const unsigned NoThreshold = UINT_MAX; static const unsigned DefaultUnrollRuntimeCount = 8; /// Gather the various unrolling parameters based on the defaults, compiler -/// flags, TTI overrides, pragmas, and user specified parameters. +/// flags, TTI overrides and user specified parameters. static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( Loop *L, const TargetTransformInfo &TTI, Optional<unsigned> UserThreshold, Optional<unsigned> UserCount, Optional<bool> UserAllowPartial, - Optional<bool> UserRuntime, unsigned PragmaCount, bool PragmaFullUnroll, - bool PragmaEnableUnroll, unsigned TripCount) { + Optional<bool> UserRuntime) { TargetTransformInfo::UnrollingPreferences UP; // Set up the defaults UP.Threshold = 150; - UP.PercentDynamicCostSavedThreshold = 20; - UP.DynamicCostSavingsDiscount = 2000; - UP.OptSizeThreshold = 50; + UP.PercentDynamicCostSavedThreshold = 50; + UP.DynamicCostSavingsDiscount = 100; + UP.OptSizeThreshold = 0; UP.PartialThreshold = UP.Threshold; - UP.PartialOptSizeThreshold = UP.OptSizeThreshold; + UP.PartialOptSizeThreshold = 0; UP.Count = 0; UP.MaxCount = UINT_MAX; + UP.FullUnrollMaxCount = UINT_MAX; UP.Partial = false; UP.Runtime = false; + UP.AllowRemainder = true; UP.AllowExpensiveTripCount = false; + UP.Force = false; // Override with any target specific settings TTI.getUnrollingPreferences(L, UP); @@ -118,12 +137,6 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( UP.PartialThreshold = UP.PartialOptSizeThreshold; } - // Apply unroll count pragmas - if (PragmaCount) - UP.Count = PragmaCount; - else if (PragmaFullUnroll) - UP.Count = TripCount; - // Apply any user values specified by cl::opt if (UnrollThreshold.getNumOccurrences() > 0) { UP.Threshold = UnrollThreshold; @@ -134,10 +147,14 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( UnrollPercentDynamicCostSavedThreshold; if (UnrollDynamicCostSavingsDiscount.getNumOccurrences() > 0) UP.DynamicCostSavingsDiscount = UnrollDynamicCostSavingsDiscount; - if (UnrollCount.getNumOccurrences() > 0) - UP.Count = UnrollCount; + if (UnrollMaxCount.getNumOccurrences() > 0) + UP.MaxCount = UnrollMaxCount; + if (UnrollFullMaxCount.getNumOccurrences() > 0) + UP.FullUnrollMaxCount = UnrollFullMaxCount; if (UnrollAllowPartial.getNumOccurrences() > 0) UP.Partial = UnrollAllowPartial; + if (UnrollAllowRemainder.getNumOccurrences() > 0) + UP.AllowRemainder = UnrollAllowRemainder; if (UnrollRuntime.getNumOccurrences() > 0) UP.Runtime = UnrollRuntime; @@ -153,259 +170,42 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( if (UserRuntime.hasValue()) UP.Runtime = *UserRuntime; - if (PragmaCount > 0 || - ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount != 0)) { - // If the loop has an unrolling pragma, we want to be more aggressive with - // unrolling limits. Set thresholds to at least the PragmaTheshold value - // which is larger than the default limits. - if (UP.Threshold != NoThreshold) - UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold); - if (UP.PartialThreshold != NoThreshold) - UP.PartialThreshold = - std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold); - } - return UP; } namespace { -// This class is used to get an estimate of the optimization effects that we -// could get from complete loop unrolling. It comes from the fact that some -// loads might be replaced with concrete constant values and that could trigger -// a chain of instruction simplifications. -// -// E.g. we might have: -// int a[] = {0, 1, 0}; -// v = 0; -// for (i = 0; i < 3; i ++) -// v += b[i]*a[i]; -// If we completely unroll the loop, we would get: -// v = b[0]*a[0] + b[1]*a[1] + b[2]*a[2] -// Which then will be simplified to: -// v = b[0]* 0 + b[1]* 1 + b[2]* 0 -// And finally: -// v = b[1] -class UnrolledInstAnalyzer : private InstVisitor<UnrolledInstAnalyzer, bool> { - typedef InstVisitor<UnrolledInstAnalyzer, bool> Base; - friend class InstVisitor<UnrolledInstAnalyzer, bool>; - struct SimplifiedAddress { - Value *Base = nullptr; - ConstantInt *Offset = nullptr; - }; +/// A struct to densely store the state of an instruction after unrolling at +/// each iteration. +/// +/// This is designed to work like a tuple of <Instruction *, int> for the +/// purposes of hashing and lookup, but to be able to associate two boolean +/// states with each key. +struct UnrolledInstState { + Instruction *I; + int Iteration : 30; + unsigned IsFree : 1; + unsigned IsCounted : 1; +}; -public: - UnrolledInstAnalyzer(unsigned Iteration, - DenseMap<Value *, Constant *> &SimplifiedValues, - ScalarEvolution &SE) - : SimplifiedValues(SimplifiedValues), SE(SE) { - IterationNumber = SE.getConstant(APInt(64, Iteration)); +/// Hashing and equality testing for a set of the instruction states. +struct UnrolledInstStateKeyInfo { + typedef DenseMapInfo<Instruction *> PtrInfo; + typedef DenseMapInfo<std::pair<Instruction *, int>> PairInfo; + static inline UnrolledInstState getEmptyKey() { + return {PtrInfo::getEmptyKey(), 0, 0, 0}; } - - // Allow access to the initial visit method. - using Base::visit; - -private: - /// \brief A cache of pointer bases and constant-folded offsets corresponding - /// to GEP (or derived from GEP) instructions. - /// - /// In order to find the base pointer one needs to perform non-trivial - /// traversal of the corresponding SCEV expression, so it's good to have the - /// results saved. - DenseMap<Value *, SimplifiedAddress> SimplifiedAddresses; - - /// \brief SCEV expression corresponding to number of currently simulated - /// iteration. - const SCEV *IterationNumber; - - /// \brief A Value->Constant map for keeping values that we managed to - /// constant-fold on the given iteration. - /// - /// While we walk the loop instructions, we build up and maintain a mapping - /// of simplified values specific to this iteration. The idea is to propagate - /// any special information we have about loads that can be replaced with - /// constants after complete unrolling, and account for likely simplifications - /// post-unrolling. - DenseMap<Value *, Constant *> &SimplifiedValues; - - ScalarEvolution &SE; - - /// \brief Try to simplify instruction \param I using its SCEV expression. - /// - /// The idea is that some AddRec expressions become constants, which then - /// could trigger folding of other instructions. However, that only happens - /// for expressions whose start value is also constant, which isn't always the - /// case. In another common and important case the start value is just some - /// address (i.e. SCEVUnknown) - in this case we compute the offset and save - /// it along with the base address instead. - bool simplifyInstWithSCEV(Instruction *I) { - if (!SE.isSCEVable(I->getType())) - return false; - - const SCEV *S = SE.getSCEV(I); - if (auto *SC = dyn_cast<SCEVConstant>(S)) { - SimplifiedValues[I] = SC->getValue(); - return true; - } - - auto *AR = dyn_cast<SCEVAddRecExpr>(S); - if (!AR) - return false; - - const SCEV *ValueAtIteration = AR->evaluateAtIteration(IterationNumber, SE); - // Check if the AddRec expression becomes a constant. - if (auto *SC = dyn_cast<SCEVConstant>(ValueAtIteration)) { - SimplifiedValues[I] = SC->getValue(); - return true; - } - - // Check if the offset from the base address becomes a constant. - auto *Base = dyn_cast<SCEVUnknown>(SE.getPointerBase(S)); - if (!Base) - return false; - auto *Offset = - dyn_cast<SCEVConstant>(SE.getMinusSCEV(ValueAtIteration, Base)); - if (!Offset) - return false; - SimplifiedAddress Address; - Address.Base = Base->getValue(); - Address.Offset = Offset->getValue(); - SimplifiedAddresses[I] = Address; - return true; + static inline UnrolledInstState getTombstoneKey() { + return {PtrInfo::getTombstoneKey(), 0, 0, 0}; } - - /// Base case for the instruction visitor. - bool visitInstruction(Instruction &I) { - return simplifyInstWithSCEV(&I); + static inline unsigned getHashValue(const UnrolledInstState &S) { + return PairInfo::getHashValue({S.I, S.Iteration}); } - - /// Try to simplify binary operator I. - /// - /// TODO: Probably it's worth to hoist the code for estimating the - /// simplifications effects to a separate class, since we have a very similar - /// code in InlineCost already. - bool visitBinaryOperator(BinaryOperator &I) { - Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); - if (!isa<Constant>(LHS)) - if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS)) - LHS = SimpleLHS; - if (!isa<Constant>(RHS)) - if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS)) - RHS = SimpleRHS; - - Value *SimpleV = nullptr; - const DataLayout &DL = I.getModule()->getDataLayout(); - if (auto FI = dyn_cast<FPMathOperator>(&I)) - SimpleV = - SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL); - else - SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL); - - if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) - SimplifiedValues[&I] = C; - - if (SimpleV) - return true; - return Base::visitBinaryOperator(I); - } - - /// Try to fold load I. - bool visitLoad(LoadInst &I) { - Value *AddrOp = I.getPointerOperand(); - - auto AddressIt = SimplifiedAddresses.find(AddrOp); - if (AddressIt == SimplifiedAddresses.end()) - return false; - ConstantInt *SimplifiedAddrOp = AddressIt->second.Offset; - - auto *GV = dyn_cast<GlobalVariable>(AddressIt->second.Base); - // We're only interested in loads that can be completely folded to a - // constant. - if (!GV || !GV->hasDefinitiveInitializer() || !GV->isConstant()) - return false; - - ConstantDataSequential *CDS = - dyn_cast<ConstantDataSequential>(GV->getInitializer()); - if (!CDS) - return false; - - // We might have a vector load from an array. FIXME: for now we just bail - // out in this case, but we should be able to resolve and simplify such - // loads. - if(!CDS->isElementTypeCompatible(I.getType())) - return false; - - int ElemSize = CDS->getElementType()->getPrimitiveSizeInBits() / 8U; - assert(SimplifiedAddrOp->getValue().getActiveBits() < 64 && - "Unexpectedly large index value."); - int64_t Index = SimplifiedAddrOp->getSExtValue() / ElemSize; - if (Index >= CDS->getNumElements()) { - // FIXME: For now we conservatively ignore out of bound accesses, but - // we're allowed to perform the optimization in this case. - return false; - } - - Constant *CV = CDS->getElementAsConstant(Index); - assert(CV && "Constant expected."); - SimplifiedValues[&I] = CV; - - return true; - } - - bool visitCastInst(CastInst &I) { - // Propagate constants through casts. - Constant *COp = dyn_cast<Constant>(I.getOperand(0)); - if (!COp) - COp = SimplifiedValues.lookup(I.getOperand(0)); - if (COp) - if (Constant *C = - ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) { - SimplifiedValues[&I] = C; - return true; - } - - return Base::visitCastInst(I); - } - - bool visitCmpInst(CmpInst &I) { - Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); - - // First try to handle simplified comparisons. - if (!isa<Constant>(LHS)) - if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS)) - LHS = SimpleLHS; - if (!isa<Constant>(RHS)) - if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS)) - RHS = SimpleRHS; - - if (!isa<Constant>(LHS) && !isa<Constant>(RHS)) { - auto SimplifiedLHS = SimplifiedAddresses.find(LHS); - if (SimplifiedLHS != SimplifiedAddresses.end()) { - auto SimplifiedRHS = SimplifiedAddresses.find(RHS); - if (SimplifiedRHS != SimplifiedAddresses.end()) { - SimplifiedAddress &LHSAddr = SimplifiedLHS->second; - SimplifiedAddress &RHSAddr = SimplifiedRHS->second; - if (LHSAddr.Base == RHSAddr.Base) { - LHS = LHSAddr.Offset; - RHS = RHSAddr.Offset; - } - } - } - } - - if (Constant *CLHS = dyn_cast<Constant>(LHS)) { - if (Constant *CRHS = dyn_cast<Constant>(RHS)) { - if (Constant *C = ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) { - SimplifiedValues[&I] = C; - return true; - } - } - } - - return Base::visitCmpInst(I); + static inline bool isEqual(const UnrolledInstState &LHS, + const UnrolledInstState &RHS) { + return PairInfo::isEqual({LHS.I, LHS.Iteration}, {RHS.I, RHS.Iteration}); } }; -} // namespace - +} namespace { struct EstimatedUnrollCost { @@ -441,18 +241,25 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, assert(UnrollMaxIterationsCountToAnalyze < (INT_MAX / 2) && "The unroll iterations max is too large!"); + // Only analyze inner loops. We can't properly estimate cost of nested loops + // and we won't visit inner loops again anyway. + if (!L->empty()) + return None; + // Don't simulate loops with a big or unknown tripcount if (!UnrollMaxIterationsCountToAnalyze || !TripCount || TripCount > UnrollMaxIterationsCountToAnalyze) return None; SmallSetVector<BasicBlock *, 16> BBWorklist; + SmallSetVector<std::pair<BasicBlock *, BasicBlock *>, 4> ExitWorklist; DenseMap<Value *, Constant *> SimplifiedValues; SmallVector<std::pair<Value *, Constant *>, 4> SimplifiedInputValues; // The estimated cost of the unrolled form of the loop. We try to estimate // this by simplifying as much as we can while computing the estimate. int UnrolledCost = 0; + // We also track the estimated dynamic (that is, actually executed) cost in // the rolled form. This helps identify cases when the savings from unrolling // aren't just exposing dead control flows, but actual reduced dynamic @@ -460,6 +267,97 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, // unrolling. int RolledDynamicCost = 0; + // We track the simplification of each instruction in each iteration. We use + // this to recursively merge costs into the unrolled cost on-demand so that + // we don't count the cost of any dead code. This is essentially a map from + // <instruction, int> to <bool, bool>, but stored as a densely packed struct. + DenseSet<UnrolledInstState, UnrolledInstStateKeyInfo> InstCostMap; + + // A small worklist used to accumulate cost of instructions from each + // observable and reached root in the loop. + SmallVector<Instruction *, 16> CostWorklist; + + // PHI-used worklist used between iterations while accumulating cost. + SmallVector<Instruction *, 4> PHIUsedList; + + // Helper function to accumulate cost for instructions in the loop. + auto AddCostRecursively = [&](Instruction &RootI, int Iteration) { + assert(Iteration >= 0 && "Cannot have a negative iteration!"); + assert(CostWorklist.empty() && "Must start with an empty cost list"); + assert(PHIUsedList.empty() && "Must start with an empty phi used list"); + CostWorklist.push_back(&RootI); + for (;; --Iteration) { + do { + Instruction *I = CostWorklist.pop_back_val(); + + // InstCostMap only uses I and Iteration as a key, the other two values + // don't matter here. + auto CostIter = InstCostMap.find({I, Iteration, 0, 0}); + if (CostIter == InstCostMap.end()) + // If an input to a PHI node comes from a dead path through the loop + // we may have no cost data for it here. What that actually means is + // that it is free. + continue; + auto &Cost = *CostIter; + if (Cost.IsCounted) + // Already counted this instruction. + continue; + + // Mark that we are counting the cost of this instruction now. + Cost.IsCounted = true; + + // If this is a PHI node in the loop header, just add it to the PHI set. + if (auto *PhiI = dyn_cast<PHINode>(I)) + if (PhiI->getParent() == L->getHeader()) { + assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they " + "inherently simplify during unrolling."); + if (Iteration == 0) + continue; + + // Push the incoming value from the backedge into the PHI used list + // if it is an in-loop instruction. We'll use this to populate the + // cost worklist for the next iteration (as we count backwards). + if (auto *OpI = dyn_cast<Instruction>( + PhiI->getIncomingValueForBlock(L->getLoopLatch()))) + if (L->contains(OpI)) + PHIUsedList.push_back(OpI); + continue; + } + + // First accumulate the cost of this instruction. + if (!Cost.IsFree) { + UnrolledCost += TTI.getUserCost(I); + DEBUG(dbgs() << "Adding cost of instruction (iteration " << Iteration + << "): "); + DEBUG(I->dump()); + } + + // We must count the cost of every operand which is not free, + // recursively. If we reach a loop PHI node, simply add it to the set + // to be considered on the next iteration (backwards!). + for (Value *Op : I->operands()) { + // Check whether this operand is free due to being a constant or + // outside the loop. + auto *OpI = dyn_cast<Instruction>(Op); + if (!OpI || !L->contains(OpI)) + continue; + + // Otherwise accumulate its cost. + CostWorklist.push_back(OpI); + } + } while (!CostWorklist.empty()); + + if (PHIUsedList.empty()) + // We've exhausted the search. + break; + + assert(Iteration > 0 && + "Cannot track PHI-used values past the first iteration!"); + CostWorklist.append(PHIUsedList.begin(), PHIUsedList.end()); + PHIUsedList.clear(); + } + }; + // Ensure that we don't violate the loop structure invariants relied on by // this analysis. assert(L->isLoopSimplifyForm() && "Must put loop into normal form first."); @@ -502,7 +400,7 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, while (!SimplifiedInputValues.empty()) SimplifiedValues.insert(SimplifiedInputValues.pop_back_val()); - UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE); + UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L); BBWorklist.clear(); BBWorklist.insert(L->getHeader()); @@ -514,22 +412,32 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, // it. We don't change the actual IR, just count optimization // opportunities. for (Instruction &I : *BB) { - int InstCost = TTI.getUserCost(&I); + // Track this instruction's expected baseline cost when executing the + // rolled loop form. + RolledDynamicCost += TTI.getUserCost(&I); // Visit the instruction to analyze its loop cost after unrolling, - // and if the visitor returns false, include this instruction in the - // unrolled cost. - if (!Analyzer.visit(I)) - UnrolledCost += InstCost; - else { - DEBUG(dbgs() << " " << I - << " would be simplified if loop is unrolled.\n"); - (void)0; - } + // and if the visitor returns true, mark the instruction as free after + // unrolling and continue. + bool IsFree = Analyzer.visit(I); + bool Inserted = InstCostMap.insert({&I, (int)Iteration, + (unsigned)IsFree, + /*IsCounted*/ false}).second; + (void)Inserted; + assert(Inserted && "Cannot have a state for an unvisited instruction!"); + + if (IsFree) + continue; - // Also track this instructions expected cost when executing the rolled - // loop form. - RolledDynamicCost += InstCost; + // If the instruction might have a side-effect recursively account for + // the cost of it and all the instructions leading up to it. + if (I.mayHaveSideEffects()) + AddCostRecursively(I, Iteration); + + // Can't properly model a cost of a call. + // FIXME: With a proper cost model we should be able to do it. + if(isa<CallInst>(&I)) + return None; // If unrolled body turns out to be too big, bail out. if (UnrolledCost > MaxUnrolledLoopSize) { @@ -545,42 +453,45 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, // Add in the live successors by first checking whether we have terminator // that may be simplified based on the values simplified by this call. + BasicBlock *KnownSucc = nullptr; if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { if (BI->isConditional()) { if (Constant *SimpleCond = SimplifiedValues.lookup(BI->getCondition())) { - BasicBlock *Succ = nullptr; // Just take the first successor if condition is undef if (isa<UndefValue>(SimpleCond)) - Succ = BI->getSuccessor(0); - else - Succ = BI->getSuccessor( - cast<ConstantInt>(SimpleCond)->isZero() ? 1 : 0); - if (L->contains(Succ)) - BBWorklist.insert(Succ); - continue; + KnownSucc = BI->getSuccessor(0); + else if (ConstantInt *SimpleCondVal = + dyn_cast<ConstantInt>(SimpleCond)) + KnownSucc = BI->getSuccessor(SimpleCondVal->isZero() ? 1 : 0); } } } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { if (Constant *SimpleCond = SimplifiedValues.lookup(SI->getCondition())) { - BasicBlock *Succ = nullptr; // Just take the first successor if condition is undef if (isa<UndefValue>(SimpleCond)) - Succ = SI->getSuccessor(0); - else - Succ = SI->findCaseValue(cast<ConstantInt>(SimpleCond)) - .getCaseSuccessor(); - if (L->contains(Succ)) - BBWorklist.insert(Succ); - continue; + KnownSucc = SI->getSuccessor(0); + else if (ConstantInt *SimpleCondVal = + dyn_cast<ConstantInt>(SimpleCond)) + KnownSucc = SI->findCaseValue(SimpleCondVal).getCaseSuccessor(); } } + if (KnownSucc) { + if (L->contains(KnownSucc)) + BBWorklist.insert(KnownSucc); + else + ExitWorklist.insert({BB, KnownSucc}); + continue; + } // Add BB's successors to the worklist. for (BasicBlock *Succ : successors(BB)) if (L->contains(Succ)) BBWorklist.insert(Succ); + else + ExitWorklist.insert({BB, Succ}); + AddCostRecursively(*TI, Iteration); } // If we found no optimization opportunities on the first iteration, we @@ -591,6 +502,23 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, return None; } } + + while (!ExitWorklist.empty()) { + BasicBlock *ExitingBB, *ExitBB; + std::tie(ExitingBB, ExitBB) = ExitWorklist.pop_back_val(); + + for (Instruction &I : *ExitBB) { + auto *PN = dyn_cast<PHINode>(&I); + if (!PN) + break; + + Value *Op = PN->getIncomingValueForBlock(ExitingBB); + if (auto *OpI = dyn_cast<Instruction>(Op)) + if (L->contains(OpI)) + AddCostRecursively(*OpI, TripCount - 1); + } + } + DEBUG(dbgs() << "Analysis finished:\n" << "UnrolledCost: " << UnrolledCost << ", " << "RolledDynamicCost: " << RolledDynamicCost << "\n"); @@ -599,18 +527,18 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, /// ApproximateLoopSize - Approximate the size of the loop. static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, - bool &NotDuplicatable, + bool &NotDuplicatable, bool &Convergent, const TargetTransformInfo &TTI, AssumptionCache *AC) { SmallPtrSet<const Value *, 32> EphValues; CodeMetrics::collectEphemeralValues(L, AC, EphValues); CodeMetrics Metrics; - for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); - I != E; ++I) - Metrics.analyzeBasicBlock(*I, TTI, EphValues); + for (BasicBlock *BB : L->blocks()) + Metrics.analyzeBasicBlock(BB, TTI, EphValues); NumCalls = Metrics.NumInlineCandidates; NotDuplicatable = Metrics.notDuplicatable; + Convergent = Metrics.convergent; unsigned LoopSize = Metrics.NumInsts; @@ -676,21 +604,22 @@ static unsigned UnrollCountPragmaValue(const Loop *L) { // unrolling pass is run more than once (which it generally is). static void SetLoopAlreadyUnrolled(Loop *L) { MDNode *LoopID = L->getLoopID(); - if (!LoopID) return; - // First remove any existing loop unrolling metadata. SmallVector<Metadata *, 4> MDs; // Reserve first location for self reference to the LoopID metadata node. MDs.push_back(nullptr); - for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { - bool IsUnrollMetadata = false; - MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); - if (MD) { - const MDString *S = dyn_cast<MDString>(MD->getOperand(0)); - IsUnrollMetadata = S && S->getString().startswith("llvm.loop.unroll."); + + if (LoopID) { + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + bool IsUnrollMetadata = false; + MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); + if (MD) { + const MDString *S = dyn_cast<MDString>(MD->getOperand(0)); + IsUnrollMetadata = S && S->getString().startswith("llvm.loop.unroll."); + } + if (!IsUnrollMetadata) + MDs.push_back(LoopID->getOperand(i)); } - if (!IsUnrollMetadata) - MDs.push_back(LoopID->getOperand(i)); } // Add unroll(disable) metadata to disable future unrolling. @@ -737,9 +666,9 @@ static bool canUnrollCompletely(Loop *L, unsigned Threshold, (int64_t)UnrolledCost - (int64_t)DynamicCostSavingsDiscount <= (int64_t)Threshold) { DEBUG(dbgs() << " Can fully unroll, because unrolling will reduce the " - "expected dynamic cost by " << PercentDynamicCostSaved - << "% (threshold: " << PercentDynamicCostSavedThreshold - << "%)\n" + "expected dynamic cost by " + << PercentDynamicCostSaved << "% (threshold: " + << PercentDynamicCostSavedThreshold << "%)\n" << " and the unrolled cost (" << UnrolledCost << ") is less than the max threshold (" << DynamicCostSavingsDiscount << ").\n"); @@ -758,82 +687,77 @@ static bool canUnrollCompletely(Loop *L, unsigned Threshold, return false; } -static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, - ScalarEvolution *SE, const TargetTransformInfo &TTI, - AssumptionCache &AC, bool PreserveLCSSA, - Optional<unsigned> ProvidedCount, - Optional<unsigned> ProvidedThreshold, - Optional<bool> ProvidedAllowPartial, - Optional<bool> ProvidedRuntime) { - BasicBlock *Header = L->getHeader(); - DEBUG(dbgs() << "Loop Unroll: F[" << Header->getParent()->getName() - << "] Loop %" << Header->getName() << "\n"); +// Returns true if unroll count was set explicitly. +// Calculates unroll count and writes it to UP.Count. +static bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, + DominatorTree &DT, LoopInfo *LI, + ScalarEvolution *SE, unsigned TripCount, + unsigned TripMultiple, unsigned LoopSize, + TargetTransformInfo::UnrollingPreferences &UP) { + // BEInsns represents number of instructions optimized when "back edge" + // becomes "fall through" in unrolled loop. + // For now we count a conditional branch on a backedge and a comparison + // feeding it. + unsigned BEInsns = 2; + // Check for explicit Count. + // 1st priority is unroll count set by "unroll-count" option. + bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0; + if (UserUnrollCount) { + UP.Count = UnrollCount; + UP.AllowExpensiveTripCount = true; + UP.Force = true; + if (UP.AllowRemainder && + (LoopSize - BEInsns) * UP.Count + BEInsns < UP.Threshold) + return true; + } - if (HasUnrollDisablePragma(L)) { - return false; + // 2nd priority is unroll count set by pragma. + unsigned PragmaCount = UnrollCountPragmaValue(L); + if (PragmaCount > 0) { + UP.Count = PragmaCount; + UP.Runtime = true; + UP.AllowExpensiveTripCount = true; + UP.Force = true; + if (UP.AllowRemainder && + (LoopSize - BEInsns) * UP.Count + BEInsns < PragmaUnrollThreshold) + return true; } bool PragmaFullUnroll = HasUnrollFullPragma(L); - bool PragmaEnableUnroll = HasUnrollEnablePragma(L); - unsigned PragmaCount = UnrollCountPragmaValue(L); - bool HasPragma = PragmaFullUnroll || PragmaEnableUnroll || PragmaCount > 0; - - // Find trip count and trip multiple if count is not available - unsigned TripCount = 0; - unsigned TripMultiple = 1; - // If there are multiple exiting blocks but one of them is the latch, use the - // latch for the trip count estimation. Otherwise insist on a single exiting - // block for the trip count estimation. - BasicBlock *ExitingBlock = L->getLoopLatch(); - if (!ExitingBlock || !L->isLoopExiting(ExitingBlock)) - ExitingBlock = L->getExitingBlock(); - if (ExitingBlock) { - TripCount = SE->getSmallConstantTripCount(L, ExitingBlock); - TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock); + if (PragmaFullUnroll && TripCount != 0) { + UP.Count = TripCount; + if ((LoopSize - BEInsns) * UP.Count + BEInsns < PragmaUnrollThreshold) + return false; } - TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences( - L, TTI, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial, - ProvidedRuntime, PragmaCount, PragmaFullUnroll, PragmaEnableUnroll, - TripCount); - - unsigned Count = UP.Count; - bool CountSetExplicitly = Count != 0; - // Use a heuristic count if we didn't set anything explicitly. - if (!CountSetExplicitly) - Count = TripCount == 0 ? DefaultUnrollRuntimeCount : TripCount; - if (TripCount && Count > TripCount) - Count = TripCount; + bool PragmaEnableUnroll = HasUnrollEnablePragma(L); + bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll || + PragmaEnableUnroll || UserUnrollCount; - unsigned NumInlineCandidates; - bool notDuplicatable; - unsigned LoopSize = - ApproximateLoopSize(L, NumInlineCandidates, notDuplicatable, TTI, &AC); - DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); + uint64_t UnrolledSize; + DebugLoc LoopLoc = L->getStartLoc(); + Function *F = L->getHeader()->getParent(); + LLVMContext &Ctx = F->getContext(); - // When computing the unrolled size, note that the conditional branch on the - // backedge and the comparison feeding it are not replicated like the rest of - // the loop body (which is why 2 is subtracted). - uint64_t UnrolledSize = (uint64_t)(LoopSize-2) * Count + 2; - if (notDuplicatable) { - DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable" - << " instructions.\n"); - return false; - } - if (NumInlineCandidates != 0) { - DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n"); - return false; + if (ExplicitUnroll && TripCount != 0) { + // If the loop has an unrolling pragma, we want to be more aggressive with + // unrolling limits. Set thresholds to at least the PragmaThreshold value + // which is larger than the default limits. + UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold); + UP.PartialThreshold = + std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold); } - // Given Count, TripCount and thresholds determine the type of - // unrolling which is to be performed. - enum { Full = 0, Partial = 1, Runtime = 2 }; - int Unrolling; - if (TripCount && Count == TripCount) { - Unrolling = Partial; - // If the loop is really small, we don't need to run an expensive analysis. + // 3rd priority is full unroll count. + // Full unroll make sense only when TripCount could be staticaly calculated. + // Also we need to check if we exceed FullUnrollMaxCount. + if (TripCount && TripCount <= UP.FullUnrollMaxCount) { + // When computing the unrolled size, note that BEInsns are not replicated + // like the rest of the loop body. + UnrolledSize = (uint64_t)(LoopSize - BEInsns) * TripCount + BEInsns; if (canUnrollCompletely(L, UP.Threshold, 100, UP.DynamicCostSavingsDiscount, UnrolledSize, UnrolledSize)) { - Unrolling = Full; + UP.Count = TripCount; + return ExplicitUnroll; } else { // The loop isn't that small, but we still can fully unroll it if that // helps to remove a significant number of instructions. @@ -845,99 +769,216 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, UP.PercentDynamicCostSavedThreshold, UP.DynamicCostSavingsDiscount, Cost->UnrolledCost, Cost->RolledDynamicCost)) { - Unrolling = Full; + UP.Count = TripCount; + return ExplicitUnroll; } } - } else if (TripCount && Count < TripCount) { - Unrolling = Partial; - } else { - Unrolling = Runtime; } - // Reduce count based on the type of unrolling and the threshold values. - unsigned OriginalCount = Count; - bool AllowRuntime = PragmaEnableUnroll || (PragmaCount > 0) || UP.Runtime; - // Don't unroll a runtime trip count loop with unroll full pragma. - if (HasRuntimeUnrollDisablePragma(L) || PragmaFullUnroll) { - AllowRuntime = false; - } - if (Unrolling == Partial) { - bool AllowPartial = PragmaEnableUnroll || UP.Partial; - if (!AllowPartial && !CountSetExplicitly) { + // 4rd priority is partial unrolling. + // Try partial unroll only when TripCount could be staticaly calculated. + if (TripCount) { + if (UP.Count == 0) + UP.Count = TripCount; + UP.Partial |= ExplicitUnroll; + if (!UP.Partial) { DEBUG(dbgs() << " will not try to unroll partially because " << "-unroll-allow-partial not given\n"); + UP.Count = 0; return false; } - if (UP.PartialThreshold != NoThreshold && - UnrolledSize > UP.PartialThreshold) { + if (UP.PartialThreshold != NoThreshold) { // Reduce unroll count to be modulo of TripCount for partial unrolling. - Count = (std::max(UP.PartialThreshold, 3u) - 2) / (LoopSize - 2); - while (Count != 0 && TripCount % Count != 0) - Count--; - } - } else if (Unrolling == Runtime) { - if (!AllowRuntime && !CountSetExplicitly) { - DEBUG(dbgs() << " will not try to unroll loop with runtime trip count " - << "-unroll-runtime not given\n"); - return false; - } - // Reduce unroll count to be the largest power-of-two factor of - // the original count which satisfies the threshold limit. - while (Count != 0 && UnrolledSize > UP.PartialThreshold) { - Count >>= 1; - UnrolledSize = (LoopSize-2) * Count + 2; + UnrolledSize = (uint64_t)(LoopSize - BEInsns) * UP.Count + BEInsns; + if (UnrolledSize > UP.PartialThreshold) + UP.Count = (std::max(UP.PartialThreshold, 3u) - BEInsns) / + (LoopSize - BEInsns); + if (UP.Count > UP.MaxCount) + UP.Count = UP.MaxCount; + while (UP.Count != 0 && TripCount % UP.Count != 0) + UP.Count--; + if (UP.AllowRemainder && UP.Count <= 1) { + // If there is no Count that is modulo of TripCount, set Count to + // largest power-of-two factor that satisfies the threshold limit. + // As we'll create fixup loop, do the type of unrolling only if + // remainder loop is allowed. + UP.Count = DefaultUnrollRuntimeCount; + UnrolledSize = (LoopSize - BEInsns) * UP.Count + BEInsns; + while (UP.Count != 0 && UnrolledSize > UP.PartialThreshold) { + UP.Count >>= 1; + UnrolledSize = (LoopSize - BEInsns) * UP.Count + BEInsns; + } + } + if (UP.Count < 2) { + if (PragmaEnableUnroll) + emitOptimizationRemarkMissed( + Ctx, DEBUG_TYPE, *F, LoopLoc, + "Unable to unroll loop as directed by unroll(enable) pragma " + "because unrolled size is too large."); + UP.Count = 0; + } + } else { + UP.Count = TripCount; } - if (Count > UP.MaxCount) - Count = UP.MaxCount; - DEBUG(dbgs() << " partially unrolling with count: " << Count << "\n"); - } - - if (HasPragma) { - if (PragmaCount != 0) - // If loop has an unroll count pragma mark loop as unrolled to prevent - // unrolling beyond that requested by the pragma. - SetLoopAlreadyUnrolled(L); - - // Emit optimization remarks if we are unable to unroll the loop - // as directed by a pragma. - DebugLoc LoopLoc = L->getStartLoc(); - Function *F = Header->getParent(); - LLVMContext &Ctx = F->getContext(); - if ((PragmaCount > 0) && Count != OriginalCount) { - emitOptimizationRemarkMissed( - Ctx, DEBUG_TYPE, *F, LoopLoc, - "Unable to unroll loop the number of times directed by " - "unroll_count pragma because unrolled size is too large."); - } else if (PragmaFullUnroll && !TripCount) { - emitOptimizationRemarkMissed( - Ctx, DEBUG_TYPE, *F, LoopLoc, - "Unable to fully unroll loop as directed by unroll(full) pragma " - "because loop has a runtime trip count."); - } else if (PragmaEnableUnroll && Count != TripCount && Count < 2) { + if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount && + UP.Count != TripCount) emitOptimizationRemarkMissed( Ctx, DEBUG_TYPE, *F, LoopLoc, - "Unable to unroll loop as directed by unroll(enable) pragma because " + "Unable to fully unroll loop as directed by unroll pragma because " "unrolled size is too large."); - } else if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount && - Count != TripCount) { + return ExplicitUnroll; + } + assert(TripCount == 0 && + "All cases when TripCount is constant should be covered here."); + if (PragmaFullUnroll) + emitOptimizationRemarkMissed( + Ctx, DEBUG_TYPE, *F, LoopLoc, + "Unable to fully unroll loop as directed by unroll(full) pragma " + "because loop has a runtime trip count."); + + // 5th priority is runtime unrolling. + // Don't unroll a runtime trip count loop when it is disabled. + if (HasRuntimeUnrollDisablePragma(L)) { + UP.Count = 0; + return false; + } + // Reduce count based on the type of unrolling and the threshold values. + UP.Runtime |= PragmaEnableUnroll || PragmaCount > 0 || UserUnrollCount; + if (!UP.Runtime) { + DEBUG(dbgs() << " will not try to unroll loop with runtime trip count " + << "-unroll-runtime not given\n"); + UP.Count = 0; + return false; + } + if (UP.Count == 0) + UP.Count = DefaultUnrollRuntimeCount; + UnrolledSize = (LoopSize - BEInsns) * UP.Count + BEInsns; + + // Reduce unroll count to be the largest power-of-two factor of + // the original count which satisfies the threshold limit. + while (UP.Count != 0 && UnrolledSize > UP.PartialThreshold) { + UP.Count >>= 1; + UnrolledSize = (LoopSize - BEInsns) * UP.Count + BEInsns; + } + +#ifndef NDEBUG + unsigned OrigCount = UP.Count; +#endif + + if (!UP.AllowRemainder && UP.Count != 0 && (TripMultiple % UP.Count) != 0) { + while (UP.Count != 0 && TripMultiple % UP.Count != 0) + UP.Count >>= 1; + DEBUG(dbgs() << "Remainder loop is restricted (that could architecture " + "specific or because the loop contains a convergent " + "instruction), so unroll count must divide the trip " + "multiple, " + << TripMultiple << ". Reducing unroll count from " + << OrigCount << " to " << UP.Count << ".\n"); + if (PragmaCount > 0 && !UP.AllowRemainder) emitOptimizationRemarkMissed( Ctx, DEBUG_TYPE, *F, LoopLoc, - "Unable to fully unroll loop as directed by unroll pragma because " - "unrolled size is too large."); - } + Twine("Unable to unroll loop the number of times directed by " + "unroll_count pragma because remainder loop is restricted " + "(that could architecture specific or because the loop " + "contains a convergent instruction) and so must have an unroll " + "count that divides the loop trip multiple of ") + + Twine(TripMultiple) + ". Unrolling instead " + Twine(UP.Count) + + " time(s)."); } - if (Unrolling != Full && Count < 2) { - // Partial unrolling by 1 is a nop. For full unrolling, a factor - // of 1 makes sense because loop control can be eliminated. + if (UP.Count > UP.MaxCount) + UP.Count = UP.MaxCount; + DEBUG(dbgs() << " partially unrolling with count: " << UP.Count << "\n"); + if (UP.Count < 2) + UP.Count = 0; + return ExplicitUnroll; +} + +static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, + ScalarEvolution *SE, const TargetTransformInfo &TTI, + AssumptionCache &AC, bool PreserveLCSSA, + Optional<unsigned> ProvidedCount, + Optional<unsigned> ProvidedThreshold, + Optional<bool> ProvidedAllowPartial, + Optional<bool> ProvidedRuntime) { + DEBUG(dbgs() << "Loop Unroll: F[" << L->getHeader()->getParent()->getName() + << "] Loop %" << L->getHeader()->getName() << "\n"); + if (HasUnrollDisablePragma(L)) { return false; } + unsigned NumInlineCandidates; + bool NotDuplicatable; + bool Convergent; + unsigned LoopSize = ApproximateLoopSize( + L, NumInlineCandidates, NotDuplicatable, Convergent, TTI, &AC); + DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); + if (NotDuplicatable) { + DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable" + << " instructions.\n"); + return false; + } + if (NumInlineCandidates != 0) { + DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n"); + return false; + } + if (!L->isLoopSimplifyForm()) { + DEBUG( + dbgs() << " Not unrolling loop which is not in loop-simplify form.\n"); + return false; + } + + // Find trip count and trip multiple if count is not available + unsigned TripCount = 0; + unsigned TripMultiple = 1; + // If there are multiple exiting blocks but one of them is the latch, use the + // latch for the trip count estimation. Otherwise insist on a single exiting + // block for the trip count estimation. + BasicBlock *ExitingBlock = L->getLoopLatch(); + if (!ExitingBlock || !L->isLoopExiting(ExitingBlock)) + ExitingBlock = L->getExitingBlock(); + if (ExitingBlock) { + TripCount = SE->getSmallConstantTripCount(L, ExitingBlock); + TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock); + } + + TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences( + L, TTI, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial, + ProvidedRuntime); + + // If the loop contains a convergent operation, the prelude we'd add + // to do the first few instructions before we hit the unrolled loop + // is unsafe -- it adds a control-flow dependency to the convergent + // operation. Therefore restrict remainder loop (try unrollig without). + // + // TODO: This is quite conservative. In practice, convergent_op() + // is likely to be called unconditionally in the loop. In this + // case, the program would be ill-formed (on most architectures) + // unless n were the same on all threads in a thread group. + // Assuming n is the same on all threads, any kind of unrolling is + // safe. But currently llvm's notion of convergence isn't powerful + // enough to express this. + if (Convergent) + UP.AllowRemainder = false; + + bool IsCountSetExplicitly = computeUnrollCount(L, TTI, DT, LI, SE, TripCount, + TripMultiple, LoopSize, UP); + if (!UP.Count) + return false; + // Unroll factor (Count) must be less or equal to TripCount. + if (TripCount && UP.Count > TripCount) + UP.Count = TripCount; + // Unroll the loop. - if (!UnrollLoop(L, Count, TripCount, AllowRuntime, UP.AllowExpensiveTripCount, - TripMultiple, LI, SE, &DT, &AC, PreserveLCSSA)) + if (!UnrollLoop(L, UP.Count, TripCount, UP.Force, UP.Runtime, + UP.AllowExpensiveTripCount, TripMultiple, LI, SE, &DT, &AC, + PreserveLCSSA)) return false; + // If loop has an unroll count pragma or unrolled by explicitly set count + // mark loop as unrolled to prevent unrolling beyond that requested. + if (IsCountSetExplicitly) + SetLoopAlreadyUnrolled(L); return true; } @@ -948,8 +989,9 @@ public: LoopUnroll(Optional<unsigned> Threshold = None, Optional<unsigned> Count = None, Optional<bool> AllowPartial = None, Optional<bool> Runtime = None) - : LoopPass(ID), ProvidedCount(Count), ProvidedThreshold(Threshold), - ProvidedAllowPartial(AllowPartial), ProvidedRuntime(Runtime) { + : LoopPass(ID), ProvidedCount(std::move(Count)), + ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial), + ProvidedRuntime(Runtime) { initializeLoopUnrollPass(*PassRegistry::getPassRegistry()); } @@ -959,7 +1001,7 @@ public: Optional<bool> ProvidedRuntime; bool runOnLoop(Loop *L, LPPassManager &) override { - if (skipOptnoneFunction(L)) + if (skipLoop(L)) return false; Function &F = *L->getHeader()->getParent(); @@ -982,35 +1024,19 @@ public: /// void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.addPreserved<LoopInfoWrapperPass>(); - AU.addRequiredID(LoopSimplifyID); - AU.addPreservedID(LoopSimplifyID); - AU.addRequiredID(LCSSAID); - AU.addPreservedID(LCSSAID); - AU.addRequired<ScalarEvolutionWrapperPass>(); - AU.addPreserved<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); - // FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info. - // If loop unroll does not preserve dom info then LCSSA pass on next - // loop will receive invalid dom info. - // For now, recreate dom info, if loop is unrolled. - AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); + // FIXME: Loop passes are required to preserve domtree, and for now we just + // recreate dom info if anything gets unrolled. + getLoopAnalysisUsage(AU); } }; } char LoopUnroll::ID = 0; INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopSimplify) -INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false) Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial, |