Diffstat (limited to 'lib/Transforms/Scalar/LoopStrengthReduce.cpp')
-rw-r--r--   lib/Transforms/Scalar/LoopStrengthReduce.cpp   432
1 file changed, 355 insertions, 77 deletions
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 194587a85e7c1..af137f6faa634 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -129,6 +129,17 @@ static cl::opt<bool> EnablePhiElim(
     "enable-lsr-phielim", cl::Hidden, cl::init(true),
     cl::desc("Enable LSR phi elimination"));
 
+// This flag adds instruction count to the solution cost comparison.
+static cl::opt<bool> InsnsCost(
+    "lsr-insns-cost", cl::Hidden, cl::init(false),
+    cl::desc("Add instruction count to an LSR cost model"));
+
+// Flag to choose how to narrow a complex LSR solution.
+static cl::opt<bool> LSRExpNarrow(
+    "lsr-exp-narrow", cl::Hidden, cl::init(false),
+    cl::desc("Narrow LSR complex solution using"
+             " expected number of registers"));
+
 #ifndef NDEBUG
 // Stress test IV chain generation.
 static cl::opt<bool> StressIVChain(
@@ -181,10 +192,11 @@ void RegSortData::print(raw_ostream &OS) const {
   OS << "[NumUses=" << UsedByIndices.count() << ']';
 }
 
-LLVM_DUMP_METHOD
-void RegSortData::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void RegSortData::dump() const {
   print(errs()); errs() << '\n';
 }
+#endif
 
 namespace {
 
@@ -295,9 +307,13 @@ struct Formula {
   /// canonical representation of a formula is
   /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
   /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
+  /// 3. The reg containing the recurrent expr related to the current loop
+  ///    should be put in the ScaledReg.
   /// #1 enforces that the scaled register is always used when at least two
   /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
   /// #2 enforces that 1 * reg is reg.
+  /// #3 ensures that regs invariant with respect to the current loop can be
+  ///    combined together in LSR codegen.
   /// This invariant can be temporarily broken while building a formula.
   /// However, every formula inserted into the LSRInstance must be in canonical
   /// form.
@@ -318,12 +334,14 @@ struct Formula {
 
   void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
 
-  bool isCanonical() const;
+  bool isCanonical(const Loop &L) const;
 
-  void canonicalize();
+  void canonicalize(const Loop &L);
 
   bool unscale();
 
+  bool hasZeroEnd() const;
+
   size_t getNumRegs() const;
   Type *getType() const;
 
@@ -410,16 +428,35 @@ void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
     BaseRegs.push_back(Sum);
     HasBaseReg = true;
   }
-  canonicalize();
+  canonicalize(*L);
 }
 
 /// \brief Check whether or not this formula satisfies the canonical
 /// representation.
 /// \see Formula::BaseRegs.
-bool Formula::isCanonical() const {
-  if (ScaledReg)
-    return Scale != 1 || !BaseRegs.empty();
-  return BaseRegs.size() <= 1;
+bool Formula::isCanonical(const Loop &L) const {
+  if (!ScaledReg)
+    return BaseRegs.size() <= 1;
+
+  if (Scale != 1)
+    return true;
+
+  if (Scale == 1 && BaseRegs.empty())
+    return false;
+
+  const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg);
+  if (SAR && SAR->getLoop() == &L)
+    return true;
+
+  // If ScaledReg is not a recurrent expr, or it is but its loop is not the
+  // current loop, while BaseRegs contains a recurrent expr reg related to the
+  // current loop, we want to swap the reg in BaseRegs with ScaledReg.
+  auto I =
+      find_if(make_range(BaseRegs.begin(), BaseRegs.end()), [&](const SCEV *S) {
+        return isa<const SCEVAddRecExpr>(S) &&
+               (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
+      });
+  return I == BaseRegs.end();
 }
 
 /// \brief Helper method to morph a formula into its canonical representation.
@@ -428,21 +465,33 @@ bool Formula::isCanonical() const {
 /// field. Otherwise, we would have to do special cases everywhere in LSR
 /// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
 /// On the other hand, 1*reg should be canonicalized into reg.
-void Formula::canonicalize() {
-  if (isCanonical())
+void Formula::canonicalize(const Loop &L) {
+  if (isCanonical(L))
     return;
   // So far we did not need this case. This is easy to implement but it is
   // useless to maintain dead code. Besides, it could hurt compile time.
   assert(!BaseRegs.empty() && "1*reg => reg, should not be needed.");
+
   // Keep the invariant sum in BaseRegs and one of the variant sums in ScaledReg.
-  ScaledReg = BaseRegs.back();
-  BaseRegs.pop_back();
-  Scale = 1;
-  size_t BaseRegsSize = BaseRegs.size();
-  size_t Try = 0;
-  // If ScaledReg is an invariant, try to find a variant expression.
-  while (Try < BaseRegsSize && !isa<SCEVAddRecExpr>(ScaledReg))
-    std::swap(ScaledReg, BaseRegs[Try++]);
+  if (!ScaledReg) {
+    ScaledReg = BaseRegs.back();
+    BaseRegs.pop_back();
+    Scale = 1;
+  }
+
+  // If ScaledReg is an invariant with respect to L, find the reg from
+  // BaseRegs containing the recurrent expr related to loop L. Swap the
+  // reg with ScaledReg.
+  const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg);
+  if (!SAR || SAR->getLoop() != &L) {
+    auto I = find_if(make_range(BaseRegs.begin(), BaseRegs.end()),
+                     [&](const SCEV *S) {
+                       return isa<const SCEVAddRecExpr>(S) &&
+                              (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
+                     });
+    if (I != BaseRegs.end())
+      std::swap(ScaledReg, *I);
+  }
 }
 
 /// \brief Get rid of the scale in the formula.
@@ -458,6 +507,14 @@ bool Formula::unscale() {
   return true;
 }
 
+bool Formula::hasZeroEnd() const {
+  if (UnfoldedOffset || BaseOffset)
+    return false;
+  if (BaseRegs.size() != 1 || ScaledReg)
+    return false;
+  return true;
+}
+
 /// Return the total number of register operands used by this formula. This
 /// does not include register uses implied by non-constant addrec strides.
 size_t Formula::getNumRegs() const {
@@ -534,10 +591,11 @@ void Formula::print(raw_ostream &OS) const {
   }
 }
 
-LLVM_DUMP_METHOD
-void Formula::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void Formula::dump() const {
   print(errs()); errs() << '\n';
 }
+#endif
 
 /// Return true if the given addrec can be sign-extended without changing its
 /// value.
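The effect of the new canonical-form rule #3 above is easiest to see in isolation. The following is a minimal, self-contained sketch of the swap performed by canonicalize(): the Expr and Formula types here are hypothetical stand-ins for SCEV and LSR's Formula, not the LLVM classes, and a register is modeled only by the loop it recurs in.

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <vector>

// Hypothetical stand-in for a SCEV: RecurLoop identifies the loop this
// expression recurs in, or is null if the expression is loop-invariant.
struct Expr { const char *Name; const void *RecurLoop; };

struct Formula {
  Expr *ScaledReg = nullptr;
  int Scale = 0;
  std::vector<Expr *> BaseRegs;

  // Rule #3: if any register recurs in the current loop L, one such
  // register must sit in the scaled position, leaving only loop-invariant
  // registers in BaseRegs so they can be combined during expansion.
  void canonicalize(const void *L) {
    if (!ScaledReg) {
      assert(!BaseRegs.empty() && "1*reg => reg, nothing to canonicalize");
      ScaledReg = BaseRegs.back();
      BaseRegs.pop_back();
      Scale = 1;
    }
    if (ScaledReg->RecurLoop == L)
      return;
    auto I = std::find_if(BaseRegs.begin(), BaseRegs.end(),
                          [&](Expr *E) { return E->RecurLoop == L; });
    if (I != BaseRegs.end())
      std::swap(ScaledReg, *I);
  }
};

int main() {
  int CurLoop = 0; // opaque identity of the current loop
  Expr A{"a", nullptr}, Rec{"{0,+,1}", &CurLoop};
  Formula F;
  F.BaseRegs = {&Rec, &A};   // reg({0,+,1}) + reg(a)
  F.canonicalize(&CurLoop);  // => reg(a) + 1*reg({0,+,1})
  std::printf("%s + %d*%s\n", F.BaseRegs[0]->Name, F.Scale, F.ScaledReg->Name);
}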
@@ -711,7 +769,7 @@ static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
 static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
   bool isAddress = isa<LoadInst>(Inst);
   if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
-    if (SI->getOperand(1) == OperandVal)
+    if (SI->getPointerOperand() == OperandVal)
       isAddress = true;
   } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
     // Addressing modes can also be folded into prefetches and a variety
@@ -723,6 +781,12 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
       isAddress = true;
       break;
     }
+  } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
+    if (RMW->getPointerOperand() == OperandVal)
+      isAddress = true;
+  } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+    if (CmpX->getPointerOperand() == OperandVal)
+      isAddress = true;
   }
   return isAddress;
 }
@@ -735,6 +799,10 @@ static MemAccessTy getAccessType(const Instruction *Inst) {
     AccessTy.AddrSpace = SI->getPointerAddressSpace();
   } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
     AccessTy.AddrSpace = LI->getPointerAddressSpace();
+  } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
+    AccessTy.AddrSpace = RMW->getPointerAddressSpace();
+  } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+    AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
   }
 
   // All pointers have the same requirements, so canonicalize them to an
@@ -875,7 +943,8 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
                                  const LSRUse &LU, const Formula &F);
 // Get the cost of the scaling factor used in F for LU.
 static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
-                                     const LSRUse &LU, const Formula &F);
+                                     const LSRUse &LU, const Formula &F,
+                                     const Loop &L);
 
 namespace {
 
@@ -883,6 +952,7 @@ namespace {
 class Cost {
   /// TODO: Some of these could be merged. Also, a lexical ordering
   /// isn't always optimal.
+  unsigned Insns;
   unsigned NumRegs;
   unsigned AddRecCost;
   unsigned NumIVMuls;
@@ -893,8 +963,8 @@ class Cost {
 
 public:
   Cost()
-    : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
-      SetupCost(0), ScaleCost(0) {}
+    : Insns(0), NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0),
+      ImmCost(0), SetupCost(0), ScaleCost(0) {}
 
   bool operator<(const Cost &Other) const;
 
@@ -903,9 +973,9 @@ public:
 #ifndef NDEBUG
   // Once any of the metrics loses, they must all remain losers.
   bool isValid() {
-    return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
+    return ((Insns | NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
              | ImmCost | SetupCost | ScaleCost) != ~0u)
-      || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
+      || ((Insns & NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
            & ImmCost & SetupCost & ScaleCost) == ~0u);
   }
 #endif
@@ -1067,7 +1137,8 @@ public:
   }
 
   bool HasFormulaWithSameRegs(const Formula &F) const;
-  bool InsertFormula(const Formula &F);
+  float getNotSelectedProbability(const SCEV *Reg) const;
+  bool InsertFormula(const Formula &F, const Loop &L);
   void DeleteFormula(Formula &F);
   void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
 
@@ -1083,17 +1154,23 @@ void Cost::RateRegister(const SCEV *Reg,
                         const Loop *L,
                         ScalarEvolution &SE, DominatorTree &DT) {
   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
-    // If this is an addrec for another loop, don't second-guess its addrec phi
-    // nodes. LSR isn't currently smart enough to reason about more than one
-    // loop at a time. LSR has already run on inner loops, will not run on outer
-    // loops, and cannot be expected to change sibling loops.
+    // If this is an addrec for another loop, it should be an invariant
+    // with respect to L, since L is the innermost loop (at least
+    // for now LSR only handles innermost loops).
     if (AR->getLoop() != L) {
       // If the AddRec exists, consider its register free and leave it alone.
       if (isExistingPhi(AR, SE))
         return;
 
-      // Otherwise, do not consider this formula at all.
-      Lose();
+      // It is bad to allow LSR for the current loop to add induction variables
+      // for its sibling loops.
+      if (!AR->getLoop()->contains(L)) {
+        Lose();
+        return;
+      }
+
+      // Otherwise, it will be an invariant with respect to loop L.
+      ++NumRegs;
       return;
     }
     AddRecCost += 1; /// TODO: This should be a function of the stride.
 
@@ -1150,8 +1227,11 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
                        ScalarEvolution &SE, DominatorTree &DT,
                        const LSRUse &LU,
                        SmallPtrSetImpl<const SCEV *> *LoserRegs) {
-  assert(F.isCanonical() && "Cost is accurate only for canonical formula");
+  assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
   // Tally up the registers.
+  unsigned PrevAddRecCost = AddRecCost;
+  unsigned PrevNumRegs = NumRegs;
+  unsigned PrevNumBaseAdds = NumBaseAdds;
   if (const SCEV *ScaledReg = F.ScaledReg) {
     if (VisitedRegs.count(ScaledReg)) {
       Lose();
@@ -1171,6 +1251,18 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
       return;
     }
 
+  // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
+  // an additional instruction (at least a fill).
+  unsigned TTIRegNum = TTI.getNumberOfRegisters(false) - 1;
+  if (NumRegs > TTIRegNum) {
+    // If the cost already exceeded TTIRegNum, then only the newly added
+    // registers can add new instructions.
+    if (PrevNumRegs > TTIRegNum)
+      Insns += (NumRegs - PrevNumRegs);
+    else
+      Insns += (NumRegs - TTIRegNum);
+  }
+
   // Determine how many (unfolded) adds we'll need inside the loop.
   size_t NumBaseParts = F.getNumRegs();
   if (NumBaseParts > 1)
@@ -1181,7 +1273,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
   NumBaseAdds += (F.UnfoldedOffset != 0);
 
   // Accumulate non-free scaling amounts.
-  ScaleCost += getScalingFactorCost(TTI, LU, F);
+  ScaleCost += getScalingFactorCost(TTI, LU, F, *L);
 
   // Tally up the non-zero immediates.
   for (const LSRFixup &Fixup : LU.Fixups) {
@@ -1199,11 +1291,30 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
         !TTI.isFoldableMemAccessOffset(Fixup.UserInst, Offset))
       NumBaseAdds++;
   }
+
+  // If an ICmpZero formula does not end in 0, it cannot be replaced by just
+  // an add or sub. We'll need to compare the final result of the AddRec,
+  // which means we'll need an additional instruction.
+  // For -10 + {0, +, 1}:
+  //   i = i + 1;
+  //   cmp i, 10
+  //
+  // For {-10, +, 1}:
+  //   i = i + 1;
+  if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd())
+    Insns++;
+  // Each new AddRec adds one instruction to the calculation.
+  Insns += (AddRecCost - PrevAddRecCost);
+
+  // BaseAdds adds instructions for unfolded registers.
+  if (LU.Kind != LSRUse::ICmpZero)
+    Insns += NumBaseAdds - PrevNumBaseAdds;
   assert(isValid() && "invalid cost");
 }
 
 /// Set this cost to a losing value.
 void Cost::Lose() {
+  Insns = ~0u;
   NumRegs = ~0u;
   AddRecCost = ~0u;
   NumIVMuls = ~0u;
@@ -1215,6 +1326,8 @@ void Cost::Lose() {
 
 /// Choose the lower cost.
 bool Cost::operator<(const Cost &Other) const {
+  if (InsnsCost && Insns != Other.Insns)
+    return Insns < Other.Insns;
   return std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost,
                   ImmCost, SetupCost) <
          std::tie(Other.NumRegs, Other.AddRecCost, Other.NumIVMuls,
@@ -1223,6 +1336,7 @@ bool Cost::operator<(const Cost &Other) const {
 }
 
 void Cost::print(raw_ostream &OS) const {
+  OS << Insns << " instruction" << (Insns == 1 ? " " : "s ");
   OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
   if (AddRecCost != 0)
     OS << ", with addrec cost " << AddRecCost;
@@ -1239,10 +1353,11 @@ void Cost::print(raw_ostream &OS) const {
     OS << ", plus " << SetupCost << " setup cost";
 }
 
-LLVM_DUMP_METHOD
-void Cost::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void Cost::dump() const {
   print(errs()); errs() << '\n';
 }
+#endif
 
 LSRFixup::LSRFixup()
   : UserInst(nullptr), OperandValToReplace(nullptr),
@@ -1285,10 +1400,11 @@ void LSRFixup::print(raw_ostream &OS) const {
     OS << ", Offset=" << Offset;
 }
 
-LLVM_DUMP_METHOD
-void LSRFixup::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LSRFixup::dump() const {
   print(errs()); errs() << '\n';
 }
+#endif
 
 /// Test whether this use has a formula with the same registers as the given
 /// formula.
@@ -1300,10 +1416,19 @@ bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
   return Uniquifier.count(Key);
 }
 
+/// Return the probability of selecting a formula that does not reference Reg.
+float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
+  unsigned FNum = 0;
+  for (const Formula &F : Formulae)
+    if (F.referencesReg(Reg))
+      FNum++;
+  return ((float)(Formulae.size() - FNum)) / Formulae.size();
+}
+
 /// If the given formula has not yet been inserted, add it to the list, and
 /// return true. Return false otherwise. The formula must be in canonical form.
-bool LSRUse::InsertFormula(const Formula &F) {
-  assert(F.isCanonical() && "Invalid canonical representation");
+bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
+  assert(F.isCanonical(L) && "Invalid canonical representation");
 
   if (!Formulae.empty() && RigidFormula)
     return false;
@@ -1391,10 +1516,11 @@ void LSRUse::print(raw_ostream &OS) const {
     OS << ", widest fixup type: " << *WidestFixupType;
 }
 
-LLVM_DUMP_METHOD
-void LSRUse::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LSRUse::dump() const {
   print(errs()); errs() << '\n';
 }
+#endif
 
 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
                                  LSRUse::KindType Kind, MemAccessTy AccessTy,
@@ -1472,7 +1598,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
                                  int64_t MinOffset, int64_t MaxOffset,
                                  LSRUse::KindType Kind, MemAccessTy AccessTy,
-                                 const Formula &F) {
+                                 const Formula &F, const Loop &L) {
   // For the purpose of isAMCompletelyFolded either having a canonical formula
   // or a scale not equal to zero is correct.
   // Problems may arise from non-canonical formulae having a scale == 0.
@@ -1480,7 +1606,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
   // However, when we generate the scaled formulae, we first check that the
   // scaling factor is profitable before computing the actual ScaledReg for
   // compile time's sake.
-  assert((F.isCanonical() || F.Scale != 0));
+  assert((F.isCanonical(L) || F.Scale != 0));
   return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
                               F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
 }
@@ -1515,14 +1641,15 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
 }
 
 static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
-                                     const LSRUse &LU, const Formula &F) {
+                                     const LSRUse &LU, const Formula &F,
+                                     const Loop &L) {
   if (!F.Scale)
     return 0;
 
   // If the use is not completely folded in that instruction, we will have to
   // pay an extra cost only for scale != 1.
   if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
-                            LU.AccessTy, F))
+                            LU.AccessTy, F, L))
     return F.Scale != 1;
 
   switch (LU.Kind) {
@@ -1772,6 +1899,7 @@ class LSRInstance {
   void NarrowSearchSpaceByDetectingSupersets();
   void NarrowSearchSpaceByCollapsingUnrolledCode();
   void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
+  void NarrowSearchSpaceByDeletingCostlyFormulas();
   void NarrowSearchSpaceByPickingWinnerRegs();
   void NarrowSearchSpaceUsingHeuristics();
 
@@ -2492,7 +2620,12 @@ static Value *getWideOperand(Value *Oper) {
 static bool isCompatibleIVType(Value *LVal, Value *RVal) {
   Type *LType = LVal->getType();
   Type *RType = RVal->getType();
-  return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy());
+  return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy() &&
+                              // Different address spaces mean (possibly)
+                              // different types of the pointer implementation,
+                              // e.g. i16 vs. i32, so disallow that.
+                              (LType->getPointerAddressSpace() ==
+                               RType->getPointerAddressSpace()));
 }
 
 /// Return an approximation of this SCEV expression's "base", or NULL for any
@@ -2989,8 +3122,10 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
       User::op_iterator UseI =
           find(UserInst->operands(), U.getOperandValToReplace());
       assert(UseI != UserInst->op_end() && "cannot find IV operand");
-      if (IVIncSet.count(UseI))
+      if (IVIncSet.count(UseI)) {
+        DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
         continue;
+      }
 
       LSRUse::KindType Kind = LSRUse::Basic;
       MemAccessTy AccessTy;
@@ -3025,8 +3160,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
         if (SE.isLoopInvariant(N, L) && isSafeToExpand(N, SE)) {
           // S is normalized, so normalize N before folding it into S
           // to keep the result normalized.
-          N = TransformForPostIncUse(Normalize, N, CI, nullptr,
-                                     TmpPostIncLoops, SE, DT);
+          N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
           Kind = LSRUse::ICmpZero;
           S = SE.getMinusSCEV(N, S);
         }
@@ -3108,7 +3242,8 @@ bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
   // Do not insert formula that we will not be able to expand.
   assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
          "Formula is illegal");
-  if (!LU.InsertFormula(F))
+
+  if (!LU.InsertFormula(F, *L))
     return false;
 
   CountRegisters(F, LUIdx);
@@ -3347,7 +3482,7 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
       F.BaseRegs.push_back(*J);
     // We may have changed the number of registers in base regs, adjust the
     // formula accordingly.
-    F.canonicalize();
+    F.canonicalize(*L);
 
     if (InsertFormula(LU, LUIdx, F))
      // If that formula hadn't been seen before, recurse to find more like
@@ -3359,7 +3494,7 @@
 
 /// Split out subexpressions from adds and the bases of addrecs.
 void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
                                          Formula Base, unsigned Depth) {
-  assert(Base.isCanonical() && "Input must be in the canonical form");
+  assert(Base.isCanonical(*L) && "Input must be in the canonical form");
 
   // Arbitrarily cap recursion to protect compile time.
   if (Depth >= 3)
     return;
@@ -3400,7 +3535,7 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
     // rather than proceed with zero in a register.
     if (!Sum->isZero()) {
       F.BaseRegs.push_back(Sum);
-      F.canonicalize();
+      F.canonicalize(*L);
       (void)InsertFormula(LU, LUIdx, F);
     }
   }
@@ -3457,7 +3592,7 @@ void LSRInstance::GenerateConstantOffsetsImpl(
         F.ScaledReg = nullptr;
       } else
         F.deleteBaseReg(F.BaseRegs[Idx]);
-      F.canonicalize();
+      F.canonicalize(*L);
     } else if (IsScaledReg)
       F.ScaledReg = NewG;
     else
@@ -3620,10 +3755,10 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
     if (LU.Kind == LSRUse::ICmpZero &&
         !Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV)
       continue;
-    // For each addrec base reg, apply the scale, if possible.
-    for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
-      if (const SCEVAddRecExpr *AR =
-            dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) {
+    // For each addrec base reg, if its loop is the current loop, apply the
+    // scale.
+    for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
+      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]);
+      if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
         const SCEV *FactorS = SE.getConstant(IntTy, Factor);
         if (FactorS->isZero())
           continue;
@@ -3637,11 +3772,17 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
           // The canonical representation of 1*reg is reg, which is already in
           // Base. In that case, do not try to insert the formula, it will be
           // rejected anyway.
-          if (F.Scale == 1 && F.BaseRegs.empty())
+          if (F.Scale == 1 && (F.BaseRegs.empty() ||
+                               (AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
             continue;
+          // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate
+          // a non-canonical Formula with ScaledReg's loop not being L.
+          if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
+            F.canonicalize(*L);
           (void)InsertFormula(LU, LUIdx, F);
         }
       }
+    }
   }
 }
@@ -3697,10 +3838,11 @@ void WorkItem::print(raw_ostream &OS) const {
      << " , add offset " << Imm;
 }
 
-LLVM_DUMP_METHOD
-void WorkItem::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void WorkItem::dump() const {
   print(errs()); errs() << '\n';
 }
+#endif
 
 /// Look for registers which are a constant distance apart and try to form reuse
 /// opportunities between them.
@@ -3821,7 +3963,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
             continue;
 
           // OK, looks good.
-          NewF.canonicalize();
+          NewF.canonicalize(*this->L);
           (void)InsertFormula(LU, LUIdx, NewF);
         } else {
           // Use the immediate in a base register.
@@ -3853,7 +3995,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
               goto skip_formula;
 
           // Ok, looks good.
-          NewF.canonicalize();
+          NewF.canonicalize(*this->L);
           (void)InsertFormula(LU, LUIdx, NewF);
           break;
         skip_formula:;
@@ -4165,6 +4307,144 @@ void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
   }
 }
 
+/// Delete formulas with a high expected number of registers. Since we do not
+/// know which formula will ultimately be selected (all inefficient ones have
+/// already been deleted), compute for each register the probability that it
+/// is not selected.
+/// For example,
+/// Use1:
+///  reg(a) + reg({0,+,1})
+///  reg(a) + reg({-1,+,1}) + 1
+///  reg({a,+,1})
+/// Use2:
+///  reg(b) + reg({0,+,1})
+///  reg(b) + reg({-1,+,1}) + 1
+///  reg({b,+,1})
+/// Use3:
+///  reg(c) + reg(b) + reg({0,+,1})
+///  reg(c) + reg({b,+,1})
+///
+/// Probability of not selecting
+///                Use1    Use2    Use3
+/// reg(a)         (1/3) *   1   *   1
+/// reg(b)           1   * (1/3) * (1/2)
+/// reg({0,+,1})   (2/3) * (2/3) * (1/2)
+/// reg({-1,+,1})  (2/3) * (2/3) *   1
+/// reg({a,+,1})   (2/3) *   1   *   1
+/// reg({b,+,1})     1   * (2/3) * (2/3)
+/// reg(c)           1   *   1   *   0
+///
+/// Now compute the expected number of registers for each formula.
+/// Note that for each use we exclude the probability of not being selected
+/// for that use. For example, for Use1 the probability for reg(a) would be
+/// just 1 * 1 (excluding the probability 1/3 of not being selected for Use1).
+/// Use1:
+///  reg(a) + reg({0,+,1})           1 + 1/3       -- to be deleted
+///  reg(a) + reg({-1,+,1}) + 1      1 + 4/9       -- to be deleted
+///  reg({a,+,1})                    1
+/// Use2:
+///  reg(b) + reg({0,+,1})           1/2 + 1/3     -- to be deleted
+///  reg(b) + reg({-1,+,1}) + 1      1/2 + 2/3     -- to be deleted
+///  reg({b,+,1})                    2/3
+/// Use3:
+///  reg(c) + reg(b) + reg({0,+,1})  1 + 1/3 + 4/9 -- to be deleted
+///  reg(c) + reg({b,+,1})           1 + 2/3
+
+void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
+  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+    return;
+  // Ok, we have too many formulae on our hands to conveniently handle.
+  // Use a rough heuristic to thin out the list.
+
+  // Set of Regs which will be 100% used in the final solution, i.e. used
+  // in every formula of a solution (in the example above this is reg(c)).
+  // We can skip them in calculations.
+  SmallPtrSet<const SCEV *, 4> UniqRegs;
+  DEBUG(dbgs() << "The search space is too complex.\n");
+
+  // Map each register to the probability of it not being selected.
+  DenseMap<const SCEV *, float> RegNumMap;
+  for (const SCEV *Reg : RegUses) {
+    if (UniqRegs.count(Reg))
+      continue;
+    float PNotSel = 1;
+    for (const LSRUse &LU : Uses) {
+      if (!LU.Regs.count(Reg))
+        continue;
+      float P = LU.getNotSelectedProbability(Reg);
+      if (P != 0.0)
+        PNotSel *= P;
+      else
+        UniqRegs.insert(Reg);
+    }
+    RegNumMap.insert(std::make_pair(Reg, PNotSel));
+  }
+
+  DEBUG(dbgs() << "Narrowing the search space by deleting costly formulas\n");
+
+  // Delete formulas whose expected number of registers is high.
+  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+    LSRUse &LU = Uses[LUIdx];
+    // If there is nothing to delete, continue.
+    if (LU.Formulae.size() < 2)
+      continue;
+    // This is a temporary solution to test performance. Float should be
+    // replaced with a rounding-independent type (based on integers) to avoid
+    // different results for different target builds.
+    float FMinRegNum = LU.Formulae[0].getNumRegs();
+    float FMinARegNum = LU.Formulae[0].getNumRegs();
+    size_t MinIdx = 0;
+    for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
+      Formula &F = LU.Formulae[i];
+      float FRegNum = 0;
+      float FARegNum = 0;
+      for (const SCEV *BaseReg : F.BaseRegs) {
+        if (UniqRegs.count(BaseReg))
+          continue;
+        FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
+        if (isa<SCEVAddRecExpr>(BaseReg))
+          FARegNum +=
+              RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
+      }
+      if (const SCEV *ScaledReg = F.ScaledReg) {
+        if (!UniqRegs.count(ScaledReg)) {
+          FRegNum +=
+              RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
+          if (isa<SCEVAddRecExpr>(ScaledReg))
+            FARegNum +=
+                RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
+        }
+      }
+      if (FMinRegNum > FRegNum ||
+          (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) {
+        FMinRegNum = FRegNum;
+        FMinARegNum = FARegNum;
+        MinIdx = i;
+      }
+    }
+    DEBUG(dbgs() << "  The formula "; LU.Formulae[MinIdx].print(dbgs());
+          dbgs() << " with min reg num " << FMinRegNum << '\n');
+    if (MinIdx != 0)
+      std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
+    while (LU.Formulae.size() != 1) {
+      DEBUG(dbgs() << "  Deleting "; LU.Formulae.back().print(dbgs());
+            dbgs() << '\n');
+      LU.Formulae.pop_back();
+    }
+    LU.RecomputeRegs(LUIdx, RegUses);
+    assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
+    Formula &F = LU.Formulae[0];
+    DEBUG(dbgs() << "  Leaving only "; F.print(dbgs()); dbgs() << '\n');
+    // When we choose the formula, the regs become unique.
+    UniqRegs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
+    if (F.ScaledReg)
+      UniqRegs.insert(F.ScaledReg);
+  }
+  DEBUG(dbgs() << "After pre-selection:\n";
+        print_uses(dbgs()));
+}
+
+
 /// Pick a register which seems likely to be profitable, and then in any use
 /// which has any reference to that register, delete all formulae which do not
 /// reference that register.
@@ -4237,7 +4517,10 @@ void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
   NarrowSearchSpaceByDetectingSupersets();
   NarrowSearchSpaceByCollapsingUnrolledCode();
   NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
-  NarrowSearchSpaceByPickingWinnerRegs();
+  if (LSRExpNarrow)
+    NarrowSearchSpaceByDeletingCostlyFormulas();
+  else
+    NarrowSearchSpaceByPickingWinnerRegs();
 }
 
 /// This is the recursive solver.
@@ -4515,11 +4798,7 @@ Value *LSRInstance::Expand(const LSRUse &LU,
     assert(!Reg->isZero() && "Zero allocated in a base register!");
 
     // If we're expanding for a post-inc user, make the post-inc adjustment.
-    PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
-    Reg = TransformForPostIncUse(Denormalize, Reg,
-                                 LF.UserInst, LF.OperandValToReplace,
-                                 Loops, SE, DT);
-
+    Reg = denormalizeForPostIncUse(Reg, LF.PostIncLoops, SE);
     Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr)));
   }
 
@@ -4530,9 +4809,7 @@ Value *LSRInstance::Expand(const LSRUse &LU,
 
     // If we're expanding for a post-inc user, make the post-inc adjustment.
     PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
-    ScaledS = TransformForPostIncUse(Denormalize, ScaledS,
-                                     LF.UserInst, LF.OperandValToReplace,
-                                     Loops, SE, DT);
+    ScaledS = denormalizeForPostIncUse(ScaledS, Loops, SE);
 
     if (LU.Kind == LSRUse::ICmpZero) {
       // Expand ScaleReg as if it was part of the base regs.
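For the probability heuristic behind NarrowSearchSpaceByDeletingCostlyFormulas() and LSRUse::getNotSelectedProbability() above, a small standalone model may help. The sketch below uses toy Formula and Use types, not the LLVM ones, and reproduces the Use1 numbers from the worked example in the comment: assuming each of a use's formulae is equally likely to be chosen, the probability that a register is not selected is the fraction of that use's formulae that do not reference it.

#include <cstdio>
#include <set>
#include <string>
#include <vector>

// Toy model: a formula is just the set of register names it references.
using Formula = std::set<std::string>;
using Use = std::vector<Formula>;

// Mirrors LSRUse::getNotSelectedProbability: the fraction of this use's
// formulae that do not reference Reg, each formula being equally likely.
static float notSelected(const Use &U, const std::string &Reg) {
  unsigned NotUsing = 0;
  for (const Formula &F : U)
    if (!F.count(Reg))
      ++NotUsing;
  return float(NotUsing) / U.size();
}

int main() {
  // Use1 from the comment above: three candidate formulae
  // (the "+ 1" constant in the second formula uses no register).
  Use Use1 = {{"a", "{0,+,1}"}, {"a", "{-1,+,1}"}, {"{a,+,1}"}};
  std::printf("P(reg(a) not selected for Use1)       = %.3f\n",
              notSelected(Use1, "a"));        // 1/3
  std::printf("P(reg({0,+,1}) not selected for Use1) = %.3f\n",
              notSelected(Use1, "{0,+,1}"));  // 2/3
}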
@@ -4975,10 +5252,11 @@ void LSRInstance::print(raw_ostream &OS) const {
   print_uses(OS);
 }
 
-LLVM_DUMP_METHOD
-void LSRInstance::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LSRInstance::dump() const {
   print(errs()); errs() << '\n';
 }
+#endif
 
 namespace {
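Finally, the register-pressure term that RateFormula() adds to Insns under -lsr-insns-cost can be stated compactly. The helper below is a hypothetical restatement of that accounting, not LLVM code: registers beyond the budget TTIRegNum = TTI.getNumberOfRegisters(false) - 1 are each charged as one extra instruction (at least a fill), and on a later rescoring only registers not already charged add new cost.

#include <algorithm>
#include <cstdio>

// Sketch of the spill charge in RateFormula: once the solution needs more
// registers than the budget, every further register costs one instruction.
static unsigned spillInsns(unsigned PrevNumRegs, unsigned NumRegs,
                           unsigned TargetRegs) {
  unsigned Budget = TargetRegs - 1; // TTI.getNumberOfRegisters(false) - 1
  if (NumRegs <= Budget)
    return 0;
  // Only registers beyond what was already charged add new instructions.
  return NumRegs - std::max(PrevNumRegs, Budget);
}

int main() {
  // With 16 scalar registers (budget 15): growing from 14 to 18 live
  // registers charges 3 (18 - 15); growing from 18 to 20 charges 2 more.
  std::printf("%u\n", spillInsns(14, 18, 16)); // 3
  std::printf("%u\n", spillInsns(18, 20, 16)); // 2
}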