author     Dimitry Andric <dim@FreeBSD.org>    2019-08-20 20:50:12 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2019-08-20 20:50:12 +0000
commit     e6d1592492a3a379186bfb02bd0f4eda0669c0d5 (patch)
tree       599ab169a01f1c86eda9adc774edaedde2f2db5b /lib/Transforms/Scalar/LoopStrengthReduce.cpp
parent     1a56a5ead7a2e84bee8240f5f6b033b5f1707154 (diff)
Diffstat (limited to 'lib/Transforms/Scalar/LoopStrengthReduce.cpp')
-rw-r--r--   lib/Transforms/Scalar/LoopStrengthReduce.cpp   344
1 file changed, 242 insertions(+), 102 deletions(-)
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 773ffb9df0a2..59a387a186b8 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1,9 +1,8 @@
 //===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -116,6 +115,7 @@
 #include <cstdlib>
 #include <iterator>
 #include <limits>
+#include <numeric>
 #include <map>
 #include <utility>
 
@@ -155,11 +155,19 @@ static cl::opt<bool> FilterSameScaledReg(
     cl::desc("Narrow LSR search space by filtering non-optimal formulae"
              " with the same ScaledReg and Scale"));
 
+static cl::opt<bool> EnableBackedgeIndexing(
+  "lsr-backedge-indexing", cl::Hidden, cl::init(true),
+  cl::desc("Enable the generation of cross iteration indexed memops"));
+
 static cl::opt<unsigned> ComplexityLimit(
   "lsr-complexity-limit", cl::Hidden,
   cl::init(std::numeric_limits<uint16_t>::max()),
   cl::desc("LSR search space complexity limit"));
 
+static cl::opt<unsigned> SetupCostDepthLimit(
+    "lsr-setupcost-depth-limit", cl::Hidden, cl::init(7),
+    cl::desc("The limit on recursion depth for LSRs setup cost"));
+
 #ifndef NDEBUG
 // Stress test IV chain generation.
 static cl::opt<bool> StressIVChain(
@@ -1007,10 +1015,15 @@ namespace {
 
 /// This class is used to measure and compare candidate formulae.
 class Cost {
+  const Loop *L = nullptr;
+  ScalarEvolution *SE = nullptr;
+  const TargetTransformInfo *TTI = nullptr;
   TargetTransformInfo::LSRCost C;
 
 public:
-  Cost() {
+  Cost() = delete;
+  Cost(const Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI) :
+    L(L), SE(&SE), TTI(&TTI) {
     C.Insns = 0;
     C.NumRegs = 0;
     C.AddRecCost = 0;
@@ -1021,7 +1034,7 @@ public:
     C.ScaleCost = 0;
   }
 
-  bool isLess(Cost &Other, const TargetTransformInfo &TTI);
+  bool isLess(Cost &Other);
 
   void Lose();
 
@@ -1040,12 +1053,9 @@ public:
     return C.NumRegs == ~0u;
   }
 
-  void RateFormula(const TargetTransformInfo &TTI,
-                   const Formula &F,
+  void RateFormula(const Formula &F,
                    SmallPtrSetImpl<const SCEV *> &Regs,
                    const DenseSet<const SCEV *> &VisitedRegs,
-                   const Loop *L,
-                   ScalarEvolution &SE, DominatorTree &DT,
                    const LSRUse &LU,
                    SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
 
@@ -1053,17 +1063,11 @@ public:
   void dump() const;
 
 private:
-  void RateRegister(const SCEV *Reg,
-                    SmallPtrSetImpl<const SCEV *> &Regs,
-                    const Loop *L,
-                    ScalarEvolution &SE, DominatorTree &DT,
-                    const TargetTransformInfo &TTI);
-  void RatePrimaryRegister(const SCEV *Reg,
+  void RateRegister(const Formula &F, const SCEV *Reg,
+                    SmallPtrSetImpl<const SCEV *> &Regs);
+  void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
                     SmallPtrSetImpl<const SCEV *> &Regs,
-                    const Loop *L,
-                    ScalarEvolution &SE, DominatorTree &DT,
-                    SmallPtrSetImpl<const SCEV *> *LoserRegs,
-                    const TargetTransformInfo &TTI);
+                    SmallPtrSetImpl<const SCEV *> *LoserRegs);
 };
 
 /// An operand value in an instruction which is to be replaced with some
@@ -1208,19 +1212,36 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
                                  bool HasBaseReg, int64_t Scale,
                                  Instruction *Fixup = nullptr);
 
+static unsigned getSetupCost(const SCEV *Reg, unsigned Depth) {
+  if (isa<SCEVUnknown>(Reg) || isa<SCEVConstant>(Reg))
+    return 1;
+  if (Depth == 0)
+    return 0;
+  if (const auto *S = dyn_cast<SCEVAddRecExpr>(Reg))
+    return getSetupCost(S->getStart(), Depth - 1);
+  if (auto S = dyn_cast<SCEVCastExpr>(Reg))
+    return getSetupCost(S->getOperand(), Depth - 1);
+  if (auto S = dyn_cast<SCEVNAryExpr>(Reg))
+    return std::accumulate(S->op_begin(), S->op_end(), 0,
+                           [&](unsigned i, const SCEV *Reg) {
+                             return i + getSetupCost(Reg, Depth - 1);
+                           });
+  if (auto S = dyn_cast<SCEVUDivExpr>(Reg))
+    return getSetupCost(S->getLHS(), Depth - 1) +
+           getSetupCost(S->getRHS(), Depth - 1);
+  return 0;
+}
+
 /// Tally up interesting quantities from the given register.
-void Cost::RateRegister(const SCEV *Reg,
-                        SmallPtrSetImpl<const SCEV *> &Regs,
-                        const Loop *L,
-                        ScalarEvolution &SE, DominatorTree &DT,
-                        const TargetTransformInfo &TTI) {
+void Cost::RateRegister(const Formula &F, const SCEV *Reg,
+                        SmallPtrSetImpl<const SCEV *> &Regs) {
   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
     // If this is an addrec for another loop, it should be an invariant
     // with respect to L since L is the innermost loop (at least
     // for now LSR only handles innermost loops).
     if (AR->getLoop() != L) {
       // If the AddRec exists, consider it's register free and leave it alone.
-      if (isExistingPhi(AR, SE))
+      if (isExistingPhi(AR, *SE))
         return;
 
       // It is bad to allow LSR for current loop to add induction variables
@@ -1236,16 +1257,24 @@ void Cost::RateRegister(const SCEV *Reg,
     }
 
     unsigned LoopCost = 1;
-    if (TTI.shouldFavorPostInc()) {
-      const SCEV *LoopStep = AR->getStepRecurrence(SE);
-      if (isa<SCEVConstant>(LoopStep)) {
-        // Check if a post-indexed load/store can be used.
-        if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
-            TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
+    if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
+        TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
+
+      // If the step size matches the base offset, we could use pre-indexed
+      // addressing.
+      if (TTI->shouldFavorBackedgeIndex(L)) {
+        if (auto *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE)))
+          if (Step->getAPInt() == F.BaseOffset)
+            LoopCost = 0;
+      }
+
+      if (TTI->shouldFavorPostInc()) {
+        const SCEV *LoopStep = AR->getStepRecurrence(*SE);
+        if (isa<SCEVConstant>(LoopStep)) {
           const SCEV *LoopStart = AR->getStart();
           if (!isa<SCEVConstant>(LoopStart) &&
-              SE.isLoopInvariant(LoopStart, L))
-            LoopCost = 0;
+              SE->isLoopInvariant(LoopStart, L))
+            LoopCost = 0;
         }
       }
     }
@@ -1255,7 +1284,7 @@ void Cost::RateRegister(const SCEV *Reg,
     // TODO: The non-affine case isn't precisely modeled here.
     if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
       if (!Regs.count(AR->getOperand(1))) {
-        RateRegister(AR->getOperand(1), Regs, L, SE, DT, TTI);
+        RateRegister(F, AR->getOperand(1), Regs);
         if (isLoser())
           return;
       }
@@ -1265,43 +1294,34 @@ void Cost::RateRegister(const SCEV *Reg,
 
   // Rough heuristic; favor registers which don't require extra setup
   // instructions in the preheader.
-  if (!isa<SCEVUnknown>(Reg) &&
-      !isa<SCEVConstant>(Reg) &&
-      !(isa<SCEVAddRecExpr>(Reg) &&
-        (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) ||
-         isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
-    ++C.SetupCost;
+  C.SetupCost += getSetupCost(Reg, SetupCostDepthLimit);
+  // Ensure we don't, even with the recusion limit, produce invalid costs.
+  C.SetupCost = std::min<unsigned>(C.SetupCost, 1 << 16);
 
   C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
-               SE.hasComputableLoopEvolution(Reg, L);
+                 SE->hasComputableLoopEvolution(Reg, L);
 }
 
 /// Record this register in the set. If we haven't seen it before, rate
 /// it. Optional LoserRegs provides a way to declare any formula that refers to
 /// one of those regs an instant loser.
-void Cost::RatePrimaryRegister(const SCEV *Reg,
+void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
                                SmallPtrSetImpl<const SCEV *> &Regs,
-                               const Loop *L,
-                               ScalarEvolution &SE, DominatorTree &DT,
-                               SmallPtrSetImpl<const SCEV *> *LoserRegs,
-                               const TargetTransformInfo &TTI) {
+                               SmallPtrSetImpl<const SCEV *> *LoserRegs) {
   if (LoserRegs && LoserRegs->count(Reg)) {
     Lose();
     return;
   }
   if (Regs.insert(Reg).second) {
-    RateRegister(Reg, Regs, L, SE, DT, TTI);
+    RateRegister(F, Reg, Regs);
    if (LoserRegs && isLoser())
       LoserRegs->insert(Reg);
   }
 }
 
-void Cost::RateFormula(const TargetTransformInfo &TTI,
-                       const Formula &F,
+void Cost::RateFormula(const Formula &F,
                        SmallPtrSetImpl<const SCEV *> &Regs,
                        const DenseSet<const SCEV *> &VisitedRegs,
-                       const Loop *L,
-                       ScalarEvolution &SE, DominatorTree &DT,
                        const LSRUse &LU,
                        SmallPtrSetImpl<const SCEV *> *LoserRegs) {
   assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
@@ -1314,7 +1334,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
       Lose();
       return;
     }
-    RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs, TTI);
+    RatePrimaryRegister(F, ScaledReg, Regs, LoserRegs);
     if (isLoser())
       return;
   }
@@ -1323,7 +1343,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
       Lose();
       return;
     }
-    RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs, TTI);
+    RatePrimaryRegister(F, BaseReg, Regs, LoserRegs);
     if (isLoser())
       return;
   }
@@ -1334,11 +1354,11 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
   // Do not count the base and a possible second register if the target
   // allows to fold 2 registers.
   C.NumBaseAdds +=
-      NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(TTI, LU, F)));
+      NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(*TTI, LU, F)));
   C.NumBaseAdds += (F.UnfoldedOffset != 0);
 
   // Accumulate non-free scaling amounts.
-  C.ScaleCost += getScalingFactorCost(TTI, LU, F, *L);
+  C.ScaleCost += getScalingFactorCost(*TTI, LU, F, *L);
 
   // Tally up the non-zero immediates.
   for (const LSRFixup &Fixup : LU.Fixups) {
@@ -1353,7 +1373,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
     // Check with target if this offset with this instruction is
     // specifically not supported.
     if (LU.Kind == LSRUse::Address && Offset != 0 &&
-        !isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
+        !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
                               Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
       C.NumBaseAdds++;
   }
@@ -1366,7 +1386,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
 
     // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
     // additional instruction (at least fill).
-    unsigned TTIRegNum = TTI.getNumberOfRegisters(false) - 1;
+    unsigned TTIRegNum = TTI->getNumberOfRegisters(false) - 1;
     if (C.NumRegs > TTIRegNum) {
       // Cost already exceeded TTIRegNum, then only newly added register can add
       // new instructions.
@@ -1386,7 +1406,8 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
     //
     // For {-10, +, 1}:
     // i = i + 1;
-    if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() && !TTI.canMacroFuseCmp())
+    if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() &&
+        !TTI->canMacroFuseCmp())
       C.Insns++;
     // Each new AddRec adds 1 instruction to calculation.
     C.Insns += (C.AddRecCost - PrevAddRecCost);
@@ -1410,11 +1431,11 @@ void Cost::Lose() {
 }
 
 /// Choose the lower cost.
-bool Cost::isLess(Cost &Other, const TargetTransformInfo &TTI) {
+bool Cost::isLess(Cost &Other) {
   if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
       C.Insns != Other.C.Insns)
     return C.Insns < Other.C.Insns;
-  return TTI.isLSRCostLess(C, Other.C);
+  return TTI->isLSRCostLess(C, Other.C);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1888,8 +1909,11 @@ class LSRInstance {
   ScalarEvolution &SE;
   DominatorTree &DT;
   LoopInfo &LI;
+  AssumptionCache &AC;
+  TargetLibraryInfo &LibInfo;
   const TargetTransformInfo &TTI;
   Loop *const L;
+  bool FavorBackedgeIndex = false;
   bool Changed = false;
 
   /// This is the insert position that the current loop's induction variable
@@ -1910,7 +1934,7 @@ class LSRInstance {
   SmallSetVector<Type *, 4> Types;
 
   /// The list of interesting uses.
-  SmallVector<LSRUse, 16> Uses;
+  mutable SmallVector<LSRUse, 16> Uses;
 
   /// Track which uses use which register candidates.
   RegUseTracker RegUses;
@@ -2025,7 +2049,8 @@ class LSRInstance {
 
 public:
   LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
-              LoopInfo &LI, const TargetTransformInfo &TTI);
+              LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC,
+              TargetLibraryInfo &LibInfo);
 
   bool getChanged() const { return Changed; }
 
@@ -2804,7 +2829,7 @@ bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
 /// TODO: Consider IVInc free if it's already used in another chains.
 static bool isProfitableChain(IVChain &Chain,
                               SmallPtrSetImpl<Instruction*> &Users,
-                              ScalarEvolution &SE, const TargetTransformInfo &TTI) {
+                              ScalarEvolution &SE) {
   if (StressIVChain)
     return true;
 
@@ -3064,7 +3089,7 @@ void LSRInstance::CollectChains() {
   for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
        UsersIdx < NChains; ++UsersIdx) {
     if (!isProfitableChain(IVChainVec[UsersIdx],
-                           ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
+                           ChainUsersVec[UsersIdx].FarUsers, SE))
       continue;
     // Preserve the chain at UsesIdx.
     if (ChainIdx != UsersIdx)
@@ -3078,7 +3103,7 @@ void LSRInstance::CollectChains() {
 void LSRInstance::FinalizeChain(IVChain &Chain) {
   assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
   LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
-  
+
   for (const IVInc &Inc : Chain) {
     LLVM_DEBUG(dbgs() << "        Inc: " << *Inc.UserInst << "\n");
     auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
@@ -3100,7 +3125,7 @@ static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
   MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
   int64_t IncOffset = IncConst->getValue()->getSExtValue();
   if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
-                        IncOffset, /*HaseBaseReg=*/false))
+                        IncOffset, /*HasBaseReg=*/false))
     return false;
 
   return true;
@@ -3210,6 +3235,9 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
 }
 
 void LSRInstance::CollectFixupsAndInitialFormulae() {
+  BranchInst *ExitBranch = nullptr;
+  bool SaveCmp = TTI.canSaveCmp(L, &ExitBranch, &SE, &LI, &DT, &AC, &LibInfo);
+
   for (const IVStrideUse &U : IU) {
     Instruction *UserInst = U.getUser();
     // Skip IV users that are part of profitable IV Chains.
@@ -3239,6 +3267,10 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
       // equality icmps, thanks to IndVarSimplify.
       if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst))
         if (CI->isEquality()) {
+          // If CI can be saved in some target, like replaced inside hardware loop
+          // in PowerPC, no need to generate initial formulae for it.
+          if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition()))
+            continue;
           // Swap the operands if needed to put the OperandValToReplace on the
           // left, for consistency.
           Value *NV = CI->getOperand(1);
@@ -3738,10 +3770,11 @@ void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
 void LSRInstance::GenerateConstantOffsetsImpl(
     LSRUse &LU, unsigned LUIdx, const Formula &Base,
     const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) {
-  const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
-  for (int64_t Offset : Worklist) {
+
+  auto GenerateOffset = [&](const SCEV *G, int64_t Offset) {
     Formula F = Base;
     F.BaseOffset = (uint64_t)Base.BaseOffset - Offset;
+
     if (isLegalUse(TTI, LU.MinOffset - Offset, LU.MaxOffset - Offset, LU.Kind,
                    LU.AccessTy, F)) {
       // Add the offset to the base register.
@@ -3761,7 +3794,35 @@ void LSRInstance::GenerateConstantOffsetsImpl(
 
       (void)InsertFormula(LU, LUIdx, F);
     }
+  };
+
+  const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
+
+  // With constant offsets and constant steps, we can generate pre-inc
+  // accesses by having the offset equal the step. So, for access #0 with a
+  // step of 8, we generate a G - 8 base which would require the first access
+  // to be ((G - 8) + 8),+,8. The pre-indexed access then updates the pointer
+  // for itself and hopefully becomes the base for other accesses. This means
+  // means that a single pre-indexed access can be generated to become the new
+  // base pointer for each iteration of the loop, resulting in no extra add/sub
+  // instructions for pointer updating.
+  if (FavorBackedgeIndex && LU.Kind == LSRUse::Address) {
+    if (auto *GAR = dyn_cast<SCEVAddRecExpr>(G)) {
+      if (auto *StepRec =
+          dyn_cast<SCEVConstant>(GAR->getStepRecurrence(SE))) {
+        const APInt &StepInt = StepRec->getAPInt();
+        int64_t Step = StepInt.isNegative() ?
+          StepInt.getSExtValue() : StepInt.getZExtValue();
+
+        for (int64_t Offset : Worklist) {
+          Offset -= Step;
+          GenerateOffset(G, Offset);
+        }
+      }
+    }
   }
+  for (int64_t Offset : Worklist)
+    GenerateOffset(G, Offset);
 
   int64_t Imm = ExtractImmediate(G, SE);
   if (G->isZero() || Imm == 0)
@@ -3968,9 +4029,27 @@ void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
     if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
       Formula F = Base;
 
-      if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, SrcTy);
-      for (const SCEV *&BaseReg : F.BaseRegs)
-        BaseReg = SE.getAnyExtendExpr(BaseReg, SrcTy);
+      // Sometimes SCEV is able to prove zero during ext transform. It may
+      // happen if SCEV did not do all possible transforms while creating the
+      // initial node (maybe due to depth limitations), but it can do them while
+      // taking ext.
+      if (F.ScaledReg) {
+        const SCEV *NewScaledReg = SE.getAnyExtendExpr(F.ScaledReg, SrcTy);
+        if (NewScaledReg->isZero())
+          continue;
+        F.ScaledReg = NewScaledReg;
+      }
+      bool HasZeroBaseReg = false;
+      for (const SCEV *&BaseReg : F.BaseRegs) {
+        const SCEV *NewBaseReg = SE.getAnyExtendExpr(BaseReg, SrcTy);
+        if (NewBaseReg->isZero()) {
+          HasZeroBaseReg = true;
+          break;
+        }
+        BaseReg = NewBaseReg;
+      }
+      if (HasZeroBaseReg)
+        continue;
 
       // TODO: This assumes we've done basic processing on all uses and
       // have an idea what the register usage is.
@@ -4067,11 +4146,17 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
       // Conservatively examine offsets between this orig reg a few selected
       // other orig regs.
+      int64_t First = Imms.begin()->first;
+      int64_t Last = std::prev(Imms.end())->first;
+      // Compute (First + Last) / 2 without overflow using the fact that
+      // First + Last = 2 * (First & Last) + (First ^ Last).
+      int64_t Avg = (First & Last) + ((First ^ Last) >> 1);
+      // If the result is negative and First is odd and Last even (or vice versa),
+      // we rounded towards -inf. Add 1 in that case, to round towards 0.
+      Avg = Avg + ((First ^ Last) & ((uint64_t)Avg >> 63));
       ImmMapTy::const_iterator OtherImms[] = {
-          Imms.begin(), std::prev(Imms.end()),
-          Imms.lower_bound((Imms.begin()->first + std::prev(Imms.end())->first) /
-                           2)
-      };
+          Imms.begin(), std::prev(Imms.end()),
+          Imms.lower_bound(Avg)};
       for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
         ImmMapTy::const_iterator M = OtherImms[i];
         if (M == J || M == JE) continue;
@@ -4249,9 +4334,9 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
       // avoids the need to recompute this information across formulae using the
       // same bad AddRec. Passing LoserRegs is also essential unless we remove
      // the corresponding bad register from the Regs set.
-      Cost CostF;
+      Cost CostF(L, SE, TTI);
       Regs.clear();
-      CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, SE, DT, LU, &LoserRegs);
+      CostF.RateFormula(F, Regs, VisitedRegs, LU, &LoserRegs);
       if (CostF.isLoser()) {
         // During initial formula generation, undesirable formulae are generated
         // by uses within other loops that have some non-trivial address mode or
@@ -4282,10 +4367,10 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
 
       Formula &Best = LU.Formulae[P.first->second];
 
-      Cost CostBest;
+      Cost CostBest(L, SE, TTI);
       Regs.clear();
-      CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, SE, DT, LU);
-      if (CostF.isLess(CostBest, TTI))
+      CostBest.RateFormula(Best, Regs, VisitedRegs, LU);
+      if (CostF.isLess(CostBest))
         std::swap(F, Best);
       LLVM_DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
                  dbgs() << "\n"
@@ -4357,7 +4442,9 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
                I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
           if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
             Formula NewF = F;
-            NewF.BaseOffset += C->getValue()->getSExtValue();
+            //FIXME: Formulas should store bitwidth to do wrapping properly.
+            // See PR41034.
+            NewF.BaseOffset += (uint64_t)C->getValue()->getSExtValue();
             NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
                                 (I - F.BaseRegs.begin()));
             if (LU.HasFormulaWithSameRegs(NewF)) {
@@ -4400,7 +4487,7 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
 /// When there are many registers for expressions like A, A+1, A+2, etc.,
 /// allocate a single register for them.
 void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
-  if (EstimateSearchSpaceComplexity() < ComplexityLimit) 
+  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
     return;
 
   LLVM_DEBUG(
@@ -4533,12 +4620,13 @@ void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
     // If the new register numbers are the same, choose the Formula with
     // less Cost.
-    Cost CostFA, CostFB;
+    Cost CostFA(L, SE, TTI);
+    Cost CostFB(L, SE, TTI);
     Regs.clear();
-    CostFA.RateFormula(TTI, FA, Regs, VisitedRegs, L, SE, DT, LU);
+    CostFA.RateFormula(FA, Regs, VisitedRegs, LU);
     Regs.clear();
-    CostFB.RateFormula(TTI, FB, Regs, VisitedRegs, L, SE, DT, LU);
-    return CostFA.isLess(CostFB, TTI);
+    CostFB.RateFormula(FB, Regs, VisitedRegs, LU);
+    return CostFA.isLess(CostFB);
   };
 
   bool Any = false;
@@ -4824,7 +4912,7 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
       ReqRegs.insert(S);
 
   SmallPtrSet<const SCEV *, 16> NewRegs;
-  Cost NewCost;
+  Cost NewCost(L, SE, TTI);
   for (const Formula &F : LU.Formulae) {
     // Ignore formulae which may not be ideal in terms of register reuse of
     // ReqRegs.  The formula should use all required registers before
@@ -4848,8 +4936,8 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
     // the current best, prune the search at that point.
     NewCost = CurCost;
     NewRegs = CurRegs;
-    NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, SE, DT, LU);
-    if (NewCost.isLess(SolutionCost, TTI)) {
+    NewCost.RateFormula(F, NewRegs, VisitedRegs, LU);
+    if (NewCost.isLess(SolutionCost)) {
       Workspace.push_back(&F);
       if (Workspace.size() != Uses.size()) {
         SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
@@ -4858,9 +4946,9 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
         VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
       } else {
         LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
-                   dbgs() << ".\n Regs:"; for (const SCEV *S
-                                               : NewRegs) dbgs()
-                                          << ' ' << *S;
+                   dbgs() << ".\nRegs:\n";
+                   for (const SCEV *S : NewRegs) dbgs()
+                      << "- " << *S << "\n";
                    dbgs() << '\n');
 
         SolutionCost = NewCost;
@@ -4875,9 +4963,9 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
 /// vector.
 void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
   SmallVector<const Formula *, 8> Workspace;
-  Cost SolutionCost;
+  Cost SolutionCost(L, SE, TTI);
   SolutionCost.Lose();
-  Cost CurCost;
+  Cost CurCost(L, SE, TTI);
   SmallPtrSet<const SCEV *, 16> CurRegs;
   DenseSet<const SCEV *> VisitedRegs;
   Workspace.reserve(Uses.size());
@@ -5215,6 +5303,7 @@ void LSRInstance::RewriteForPHI(
   DenseMap<BasicBlock *, Value *> Inserted;
   for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
     if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
+      bool needUpdateFixups = false;
       BasicBlock *BB = PN->getIncomingBlock(i);
 
       // If this is a critical edge, split the edge so that we do not insert
@@ -5233,7 +5322,7 @@ void LSRInstance::RewriteForPHI(
           NewBB = SplitCriticalEdge(BB, Parent,
                                     CriticalEdgeSplittingOptions(&DT, &LI)
                                         .setMergeIdenticalEdges()
-                                        .setDontDeleteUselessPHIs());
+                                        .setKeepOneInputPHIs());
         } else {
           SmallVector<BasicBlock*, 2> NewBBs;
           SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DT, &LI);
@@ -5253,6 +5342,8 @@ void LSRInstance::RewriteForPHI(
           e = PN->getNumIncomingValues();
           BB = NewBB;
           i = PN->getBasicBlockIndex(BB);
+
+          needUpdateFixups = true;
         }
       }
 
@@ -5277,6 +5368,44 @@ void LSRInstance::RewriteForPHI(
         PN->setIncomingValue(i, FullV);
         Pair.first->second = FullV;
       }
+
+      // If LSR splits critical edge and phi node has other pending
+      // fixup operands, we need to update those pending fixups. Otherwise
+      // formulae will not be implemented completely and some instructions
+      // will not be eliminated.
+      if (needUpdateFixups) {
+        for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
+          for (LSRFixup &Fixup : Uses[LUIdx].Fixups)
+            // If fixup is supposed to rewrite some operand in the phi
+            // that was just updated, it may be already moved to
+            // another phi node. Such fixup requires update.
+            if (Fixup.UserInst == PN) {
+              // Check if the operand we try to replace still exists in the
+              // original phi.
+              bool foundInOriginalPHI = false;
+              for (const auto &val : PN->incoming_values())
+                if (val == Fixup.OperandValToReplace) {
+                  foundInOriginalPHI = true;
+                  break;
+                }
+
+              // If fixup operand found in original PHI - nothing to do.
+              if (foundInOriginalPHI)
+                continue;
+
+              // Otherwise it might be moved to another PHI and requires update.
+              // If fixup operand not found in any of the incoming blocks that
+              // means we have already rewritten it - nothing to do.
+              for (const auto &Block : PN->blocks())
+                for (BasicBlock::iterator I = Block->begin(); isa<PHINode>(I);
+                     ++I) {
+                  PHINode *NewPN = cast<PHINode>(I);
+                  for (const auto &val : NewPN->incoming_values())
+                    if (val == Fixup.OperandValToReplace)
+                      Fixup.UserInst = NewPN;
+                }
+            }
+      }
     }
 }
 
@@ -5360,8 +5489,11 @@ void LSRInstance::ImplementSolution(
 
 LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
                          DominatorTree &DT, LoopInfo &LI,
-                         const TargetTransformInfo &TTI)
-    : IU(IU), SE(SE), DT(DT), LI(LI), TTI(TTI), L(L) {
+                         const TargetTransformInfo &TTI, AssumptionCache &AC,
+                         TargetLibraryInfo &LibInfo)
+    : IU(IU), SE(SE), DT(DT), LI(LI), AC(AC), LibInfo(LibInfo), TTI(TTI), L(L),
+      FavorBackedgeIndex(EnableBackedgeIndexing &&
+                         TTI.shouldFavorBackedgeIndex(L)) {
   // If LoopSimplify form is not available, stay out of trouble.
   if (!L->isLoopSimplifyForm())
     return;
@@ -5556,6 +5688,8 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addPreserved<DominatorTreeWrapperPass>();
   AU.addRequired<ScalarEvolutionWrapperPass>();
   AU.addPreserved<ScalarEvolutionWrapperPass>();
+  AU.addRequired<AssumptionCacheTracker>();
+  AU.addRequired<TargetLibraryInfoWrapperPass>();
   // Requiring LoopSimplify a second time here prevents IVUsers from running
   // twice, since LoopSimplify was invalidated by running ScalarEvolution.
   AU.addRequiredID(LoopSimplifyID);
@@ -5566,11 +5700,14 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
 
 static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
                                DominatorTree &DT, LoopInfo &LI,
-                               const TargetTransformInfo &TTI) {
+                               const TargetTransformInfo &TTI,
+                               AssumptionCache &AC,
+                               TargetLibraryInfo &LibInfo) {
+
   bool Changed = false;
 
   // Run the main LSR transformation.
-  Changed |= LSRInstance(L, IU, SE, DT, LI, TTI).getChanged();
+  Changed |= LSRInstance(L, IU, SE, DT, LI, TTI, AC, LibInfo).getChanged();
 
   // Remove any extra phis created by processing inner loops.
   Changed |= DeleteDeadPHIs(L->getHeader());
 
@@ -5601,14 +5738,17 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
   auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
       *L->getHeader()->getParent());
-  return ReduceLoopStrength(L, IU, SE, DT, LI, TTI);
+  auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+      *L->getHeader()->getParent());
+  auto &LibInfo = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, LibInfo);
 }
 
 PreservedAnalyses LoopStrengthReducePass::run(Loop &L, LoopAnalysisManager &AM,
                                               LoopStandardAnalysisResults &AR,
                                               LPMUpdater &) {
   if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE,
-                          AR.DT, AR.LI, AR.TTI))
+                          AR.DT, AR.LI, AR.TTI, AR.AC, AR.TLI))
     return PreservedAnalyses::all();
 
   return getLoopPassPreservedAnalyses();
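
A note on the overflow-safe midpoint added in GenerateCrossUseConstantOffsets() above: computing (First + Last) / 2 directly can overflow when both immediates sit near the int64 limits, so the patch decomposes the sum into the bits the two values share (counted at full weight) and the bits where they differ (counted at half weight, which the shift cannot overflow). A minimal standalone sketch of the same trick — the function name is ours, not LSR's:

#include <cassert>
#include <cstdint>

// Overflow-free signed midpoint: First + Last == 2 * (First & Last)
// + (First ^ Last), so shared bits contribute at full weight and
// differing bits at half weight via the shift.
int64_t midpoint(int64_t First, int64_t Last) {
  int64_t Avg = (First & Last) + ((First ^ Last) >> 1);
  // The arithmetic shift rounds towards -inf; when the result is negative
  // and the true sum is odd, add 1 to round towards zero instead, matching
  // C++ signed division.
  Avg += (First ^ Last) & ((uint64_t)Avg >> 63);
  return Avg;
}

int main() {
  assert(midpoint(4, 8) == 6);
  assert(midpoint(-7, 2) == -2);                               // rounds towards zero
  assert(midpoint(INT64_MAX, INT64_MAX - 2) == INT64_MAX - 1); // no overflow
}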
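The new getSetupCost() likewise deserves a gloss: it replaces the old one-shot heuristic with a depth-limited walk over the SCEV tree, where leaves (SCEVUnknown/SCEVConstant) cost 1, n-ary nodes fold their operands' costs with std::accumulate, and anything past the recursion budget is optimistically treated as free; RateRegister() then clamps the running total to 1 << 16. A hedged sketch of the same fold over a toy expression type — Expr and setupCost are illustrative stand-ins, not LLVM's SCEV API:

#include <memory>
#include <numeric>
#include <vector>

// Toy stand-in for a SCEV expression: no operands means a leaf.
struct Expr {
  std::vector<std::shared_ptr<Expr>> Ops;
};

// Depth-limited cost fold in the shape of getSetupCost() above: a leaf
// costs one setup instruction in the preheader, an n-ary node accumulates
// its operands' costs, and an exhausted recursion budget is treated as
// free — the caller is expected to clamp the total, as RateRegister() does.
unsigned setupCost(const Expr &E, unsigned Depth) {
  if (E.Ops.empty())
    return 1;
  if (Depth == 0)
    return 0;
  return std::accumulate(E.Ops.begin(), E.Ops.end(), 0u,
                         [&](unsigned Acc, const std::shared_ptr<Expr> &Op) {
                           return Acc + setupCost(*Op, Depth - 1);
                         });
}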
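Finally, the backedge-indexing comment in GenerateConstantOffsetsImpl() is easiest to see on a concrete stride: biasing the base down by one step turns every access into "bump the pointer by the step, then load through it", which targets with pre-indexed addressing (e.g. ARM's ldr r0, [r1, #8]!) fold into a single memop, leaving no separate pointer-update add in the loop. A hypothetical plain-C++ illustration — our names, not LSR output, with the bias kept as an index to stay within defined pointer arithmetic:

#include <cstddef>
#include <cstdint>

int64_t sumStride(const int64_t *G, size_t N) {
  // Base conceptually rewritten to (G - step); the first bump lands on
  // element 0, i.e. the ((G - 8) + 8),+,8 recurrence from the comment above.
  ptrdiff_t Biased = -1;
  int64_t Sum = 0;
  for (size_t I = 0; I != N; ++I) {
    Biased += 1;       // pre-index: the pointer update...
    Sum += G[Biased];  // ...and the access it is folded into
  }
  return Sum;
}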