| field | value | date |
|---|---|---|
| author | Dimitry Andric <dim@FreeBSD.org> | 2021-12-25 22:36:56 +0000 |
| committer | Dimitry Andric <dim@FreeBSD.org> | 2022-05-14 11:44:01 +0000 |
| commit | 0eae32dcef82f6f06de6419a0d623d7def0cc8f6 (patch) | |
| tree | 55b7e05be47b835fd137915bee1e64026c35e71c /contrib/llvm-project/llvm/lib/Transforms/Vectorize | |
| parent | 4824e7fd18a1223177218d4aec1b3c6c5c4a444e (diff) | |
| parent | 77fc4c146f0870ffb09c1afb823ccbe742c5e6ff (diff) | |
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize')
12 files changed, 912 insertions, 551 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 805011191da0..81e5aa223c07 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -55,22 +55,23 @@ static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold( cl::desc("The maximum number of SCEV checks allowed with a " "vectorize(enable) pragma")); -// FIXME: When scalable vectorization is stable enough, change the default -// to SK_PreferFixedWidth. -static cl::opt<LoopVectorizeHints::ScalableForceKind> ScalableVectorization( - "scalable-vectorization", cl::init(LoopVectorizeHints::SK_FixedWidthOnly), - cl::Hidden, - cl::desc("Control whether the compiler can use scalable vectors to " - "vectorize a loop"), - cl::values( - clEnumValN(LoopVectorizeHints::SK_FixedWidthOnly, "off", - "Scalable vectorization is disabled."), - clEnumValN(LoopVectorizeHints::SK_PreferFixedWidth, "on", - "Scalable vectorization is available, but favor fixed-width " - "vectorization when the cost is inconclusive."), - clEnumValN(LoopVectorizeHints::SK_PreferScalable, "preferred", - "Scalable vectorization is available and favored when the " - "cost is inconclusive."))); +static cl::opt<LoopVectorizeHints::ScalableForceKind> + ForceScalableVectorization( + "scalable-vectorization", cl::init(LoopVectorizeHints::SK_Unspecified), + cl::Hidden, + cl::desc("Control whether the compiler can use scalable vectors to " + "vectorize a loop"), + cl::values( + clEnumValN(LoopVectorizeHints::SK_FixedWidthOnly, "off", + "Scalable vectorization is disabled."), + clEnumValN( + LoopVectorizeHints::SK_PreferScalable, "preferred", + "Scalable vectorization is available and favored when the " + "cost is inconclusive."), + clEnumValN( + LoopVectorizeHints::SK_PreferScalable, "on", + "Scalable vectorization is available and favored when the " + "cost is inconclusive."))); /// Maximum vectorization interleave count. static const unsigned MaxInterleaveFactor = 16; @@ -95,7 +96,8 @@ bool LoopVectorizeHints::Hint::validate(unsigned Val) { LoopVectorizeHints::LoopVectorizeHints(const Loop *L, bool InterleaveOnlyWhenForced, - OptimizationRemarkEmitter &ORE) + OptimizationRemarkEmitter &ORE, + const TargetTransformInfo *TTI) : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH), Interleave("interleave.count", InterleaveOnlyWhenForced, HK_INTERLEAVE), Force("vectorize.enable", FK_Undefined, HK_FORCE), @@ -110,14 +112,32 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L, if (VectorizerParams::isInterleaveForced()) Interleave.Value = VectorizerParams::VectorizationInterleave; + // If the metadata doesn't explicitly specify whether to enable scalable + // vectorization, then decide based on the following criteria (increasing + // level of priority): + // - Target default + // - Metadata width + // - Force option (always overrides) + if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified) { + if (TTI) + Scalable.Value = TTI->enableScalableVectorization() ? SK_PreferScalable + : SK_FixedWidthOnly; + + if (Width.Value) + // If the width is set, but the metadata says nothing about the scalable + // property, then assume it concerns only a fixed-width UserVF. + // If width is not set, the flag takes precedence. 
+ Scalable.Value = SK_FixedWidthOnly; + } + + // If the flag is set to force any use of scalable vectors, override the loop + // hints. + if (ForceScalableVectorization.getValue() != + LoopVectorizeHints::SK_Unspecified) + Scalable.Value = ForceScalableVectorization.getValue(); + + // Scalable vectorization is disabled if no preference is specified. if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified) - // If the width is set, but the metadata says nothing about the scalable - // property, then assume it concerns only a fixed-width UserVF. - // If width is not set, the flag takes precedence. - Scalable.Value = Width.Value ? SK_FixedWidthOnly : ScalableVectorization; - else if (ScalableVectorization == SK_FixedWidthOnly) - // If the flag is set to disable any use of scalable vectors, override the - // loop hint. Scalable.Value = SK_FixedWidthOnly; if (IsVectorized.Value != 1) @@ -929,7 +949,7 @@ bool LoopVectorizationLegality::canVectorizeFPMath( })); } -bool LoopVectorizationLegality::isInductionPhi(const Value *V) { +bool LoopVectorizationLegality::isInductionPhi(const Value *V) const { Value *In0 = const_cast<Value *>(V); PHINode *PN = dyn_cast_or_null<PHINode>(In0); if (!PN) @@ -938,16 +958,29 @@ bool LoopVectorizationLegality::isInductionPhi(const Value *V) { return Inductions.count(PN); } -bool LoopVectorizationLegality::isCastedInductionVariable(const Value *V) { +const InductionDescriptor * +LoopVectorizationLegality::getIntOrFpInductionDescriptor(PHINode *Phi) const { + if (!isInductionPhi(Phi)) + return nullptr; + auto &ID = getInductionVars().find(Phi)->second; + if (ID.getKind() == InductionDescriptor::IK_IntInduction || + ID.getKind() == InductionDescriptor::IK_FpInduction) + return &ID; + return nullptr; +} + +bool LoopVectorizationLegality::isCastedInductionVariable( + const Value *V) const { auto *Inst = dyn_cast<Instruction>(V); return (Inst && InductionCastsToIgnore.count(Inst)); } -bool LoopVectorizationLegality::isInductionVariable(const Value *V) { +bool LoopVectorizationLegality::isInductionVariable(const Value *V) const { return isInductionPhi(V) || isCastedInductionVariable(V); } -bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) { +bool LoopVectorizationLegality::isFirstOrderRecurrence( + const PHINode *Phi) const { return FirstOrderRecurrences.count(Phi); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index a7d6609f8c56..71eb39a18d2f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -45,16 +45,17 @@ class VPBuilder { VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator(); VPInstruction *createInstruction(unsigned Opcode, - ArrayRef<VPValue *> Operands) { - VPInstruction *Instr = new VPInstruction(Opcode, Operands); + ArrayRef<VPValue *> Operands, DebugLoc DL) { + VPInstruction *Instr = new VPInstruction(Opcode, Operands, DL); if (BB) BB->insert(Instr, InsertPt); return Instr; } VPInstruction *createInstruction(unsigned Opcode, - std::initializer_list<VPValue *> Operands) { - return createInstruction(Opcode, ArrayRef<VPValue *>(Operands)); + std::initializer_list<VPValue *> Operands, + DebugLoc DL) { + return createInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL); } public: @@ -123,30 +124,33 @@ public: /// its underlying Instruction. 
VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands, Instruction *Inst = nullptr) { - VPInstruction *NewVPInst = createInstruction(Opcode, Operands); + DebugLoc DL; + if (Inst) + DL = Inst->getDebugLoc(); + VPInstruction *NewVPInst = createInstruction(Opcode, Operands, DL); NewVPInst->setUnderlyingValue(Inst); return NewVPInst; } - VPValue *createNaryOp(unsigned Opcode, - std::initializer_list<VPValue *> Operands, - Instruction *Inst = nullptr) { - return createNaryOp(Opcode, ArrayRef<VPValue *>(Operands), Inst); + VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands, + DebugLoc DL) { + return createInstruction(Opcode, Operands, DL); } - VPValue *createNot(VPValue *Operand) { - return createInstruction(VPInstruction::Not, {Operand}); + VPValue *createNot(VPValue *Operand, DebugLoc DL) { + return createInstruction(VPInstruction::Not, {Operand}, DL); } - VPValue *createAnd(VPValue *LHS, VPValue *RHS) { - return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}); + VPValue *createAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL) { + return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}, DL); } - VPValue *createOr(VPValue *LHS, VPValue *RHS) { - return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS}); + VPValue *createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL) { + return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS}, DL); } - VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal) { - return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}); + VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, + DebugLoc DL) { + return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}, DL); } //===--------------------------------------------------------------------===// diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 5ca0adb4242c..4747f34fcc62 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -428,6 +428,8 @@ class GeneratedRTChecks; namespace llvm { +AnalysisKey ShouldRunExtraVectorPasses::Key; + /// InnerLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). /// This class performs the widening of scalars into vectors, or multiple @@ -506,8 +508,8 @@ public: /// Widen an integer or floating-point induction variable \p IV. If \p Trunc /// is provided, the integer induction variable will first be truncated to /// the corresponding type. - void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc, - VPValue *Def, VPValue *CastDef, + void widenIntOrFpInduction(PHINode *IV, const InductionDescriptor &ID, + Value *Start, TruncInst *Trunc, VPValue *Def, VPTransformState &State); /// Construct the vector value of a scalarized value \p V one lane at a time. @@ -534,7 +536,7 @@ public: /// Returns true if the reordering of FP operations is not allowed, but we are /// able to vectorize with strict in-order reductions for the given RdxDesc. - bool useOrderedReductions(RecurrenceDescriptor &RdxDesc); + bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc); /// Create a broadcast instruction. This method generates a broadcast /// instruction (shuffle) for loop invariant values and for the induction @@ -619,7 +621,7 @@ protected: /// can also be a truncate instruction. 
void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, const InductionDescriptor &ID, VPValue *Def, - VPValue *CastDef, VPTransformState &State); + VPTransformState &State); /// Create a vector induction phi node based on an existing scalar one. \p /// EntryVal is the value from the original loop that maps to the vector phi @@ -629,7 +631,6 @@ protected: void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, Value *Step, Value *Start, Instruction *EntryVal, VPValue *Def, - VPValue *CastDef, VPTransformState &State); /// Returns true if an instruction \p I should be scalarized instead of @@ -639,29 +640,6 @@ protected: /// Returns true if we should generate a scalar version of \p IV. bool needsScalarInduction(Instruction *IV) const; - /// If there is a cast involved in the induction variable \p ID, which should - /// be ignored in the vectorized loop body, this function records the - /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the - /// cast. We had already proved that the casted Phi is equal to the uncasted - /// Phi in the vectorized loop (under a runtime guard), and therefore - /// there is no need to vectorize the cast - the same value can be used in the - /// vector loop for both the Phi and the cast. - /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified, - /// Otherwise, \p VectorLoopValue is a widened/vectorized value. - /// - /// \p EntryVal is the value from the original loop that maps to the vector - /// phi node and is used to distinguish what is the IV currently being - /// processed - original one (if \p EntryVal is a phi corresponding to the - /// original IV) or the "newly-created" one based on the proof mentioned above - /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the - /// latter case \p EntryVal is a TruncInst and we must not record anything for - /// that IV, but it's error-prone to expect callers of this routine to care - /// about that, hence this explicit parameter. - void recordVectorLoopValueForInductionCast( - const InductionDescriptor &ID, const Instruction *EntryVal, - Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State, - unsigned Part, unsigned Lane = UINT_MAX); - /// Generate a shuffle sequence that will reverse the vector Vec. virtual Value *reverseVector(Value *Vec); @@ -698,7 +676,8 @@ protected: /// flags, which can be found from the original scalar operations. Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, - const InductionDescriptor &ID) const; + const InductionDescriptor &ID, + BasicBlock *VectorHeader) const; /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, /// vector loop preheader, middle block and scalar preheader. Also @@ -1728,7 +1707,8 @@ private: /// disabled or unsupported, then the scalable part will be equal to /// ElementCount::getScalable(0). FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount, - ElementCount UserVF); + ElementCount UserVF, + bool FoldTailByMasking); /// \return the maximized element count based on the targets vector /// registers and the loop trip-count, but limited to a maximum safe VF. @@ -1741,7 +1721,8 @@ private: ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, - const ElementCount &MaxSafeVF); + const ElementCount &MaxSafeVF, + bool FoldTailByMasking); /// \return the maximum legal scalable VF, based on the safe max number /// of elements. 
@@ -2356,8 +2337,8 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( const InductionDescriptor &II, Value *Step, Value *Start, - Instruction *EntryVal, VPValue *Def, VPValue *CastDef, - VPTransformState &State) { + Instruction *EntryVal, VPValue *Def, VPTransformState &State) { + IRBuilder<> &Builder = State.Builder; assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && "Expected either an induction phi-node or a truncate of it!"); @@ -2373,7 +2354,7 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( } Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); - Value *SplatStart = Builder.CreateVectorSplat(VF, Start); + Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); Value *SteppedStart = getStepVector(SplatStart, Zero, Step, II.getInductionOpcode()); @@ -2394,9 +2375,9 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( Type *StepType = Step->getType(); Value *RuntimeVF; if (Step->getType()->isFloatingPointTy()) - RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, VF); + RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); else - RuntimeVF = getRuntimeVF(Builder, StepType, VF); + RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); // Create a vector splat to use in the induction update. @@ -2405,8 +2386,8 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't // handle a constant vector splat. Value *SplatVF = isa<Constant>(Mul) - ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) - : Builder.CreateVectorSplat(VF, Mul); + ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) + : Builder.CreateVectorSplat(State.VF, Mul); Builder.restoreIP(CurrIP); // We may need to add the step a number of times, depending on the unroll @@ -2420,8 +2401,6 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( if (isa<TruncInst>(EntryVal)) addMetadata(LastInduction, EntryVal); - recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, - State, Part); LastInduction = cast<Instruction>( Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); @@ -2455,56 +2434,21 @@ bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { return llvm::any_of(IV->users(), isScalarInst); } -void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( - const InductionDescriptor &ID, const Instruction *EntryVal, - Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, - unsigned Part, unsigned Lane) { - assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && - "Expected either an induction phi-node or a truncate of it!"); - - // This induction variable is not the phi from the original loop but the - // newly-created IV based on the proof that casted Phi is equal to the - // uncasted Phi in the vectorized loop (under a runtime guard possibly). It - // re-uses the same InductionDescriptor that original IV uses but we don't - // have to do any recording in this case - that is done when original IV is - // processed. - if (isa<TruncInst>(EntryVal)) - return; - - if (!CastDef) { - assert(ID.getCastInsts().empty() && - "there are casts for ID, but no CastDef"); - return; - } - assert(!ID.getCastInsts().empty() && - "there is a CastDef, but no casts for ID"); - // Only the first Cast instruction in the Casts vector is of interest. 
- // The rest of the Casts (if exist) have no uses outside the - // induction update chain itself. - if (Lane < UINT_MAX) - State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); - else - State.set(CastDef, VectorLoopVal, Part); -} - -void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, - TruncInst *Trunc, VPValue *Def, - VPValue *CastDef, +void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, + const InductionDescriptor &ID, + Value *Start, TruncInst *Trunc, + VPValue *Def, VPTransformState &State) { + IRBuilder<> &Builder = State.Builder; assert((IV->getType()->isIntegerTy() || IV != OldInduction) && "Primary induction variable must have an integer type"); - - auto II = Legal->getInductionVars().find(IV); - assert(II != Legal->getInductionVars().end() && "IV is not an induction"); - - auto ID = II->second; assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); // The value from the original loop to which we are mapping the new induction // variable. Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; - auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); + auto &DL = EntryVal->getModule()->getDataLayout(); // Generate code for the induction step. Note that induction steps are // required to be loop-invariant @@ -2514,7 +2458,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, if (PSE.getSE()->isSCEVable(IV->getType())) { SCEVExpander Exp(*PSE.getSE(), DL, "induction"); return Exp.expandCodeFor(Step, Step->getType(), - LoopVectorPreHeader->getTerminator()); + State.CFG.VectorPreHeader->getTerminator()); } return cast<SCEVUnknown>(Step)->getValue(); }; @@ -2530,7 +2474,8 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) : Builder.CreateCast(Instruction::SIToFP, Induction, IV->getType()); - ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); + ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID, + State.CFG.PrevBB); ScalarIV->setName("offset.idx"); } if (Trunc) { @@ -2548,20 +2493,19 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { Value *Broadcasted = getBroadcastInstrs(ScalarIV); for (unsigned Part = 0; Part < UF; ++Part) { - assert(!VF.isScalable() && "scalable vectors not yet supported."); + assert(!State.VF.isScalable() && "scalable vectors not yet supported."); Value *StartIdx; if (Step->getType()->isFloatingPointTy()) - StartIdx = getRuntimeVFAsFloat(Builder, Step->getType(), VF * Part); + StartIdx = + getRuntimeVFAsFloat(Builder, Step->getType(), State.VF * Part); else - StartIdx = getRuntimeVF(Builder, Step->getType(), VF * Part); + StartIdx = getRuntimeVF(Builder, Step->getType(), State.VF * Part); Value *EntryPart = getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode()); State.set(Def, EntryPart, Part); if (Trunc) addMetadata(EntryPart, Trunc); - recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, - State, Part); } }; @@ -2572,7 +2516,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, // Now do the actual transformations, and start with creating the step value. 
Value *Step = CreateStepValue(ID.getStep()); - if (VF.isZero() || VF.isScalar()) { + if (State.VF.isZero() || State.VF.isScalar()) { Value *ScalarIV = CreateScalarIV(Step); CreateSplatIV(ScalarIV, Step); return; @@ -2583,8 +2527,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, // least one user in the loop that is not widened. auto NeedsScalarIV = needsScalarInduction(EntryVal); if (!NeedsScalarIV) { - createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, - State); + createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); return; } @@ -2592,14 +2535,13 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, // create the phi node, we will splat the scalar induction variable in each // loop iteration. if (!shouldScalarizeInstruction(EntryVal)) { - createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, - State); + createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); Value *ScalarIV = CreateScalarIV(Step); // Create scalar steps that can be used by instructions we will later // scalarize. Note that the addition of the scalar steps will not increase // the number of instructions in the loop in the common case prior to // InstCombine. We will be trading one vector extract for each scalar step. - buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); + buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); return; } @@ -2609,7 +2551,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, Value *ScalarIV = CreateScalarIV(Step); if (!Cost->isScalarEpilogueAllowed()) CreateSplatIV(ScalarIV, Step); - buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); + buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); } Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx, @@ -2663,10 +2605,11 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx, void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, const InductionDescriptor &ID, - VPValue *Def, VPValue *CastDef, + VPValue *Def, VPTransformState &State) { + IRBuilder<> &Builder = State.Builder; // We shouldn't have to build scalar steps if we aren't vectorizing. - assert(VF.isVector() && "VF should be greater than one"); + assert(State.VF.isVector() && "VF should be greater than one"); // Get the value type and ensure it and the step have the same integer type. Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); assert(ScalarIVTy == Step->getType() && @@ -2688,33 +2631,32 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, // iteration. If EntryVal is uniform, we only need to generate the first // lane. Otherwise, we generate all VF values. bool IsUniform = - Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF); - unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue(); + Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), State.VF); + unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue(); // Compute the scalar steps and save the results in State. 
Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), ScalarIVTy->getScalarSizeInBits()); Type *VecIVTy = nullptr; Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; - if (!IsUniform && VF.isScalable()) { - VecIVTy = VectorType::get(ScalarIVTy, VF); - UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF)); - SplatStep = Builder.CreateVectorSplat(VF, Step); - SplatIV = Builder.CreateVectorSplat(VF, ScalarIV); + if (!IsUniform && State.VF.isScalable()) { + VecIVTy = VectorType::get(ScalarIVTy, State.VF); + UnitStepVec = + Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); + SplatStep = Builder.CreateVectorSplat(State.VF, Step); + SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); } - for (unsigned Part = 0; Part < UF; ++Part) { - Value *StartIdx0 = createStepForVF(Builder, IntStepTy, VF, Part); + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); - if (!IsUniform && VF.isScalable()) { - auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0); + if (!IsUniform && State.VF.isScalable()) { + auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); if (ScalarIVTy->isFloatingPointTy()) InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); State.set(Def, Add, Part); - recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, - Part); // It's useful to record the lane values too for the known minimum number // of elements so we do those below. This improves the code quality when // trying to extract the first element, for example. @@ -2728,14 +2670,12 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); // The step returned by `createStepForVF` is a runtime-evaluated value // when VF is scalable. Otherwise, it should be folded into a Constant. - assert((VF.isScalable() || isa<Constant>(StartIdx)) && + assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && "Expected StartIdx to be folded to a constant when VF is not " "scalable"); auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); State.set(Def, Add, VPIteration(Part, Lane)); - recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, - Part, Lane); } } } @@ -3023,21 +2963,19 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized // instruction could feed a poison value to the base address of the widen // load/store. - if (State.MayGeneratePoisonRecipes.count(RepRecipe) > 0) + if (State.MayGeneratePoisonRecipes.contains(RepRecipe)) Cloned->dropPoisonGeneratingFlags(); State.Builder.SetInsertPoint(Builder.GetInsertBlock(), Builder.GetInsertPoint()); // Replace the operands of the cloned instructions with their scalar // equivalents in the new loop. 
- for (unsigned op = 0, e = RepRecipe->getNumOperands(); op != e; ++op) { - auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); + for (auto &I : enumerate(RepRecipe->operands())) { auto InputInstance = Instance; - if (!Operand || !OrigLoop->contains(Operand) || - (Cost->isUniformAfterVectorization(Operand, State.VF))) + VPValue *Operand = I.value(); + if (State.Plan->isUniformAfterVectorization(Operand)) InputInstance.Lane = VPLane::getFirstLane(); - auto *NewOp = State.get(RepRecipe->getOperand(op), InputInstance); - Cloned->setOperand(op, NewOp); + Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); } addNewMetadata(Cloned, Instr); @@ -3339,7 +3277,7 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, Value *InnerLoopVectorizer::emitTransformedIndex( IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, - const InductionDescriptor &ID) const { + const InductionDescriptor &ID, BasicBlock *VectorHeader) const { SCEVExpander Exp(*SE, DL, "induction"); auto Step = ID.getStep(); @@ -3382,15 +3320,15 @@ Value *InnerLoopVectorizer::emitTransformedIndex( }; // Get a suitable insert point for SCEV expansion. For blocks in the vector - // loop, choose the end of the vector loop header (=LoopVectorBody), because + // loop, choose the end of the vector loop header (=VectorHeader), because // the DomTree is not kept up-to-date for additional blocks generated in the // vector loop. By using the header as insertion point, we guarantee that the // expanded instructions dominate all their uses. - auto GetInsertPoint = [this, &B]() { + auto GetInsertPoint = [this, &B, VectorHeader]() { BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); if (InsertBB != LoopVectorBody && - LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) - return LoopVectorBody->getTerminator(); + LI->getLoopFor(VectorHeader) == LI->getLoopFor(InsertBB)) + return VectorHeader->getTerminator(); return &*B.GetInsertPoint(); }; @@ -3538,7 +3476,8 @@ void InnerLoopVectorizer::createInductionResumeValues( CastInst::getCastOpcode(VectorTripCount, true, StepType, true); Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); - EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); + EndValue = + emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody); EndValue->setName("ind.end"); // Compute the end value for the additional bypass (if applicable). 
@@ -3549,7 +3488,7 @@ void InnerLoopVectorizer::createInductionResumeValues( CRD = B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); EndValueFromAdditionalBypass = - emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); + emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody); EndValueFromAdditionalBypass->setName("ind.end"); } } @@ -3623,7 +3562,7 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, if (MDNode *LID = OrigLoop->getLoopID()) L->setLoopID(LID); - LoopVectorizeHints Hints(L, true, *ORE); + LoopVectorizeHints Hints(L, true, *ORE, TTI); Hints.setAlreadyVectorized(); #ifdef EXPENSIVE_CHECKS @@ -3780,7 +3719,8 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, II.getStep()->getType()) : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); CMO->setName("cast.cmo"); - Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); + Value *Escape = + emitTransformedIndex(B, CMO, PSE.getSE(), DL, II, LoopVectorBody); Escape->setName("ind.escape"); MissingVals[UI] = Escape; } @@ -4573,7 +4513,8 @@ void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { } } -bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) { +bool InnerLoopVectorizer::useOrderedReductions( + const RecurrenceDescriptor &RdxDesc) { return Cost->useOrderedReductions(RdxDesc); } @@ -4648,8 +4589,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, Value *Idx = Builder.CreateAdd( PartStart, ConstantInt::get(PtrInd->getType(), Lane)); Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); - Value *SclrGep = - emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); + Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), + DL, II, State.CFG.PrevBB); SclrGep->setName("next.gep"); State.set(PhiR, SclrGep, VPIteration(Part, Lane)); } @@ -5368,13 +5309,9 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { // Limit MaxScalableVF by the maximum safe dependence distance. Optional<unsigned> MaxVScale = TTI.getMaxVScale(); - if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) { - unsigned VScaleMax = TheFunction->getFnAttribute(Attribute::VScaleRange) - .getVScaleRangeArgs() - .second; - if (VScaleMax > 0) - MaxVScale = VScaleMax; - } + if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) + MaxVScale = + TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); MaxScalableVF = ElementCount::getScalable( MaxVScale ? 
(MaxSafeElements / MaxVScale.getValue()) : 0); if (!MaxScalableVF) @@ -5386,9 +5323,8 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { return MaxScalableVF; } -FixedScalableVFPair -LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, - ElementCount UserVF) { +FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( + unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); unsigned SmallestType, WidestType; std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); @@ -5475,12 +5411,14 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, FixedScalableVFPair Result(ElementCount::getFixed(1), ElementCount::getScalable(0)); - if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, - WidestType, MaxSafeFixedVF)) + if (auto MaxVF = + getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, + MaxSafeFixedVF, FoldTailByMasking)) Result.FixedVF = MaxVF; - if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, - WidestType, MaxSafeScalableVF)) + if (auto MaxVF = + getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, + MaxSafeScalableVF, FoldTailByMasking)) if (MaxVF.isScalable()) { Result.ScalableVF = MaxVF; LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF @@ -5513,7 +5451,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { switch (ScalarEpilogueStatus) { case CM_ScalarEpilogueAllowed: - return computeFeasibleMaxVF(TC, UserVF); + return computeFeasibleMaxVF(TC, UserVF, false); case CM_ScalarEpilogueNotAllowedUsePredicate: LLVM_FALLTHROUGH; case CM_ScalarEpilogueNotNeededUsePredicate: @@ -5551,7 +5489,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " "scalar epilogue instead.\n"); ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; - return computeFeasibleMaxVF(TC, UserVF); + return computeFeasibleMaxVF(TC, UserVF, false); } return FixedScalableVFPair::getNone(); } @@ -5568,7 +5506,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); } - FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF); + FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true); // Avoid tail folding if the trip count is known to be a multiple of any VF // we chose. // FIXME: The condition below pessimises the case for fixed-width vectors, @@ -5641,7 +5579,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, - const ElementCount &MaxSafeVF) { + const ElementCount &MaxSafeVF, bool FoldTailByMasking) { bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); TypeSize WidestRegister = TTI.getRegisterBitWidth( ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector @@ -5673,14 +5611,17 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( const auto TripCountEC = ElementCount::getFixed(ConstTripCount); if (ConstTripCount && ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && - isPowerOf2_32(ConstTripCount)) { - // We need to clamp the VF to be the ConstTripCount. 
There is no point in - // choosing a higher viable VF as done in the loop below. If - // MaxVectorElementCount is scalable, we only fall back on a fixed VF when - // the TC is less than or equal to the known number of lanes. - LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " - << ConstTripCount << "\n"); - return TripCountEC; + (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { + // If loop trip count (TC) is known at compile time there is no point in + // choosing VF greater than TC (as done in the loop below). Select maximum + // power of two which doesn't exceed TC. + // If MaxVectorElementCount is scalable, we only fall back on a fixed VF + // when the TC is less than or equal to the known number of lanes. + auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount); + LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " + "exceeding the constant trip count: " + << ClampedConstTripCount << "\n"); + return ElementCount::getFixed(ClampedConstTripCount); } ElementCount MaxVF = MaxVectorElementCount; @@ -5758,12 +5699,11 @@ bool LoopVectorizationCostModel::isMoreProfitable( EstimatedWidthB *= VScale.getValue(); } - // When set to preferred, for now assume vscale may be larger than 1 (or the - // one being tuned for), so that scalable vectorization is slightly favorable - // over fixed-width vectorization. - if (Hints->isScalableVectorizationPreferred()) - if (A.Width.isScalable() && !B.Width.isScalable()) - return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); + // Assume vscale may be larger than 1 (or the value being tuned for), + // so that scalable vectorization is slightly favorable over fixed-width + // vectorization. + if (A.Width.isScalable() && !B.Width.isScalable()) + return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); // To avoid the need for FP division: // (CostA / A.Width) < (CostB / B.Width) @@ -6068,7 +6008,8 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() { if (auto *PN = dyn_cast<PHINode>(&I)) { if (!Legal->isReductionVariable(PN)) continue; - const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[PN]; + const RecurrenceDescriptor &RdxDesc = + Legal->getReductionVars().find(PN)->second; if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || TTI.preferInLoopReduction(RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), @@ -7002,7 +6943,7 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; const RecurrenceDescriptor &RdxDesc = - Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; + Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; InstructionCost BaseCost = TTI.getArithmeticReductionCost( RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); @@ -7079,22 +7020,41 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { if (match(Op0, m_ZExtOrSExt(m_Value())) && Op0->getOpcode() == Op1->getOpcode() && - Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { bool IsUnsigned = isa<ZExtInst>(Op0); - auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); - // Matched reduce(mul(ext, ext)) - InstructionCost ExtCost = - TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, - TTI::CastContextHint::None, CostKind, Op0); + Type *Op0Ty = 
Op0->getOperand(0)->getType(); + Type *Op1Ty = Op1->getOperand(0)->getType(); + Type *LargestOpTy = + Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty + : Op0Ty; + auto *ExtType = VectorType::get(LargestOpTy, VectorTy); + + // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of + // different sizes. We take the largest type as the ext to reduce, and add + // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). + InstructionCost ExtCost0 = TTI.getCastInstrCost( + Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), + TTI::CastContextHint::None, CostKind, Op0); + InstructionCost ExtCost1 = TTI.getCastInstrCost( + Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), + TTI::CastContextHint::None, CostKind, Op1); InstructionCost MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); InstructionCost RedCost = TTI.getExtendedAddReductionCost( /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); + InstructionCost ExtraExtCost = 0; + if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { + Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; + ExtraExtCost = TTI.getCastInstrCost( + ExtraExtOp->getOpcode(), ExtType, + VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), + TTI::CastContextHint::None, CostKind, ExtraExtOp); + } - if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) + if (RedCost.isValid() && + (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) return I == RetI ? RedCost : 0; } else if (!match(I, m_ZExtOrSExt(m_Value()))) { // Matched reduce(mul()) @@ -7570,8 +7530,12 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, Type *CondTy = SI->getCondition()->getType(); if (!ScalarCond) CondTy = VectorType::get(CondTy, VF); - return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, - CmpInst::BAD_ICMP_PREDICATE, CostKind, I); + + CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; + if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) + Pred = Cmp->getPredicate(); + return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, + CostKind, I); } case Instruction::ICmp: case Instruction::FCmp: { @@ -7581,7 +7545,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); VectorTy = ToVectorTy(ValTy, VF); return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, - CmpInst::BAD_ICMP_PREDICATE, CostKind, I); + cast<CmpInst>(I)->getPredicate(), CostKind, + I); } case Instruction::Store: case Instruction::Load: { @@ -7762,14 +7727,14 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { // Ignore type-promoting instructions we identified during reduction // detection. for (auto &Reduction : Legal->getReductionVars()) { - RecurrenceDescriptor &RedDes = Reduction.second; + const RecurrenceDescriptor &RedDes = Reduction.second; const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); VecValuesToIgnore.insert(Casts.begin(), Casts.end()); } // Ignore type-casting instructions we identified during induction // detection. 
for (auto &Induction : Legal->getInductionVars()) { - InductionDescriptor &IndDes = Induction.second; + const InductionDescriptor &IndDes = Induction.second; const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); VecValuesToIgnore.insert(Casts.begin(), Casts.end()); } @@ -7778,7 +7743,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { void LoopVectorizationCostModel::collectInLoopReductions() { for (auto &Reduction : Legal->getReductionVars()) { PHINode *Phi = Reduction.first; - RecurrenceDescriptor &RdxDesc = Reduction.second; + const RecurrenceDescriptor &RdxDesc = Reduction.second; // We don't collect reductions that are type promoted (yet). if (RdxDesc.getRecurrenceType() != Phi->getType()) @@ -8064,18 +8029,6 @@ void LoopVectorizationPlanner::collectTriviallyDeadInstructions( return U == Ind || DeadInstructions.count(cast<Instruction>(U)); })) DeadInstructions.insert(IndUpdate); - - // We record as "Dead" also the type-casting instructions we had identified - // during induction analysis. We don't need any handling for them in the - // vectorized loop because we have proven that, under a proper runtime - // test guarding the vectorized loop, the value of the phi, and the casted - // value of the phi, are the same. The last instruction in this casting chain - // will get its scalar/vector/widened def from the scalar/vector/widened def - // of the respective phi node. Any other casts in the induction def-use chain - // have no other uses outside the phi update chain, and will be ignored. - InductionDescriptor &IndDes = Induction.second; - const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); - DeadInstructions.insert(Casts.begin(), Casts.end()); } } @@ -8461,7 +8414,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, assert(EdgeMask && "No Edge Mask found for condition"); if (BI->getSuccessor(0) != Dst) - EdgeMask = Builder.createNot(EdgeMask); + EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. // The condition is 'SrcMask && EdgeMask', which is equivalent to @@ -8470,7 +8423,8 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, // EdgeMask is poison. Using 'and' here introduces undefined behavior. VPValue *False = Plan->getOrAddVPValue( ConstantInt::getFalse(BI->getCondition()->getType())); - EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); + EdgeMask = + Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); } return EdgeMaskCache[Edge] = EdgeMask; @@ -8492,22 +8446,24 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { if (!CM.blockNeedsPredicationForAnyReason(BB)) return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. - // Create the block in mask as the first non-phi instruction in the block. - VPBuilder::InsertPointGuard Guard(Builder); - auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); - Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); - // Introduce the early-exit compare IV <= BTC to form header block mask. // This is used instead of IV < TC because TC may wrap, unlike BTC. - // Start by constructing the desired canonical IV. + // Start by constructing the desired canonical IV in the header block. 
VPValue *IV = nullptr; if (Legal->getPrimaryInduction()) IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); else { + VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock(); auto *IVRecipe = new VPWidenCanonicalIVRecipe(); - Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); + HeaderVPBB->insert(IVRecipe, HeaderVPBB->getFirstNonPhi()); IV = IVRecipe; } + + // Create the block in mask as the first non-phi instruction in the block. + VPBuilder::InsertPointGuard Guard(Builder); + auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); + Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); + VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); bool TailFolded = !CM.isScalarEpilogueAllowed(); @@ -8534,7 +8490,7 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { continue; } - BlockMask = Builder.createOr(BlockMask, EdgeMask); + BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); } return BlockMaskCache[BB] = BlockMask; @@ -8591,14 +8547,10 @@ VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, ArrayRef<VPValue *> Operands) const { // Check if this is an integer or fp induction. If so, build the recipe that // produces its scalar and vector values. - InductionDescriptor II = Legal->getInductionVars().lookup(Phi); - if (II.getKind() == InductionDescriptor::IK_IntInduction || - II.getKind() == InductionDescriptor::IK_FpInduction) { - assert(II.getStartValue() == + if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) { + assert(II->getStartValue() == Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); - const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts(); - return new VPWidenIntOrFpInductionRecipe( - Phi, Operands[0], Casts.empty() ? nullptr : Casts.front()); + return new VPWidenIntOrFpInductionRecipe(Phi, Operands[0], *II); } return nullptr; @@ -8624,11 +8576,10 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( if (LoopVectorizationPlanner::getDecisionAndClampRange( isOptimizableIVTruncate(I), Range)) { - InductionDescriptor II = - Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); + auto *Phi = cast<PHINode>(I->getOperand(0)); + const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); - return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), - Start, nullptr, I); + return new VPWidenIntOrFpInductionRecipe(Phi, Start, II, I); } return nullptr; } @@ -8844,13 +8795,17 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( return VPBB; } LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); - assert(VPBB->getSuccessors().empty() && - "VPBB has successors when handling predicated replication."); + + VPBlockBase *SingleSucc = VPBB->getSingleSuccessor(); + assert(SingleSucc && "VPBB must have a single successor when handling " + "predicated replication."); + VPBlockUtils::disconnectBlocks(VPBB, SingleSucc); // Record predicated instructions for above packing optimizations. 
VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); VPBlockUtils::insertBlockAfter(Region, VPBB); auto *RegSucc = new VPBasicBlock(); VPBlockUtils::insertBlockAfter(RegSucc, Region); + VPBlockUtils::connectBlocks(RegSucc, SingleSucc); return RegSucc; } @@ -8910,7 +8865,8 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) { VPValue *StartV = Operands[0]; if (Legal->isReductionVariable(Phi)) { - RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; + const RecurrenceDescriptor &RdxDesc = + Legal->getReductionVars().find(Phi)->second; assert(RdxDesc.getRecurrenceStartValue() == Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, @@ -9031,7 +8987,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( } for (auto &Reduction : CM.getInLoopReductionChains()) { PHINode *Phi = Reduction.first; - RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind(); + RecurKind Kind = + Legal->getReductionVars().find(Phi)->second.getRecurrenceKind(); const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; RecipeBuilder.recordRecipeOf(Phi); @@ -9069,30 +9026,25 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // visit each basic block after having visited its predecessor basic blocks. // --------------------------------------------------------------------------- - auto Plan = std::make_unique<VPlan>(); + // Create initial VPlan skeleton, with separate header and latch blocks. + VPBasicBlock *HeaderVPBB = new VPBasicBlock(); + VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); + VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); + auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); + auto Plan = std::make_unique<VPlan>(TopRegion); // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. LoopBlocksDFS DFS(OrigLoop); DFS.perform(LI); - VPBasicBlock *VPBB = nullptr; - VPBasicBlock *HeaderVPBB = nullptr; + VPBasicBlock *VPBB = HeaderVPBB; SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove; for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { // Relevant instructions from basic block BB will be grouped into VPRecipe // ingredients and fill a new VPBasicBlock. unsigned VPBBsForBB = 0; - auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); - if (VPBB) - VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); - else { - auto *TopRegion = new VPRegionBlock("vector loop"); - TopRegion->setEntry(FirstVPBBForBB); - Plan->setEntry(TopRegion); - HeaderVPBB = FirstVPBBForBB; - } - VPBB = FirstVPBBForBB; + VPBB->setName(BB->getName()); Builder.setInsertPoint(VPBB); // Introduce each ingredient into VPlan. @@ -9159,13 +9111,21 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( : ""); } } + + VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); + VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); } + // Fold the last, empty block into its predecessor. + VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB); + assert(VPBB && "expected to fold last (empty) block"); + // After here, VPBB should not be used. 
+ VPBB = nullptr; + assert(isa<VPRegionBlock>(Plan->getEntry()) && !Plan->getEntry()->getEntryBasicBlock()->empty() && "entry block must be set to a VPRegionBlock having a non-empty entry " "VPBasicBlock"); - cast<VPRegionBlock>(Plan->getEntry())->setExit(VPBB); RecipeBuilder.fixHeaderPhis(); // --------------------------------------------------------------------------- @@ -9231,18 +9191,19 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); VPBlockUtils::connectBlocks(SplitPred, SinkRegion); VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); - if (VPBB == SplitPred) - VPBB = SplitBlock; } } + VPlanTransforms::removeRedundantInductionCasts(*Plan); + // Now that sink-after is done, move induction recipes for optimized truncates // to the phi section of the header block. for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); // Adjust the recipes for any inloop reductions. - adjustRecipesForReductions(VPBB, Plan, RecipeBuilder, Range.Start); + adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan, + RecipeBuilder, Range.Start); // Introduce a recipe to combine the incoming and previous values of a // first-order recurrence. @@ -9322,6 +9283,11 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( RSO.flush(); Plan->setName(PlanName); + // Fold Exit block into its predecessor if possible. + // TODO: Fold block earlier once all VPlan transforms properly maintain a + // VPBasicBlock as exit. + VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit()); + assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); return Plan; } @@ -9355,9 +9321,10 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { } SmallPtrSet<Instruction *, 1> DeadInstructions; - VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan, - Legal->getInductionVars(), - DeadInstructions, *PSE.getSE()); + VPlanTransforms::VPInstructionsToVPRecipes( + OrigLoop, Plan, + [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, + DeadInstructions, *PSE.getSE()); return Plan; } @@ -9371,7 +9338,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( ElementCount MinVF) { for (auto &Reduction : CM.getInLoopReductionChains()) { PHINode *Phi = Reduction.first; - RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; + const RecurrenceDescriptor &RdxDesc = + Legal->getReductionVars().find(Phi)->second; const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc)) @@ -9565,7 +9533,7 @@ void VPWidenRecipe::execute(VPTransformState &State) { // exact, etc.). The control flow has been linearized and the // instruction is no longer guarded by the predicate, which could make // the flag properties to no longer hold. 
- if (State.MayGeneratePoisonRecipes.count(this) > 0) + if (State.MayGeneratePoisonRecipes.contains(this)) VecOp->dropPoisonGeneratingFlags(); } @@ -9714,9 +9682,9 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) { void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Int or FP induction being replicated."); - State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), - getTruncInst(), getVPValue(0), - getCastValue(), State); + State.ILV->widenIntOrFpInduction(IV, getInductionDescriptor(), + getStartValue()->getLiveInIRValue(), + getTruncInst(), getVPValue(0), State); } void VPWidenPHIRecipe::execute(VPTransformState &State) { @@ -10293,7 +10261,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { << L->getHeader()->getParent()->getName() << "\" from " << DebugLocStr << "\n"); - LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); + LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); LLVM_DEBUG( dbgs() << "LV: Loop hints:" @@ -10747,8 +10715,17 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, PA.preserve<LoopAnalysis>(); PA.preserve<DominatorTreeAnalysis>(); } - if (!Result.MadeCFGChange) + + if (Result.MadeCFGChange) { + // Making CFG changes likely means a loop got vectorized. Indicate that + // extra simplification passes should be run. + // TODO: MadeCFGChanges is not a prefect proxy. Extra passes should only + // be run if runtime checks have been added. + AM.getResult<ShouldRunExtraVectorPasses>(F); + PA.preserve<ShouldRunExtraVectorPasses>(); + } else { PA.preserveSet<CFGAnalyses>(); + } return PA; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 95061e9053fa..37ae13666f7a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -631,27 +631,26 @@ static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) { /// after: 6 3 5 4 7 2 1 0 static void fixupOrderingIndices(SmallVectorImpl<unsigned> &Order) { const unsigned Sz = Order.size(); - SmallBitVector UsedIndices(Sz); - SmallVector<int> MaskedIndices; + SmallBitVector UnusedIndices(Sz, /*t=*/true); + SmallBitVector MaskedIndices(Sz); for (unsigned I = 0; I < Sz; ++I) { if (Order[I] < Sz) - UsedIndices.set(Order[I]); + UnusedIndices.reset(Order[I]); else - MaskedIndices.push_back(I); + MaskedIndices.set(I); } - if (MaskedIndices.empty()) + if (MaskedIndices.none()) return; - SmallVector<int> AvailableIndices(MaskedIndices.size()); - unsigned Cnt = 0; - int Idx = UsedIndices.find_first(); - do { - AvailableIndices[Cnt] = Idx; - Idx = UsedIndices.find_next(Idx); - ++Cnt; - } while (Idx > 0); - assert(Cnt == MaskedIndices.size() && "Non-synced masked/available indices."); - for (int I = 0, E = MaskedIndices.size(); I < E; ++I) - Order[MaskedIndices[I]] = AvailableIndices[I]; + assert(UnusedIndices.count() == MaskedIndices.count() && + "Non-synced masked/available indices."); + int Idx = UnusedIndices.find_first(); + int MIdx = MaskedIndices.find_first(); + while (MIdx >= 0) { + assert(Idx >= 0 && "Indices must be synced."); + Order[MIdx] = Idx; + Idx = UnusedIndices.find_next(Idx); + MIdx = MaskedIndices.find_next(MIdx); + } } namespace llvm { @@ -812,6 +811,13 @@ public: /// ExtractElement, ExtractValue), which can be part of the graph. 
Optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE); + /// Gets reordering data for the given tree entry. If the entry is vectorized + /// - just return ReorderIndices, otherwise check if the scalars can be + /// reordered and return the most optimal order. + /// \param TopToBottom If true, include the order of vectorized stores and + /// insertelement nodes, otherwise skip them. + Optional<OrdersType> getReorderingData(const TreeEntry &TE, bool TopToBottom); + /// Reorders the current graph to the most profitable order starting from the /// root node to the leaf nodes. The best order is chosen only from the nodes /// of the same size (vectorization factor). Smaller nodes are considered @@ -1010,18 +1016,25 @@ public: std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]); } - // The hard-coded scores listed here are not very important. When computing - // the scores of matching one sub-tree with another, we are basically - // counting the number of values that are matching. So even if all scores - // are set to 1, we would still get a decent matching result. + // The hard-coded scores listed here are not very important, though it shall + // be higher for better matches to improve the resulting cost. When + // computing the scores of matching one sub-tree with another, we are + // basically counting the number of values that are matching. So even if all + // scores are set to 1, we would still get a decent matching result. // However, sometimes we have to break ties. For example we may have to // choose between matching loads vs matching opcodes. This is what these - // scores are helping us with: they provide the order of preference. + // scores are helping us with: they provide the order of preference. Also, + // this is important if the scalar is externally used or used in another + // tree entry node in the different lane. /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]). - static const int ScoreConsecutiveLoads = 3; + static const int ScoreConsecutiveLoads = 4; + /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]). + static const int ScoreReversedLoads = 3; /// ExtractElementInst from same vector and consecutive indexes. - static const int ScoreConsecutiveExtracts = 3; + static const int ScoreConsecutiveExtracts = 4; + /// ExtractElementInst from same vector and reversed indices. + static const int ScoreReversedExtracts = 3; /// Constants. static const int ScoreConstants = 2; /// Instructions with the same opcode. @@ -1041,7 +1054,10 @@ public: /// \returns the score of placing \p V1 and \p V2 in consecutive lanes. static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL, - ScalarEvolution &SE) { + ScalarEvolution &SE, int NumLanes) { + if (V1 == V2) + return VLOperands::ScoreSplat; + auto *LI1 = dyn_cast<LoadInst>(V1); auto *LI2 = dyn_cast<LoadInst>(V2); if (LI1 && LI2) { @@ -1051,8 +1067,17 @@ public: Optional<int> Dist = getPointersDiff( LI1->getType(), LI1->getPointerOperand(), LI2->getType(), LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true); - return (Dist && *Dist == 1) ? VLOperands::ScoreConsecutiveLoads - : VLOperands::ScoreFail; + if (!Dist) + return VLOperands::ScoreFail; + // The distance is too large - still may be profitable to use masked + // loads/gathers. + if (std::abs(*Dist) > NumLanes / 2) + return VLOperands::ScoreAltOpcodes; + // This still will detect consecutive loads, but we might have "holes" + // in some cases. 
It is ok for non-power-2 vectorization and may produce + // better results. It should not affect current vectorization. + return (*Dist > 0) ? VLOperands::ScoreConsecutiveLoads + : VLOperands::ScoreReversedLoads; } auto *C1 = dyn_cast<Constant>(V1); @@ -1062,18 +1087,41 @@ public: // Extracts from consecutive indexes of the same vector better score as // the extracts could be optimized away. - Value *EV; - ConstantInt *Ex1Idx, *Ex2Idx; - if (match(V1, m_ExtractElt(m_Value(EV), m_ConstantInt(Ex1Idx))) && - match(V2, m_ExtractElt(m_Deferred(EV), m_ConstantInt(Ex2Idx))) && - Ex1Idx->getZExtValue() + 1 == Ex2Idx->getZExtValue()) - return VLOperands::ScoreConsecutiveExtracts; + Value *EV1; + ConstantInt *Ex1Idx; + if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) { + // Undefs are always profitable for extractelements. + if (isa<UndefValue>(V2)) + return VLOperands::ScoreConsecutiveExtracts; + Value *EV2 = nullptr; + ConstantInt *Ex2Idx = nullptr; + if (match(V2, + m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx), + m_Undef())))) { + // Undefs are always profitable for extractelements. + if (!Ex2Idx) + return VLOperands::ScoreConsecutiveExtracts; + if (isUndefVector(EV2) && EV2->getType() == EV1->getType()) + return VLOperands::ScoreConsecutiveExtracts; + if (EV2 == EV1) { + int Idx1 = Ex1Idx->getZExtValue(); + int Idx2 = Ex2Idx->getZExtValue(); + int Dist = Idx2 - Idx1; + // The distance is too large - still may be profitable to use + // shuffles. + if (std::abs(Dist) > NumLanes / 2) + return VLOperands::ScoreAltOpcodes; + return (Dist > 0) ? VLOperands::ScoreConsecutiveExtracts + : VLOperands::ScoreReversedExtracts; + } + } + } auto *I1 = dyn_cast<Instruction>(V1); auto *I2 = dyn_cast<Instruction>(V2); if (I1 && I2) { - if (I1 == I2) - return VLOperands::ScoreSplat; + if (I1->getParent() != I2->getParent()) + return VLOperands::ScoreFail; InstructionsState S = getSameOpcode({I1, I2}); // Note: Only consider instructions with <= 2 operands to avoid // complexity explosion. @@ -1088,11 +1136,13 @@ public: return VLOperands::ScoreFail; } - /// Holds the values and their lane that are taking part in the look-ahead + /// Holds the values and their lanes that are taking part in the look-ahead /// score calculation. This is used in the external uses cost calculation. - SmallDenseMap<Value *, int> InLookAheadValues; + /// Need to hold all the lanes in case of splat/broadcast at least to + /// correctly check for the use in the different lane. + SmallDenseMap<Value *, SmallSet<int, 4>> InLookAheadValues; - /// \Returns the additinal cost due to uses of \p LHS and \p RHS that are + /// \returns the additional cost due to uses of \p LHS and \p RHS that are /// either external to the vectorized code, or require shuffling. int getExternalUsesCost(const std::pair<Value *, int> &LHS, const std::pair<Value *, int> &RHS) { @@ -1116,22 +1166,30 @@ public: for (User *U : V->users()) { if (const TreeEntry *UserTE = R.getTreeEntry(U)) { // The user is in the VectorizableTree. Check if we need to insert. - auto It = llvm::find(UserTE->Scalars, U); - assert(It != UserTE->Scalars.end() && "U is in UserTE"); - int UserLn = std::distance(UserTE->Scalars.begin(), It); + int UserLn = UserTE->findLaneForValue(U); assert(UserLn >= 0 && "Bad lane"); - if (UserLn != Ln) + // If the values are different, check just the line of the current + // value. If the values are the same, need to add UserInDiffLaneCost + // only if UserLn does not match both line numbers. 
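Where the old code only accepted a pointer or index distance of exactly 1, the scoring above now tolerates gaps and reversed order for both loads and extractelements. A minimal standalone sketch of that decision follows; the helper and its name are hypothetical, the 4/3 constants mirror ScoreConsecutive*/ScoreReversed* from the patch, and the ScoreAltOpcodes value is assumed to be 1 since it is not shown in this hunk.

#include <cstdlib>

static int scoreByDistance(int Dist, int NumLanes) {
  const int ScoreConsecutive = 4; // ScoreConsecutiveLoads / ScoreConsecutiveExtracts
  const int ScoreReversed = 3;    // ScoreReversedLoads / ScoreReversedExtracts
  const int ScoreAltOpcodes = 1;  // assumed value of the existing constant
  // Too far apart: may still be worth a masked load/gather or a shuffle, so
  // fall back to the weaker score rather than failing outright.
  if (std::abs(Dist) > NumLanes / 2)
    return ScoreAltOpcodes;
  // Small forward distance is preferred; small backward distance still scores.
  return Dist > 0 ? ScoreConsecutive : ScoreReversed;
}
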
+ if ((LHS.first != RHS.first && UserLn != Ln) || + (LHS.first == RHS.first && UserLn != LHS.second && + UserLn != RHS.second)) { Cost += UserInDiffLaneCost; + break; + } } else { // Check if the user is in the look-ahead code. auto It2 = InLookAheadValues.find(U); if (It2 != InLookAheadValues.end()) { // The user is in the look-ahead code. Check the lane. - if (It2->second != Ln) + if (!It2->getSecond().contains(Ln)) { Cost += UserInDiffLaneCost; + break; + } } else { // The user is neither in SLP tree nor in the look-ahead code. Cost += ExternalUseCost; + break; } } // Limit the number of visited uses to cap compilation time. @@ -1170,32 +1228,36 @@ public: Value *V1 = LHS.first; Value *V2 = RHS.first; // Get the shallow score of V1 and V2. - int ShallowScoreAtThisLevel = - std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) - - getExternalUsesCost(LHS, RHS)); + int ShallowScoreAtThisLevel = std::max( + (int)ScoreFail, getShallowScore(V1, V2, DL, SE, getNumLanes()) - + getExternalUsesCost(LHS, RHS)); int Lane1 = LHS.second; int Lane2 = RHS.second; // If reached MaxLevel, // or if V1 and V2 are not instructions, // or if they are SPLAT, - // or if they are not consecutive, early return the current cost. + // or if they are not consecutive, + // or if profitable to vectorize loads or extractelements, early return + // the current cost. auto *I1 = dyn_cast<Instruction>(V1); auto *I2 = dyn_cast<Instruction>(V2); if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 || ShallowScoreAtThisLevel == VLOperands::ScoreFail || - (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel)) + (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) || + (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) && + ShallowScoreAtThisLevel)) return ShallowScoreAtThisLevel; assert(I1 && I2 && "Should have early exited."); // Keep track of in-tree values for determining the external-use cost. - InLookAheadValues[V1] = Lane1; - InLookAheadValues[V2] = Lane2; + InLookAheadValues[V1].insert(Lane1); + InLookAheadValues[V2].insert(Lane2); // Contains the I2 operand indexes that got matched with I1 operands. SmallSet<unsigned, 4> Op2Used; - // Recursion towards the operands of I1 and I2. We are trying all possbile + // Recursion towards the operands of I1 and I2. We are trying all possible // operand pairs, and keeping track of the best score. for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands(); OpIdx1 != NumOperands1; ++OpIdx1) { @@ -1319,27 +1381,79 @@ public: return None; } - /// Helper for reorderOperandVecs. \Returns the lane that we should start - /// reordering from. This is the one which has the least number of operands - /// that can freely move about. + /// Helper for reorderOperandVecs. + /// \returns the lane that we should start reordering from. This is the one + /// which has the least number of operands that can freely move about or + /// less profitable because it already has the most optimal set of operands. 
unsigned getBestLaneToStartReordering() const { - unsigned BestLane = 0; unsigned Min = UINT_MAX; - for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes; - ++Lane) { - unsigned NumFreeOps = getMaxNumOperandsThatCanBeReordered(Lane); - if (NumFreeOps < Min) { - Min = NumFreeOps; - BestLane = Lane; + unsigned SameOpNumber = 0; + // std::pair<unsigned, unsigned> is used to implement a simple voting + // algorithm and choose the lane with the least number of operands that + // can freely move about or less profitable because it already has the + // most optimal set of operands. The first unsigned is a counter for + // voting, the second unsigned is the counter of lanes with instructions + // with same/alternate opcodes and same parent basic block. + MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap; + // Try to be closer to the original results, if we have multiple lanes + // with same cost. If 2 lanes have the same cost, use the one with the + // lowest index. + for (int I = getNumLanes(); I > 0; --I) { + unsigned Lane = I - 1; + OperandsOrderData NumFreeOpsHash = + getMaxNumOperandsThatCanBeReordered(Lane); + // Compare the number of operands that can move and choose the one with + // the least number. + if (NumFreeOpsHash.NumOfAPOs < Min) { + Min = NumFreeOpsHash.NumOfAPOs; + SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent; + HashMap.clear(); + HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane); + } else if (NumFreeOpsHash.NumOfAPOs == Min && + NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) { + // Select the most optimal lane in terms of number of operands that + // should be moved around. + SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent; + HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane); + } else if (NumFreeOpsHash.NumOfAPOs == Min && + NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) { + ++HashMap[NumFreeOpsHash.Hash].first; + } + } + // Select the lane with the minimum counter. + unsigned BestLane = 0; + unsigned CntMin = UINT_MAX; + for (const auto &Data : reverse(HashMap)) { + if (Data.second.first < CntMin) { + CntMin = Data.second.first; + BestLane = Data.second.second; } } return BestLane; } - /// \Returns the maximum number of operands that are allowed to be reordered - /// for \p Lane. This is used as a heuristic for selecting the first lane to - /// start operand reordering. - unsigned getMaxNumOperandsThatCanBeReordered(unsigned Lane) const { + /// Data structure that helps to reorder operands. + struct OperandsOrderData { + /// The best number of operands with the same APOs, which can be + /// reordered. + unsigned NumOfAPOs = UINT_MAX; + /// Number of operands with the same/alternate instruction opcode and + /// parent. + unsigned NumOpsWithSameOpcodeParent = 0; + /// Hash for the actual operands ordering. + /// Used to count operands, actually their position id and opcode + /// value. It is used in the voting mechanism to find the lane with the + /// least number of operands that can freely move about or less profitable + /// because it already has the most optimal set of operands. Can be + /// replaced with SmallVector<unsigned> instead but hash code is faster + /// and requires less memory. + unsigned Hash = 0; + }; + /// \returns the maximum number of operands that are allowed to be reordered + /// for \p Lane and the number of compatible instructions(with the same + /// parent/opcode). This is used as a heuristic for selecting the first lane + /// to start operand reordering. 
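The body that follows counts, per lane, how many operands share an opcode and parent block by running a Boyer-Moore majority vote over them. Below is a minimal generic illustration of that voting scheme, kept separate from the patch; the template and its name are invented for the example, and the returned counter is only an approximation of the majority's size, which is all the heuristic needs.

#include <vector>

// Keep one candidate and a counter: increment on a match, decrement on a
// mismatch, and adopt a new candidate whenever the counter reaches zero.
template <typename T>
static unsigned majorityVoteCount(const std::vector<T> &Vals) {
  unsigned Count = 0;
  const T *Candidate = nullptr;
  for (const T &V : Vals) {
    if (Count == 0) {
      Candidate = &V;
      Count = 1;
    } else if (V == *Candidate) {
      ++Count;
    } else {
      --Count;
    }
  }
  return Count; // 0 when no value clearly dominates
}
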
+ OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const { unsigned CntTrue = 0; unsigned NumOperands = getNumOperands(); // Operands with the same APO can be reordered. We therefore need to count @@ -1348,11 +1462,45 @@ public: // a map. Instead we can simply count the number of operands that // correspond to one of them (in this case the 'true' APO), and calculate // the other by subtracting it from the total number of operands. - for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) - if (getData(OpIdx, Lane).APO) + // Operands with the same instruction opcode and parent are more + // profitable since we don't need to move them in many cases, with a high + // probability such lane already can be vectorized effectively. + bool AllUndefs = true; + unsigned NumOpsWithSameOpcodeParent = 0; + Instruction *OpcodeI = nullptr; + BasicBlock *Parent = nullptr; + unsigned Hash = 0; + for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { + const OperandData &OpData = getData(OpIdx, Lane); + if (OpData.APO) ++CntTrue; - unsigned CntFalse = NumOperands - CntTrue; - return std::max(CntTrue, CntFalse); + // Use Boyer-Moore majority voting for finding the majority opcode and + // the number of times it occurs. + if (auto *I = dyn_cast<Instruction>(OpData.V)) { + if (!OpcodeI || !getSameOpcode({OpcodeI, I}).getOpcode() || + I->getParent() != Parent) { + if (NumOpsWithSameOpcodeParent == 0) { + NumOpsWithSameOpcodeParent = 1; + OpcodeI = I; + Parent = I->getParent(); + } else { + --NumOpsWithSameOpcodeParent; + } + } else { + ++NumOpsWithSameOpcodeParent; + } + } + Hash = hash_combine( + Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1))); + AllUndefs = AllUndefs && isa<UndefValue>(OpData.V); + } + if (AllUndefs) + return {}; + OperandsOrderData Data; + Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue); + Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent; + Data.Hash = Hash; + return Data; } /// Go through the instructions in VL and append their operands. @@ -1500,11 +1648,37 @@ public: ReorderingModes[OpIdx] = ReorderingMode::Failed; } + // Check that we don't have same operands. No need to reorder if operands + // are just perfect diamond or shuffled diamond match. Do not do it only + // for possible broadcasts or non-power of 2 number of scalars (just for + // now). + auto &&SkipReordering = [this]() { + SmallPtrSet<Value *, 4> UniqueValues; + ArrayRef<OperandData> Op0 = OpsVec.front(); + for (const OperandData &Data : Op0) + UniqueValues.insert(Data.V); + for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) { + if (any_of(Op, [&UniqueValues](const OperandData &Data) { + return !UniqueValues.contains(Data.V); + })) + return false; + } + // TODO: Check if we can remove a check for non-power-2 number of + // scalars after full support of non-power-2 vectorization. + return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size()); + }; + // If the initial strategy fails for any of the operand indexes, then we // perform reordering again in a second pass. This helps avoid assigning // high priority to the failed strategy, and should improve reordering for // the non-failed operand indexes. for (int Pass = 0; Pass != 2; ++Pass) { + // Check if no need to reorder operands since they're are perfect or + // shuffled diamond match. + // Need to to do it to avoid extra external use cost counting for + // shuffled matches, which may cause regressions. + if (SkipReordering()) + break; // Skip the second pass if the first pass did not fail. 
bool StrategyFailed = false; // Mark all operand data as free to use. @@ -1792,9 +1966,10 @@ private: if (Operands.size() < OpIdx + 1) Operands.resize(OpIdx + 1); assert(Operands[OpIdx].empty() && "Already resized?"); - Operands[OpIdx].resize(Scalars.size()); - for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane) - Operands[OpIdx][Lane] = OpVL[Lane]; + assert(OpVL.size() <= Scalars.size() && + "Number of operands is greater than the number of scalars."); + Operands[OpIdx].resize(OpVL.size()); + copy(OpVL, Operands[OpIdx].begin()); } /// Set the operands of this bundle in their original order. @@ -1944,7 +2119,7 @@ private: if (ReuseShuffleIndices.empty()) dbgs() << "Empty"; else - for (unsigned ReuseIdx : ReuseShuffleIndices) + for (int ReuseIdx : ReuseShuffleIndices) dbgs() << ReuseIdx << ", "; dbgs() << "\n"; dbgs() << "ReorderIndices: "; @@ -2819,6 +2994,50 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { return None; } +Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE, + bool TopToBottom) { + // No need to reorder if need to shuffle reuses, still need to shuffle the + // node. + if (!TE.ReuseShuffleIndices.empty()) + return None; + if (TE.State == TreeEntry::Vectorize && + (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) || + (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) && + !TE.isAltShuffle()) + return TE.ReorderIndices; + if (TE.State == TreeEntry::NeedToGather) { + // TODO: add analysis of other gather nodes with extractelement + // instructions and other values/instructions, not only undefs. + if (((TE.getOpcode() == Instruction::ExtractElement && + !TE.isAltShuffle()) || + (all_of(TE.Scalars, + [](Value *V) { + return isa<UndefValue, ExtractElementInst>(V); + }) && + any_of(TE.Scalars, + [](Value *V) { return isa<ExtractElementInst>(V); }))) && + all_of(TE.Scalars, + [](Value *V) { + auto *EE = dyn_cast<ExtractElementInst>(V); + return !EE || isa<FixedVectorType>(EE->getVectorOperandType()); + }) && + allSameType(TE.Scalars)) { + // Check that gather of extractelements can be represented as + // just a shuffle of a single vector. + OrdersType CurrentOrder; + bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder); + if (Reuse || !CurrentOrder.empty()) { + if (!CurrentOrder.empty()) + fixupOrderingIndices(CurrentOrder); + return CurrentOrder; + } + } + if (Optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE)) + return CurrentOrder; + } + return None; +} + void BoUpSLP::reorderTopToBottom() { // Maps VF to the graph nodes. DenseMap<unsigned, SmallPtrSet<TreeEntry *, 4>> VFToOrderedEntries; @@ -2826,42 +3045,15 @@ void BoUpSLP::reorderTopToBottom() { // their ordering. DenseMap<const TreeEntry *, OrdersType> GathersToOrders; // Find all reorderable nodes with the given VF. - // Currently the are vectorized loads,extracts + some gathering of extracts. + // Currently the are vectorized stores,loads,extracts + some gathering of + // extracts. for_each(VectorizableTree, [this, &VFToOrderedEntries, &GathersToOrders]( const std::unique_ptr<TreeEntry> &TE) { - // No need to reorder if need to shuffle reuses, still need to shuffle the - // node. 
- if (!TE->ReuseShuffleIndices.empty()) - return; - if (TE->State == TreeEntry::Vectorize && - isa<LoadInst, ExtractElementInst, ExtractValueInst, StoreInst, - InsertElementInst>(TE->getMainOp()) && - !TE->isAltShuffle()) { + if (Optional<OrdersType> CurrentOrder = + getReorderingData(*TE.get(), /*TopToBottom=*/true)) { VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); - return; - } - if (TE->State == TreeEntry::NeedToGather) { - if (TE->getOpcode() == Instruction::ExtractElement && - !TE->isAltShuffle() && - isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp()) - ->getVectorOperandType()) && - allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) { - // Check that gather of extractelements can be represented as - // just a shuffle of a single vector. - OrdersType CurrentOrder; - bool Reuse = - canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder); - if (Reuse || !CurrentOrder.empty()) { - VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); - GathersToOrders.try_emplace(TE.get(), CurrentOrder); - return; - } - } - if (Optional<OrdersType> CurrentOrder = - findReusedOrderedScalars(*TE.get())) { - VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); + if (TE->State != TreeEntry::Vectorize) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); - } } }); @@ -2993,44 +3185,11 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { const std::unique_ptr<TreeEntry> &TE) { if (TE->State != TreeEntry::Vectorize) NonVectorized.push_back(TE.get()); - // No need to reorder if need to shuffle reuses, still need to shuffle the - // node. - if (!TE->ReuseShuffleIndices.empty()) - return; - if (TE->State == TreeEntry::Vectorize && - isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE->getMainOp()) && - !TE->isAltShuffle()) { + if (Optional<OrdersType> CurrentOrder = + getReorderingData(*TE.get(), /*TopToBottom=*/false)) { OrderedEntries.insert(TE.get()); - return; - } - if (TE->State == TreeEntry::NeedToGather) { - if (TE->getOpcode() == Instruction::ExtractElement && - !TE->isAltShuffle() && - isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp()) - ->getVectorOperandType()) && - allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) { - // Check that gather of extractelements can be represented as - // just a shuffle of a single vector with a single user only. - OrdersType CurrentOrder; - bool Reuse = - canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder); - if ((Reuse || !CurrentOrder.empty()) && - !any_of(VectorizableTree, - [&TE](const std::unique_ptr<TreeEntry> &Entry) { - return Entry->State == TreeEntry::NeedToGather && - Entry.get() != TE.get() && - Entry->isSame(TE->Scalars); - })) { - OrderedEntries.insert(TE.get()); - GathersToOrders.try_emplace(TE.get(), CurrentOrder); - return; - } - } - if (Optional<OrdersType> CurrentOrder = - findReusedOrderedScalars(*TE.get())) { - OrderedEntries.insert(TE.get()); + if (TE->State != TreeEntry::Vectorize) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); - } } }); @@ -3392,9 +3551,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Check that every instruction appears once in this bundle. DenseMap<Value *, unsigned> UniquePositions; for (Value *V : VL) { + if (isConstant(V)) { + ReuseShuffleIndicies.emplace_back( + isa<UndefValue>(V) ? UndefMaskElem : UniqueValues.size()); + UniqueValues.emplace_back(V); + continue; + } auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); - ReuseShuffleIndicies.emplace_back(isa<UndefValue>(V) ? 
-1 - : Res.first->second); + ReuseShuffleIndicies.emplace_back(Res.first->second); if (Res.second) UniqueValues.emplace_back(V); } @@ -3404,6 +3568,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } else { LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); if (NumUniqueScalarValues <= 1 || + (UniquePositions.size() == 1 && all_of(UniqueValues, + [](Value *V) { + return isa<UndefValue>(V) || + !isConstant(V); + })) || !llvm::isPowerOf2_32(NumUniqueScalarValues)) { LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); @@ -3508,11 +3677,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } } - // If any of the scalars is marked as a value that needs to stay scalar, then - // we need to gather the scalars. // The reduction nodes (stored in UserIgnoreList) also should stay scalar. for (Value *V : VL) { - if (MustGather.count(V) || is_contained(UserIgnoreList, V)) { + if (is_contained(UserIgnoreList, V)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); if (TryToFindDuplicates(S)) newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, @@ -4219,10 +4386,17 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, SmallVectorImpl<unsigned> &CurrentOrder) const { - Instruction *E0 = cast<Instruction>(OpValue); - assert(E0->getOpcode() == Instruction::ExtractElement || - E0->getOpcode() == Instruction::ExtractValue); - assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode"); + const auto *It = find_if(VL, [](Value *V) { + return isa<ExtractElementInst, ExtractValueInst>(V); + }); + assert(It != VL.end() && "Expected at least one extract instruction."); + auto *E0 = cast<Instruction>(*It); + assert(all_of(VL, + [](Value *V) { + return isa<UndefValue, ExtractElementInst, ExtractValueInst>( + V); + }) && + "Invalid opcode"); // Check if all of the extracts come from the same vector and from the // correct offset. Value *Vec = E0->getOperand(0); @@ -4255,23 +4429,28 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, // Also, later we can check that all the indices are used and we have a // consecutive access in the extract instructions, by checking that no // element of CurrentOrder still has value E + 1. 
- CurrentOrder.assign(E, E + 1); + CurrentOrder.assign(E, E); unsigned I = 0; for (; I < E; ++I) { - auto *Inst = cast<Instruction>(VL[I]); + auto *Inst = dyn_cast<Instruction>(VL[I]); + if (!Inst) + continue; if (Inst->getOperand(0) != Vec) break; + if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) + if (isa<UndefValue>(EE->getIndexOperand())) + continue; Optional<unsigned> Idx = getExtractIndex(Inst); if (!Idx) break; const unsigned ExtIdx = *Idx; if (ExtIdx != I) { - if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1) + if (ExtIdx >= E || CurrentOrder[ExtIdx] != E) break; ShouldKeepOrder = false; CurrentOrder[ExtIdx] = I; } else { - if (CurrentOrder[I] != E + 1) + if (CurrentOrder[I] != E) break; CurrentOrder[I] = I; } @@ -4287,8 +4466,8 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, bool BoUpSLP::areAllUsersVectorized(Instruction *I, ArrayRef<Value *> VectorizedVals) const { return (I->hasOneUse() && is_contained(VectorizedVals, I)) || - llvm::all_of(I->users(), [this](User *U) { - return ScalarToTreeEntry.count(U) > 0; + all_of(I->users(), [this](User *U) { + return ScalarToTreeEntry.count(U) > 0 || MustGather.contains(U); }); } @@ -4348,6 +4527,10 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy, for (auto *V : VL) { ++Idx; + // Need to exclude undefs from analysis. + if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem) + continue; + // Reached the start of a new vector registers. if (Idx % EltsPerVector == 0) { AllConsecutive = true; @@ -4357,9 +4540,11 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy, // Check all extracts for a vector register on the target directly // extract values in order. unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V)); - unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1])); - AllConsecutive &= PrevIdx + 1 == CurrentIdx && - CurrentIdx % EltsPerVector == Idx % EltsPerVector; + if (!isa<UndefValue>(VL[Idx - 1]) && Mask[Idx - 1] != UndefMaskElem) { + unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1])); + AllConsecutive &= PrevIdx + 1 == CurrentIdx && + CurrentIdx % EltsPerVector == Idx % EltsPerVector; + } if (AllConsecutive) continue; @@ -4442,9 +4627,9 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // FIXME: it tries to fix a problem with MSVC buildbots. TargetTransformInfo &TTIRef = *TTI; auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL, VecTy, - VectorizedVals](InstructionCost &Cost, - bool IsGather) { + VectorizedVals, E](InstructionCost &Cost) { DenseMap<Value *, int> ExtractVectorsTys; + SmallPtrSet<Value *, 4> CheckedExtracts; for (auto *V : VL) { if (isa<UndefValue>(V)) continue; @@ -4452,7 +4637,12 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // instruction itself is not going to be vectorized, consider this // instruction as dead and remove its cost from the final cost of the // vectorized tree. - if (!areAllUsersVectorized(cast<Instruction>(V), VectorizedVals)) + // Also, avoid adjusting the cost for extractelements with multiple uses + // in different graph entries. 
+ const TreeEntry *VE = getTreeEntry(V); + if (!CheckedExtracts.insert(V).second || + !areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) || + (VE && VE != E)) continue; auto *EE = cast<ExtractElementInst>(V); Optional<unsigned> EEIdx = getExtractIndex(EE); @@ -4549,11 +4739,6 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, } return GatherCost; } - if (isSplat(VL)) { - // Found the broadcasting of the single scalar, calculate the cost as the - // broadcast. - return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy); - } if ((E->getOpcode() == Instruction::ExtractElement || all_of(E->Scalars, [](Value *V) { @@ -4571,13 +4756,20 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // single input vector or of 2 input vectors. InstructionCost Cost = computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI); - AdjustExtractsCost(Cost, /*IsGather=*/true); + AdjustExtractsCost(Cost); if (NeedToShuffleReuses) Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices); return Cost; } } + if (isSplat(VL)) { + // Found the broadcasting of the single scalar, calculate the cost as the + // broadcast. + assert(VecTy == FinalVecTy && + "No reused scalars expected for broadcast."); + return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy); + } InstructionCost ReuseShuffleCost = 0; if (NeedToShuffleReuses) ReuseShuffleCost = TTI->getShuffleCost( @@ -4755,7 +4947,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I); } } else { - AdjustExtractsCost(CommonCost, /*IsGather=*/false); + AdjustExtractsCost(CommonCost); } return CommonCost; } @@ -5211,15 +5403,15 @@ static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, FoundOr = true; } // Check if the input is an extended load of the required or/shift expression. - Value *LoadPtr; + Value *Load; if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root || - !match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr))))) + !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load)) return false; // Require that the total load bit width is a legal integer type. // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target. // But <16 x i8> --> i128 is not, so the backend probably can't reduce it. - Type *SrcTy = LoadPtr->getType()->getPointerElementType(); + Type *SrcTy = Load->getType(); unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts; if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth))) return false; @@ -9061,8 +9253,7 @@ private: "A call to the llvm.fmuladd intrinsic is not handled yet"); ++NumVectorInstructions; - return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind, - ReductionOps.back()); + return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind); } }; @@ -9473,6 +9664,59 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, return Changed; } +/// Compare two cmp instructions. If IsCompatibility is true, function returns +/// true if 2 cmps have same/swapped predicates and mos compatible corresponding +/// operands. If IsCompatibility is false, function implements strict weak +/// ordering relation between two cmp instructions, returning true if the first +/// instruction is "less" than the second, i.e. its predicate is less than the +/// predicate of the second or the operands IDs are less than the operands IDs +/// of the second cmp instruction. 
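The compareCmp template defined next plays two roles: instantiated with IsCompatibility=false it is a strict weak ordering usable for sorting, and with IsCompatibility=true it answers whether two compares may be grouped. A standalone sketch of that dual-role comparator pattern is below; Item and its fields are invented for the illustration and the real function compares types, predicates, and operands instead.

#include <algorithm>
#include <vector>

struct Item { int TypeId; int Pred; };

template <bool IsCompatibility>
static bool compareItems(const Item &A, const Item &B) {
  if (A.TypeId < B.TypeId)
    return !IsCompatibility; // "less" when ordering, "not compatible" when grouping
  if (A.TypeId > B.TypeId)
    return false;
  if (A.Pred < B.Pred)
    return !IsCompatibility;
  if (A.Pred > B.Pred)
    return false;
  return IsCompatibility;    // equal keys: not "less", but compatible
}

static void sortAndGroup(std::vector<Item> &Items) {
  std::stable_sort(Items.begin(), Items.end(),
                   compareItems</*IsCompatibility=*/false>);
  // After sorting, compatible items are adjacent; walk runs where
  // compareItems<true>(Items[I - 1], Items[I]) holds to form candidate groups,
  // mirroring how CompareSorter and AreCompatibleCompares are used further down.
}
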
+template <bool IsCompatibility> +static bool compareCmp(Value *V, Value *V2, + function_ref<bool(Instruction *)> IsDeleted) { + auto *CI1 = cast<CmpInst>(V); + auto *CI2 = cast<CmpInst>(V2); + if (IsDeleted(CI2) || !isValidElementType(CI2->getType())) + return false; + if (CI1->getOperand(0)->getType()->getTypeID() < + CI2->getOperand(0)->getType()->getTypeID()) + return !IsCompatibility; + if (CI1->getOperand(0)->getType()->getTypeID() > + CI2->getOperand(0)->getType()->getTypeID()) + return false; + CmpInst::Predicate Pred1 = CI1->getPredicate(); + CmpInst::Predicate Pred2 = CI2->getPredicate(); + CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1); + CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2); + CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1); + CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2); + if (BasePred1 < BasePred2) + return !IsCompatibility; + if (BasePred1 > BasePred2) + return false; + // Compare operands. + bool LEPreds = Pred1 <= Pred2; + bool GEPreds = Pred1 >= Pred2; + for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) { + auto *Op1 = CI1->getOperand(LEPreds ? I : E - I - 1); + auto *Op2 = CI2->getOperand(GEPreds ? I : E - I - 1); + if (Op1->getValueID() < Op2->getValueID()) + return !IsCompatibility; + if (Op1->getValueID() > Op2->getValueID()) + return false; + if (auto *I1 = dyn_cast<Instruction>(Op1)) + if (auto *I2 = dyn_cast<Instruction>(Op2)) { + if (I1->getParent() != I2->getParent()) + return false; + InstructionsState S = getSameOpcode({I1, I2}); + if (S.getOpcode()) + continue; + return false; + } + } + return IsCompatibility; +} + bool SLPVectorizerPass::vectorizeSimpleInstructions( SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R, bool AtTerminator) { @@ -9504,37 +9748,16 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions( } // Try to vectorize list of compares. // Sort by type, compare predicate, etc. - // TODO: Add analysis on the operand opcodes (profitable to vectorize - // instructions with same/alternate opcodes/const values). 
auto &&CompareSorter = [&R](Value *V, Value *V2) { - auto *CI1 = cast<CmpInst>(V); - auto *CI2 = cast<CmpInst>(V2); - if (R.isDeleted(CI2) || !isValidElementType(CI2->getType())) - return false; - if (CI1->getOperand(0)->getType()->getTypeID() < - CI2->getOperand(0)->getType()->getTypeID()) - return true; - if (CI1->getOperand(0)->getType()->getTypeID() > - CI2->getOperand(0)->getType()->getTypeID()) - return false; - return CI1->getPredicate() < CI2->getPredicate() || - (CI1->getPredicate() > CI2->getPredicate() && - CI1->getPredicate() < - CmpInst::getSwappedPredicate(CI2->getPredicate())); + return compareCmp<false>(V, V2, + [&R](Instruction *I) { return R.isDeleted(I); }); }; auto &&AreCompatibleCompares = [&R](Value *V1, Value *V2) { if (V1 == V2) return true; - auto *CI1 = cast<CmpInst>(V1); - auto *CI2 = cast<CmpInst>(V2); - if (R.isDeleted(CI2) || !isValidElementType(CI2->getType())) - return false; - if (CI1->getOperand(0)->getType() != CI2->getOperand(0)->getType()) - return false; - return CI1->getPredicate() == CI2->getPredicate() || - CI1->getPredicate() == - CmpInst::getSwappedPredicate(CI2->getPredicate()); + return compareCmp<true>(V1, V2, + [&R](Instruction *I) { return R.isDeleted(I); }); }; auto Limit = [&R](Value *V) { unsigned EltSize = R.getVectorElementSize(V); @@ -9592,10 +9815,15 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { return true; if (Opcodes1.size() > Opcodes2.size()) return false; + Optional<bool> ConstOrder; for (int I = 0, E = Opcodes1.size(); I < E; ++I) { // Undefs are compatible with any other value. - if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I])) + if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I])) { + if (!ConstOrder) + ConstOrder = + !isa<UndefValue>(Opcodes1[I]) && isa<UndefValue>(Opcodes2[I]); continue; + } if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I])) if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) { DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent()); @@ -9614,14 +9842,17 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { continue; return I1->getOpcode() < I2->getOpcode(); } - if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I])) + if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I])) { + if (!ConstOrder) + ConstOrder = Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID(); continue; + } if (Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID()) return true; if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID()) return false; } - return false; + return ConstOrder && *ConstOrder; }; auto AreCompatiblePHIs = [&PHIToOpcodes](Value *V1, Value *V2) { if (V1 == V2) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp index 44b5e1df0839..1d9e71663cd2 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -374,8 +374,7 @@ VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) { assert((SplitAt == end() || SplitAt->getParent() == this) && "can only split at a position in the same block"); - SmallVector<VPBlockBase *, 2> Succs(getSuccessors().begin(), - getSuccessors().end()); + SmallVector<VPBlockBase *, 2> Succs(successors()); // First, disconnect the current block from its successors. 
for (VPBlockBase *Succ : Succs) VPBlockUtils::disconnectBlocks(this, Succ); @@ -642,6 +641,7 @@ void VPRecipeBase::moveBefore(VPBasicBlock &BB, void VPInstruction::generateInstruction(VPTransformState &State, unsigned Part) { IRBuilder<> &Builder = State.Builder; + Builder.SetCurrentDebugLocation(DL); if (Instruction::isBinaryOp(getOpcode())) { Value *A = State.get(getOperand(0), Part); @@ -768,6 +768,11 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, O << " "; Operand->printAsOperand(O, SlotTracker); } + + if (DL) { + O << ", !dbg "; + DL.print(O); + } } #endif diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h index 810dd5030f95..f4a1883e35d5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h @@ -39,6 +39,7 @@ #include "llvm/ADT/ilist.h" #include "llvm/ADT/ilist_node.h" #include "llvm/Analysis/VectorUtils.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/IR/IRBuilder.h" #include "llvm/Support/InstructionCost.h" #include <algorithm> @@ -51,6 +52,7 @@ namespace llvm { class BasicBlock; class DominatorTree; +class InductionDescriptor; class InnerLoopVectorizer; class LoopInfo; class raw_ostream; @@ -500,6 +502,8 @@ public: const VPBlocksTy &getSuccessors() const { return Successors; } VPBlocksTy &getSuccessors() { return Successors; } + iterator_range<VPBlockBase **> successors() { return Successors; } + const VPBlocksTy &getPredecessors() const { return Predecessors; } VPBlocksTy &getPredecessors() { return Predecessors; } @@ -795,6 +799,7 @@ private: typedef unsigned char OpcodeTy; OpcodeTy Opcode; FastMathFlags FMF; + DebugLoc DL; /// Utility method serving execute(): generates a single instance of the /// modeled instruction. @@ -804,12 +809,14 @@ protected: void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); } public: - VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands) + VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL) : VPRecipeBase(VPRecipeBase::VPInstructionSC, Operands), - VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) {} + VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode), + DL(DL) {} - VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands) - : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands)) {} + VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands, + DebugLoc DL = {}) + : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL) {} /// Method to support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const VPValue *V) { @@ -818,7 +825,7 @@ public: VPInstruction *clone() const { SmallVector<VPValue *, 2> Operands(operands()); - return new VPInstruction(Opcode, Operands); + return new VPInstruction(Opcode, Operands, DL); } /// Method to support type inquiry through isa, cast, and dyn_cast. @@ -1003,21 +1010,22 @@ public: /// A recipe for handling phi nodes of integer and floating-point inductions, /// producing their vector and scalar values. 
-class VPWidenIntOrFpInductionRecipe : public VPRecipeBase { +class VPWidenIntOrFpInductionRecipe : public VPRecipeBase, public VPValue { PHINode *IV; + const InductionDescriptor &IndDesc; public: - VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, Instruction *Cast, - TruncInst *Trunc = nullptr) - : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), IV(IV) { - if (Trunc) - new VPValue(Trunc, this); - else - new VPValue(IV, this); + VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, + const InductionDescriptor &IndDesc) + : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), VPValue(IV, this), + IV(IV), IndDesc(IndDesc) {} + + VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, + const InductionDescriptor &IndDesc, + TruncInst *Trunc) + : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), VPValue(Trunc, this), + IV(IV), IndDesc(IndDesc) {} - if (Cast) - new VPValue(Cast, this); - } ~VPWidenIntOrFpInductionRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. @@ -1038,13 +1046,6 @@ public: /// Returns the start value of the induction. VPValue *getStartValue() { return getOperand(0); } - /// Returns the cast VPValue, if one is attached, or nullptr otherwise. - VPValue *getCastValue() { - if (getNumDefinedValues() != 2) - return nullptr; - return getVPValue(1); - } - /// Returns the first defined value as TruncInst, if it is one or nullptr /// otherwise. TruncInst *getTruncInst() { @@ -1053,6 +1054,9 @@ public: const TruncInst *getTruncInst() const { return dyn_cast_or_null<TruncInst>(getVPValue(0)->getUnderlyingValue()); } + + /// Returns the induction descriptor for the recipe. + const InductionDescriptor &getInductionDescriptor() const { return IndDesc; } }; /// A recipe for handling first order recurrences and pointer inductions. For @@ -1169,7 +1173,7 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPWidenPHIRecipe { /// operand. class VPReductionPHIRecipe : public VPWidenPHIRecipe { /// Descriptor for the reduction. - RecurrenceDescriptor &RdxDesc; + const RecurrenceDescriptor &RdxDesc; /// The phi is part of an in-loop reduction. bool IsInLoop; @@ -1180,7 +1184,7 @@ class VPReductionPHIRecipe : public VPWidenPHIRecipe { public: /// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p /// RdxDesc. - VPReductionPHIRecipe(PHINode *Phi, RecurrenceDescriptor &RdxDesc, + VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc, VPValue &Start, bool IsInLoop = false, bool IsOrdered = false) : VPWidenPHIRecipe(VPVReductionPHISC, VPReductionPHISC, Phi, &Start), @@ -1210,7 +1214,9 @@ public: VPSlotTracker &SlotTracker) const override; #endif - RecurrenceDescriptor &getRecurrenceDescriptor() { return RdxDesc; } + const RecurrenceDescriptor &getRecurrenceDescriptor() const { + return RdxDesc; + } /// Returns true, if the phi is part of an ordered reduction. bool isOrdered() const { return IsOrdered; } @@ -1340,13 +1346,13 @@ public: /// The Operands are {ChainOp, VecOp, [Condition]}. class VPReductionRecipe : public VPRecipeBase, public VPValue { /// The recurrence decriptor for the reduction in question. 
- RecurrenceDescriptor *RdxDesc; + const RecurrenceDescriptor *RdxDesc; /// Pointer to the TTI, needed to create the target reduction const TargetTransformInfo *TTI; public: - VPReductionRecipe(RecurrenceDescriptor *R, Instruction *I, VPValue *ChainOp, - VPValue *VecOp, VPValue *CondOp, + VPReductionRecipe(const RecurrenceDescriptor *R, Instruction *I, + VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp, const TargetTransformInfo *TTI) : VPRecipeBase(VPRecipeBase::VPReductionSC, {ChainOp, VecOp}), VPValue(VPValue::VPVReductionSC, I, this), RdxDesc(R), TTI(TTI) { @@ -2252,6 +2258,12 @@ public: return map_range(Operands, Fn); } + /// Returns true if \p VPV is uniform after vectorization. + bool isUniformAfterVectorization(VPValue *VPV) const { + auto RepR = dyn_cast_or_null<VPReplicateRecipe>(VPV->getDef()); + return !VPV->getDef() || (RepR && RepR->isUniform()); + } + private: /// Add to the given dominator tree the header block and every new basic block /// that was created between it and the latch block, inclusive. @@ -2340,18 +2352,23 @@ public: /// Insert disconnected VPBlockBase \p NewBlock after \p BlockPtr. Add \p /// NewBlock as successor of \p BlockPtr and \p BlockPtr as predecessor of \p - /// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. If \p BlockPtr - /// has more than one successor, its conditional bit is propagated to \p - /// NewBlock. \p NewBlock must have neither successors nor predecessors. + /// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. \p BlockPtr's + /// successors are moved from \p BlockPtr to \p NewBlock and \p BlockPtr's + /// conditional bit is propagated to \p NewBlock. \p NewBlock must have + /// neither successors nor predecessors. static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) { assert(NewBlock->getSuccessors().empty() && - "Can't insert new block with successors."); - // TODO: move successors from BlockPtr to NewBlock when this functionality - // is necessary. For now, setBlockSingleSuccessor will assert if BlockPtr - // already has successors. - BlockPtr->setOneSuccessor(NewBlock); - NewBlock->setPredecessors({BlockPtr}); + NewBlock->getPredecessors().empty() && + "Can't insert new block with predecessors or successors."); NewBlock->setParent(BlockPtr->getParent()); + SmallVector<VPBlockBase *> Succs(BlockPtr->successors()); + for (VPBlockBase *Succ : Succs) { + disconnectBlocks(BlockPtr, Succ); + connectBlocks(NewBlock, Succ); + } + NewBlock->setCondBit(BlockPtr->getCondBit()); + BlockPtr->setCondBit(nullptr); + connectBlocks(BlockPtr, NewBlock); } /// Insert disconnected VPBlockBases \p IfTrue and \p IfFalse after \p @@ -2394,6 +2411,31 @@ public: To->removePredecessor(From); } + /// Try to merge \p Block into its single predecessor, if \p Block is a + /// VPBasicBlock and its predecessor has a single successor. Returns a pointer + /// to the predecessor \p Block was merged into or nullptr otherwise. 
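A generic sketch of the merge that the VPlan utility below performs, using toy data structures rather than VPBlockBase/VPBasicBlock: when a block has a single predecessor and that predecessor has a single successor, the block's contents are appended to the predecessor and the block is spliced out of the CFG. The real implementation additionally moves recipes via moveBefore, keeps the enclosing region's exit pointer up to date, and rewires edges through VPBlockUtils.

#include <algorithm>
#include <vector>

struct Block {
  std::vector<int> Instructions;       // stand-in for the recipe list
  std::vector<Block *> Preds, Succs;
};

static Block *tryToMergeIntoPredecessor(Block *B) {
  if (B->Preds.size() != 1)
    return nullptr;
  Block *Pred = B->Preds.front();
  if (Pred->Succs.size() != 1)
    return nullptr;

  // Move the contents into the predecessor.
  Pred->Instructions.insert(Pred->Instructions.end(), B->Instructions.begin(),
                            B->Instructions.end());
  // Rewire: Pred inherits B's successors, B disappears from the CFG.
  Pred->Succs = B->Succs;
  for (Block *Succ : B->Succs)
    std::replace(Succ->Preds.begin(), Succ->Preds.end(), B, Pred);
  delete B; // assumes heap-allocated blocks, as in the sketch's usage
  return Pred;
}
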
+ static VPBasicBlock *tryToMergeBlockIntoPredecessor(VPBlockBase *Block) { + auto *VPBB = dyn_cast<VPBasicBlock>(Block); + auto *PredVPBB = + dyn_cast_or_null<VPBasicBlock>(Block->getSinglePredecessor()); + if (!VPBB || !PredVPBB || PredVPBB->getNumSuccessors() != 1) + return nullptr; + + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) + R.moveBefore(*PredVPBB, PredVPBB->end()); + VPBlockUtils::disconnectBlocks(PredVPBB, VPBB); + auto *ParentRegion = cast<VPRegionBlock>(Block->getParent()); + if (ParentRegion->getExit() == Block) + ParentRegion->setExit(PredVPBB); + SmallVector<VPBlockBase *> Successors(Block->successors()); + for (auto *Succ : Successors) { + VPBlockUtils::disconnectBlocks(Block, Succ); + VPBlockUtils::connectBlocks(PredVPBB, Succ); + } + delete Block; + return PredVPBB; + } + /// Returns true if the edge \p FromBlock -> \p ToBlock is a back-edge. static bool isBackEdge(const VPBlockBase *FromBlock, const VPBlockBase *ToBlock, const VPLoopInfo *VPLI) { diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp index ac3b3505dc34..86ecd6817873 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp @@ -50,14 +50,14 @@ VPValue *VPlanPredicator::getOrCreateNotPredicate(VPBasicBlock *PredBB, case EdgeType::FALSE_EDGE: // CurrBB is the False successor of PredBB - compute not of CBV. - IntermediateVal = Builder.createNot(CBV); + IntermediateVal = Builder.createNot(CBV, {}); break; } // Now AND intermediate value with PredBB's block predicate if it has one. VPValue *BP = PredBB->getPredicate(); if (BP) - return Builder.createAnd(BP, IntermediateVal); + return Builder.createAnd(BP, IntermediateVal, {}); else return IntermediateVal; } @@ -96,7 +96,7 @@ VPValue *VPlanPredicator::genPredicateTree(std::list<VPValue *> &Worklist) { Worklist.pop_front(); // Create an OR of these values. - VPValue *Or = Builder.createOr(LHS, RHS); + VPValue *Or = Builder.createOr(LHS, RHS, {}); // Push OR to the back of the worklist. 
Worklist.push_back(Or); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp index c52c8a2229e8..9e19e172dea5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp @@ -467,8 +467,9 @@ VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) { return markFailed(); assert(CombinedOperands.size() > 0 && "Need more some operands"); - auto *VPI = new VPInstruction(Opcode, CombinedOperands); - VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr()); + auto *Inst = cast<VPInstruction>(Values[0])->getUnderlyingInstr(); + auto *VPI = new VPInstruction(Opcode, CombinedOperands, Inst->getDebugLoc()); + VPI->setUnderlyingInstr(Inst); LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " " << *cast<VPInstruction>(Values[0]) << "\n"); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index ded5bc04beb5..d2daf558c2c5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -18,7 +18,8 @@ using namespace llvm; void VPlanTransforms::VPInstructionsToVPRecipes( Loop *OrigLoop, VPlanPtr &Plan, - LoopVectorizationLegality::InductionList &Inductions, + function_ref<const InductionDescriptor *(PHINode *)> + GetIntOrFpInductionDescriptor, SmallPtrSetImpl<Instruction *> &DeadInstructions, ScalarEvolution &SE) { auto *TopRegion = cast<VPRegionBlock>(Plan->getEntry()); @@ -44,11 +45,9 @@ void VPlanTransforms::VPInstructionsToVPRecipes( VPRecipeBase *NewRecipe = nullptr; if (auto *VPPhi = dyn_cast<VPWidenPHIRecipe>(&Ingredient)) { auto *Phi = cast<PHINode>(VPPhi->getUnderlyingValue()); - InductionDescriptor II = Inductions.lookup(Phi); - if (II.getKind() == InductionDescriptor::IK_IntInduction || - II.getKind() == InductionDescriptor::IK_FpInduction) { - VPValue *Start = Plan->getOrAddVPValue(II.getStartValue()); - NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, nullptr); + if (const auto *II = GetIntOrFpInductionDescriptor(Phi)) { + VPValue *Start = Plan->getOrAddVPValue(II->getStartValue()); + NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, *II); } else { Plan->addVPValue(Phi, VPPhi); continue; @@ -158,8 +157,7 @@ bool VPlanTransforms::sinkScalarOperands(VPlan &Plan) { // TODO: add ".cloned" suffix to name of Clone's VPValue. 
Clone->insertBefore(SinkCandidate); - SmallVector<VPUser *, 4> Users(SinkCandidate->user_begin(), - SinkCandidate->user_end()); + SmallVector<VPUser *, 4> Users(SinkCandidate->users()); for (auto *U : Users) { auto *UI = cast<VPRecipeBase>(U); if (UI->getParent() == SinkTo) @@ -266,8 +264,7 @@ bool VPlanTransforms::mergeReplicateRegions(VPlan &Plan) { VPValue *PredInst1 = cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0); VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue(); - SmallVector<VPUser *> Users(Phi1ToMoveV->user_begin(), - Phi1ToMoveV->user_end()); + SmallVector<VPUser *> Users(Phi1ToMoveV->users()); for (VPUser *U : Users) { auto *UI = dyn_cast<VPRecipeBase>(U); if (!UI || UI->getParent() != Then2) @@ -295,3 +292,35 @@ bool VPlanTransforms::mergeReplicateRegions(VPlan &Plan) { delete ToDelete; return Changed; } + +void VPlanTransforms::removeRedundantInductionCasts(VPlan &Plan) { + SmallVector<std::pair<VPRecipeBase *, VPValue *>> CastsToRemove; + for (auto &Phi : Plan.getEntry()->getEntryBasicBlock()->phis()) { + auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi); + if (!IV || IV->getTruncInst()) + continue; + + // Visit all casts connected to IV and in Casts. Collect them. + // remember them for removal. + auto &Casts = IV->getInductionDescriptor().getCastInsts(); + VPValue *FindMyCast = IV; + for (Instruction *IRCast : reverse(Casts)) { + VPRecipeBase *FoundUserCast = nullptr; + for (auto *U : FindMyCast->users()) { + auto *UserCast = cast<VPRecipeBase>(U); + if (UserCast->getNumDefinedValues() == 1 && + UserCast->getVPSingleValue()->getUnderlyingValue() == IRCast) { + FoundUserCast = UserCast; + break; + } + } + assert(FoundUserCast && "Missing a cast to remove"); + CastsToRemove.emplace_back(FoundUserCast, IV); + FindMyCast = FoundUserCast->getVPSingleValue(); + } + } + for (auto &E : CastsToRemove) { + E.first->getVPSingleValue()->replaceAllUsesWith(E.second); + E.first->eraseFromParent(); + } +} diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index c740f2c022da..a82a562d5e35 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -14,24 +14,37 @@ #define LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H #include "VPlan.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" namespace llvm { +class InductionDescriptor; class Instruction; +class PHINode; class ScalarEvolution; struct VPlanTransforms { /// Replaces the VPInstructions in \p Plan with corresponding /// widen recipes. - static void VPInstructionsToVPRecipes( - Loop *OrigLoop, VPlanPtr &Plan, - LoopVectorizationLegality::InductionList &Inductions, - SmallPtrSetImpl<Instruction *> &DeadInstructions, ScalarEvolution &SE); + static void + VPInstructionsToVPRecipes(Loop *OrigLoop, VPlanPtr &Plan, + function_ref<const InductionDescriptor *(PHINode *)> + GetIntOrFpInductionDescriptor, + SmallPtrSetImpl<Instruction *> &DeadInstructions, + ScalarEvolution &SE); static bool sinkScalarOperands(VPlan &Plan); static bool mergeReplicateRegions(VPlan &Plan); + + /// Remove redundant casts of inductions. + /// + /// Such redundant casts are casts of induction variables that can be ignored, + /// because we already proved that the casted phi is equal to the uncasted phi + /// in the vectorized loop. 
There is no need to vectorize the cast - the same + /// value can be used for both the phi and casts in the vector loop. + static void removeRedundantInductionCasts(VPlan &Plan); }; } // namespace llvm diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 6d6ea4eb30f1..7732d9367985 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -156,5 +156,31 @@ bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) { RecipeI++; } } + + const VPRegionBlock *TopRegion = cast<VPRegionBlock>(Plan.getEntry()); + const VPBasicBlock *Entry = dyn_cast<VPBasicBlock>(TopRegion->getEntry()); + if (!Entry) { + errs() << "VPlan entry block is not a VPBasicBlock\n"; + return false; + } + const VPBasicBlock *Exit = dyn_cast<VPBasicBlock>(TopRegion->getExit()); + if (!Exit) { + errs() << "VPlan exit block is not a VPBasicBlock\n"; + return false; + } + + for (const VPRegionBlock *Region : + VPBlockUtils::blocksOnly<const VPRegionBlock>( + depth_first(VPBlockRecursiveTraversalWrapper<const VPBlockBase *>( + Plan.getEntry())))) { + if (Region->getEntry()->getNumPredecessors() != 0) { + errs() << "region entry block has predecessors\n"; + return false; + } + if (Region->getExit()->getNumSuccessors() != 0) { + errs() << "region exit block has successors\n"; + return false; + } + } return true; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 57b11e9414ba..c0aedab2fed0 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -989,9 +989,9 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { if (!FixedVT) return false; - InstructionCost OriginalCost = TTI.getMemoryOpCost( - Instruction::Load, LI->getType(), Align(LI->getAlignment()), - LI->getPointerAddressSpace()); + InstructionCost OriginalCost = + TTI.getMemoryOpCost(Instruction::Load, LI->getType(), LI->getAlign(), + LI->getPointerAddressSpace()); InstructionCost ScalarizedCost = 0; Instruction *LastCheckedInst = LI; |
