| author | Dimitry Andric <dim@FreeBSD.org> | 2023-07-26 19:03:47 +0000 |
| committer | Dimitry Andric <dim@FreeBSD.org> | 2023-07-26 19:04:23 +0000 |
| commit | 7fa27ce4a07f19b07799a767fc29416f3b625afb (patch) | |
| tree | 27825c83636c4de341eb09a74f49f5d38a15d165 /llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | |
| parent | e3b557809604d036af6e00c60f012c2025b59a5e (diff) | |
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2346 |
1 file changed, 1181 insertions, 1165 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index a28099d8ba7d..d7e40e8ef978 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -98,6 +98,7 @@ #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfo.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" @@ -120,8 +121,6 @@ #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/IR/Verifier.h" -#include "llvm/InitializePasses.h" -#include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" @@ -231,6 +230,25 @@ static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue( "prefers tail-folding, don't attempt vectorization if " "tail-folding fails."))); +static cl::opt<TailFoldingStyle> ForceTailFoldingStyle( + "force-tail-folding-style", cl::desc("Force the tail folding style"), + cl::init(TailFoldingStyle::None), + cl::values( + clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"), + clEnumValN( + TailFoldingStyle::Data, "data", + "Create lane mask for data only, using active.lane.mask intrinsic"), + clEnumValN(TailFoldingStyle::DataWithoutLaneMask, + "data-without-lane-mask", + "Create lane mask with compare/stepvector"), + clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control", + "Create lane mask using active.lane.mask intrinsic, and use " + "it for both data and control flow"), + clEnumValN( + TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, + "data-and-control-without-rt-check", + "Similar to data-and-control, but remove the runtime check"))); + static cl::opt<bool> MaximizeBandwidth( "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " @@ -338,10 +356,12 @@ static cl::opt<bool> PreferPredicatedReductionSelect( cl::desc( "Prefer predicating a reduction operation over an after loop select.")); +namespace llvm { cl::opt<bool> EnableVPlanNativePath( - "enable-vplan-native-path", cl::init(false), cl::Hidden, + "enable-vplan-native-path", cl::Hidden, cl::desc("Enable VPlan-native vectorization path with " "support for outer loop vectorization.")); +} // This flag enables the stress testing of the VPlan H-CFG construction in the // VPlan-native vectorization path. It must be used in conjuction with @@ -419,9 +439,42 @@ static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, return std::nullopt; } +/// Return a vector containing interleaved elements from multiple +/// smaller input vectors. +static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, + const Twine &Name) { + unsigned Factor = Vals.size(); + assert(Factor > 1 && "Tried to interleave invalid number of vectors"); + + VectorType *VecTy = cast<VectorType>(Vals[0]->getType()); +#ifndef NDEBUG + for (Value *Val : Vals) + assert(Val->getType() == VecTy && "Tried to interleave mismatched types"); +#endif + + // Scalable vectors cannot use arbitrary shufflevectors (only splats), so + // must use intrinsics to interleave. + if (VecTy->isScalableTy()) { + VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy); + return Builder.CreateIntrinsic( + WideVecTy, Intrinsic::experimental_vector_interleave2, Vals, + /*FMFSource=*/nullptr, Name); + } + + // Fixed length. 
Start by concatenating all vectors into a wide vector. + Value *WideVec = concatenateVectors(Builder, Vals); + + // Interleave the elements into the wide vector. + const unsigned NumElts = VecTy->getElementCount().getFixedValue(); + return Builder.CreateShuffleVector( + WideVec, createInterleaveMask(NumElts, Factor), Name); +} + namespace { // Forward declare GeneratedRTChecks. class GeneratedRTChecks; + +using SCEV2ValueTy = DenseMap<const SCEV *, Value *>; } // namespace namespace llvm { @@ -477,8 +530,10 @@ public: /// loop and the start value for the canonical induction, if it is != 0. The /// latter is the case when vectorizing the epilogue loop. In the case of /// epilogue vectorization, this function is overriden to handle the more - /// complex control flow around the loops. - virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(); + /// complex control flow around the loops. \p ExpandedSCEVs is used to + /// look up SCEV expansions for expressions needed during skeleton creation. + virtual std::pair<BasicBlock *, Value *> + createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs); /// Fix the vectorized code, taking care of header phi's, live-outs, and more. void fixVectorizedLoop(VPTransformState &State, VPlan &Plan); @@ -498,7 +553,7 @@ public: /// Instr's operands. void scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, - const VPIteration &Instance, bool IfPredicateInstr, + const VPIteration &Instance, VPTransformState &State); /// Construct the vector value of a scalarized value \p V one lane at a time. @@ -513,7 +568,7 @@ public: ArrayRef<VPValue *> VPDefs, VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, - VPValue *BlockInMask = nullptr); + VPValue *BlockInMask, bool NeedsMaskForGaps); /// Fix the non-induction PHIs in \p Plan. void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State); @@ -522,28 +577,30 @@ public: /// able to vectorize with strict in-order reductions for the given RdxDesc. bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc); - /// Create a broadcast instruction. This method generates a broadcast - /// instruction (shuffle) for loop invariant values and for the induction - /// value. If this is the induction variable then we extend it to N, N+1, ... - /// this is needed because each iteration in the loop corresponds to a SIMD - /// element. - virtual Value *getBroadcastInstrs(Value *V); - // Returns the resume value (bc.merge.rdx) for a reduction as // generated by fixReduction. PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc); /// Create a new phi node for the induction variable \p OrigPhi to resume /// iteration count in the scalar epilogue, from where the vectorized loop - /// left off. In cases where the loop skeleton is more complicated (eg. - /// epilogue vectorization) and the resume values can come from an additional - /// bypass block, the \p AdditionalBypass pair provides information about the - /// bypass block and the end value on the edge from bypass to this loop. + /// left off. \p Step is the SCEV-expanded induction step to use. In cases + /// where the loop skeleton is more complicated (i.e., epilogue vectorization) + /// and the resume values can come from an additional bypass block, the \p + /// AdditionalBypass pair provides information about the bypass block and the + /// end value on the edge from bypass to this loop. 
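For the fixed-length path of the new interleaveVectors helper above, the interleaving is a single shuffle over the concatenated inputs. A standalone sketch of that mask pattern (makeInterleaveMask is a hypothetical stand-in approximating createInterleaveMask, not LLVM API):

```c++
#include <cstdio>
#include <vector>

// Hypothetical stand-in for the interleave mask: result element i selects lane
// (i % Factor) * VF + (i / Factor) of the concatenated wide vector, i.e.
// <0, VF, 1, VF+1, 2, VF+2, ...> for Factor == 2.
static std::vector<int> makeInterleaveMask(unsigned VF, unsigned Factor) {
  std::vector<int> Mask;
  for (unsigned I = 0; I < VF; ++I)
    for (unsigned J = 0; J < Factor; ++J)
      Mask.push_back(J * VF + I);
  return Mask;
}

int main() {
  // Two 4-element "vectors" A = {0..3}, B = {100..103}; Wide is A ++ B.
  std::vector<int> Wide = {0, 1, 2, 3, 100, 101, 102, 103};
  for (int Idx : makeInterleaveMask(/*VF=*/4, /*Factor=*/2))
    std::printf("%d ", Wide[Idx]); // 0 100 1 101 2 102 3 103
  std::printf("\n");
}
```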
PHINode *createInductionResumeValue( - PHINode *OrigPhi, const InductionDescriptor &ID, + PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step, ArrayRef<BasicBlock *> BypassBlocks, std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); + /// Returns the original loop trip count. + Value *getTripCount() const { return TripCount; } + + /// Used to set the trip count after ILV's construction and after the + /// preheader block has been executed. Note that this always holds the trip + /// count of the original loop for both main loop and epilogue vectorization. + void setTripCount(Value *TC) { TripCount = TC; } + protected: friend class LoopVectorizationPlanner; @@ -560,7 +617,7 @@ protected: void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, Value *VectorTripCount, Value *EndValue, BasicBlock *MiddleBlock, BasicBlock *VectorHeader, - VPlan &Plan); + VPlan &Plan, VPTransformState &State); /// Handle all cross-iteration phis in the header. void fixCrossIterationPHIs(VPTransformState &State); @@ -573,10 +630,6 @@ protected: /// Create code for the loop exit value of the reduction. void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State); - /// Clear NSW/NUW flags from reduction instructions if necessary. - void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR, - VPTransformState &State); - /// Iteratively sink the scalarized operands of a predicated instruction into /// the block that was created for it. void sinkScalarOperands(Instruction *PredInst); @@ -585,9 +638,6 @@ protected: /// represented as. void truncateToMinimalBitwidths(VPTransformState &State); - /// Returns (and creates if needed) the original loop trip count. - Value *getOrCreateTripCount(BasicBlock *InsertBlock); - /// Returns (and creates if needed) the trip count of the widened loop. Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock); @@ -621,6 +671,7 @@ protected: /// block, the \p AdditionalBypass pair provides information about the bypass /// block and the end value on the edge from bypass to this loop. void createInductionResumeValues( + const SCEV2ValueTy &ExpandedSCEVs, std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); /// Complete the loop skeleton by adding debug MDs, creating appropriate @@ -758,9 +809,6 @@ public: ElementCount::getFixed(1), ElementCount::getFixed(1), UnrollFactor, LVL, CM, BFI, PSI, Check) {} - -private: - Value *getBroadcastInstrs(Value *V) override; }; /// Encapsulate information regarding vectorization of a loop and its epilogue. @@ -810,15 +858,16 @@ public: // Override this function to handle the more complex control flow around the // three loops. - std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton() final { - return createEpilogueVectorizedLoopSkeleton(); + std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton( + const SCEV2ValueTy &ExpandedSCEVs) final { + return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs); } /// The interface for creating a vectorized skeleton using one of two /// different strategies, each corresponding to one execution of the vplan /// as described above. virtual std::pair<BasicBlock *, Value *> - createEpilogueVectorizedLoopSkeleton() = 0; + createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0; /// Holds and updates state information required to vectorize the main loop /// and its epilogue in two separate passes. 
This setup helps us avoid @@ -846,7 +895,8 @@ public: EPI, LVL, CM, BFI, PSI, Check) {} /// Implements the interface for creating a vectorized skeleton using the /// *main loop* strategy (ie the first pass of vplan execution). - std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final; + std::pair<BasicBlock *, Value *> + createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final; protected: /// Emits an iteration count bypass check once for the main loop (when \p @@ -876,7 +926,8 @@ public: } /// Implements the interface for creating a vectorized skeleton using the /// *epilogue loop* strategy (ie the second pass of vplan execution). - std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final; + std::pair<BasicBlock *, Value *> + createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final; protected: /// Emits an iteration count bypass check after the main vector loop has @@ -953,35 +1004,21 @@ namespace llvm { Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step) { assert(Ty->isIntegerTy() && "Expected an integer step"); - Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue()); - return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal; + return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step)); } /// Return the runtime value for VF. Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) { - Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue()); - return VF.isScalable() ? B.CreateVScale(EC) : EC; + return B.CreateElementCount(Ty, VF); } -const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE) { +const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE, + Loop *OrigLoop) { const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count"); ScalarEvolution &SE = *PSE.getSE(); - - // The exit count might have the type of i64 while the phi is i32. This can - // happen if we have an induction variable that is sign extended before the - // compare. The only way that we get a backedge taken count is that the - // induction variable was signed and as such will not overflow. In such a case - // truncation is legal. - if (SE.getTypeSizeInBits(BackedgeTakenCount->getType()) > - IdxTy->getPrimitiveSizeInBits()) - BackedgeTakenCount = SE.getTruncateOrNoop(BackedgeTakenCount, IdxTy); - BackedgeTakenCount = SE.getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); - - // Get the total trip count from the count by adding 1. - return SE.getAddExpr(BackedgeTakenCount, - SE.getOne(BackedgeTakenCount->getType())); + return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop); } static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy, @@ -1062,11 +1099,17 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes( continue; // This recipe contributes to the address computation of a widen - // load/store. Collect recipe if its underlying instruction has - // poison-generating flags. - Instruction *Instr = CurRec->getUnderlyingInstr(); - if (Instr && Instr->hasPoisonGeneratingFlags()) - State.MayGeneratePoisonRecipes.insert(CurRec); + // load/store. If the underlying instruction has poison-generating flags, + // drop them directly. 
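The replaced block above computed the trip count by hand from the backedge-taken count before delegating to SE.getTripCountFromExitCount. A minimal scalar sketch of that arithmetic, not taken from the patch, assuming a 32-bit induction type:

```c++
#include <cstdint>
#include <cstdio>

// The trip count is the backedge-taken count plus one, evaluated in the
// (possibly narrower) induction type. Truncating to 32 bits is only legal when
// the analysis has already proven the count fits, as the deleted comment above
// explains for sign-extended induction variables.
static uint32_t tripCountFromBackedgeTakenCount(uint64_t BackedgeTakenCount) {
  return static_cast<uint32_t>(BackedgeTakenCount) + 1u;
}

int main() {
  // A backedge taken 99 times means the loop body runs 100 times.
  std::printf("%u\n", (unsigned)tripCountFromBackedgeTakenCount(99)); // 100
}
```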
+ if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) { + RecWithFlags->dropPoisonGeneratingFlags(); + } else { + Instruction *Instr = CurRec->getUnderlyingInstr(); + (void)Instr; + assert((!Instr || !Instr->hasPoisonGeneratingFlags()) && + "found instruction with poison generating flags not covered by " + "VPRecipeWithIRFlags"); + } // Add new definitions to the worklist. for (VPValue *operand : CurRec->operands()) @@ -1143,15 +1186,7 @@ enum ScalarEpilogueLowering { CM_ScalarEpilogueNotAllowedUsePredicate }; -/// ElementCountComparator creates a total ordering for ElementCount -/// for the purposes of using it in a set structure. -struct ElementCountComparator { - bool operator()(const ElementCount &LHS, const ElementCount &RHS) const { - return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) < - std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue()); - } -}; -using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>; +using InstructionVFPair = std::pair<Instruction *, ElementCount>; /// LoopVectorizationCostModel - estimates the expected speedups due to /// vectorization. @@ -1184,17 +1219,6 @@ public: /// otherwise. bool runtimeChecksRequired(); - /// \return The most profitable vectorization factor and the cost of that VF. - /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO - /// then this vectorization factor will be selected if vectorization is - /// possible. - VectorizationFactor - selectVectorizationFactor(const ElementCountSet &CandidateVFs); - - VectorizationFactor - selectEpilogueVectorizationFactor(const ElementCount MaxVF, - const LoopVectorizationPlanner &LVP); - /// Setup cost-based decisions for user vectorization factor. /// \return true if the UserVF is a feasible VF to be chosen. bool selectUserVectorizationFactor(ElementCount UserVF) { @@ -1278,11 +1302,17 @@ public: auto Scalars = InstsToScalarize.find(VF); assert(Scalars != InstsToScalarize.end() && "VF not yet analyzed for scalarization profitability"); - return Scalars->second.find(I) != Scalars->second.end(); + return Scalars->second.contains(I); } /// Returns true if \p I is known to be uniform after vectorization. bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { + // Pseudo probe needs to be duplicated for each unrolled iteration and + // vector lane so that profiled loop trip count can be accurately + // accumulated instead of being under counted. + if (isa<PseudoProbeInst>(I)) + return false; + if (VF.isScalar()) return true; @@ -1316,7 +1346,7 @@ public: /// \returns True if instruction \p I can be truncated to a smaller bitwidth /// for vectorization factor \p VF. bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { - return VF.isVector() && MinBWs.find(I) != MinBWs.end() && + return VF.isVector() && MinBWs.contains(I) && !isProfitableToScalarize(I, VF) && !isScalarAfterVectorization(I, VF); } @@ -1379,7 +1409,7 @@ public: InstructionCost getWideningCost(Instruction *I, ElementCount VF) { assert(VF.isVector() && "Expected VF >=2"); std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); - assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && + assert(WideningDecisions.contains(InstOnVF) && "The cost is not calculated"); return WideningDecisions[InstOnVF].second; } @@ -1419,7 +1449,7 @@ public: /// that may be vectorized as interleave, gather-scatter or scalarized. void collectUniformsAndScalars(ElementCount VF) { // Do the analysis once. 
- if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) + if (VF.isScalar() || Uniforms.contains(VF)) return; setCostBasedWideningDecision(VF); collectLoopUniforms(VF); @@ -1442,8 +1472,7 @@ public: /// Returns true if the target machine can represent \p V as a masked gather /// or scatter operation. - bool isLegalGatherOrScatter(Value *V, - ElementCount VF = ElementCount::getFixed(1)) { + bool isLegalGatherOrScatter(Value *V, ElementCount VF) { bool LI = isa<LoadInst>(V); bool SI = isa<StoreInst>(V); if (!LI && !SI) @@ -1522,14 +1551,29 @@ public: /// Returns true if we're required to use a scalar epilogue for at least /// the final iteration of the original loop. - bool requiresScalarEpilogue(ElementCount VF) const { + bool requiresScalarEpilogue(bool IsVectorizing) const { if (!isScalarEpilogueAllowed()) return false; // If we might exit from anywhere but the latch, must run the exiting // iteration in scalar form. if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) return true; - return VF.isVector() && InterleaveInfo.requiresScalarEpilogue(); + return IsVectorizing && InterleaveInfo.requiresScalarEpilogue(); + } + + /// Returns true if we're required to use a scalar epilogue for at least + /// the final iteration of the original loop for all VFs in \p Range. + /// A scalar epilogue must either be required for all VFs in \p Range or for + /// none. + bool requiresScalarEpilogue(VFRange Range) const { + auto RequiresScalarEpilogue = [this](ElementCount VF) { + return requiresScalarEpilogue(VF.isVector()); + }; + bool IsRequired = all_of(Range, RequiresScalarEpilogue); + assert( + (IsRequired || none_of(Range, RequiresScalarEpilogue)) && + "all VFs in range must agree on whether a scalar epilogue is required"); + return IsRequired; } /// Returns true if a scalar epilogue is not allowed due to optsize or a @@ -1538,14 +1582,21 @@ public: return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; } - /// Returns true if all loop blocks should be masked to fold tail loop. - bool foldTailByMasking() const { return FoldTailByMasking; } + /// Returns the TailFoldingStyle that is best for the current loop. + TailFoldingStyle + getTailFoldingStyle(bool IVUpdateMayOverflow = true) const { + if (!CanFoldTailByMasking) + return TailFoldingStyle::None; + + if (ForceTailFoldingStyle.getNumOccurrences()) + return ForceTailFoldingStyle; + + return TTI.getPreferredTailFoldingStyle(IVUpdateMayOverflow); + } - /// Returns true if were tail-folding and want to use the active lane mask - /// for vector loop control flow. - bool useActiveLaneMaskForControlFlow() const { - return FoldTailByMasking && - TTI.emitGetActiveLaneMask() == PredicationStyle::DataAndControlFlow; + /// Returns true if all loop blocks should be masked to fold tail loop. + bool foldTailByMasking() const { + return getTailFoldingStyle() != TailFoldingStyle::None; } /// Returns true if the instructions in this block requires predication @@ -1582,12 +1633,8 @@ public: /// scalarized - /// i.e. either vector version isn't available, or is too expensive. InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, - bool &NeedToScalarize) const; - - /// Returns true if the per-lane cost of VectorizationFactor A is lower than - /// that of B. - bool isMoreProfitable(const VectorizationFactor &A, - const VectorizationFactor &B) const; + Function **Variant, + bool *NeedsMask = nullptr) const; /// Invalidates decisions already taken by the cost model. 
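As a rough illustration of what the tail-folding styles introduced above buy: with tail folding the vector loop itself covers the final partial group of iterations, using a per-lane predicate instead of a scalar remainder loop. A scalar model of the "data"-style lane mask (all names hypothetical, not from the patch):

```c++
#include <cstdio>

// Scalar model of data-style tail folding: every "vector" iteration advances a
// full VF-wide step, and a lane mask (base + lane < trip count) disables the
// lanes past the end, playing the role of the active.lane.mask intrinsic.
int main() {
  const int TC = 10; // trip count of the original loop
  const int VF = 4;  // vectorization factor
  int Sum = 0;
  for (int Base = 0; Base < TC; Base += VF) {   // vector loop, no scalar tail
    for (int Lane = 0; Lane < VF; ++Lane) {
      bool Active = Base + Lane < TC;           // per-lane predicate
      if (Active)
        Sum += Base + Lane;                     // masked "vector" operation
    }
  }
  std::printf("%d\n", Sum); // 45, same result as the scalar loop over 0..9
}
```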
void invalidateCostModelingDecisions() { @@ -1596,10 +1643,29 @@ public: Scalars.clear(); } - /// Convenience function that returns the value of vscale_range iff - /// vscale_range.min == vscale_range.max or otherwise returns the value - /// returned by the corresponding TLI method. - std::optional<unsigned> getVScaleForTuning() const; + /// The vectorization cost is a combination of the cost itself and a boolean + /// indicating whether any of the contributing operations will actually + /// operate on vector values after type legalization in the backend. If this + /// latter value is false, then all operations will be scalarized (i.e. no + /// vectorization has actually taken place). + using VectorizationCostTy = std::pair<InstructionCost, bool>; + + /// Returns the expected execution cost. The unit of the cost does + /// not matter because we use the 'cost' units to compare different + /// vector widths. The cost that is returned is *not* normalized by + /// the factor width. If \p Invalid is not nullptr, this function + /// will add a pair(Instruction*, ElementCount) to \p Invalid for + /// each instruction that has an Invalid cost for the given VF. + VectorizationCostTy + expectedCost(ElementCount VF, + SmallVectorImpl<InstructionVFPair> *Invalid = nullptr); + + bool hasPredStores() const { return NumPredStores > 0; } + + /// Returns true if epilogue vectorization is considered profitable, and + /// false otherwise. + /// \p VF is the vectorization factor chosen for the original loop. + bool isEpilogueVectorizationProfitable(const ElementCount VF) const; private: unsigned NumPredStores = 0; @@ -1626,24 +1692,6 @@ private: /// of elements. ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); - /// The vectorization cost is a combination of the cost itself and a boolean - /// indicating whether any of the contributing operations will actually - /// operate on vector values after type legalization in the backend. If this - /// latter value is false, then all operations will be scalarized (i.e. no - /// vectorization has actually taken place). - using VectorizationCostTy = std::pair<InstructionCost, bool>; - - /// Returns the expected execution cost. The unit of the cost does - /// not matter because we use the 'cost' units to compare different - /// vector widths. The cost that is returned is *not* normalized by - /// the factor width. If \p Invalid is not nullptr, this function - /// will add a pair(Instruction*, ElementCount) to \p Invalid for - /// each instruction that has an Invalid cost for the given VF. - using InstructionVFPair = std::pair<Instruction *, ElementCount>; - VectorizationCostTy - expectedCost(ElementCount VF, - SmallVectorImpl<InstructionVFPair> *Invalid = nullptr); - /// Returns the execution time cost of an instruction for a given vector /// width. Vector width of one means scalar. VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); @@ -1715,7 +1763,7 @@ private: ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; /// All blocks of loop are to be masked to fold tail of scalar iterations. - bool FoldTailByMasking = false; + bool CanFoldTailByMasking = false; /// A map holding scalar costs for different vectorization factors. The /// presence of a cost for an instruction in the mapping indicates that the @@ -1796,8 +1844,7 @@ private: // the scalars are collected. That should be a safe assumption in most // cases, because we check if the operands have vectorizable types // beforehand in LoopVectorizationLegality. 
- return Scalars.find(VF) == Scalars.end() || - !isScalarAfterVectorization(I, VF); + return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF); }; /// Returns a range containing only operands needing to be extracted. @@ -1807,16 +1854,6 @@ private: Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); } - /// Determines if we have the infrastructure to vectorize loop \p L and its - /// epilogue, assuming the main loop is vectorized by \p VF. - bool isCandidateForEpilogueVectorization(const Loop &L, - const ElementCount VF) const; - - /// Returns true if epilogue vectorization is considered profitable, and - /// false otherwise. - /// \p VF is the vectorization factor chosen for the original loop. - bool isEpilogueVectorizationProfitable(const ElementCount VF) const; - public: /// The loop that we evaluate. Loop *TheLoop; @@ -1862,9 +1899,6 @@ public: /// All element types found in the loop. SmallPtrSet<Type *, 16> ElementTypesInLoop; - - /// Profitable vector factors. - SmallVector<VectorizationFactor, 8> ProfitableVFs; }; } // end namespace llvm @@ -2135,6 +2169,17 @@ public: }; } // namespace +static bool useActiveLaneMask(TailFoldingStyle Style) { + return Style == TailFoldingStyle::Data || + Style == TailFoldingStyle::DataAndControlFlow || + Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; +} + +static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) { + return Style == TailFoldingStyle::DataAndControlFlow || + Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; +} + // Return true if \p OuterLp is an outer loop annotated with hints for explicit // vectorization. The loop needs to be annotated with #pragma omp simd // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the @@ -2202,97 +2247,11 @@ static void collectSupportedLoops(Loop &L, LoopInfo *LI, collectSupportedLoops(*InnerL, LI, ORE, V); } -namespace { - -/// The LoopVectorize Pass. -struct LoopVectorize : public FunctionPass { - /// Pass identification, replacement for typeid - static char ID; - - LoopVectorizePass Impl; - - explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, - bool VectorizeOnlyWhenForced = false) - : FunctionPass(ID), - Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { - initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); - } - - bool runOnFunction(Function &F) override { - if (skipFunction(F)) - return false; - - auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); - auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); - auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); - auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); - auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; - auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - auto &LAIs = getAnalysis<LoopAccessLegacyAnalysis>().getLAIs(); - auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); - auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); - auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); - - return Impl - .runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AC, LAIs, *ORE, PSI) - .MadeAnyChange; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<BlockFrequencyInfoWrapperPass>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.addRequired<ScalarEvolutionWrapperPass>(); - AU.addRequired<TargetTransformInfoWrapperPass>(); - AU.addRequired<LoopAccessLegacyAnalysis>(); - AU.addRequired<DemandedBitsWrapperPass>(); - AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); - AU.addRequired<InjectTLIMappingsLegacy>(); - - // We currently do not preserve loopinfo/dominator analyses with outer loop - // vectorization. Until this is addressed, mark these analyses as preserved - // only for non-VPlan-native path. - // TODO: Preserve Loop and Dominator analyses for VPlan-native path. - if (!EnableVPlanNativePath) { - AU.addPreserved<LoopInfoWrapperPass>(); - AU.addPreserved<DominatorTreeWrapperPass>(); - } - - AU.addPreserved<BasicAAWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); - AU.addRequired<ProfileSummaryInfoWrapperPass>(); - } -}; - -} // end anonymous namespace - //===----------------------------------------------------------------------===// // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and // LoopVectorizationCostModel and LoopVectorizationPlanner. //===----------------------------------------------------------------------===// -Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { - // We need to place the broadcast of invariant variables outside the loop, - // but only if it's proven safe to do so. Else, broadcast will be inside - // vector loop body. - Instruction *Instr = dyn_cast<Instruction>(V); - bool SafeToHoist = OrigLoop->isLoopInvariant(V) && - (!Instr || - DT->dominates(Instr->getParent(), LoopVectorPreHeader)); - // Place the code for broadcasting invariant variables in the new preheader. - IRBuilder<>::InsertPointGuard Guard(Builder); - if (SafeToHoist) - Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); - - // Broadcast the scalar into all locations in the vector. - Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); - - return Shuf; -} - /// This function adds /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) /// to each vector element of Val. The sequence starts at StartIndex. @@ -2435,21 +2394,6 @@ static void buildScalarSteps(Value *ScalarIV, Value *Step, } } -// Generate code for the induction step. 
Note that induction steps are -// required to be loop-invariant -static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE, - Instruction *InsertBefore, - Loop *OrigLoop = nullptr) { - const DataLayout &DL = SE.getDataLayout(); - assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) && - "Induction step should be loop invariant"); - if (auto *E = dyn_cast<SCEVUnknown>(Step)) - return E->getValue(); - - SCEVExpander Exp(SE, DL, "induction"); - return Exp.expandCodeFor(Step, Step->getType(), InsertBefore); -} - /// Compute the transformed value of Index at offset StartValue using step /// StepValue. /// For integer induction, returns StartValue + Index * StepValue. @@ -2514,9 +2458,7 @@ static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, return CreateAdd(StartValue, Offset); } case InductionDescriptor::IK_PtrInduction: { - assert(isa<Constant>(Step) && - "Expected constant step for pointer induction"); - return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step)); + return B.CreateGEP(B.getInt8Ty(), StartValue, CreateMul(Index, Step)); } case InductionDescriptor::IK_FpInduction: { assert(!isa<VectorType>(Index->getType()) && @@ -2538,6 +2480,50 @@ static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, llvm_unreachable("invalid enum"); } +std::optional<unsigned> getMaxVScale(const Function &F, + const TargetTransformInfo &TTI) { + if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale()) + return MaxVScale; + + if (F.hasFnAttribute(Attribute::VScaleRange)) + return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); + + return std::nullopt; +} + +/// For the given VF and UF and maximum trip count computed for the loop, return +/// whether the induction variable might overflow in the vectorized loop. If not, +/// then we know a runtime overflow check always evaluates to false and can be +/// removed. +static bool isIndvarOverflowCheckKnownFalse( + const LoopVectorizationCostModel *Cost, + ElementCount VF, std::optional<unsigned> UF = std::nullopt) { + // Always be conservative if we don't know the exact unroll factor. + unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF); + + Type *IdxTy = Cost->Legal->getWidestInductionType(); + APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask(); + + // We know the runtime overflow check is known false iff the (max) trip-count + // is known and (max) trip-count + (VF * UF) does not overflow in the type of + // the vector loop induction variable. + if (unsigned TC = + Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) { + uint64_t MaxVF = VF.getKnownMinValue(); + if (VF.isScalable()) { + std::optional<unsigned> MaxVScale = + getMaxVScale(*Cost->TheFunction, Cost->TTI); + if (!MaxVScale) + return false; + MaxVF *= *MaxVScale; + } + + return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF); + } + + return false; +} + void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, VPTransformState &State) { @@ -2591,14 +2577,13 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { void InnerLoopVectorizer::vectorizeInterleaveGroup( const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, - VPValue *BlockInMask) { + VPValue *BlockInMask, bool NeedsMaskForGaps) { Instruction *Instr = Group->getInsertPos(); const DataLayout &DL = Instr->getModule()->getDataLayout(); // Prepare for the vector type of the interleaved load/store. 
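The new isIndvarOverflowCheckKnownFalse above reasons that the wide induction variable cannot wrap when the (maximum) trip count plus one full VF*UF step still fits in the index type. A standalone sketch of that check (hypothetical helper, not the LLVM code):

```c++
#include <cstdint>
#include <cstdio>

// The runtime overflow check is provably false when
//   (maximum value of the index type) - TripCount > VF * UF,
// i.e. bumping the counter one full step past the trip count cannot wrap.
static bool overflowCheckKnownFalse(uint64_t TypeMax, uint64_t TripCount,
                                    uint64_t VF, uint64_t UF) {
  return TypeMax - TripCount > VF * UF;
}

int main() {
  // i8 induction variable, trip count 200, VF=16, UF=2: 200 + 32 fits -> safe.
  std::printf("%d\n", overflowCheckKnownFalse(255, 200, 16, 2)); // 1
  // Trip count 250: 250 + 32 would wrap an i8 counter -> keep the check.
  std::printf("%d\n", overflowCheckKnownFalse(255, 250, 16, 2)); // 0
}
```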
Type *ScalarTy = getLoadStoreType(Instr); unsigned InterleaveFactor = Group->getFactor(); - assert(!VF.isScalable() && "scalable vectors not yet supported."); auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); // Prepare for the new pointers. @@ -2609,14 +2594,21 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( assert((!BlockInMask || !Group->isReverse()) && "Reversed masked interleave-group not supported."); + Value *Idx; // If the group is reverse, adjust the index to refer to the last vector lane // instead of the first. We adjust the index from the first vector lane, // rather than directly getting the pointer for lane VF - 1, because the // pointer operand of the interleaved access is supposed to be uniform. For // uniform instructions, we're only required to generate a value for the // first vector lane in each unroll iteration. - if (Group->isReverse()) - Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); + if (Group->isReverse()) { + Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); + Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1)); + Idx = Builder.CreateMul(Idx, Builder.getInt32(Group->getFactor())); + Idx = Builder.CreateAdd(Idx, Builder.getInt32(Index)); + Idx = Builder.CreateNeg(Idx); + } else + Idx = Builder.getInt32(-Index); for (unsigned Part = 0; Part < UF; Part++) { Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); @@ -2637,8 +2629,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( bool InBounds = false; if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) InBounds = gep->isInBounds(); - AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); - cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); + AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds); // Cast to the vector pointer type. unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); @@ -2649,14 +2640,43 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( State.setDebugLocFromInst(Instr); Value *PoisonVec = PoisonValue::get(VecTy); - Value *MaskForGaps = nullptr; - if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { - MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); - assert(MaskForGaps && "Mask for Gaps is required but it is null"); - } + auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor]( + unsigned Part, Value *MaskForGaps) -> Value * { + if (VF.isScalable()) { + assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); + assert(InterleaveFactor == 2 && + "Unsupported deinterleave factor for scalable vectors"); + auto *BlockInMaskPart = State.get(BlockInMask, Part); + SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart}; + auto *MaskTy = + VectorType::get(Builder.getInt1Ty(), VF.getKnownMinValue() * 2, true); + return Builder.CreateIntrinsic( + MaskTy, Intrinsic::experimental_vector_interleave2, Ops, + /*FMFSource=*/nullptr, "interleaved.mask"); + } + + if (!BlockInMask) + return MaskForGaps; + + Value *BlockInMaskPart = State.get(BlockInMask, Part); + Value *ShuffledMask = Builder.CreateShuffleVector( + BlockInMaskPart, + createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), + "interleaved.mask"); + return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask, + MaskForGaps) + : ShuffledMask; + }; // Vectorize the interleaved load group. 
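The CreateGroupMask lambda above widens the per-iteration block mask so that one predicate bit guards every member of an interleave group. A small sketch of the replication it performs in the fixed-length case (replicateMask is a hypothetical stand-in for the replicated shuffle mask):

```c++
#include <cstdio>
#include <vector>

// Each lane of the block mask is repeated once per interleave-group member, so
// a single i1 covers all Factor elements that belong to the same original
// iteration of the interleaved access.
static std::vector<bool> replicateMask(const std::vector<bool> &BlockMask,
                                       unsigned Factor) {
  std::vector<bool> Out;
  for (bool Lane : BlockMask)
    for (unsigned I = 0; I < Factor; ++I)
      Out.push_back(Lane);
  return Out;
}

int main() {
  std::vector<bool> Wide = replicateMask({true, true, false, true}, 2);
  for (bool B : Wide)
    std::printf("%d", (int)B); // 11110011
  std::printf("\n");
}
```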
if (isa<LoadInst>(Instr)) { + Value *MaskForGaps = nullptr; + if (NeedsMaskForGaps) { + MaskForGaps = + createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); + assert(MaskForGaps && "Mask for Gaps is required but it is null"); + } + // For each unroll part, create a wide load for the group. SmallVector<Value *, 2> NewLoads; for (unsigned Part = 0; Part < UF; Part++) { @@ -2664,18 +2684,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( if (BlockInMask || MaskForGaps) { assert(useMaskedInterleavedAccesses(*TTI) && "masked interleaved groups are not allowed."); - Value *GroupMask = MaskForGaps; - if (BlockInMask) { - Value *BlockInMaskPart = State.get(BlockInMask, Part); - Value *ShuffledMask = Builder.CreateShuffleVector( - BlockInMaskPart, - createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), - "interleaved.mask"); - GroupMask = MaskForGaps - ? Builder.CreateBinOp(Instruction::And, ShuffledMask, - MaskForGaps) - : ShuffledMask; - } + Value *GroupMask = CreateGroupMask(Part, MaskForGaps); NewLoad = Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), GroupMask, PoisonVec, "wide.masked.vec"); @@ -2687,6 +2696,41 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( NewLoads.push_back(NewLoad); } + if (VecTy->isScalableTy()) { + assert(InterleaveFactor == 2 && + "Unsupported deinterleave factor for scalable vectors"); + + for (unsigned Part = 0; Part < UF; ++Part) { + // Scalable vectors cannot use arbitrary shufflevectors (only splats), + // so must use intrinsics to deinterleave. + Value *DI = Builder.CreateIntrinsic( + Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part], + /*FMFSource=*/nullptr, "strided.vec"); + unsigned J = 0; + for (unsigned I = 0; I < InterleaveFactor; ++I) { + Instruction *Member = Group->getMember(I); + + if (!Member) + continue; + + Value *StridedVec = Builder.CreateExtractValue(DI, I); + // If this member has different type, cast the result type. + if (Member->getType() != ScalarTy) { + VectorType *OtherVTy = VectorType::get(Member->getType(), VF); + StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); + } + + if (Group->isReverse()) + StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); + + State.set(VPDefs[J], StridedVec, Part); + ++J; + } + } + + return; + } + // For each member in the group, shuffle out the appropriate data from the // wide loads. unsigned J = 0; @@ -2724,7 +2768,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( auto *SubVT = VectorType::get(ScalarTy, VF); // Vectorize the interleaved store group. - MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); + Value *MaskForGaps = + createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && "masked interleaved groups are not allowed."); assert((!MaskForGaps || !VF.isScalable()) && @@ -2759,27 +2804,11 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( StoredVecs.push_back(StoredVec); } - // Concatenate all vectors into a wide vector. - Value *WideVec = concatenateVectors(Builder, StoredVecs); - - // Interleave the elements in the wide vector. - Value *IVec = Builder.CreateShuffleVector( - WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), - "interleaved.vec"); - + // Interleave all the smaller vectors into one wider vector. 
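The scalable load path above uses the deinterleave2 intrinsic to split one wide interleaved load back into its strided members. The inverse of the earlier interleave sketch, again as plain standalone C++ rather than the patch's IR-building code:

```c++
#include <cstdio>
#include <vector>

// Split a wide interleaved vector into Factor strided members: member K
// receives elements K, K + Factor, K + 2*Factor, ... of the wide vector.
// The scalable-vector path in the patch handles Factor == 2 only.
static std::vector<std::vector<int>> deinterleave(const std::vector<int> &Wide,
                                                  unsigned Factor) {
  std::vector<std::vector<int>> Members(Factor);
  for (unsigned I = 0; I < Wide.size(); ++I)
    Members[I % Factor].push_back(Wide[I]);
  return Members;
}

int main() {
  auto Members = deinterleave({0, 100, 1, 101, 2, 102, 3, 103}, 2);
  for (const auto &Member : Members) {
    for (int V : Member)
      std::printf("%d ", V);
    std::printf("\n"); // "0 1 2 3" then "100 101 102 103"
  }
}
```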
+ Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec"); Instruction *NewStoreInstr; if (BlockInMask || MaskForGaps) { - Value *GroupMask = MaskForGaps; - if (BlockInMask) { - Value *BlockInMaskPart = State.get(BlockInMask, Part); - Value *ShuffledMask = Builder.CreateShuffleVector( - BlockInMaskPart, - createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), - "interleaved.mask"); - GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, - ShuffledMask, MaskForGaps) - : ShuffledMask; - } + Value *GroupMask = CreateGroupMask(Part, MaskForGaps); NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], Group->getAlign(), GroupMask); } else @@ -2793,7 +2822,6 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPIteration &Instance, - bool IfPredicateInstr, VPTransformState &State) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); @@ -2810,14 +2838,7 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, if (!IsVoidRetTy) Cloned->setName(Instr->getName() + ".cloned"); - // If the scalarized instruction contributes to the address computation of a - // widen masked load/store which was in a basic block that needed predication - // and is not predicated after vectorization, we can't propagate - // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized - // instruction could feed a poison value to the base address of the widen - // load/store. - if (State.MayGeneratePoisonRecipes.contains(RepRecipe)) - Cloned->dropPoisonGeneratingFlags(); + RepRecipe->setFlags(Cloned); if (Instr->getDebugLoc()) State.setDebugLocFromInst(Instr); @@ -2843,45 +2864,17 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, AC->registerAssumption(II); // End if-block. + bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator(); if (IfPredicateInstr) PredicatedInstructions.push_back(Cloned); } -Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) { - if (TripCount) - return TripCount; - - assert(InsertBlock); - IRBuilder<> Builder(InsertBlock->getTerminator()); - // Find the loop boundaries. - Type *IdxTy = Legal->getWidestInductionType(); - assert(IdxTy && "No type for induction"); - const SCEV *ExitCount = createTripCountSCEV(IdxTy, PSE); - - const DataLayout &DL = InsertBlock->getModule()->getDataLayout(); - - // Expand the trip count and place the new instructions in the preheader. - // Notice that the pre-header does not change, only the loop body. - SCEVExpander Exp(*PSE.getSE(), DL, "induction"); - - // Count holds the overall loop count (N). - TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), - InsertBlock->getTerminator()); - - if (TripCount->getType()->isPointerTy()) - TripCount = - CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", - InsertBlock->getTerminator()); - - return TripCount; -} - Value * InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { if (VectorTripCount) return VectorTripCount; - Value *TC = getOrCreateTripCount(InsertBlock); + Value *TC = getTripCount(); IRBuilder<> Builder(InsertBlock->getTerminator()); Type *Ty = TC->getType(); @@ -2917,7 +2910,7 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { // the step does not evenly divide the trip count, no adjustment is necessary // since there will already be scalar iterations. 
Note that the minimum // iterations check ensures that N >= Step. - if (Cost->requiresScalarEpilogue(VF)) { + if (Cost->requiresScalarEpilogue(VF.isVector())) { auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); R = Builder.CreateSelect(IsZero, Step, R); } @@ -2930,10 +2923,10 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, const DataLayout &DL) { // Verify that V is a vector type with same number of elements as DstVTy. - auto *DstFVTy = cast<FixedVectorType>(DstVTy); - unsigned VF = DstFVTy->getNumElements(); - auto *SrcVecTy = cast<FixedVectorType>(V->getType()); - assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); + auto *DstFVTy = cast<VectorType>(DstVTy); + auto VF = DstFVTy->getElementCount(); + auto *SrcVecTy = cast<VectorType>(V->getType()); + assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match"); Type *SrcElemTy = SrcVecTy->getElementType(); Type *DstElemTy = DstFVTy->getElementType(); assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && @@ -2953,13 +2946,13 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, "Only one type should be a floating point type"); Type *IntTy = IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); - auto *VecIntTy = FixedVectorType::get(IntTy, VF); + auto *VecIntTy = VectorType::get(IntTy, VF); Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); } void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { - Value *Count = getOrCreateTripCount(LoopVectorPreHeader); + Value *Count = getTripCount(); // Reuse existing vector loop preheader for TC checks. // Note that new preheader block is generated for vector loop. BasicBlock *const TCCheckBlock = LoopVectorPreHeader; @@ -2970,8 +2963,8 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { // vector trip count is zero. This check also covers the case where adding one // to the backedge-taken count overflowed leading to an incorrect trip count // of zero. In this case we will also jump to the scalar loop. - auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE - : ICmpInst::ICMP_ULT; + auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE + : ICmpInst::ICMP_ULT; // If tail is to be folded, vector loop takes care of all iterations. Type *CountTy = Count->getType(); @@ -2989,10 +2982,13 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF)); }; - if (!Cost->foldTailByMasking()) + TailFoldingStyle Style = Cost->getTailFoldingStyle(); + if (Style == TailFoldingStyle::None) CheckMinIters = Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check"); - else if (VF.isScalable()) { + else if (VF.isScalable() && + !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) && + Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { // vscale is not necessarily a power-of-2, which means we cannot guarantee // an overflow to zero when updating induction variables and so an // additional overflow check is required before entering the vector loop. @@ -3017,7 +3013,7 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { // Update dominator for Bypass & LoopExit (if needed). 
DT->changeImmediateDominator(Bypass, TCCheckBlock); - if (!Cost->requiresScalarEpilogue(VF)) + if (!Cost->requiresScalarEpilogue(VF.isVector())) // If there is an epilogue which must run, there's no edge from the // middle block to exit blocks and thus no need to update the immediate // dominator of the exit blocks. @@ -3044,7 +3040,7 @@ BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { // Update dominator only if this is first RT check. if (LoopBypassBlocks.empty()) { DT->changeImmediateDominator(Bypass, SCEVCheckBlock); - if (!Cost->requiresScalarEpilogue(VF)) + if (!Cost->requiresScalarEpilogue(VF.isVector())) // If there is an epilogue which must run, there's no edge from the // middle block to exit blocks and thus no need to update the immediate // dominator of the exit blocks. @@ -3097,7 +3093,7 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { LoopVectorPreHeader = OrigLoop->getLoopPreheader(); assert(LoopVectorPreHeader && "Invalid loop structure"); LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr - assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) && + assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) && "multiple exit loop without required epilogue?"); LoopMiddleBlock = @@ -3117,17 +3113,18 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { // branch from the middle block to the loop scalar preheader, and the // exit block. completeLoopSkeleton will update the condition to use an // iteration check, if required to decide whether to execute the remainder. - BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ? - BranchInst::Create(LoopScalarPreHeader) : - BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, - Builder.getTrue()); + BranchInst *BrInst = + Cost->requiresScalarEpilogue(VF.isVector()) + ? BranchInst::Create(LoopScalarPreHeader) + : BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, + Builder.getTrue()); BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); // Update dominator for loop exit. During skeleton creation, only the vector // pre-header and the middle block are created. The vector loop is entirely // created during VPlan exection. - if (!Cost->requiresScalarEpilogue(VF)) + if (!Cost->requiresScalarEpilogue(VF.isVector())) // If there is an epilogue which must run, there's no edge from the // middle block to exit blocks and thus no need to update the immediate // dominator of the exit blocks. @@ -3135,7 +3132,7 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { } PHINode *InnerLoopVectorizer::createInductionResumeValue( - PHINode *OrigPhi, const InductionDescriptor &II, + PHINode *OrigPhi, const InductionDescriptor &II, Value *Step, ArrayRef<BasicBlock *> BypassBlocks, std::pair<BasicBlock *, Value *> AdditionalBypass) { Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); @@ -3154,8 +3151,6 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue( if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); - Value *Step = - CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(), Step, II); EndValue->setName("ind.end"); @@ -3163,8 +3158,6 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue( // Compute the end value for the additional bypass (if applicable). 
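The "ind.end" value computed above via emitTransformedIndex is where the scalar remainder loop resumes each induction after the vector loop finishes. For the integer-induction case that transform is simple arithmetic; a sketch with hypothetical names, not from the patch:

```c++
#include <cstdio>

// Integer induction: after VectorTripCount iterations of the vector loop, an
// induction that starts at Start and advances by Step per iteration has
// reached Start + VectorTripCount * Step, which becomes the resume value for
// the scalar epilogue.
static long long resumeValue(long long Start, long long Step,
                             long long VectorTripCount) {
  return Start + VectorTripCount * Step;
}

int main() {
  // Original trip count 10, VF*UF = 8 -> vector trip count 8; an induction
  // "i = 5; i += 3" resumes at 5 + 8*3 = 29 in the scalar loop.
  std::printf("%lld\n", resumeValue(5, 3, 8)); // 29
}
```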
if (AdditionalBypass.first) { B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); - Value *Step = - CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); EndValueFromAdditionalBypass = emitTransformedIndex( B, AdditionalBypass.second, II.getStartValue(), Step, II); EndValueFromAdditionalBypass->setName("ind.end"); @@ -3193,7 +3186,22 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue( return BCResumeVal; } +/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV +/// expansion results. +static Value *getExpandedStep(const InductionDescriptor &ID, + const SCEV2ValueTy &ExpandedSCEVs) { + const SCEV *Step = ID.getStep(); + if (auto *C = dyn_cast<SCEVConstant>(Step)) + return C->getValue(); + if (auto *U = dyn_cast<SCEVUnknown>(Step)) + return U->getValue(); + auto I = ExpandedSCEVs.find(Step); + assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point"); + return I->second; +} + void InnerLoopVectorizer::createInductionResumeValues( + const SCEV2ValueTy &ExpandedSCEVs, std::pair<BasicBlock *, Value *> AdditionalBypass) { assert(((AdditionalBypass.first && AdditionalBypass.second) || (!AdditionalBypass.first && !AdditionalBypass.second)) && @@ -3209,14 +3217,15 @@ void InnerLoopVectorizer::createInductionResumeValues( PHINode *OrigPhi = InductionEntry.first; const InductionDescriptor &II = InductionEntry.second; PHINode *BCResumeVal = createInductionResumeValue( - OrigPhi, II, LoopBypassBlocks, AdditionalBypass); + OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks, + AdditionalBypass); OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); } } BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() { // The trip counts should be cached by now. - Value *Count = getOrCreateTripCount(LoopVectorPreHeader); + Value *Count = getTripCount(); Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); @@ -3229,7 +3238,8 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() { // Thus if tail is to be folded, we know we don't need to run the // remainder and we can use the previous value for the condition (true). // 3) Otherwise, construct a runtime check. - if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { + if (!Cost->requiresScalarEpilogue(VF.isVector()) && + !Cost->foldTailByMasking()) { Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, VectorTripCount, "cmp.n", LoopMiddleBlock->getTerminator()); @@ -3250,14 +3260,16 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() { } std::pair<BasicBlock *, Value *> -InnerLoopVectorizer::createVectorizedLoopSkeleton() { +InnerLoopVectorizer::createVectorizedLoopSkeleton( + const SCEV2ValueTy &ExpandedSCEVs) { /* In this function we generate a new loop. The new loop will contain the vectorized instructions while the old loop will continue to run the scalar remainder. - [ ] <-- loop iteration number check. - / | + [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's + / | preheader are expanded here. Eventually all required SCEV + / | expansion should happen here. / v | [ ] <-- vector loop bypass (may consist of multiple blocks). | / | @@ -3304,7 +3316,7 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton() { emitMemRuntimeChecks(LoopScalarPreHeader); // Emit phis for the new starting index of the scalar loop. 
- createInductionResumeValues(); + createInductionResumeValues(ExpandedSCEVs); return {completeLoopSkeleton(), nullptr}; } @@ -3317,7 +3329,8 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, Value *VectorTripCount, Value *EndValue, BasicBlock *MiddleBlock, - BasicBlock *VectorHeader, VPlan &Plan) { + BasicBlock *VectorHeader, VPlan &Plan, + VPTransformState &State) { // There are two kinds of external IV usages - those that use the value // computed in the last iteration (the PHI) and those that use the penultimate // value (the value that feeds into the phi from the loop latch). @@ -3345,7 +3358,6 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, auto *UI = cast<Instruction>(U); if (!OrigLoop->contains(UI)) { assert(isa<PHINode>(UI) && "Expected LCSSA form"); - IRBuilder<> B(MiddleBlock->getTerminator()); // Fast-math-flags propagate from the original induction instruction. @@ -3355,8 +3367,11 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, Value *CountMinusOne = B.CreateSub( VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1)); CountMinusOne->setName("cmo"); - Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(), - VectorHeader->getTerminator()); + + VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep()); + assert(StepVPV && "step must have been expanded during VPlan execution"); + Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue() + : State.get(StepVPV, {0, 0}); Value *Escape = emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, II); Escape->setName("ind.escape"); @@ -3430,12 +3445,12 @@ static void cse(BasicBlock *BB) { } } -InstructionCost -LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, - bool &NeedToScalarize) const { +InstructionCost LoopVectorizationCostModel::getVectorCallCost( + CallInst *CI, ElementCount VF, Function **Variant, bool *NeedsMask) const { Function *F = CI->getCalledFunction(); Type *ScalarRetTy = CI->getType(); SmallVector<Type *, 4> Tys, ScalarTys; + bool MaskRequired = Legal->isMaskRequired(CI); for (auto &ArgOp : CI->args()) ScalarTys.push_back(ArgOp->getType()); @@ -3464,18 +3479,39 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, // If we can't emit a vector call for this function, then the currently found // cost is the cost we need to return. - NeedToScalarize = true; - VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); + InstructionCost MaskCost = 0; + VFShape Shape = VFShape::get(*CI, VF, MaskRequired); + if (NeedsMask) + *NeedsMask = MaskRequired; Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); + // If we want an unmasked vector function but can't find one matching the VF, + // maybe we can find vector function that does use a mask and synthesize + // an all-true mask. + if (!VecFunc && !MaskRequired) { + Shape = VFShape::get(*CI, VF, /*HasGlobalPred=*/true); + VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); + // If we found one, add in the cost of creating a mask + if (VecFunc) { + if (NeedsMask) + *NeedsMask = true; + MaskCost = TTI.getShuffleCost( + TargetTransformInfo::SK_Broadcast, + VectorType::get( + IntegerType::getInt1Ty(VecFunc->getFunctionType()->getContext()), + VF)); + } + } + // We don't support masked function calls yet, but we can scalarize a + // masked call with branches (unless VF is scalable). if (!TLI || CI->isNoBuiltin() || !VecFunc) - return Cost; + return VF.isScalable() ? 
InstructionCost::getInvalid() : Cost; // If the corresponding vector cost is cheaper, return its cost. InstructionCost VectorCallCost = - TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind); + TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost; if (VectorCallCost < Cost) { - NeedToScalarize = false; + *Variant = VecFunc; Cost = VectorCallCost; } return Cost; @@ -3675,14 +3711,25 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, // Forget the original basic block. PSE.getSE()->forgetLoop(OrigLoop); + // After vectorization, the exit blocks of the original loop will have + // additional predecessors. Invalidate SCEVs for the exit phis in case SE + // looked through single-entry phis. + SmallVector<BasicBlock *> ExitBlocks; + OrigLoop->getExitBlocks(ExitBlocks); + for (BasicBlock *Exit : ExitBlocks) + for (PHINode &PN : Exit->phis()) + PSE.getSE()->forgetValue(&PN); + VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock(); Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]); - if (Cost->requiresScalarEpilogue(VF)) { + if (Cost->requiresScalarEpilogue(VF.isVector())) { // No edge from the middle block to the unique exit block has been inserted // and there is nothing to fix from vector loop; phis should have incoming // from scalar loop only. - Plan.clearLiveOuts(); } else { + // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking + // the cost model. + // If we inserted an edge from the middle block to the unique exit block, // update uses outside the loop (phis) to account for the newly inserted // edge. @@ -3692,7 +3739,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, fixupIVUsers(Entry.first, Entry.second, getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()), IVEndValues[Entry.first], LoopMiddleBlock, - VectorLoop->getHeader(), Plan); + VectorLoop->getHeader(), Plan, State); } // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated @@ -3799,31 +3846,53 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence( Value *Incoming = State.get(PreviousDef, UF - 1); auto *ExtractForScalar = Incoming; auto *IdxTy = Builder.getInt32Ty(); + Value *RuntimeVF = nullptr; if (VF.isVector()) { auto *One = ConstantInt::get(IdxTy, 1); Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); - auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); + RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); auto *LastIdx = Builder.CreateSub(RuntimeVF, One); - ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, - "vector.recur.extract"); - } - // Extract the second last element in the middle block if the - // Phi is used outside the loop. We need to extract the phi itself - // and not the last element (the phi update in the current iteration). This - // will be the value when jumping to the exit block from the LoopMiddleBlock, - // when the scalar loop is not run at all. - Value *ExtractForPhiUsedOutsideLoop = nullptr; - if (VF.isVector()) { - auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); - auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); - ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( - Incoming, Idx, "vector.recur.extract.for.phi"); - } else if (UF > 1) - // When loop is unrolled without vectorizing, initialize - // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value - // of `Incoming`. This is analogous to the vectorized case above: extracting - // the second last element when VF > 1. 
- ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); + ExtractForScalar = + Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract"); + } + + auto RecurSplice = cast<VPInstruction>(*PhiR->user_begin()); + assert(PhiR->getNumUsers() == 1 && + RecurSplice->getOpcode() == + VPInstruction::FirstOrderRecurrenceSplice && + "recurrence phi must have a single user: FirstOrderRecurrenceSplice"); + SmallVector<VPLiveOut *> LiveOuts; + for (VPUser *U : RecurSplice->users()) + if (auto *LiveOut = dyn_cast<VPLiveOut>(U)) + LiveOuts.push_back(LiveOut); + + if (!LiveOuts.empty()) { + // Extract the second last element in the middle block if the + // Phi is used outside the loop. We need to extract the phi itself + // and not the last element (the phi update in the current iteration). This + // will be the value when jumping to the exit block from the + // LoopMiddleBlock, when the scalar loop is not run at all. + Value *ExtractForPhiUsedOutsideLoop = nullptr; + if (VF.isVector()) { + auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); + ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( + Incoming, Idx, "vector.recur.extract.for.phi"); + } else { + assert(UF > 1 && "VF and UF cannot both be 1"); + // When loop is unrolled without vectorizing, initialize + // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled + // value of `Incoming`. This is analogous to the vectorized case above: + // extracting the second last element when VF > 1. + ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); + } + + for (VPLiveOut *LiveOut : LiveOuts) { + assert(!Cost->requiresScalarEpilogue(VF.isVector())); + PHINode *LCSSAPhi = LiveOut->getPhi(); + LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); + State.Plan->removeLiveOut(LCSSAPhi); + } + } // Fix the initial value of the original recurrence in the scalar loop. Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); @@ -3837,22 +3906,6 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence( Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); Phi->setName("scalar.recur"); - - // Finally, fix users of the recurrence outside the loop. The users will need - // either the last value of the scalar recurrence or the last value of the - // vector recurrence we extracted in the middle block. Since the loop is in - // LCSSA form, we just need to find all the phi nodes for the original scalar - // recurrence in the exit block, and then add an edge for the middle block. - // Note that LCSSA does not imply single entry when the original scalar loop - // had multiple exiting edges (as we always run the last iteration in the - // scalar epilogue); in that case, there is no edge from middle to exit and - // and thus no phis which needed updated. - if (!Cost->requiresScalarEpilogue(VF)) - for (PHINode &LCSSAPhi : LoopExitBlock->phis()) - if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) { - LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); - State.Plan->removeLiveOut(&LCSSAPhi); - } } void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, @@ -3872,9 +3925,6 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, // This is the vector-clone of the value that leaves the loop. Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); - // Wrap flags are in general invalid after vectorization, clear them. 
- clearReductionWrapFlags(PhiR, State); - // Before each round, move the insertion point right between // the PHIs and the values we are going to write. // This allows us to write both PHINodes and the extractelement @@ -4036,7 +4086,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, // We know that the loop is in LCSSA form. We need to update the PHI nodes // in the exit blocks. See comment on analogous loop in // fixFixedOrderRecurrence for a more complete explaination of the logic. - if (!Cost->requiresScalarEpilogue(VF)) + if (!Cost->requiresScalarEpilogue(VF.isVector())) for (PHINode &LCSSAPhi : LoopExitBlock->phis()) if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) { LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); @@ -4054,38 +4104,6 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); } -void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR, - VPTransformState &State) { - const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); - RecurKind RK = RdxDesc.getRecurrenceKind(); - if (RK != RecurKind::Add && RK != RecurKind::Mul) - return; - - SmallVector<VPValue *, 8> Worklist; - SmallPtrSet<VPValue *, 8> Visited; - Worklist.push_back(PhiR); - Visited.insert(PhiR); - - while (!Worklist.empty()) { - VPValue *Cur = Worklist.pop_back_val(); - for (unsigned Part = 0; Part < UF; ++Part) { - Value *V = State.get(Cur, Part); - if (!isa<OverflowingBinaryOperator>(V)) - break; - cast<Instruction>(V)->dropPoisonGeneratingFlags(); - } - - for (VPUser *U : Cur->users()) { - auto *UserRecipe = dyn_cast<VPRecipeBase>(U); - if (!UserRecipe) - continue; - for (VPValue *V : UserRecipe->definedValues()) - if (Visited.insert(V).second) - Worklist.push_back(V); - } - } -} - void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { // The basic block and loop containing the predicated instruction. auto *PredBB = PredInst->getParent(); @@ -4125,10 +4143,11 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); // We can't sink an instruction if it is a phi node, is not in the loop, - // or may have side effects. + // may have side effects or may read from memory. + // TODO Could dor more granular checking to allow sinking a load past non-store instructions. if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || - I->mayHaveSideEffects()) - continue; + I->mayHaveSideEffects() || I->mayReadFromMemory()) + continue; // If the instruction is already in PredBB, check if we can sink its // operands. In that case, VPlan's sinkScalarOperands() succeeded in @@ -4189,7 +4208,7 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { // We should not collect Scalars more than once per VF. Right now, this // function is called from collectUniformsAndScalars(), which already does // this check. Collecting Scalars for VF=1 does not make any sense. 
- assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && + assert(VF.isVector() && !Scalars.contains(VF) && "This function should not be visited twice for the same VF"); // This avoids any chances of creating a REPLICATE recipe during planning @@ -4382,6 +4401,8 @@ bool LoopVectorizationCostModel::isScalarWithPredication( switch(I->getOpcode()) { default: return true; + case Instruction::Call: + return !VFDatabase::hasMaskedVariant(*(cast<CallInst>(I)), VF); case Instruction::Load: case Instruction::Store: { auto *Ptr = getLoadStorePointerOperand(I); @@ -4430,10 +4451,10 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const { // both speculation safety (which follows from the same argument as loads), // but also must prove the value being stored is correct. The easiest // form of the later is to require that all values stored are the same. - if (Legal->isUniformMemOp(*I) && - (isa<LoadInst>(I) || - (isa<StoreInst>(I) && - TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) && + if (Legal->isInvariant(getLoadStorePointerOperand(I)) && + (isa<LoadInst>(I) || + (isa<StoreInst>(I) && + TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) && !Legal->blockNeedsPredication(I->getParent())) return false; return true; @@ -4445,6 +4466,8 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const { // TODO: We can use the loop-preheader as context point here and get // context sensitive reasoning return !isSafeToSpeculativelyExecute(I); + case Instruction::Call: + return Legal->isMaskRequired(I); } } @@ -4502,7 +4525,8 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, // second vector operand. One example of this are shifts on x86. Value *Op2 = I->getOperand(1); auto Op2Info = TTI.getOperandInfo(Op2); - if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) + if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && + Legal->isInvariant(Op2)) Op2Info.Kind = TargetTransformInfo::OK_UniformValue; SmallVector<const Value *, 4> Operands(I->operand_values()); @@ -4614,7 +4638,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { // already does this check. Collecting Uniforms for VF=1 does not make any // sense. - assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && + assert(VF.isVector() && !Uniforms.contains(VF) && "This function should not be visited twice for the same VF"); // Visit the list of Uniforms. If we'll not find any uniform value, we'll @@ -4663,10 +4687,18 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) addToWorklistIfAllowed(Cmp); + auto PrevVF = VF.divideCoefficientBy(2); // Return true if all lanes perform the same memory operation, and we can // thus chose to execute only one. auto isUniformMemOpUse = [&](Instruction *I) { - if (!Legal->isUniformMemOp(*I)) + // If the value was already known to not be uniform for the previous + // (smaller VF), it cannot be uniform for the larger VF. 
+ if (PrevVF.isVector()) { + auto Iter = Uniforms.find(PrevVF); + if (Iter != Uniforms.end() && !Iter->second.contains(I)) + return false; + } + if (!Legal->isUniformMemOp(*I, VF)) return false; if (isa<LoadInst>(I)) // Loading the same address always produces the same result - at least @@ -4689,11 +4721,14 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { WideningDecision == CM_Interleave); }; - // Returns true if Ptr is the pointer operand of a memory access instruction - // I, and I is known to not require scalarization. + // I, I is known to not require scalarization, and the pointer is not also + // stored. auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { - return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); + if (isa<StoreInst>(I) && I->getOperand(0) == Ptr) + return false; + return getLoadStorePointerOperand(I) == Ptr && + (isUniformDecision(I, VF) || Legal->isInvariant(Ptr)); }; // Holds a list of values which are known to have at least one uniform use. @@ -4739,10 +4774,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { if (isUniformMemOpUse(&I)) addToWorklistIfAllowed(&I); - if (isUniformDecision(&I, VF)) { - assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); + if (isVectorizedMemAccessUse(&I, Ptr)) HasUniformUse.insert(Ptr); - } } // Add to the worklist any operands which have *only* uniform (e.g. lane 0 @@ -4906,12 +4939,11 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { return MaxScalableVF; // Limit MaxScalableVF by the maximum safe dependence distance. - std::optional<unsigned> MaxVScale = TTI.getMaxVScale(); - if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) - MaxVScale = - TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); - MaxScalableVF = - ElementCount::getScalable(MaxVScale ? (MaxSafeElements / *MaxVScale) : 0); + if (std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI)) + MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale); + else + MaxScalableVF = ElementCount::getScalable(0); + if (!MaxScalableVF) reportVectorizationInfo( "Max legal vector width too small, scalable vectorization " @@ -4932,7 +4964,7 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( // the memory accesses that is most restrictive (involved in the smallest // dependence distance). unsigned MaxSafeElements = - PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); + llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); @@ -5105,16 +5137,26 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { } FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true); + // Avoid tail folding if the trip count is known to be a multiple of any VF - // we chose. - // FIXME: The condition below pessimises the case for fixed-width vectors, - // when scalable VFs are also candidates for vectorization. - if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { - ElementCount MaxFixedVF = MaxFactors.FixedVF; - assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && + // we choose. 
+ std::optional<unsigned> MaxPowerOf2RuntimeVF = + MaxFactors.FixedVF.getFixedValue(); + if (MaxFactors.ScalableVF) { + std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI); + if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) { + MaxPowerOf2RuntimeVF = std::max<unsigned>( + *MaxPowerOf2RuntimeVF, + *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue()); + } else + MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now. + } + + if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) { + assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) && "MaxFixedVF must be a power of 2"); - unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC - : MaxFixedVF.getFixedValue(); + unsigned MaxVFtimesIC = + UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF; ScalarEvolution *SE = PSE.getSE(); const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); const SCEV *ExitCount = SE->getAddExpr( @@ -5134,7 +5176,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { // by masking. // FIXME: look for a smaller MaxVF that does divide TC rather than masking. if (Legal->prepareToFoldTailByMasking()) { - FoldTailByMasking = true; + CanFoldTailByMasking = true; return MaxFactors; } @@ -5187,7 +5229,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( // Ensure MaxVF is a power of 2; the dependence distance bound may not be. // Note that both WidestRegister and WidestType may not be a powers of 2. auto MaxVectorElementCount = ElementCount::get( - PowerOf2Floor(WidestRegister.getKnownMinValue() / WidestType), + llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType), ComputeScalableMaxVF); MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " @@ -5207,6 +5249,13 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( auto Min = Attr.getVScaleRangeMin(); WidestRegisterMinEC *= Min; } + + // When a scalar epilogue is required, at least one iteration of the scalar + // loop has to execute. Adjust ConstTripCount accordingly to avoid picking a + // max VF that results in a dead vector loop. + if (ConstTripCount > 0 && requiresScalarEpilogue(true)) + ConstTripCount -= 1; + if (ConstTripCount && ConstTripCount <= WidestRegisterMinEC && (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { // If loop trip count (TC) is known at compile time there is no point in @@ -5214,7 +5263,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( // power of two which doesn't exceed TC. // If MaxVectorElementCount is scalable, we only fall back on a fixed VF // when the TC is less than or equal to the known number of lanes. 
- auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount); + auto ClampedConstTripCount = llvm::bit_floor(ConstTripCount); LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " "exceeding the constant trip count: " << ClampedConstTripCount << "\n"); @@ -5228,7 +5277,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 && TTI.shouldMaximizeVectorBandwidth(RegKind))) { auto MaxVectorElementCountMaxBW = ElementCount::get( - PowerOf2Floor(WidestRegister.getKnownMinValue() / SmallestType), + llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType), ComputeScalableMaxVF); MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); @@ -5273,9 +5322,14 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( return MaxVF; } -std::optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const { - if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) { - auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); +/// Convenience function that returns the value of vscale_range iff +/// vscale_range.min == vscale_range.max or otherwise returns the value +/// returned by the corresponding TTI method. +static std::optional<unsigned> +getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) { + const Function *Fn = L->getHeader()->getParent(); + if (Fn->hasFnAttribute(Attribute::VScaleRange)) { + auto Attr = Fn->getFnAttribute(Attribute::VScaleRange); auto Min = Attr.getVScaleRangeMin(); auto Max = Attr.getVScaleRangeMax(); if (Max && Min == Max) @@ -5285,31 +5339,39 @@ std::optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const { return TTI.getVScaleForTuning(); } -bool LoopVectorizationCostModel::isMoreProfitable( +bool LoopVectorizationPlanner::isMoreProfitable( const VectorizationFactor &A, const VectorizationFactor &B) const { InstructionCost CostA = A.Cost; InstructionCost CostB = B.Cost; - unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); - - if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && - MaxTripCount) { - // If we are folding the tail and the trip count is a known (possibly small) - // constant, the trip count will be rounded up to an integer number of - // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), - // which we compare directly. When not folding the tail, the total cost will - // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is - // approximated with the per-lane cost below instead of using the tripcount - // as here. - auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); - auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); + unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop); + + if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) { + // If the trip count is a known (possibly small) constant, the trip count + // will be rounded up to an integer number of iterations under + // FoldTailByMasking. The total cost in that case will be + // VecCost*ceil(TripCount/VF). When not folding the tail, the total + // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be + // some extra overheads, but for the purpose of comparing the costs of + // different VFs we can use this to compare the total loop-body cost + // expected after vectorization. 
+ auto GetCostForTC = [MaxTripCount, this](unsigned VF, + InstructionCost VectorCost, + InstructionCost ScalarCost) { + return CM.foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF) + : VectorCost * (MaxTripCount / VF) + + ScalarCost * (MaxTripCount % VF); + }; + auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost); + auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost); + return RTCostA < RTCostB; } // Improve estimate for the vector width if it is scalable. unsigned EstimatedWidthA = A.Width.getKnownMinValue(); unsigned EstimatedWidthB = B.Width.getKnownMinValue(); - if (std::optional<unsigned> VScale = getVScaleForTuning()) { + if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) { if (A.Width.isScalable()) EstimatedWidthA *= *VScale; if (B.Width.isScalable()) @@ -5328,9 +5390,74 @@ bool LoopVectorizationCostModel::isMoreProfitable( return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); } -VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( +static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts, + OptimizationRemarkEmitter *ORE, + Loop *TheLoop) { + if (InvalidCosts.empty()) + return; + + // Emit a report of VFs with invalid costs in the loop. + + // Group the remarks per instruction, keeping the instruction order from + // InvalidCosts. + std::map<Instruction *, unsigned> Numbering; + unsigned I = 0; + for (auto &Pair : InvalidCosts) + if (!Numbering.count(Pair.first)) + Numbering[Pair.first] = I++; + + // Sort the list, first on instruction(number) then on VF. + sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { + if (Numbering[A.first] != Numbering[B.first]) + return Numbering[A.first] < Numbering[B.first]; + ElementCountComparator ECC; + return ECC(A.second, B.second); + }); + + // For a list of ordered instruction-vf pairs: + // [(load, vf1), (load, vf2), (store, vf1)] + // Group the instructions together to emit separate remarks for: + // load (vf1, vf2) + // store (vf1) + auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); + auto Subset = ArrayRef<InstructionVFPair>(); + do { + if (Subset.empty()) + Subset = Tail.take_front(1); + + Instruction *I = Subset.front().first; + + // If the next instruction is different, or if there are no other pairs, + // emit a remark for the collated subset. e.g. + // [(load, vf1), (load, vf2))] + // to emit: + // remark: invalid costs for 'load' at VF=(vf, vf2) + if (Subset == Tail || Tail[Subset.size()].first != I) { + std::string OutString; + raw_string_ostream OS(OutString); + assert(!Subset.empty() && "Unexpected empty range"); + OS << "Instruction with invalid costs prevented vectorization at VF=("; + for (const auto &Pair : Subset) + OS << (Pair.second == Subset.front().second ? 
"" : ", ") << Pair.second; + OS << "):"; + if (auto *CI = dyn_cast<CallInst>(I)) + OS << " call to " << CI->getCalledFunction()->getName(); + else + OS << " " << I->getOpcodeName(); + OS.flush(); + reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); + Tail = Tail.drop_front(Subset.size()); + Subset = {}; + } else + // Grow the subset by one element + Subset = Tail.take_front(Subset.size() + 1); + } while (!Tail.empty()); +} + +VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor( const ElementCountSet &VFCandidates) { - InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; + InstructionCost ExpectedCost = + CM.expectedCost(ElementCount::getFixed(1)).first; LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); assert(VFCandidates.count(ElementCount::getFixed(1)) && @@ -5340,7 +5467,7 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( ExpectedCost); VectorizationFactor ChosenFactor = ScalarCost; - bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; + bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; if (ForceVectorization && VFCandidates.size() > 1) { // Ignore scalar width, because the user explicitly wants vectorization. // Initialize cost to max so that VF = 2 is, at least, chosen during cost @@ -5354,12 +5481,13 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( if (i.isScalar()) continue; - VectorizationCostTy C = expectedCost(i, &InvalidCosts); + LoopVectorizationCostModel::VectorizationCostTy C = + CM.expectedCost(i, &InvalidCosts); VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost); #ifndef NDEBUG unsigned AssumedMinimumVscale = 1; - if (std::optional<unsigned> VScale = getVScaleForTuning()) + if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) AssumedMinimumVscale = *VScale; unsigned Width = Candidate.Width.isScalable() @@ -5388,70 +5516,13 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( ChosenFactor = Candidate; } - // Emit a report of VFs with invalid costs in the loop. - if (!InvalidCosts.empty()) { - // Group the remarks per instruction, keeping the instruction order from - // InvalidCosts. - std::map<Instruction *, unsigned> Numbering; - unsigned I = 0; - for (auto &Pair : InvalidCosts) - if (!Numbering.count(Pair.first)) - Numbering[Pair.first] = I++; - - // Sort the list, first on instruction(number) then on VF. - llvm::sort(InvalidCosts, - [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { - if (Numbering[A.first] != Numbering[B.first]) - return Numbering[A.first] < Numbering[B.first]; - ElementCountComparator ECC; - return ECC(A.second, B.second); - }); - - // For a list of ordered instruction-vf pairs: - // [(load, vf1), (load, vf2), (store, vf1)] - // Group the instructions together to emit separate remarks for: - // load (vf1, vf2) - // store (vf1) - auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); - auto Subset = ArrayRef<InstructionVFPair>(); - do { - if (Subset.empty()) - Subset = Tail.take_front(1); - - Instruction *I = Subset.front().first; - - // If the next instruction is different, or if there are no other pairs, - // emit a remark for the collated subset. e.g. 
- // [(load, vf1), (load, vf2))] - // to emit: - // remark: invalid costs for 'load' at VF=(vf, vf2) - if (Subset == Tail || Tail[Subset.size()].first != I) { - std::string OutString; - raw_string_ostream OS(OutString); - assert(!Subset.empty() && "Unexpected empty range"); - OS << "Instruction with invalid costs prevented vectorization at VF=("; - for (const auto &Pair : Subset) - OS << (Pair.second == Subset.front().second ? "" : ", ") - << Pair.second; - OS << "):"; - if (auto *CI = dyn_cast<CallInst>(I)) - OS << " call to " << CI->getCalledFunction()->getName(); - else - OS << " " << I->getOpcodeName(); - OS.flush(); - reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); - Tail = Tail.drop_front(Subset.size()); - Subset = {}; - } else - // Grow the subset by one element - Subset = Tail.take_front(Subset.size() + 1); - } while (!Tail.empty()); - } + emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop); - if (!EnableCondStoresVectorization && NumPredStores) { - reportVectorizationFailure("There are conditional stores.", + if (!EnableCondStoresVectorization && CM.hasPredStores()) { + reportVectorizationFailure( + "There are conditional stores.", "store that is conditionally executed prevents vectorization", - "ConditionalStore", ORE, TheLoop); + "ConditionalStore", ORE, OrigLoop); ChosenFactor = ScalarCost; } @@ -5463,11 +5534,11 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( return ChosenFactor; } -bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( - const Loop &L, ElementCount VF) const { +bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( + ElementCount VF) const { // Cross iteration phis such as reductions need special handling and are // currently unsupported. - if (any_of(L.getHeader()->phis(), + if (any_of(OrigLoop->getHeader()->phis(), [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); })) return false; @@ -5475,20 +5546,21 @@ bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( // currently unsupported. for (const auto &Entry : Legal->getInductionVars()) { // Look for uses of the value of the induction at the last iteration. - Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); + Value *PostInc = + Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch()); for (User *U : PostInc->users()) - if (!L.contains(cast<Instruction>(U))) + if (!OrigLoop->contains(cast<Instruction>(U))) return false; // Look for uses of penultimate value of the induction. for (User *U : Entry.first->users()) - if (!L.contains(cast<Instruction>(U))) + if (!OrigLoop->contains(cast<Instruction>(U))) return false; } // Epilogue vectorization code has not been auditted to ensure it handles // non-latch exits properly. It may be fine, but it needs auditted and // tested. - if (L.getExitingBlock() != L.getLoopLatch()) + if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch()) return false; return true; @@ -5507,62 +5579,59 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( // We also consider epilogue vectorization unprofitable for targets that don't // consider interleaving beneficial (eg. MVE). - if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) + if (TTI.getMaxInterleaveFactor(VF) <= 1) return false; - // FIXME: We should consider changing the threshold for scalable - // vectors to take VScaleForTuning into account. 
- if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF) + + unsigned Multiplier = 1; + if (VF.isScalable()) + Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1); + if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF) return true; return false; } -VectorizationFactor -LoopVectorizationCostModel::selectEpilogueVectorizationFactor( - const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { +VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( + const ElementCount MainLoopVF, unsigned IC) { VectorizationFactor Result = VectorizationFactor::Disabled(); if (!EnableEpilogueVectorization) { - LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); + LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n"); return Result; } - if (!isScalarEpilogueAllowed()) { - LLVM_DEBUG( - dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " - "allowed.\n";); + if (!CM.isScalarEpilogueAllowed()) { + LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no " + "epilogue is allowed.\n"); return Result; } // Not really a cost consideration, but check for unsupported cases here to // simplify the logic. - if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { - LLVM_DEBUG( - dbgs() << "LEV: Unable to vectorize epilogue because the loop is " - "not a supported candidate.\n";); + if (!isCandidateForEpilogueVectorization(MainLoopVF)) { + LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop " + "is not a supported candidate.\n"); return Result; } if (EpilogueVectorizationForceVF > 1) { - LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); + LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n"); ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); - if (LVP.hasPlanWithVF(ForcedEC)) + if (hasPlanWithVF(ForcedEC)) return {ForcedEC, 0, 0}; else { - LLVM_DEBUG( - dbgs() - << "LEV: Epilogue vectorization forced factor is not viable.\n";); + LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not " + "viable.\n"); return Result; } } - if (TheLoop->getHeader()->getParent()->hasOptSize() || - TheLoop->getHeader()->getParent()->hasMinSize()) { + if (OrigLoop->getHeader()->getParent()->hasOptSize() || + OrigLoop->getHeader()->getParent()->hasMinSize()) { LLVM_DEBUG( - dbgs() - << "LEV: Epilogue vectorization skipped due to opt for size.\n";); + dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n"); return Result; } - if (!isEpilogueVectorizationProfitable(MainLoopVF)) { + if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " "this loop\n"); return Result; @@ -5574,21 +5643,48 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor( ElementCount EstimatedRuntimeVF = MainLoopVF; if (MainLoopVF.isScalable()) { EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); - if (std::optional<unsigned> VScale = getVScaleForTuning()) + if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) EstimatedRuntimeVF *= *VScale; } - for (auto &NextVF : ProfitableVFs) - if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && - ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || - ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && - (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) && - LVP.hasPlanWithVF(NextVF.Width)) + ScalarEvolution &SE = *PSE.getSE(); + 
Type *TCType = Legal->getWidestInductionType(); + const SCEV *RemainingIterations = nullptr; + for (auto &NextVF : ProfitableVFs) { + // Skip candidate VFs without a corresponding VPlan. + if (!hasPlanWithVF(NextVF.Width)) + continue; + + // Skip candidate VFs with widths >= the estimate runtime VF (scalable + // vectors) or the VF of the main loop (fixed vectors). + if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && + ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) || + ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) + continue; + + // If NextVF is greater than the number of remaining iterations, the + // epilogue loop would be dead. Skip such factors. + if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) { + // TODO: extend to support scalable VFs. + if (!RemainingIterations) { + const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop); + RemainingIterations = SE.getURemExpr( + TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC)); + } + if (SE.isKnownPredicate( + CmpInst::ICMP_UGT, + SE.getConstant(TCType, NextVF.Width.getKnownMinValue()), + RemainingIterations)) + continue; + } + + if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) Result = NextVF; + } if (Result != VectorizationFactor::Disabled()) LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " - << Result.Width << "\n";); + << Result.Width << "\n"); return Result; } @@ -5688,7 +5784,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, return 1; // We used the distance for the interleave count. - if (Legal->getMaxSafeDepDistBytes() != -1U) + if (!Legal->isSafeForAnyVectorWidth()) return 1; auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); @@ -5750,20 +5846,19 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; - unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); + unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) / + MaxLocalUsers); // Don't count the induction variable as interleaved. if (EnableIndVarRegisterHeur) { - TmpIC = - PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / - std::max(1U, (MaxLocalUsers - 1))); + TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) / + std::max(1U, (MaxLocalUsers - 1))); } IC = std::min(IC, TmpIC); } // Clamp the interleave ranges to reasonable counts. - unsigned MaxInterleaveCount = - TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); + unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); // Check if the user has overridden the max. if (VF.isScalar()) { @@ -5834,8 +5929,8 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, // We assume that the cost overhead is 1 and we use the cost model // to estimate the cost of the loop and interleave until the cost of the // loop overhead is about 5% of the cost of the loop. - unsigned SmallIC = std::min( - IC, (unsigned)PowerOf2Floor(SmallLoopCost / *LoopCost.getValue())); + unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>( + SmallLoopCost / *LoopCost.getValue())); // Interleave until store/load ports (estimated by max interleave count) are // saturated. 
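As a rough standalone model of the remaining-iteration check introduced in selectEpilogueVectorizationFactor above — not part of the patch itself, with a hypothetical helper name and plain integers standing in for the SCEV expressions — the idea is that after the main vector loop runs with VF lanes and interleave count IC, only TripCount % (VF * IC) iterations are left, so any epilogue VF wider than that remainder would leave the epilogue vector loop dead:

#include <cstdint>

// Sketch only: epilogue VF is worth considering iff it does not exceed the
// iterations left over after the main vector loop.
static bool epilogueVFIsViable(uint64_t TripCount, unsigned MainLoopVF,
                               unsigned IC, unsigned EpilogueVF) {
  uint64_t RemainingIterations = TripCount % (uint64_t(MainLoopVF) * IC);
  return EpilogueVF <= RemainingIterations;
}

For example, TripCount = 1000 with MainLoopVF = 8 and IC = 2 leaves 1000 % 16 = 8 remaining iterations, so an epilogue VF of 8 is still viable while 16 is not.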
@@ -5953,7 +6048,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { // Saves the list of values that are used in the loop but are defined outside // the loop (not including non-instruction values such as arguments and // constants). - SmallPtrSet<Value *, 8> LoopInvariants; + SmallSetVector<Instruction *, 8> LoopInvariants; for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { for (Instruction &I : BB->instructionsWithoutDebug()) { @@ -6079,11 +6174,16 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { for (auto *Inst : LoopInvariants) { // FIXME: The target might use more than one register for the type // even in the scalar case. - unsigned Usage = - VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); + bool IsScalar = all_of(Inst->users(), [&](User *U) { + auto *I = cast<Instruction>(U); + return TheLoop != LI->getLoopFor(I->getParent()) || + isScalarAfterVectorization(I, VFs[i]); + }); + + ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i]; unsigned ClassID = - TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); - Invariant[ClassID] += Usage; + TTI.getRegisterClassForType(VF.isVector(), Inst->getType()); + Invariant[ClassID] += GetRegUsage(Inst->getType(), VF); } LLVM_DEBUG({ @@ -6134,8 +6234,7 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { // instructions to scalarize, there's nothing to do. Collection may already // have occurred if we have a user-selected VF and are now computing the // expected cost for interleaving. - if (VF.isScalar() || VF.isZero() || - InstsToScalarize.find(VF) != InstsToScalarize.end()) + if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF)) return; // Initialize a mapping for VF in InstsToScalalarize. If we find that it's @@ -6224,7 +6323,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( Instruction *I = Worklist.pop_back_val(); // If we've already analyzed the instruction, there's nothing to do. - if (ScalarCosts.find(I) != ScalarCosts.end()) + if (ScalarCosts.contains(I)) continue; // Compute the cost of the vector instruction. 
Note that this cost already @@ -6362,11 +6461,6 @@ static const SCEV *getAddressAccessSCEV( return PSE.getSCEV(Ptr); } -static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { - return Legal->hasStride(I->getOperand(0)) || - Legal->hasStride(I->getOperand(1)); -} - InstructionCost LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, ElementCount VF) { @@ -6460,7 +6554,7 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, InstructionCost LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, ElementCount VF) { - assert(Legal->isUniformMemOp(*I)); + assert(Legal->isUniformMemOp(*I, VF)); Type *ValTy = getLoadStoreType(I); auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); @@ -6475,7 +6569,7 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, } StoreInst *SI = cast<StoreInst>(I); - bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); + bool isLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand()); return TTI.getAddressComputationCost(ValTy) + TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, CostKind) + @@ -6502,11 +6596,6 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, InstructionCost LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, ElementCount VF) { - // TODO: Once we have support for interleaving with scalable vectors - // we can calculate the cost properly here. - if (VF.isScalable()) - return InstructionCost::getInvalid(); - Type *ValTy = getLoadStoreType(I); auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); unsigned AS = getLoadStoreAddressSpace(I); @@ -6836,7 +6925,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF)) NumPredStores++; - if (Legal->isUniformMemOp(I)) { + if (Legal->isUniformMemOp(I, VF)) { auto isLegalToScalarize = [&]() { if (!VF.isScalable()) // Scalarization of fixed length vectors "just works". @@ -7134,8 +7223,12 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, case Instruction::And: case Instruction::Or: case Instruction::Xor: { - // Since we will replace the stride by 1 the multiplication should go away. - if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) + // If we're speculating on the stride being 1, the multiplication may + // fold away. We can generalize this for all operations using the notion + // of neutral elements. (TODO) + if (I->getOpcode() == Instruction::Mul && + (PSE.getSCEV(I->getOperand(0))->isOne() || + PSE.getSCEV(I->getOperand(1))->isOne())) return 0; // Detect reduction patterns @@ -7146,7 +7239,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, // second vector operand. One example of this are shifts on x86. 
Value *Op2 = I->getOperand(1); auto Op2Info = TTI.getOperandInfo(Op2); - if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) + if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && + Legal->isInvariant(Op2)) Op2Info.Kind = TargetTransformInfo::OK_UniformValue; SmallVector<const Value *, 4> Operands(I->operand_values()); @@ -7304,7 +7398,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, VectorTy = largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { - SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); + // Leave SrcVecTy unchanged - we only shrink the destination element + // type. VectorTy = smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); } @@ -7316,9 +7411,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) return *RedCost; - bool NeedToScalarize; + Function *Variant; CallInst *CI = cast<CallInst>(I); - InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); + InstructionCost CallCost = getVectorCallCost(CI, VF, &Variant); if (getVectorIntrinsicIDForCall(CI, TLI)) { InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); return std::min(CallCost, IntrinsicCost); @@ -7339,37 +7434,6 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, } // end of switch. } -char LoopVectorize::ID = 0; - -static const char lv_name[] = "Loop Vectorization"; - -INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) -INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) -INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) -INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) -INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) - -namespace llvm { - -Pass *createLoopVectorizePass() { return new LoopVectorize(); } - -Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, - bool VectorizeOnlyWhenForced) { - return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); -} - -} // end namespace llvm - void LoopVectorizationCostModel::collectValuesToIgnore() { // Ignore ephemeral values. CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); @@ -7462,7 +7526,7 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { // reasonable one. 
if (UserVF.isZero()) { VF = ElementCount::getFixed(determineVPlanVF( - TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) + TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) .getFixedValue(), CM)); LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); @@ -7497,13 +7561,16 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { std::optional<VectorizationFactor> LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { assert(OrigLoop->isInnermost() && "Inner loop expected."); + CM.collectValuesToIgnore(); + CM.collectElementTypesForWidening(); + FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. return std::nullopt; // Invalidate interleave groups if all blocks of loop will be predicated. if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && - !useMaskedInterleavedAccesses(*TTI)) { + !useMaskedInterleavedAccesses(TTI)) { LLVM_DEBUG( dbgs() << "LV: Invalidate all interleaved groups due to fold-tail by masking " @@ -7527,6 +7594,12 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); CM.collectInLoopReductions(); buildVPlansWithVPRecipes(UserVF, UserVF); + if (!hasPlanWithVF(UserVF)) { + LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF + << ".\n"); + return std::nullopt; + } + LLVM_DEBUG(printPlans(dbgs())); return {{UserVF, 0, 0}}; } else @@ -7562,8 +7635,13 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { return VectorizationFactor::Disabled(); // Select the optimal vectorization factor. - VectorizationFactor VF = CM.selectVectorizationFactor(VFCandidates); + VectorizationFactor VF = selectVectorizationFactor(VFCandidates); assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero."); + if (!hasPlanWithVF(VF.Width)) { + LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width + << ".\n"); + return std::nullopt; + } return VF; } @@ -7614,43 +7692,51 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) { } } -void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, - VPlan &BestVPlan, - InnerLoopVectorizer &ILV, - DominatorTree *DT, - bool IsEpilogueVectorization) { +SCEV2ValueTy LoopVectorizationPlanner::executePlan( + ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan, + InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization, + DenseMap<const SCEV *, Value *> *ExpandedSCEVs) { assert(BestVPlan.hasVF(BestVF) && "Trying to execute plan with unsupported VF"); assert(BestVPlan.hasUF(BestUF) && "Trying to execute plan with unsupported UF"); + assert( + (IsEpilogueVectorization || !ExpandedSCEVs) && + "expanded SCEVs to reuse can only be used during epilogue vectorization"); LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF << '\n'); - // Workaround! Compute the trip count of the original loop and cache it - // before we start modifying the CFG. This code has a systemic problem - // wherein it tries to run analysis over partially constructed IR; this is - // wrong, and not simply for SCEV. The trip count of the original loop - // simply happens to be prone to hitting this in practice. In theory, we - // can hit the same issue for any SCEV, or ValueTracking query done during - // mutation. See PR49900. 
- ILV.getOrCreateTripCount(OrigLoop->getLoopPreheader()); - if (!IsEpilogueVectorization) VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); // Perform the actual loop transformation. + VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; + + // 0. Generate SCEV-dependent code into the preheader, including TripCount, + // before making any changes to the CFG. + if (!BestVPlan.getPreheader()->empty()) { + State.CFG.PrevBB = OrigLoop->getLoopPreheader(); + State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator()); + BestVPlan.getPreheader()->execute(&State); + } + if (!ILV.getTripCount()) + ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0})); + else + assert(IsEpilogueVectorization && "should only re-use the existing trip " + "count during epilogue vectorization"); // 1. Set up the skeleton for vectorization, including vector pre-header and // middle block. The vector loop is created during VPlan execution. - VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; Value *CanonicalIVStartValue; std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = - ILV.createVectorizedLoopSkeleton(); + ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs + : State.ExpandedSCEVs); // Only use noalias metadata when using memory checks guaranteeing no overlap // across all iterations. const LoopAccessInfo *LAI = ILV.Legal->getLAI(); + std::unique_ptr<LoopVersioning> LVer = nullptr; if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() && !LAI->getRuntimePointerChecking()->getDiffChecks()) { @@ -7658,9 +7744,10 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, // still use it to add the noalias metadata. // TODO: Find a better way to re-use LoopVersioning functionality to add // metadata. - State.LVer = std::make_unique<LoopVersioning>( + LVer = std::make_unique<LoopVersioning>( *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT, PSE.getSE()); + State.LVer = &*LVer; State.LVer->prepareNoAliasMetadata(); } @@ -7677,10 +7764,9 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, //===------------------------------------------------===// // 2. Copy and widen instructions from the old loop into the new loop. - BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), - ILV.getOrCreateVectorTripCount(nullptr), - CanonicalIVStartValue, State, - IsEpilogueVectorization); + BestVPlan.prepareToExecute( + ILV.getTripCount(), ILV.getOrCreateVectorTripCount(nullptr), + CanonicalIVStartValue, State, IsEpilogueVectorization); BestVPlan.execute(&State); @@ -7706,13 +7792,18 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, LoopVectorizeHints Hints(L, true, *ORE); Hints.setAlreadyVectorized(); } - AddRuntimeUnrollDisableMetaData(L); + TargetTransformInfo::UnrollingPreferences UP; + TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); + if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue) + AddRuntimeUnrollDisableMetaData(L); // 3. Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. 
ILV.fixVectorizedLoop(State, BestVPlan); ILV.printDebugTracesAtEnd(); + + return State.ExpandedSCEVs; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -7725,8 +7816,6 @@ void LoopVectorizationPlanner::printPlans(raw_ostream &O) { } #endif -Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } - //===--------------------------------------------------------------------===// // EpilogueVectorizerMainLoop //===--------------------------------------------------------------------===// @@ -7734,7 +7823,8 @@ Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } /// This function is partially responsible for generating the control flow /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. std::pair<BasicBlock *, Value *> -EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { +EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton( + const SCEV2ValueTy &ExpandedSCEVs) { createVectorLoopSkeleton(""); // Generate the code to check the minimum iteration count of the vector @@ -7795,7 +7885,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, assert(Bypass && "Expected valid bypass basic block."); ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; - Value *Count = getOrCreateTripCount(LoopVectorPreHeader); + Value *Count = getTripCount(); // Reuse existing vector loop preheader for TC checks. // Note that new preheader block is generated for vector loop. BasicBlock *const TCCheckBlock = LoopVectorPreHeader; @@ -7803,8 +7893,10 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, // Generate code to check if the loop's trip count is less than VF * UF of the // main vector loop. - auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? - ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; + auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector() + : VF.isVector()) + ? ICmpInst::ICMP_ULE + : ICmpInst::ICMP_ULT; Value *CheckMinIters = Builder.CreateICmp( P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), @@ -7824,7 +7916,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, // Update dominator for Bypass & LoopExit. DT->changeImmediateDominator(Bypass, TCCheckBlock); - if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) + if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())) // For loops with multiple exits, there's no edge from the middle block // to exit blocks (as the epilogue must run) and thus no need to update // the immediate dominator of the exit blocks. @@ -7852,7 +7944,8 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, /// This function is partially responsible for generating the control flow /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
std::pair<BasicBlock *, Value *> -EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { +EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( + const SCEV2ValueTy &ExpandedSCEVs) { createVectorLoopSkeleton("vec.epilog."); // Now, compare the remaining count and if there aren't enough iterations to @@ -7891,7 +7984,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { DT->changeImmediateDominator(LoopScalarPreHeader, EPI.EpilogueIterationCountCheck); - if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) + if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())) // If there is an epilogue which must run, there's no edge from the // middle block to exit blocks and thus no need to update the immediate // dominator of the exit blocks. @@ -7950,7 +8043,8 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { // check, then the resume value for the induction variable comes from // the trip count of the main vector loop, hence passing the AdditionalBypass // argument. - createInductionResumeValues({VecEpilogueIterationCountCheck, + createInductionResumeValues(ExpandedSCEVs, + {VecEpilogueIterationCountCheck, EPI.VectorTripCount} /* AdditionalBypass */); return {completeLoopSkeleton(), EPResumeVal}; @@ -7972,8 +8066,9 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( // Generate code to check if the loop's trip count is less than VF * UF of the // vector epilogue loop. - auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ? - ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; + auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()) + ? ICmpInst::ICMP_ULE + : ICmpInst::ICMP_ULT; Value *CheckMinIters = Builder.CreateICmp(P, Count, @@ -8008,8 +8103,7 @@ bool LoopVectorizationPlanner::getDecisionAndClampRange( assert(!Range.isEmpty() && "Trying to test an empty VF range."); bool PredicateAtRangeStart = Predicate(Range.Start); - for (ElementCount TmpVF = Range.Start * 2; - ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) + for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End)) if (Predicate(TmpVF) != PredicateAtRangeStart) { Range.End = TmpVF; break; @@ -8025,16 +8119,16 @@ bool LoopVectorizationPlanner::getDecisionAndClampRange( /// buildVPlan(). void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, ElementCount MaxVF) { - auto MaxVFPlusOne = MaxVF.getWithIncrement(1); - for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { - VFRange SubRange = {VF, MaxVFPlusOne}; + auto MaxVFTimes2 = MaxVF * 2; + for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { + VFRange SubRange = {VF, MaxVFTimes2}; VPlans.push_back(buildVPlan(SubRange)); VF = SubRange.End; } } VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, - VPlanPtr &Plan) { + VPlan &Plan) { assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); // Look for cached value. @@ -8058,7 +8152,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, if (OrigLoop->isLoopExiting(Src)) return EdgeMaskCache[Edge] = SrcMask; - VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); + VPValue *EdgeMask = Plan.getVPValueOrAddLiveIn(BI->getCondition()); assert(EdgeMask && "No Edge Mask found for condition"); if (BI->getSuccessor(0) != Dst) @@ -8069,7 +8163,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 
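getDecisionAndClampRange above evaluates a per-VF predicate across a range of power-of-two VFs and clamps the range at the first VF where the answer changes, so that one VPlan only covers VFs that agree on the decision. A small illustrative model, with a made-up profitability predicate:

#include <functional>
#include <iostream>

// Illustrative: clamp a [start, end) range of power-of-two VFs so the
// predicate has the same answer across the whole clamped range.
bool clampRange(unsigned start, unsigned &end,
                const std::function<bool(unsigned)> &pred) {
  bool atStart = pred(start);
  for (unsigned vf = start * 2; vf < end; vf *= 2)
    if (pred(vf) != atStart) {
      end = vf;   // everything from vf upwards is left for the next plan
      break;
    }
  return atStart;
}

int main() {
  auto profitable = [](unsigned vf) { return vf <= 8; }; // hypothetical query
  unsigned end = 32;
  bool decision = clampRange(4, end, profitable);
  // Prints decision=1, clamped end=16: one plan covers VF 4 and 8.
  std::cout << "decision=" << decision << " clamped end=" << end << "\n";
}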
// The select version does not introduce new UB if SrcMask is false and // EdgeMask is poison. Using 'and' here introduces undefined behavior. - VPValue *False = Plan->getOrAddVPValue( + VPValue *False = Plan.getVPValueOrAddLiveIn( ConstantInt::getFalse(BI->getCondition()->getType())); EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); @@ -8078,7 +8172,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, return EdgeMaskCache[Edge] = EdgeMask; } -VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { +VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) { assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); // Look for cached value. @@ -8098,29 +8192,28 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { // If we're using the active lane mask for control flow, then we get the // mask from the active lane mask PHI that is cached in the VPlan. - PredicationStyle EmitGetActiveLaneMask = CM.TTI.emitGetActiveLaneMask(); - if (EmitGetActiveLaneMask == PredicationStyle::DataAndControlFlow) - return BlockMaskCache[BB] = Plan->getActiveLaneMaskPhi(); + TailFoldingStyle TFStyle = CM.getTailFoldingStyle(); + if (useActiveLaneMaskForControlFlow(TFStyle)) + return BlockMaskCache[BB] = Plan.getActiveLaneMaskPhi(); // Introduce the early-exit compare IV <= BTC to form header block mask. // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by // constructing the desired canonical IV in the header block as its first // non-phi instructions. - VPBasicBlock *HeaderVPBB = - Plan->getVectorLoopRegion()->getEntryBasicBlock(); + VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); - auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV()); + auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); VPBuilder::InsertPointGuard Guard(Builder); Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); - if (EmitGetActiveLaneMask != PredicationStyle::None) { - VPValue *TC = Plan->getOrCreateTripCount(); + if (useActiveLaneMask(TFStyle)) { + VPValue *TC = Plan.getTripCount(); BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}, nullptr, "active.lane.mask"); } else { - VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); + VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); } return BlockMaskCache[BB] = BlockMask; @@ -8168,7 +8261,7 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VPValue *Mask = nullptr; if (Legal->isMaskRequired(I)) - Mask = createBlockInMask(I->getParent(), Plan); + Mask = createBlockInMask(I->getParent(), *Plan); // Determine if the pointer operand of the access is either consecutive or // reverse consecutive. @@ -8189,22 +8282,11 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also /// insert a recipe to expand the step for the induction recipe. 
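createBlockInMask above forms the header mask either as IV <= backedge-taken-count or via the active.lane.mask intrinsic against the trip count. A scalar model of what each lane of those masks evaluates to (illustrative only; the overflow and saturation subtleties of the real intrinsic are ignored):

#include <cstdio>
#include <vector>

// Lane i of the mask guards scalar iteration iv + i.
std::vector<bool> maskFromBTC(unsigned long iv, unsigned long btc, unsigned vf) {
  std::vector<bool> m(vf);
  for (unsigned i = 0; i < vf; ++i)
    m[i] = (iv + i) <= btc;              // IV <= backedge-taken count
  return m;
}

std::vector<bool> activeLaneMask(unsigned long iv, unsigned long tc, unsigned vf) {
  std::vector<bool> m(vf);
  for (unsigned i = 0; i < vf; ++i)
    m[i] = (iv + i) < tc;                // lane semantics of the intrinsic
  return m;
}

int main() {
  // Trip count 10, VF 4: the third vector iteration covers iterations 8..11.
  auto a = maskFromBTC(8, /*btc=*/9, 4);
  auto b = activeLaneMask(8, /*tc=*/10, 4);
  for (unsigned i = 0; i < 4; ++i)
    std::printf("lane %u: %d %d\n", i, (int)a[i], (int)b[i]); // 1 1, 1 1, 0 0, 0 0
}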
-static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes( - PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start, - const InductionDescriptor &IndDesc, LoopVectorizationCostModel &CM, - VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) { - // Returns true if an instruction \p I should be scalarized instead of - // vectorized for the chosen vectorization factor. - auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) { - return CM.isScalarAfterVectorization(I, VF) || - CM.isProfitableToScalarize(I, VF); - }; - - bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](ElementCount VF) { - return ShouldScalarizeInstruction(PhiOrTrunc, VF); - }, - Range); +static VPWidenIntOrFpInductionRecipe * +createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, + VPValue *Start, const InductionDescriptor &IndDesc, + VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, + VFRange &Range) { assert(IndDesc.getStartValue() == Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && @@ -8213,12 +8295,10 @@ static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes( VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE); if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { - return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI, - !NeedsScalarIVOnly); + return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI); } assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here"); - return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, - !NeedsScalarIVOnly); + return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc); } VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI( @@ -8227,14 +8307,13 @@ VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI( // Check if this is an integer or fp induction. If so, build the recipe that // produces its scalar and vector values. if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) - return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, CM, Plan, + return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan, *PSE.getSE(), *OrigLoop, Range); // Check if this is pointer induction. If so, build the recipe for it. 
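createWidenInductionRecipes above builds a widened induction whose vector value holds VF consecutive values of the scalar induction and is advanced by VF * Step every vector iteration. A scalar C++ model of that shape (values chosen arbitrarily):

#include <cstdio>
#include <vector>

// Illustrative model of a widened integer induction: lane i holds
// start + (iv + i) * step, and the whole vector advances by VF * step.
int main() {
  const int start = 5, step = 3, vf = 4, tripCount = 8;
  std::vector<int> widenedIV(vf);
  for (int i = 0; i < vf; ++i)
    widenedIV[i] = start + i * step;           // initial splat + step vector

  for (int iv = 0; iv < tripCount; iv += vf) {
    for (int i = 0; i < vf; ++i)
      std::printf("%d ", widenedIV[i]);        // induction values for this step
    std::printf("\n");
    for (int i = 0; i < vf; ++i)
      widenedIV[i] += vf * step;               // vector increment
  }
}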
if (auto *II = Legal->getPointerInductionDescriptor(Phi)) { VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(), *PSE.getSE()); - assert(isa<SCEVConstant>(II->getStep())); return new VPWidenPointerInductionRecipe( Phi, Operands[0], Step, *II, LoopVectorizationPlanner::getDecisionAndClampRange( @@ -8267,9 +8346,9 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( auto *Phi = cast<PHINode>(I->getOperand(0)); const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); - VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); - return createWidenInductionRecipes(Phi, I, Start, II, CM, Plan, - *PSE.getSE(), *OrigLoop, Range); + VPValue *Start = Plan.getVPValueOrAddLiveIn(II.getStartValue()); + return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(), + *OrigLoop, Range); } return nullptr; } @@ -8309,7 +8388,7 @@ VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, for (unsigned In = 0; In < NumIncoming; In++) { VPValue *EdgeMask = - createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); + createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), *Plan); assert((EdgeMask || NumIncoming == 1) && "Multiple predecessors with one having a full mask"); OperandsWithMask.push_back(Operands[In]); @@ -8321,8 +8400,8 @@ VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, ArrayRef<VPValue *> Operands, - VFRange &Range) const { - + VFRange &Range, + VPlanPtr &Plan) { bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI, VF); @@ -8339,17 +8418,17 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, ID == Intrinsic::experimental_noalias_scope_decl)) return nullptr; - ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size()); + SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size())); // Is it beneficial to perform intrinsic call compared to lib call? bool ShouldUseVectorIntrinsic = ID && LoopVectorizationPlanner::getDecisionAndClampRange( [&](ElementCount VF) -> bool { - bool NeedToScalarize = false; + Function *Variant; // Is it beneficial to perform intrinsic call compared to lib // call? InstructionCost CallCost = - CM.getVectorCallCost(CI, VF, NeedToScalarize); + CM.getVectorCallCost(CI, VF, &Variant); InstructionCost IntrinsicCost = CM.getVectorIntrinsicCost(CI, VF); return IntrinsicCost <= CallCost; @@ -8358,6 +8437,9 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, if (ShouldUseVectorIntrinsic) return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID); + Function *Variant = nullptr; + ElementCount VariantVF; + bool NeedsMask = false; // Is better to call a vectorized version of the function than to to scalarize // the call? auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange( @@ -8365,14 +8447,57 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, // The following case may be scalarized depending on the VF. // The flag shows whether we can use a usual Call for vectorized // version of the instruction. - bool NeedToScalarize = false; - CM.getVectorCallCost(CI, VF, NeedToScalarize); - return !NeedToScalarize; + + // If we've found a variant at a previous VF, then stop looking. 
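tryToBlend above lowers a header-block phi into a blend keyed by edge masks. Conceptually, per lane, that is a chain of selects that folds in each further incoming value under its mask; the sketch below is illustrative only and does not mirror the exact VPBlendRecipe operand layout:

#include <cstdio>
#include <vector>

// Start from the first incoming value and select each later incoming value
// where its edge mask is true.
int blendPhi(const std::vector<int> &incoming, const std::vector<bool> &edgeMask) {
  int result = incoming[0];
  for (size_t in = 1; in < incoming.size(); ++in)
    result = edgeMask[in] ? incoming[in] : result;   // select(mask, new, old)
  return result;
}

int main() {
  // Two incoming values; this lane reached the phi via the second predecessor.
  std::printf("%d\n", blendPhi({10, 20}, {false, true}));  // 20
}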
A + // vectorized variant of a function expects input in a certain shape + // -- basically the number of input registers, the number of lanes + // per register, and whether there's a mask required. + // We store a pointer to the variant in the VPWidenCallRecipe, so + // once we have an appropriate variant it's only valid for that VF. + // This will force a different vplan to be generated for each VF that + // finds a valid variant. + if (Variant) + return false; + CM.getVectorCallCost(CI, VF, &Variant, &NeedsMask); + // If we found a valid vector variant at this VF, then store the VF + // in case we need to generate a mask. + if (Variant) + VariantVF = VF; + return Variant != nullptr; }, Range); - if (ShouldUseVectorCall) + if (ShouldUseVectorCall) { + if (NeedsMask) { + // We have 2 cases that would require a mask: + // 1) The block needs to be predicated, either due to a conditional + // in the scalar loop or use of an active lane mask with + // tail-folding, and we use the appropriate mask for the block. + // 2) No mask is required for the block, but the only available + // vector variant at this VF requires a mask, so we synthesize an + // all-true mask. + VPValue *Mask = nullptr; + if (Legal->isMaskRequired(CI)) + Mask = createBlockInMask(CI->getParent(), *Plan); + else + Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue( + IntegerType::getInt1Ty(Variant->getFunctionType()->getContext()))); + + VFShape Shape = VFShape::get(*CI, VariantVF, /*HasGlobalPred=*/true); + unsigned MaskPos = 0; + + for (const VFInfo &Info : VFDatabase::getMappings(*CI)) + if (Info.Shape == Shape) { + assert(Info.isMasked() && "Vector function info shape mismatch"); + MaskPos = Info.getParamIndexForOptionalMask().value(); + break; + } + + Ops.insert(Ops.begin() + MaskPos, Mask); + } + return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), - Intrinsic::not_intrinsic); + Intrinsic::not_intrinsic, Variant); + } return nullptr; } @@ -8405,9 +8530,9 @@ VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I, // div/rem operation itself. Otherwise fall through to general handling below. 
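The hunk above teaches tryToWidenCall to pick a vector variant that takes a mask, inserting the block mask at the variant's mask-parameter position, or synthesizing an all-true mask when the block itself needs no predication. A per-lane model of why the mask matters (illustrative; scalarFn is a hypothetical callee):

#include <cstdio>
#include <vector>

// Inactive lanes must not run the scalar body (it may trap or have side
// effects), so the masked variant skips them.
float scalarFn(float x) { return 1.0f / x; }     // hypothetical callee

std::vector<float> maskedVectorCall(const std::vector<float> &x,
                                    const std::vector<bool> &mask) {
  std::vector<float> r(x.size(), 0.0f);          // inactive lanes: don't care
  for (size_t i = 0; i < x.size(); ++i)
    if (mask[i])
      r[i] = scalarFn(x[i]);
  return r;
}

int main() {
  std::vector<float> x = {1, 2, 0, 4};
  std::vector<bool> needed = {true, true, false, true};   // block mask
  std::vector<bool> allTrue(x.size(), true);              // synthesized mask
  auto a = maskedVectorCall(x, needed);
  auto b = maskedVectorCall(x, allTrue);
  // Lane 2: masked call leaves it untouched, unmasked call divides by zero.
  std::printf("%f %f\n", a[2], b[2]);
}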
if (CM.isPredicatedInst(I)) { SmallVector<VPValue *> Ops(Operands.begin(), Operands.end()); - VPValue *Mask = createBlockInMask(I->getParent(), Plan); - VPValue *One = - Plan->getOrAddExternalDef(ConstantInt::get(I->getType(), 1u, false)); + VPValue *Mask = createBlockInMask(I->getParent(), *Plan); + VPValue *One = Plan->getVPValueOrAddLiveIn( + ConstantInt::get(I->getType(), 1u, false)); auto *SafeRHS = new VPInstruction(Instruction::Select, {Mask, Ops[1], One}, I->getDebugLoc()); @@ -8415,38 +8540,26 @@ VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I, Ops[1] = SafeRHS; return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end())); } - LLVM_FALLTHROUGH; + [[fallthrough]]; } case Instruction::Add: case Instruction::And: case Instruction::AShr: - case Instruction::BitCast: case Instruction::FAdd: case Instruction::FCmp: case Instruction::FDiv: case Instruction::FMul: case Instruction::FNeg: - case Instruction::FPExt: - case Instruction::FPToSI: - case Instruction::FPToUI: - case Instruction::FPTrunc: case Instruction::FRem: case Instruction::FSub: case Instruction::ICmp: - case Instruction::IntToPtr: case Instruction::LShr: case Instruction::Mul: case Instruction::Or: - case Instruction::PtrToInt: case Instruction::Select: - case Instruction::SExt: case Instruction::Shl: - case Instruction::SIToFP: case Instruction::Sub: - case Instruction::Trunc: - case Instruction::UIToFP: case Instruction::Xor: - case Instruction::ZExt: case Instruction::Freeze: return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); }; @@ -8462,9 +8575,9 @@ void VPRecipeBuilder::fixHeaderPhis() { } } -VPBasicBlock *VPRecipeBuilder::handleReplication( - Instruction *I, VFRange &Range, VPBasicBlock *VPBB, - VPlanPtr &Plan) { +VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I, + VFRange &Range, + VPlan &Plan) { bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, Range); @@ -8501,83 +8614,22 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( break; } } - - auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), - IsUniform, IsPredicated); - - // Find if I uses a predicated instruction. If so, it will use its scalar - // value. Avoid hoisting the insert-element which packs the scalar value into - // a vector value, as that happens iff all users use the vector value. - for (VPValue *Op : Recipe->operands()) { - auto *PredR = - dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDefiningRecipe()); - if (!PredR) - continue; - auto *RepR = cast<VPReplicateRecipe>( - PredR->getOperand(0)->getDefiningRecipe()); - assert(RepR->isPredicated() && - "expected Replicate recipe to be predicated"); - RepR->setAlsoPack(false); - } - - // Finalize the recipe for Instr, first if it is not predicated. + VPValue *BlockInMask = nullptr; if (!IsPredicated) { + // Finalize the recipe for Instr, first if it is not predicated. LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); - setRecipe(I, Recipe); - Plan->addVPValue(I, Recipe); - VPBB->appendRecipe(Recipe); - return VPBB; - } - LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); - - VPBlockBase *SingleSucc = VPBB->getSingleSuccessor(); - assert(SingleSucc && "VPBB must have a single successor when handling " - "predicated replication."); - VPBlockUtils::disconnectBlocks(VPBB, SingleSucc); - // Record predicated instructions for above packing optimizations. 
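For predicated div/rem, the code above replaces the divisor with select(mask, rhs, 1) so the unconditional vector divide cannot trap on masked-off lanes. A per-lane sketch of that transformation:

#include <cstdio>
#include <vector>

// Lanes whose mask is false get a dummy divisor of 1; their results are
// never used by the rest of the vectorized code.
std::vector<unsigned> predicatedUDiv(const std::vector<unsigned> &a,
                                     const std::vector<unsigned> &b,
                                     const std::vector<bool> &mask) {
  std::vector<unsigned> r(a.size());
  for (size_t i = 0; i < a.size(); ++i) {
    unsigned safeRHS = mask[i] ? b[i] : 1u;   // select(mask, rhs, 1)
    r[i] = a[i] / safeRHS;                    // unconditional vector divide
  }
  return r;
}

int main() {
  auto r = predicatedUDiv({8, 9, 10}, {2, 0, 5}, {true, false, true});
  std::printf("%u %u %u\n", r[0], r[1], r[2]);  // 4, (ignored) 9, 2
}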
- VPBlockBase *Region = createReplicateRegion(Recipe, Plan); - VPBlockUtils::insertBlockAfter(Region, VPBB); - auto *RegSucc = new VPBasicBlock(); - VPBlockUtils::insertBlockAfter(RegSucc, Region); - VPBlockUtils::connectBlocks(RegSucc, SingleSucc); - return RegSucc; -} - -VPRegionBlock * -VPRecipeBuilder::createReplicateRegion(VPReplicateRecipe *PredRecipe, - VPlanPtr &Plan) { - Instruction *Instr = PredRecipe->getUnderlyingInstr(); - // Instructions marked for predication are replicated and placed under an - // if-then construct to prevent side-effects. - // Generate recipes to compute the block mask for this region. - VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); - - // Build the triangular if-then region. - std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); - assert(Instr->getParent() && "Predicated instruction not in any basic block"); - auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); - auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); - auto *PHIRecipe = Instr->getType()->isVoidTy() - ? nullptr - : new VPPredInstPHIRecipe(PredRecipe); - if (PHIRecipe) { - setRecipe(Instr, PHIRecipe); - Plan->addVPValue(Instr, PHIRecipe); } else { - setRecipe(Instr, PredRecipe); - Plan->addVPValue(Instr, PredRecipe); + LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); + // Instructions marked for predication are replicated and a mask operand is + // added initially. Masked replicate recipes will later be placed under an + // if-then construct to prevent side-effects. Generate recipes to compute + // the block mask for this region. + BlockInMask = createBlockInMask(I->getParent(), Plan); } - auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); - auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); - VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true); - - // Note: first set Entry as region entry and then connect successors starting - // from it in order, to propagate the "parent" of each VPBasicBlock. 
- VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry); - VPBlockUtils::connectBlocks(Pred, Exiting); - - return Region; + auto *Recipe = new VPReplicateRecipe(I, Plan.mapToVPValues(I->operands()), + IsUniform, BlockInMask); + return toVPRecipeResult(Recipe); } VPRecipeOrVPValueTy @@ -8643,7 +8695,7 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, return nullptr; if (auto *CI = dyn_cast<CallInst>(Instr)) - return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); + return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan)); if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); @@ -8653,13 +8705,16 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) return toVPRecipeResult(new VPWidenGEPRecipe( - GEP, make_range(Operands.begin(), Operands.end()), OrigLoop)); + GEP, make_range(Operands.begin(), Operands.end()))); if (auto *SI = dyn_cast<SelectInst>(Instr)) { - bool InvariantCond = - PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); return toVPRecipeResult(new VPWidenSelectRecipe( - *SI, make_range(Operands.begin(), Operands.end()), InvariantCond)); + *SI, make_range(Operands.begin(), Operands.end()))); + } + + if (auto *CI = dyn_cast<CastInst>(Instr)) { + return toVPRecipeResult( + new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(), CI)); } return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan)); @@ -8677,34 +8732,11 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, auto &ConditionalAssumes = Legal->getConditionalAssumes(); DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); - MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); - // Dead instructions do not need sinking. Remove them from SinkAfter. - for (Instruction *I : DeadInstructions) - SinkAfter.erase(I); - - // Cannot sink instructions after dead instructions (there won't be any - // recipes for them). Instead, find the first non-dead previous instruction. - for (auto &P : Legal->getSinkAfter()) { - Instruction *SinkTarget = P.second; - Instruction *FirstInst = &*SinkTarget->getParent()->begin(); - (void)FirstInst; - while (DeadInstructions.contains(SinkTarget)) { - assert( - SinkTarget != FirstInst && - "Must find a live instruction (at least the one feeding the " - "fixed-order recurrence PHI) before reaching beginning of the block"); - SinkTarget = SinkTarget->getPrevNode(); - assert(SinkTarget != P.first && - "sink source equals target, no sinking required"); - } - P.second = SinkTarget; - } - - auto MaxVFPlusOne = MaxVF.getWithIncrement(1); - for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { - VFRange SubRange = {VF, MaxVFPlusOne}; - VPlans.push_back( - buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); + auto MaxVFTimes2 = MaxVF * 2; + for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { + VFRange SubRange = {VF, MaxVFTimes2}; + if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange, DeadInstructions)) + VPlans.push_back(std::move(*Plan)); VF = SubRange.End; } } @@ -8712,10 +8744,9 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, // Add the necessary canonical IV and branch recipes required to control the // loop. 
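The addCanonicalIVRecipes function that follows wires up the canonical induction variable and the branch recipe that controls the vector loop. In the simplest configuration sketched here (no tail folding), the generated control reduces to the scalar model below; vectorTripCount stands for the trip count rounded down to a multiple of VF * UF:

#include <cstdio>

// Illustrative scalar model of canonical IV plus BranchOnCount control: the
// IV starts at 0, is bumped by VF * UF per vector iteration, and the loop
// exits once it reaches the vector trip count. The do/while reflects that the
// vector loop only runs after the minimum-iteration-count guard passed.
int main() {
  const unsigned long tripCount = 37;
  const unsigned vf = 4, uf = 2;
  const unsigned long step = (unsigned long)vf * uf;
  const unsigned long vectorTripCount = (tripCount / step) * step; // 32

  unsigned long iv = 0;                 // canonical IV phi
  do {
    // ... vector body for scalar iterations [iv, iv + step) ...
    iv += step;                         // canonical IV increment
  } while (iv != vectorTripCount);      // branch-on-count
  std::printf("scalar remainder starts at %lu\n", iv);   // 32
}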
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, - bool HasNUW, - bool UseLaneMaskForLoopControlFlow) { + TailFoldingStyle Style) { Value *StartIdx = ConstantInt::get(IdxTy, 0); - auto *StartV = Plan.getOrAddVPValue(StartIdx); + auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx); // Add a VPCanonicalIVPHIRecipe starting at 0 to the header. auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); @@ -8725,6 +8756,7 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar // IV by VF * UF. + bool HasNUW = Style == TailFoldingStyle::None; auto *CanonicalIVIncrement = new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW : VPInstruction::CanonicalIVIncrement, @@ -8732,11 +8764,10 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, CanonicalIVPHI->addOperand(CanonicalIVIncrement); VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); - EB->appendRecipe(CanonicalIVIncrement); - - if (UseLaneMaskForLoopControlFlow) { + if (useActiveLaneMaskForControlFlow(Style)) { // Create the active lane mask instruction in the vplan preheader. - VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock(); + VPBasicBlock *VecPreheader = + cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor()); // We can't use StartV directly in the ActiveLaneMask VPInstruction, since // we have to take unrolling into account. Each part needs to start at @@ -8745,14 +8776,34 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW : VPInstruction::CanonicalIVIncrementForPart, {StartV}, DL, "index.part.next"); - Preheader->appendRecipe(CanonicalIVIncrementParts); + VecPreheader->appendRecipe(CanonicalIVIncrementParts); // Create the ActiveLaneMask instruction using the correct start values. - VPValue *TC = Plan.getOrCreateTripCount(); + VPValue *TC = Plan.getTripCount(); + + VPValue *TripCount, *IncrementValue; + if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { + // When avoiding a runtime check, the active.lane.mask inside the loop + // uses a modified trip count and the induction variable increment is + // done after the active.lane.mask intrinsic is called. + auto *TCMinusVF = + new VPInstruction(VPInstruction::CalculateTripCountMinusVF, {TC}, DL); + VecPreheader->appendRecipe(TCMinusVF); + IncrementValue = CanonicalIVPHI; + TripCount = TCMinusVF; + } else { + // When the loop is guarded by a runtime overflow check for the loop + // induction variable increment by VF, we can increment the value before + // the get.active.lane mask and use the unmodified tripcount. + EB->appendRecipe(CanonicalIVIncrement); + IncrementValue = CanonicalIVIncrement; + TripCount = TC; + } + auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask, {CanonicalIVIncrementParts, TC}, DL, "active.lane.mask.entry"); - Preheader->appendRecipe(EntryALM); + VecPreheader->appendRecipe(EntryALM); // Now create the ActiveLaneMaskPhi recipe in the main loop using the // preheader ActiveLaneMask instruction. @@ -8763,15 +8814,21 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, CanonicalIVIncrementParts = new VPInstruction(HasNUW ? 
VPInstruction::CanonicalIVIncrementForPartNUW : VPInstruction::CanonicalIVIncrementForPart, - {CanonicalIVIncrement}, DL); + {IncrementValue}, DL); EB->appendRecipe(CanonicalIVIncrementParts); auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask, - {CanonicalIVIncrementParts, TC}, DL, + {CanonicalIVIncrementParts, TripCount}, DL, "active.lane.mask.next"); EB->appendRecipe(ALM); LaneMaskPhi->addOperand(ALM); + if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { + // Do the increment of the canonical IV after the active.lane.mask, because + // that value is still based off %CanonicalIVPHI + EB->appendRecipe(CanonicalIVIncrement); + } + // We have to invert the mask here because a true condition means jumping // to the exit block. auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL); @@ -8781,6 +8838,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL); EB->appendRecipe(BranchBack); } else { + EB->appendRecipe(CanonicalIVIncrement); + // Add the BranchOnCount VPInstruction to the latch. VPInstruction *BranchBack = new VPInstruction( VPInstruction::BranchOnCount, @@ -8804,14 +8863,13 @@ static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, for (PHINode &ExitPhi : ExitBB->phis()) { Value *IncomingValue = ExitPhi.getIncomingValueForBlock(ExitingBB); - VPValue *V = Plan.getOrAddVPValue(IncomingValue, true); + VPValue *V = Plan.getVPValueOrAddLiveIn(IncomingValue); Plan.addLiveOut(&ExitPhi, V); } } -VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, - const MapVector<Instruction *, Instruction *> &SinkAfter) { +std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( + VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions) { SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; @@ -8822,12 +8880,6 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // process after constructing the initial VPlan. // --------------------------------------------------------------------------- - // Mark instructions we'll need to sink later and their targets as - // ingredients whose recipe we'll need to record. - for (const auto &Entry : SinkAfter) { - RecipeBuilder.recordRecipeOf(Entry.first); - RecipeBuilder.recordRecipeOf(Entry.second); - } for (const auto &Reduction : CM.getInLoopReductionChains()) { PHINode *Phi = Reduction.first; RecurKind Kind = @@ -8852,9 +8904,15 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // single VPInterleaveRecipe. for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { auto applyIG = [IG, this](ElementCount VF) -> bool { - return (VF.isVector() && // Query is illegal for VF == 1 - CM.getWideningDecision(IG->getInsertPos(), VF) == - LoopVectorizationCostModel::CM_Interleave); + bool Result = (VF.isVector() && // Query is illegal for VF == 1 + CM.getWideningDecision(IG->getInsertPos(), VF) == + LoopVectorizationCostModel::CM_Interleave); + // For scalable vectors, the only interleave factor currently supported + // is 2 since we require the (de)interleave2 intrinsics instead of + // shufflevectors. 
+ assert((!Result || !VF.isScalable() || IG->getFactor() == 2) && + "Unsupported interleave factor for scalable vectors"); + return Result; }; if (!getDecisionAndClampRange(applyIG, Range)) continue; @@ -8869,26 +8927,34 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // visit each basic block after having visited its predecessor basic blocks. // --------------------------------------------------------------------------- - // Create initial VPlan skeleton, starting with a block for the pre-header, - // followed by a region for the vector loop, followed by the middle block. The - // skeleton vector loop region contains a header and latch block. - VPBasicBlock *Preheader = new VPBasicBlock("vector.ph"); - auto Plan = std::make_unique<VPlan>(Preheader); - + // Create initial VPlan skeleton, having a basic block for the pre-header + // which contains SCEV expansions that need to happen before the CFG is + // modified; a basic block for the vector pre-header, followed by a region for + // the vector loop, followed by the middle basic block. The skeleton vector + // loop region contains a header and latch basic blocks. + VPlanPtr Plan = VPlan::createInitialVPlan( + createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop), + *PSE.getSE()); VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); - VPBlockUtils::insertBlockAfter(TopRegion, Preheader); + VPBlockUtils::insertBlockAfter(TopRegion, Plan->getEntry()); VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block"); VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); + // Don't use getDecisionAndClampRange here, because we don't know the UF + // so this function is better to be conservative, rather than to split + // it up into different VPlans. + bool IVUpdateMayOverflow = false; + for (ElementCount VF : Range) + IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF); + Instruction *DLInst = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DLInst ? DLInst->getDebugLoc() : DebugLoc(), - !CM.foldTailByMasking(), - CM.useActiveLaneMaskForControlFlow()); + CM.getTailFoldingStyle(IVUpdateMayOverflow)); // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. @@ -8896,18 +8962,16 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( DFS.perform(LI); VPBasicBlock *VPBB = HeaderVPBB; - SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove; for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { // Relevant instructions from basic block BB will be grouped into VPRecipe // ingredients and fill a new VPBasicBlock. - unsigned VPBBsForBB = 0; if (VPBB != HeaderVPBB) VPBB->setName(BB->getName()); Builder.setInsertPoint(VPBB); // Introduce each ingredient into VPlan. // TODO: Model and preserve debug intrinsics in VPlan. 
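The tail-folding handling above distinguishes two orderings when the active lane mask drives control flow: with a runtime overflow check the IV is incremented first and compared against the unmodified trip count, while the without-runtime-check style compares the un-incremented IV against a trip count reduced by VF and only then increments, so the addition can never wrap. A lane-0 scalar model of the two orderings (illustrative; the trip-count-minus-VF step is modeled as a saturating subtract):

#include <cstdio>

bool nextMaskWithRuntimeCheck(unsigned long &iv, unsigned long tc, unsigned vf) {
  iv += vf;                         // canonical IV increment first
  return iv < tc;                   // active lane mask against full trip count
}

bool nextMaskWithoutRuntimeCheck(unsigned long &iv, unsigned long tc, unsigned vf) {
  unsigned long tcMinusVF = tc >= vf ? tc - vf : 0;
  bool active = iv < tcMinusVF;     // mask from the un-incremented IV
  iv += vf;                         // increment afterwards
  return active;
}

int main() {
  unsigned long ivA = 0, ivB = 0, tc = 10;
  unsigned vf = 4;
  // Both schemes execute the same number of vector iterations.
  while (nextMaskWithRuntimeCheck(ivA, tc, vf)) {}
  while (nextMaskWithoutRuntimeCheck(ivB, tc, vf)) {}
  std::printf("final IVs: %lu %lu\n", ivA, ivB);   // 12 12
}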
- for (Instruction &I : BB->instructionsWithoutDebug()) { + for (Instruction &I : BB->instructionsWithoutDebug(false)) { Instruction *Instr = &I; // First filter out irrelevant instructions, to ensure no recipes are @@ -8918,7 +8982,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( SmallVector<VPValue *, 4> Operands; auto *Phi = dyn_cast<PHINode>(Instr); if (Phi && Phi->getParent() == OrigLoop->getHeader()) { - Operands.push_back(Plan->getOrAddVPValue( + Operands.push_back(Plan->getVPValueOrAddLiveIn( Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); } else { auto OpRange = Plan->mapToVPValues(Instr->operands()); @@ -8932,50 +8996,36 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) continue; - if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( - Instr, Operands, Range, VPBB, Plan)) { - // If Instr can be simplified to an existing VPValue, use it. - if (RecipeOrValue.is<VPValue *>()) { - auto *VPV = RecipeOrValue.get<VPValue *>(); - Plan->addVPValue(Instr, VPV); - // If the re-used value is a recipe, register the recipe for the - // instruction, in case the recipe for Instr needs to be recorded. - if (VPRecipeBase *R = VPV->getDefiningRecipe()) - RecipeBuilder.setRecipe(Instr, R); - continue; - } - // Otherwise, add the new recipe. - VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); - for (auto *Def : Recipe->definedValues()) { - auto *UV = Def->getUnderlyingValue(); - Plan->addVPValue(UV, Def); - } - - if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && - HeaderVPBB->getFirstNonPhi() != VPBB->end()) { - // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section - // of the header block. That can happen for truncates of induction - // variables. Those recipes are moved to the phi section of the header - // block after applying SinkAfter, which relies on the original - // position of the trunc. - assert(isa<TruncInst>(Instr)); - InductionsToMove.push_back( - cast<VPWidenIntOrFpInductionRecipe>(Recipe)); - } - RecipeBuilder.setRecipe(Instr, Recipe); - VPBB->appendRecipe(Recipe); + auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( + Instr, Operands, Range, VPBB, Plan); + if (!RecipeOrValue) + RecipeOrValue = RecipeBuilder.handleReplication(Instr, Range, *Plan); + // If Instr can be simplified to an existing VPValue, use it. + if (isa<VPValue *>(RecipeOrValue)) { + auto *VPV = cast<VPValue *>(RecipeOrValue); + Plan->addVPValue(Instr, VPV); + // If the re-used value is a recipe, register the recipe for the + // instruction, in case the recipe for Instr needs to be recorded. + if (VPRecipeBase *R = VPV->getDefiningRecipe()) + RecipeBuilder.setRecipe(Instr, R); continue; } - - // Otherwise, if all widening options failed, Instruction is to be - // replicated. This may create a successor for VPBB. - VPBasicBlock *NextVPBB = - RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); - if (NextVPBB != VPBB) { - VPBB = NextVPBB; - VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) - : ""); + // Otherwise, add the new recipe. 
+ VPRecipeBase *Recipe = cast<VPRecipeBase *>(RecipeOrValue); + for (auto *Def : Recipe->definedValues()) { + auto *UV = Def->getUnderlyingValue(); + Plan->addVPValue(UV, Def); } + + RecipeBuilder.setRecipe(Instr, Recipe); + if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && + HeaderVPBB->getFirstNonPhi() != VPBB->end()) { + // Move VPWidenIntOrFpInductionRecipes for optimized truncates to the + // phi section of HeaderVPBB. + assert(isa<TruncInst>(Instr)); + Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); + } else + VPBB->appendRecipe(Recipe); } VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); @@ -8985,7 +9035,12 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // After here, VPBB should not be used. VPBB = nullptr; - addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan); + if (CM.requiresScalarEpilogue(Range)) { + // No edge from the middle block to the unique exit block has been inserted + // and there is nothing to fix from vector loop; phis should have incoming + // from scalar loop only. + } else + addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan); assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && @@ -8998,116 +9053,10 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // bring the VPlan to its final state. // --------------------------------------------------------------------------- - // Apply Sink-After legal constraints. - auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { - auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); - if (Region && Region->isReplicator()) { - assert(Region->getNumSuccessors() == 1 && - Region->getNumPredecessors() == 1 && "Expected SESE region!"); - assert(R->getParent()->size() == 1 && - "A recipe in an original replicator region must be the only " - "recipe in its block"); - return Region; - } - return nullptr; - }; - for (const auto &Entry : SinkAfter) { - VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); - VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); - - auto *TargetRegion = GetReplicateRegion(Target); - auto *SinkRegion = GetReplicateRegion(Sink); - if (!SinkRegion) { - // If the sink source is not a replicate region, sink the recipe directly. - if (TargetRegion) { - // The target is in a replication region, make sure to move Sink to - // the block after it, not into the replication region itself. - VPBasicBlock *NextBlock = - cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); - Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); - } else - Sink->moveAfter(Target); - continue; - } - - // The sink source is in a replicate region. Unhook the region from the CFG. - auto *SinkPred = SinkRegion->getSinglePredecessor(); - auto *SinkSucc = SinkRegion->getSingleSuccessor(); - VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); - VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); - VPBlockUtils::connectBlocks(SinkPred, SinkSucc); - - if (TargetRegion) { - // The target recipe is also in a replicate region, move the sink region - // after the target region. 
- auto *TargetSucc = TargetRegion->getSingleSuccessor(); - VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); - VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); - VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); - } else { - // The sink source is in a replicate region, we need to move the whole - // replicate region, which should only contain a single recipe in the - // main block. - auto *SplitBlock = - Target->getParent()->splitAt(std::next(Target->getIterator())); - - auto *SplitPred = SplitBlock->getSinglePredecessor(); - - VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); - VPBlockUtils::connectBlocks(SplitPred, SinkRegion); - VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); - } - } - - VPlanTransforms::removeRedundantCanonicalIVs(*Plan); - VPlanTransforms::removeRedundantInductionCasts(*Plan); - - // Now that sink-after is done, move induction recipes for optimized truncates - // to the phi section of the header block. - for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) - Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); - // Adjust the recipes for any inloop reductions. adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan, RecipeBuilder, Range.Start); - // Introduce a recipe to combine the incoming and previous values of a - // fixed-order recurrence. - for (VPRecipeBase &R : - Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { - auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); - if (!RecurPhi) - continue; - - VPRecipeBase *PrevRecipe = &RecurPhi->getBackedgeRecipe(); - // Fixed-order recurrences do not contain cycles, so this loop is guaranteed - // to terminate. - while (auto *PrevPhi = - dyn_cast<VPFirstOrderRecurrencePHIRecipe>(PrevRecipe)) - PrevRecipe = &PrevPhi->getBackedgeRecipe(); - VPBasicBlock *InsertBlock = PrevRecipe->getParent(); - auto *Region = GetReplicateRegion(PrevRecipe); - if (Region) - InsertBlock = dyn_cast<VPBasicBlock>(Region->getSingleSuccessor()); - if (!InsertBlock) { - InsertBlock = new VPBasicBlock(Region->getName() + ".succ"); - VPBlockUtils::insertBlockAfter(InsertBlock, Region); - } - if (Region || PrevRecipe->isPhi()) - Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); - else - Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); - - auto *RecurSplice = cast<VPInstruction>( - Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, - {RecurPhi, RecurPhi->getBackedgeValue()})); - - RecurPhi->replaceAllUsesWith(RecurSplice); - // Set the first operand of RecurSplice to RecurPhi again, after replacing - // all users. - RecurSplice->setOperand(0, RecurPhi); - } - // Interleave memory: for each Interleave Group we marked earlier as relevant // for this VPlan, replace the Recipes widening its memory instructions with a // single VPInterleaveRecipe at its insertion point. 
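As the interleave-group handling notes, scalable vectors are limited to an interleave factor of 2, because lowering uses interleave/deinterleave of two registers rather than arbitrary shuffles. Element-wise, interleaving two vectors simply alternates their lanes; a scalar model for illustration:

#include <cstdio>
#include <vector>

// Interleaving A and B produces A0 B0 A1 B1 ...
std::vector<int> interleave2(const std::vector<int> &a, const std::vector<int> &b) {
  std::vector<int> out;
  out.reserve(a.size() * 2);
  for (size_t i = 0; i < a.size(); ++i) {
    out.push_back(a[i]);
    out.push_back(b[i]);
  }
  return out;
}

int main() {
  for (int v : interleave2({0, 2, 4}, {1, 3, 5}))
    std::printf("%d ", v);    // 0 1 2 3 4 5
  std::printf("\n");
}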
@@ -9122,48 +9071,66 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( StoredValues.push_back(StoreR->getStoredValue()); } + bool NeedsMaskForGaps = + IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed(); auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, - Recipe->getMask()); + Recipe->getMask(), NeedsMaskForGaps); VPIG->insertBefore(Recipe); unsigned J = 0; for (unsigned i = 0; i < IG->getFactor(); ++i) if (Instruction *Member = IG->getMember(i)) { + VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member); if (!Member->getType()->isVoidTy()) { - VPValue *OriginalV = Plan->getVPValue(Member); - Plan->removeVPValueFor(Member); - Plan->addVPValue(Member, VPIG->getVPValue(J)); + VPValue *OriginalV = MemberR->getVPSingleValue(); OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); J++; } - RecipeBuilder.getRecipe(Member)->eraseFromParent(); + MemberR->eraseFromParent(); } } - for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); - VF *= 2) + for (ElementCount VF : Range) Plan->addVF(VF); Plan->setName("Initial VPlan"); + // Replace VPValues for known constant strides guaranteed by predicate scalar + // evolution. + for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) { + auto *StrideV = cast<SCEVUnknown>(Stride)->getValue(); + auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV)); + // Only handle constant strides for now. + if (!ScevStride) + continue; + Constant *CI = ConstantInt::get(Stride->getType(), ScevStride->getAPInt()); + + auto *ConstVPV = Plan->getVPValueOrAddLiveIn(CI); + // The versioned value may not be used in the loop directly, so just add a + // new live-in in those cases. + Plan->getVPValueOrAddLiveIn(StrideV)->replaceAllUsesWith(ConstVPV); + } + // From this point onwards, VPlan-to-VPlan transformations may change the plan // in ways that accessing values using original IR values is incorrect. Plan->disableValue2VPValue(); + // Sink users of fixed-order recurrence past the recipe defining the previous + // value and introduce FirstOrderRecurrenceSplice VPInstructions. 
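The fixed-order-recurrence adjustment referenced just below introduces a splice so each vector iteration can see the value produced one scalar iteration earlier: the last lane carried over from the previous vector iteration followed by lanes 0..VF-2 of the current one. An illustrative scalar model for a loop computing out[i] = a[i-1] + a[i]:

#include <cstdio>
#include <vector>

// Build the "previous iteration" vector from the carried last lane and the
// current vector's lower lanes.
std::vector<int> splice(int carriedLastLane, const std::vector<int> &current) {
  std::vector<int> s(current.size());
  s[0] = carriedLastLane;
  for (size_t i = 1; i < current.size(); ++i)
    s[i] = current[i - 1];
  return s;
}

int main() {
  std::vector<int> a = {10, 11, 12, 13, 14, 15, 16, 17};
  int carried = -1;                    // value of a[-1] entering the loop
  for (size_t base = 0; base < a.size(); base += 4) {
    std::vector<int> cur(a.begin() + base, a.begin() + base + 4);
    std::vector<int> prev = splice(carried, cur);
    for (size_t i = 0; i < 4; ++i)
      std::printf("%d ", prev[i] + cur[i]);   // a[i-1] + a[i]
    carried = cur.back();                     // carry the last lane forward
  }
  std::printf("\n");                          // 9 21 23 25 27 29 31 33
}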
+ if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder)) + return std::nullopt; + + VPlanTransforms::removeRedundantCanonicalIVs(*Plan); + VPlanTransforms::removeRedundantInductionCasts(*Plan); + VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE()); VPlanTransforms::removeDeadRecipes(*Plan); - bool ShouldSimplify = true; - while (ShouldSimplify) { - ShouldSimplify = VPlanTransforms::sinkScalarOperands(*Plan); - ShouldSimplify |= - VPlanTransforms::mergeReplicateRegionsIntoSuccessors(*Plan); - ShouldSimplify |= VPlanTransforms::mergeBlocksIntoPredecessors(*Plan); - } + VPlanTransforms::createAndOptimizeReplicateRegions(*Plan); VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan); VPlanTransforms::mergeBlocksIntoPredecessors(*Plan); assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); - return Plan; + return std::make_optional(std::move(Plan)); } VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { @@ -9175,21 +9142,21 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); // Create new empty VPlan - auto Plan = std::make_unique<VPlan>(); + auto Plan = VPlan::createInitialVPlan( + createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop), + *PSE.getSE()); // Build hierarchical CFG VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); HCFGBuilder.buildHierarchicalCFG(); - for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); - VF *= 2) + for (ElementCount VF : Range) Plan->addVF(VF); - SmallPtrSet<Instruction *, 1> DeadInstructions; VPlanTransforms::VPInstructionsToVPRecipes( - OrigLoop, Plan, + Plan, [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, - DeadInstructions, *PSE.getSE(), *TLI); + *PSE.getSE(), *TLI); // Remove the existing terminator of the exiting block of the top-most region. // A BranchOnCount will be added instead when adding the canonical IV recipes. 
@@ -9198,7 +9165,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { Term->eraseFromParent(); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), - true, CM.useActiveLaneMaskForControlFlow()); + CM.getTailFoldingStyle()); return Plan; } @@ -9255,7 +9222,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( VPBuilder::InsertPointGuard Guard(Builder); Builder.setInsertPoint(WidenRecipe->getParent(), WidenRecipe->getIterator()); - CondOp = RecipeBuilder.createBlockInMask(R->getParent(), Plan); + CondOp = RecipeBuilder.createBlockInMask(R->getParent(), *Plan); } if (IsFMulAdd) { @@ -9270,7 +9237,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( VecOp = FMulRecipe; } VPReductionRecipe *RedRecipe = - new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI); + new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, &TTI); WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); Plan->removeVPValueFor(R); Plan->addVPValue(R, RedRecipe); @@ -9304,13 +9271,15 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( if (!PhiR || PhiR->isInLoop()) continue; VPValue *Cond = - RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); + RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), *Plan); VPValue *Red = PhiR->getBackedgeValue(); assert(Red->getDefiningRecipe()->getParent() != LatchVPBB && "reduction recipe must be defined before latch"); Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); } } + + VPlanTransforms::clearReductionWrapFlags(*Plan); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -9475,7 +9444,7 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { PartStart, ConstantInt::get(PtrInd->getType(), Lane)); Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx); - Value *Step = State.get(getOperand(1), VPIteration(0, Part)); + Value *Step = State.get(getOperand(1), VPIteration(Part, Lane)); Value *SclrGep = emitTransformedIndex( State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc); SclrGep->setName("next.gep"); @@ -9485,8 +9454,6 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { return; } - assert(isa<SCEVConstant>(IndDesc.getStep()) && - "Induction step not a SCEV constant!"); Type *PhiType = IndDesc.getStep()->getType(); // Build a pointer phi @@ -9506,7 +9473,7 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { Value *NumUnrolledElems = State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); Value *InductionGEP = GetElementPtrInst::Create( - IndDesc.getElementType(), NewPointerPhi, + State.Builder.getInt8Ty(), NewPointerPhi, State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", InductionLoc); // Add induction update using an incorrect block temporarily. 
The phi node @@ -9529,10 +9496,10 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { StartOffset = State.Builder.CreateAdd( StartOffset, State.Builder.CreateStepVector(VecPhiType)); - assert(ScalarStepValue == State.get(getOperand(1), VPIteration(0, Part)) && + assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) && "scalar step must be the same across all parts"); Value *GEP = State.Builder.CreateGEP( - IndDesc.getElementType(), NewPointerPhi, + State.Builder.getInt8Ty(), NewPointerPhi, State.Builder.CreateMul( StartOffset, State.Builder.CreateVectorSplat(State.VF, ScalarStepValue), @@ -9584,7 +9551,8 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) { void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Interleave group being replicated."); State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), - getStoredValues(), getMask()); + getStoredValues(), getMask(), + NeedsMaskForGaps); } void VPReductionRecipe::execute(VPTransformState &State) { @@ -9640,10 +9608,9 @@ void VPReplicateRecipe::execute(VPTransformState &State) { Instruction *UI = getUnderlyingInstr(); if (State.Instance) { // Generate a single instance. assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); - State.ILV->scalarizeInstruction(UI, this, *State.Instance, - IsPredicated, State); + State.ILV->scalarizeInstruction(UI, this, *State.Instance, State); // Insert scalar instance packing it into a vector. - if (AlsoPack && State.VF.isVector()) { + if (State.VF.isVector() && shouldPack()) { // If we're constructing lane 0, initialize to start from poison. if (State.Instance->Lane.isFirstLane()) { assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); @@ -9663,8 +9630,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) { all_of(operands(), [](VPValue *Op) { return Op->isDefinedOutsideVectorRegions(); })) { - State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), IsPredicated, - State); + State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State); if (user_begin() != user_end()) { for (unsigned Part = 1; Part < State.UF; ++Part) State.set(this, State.get(this, VPIteration(0, 0)), @@ -9676,16 +9642,16 @@ void VPReplicateRecipe::execute(VPTransformState &State) { // Uniform within VL means we need to generate lane 0 only for each // unrolled copy. for (unsigned Part = 0; Part < State.UF; ++Part) - State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), - IsPredicated, State); + State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State); return; } - // A store of a loop varying value to a loop invariant address only - // needs only the last copy of the store. - if (isa<StoreInst>(UI) && !getOperand(1)->hasDefiningRecipe()) { + // A store of a loop varying value to a uniform address only needs the last + // copy of the store. 
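As the comment above says, a store of a loop-varying value to an address that is uniform after vectorization only needs the copy from the last unrolled part and last lane: every lane writes the same location, so only the final write is observable. A tiny illustrative model:

#include <cstdio>

// Scalar loop: for (i = 0..7) *p = a[i];  with p loop-invariant.
// Vectorized equivalent (VF = 4, UF = 2): a single store of a[7].
int main() {
  int a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  int uniformLocation = 0;            // the single address *p
  uniformLocation = a[8 - 1];         // last-part, last-lane copy only
  std::printf("%d\n", uniformLocation);   // 8, same as the scalar loop
}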
+ if (isa<StoreInst>(UI) && + vputils::isUniformAfterVectorization(getOperand(1))) { auto Lane = VPLane::getLastLaneForVF(State.VF); - State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane), IsPredicated, + State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane), State); return; } @@ -9695,8 +9661,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) { const unsigned EndLane = State.VF.getKnownMinValue(); for (unsigned Part = 0; Part < State.UF; ++Part) for (unsigned Lane = 0; Lane < EndLane; ++Lane) - State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), - IsPredicated, State); + State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State); } void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { @@ -9714,7 +9679,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { auto *DataTy = VectorType::get(ScalarDataTy, State.VF); const Align Alignment = getLoadStoreAlignment(&Ingredient); - bool CreateGatherScatter = !Consecutive; + bool CreateGatherScatter = !isConsecutive(); auto &Builder = State.Builder; InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); @@ -9725,36 +9690,39 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { // Calculate the pointer for the specific unroll-part. - GetElementPtrInst *PartPtr = nullptr; - + Value *PartPtr = nullptr; + + // Use i32 for the gep index type when the value is constant, + // or query DataLayout for a more suitable index type otherwise. + const DataLayout &DL = + Builder.GetInsertBlock()->getModule()->getDataLayout(); + Type *IndexTy = State.VF.isScalable() && (isReverse() || Part > 0) + ? DL.getIndexType(ScalarDataTy->getPointerTo()) + : Builder.getInt32Ty(); bool InBounds = false; if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) InBounds = gep->isInBounds(); - if (Reverse) { + if (isReverse()) { // If the address is consecutive but reversed, then the // wide store needs to start at the last vector element. // RunTimeVF = VScale * VF.getKnownMinValue() // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() - Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF); + Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF); // NumElt = -Part * RunTimeVF - Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); + Value *NumElt = + Builder.CreateMul(ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF); // LastLane = 1 - RunTimeVF - Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); + Value *LastLane = + Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF); + PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, NumElt, "", InBounds); PartPtr = - cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); - PartPtr->setIsInBounds(InBounds); - PartPtr = cast<GetElementPtrInst>( - Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); - PartPtr->setIsInBounds(InBounds); + Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane, "", InBounds); if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 
BlockInMaskParts[Part] = Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); } else { - Value *Increment = - createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part); - PartPtr = cast<GetElementPtrInst>( - Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); - PartPtr->setIsInBounds(InBounds); + Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part); + PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, Increment, "", InBounds); } unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); @@ -9774,7 +9742,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, MaskPart); } else { - if (Reverse) { + if (isReverse()) { // If we store to reverse consecutive memory locations, then we need // to reverse the order of elements in the stored value. StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); @@ -9833,7 +9801,6 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { static ScalarEpilogueLowering getScalarEpilogueLowering( Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, - AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) { // 1) OptSize takes precedence over all other options, i.e. if this is set, // don't look at hints or options, and don't request a scalar epilogue. @@ -9869,7 +9836,8 @@ static ScalarEpilogueLowering getScalarEpilogueLowering( }; // 4) if the TTI hook indicates this is profitable, request predication. - if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, &LVL, IAI)) + TailFoldingInfo TFI(TLI, &LVL, IAI); + if (TTI->preferPredicateOverEpilogue(&TFI)) return CM_ScalarEpilogueNotNeededUsePredicate; return CM_ScalarEpilogueAllowed; @@ -9880,9 +9848,29 @@ Value *VPTransformState::get(VPValue *Def, unsigned Part) { if (hasVectorValue(Def, Part)) return Data.PerPartOutput[Def][Part]; + auto GetBroadcastInstrs = [this, Def](Value *V) { + bool SafeToHoist = Def->isDefinedOutsideVectorRegions(); + if (VF.isScalar()) + return V; + // Place the code for broadcasting invariant variables in the new preheader. + IRBuilder<>::InsertPointGuard Guard(Builder); + if (SafeToHoist) { + BasicBlock *LoopVectorPreHeader = CFG.VPBB2IRBB[cast<VPBasicBlock>( + Plan->getVectorLoopRegion()->getSinglePredecessor())]; + if (LoopVectorPreHeader) + Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); + } + + // Place the code for broadcasting invariant variables in the new preheader. + // Broadcast the scalar into all locations in the vector. + Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); + + return Shuf; + }; + if (!hasScalarValue(Def, {Part, 0})) { Value *IRV = Def->getLiveInIRValue(); - Value *B = ILV->getBroadcastInstrs(IRV); + Value *B = GetBroadcastInstrs(IRV); set(Def, B, Part); return B; } @@ -9900,9 +9888,11 @@ Value *VPTransformState::get(VPValue *Def, unsigned Part) { unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; // Check if there is a scalar value for the selected lane. if (!hasScalarValue(Def, {Part, LastLane})) { - // At the moment, VPWidenIntOrFpInductionRecipes and VPScalarIVStepsRecipes can also be uniform. + // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and + // VPExpandSCEVRecipes can also be uniform. 
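The reversed-access path above computes the wide pointer for unroll part Part by stepping back Part * RunTimeVF elements and then a further RunTimeVF - 1, so the load or store covers the lowest-addressed lane of that part; the data (and any mask) is then reversed separately. A small model of the offset arithmetic, using a fixed-width VF so RunTimeVF equals VF:

#include <cstdio>

// Element offset added to the current pointer for a reversed access.
long reversePartOffset(long part, long runtimeVF) {
  long numElt = -part * runtimeVF;   // step back whole parts
  long lastLane = 1 - runtimeVF;     // back up to the lowest-addressed element
  return numElt + lastLane;
}

int main() {
  // VF = 4, two unroll parts.
  std::printf("%ld %ld\n", reversePartOffset(0, 4), reversePartOffset(1, 4));
  // -3 -7: part 0 covers p[-3..0], part 1 covers p[-7..-4].
}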
     assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDefiningRecipe()) ||
-            isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe())) &&
+            isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe()) ||
+            isa<VPExpandSCEVRecipe>(Def->getDefiningRecipe())) &&
            "unexpected recipe found to be invariant");
     IsUniform = true;
     LastLane = 0;
@@ -9927,7 +9917,7 @@ Value *VPTransformState::get(VPValue *Def, unsigned Part) {
   // State, we will only generate the insertelements once.
   Value *VectorValue = nullptr;
   if (IsUniform) {
-    VectorValue = ILV->getBroadcastInstrs(ScalarValue);
+    VectorValue = GetBroadcastInstrs(ScalarValue);
     set(Def, VectorValue, Part);
   } else {
     // Initialize packing with insertelements to start from undef.
@@ -9962,15 +9952,15 @@ static bool processLoopInVPlanNativePath(
   Function *F = L->getHeader()->getParent();
   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

-  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
-      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL, &IAI);
+  ScalarEpilogueLowering SEL =
+      getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);

   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE,
                                 F, &Hints, IAI);
   // Use the planner for outer loop vectorization.
   // TODO: CM is not used at this point inside the planner. Turn CM into an
   // optional argument if we don't need it in the future.
-  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE);
+  LoopVectorizationPlanner LVP(L, LI, TLI, *TTI, LVL, CM, IAI, PSE, Hints, ORE);

   // Get user vectorization factor.
   ElementCount UserVF = Hints.getWidth();
@@ -10231,8 +10221,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {

   // Check the function attributes and profiles to find out if this function
   // should be optimized for size.
-  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
-      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL, &IAI);
+  ScalarEpilogueLowering SEL =
+      getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);

   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
   // count by optimizing for size, to minimize overheads.
@@ -10309,11 +10299,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   // Use the cost model.
   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                 F, &Hints, IAI);
-  CM.collectValuesToIgnore();
-  CM.collectElementTypesForWidening();
-
   // Use the planner for vectorization.
-  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE);
+  LoopVectorizationPlanner LVP(L, LI, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
+                               ORE);

   // Get user vectorization factor and interleave count.
   ElementCount UserVF = Hints.getWidth();
@@ -10342,7 +10330,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     bool ForceVectorization =
         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
     if (!ForceVectorization &&
-        !areRuntimeChecksProfitable(Checks, VF, CM.getVScaleForTuning(), L,
+        !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
                                     *PSE.getSE())) {
       ORE->emit([&]() {
         return OptimizationRemarkAnalysisAliasing(
@@ -10464,7 +10452,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {

     // Consider vectorizing the epilogue too if it's profitable.
     VectorizationFactor EpilogueVF =
-        CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
+        LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
     if (EpilogueVF.Width.isVector()) {

       // The first pass vectorizes the main loop and creates a scalar epilogue
@@ -10475,8 +10463,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
                                          EPI, &LVL, &CM, BFI, PSI, Checks);

       VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
-      LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
-                      DT, true);
+      auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
+                                           BestMainPlan, MainILV, DT, true);
       ++LoopsVectorized;

       // Second pass vectorizes the epilogue and adjusts the control flow
@@ -10492,6 +10480,21 @@ bool LoopVectorizePass::processLoop(Loop *L) {
       VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
       Header->setName("vec.epilog.vector.body");

+      // Re-use the trip count and steps expanded for the main loop, as
+      // skeleton creation needs it as a value that dominates both the scalar
+      // and vector epilogue loops
+      // TODO: This is a workaround needed for epilogue vectorization and it
+      // should be removed once induction resume value creation is done
+      // directly in VPlan.
+      EpilogILV.setTripCount(MainILV.getTripCount());
+      for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) {
+        auto *ExpandR = cast<VPExpandSCEVRecipe>(&R);
+        auto *ExpandedVal = BestEpiPlan.getVPValueOrAddLiveIn(
+            ExpandedSCEVs.find(ExpandR->getSCEV())->second);
+        ExpandR->replaceAllUsesWith(ExpandedVal);
+        ExpandR->eraseFromParent();
+      }
+
       // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
       // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
       // before vectorizing the epilogue loop.
@@ -10520,15 +10523,16 @@ bool LoopVectorizePass::processLoop(Loop *L) {
           }
           ResumeV = MainILV.createInductionResumeValue(
-              IndPhi, *ID, {EPI.MainLoopIterationCountCheck});
+              IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
+              {EPI.MainLoopIterationCountCheck});
         }
         assert(ResumeV && "Must have a resume value");
-        VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(ResumeV);
+        VPValue *StartVal = BestEpiPlan.getVPValueOrAddLiveIn(ResumeV);
         cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
       }

       LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
-                      DT, true);
+                      DT, true, &ExpandedSCEVs);
       ++LoopsEpilogueVectorized;

       if (!MainILV.areSafetyChecksAdded())
@@ -10581,14 +10585,14 @@ bool LoopVectorizePass::processLoop(Loop *L) {

 LoopVectorizeResult LoopVectorizePass::runImpl(
     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
-    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
+    DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_,
     DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_,
     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
   SE = &SE_;
   LI = &LI_;
   TTI = &TTI_;
   DT = &DT_;
-  BFI = &BFI_;
+  BFI = BFI_;
   TLI = TLI_;
   AC = &AC_;
   LAIs = &LAIs_;
@@ -10604,7 +10608,7 @@ LoopVectorizeResult LoopVectorizePass::runImpl(
   // vector registers, loop vectorization may still enable scalar
   // interleaving.
   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
-      TTI->getMaxInterleaveFactor(1) < 2)
+      TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
     return LoopVectorizeResult(false, false);

   bool Changed = false, CFGChanged = false;
@@ -10656,7 +10660,6 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
     auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
     auto &TTI = AM.getResult<TargetIRAnalysis>(F);
     auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
-    auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
    auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
    auto &AC = AM.getResult<AssumptionAnalysis>(F);
    auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
@@ -10666,12 +10669,20 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
    auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
    ProfileSummaryInfo *PSI =
        MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+    BlockFrequencyInfo *BFI = nullptr;
+    if (PSI && PSI->hasProfileSummary())
+      BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
    LoopVectorizeResult Result =
        runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
    if (!Result.MadeAnyChange)
      return PreservedAnalyses::all();
    PreservedAnalyses PA;

+    if (isAssignmentTrackingEnabled(*F.getParent())) {
+      for (auto &BB : F)
+        RemoveRedundantDbgInstrs(&BB);
+    }
+
    // We currently do not preserve loopinfo/dominator analyses with outer loop
    // vectorization. Until this is addressed, mark these analyses as preserved
    // only for non-VPlan-native path.
@@ -10679,6 +10690,11 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
    if (!EnableVPlanNativePath) {
      PA.preserve<LoopAnalysis>();
      PA.preserve<DominatorTreeAnalysis>();
+      PA.preserve<ScalarEvolutionAnalysis>();
+
+#ifdef EXPENSIVE_CHECKS
+      SE.verify();
+#endif
    }

    if (Result.MadeCFGChange) {
@@ -10699,8 +10715,8 @@ void LoopVectorizePass::printPipeline(
  static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
      OS, MapClassName2PassName);

-  OS << "<";
+  OS << '<';
  OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
  OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
-  OS << ">";
+  OS << '>';
 }
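Editorial note (not part of the commit): the CreateVecPtr hunks above switch the reverse-access pointer arithmetic from a hard-coded i32 to an IndexTy chosen via DataLayout for scalable vectors, but the offset math itself is unchanged: NumElt = -Part * RunTimeVF and LastLane = 1 - RunTimeVF, where RunTimeVF = VScale * VF.getKnownMinValue(). The standalone C++ sketch below only illustrates that arithmetic with made-up VFKnownMin and VScale values; it uses no LLVM APIs and is not the code generated by the pass.

// Editorial sketch: the per-part offsets used for reverse consecutive accesses.
#include <cstdint>
#include <iostream>

int main() {
  const std::int64_t VFKnownMin = 4; // stand-in for VF.getKnownMinValue()
  const std::int64_t VScale = 2;     // stand-in for runtime vscale (1 if fixed-width)
  const std::int64_t RunTimeVF = VScale * VFKnownMin;

  for (std::int64_t Part = 0; Part < 2; ++Part) {
    // NumElt = -Part * RunTimeVF: step back over the parts already emitted.
    const std::int64_t NumElt = -Part * RunTimeVF;
    // LastLane = 1 - RunTimeVF: rewind so the wide load/store begins at the
    // last vector element of the reversed group.
    const std::int64_t LastLane = 1 - RunTimeVF;
    std::cout << "Part " << Part << ": NumElt = " << NumElt
              << ", LastLane = " << LastLane << '\n';
  }
  return 0;
}

For a fixed-width VF of 4 (VScale = 1), Part 1 would use NumElt = -4 and LastLane = -3, which is why the GEP pair places the widened access so that it starts at the last element of that part.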