author     Dimitry Andric <dim@FreeBSD.org>   2023-02-11 12:38:04 +0000
committer  Dimitry Andric <dim@FreeBSD.org>   2023-02-11 12:38:11 +0000
commit     e3b557809604d036af6e00c60f012c2025b59a5e (patch)
tree       8a11ba2269a3b669601e2fd41145b174008f4da8 /llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
parent     08e8dd7b9db7bb4a9de26d44c1cbfd24e869c014 (diff)
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
 -rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 1683
 1 file changed, 916 insertions(+), 767 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 238b074089aa..a28099d8ba7d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -65,8 +65,6 @@
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
@@ -142,6 +140,7 @@
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
+#include <cmath>
#include <cstdint>
#include <functional>
#include <iterator>
@@ -362,10 +361,15 @@ cl::opt<bool> llvm::EnableLoopVectorization(
"vectorize-loops", cl::init(true), cl::Hidden,
cl::desc("Run the Loop vectorization passes"));
-cl::opt<bool> PrintVPlansInDotFormat(
- "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
+static cl::opt<bool> PrintVPlansInDotFormat(
+ "vplan-print-in-dot-format", cl::Hidden,
cl::desc("Use dot format instead of plain text when dumping VPlans"));
+static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
+ "force-widen-divrem-via-safe-divisor", cl::Hidden,
+ cl::desc(
+ "Override cost based safe divisor widening for div/rem instructions"));
+
/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
@@ -396,8 +400,9 @@ static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
/// 1) Returns exact trip count if it is known.
/// 2) Returns expected trip count according to profile data if any.
/// 3) Returns upper bound estimate if it is known.
-/// 4) Returns None if all of the above failed.
-static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
+/// 4) Returns std::nullopt if all of the above failed.
+static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
+ Loop *L) {
// Check if exact trip count is known.
if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
return ExpectedTC;
@@ -405,17 +410,19 @@ static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
// Check if there is an expected trip count available from profile data.
if (LoopVectorizeWithBlockFrequency)
if (auto EstimatedTC = getLoopEstimatedTripCount(L))
- return EstimatedTC;
+ return *EstimatedTC;
// Check if upper bound estimate is known.
if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
return ExpectedTC;
- return None;
+ return std::nullopt;
}
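With the return type now std::optional<unsigned>, callers test the result directly and dereference it. A minimal stand-alone sketch of that calling pattern (bestKnownTripCount is an illustrative stand-in, not the LLVM function):

#include <cstdio>
#include <optional>

// Illustrative stand-in for the patched getSmallBestKnownTC(): return the
// exact count if known, otherwise an upper-bound estimate, otherwise nothing.
static std::optional<unsigned> bestKnownTripCount(unsigned Exact, unsigned MaxBound) {
  if (Exact)
    return Exact;
  if (MaxBound)
    return MaxBound;
  return std::nullopt;
}

int main() {
  if (std::optional<unsigned> TC = bestKnownTripCount(0, 16))
    std::printf("best known trip count: %u\n", *TC); // prints 16
}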
+namespace {
// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;
+} // namespace
namespace llvm {
@@ -473,10 +480,6 @@ public:
/// complex control flow around the loops.
virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();
- /// Widen a single call instruction within the innermost loop.
- void widenCallInstruction(CallInst &CI, VPValue *Def, VPUser &ArgOperands,
- VPTransformState &State);
-
/// Fix the vectorized code, taking care of header phi's, live-outs, and more.
void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
@@ -493,7 +496,8 @@ public:
/// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
/// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
/// Instr's operands.
- void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
+ void scalarizeInstruction(const Instruction *Instr,
+ VPReplicateRecipe *RepRecipe,
const VPIteration &Instance, bool IfPredicateInstr,
VPTransformState &State);
@@ -529,6 +533,17 @@ public:
// generated by fixReduction.
PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
+ /// Create a new phi node for the induction variable \p OrigPhi to resume
+ /// iteration count in the scalar epilogue, from where the vectorized loop
+ /// left off. In cases where the loop skeleton is more complicated (eg.
+ /// epilogue vectorization) and the resume values can come from an additional
+ /// bypass block, the \p AdditionalBypass pair provides information about the
+ /// bypass block and the end value on the edge from bypass to this loop.
+ PHINode *createInductionResumeValue(
+ PHINode *OrigPhi, const InductionDescriptor &ID,
+ ArrayRef<BasicBlock *> BypassBlocks,
+ std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
+
protected:
friend class LoopVectorizationPlanner;
@@ -552,7 +567,7 @@ protected:
/// Create the exit value of first order recurrences in the middle block and
/// update their users.
- void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
+ void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
VPTransformState &State);
/// Create code for the loop exit value of the reduction.
@@ -611,7 +626,7 @@ protected:
/// Complete the loop skeleton by adding debug MDs, creating appropriate
/// conditional branches in the middle block, preparing the builder and
/// running the verifier. Return the preheader of the completed vector loop.
- BasicBlock *completeLoopSkeleton(MDNode *OrigLoopID);
+ BasicBlock *completeLoopSkeleton();
/// Collect poison-generating recipes that may generate a poison value that is
/// used after vectorization, even when their operands are not poison. Those
@@ -643,9 +658,6 @@ protected:
/// Dominator Tree.
DominatorTree *DT;
- /// Alias Analysis.
- AAResults *AA;
-
/// Target Library Info.
const TargetLibraryInfo *TLI;
@@ -951,6 +963,27 @@ Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
return VF.isScalable() ? B.CreateVScale(EC) : EC;
}
+const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE) {
+ const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
+ assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
+
+ ScalarEvolution &SE = *PSE.getSE();
+
+ // The exit count might have the type of i64 while the phi is i32. This can
+ // happen if we have an induction variable that is sign extended before the
+ // compare. The only way that we get a backedge taken count is that the
+ // induction variable was signed and as such will not overflow. In such a case
+ // truncation is legal.
+ if (SE.getTypeSizeInBits(BackedgeTakenCount->getType()) >
+ IdxTy->getPrimitiveSizeInBits())
+ BackedgeTakenCount = SE.getTruncateOrNoop(BackedgeTakenCount, IdxTy);
+ BackedgeTakenCount = SE.getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
+
+ // Get the total trip count from the count by adding 1.
+ return SE.getAddExpr(BackedgeTakenCount,
+ SE.getOne(BackedgeTakenCount->getType()));
+}
+
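The arithmetic hoisted into createTripCountSCEV is small; a self-contained sketch of the same computation on plain integers (the truncation is legal for the reason given in the comment above: a signed induction cannot have overflowed):

#include <cstdint>
#include <cstdio>

// Backedge-taken count -> trip count, mirroring createTripCountSCEV(): narrow
// (or widen) to the induction type, then add one.
static uint32_t tripCountFromBackedgeTaken(uint64_t BackedgeTakenCount) {
  uint32_t Narrowed = static_cast<uint32_t>(BackedgeTakenCount); // getTruncateOrNoop
  return Narrowed + 1;                                           // getAddExpr(BTC, 1)
}

int main() {
  std::printf("%u\n", tripCountFromBackedgeTaken(99)); // a 100-iteration loop
}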
static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
ElementCount VF) {
assert(FTy->isFloatingPointTy() && "Expected floating point type!");
@@ -1037,27 +1070,25 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
// Add new definitions to the worklist.
for (VPValue *operand : CurRec->operands())
- if (VPDef *OpDef = operand->getDef())
- Worklist.push_back(cast<VPRecipeBase>(OpDef));
+ if (VPRecipeBase *OpDef = operand->getDefiningRecipe())
+ Worklist.push_back(OpDef);
}
});
// Traverse all the recipes in the VPlan and collect the poison-generating
// recipes in the backward slice starting at the address of a VPWidenRecipe or
// VPInterleaveRecipe.
- auto Iter = depth_first(
- VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
+ auto Iter = vp_depth_first_deep(State.Plan->getEntry());
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
for (VPRecipeBase &Recipe : *VPBB) {
if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
Instruction &UnderlyingInstr = WidenRec->getIngredient();
- VPDef *AddrDef = WidenRec->getAddr()->getDef();
+ VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
if (AddrDef && WidenRec->isConsecutive() &&
Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
- collectPoisonGeneratingInstrsInBackwardSlice(
- cast<VPRecipeBase>(AddrDef));
+ collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
} else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
- VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
+ VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
if (AddrDef) {
// Check if any member of the interleave group needs predication.
const InterleaveGroup<Instruction> *InterGroup =
@@ -1072,8 +1103,7 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
}
if (NeedPredication)
- collectPoisonGeneratingInstrsInBackwardSlice(
- cast<VPRecipeBase>(AddrDef));
+ collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
}
}
}
@@ -1182,7 +1212,7 @@ public:
/// If interleave count has been specified by metadata it will be returned.
/// Otherwise, the interleave count is computed and returned. VF and LoopCost
/// are the selected vectorization factor and the cost of the selected VF.
- unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
+ unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
/// Memory access instruction may be vectorized in more than one way.
/// Form of instruction after vectorization depends on cost.
@@ -1435,47 +1465,49 @@ public:
}));
}
- /// Returns true if \p I is an instruction that will be scalarized with
- /// predication when vectorizing \p I with vectorization factor \p VF. Such
- /// instructions include conditional stores and instructions that may divide
- /// by zero.
- bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
-
- // Returns true if \p I is an instruction that will be predicated either
- // through scalar predication or masked load/store or masked gather/scatter.
- // \p VF is the vectorization factor that will be used to vectorize \p I.
- // Superset of instructions that return true for isScalarWithPredication.
- bool isPredicatedInst(Instruction *I, ElementCount VF) {
- // When we know the load's address is loop invariant and the instruction
- // in the original scalar loop was unconditionally executed then we
- // don't need to mark it as a predicated instruction. Tail folding may
- // introduce additional predication, but we're guaranteed to always have
- // at least one active lane. We call Legal->blockNeedsPredication here
- // because it doesn't query tail-folding.
- if (Legal->isUniformMemOp(*I) && isa<LoadInst>(I) &&
- !Legal->blockNeedsPredication(I->getParent()))
+ /// Given costs for both strategies, return true if the scalar predication
+ /// lowering should be used for div/rem. This incorporates an override
+ /// option so it is not simply a cost comparison.
+ bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
+ InstructionCost SafeDivisorCost) const {
+ switch (ForceSafeDivisor) {
+ case cl::BOU_UNSET:
+ return ScalarCost < SafeDivisorCost;
+ case cl::BOU_TRUE:
return false;
- if (!blockNeedsPredicationForAnyReason(I->getParent()))
- return false;
- // Loads and stores that need some form of masked operation are predicated
- // instructions.
- if (isa<LoadInst>(I) || isa<StoreInst>(I))
- return Legal->isMaskRequired(I);
- return isScalarWithPredication(I, VF);
+ case cl::BOU_FALSE:
+ return true;
+ };
+ llvm_unreachable("impossible case value");
}
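The override semantics of the new flag are easy to miss in the switch: leaving the option unset keeps the pure cost comparison, forcing it true always picks the safe-divisor lowering, and forcing it false always picks scalar predication. A self-contained illustration (the enum mirrors llvm::cl::boolOrDefault; the costs are arbitrary):

#include <cstdio>

enum BoolOrDefault { BOU_UNSET, BOU_TRUE, BOU_FALSE }; // mirrors llvm::cl::boolOrDefault

// Same decision as isDivRemScalarWithPredication(): true means "scalarize with
// predication", false means "use the safe-divisor idiom".
static bool preferScalarPredication(int ScalarCost, int SafeDivisorCost,
                                    BoolOrDefault ForceSafeDivisor) {
  switch (ForceSafeDivisor) {
  case BOU_UNSET:
    return ScalarCost < SafeDivisorCost; // no override: cheapest strategy wins
  case BOU_TRUE:
    return false;                        // forced safe-divisor
  case BOU_FALSE:
    return true;                         // forced scalarization
  }
  return false;
}

int main() {
  std::printf("%d %d %d\n",
              preferScalarPredication(4, 6, BOU_UNSET),  // 1: scalar is cheaper
              preferScalarPredication(4, 6, BOU_TRUE),   // 0
              preferScalarPredication(4, 6, BOU_FALSE)); // 1
}

On the command line the override is the -force-widen-divrem-via-safe-divisor option added above; setting it true forces the safe-divisor strategy and setting it false forces scalarization.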
+ /// Returns true if \p I is an instruction which requires predication and
+ /// for which our chosen predication strategy is scalarization (i.e. we
+ /// don't have an alternate strategy such as masking available).
+ /// \p VF is the vectorization factor that will be used to vectorize \p I.
+ bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
+
+ /// Returns true if \p I is an instruction that needs to be predicated
+ /// at runtime. The result is independent of the predication mechanism.
+ /// Superset of instructions that return true for isScalarWithPredication.
+ bool isPredicatedInst(Instruction *I) const;
+
+ /// Return the costs for our two available strategies for lowering a
+ /// div/rem operation which requires speculating at least one lane.
+ /// First result is for scalarization (will be invalid for scalable
+ /// vectors); second is for the safe-divisor strategy.
+ std::pair<InstructionCost, InstructionCost>
+ getDivRemSpeculationCost(Instruction *I,
+ ElementCount VF) const;
+
/// Returns true if \p I is a memory instruction with consecutive memory
/// access that can be widened.
- bool
- memoryInstructionCanBeWidened(Instruction *I,
- ElementCount VF = ElementCount::getFixed(1));
+ bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
/// Returns true if \p I is a memory instruction in an interleaved-group
/// of memory accesses that can be vectorized with wide vector loads/stores
/// and shuffles.
- bool
- interleavedAccessCanBeWidened(Instruction *I,
- ElementCount VF = ElementCount::getFixed(1));
+ bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF);
/// Check if \p Instr belongs to any interleaved access group.
bool isAccessInterleaved(Instruction *Instr) {
@@ -1567,7 +1599,7 @@ public:
/// Convenience function that returns the value of vscale_range iff
/// vscale_range.min == vscale_range.max or otherwise returns the value
/// returned by the corresponding TLI method.
- Optional<unsigned> getVScaleForTuning() const;
+ std::optional<unsigned> getVScaleForTuning() const;
private:
unsigned NumPredStores = 0;
@@ -1623,7 +1655,7 @@ private:
/// Return the cost of instructions in an inloop reduction pattern, if I is
/// part of that pattern.
- Optional<InstructionCost>
+ std::optional<InstructionCost>
getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
TTI::TargetCostKind CostKind);
@@ -1651,8 +1683,8 @@ private:
/// Estimate the overhead of scalarizing an instruction. This is a
/// convenience wrapper for the type-based getScalarizationOverhead API.
- InstructionCost getScalarizationOverhead(Instruction *I,
- ElementCount VF) const;
+ InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
+ TTI::TargetCostKind CostKind) const;
/// Returns true if an artificially high cost for emulated masked memrefs
/// should be used.
@@ -1719,8 +1751,9 @@ private:
/// scalarize and their scalar costs are collected in \p ScalarCosts. A
/// non-negative return value implies the expression will be scalarized.
/// Currently, only single-use chains are considered for scalarization.
- int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
- ElementCount VF);
+ InstructionCost computePredInstDiscount(Instruction *PredInst,
+ ScalarCostsTy &ScalarCosts,
+ ElementCount VF);
/// Collect the instructions that are uniform after vectorization. An
/// instruction is uniform if we represent it with a single scalar value in
@@ -1835,6 +1868,7 @@ public:
};
} // end namespace llvm
+namespace {
/// Helper struct to manage generating runtime checks for vectorization.
///
/// The runtime checks are created up-front in temporary blocks to allow better
@@ -1914,7 +1948,7 @@ public:
if (DiffChecks) {
Value *RuntimeVF = nullptr;
MemRuntimeCheckCond = addDiffRuntimeChecks(
- MemCheckBlock->getTerminator(), L, *DiffChecks, MemCheckExp,
+ MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
[VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
if (!RuntimeVF)
RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
@@ -2099,6 +2133,7 @@ public:
return MemCheckBlock;
}
};
+} // namespace
// Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
@@ -2194,18 +2229,15 @@ struct LoopVectorize : public FunctionPass {
auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
- auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
+ auto &LAIs = getAnalysis<LoopAccessLegacyAnalysis>().getLAIs();
auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
- std::function<const LoopAccessInfo &(Loop &)> GetLAA =
- [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
-
- return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
- GetLAA, *ORE, PSI).MadeAnyChange;
+ return Impl
+ .runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AC, LAIs, *ORE, PSI)
+ .MadeAnyChange;
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -2215,7 +2247,6 @@ struct LoopVectorize : public FunctionPass {
AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<LoopAccessLegacyAnalysis>();
AU.addRequired<DemandedBitsWrapperPass>();
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
@@ -2321,12 +2352,16 @@ static void buildScalarSteps(Value *ScalarIV, Value *Step,
const InductionDescriptor &ID, VPValue *Def,
VPTransformState &State) {
IRBuilderBase &Builder = State.Builder;
- // We shouldn't have to build scalar steps if we aren't vectorizing.
- assert(State.VF.isVector() && "VF should be greater than one");
- // Get the value type and ensure it and the step have the same integer type.
+
+ // Ensure step has the same type as that of scalar IV.
Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
- assert(ScalarIVTy == Step->getType() &&
- "Val and Step should have the same type");
+ if (ScalarIVTy != Step->getType()) {
+ // TODO: Also use VPDerivedIVRecipe when only the step needs truncating, to
+ // avoid separate truncate here.
+ assert(Step->getType()->isIntegerTy() &&
+ "Truncation requires an integer step");
+ Step = State.Builder.CreateTrunc(Step, ScalarIVTy);
+ }
// We build scalar steps for both integer and floating-point induction
// variables. Here, we determine the kind of arithmetic we will perform.
@@ -2343,7 +2378,6 @@ static void buildScalarSteps(Value *ScalarIV, Value *Step,
// Determine the number of scalars we need to generate for each unroll
// iteration.
bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def);
- unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
// Compute the scalar steps and save the results in State.
Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
ScalarIVTy->getScalarSizeInBits());
@@ -2357,7 +2391,17 @@ static void buildScalarSteps(Value *ScalarIV, Value *Step,
SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
}
- for (unsigned Part = 0; Part < State.UF; ++Part) {
+ unsigned StartPart = 0;
+ unsigned EndPart = State.UF;
+ unsigned StartLane = 0;
+ unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
+ if (State.Instance) {
+ StartPart = State.Instance->Part;
+ EndPart = StartPart + 1;
+ StartLane = State.Instance->Lane.getKnownLane();
+ EndLane = StartLane + 1;
+ }
+ for (unsigned Part = StartPart; Part < EndPart; ++Part) {
Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
if (!FirstLaneOnly && State.VF.isScalable()) {
@@ -2376,7 +2420,7 @@ static void buildScalarSteps(Value *ScalarIV, Value *Step,
if (ScalarIVTy->isFloatingPointTy())
StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
- for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
+ for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
Value *StartIdx = Builder.CreateBinOp(
AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
// The step returned by `createStepForVF` is a runtime-evaluated value
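The new StartPart/EndPart/StartLane/EndLane bounds let the same loop emit either every (part, lane) step or, when State.Instance is set, exactly one of them. A stand-alone sketch of that selection (emitScalarSteps and its printout are illustrative only):

#include <cstdio>
#include <optional>
#include <utility>

// When an Instance (Part, Lane) is provided, only that single scalar step is
// produced; otherwise all UF parts and all lanes are covered, as before.
static void emitScalarSteps(unsigned UF, unsigned Lanes,
                            std::optional<std::pair<unsigned, unsigned>> Instance) {
  unsigned StartPart = 0, EndPart = UF;
  unsigned StartLane = 0, EndLane = Lanes;
  if (Instance) {
    StartPart = Instance->first;
    EndPart = StartPart + 1;
    StartLane = Instance->second;
    EndLane = StartLane + 1;
  }
  for (unsigned Part = StartPart; Part < EndPart; ++Part)
    for (unsigned Lane = StartLane; Lane < EndLane; ++Lane)
      std::printf("scalar step for part %u, lane %u\n", Part, Lane);
}

int main() {
  emitScalarSteps(2, 4, std::make_pair(1u, 3u)); // prints only part 1, lane 3
}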
@@ -2415,8 +2459,14 @@ static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE,
static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
Value *StartValue, Value *Step,
const InductionDescriptor &ID) {
- assert(Index->getType()->getScalarType() == Step->getType() &&
- "Index scalar type does not match StepValue type");
+ Type *StepTy = Step->getType();
+ Value *CastedIndex = StepTy->isIntegerTy()
+ ? B.CreateSExtOrTrunc(Index, StepTy)
+ : B.CreateCast(Instruction::SIToFP, Index, StepTy);
+ if (CastedIndex != Index) {
+ CastedIndex->setName(CastedIndex->getName() + ".cast");
+ Index = CastedIndex;
+ }
// Note: the IR at this point is broken. We cannot use SE to create any new
// SCEV and then expand it, hoping that SCEV's simplification will give us
@@ -2682,6 +2732,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
for (unsigned Part = 0; Part < UF; Part++) {
// Collect the stored vector from each member.
SmallVector<Value *, 4> StoredVecs;
+ unsigned StoredIdx = 0;
for (unsigned i = 0; i < InterleaveFactor; i++) {
assert((Group->getMember(i) || MaskForGaps) &&
"Fail to get a member from an interleaved store group");
@@ -2694,7 +2745,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
continue;
}
- Value *StoredVec = State.get(StoredValues[i], Part);
+ Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
+ ++StoredIdx;
if (Group->isReverse())
StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
@@ -2738,7 +2790,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
}
}
-void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
+void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
VPReplicateRecipe *RepRecipe,
const VPIteration &Instance,
bool IfPredicateInstr,
@@ -2772,11 +2824,10 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
// Replace the operands of the cloned instructions with their scalar
// equivalents in the new loop.
- for (auto &I : enumerate(RepRecipe->operands())) {
+ for (const auto &I : enumerate(RepRecipe->operands())) {
auto InputInstance = Instance;
VPValue *Operand = I.value();
- VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand);
- if (OperandR && OperandR->isUniform())
+ if (vputils::isUniformAfterVectorization(Operand))
InputInstance.Lane = VPLane::getFirstLane();
Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
}
@@ -2803,33 +2854,15 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) {
assert(InsertBlock);
IRBuilder<> Builder(InsertBlock->getTerminator());
// Find the loop boundaries.
- ScalarEvolution *SE = PSE.getSE();
- const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
- assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
- "Invalid loop count");
-
Type *IdxTy = Legal->getWidestInductionType();
assert(IdxTy && "No type for induction");
-
- // The exit count might have the type of i64 while the phi is i32. This can
- // happen if we have an induction variable that is sign extended before the
- // compare. The only way that we get a backedge taken count is that the
- // induction variable was signed and as such will not overflow. In such a case
- // truncation is legal.
- if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
- IdxTy->getPrimitiveSizeInBits())
- BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
- BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
-
- // Get the total trip count from the count by adding 1.
- const SCEV *ExitCount = SE->getAddExpr(
- BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
+ const SCEV *ExitCount = createTripCountSCEV(IdxTy, PSE);
const DataLayout &DL = InsertBlock->getModule()->getDataLayout();
// Expand the trip count and place the new instructions in the preheader.
// Notice that the pre-header does not change, only the loop body.
- SCEVExpander Exp(*SE, DL, "induction");
+ SCEVExpander Exp(*PSE.getSE(), DL, "induction");
// Count holds the overall loop count (N).
TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
@@ -3080,7 +3113,7 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
// 1) If we know that we must execute the scalar epilogue, emit an
// unconditional branch.
// 2) Otherwise, we must have a single unique exit block (due to how we
- // implement the multiple exit case). In this case, set up a conditonal
+ // implement the multiple exit case). In this case, set up a conditional
// branch from the middle block to the loop scalar preheader, and the
// exit block. completeLoopSkeleton will update the condition to use an
// iteration check, if required to decide whether to execute the remainder.
@@ -3101,88 +3134,87 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
}
-void InnerLoopVectorizer::createInductionResumeValues(
+PHINode *InnerLoopVectorizer::createInductionResumeValue(
+ PHINode *OrigPhi, const InductionDescriptor &II,
+ ArrayRef<BasicBlock *> BypassBlocks,
std::pair<BasicBlock *, Value *> AdditionalBypass) {
- assert(((AdditionalBypass.first && AdditionalBypass.second) ||
- (!AdditionalBypass.first && !AdditionalBypass.second)) &&
- "Inconsistent information about additional bypass.");
-
Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
assert(VectorTripCount && "Expected valid arguments");
- // We are going to resume the execution of the scalar loop.
- // Go over all of the induction variables that we found and fix the
- // PHIs that are left in the scalar version of the loop.
- // The starting values of PHI nodes depend on the counter of the last
- // iteration in the vectorized loop.
- // If we come from a bypass edge then we need to start from the original
- // start value.
+
Instruction *OldInduction = Legal->getPrimaryInduction();
- for (auto &InductionEntry : Legal->getInductionVars()) {
- PHINode *OrigPhi = InductionEntry.first;
- InductionDescriptor II = InductionEntry.second;
+ Value *&EndValue = IVEndValues[OrigPhi];
+ Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
+ if (OrigPhi == OldInduction) {
+ // We know what the end value is.
+ EndValue = VectorTripCount;
+ } else {
+ IRBuilder<> B(LoopVectorPreHeader->getTerminator());
- Value *&EndValue = IVEndValues[OrigPhi];
- Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
- if (OrigPhi == OldInduction) {
- // We know what the end value is.
- EndValue = VectorTripCount;
- } else {
- IRBuilder<> B(LoopVectorPreHeader->getTerminator());
+ // Fast-math-flags propagate from the original induction instruction.
+ if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
+ B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
- // Fast-math-flags propagate from the original induction instruction.
- if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
- B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
+ Value *Step =
+ CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
+ EndValue =
+ emitTransformedIndex(B, VectorTripCount, II.getStartValue(), Step, II);
+ EndValue->setName("ind.end");
- Type *StepType = II.getStep()->getType();
- Instruction::CastOps CastOp =
- CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
- Value *VTC = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.vtc");
+ // Compute the end value for the additional bypass (if applicable).
+ if (AdditionalBypass.first) {
+ B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
Value *Step =
CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
- EndValue = emitTransformedIndex(B, VTC, II.getStartValue(), Step, II);
- EndValue->setName("ind.end");
-
- // Compute the end value for the additional bypass (if applicable).
- if (AdditionalBypass.first) {
- B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
- CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
- StepType, true);
- Value *Step =
- CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
- VTC =
- B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.vtc");
- EndValueFromAdditionalBypass =
- emitTransformedIndex(B, VTC, II.getStartValue(), Step, II);
- EndValueFromAdditionalBypass->setName("ind.end");
- }
+ EndValueFromAdditionalBypass = emitTransformedIndex(
+ B, AdditionalBypass.second, II.getStartValue(), Step, II);
+ EndValueFromAdditionalBypass->setName("ind.end");
}
+ }
- // Create phi nodes to merge from the backedge-taken check block.
- PHINode *BCResumeVal =
- PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
- LoopScalarPreHeader->getTerminator());
- // Copy original phi DL over to the new one.
- BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
+ // Create phi nodes to merge from the backedge-taken check block.
+ PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
+ LoopScalarPreHeader->getTerminator());
+ // Copy original phi DL over to the new one.
+ BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
- // The new PHI merges the original incoming value, in case of a bypass,
- // or the value at the end of the vectorized loop.
- BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
+ // The new PHI merges the original incoming value, in case of a bypass,
+ // or the value at the end of the vectorized loop.
+ BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
- // Fix the scalar body counter (PHI node).
- // The old induction's phi node in the scalar body needs the truncated
- // value.
- for (BasicBlock *BB : LoopBypassBlocks)
- BCResumeVal->addIncoming(II.getStartValue(), BB);
+ // Fix the scalar body counter (PHI node).
+ // The old induction's phi node in the scalar body needs the truncated
+ // value.
+ for (BasicBlock *BB : BypassBlocks)
+ BCResumeVal->addIncoming(II.getStartValue(), BB);
- if (AdditionalBypass.first)
- BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
- EndValueFromAdditionalBypass);
+ if (AdditionalBypass.first)
+ BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
+ EndValueFromAdditionalBypass);
+ return BCResumeVal;
+}
+void InnerLoopVectorizer::createInductionResumeValues(
+ std::pair<BasicBlock *, Value *> AdditionalBypass) {
+ assert(((AdditionalBypass.first && AdditionalBypass.second) ||
+ (!AdditionalBypass.first && !AdditionalBypass.second)) &&
+ "Inconsistent information about additional bypass.");
+ // We are going to resume the execution of the scalar loop.
+ // Go over all of the induction variables that we found and fix the
+ // PHIs that are left in the scalar version of the loop.
+ // The starting values of PHI nodes depend on the counter of the last
+ // iteration in the vectorized loop.
+ // If we come from a bypass edge then we need to start from the original
+ // start value.
+ for (const auto &InductionEntry : Legal->getInductionVars()) {
+ PHINode *OrigPhi = InductionEntry.first;
+ const InductionDescriptor &II = InductionEntry.second;
+ PHINode *BCResumeVal = createInductionResumeValue(
+ OrigPhi, II, LoopBypassBlocks, AdditionalBypass);
OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
}
}
-BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(MDNode *OrigLoopID) {
+BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
// The trip counts should be cached by now.
Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
@@ -3251,18 +3283,6 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton() {
...
*/
- // Get the metadata of the original loop before it gets modified.
- MDNode *OrigLoopID = OrigLoop->getLoopID();
-
- // Workaround! Compute the trip count of the original loop and cache it
- // before we start modifying the CFG. This code has a systemic problem
- // wherein it tries to run analysis over partially constructed IR; this is
- // wrong, and not simply for SCEV. The trip count of the original loop
- // simply happens to be prone to hitting this in practice. In theory, we
- // can hit the same issue for any SCEV, or ValueTracking query done during
- // mutation. See PR49900.
- getOrCreateTripCount(OrigLoop->getLoopPreheader());
-
// Create an empty vector loop, and prepare basic blocks for the runtime
// checks.
createVectorLoopSkeleton("");
@@ -3286,7 +3306,7 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton() {
// Emit phis for the new starting index of the scalar loop.
createInductionResumeValues();
- return {completeLoopSkeleton(OrigLoopID), nullptr};
+ return {completeLoopSkeleton(), nullptr};
}
// Fix up external users of the induction variable. At this point, we are
@@ -3334,17 +3354,11 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
Value *CountMinusOne = B.CreateSub(
VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
- Value *CMO =
- !II.getStep()->getType()->isIntegerTy()
- ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
- II.getStep()->getType())
- : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
- CMO->setName("cast.cmo");
-
+ CountMinusOne->setName("cmo");
Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(),
VectorHeader->getTerminator());
Value *Escape =
- emitTransformedIndex(B, CMO, II.getStartValue(), Step, II);
+ emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, II);
Escape->setName("ind.escape");
MissingVals[UI] = Escape;
}
@@ -3429,8 +3443,9 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
// to be vectors, so we need to extract individual elements from there,
// execute VF scalar calls, and then gather the result into the vector return
// value.
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost ScalarCallCost =
- TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
+ TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, CostKind);
if (VF.isScalar())
return ScalarCallCost;
@@ -3441,7 +3456,8 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
// Compute costs of unpacking argument values for the scalar calls and
// packing the return values to a vector.
- InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
+ InstructionCost ScalarizationCost =
+ getScalarizationOverhead(CI, VF, CostKind);
InstructionCost Cost =
ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
@@ -3457,7 +3473,7 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
// If the corresponding vector cost is cheaper, return its cost.
InstructionCost VectorCallCost =
- TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
+ TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind);
if (VectorCallCost < Cost) {
NeedToScalarize = false;
Cost = VectorCallCost;
@@ -3672,7 +3688,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
// edge.
// Fix-up external users of the induction variables.
- for (auto &Entry : Legal->getInductionVars())
+ for (const auto &Entry : Legal->getInductionVars())
fixupIVUsers(Entry.first, Entry.second,
getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
IVEndValues[Entry.first], LoopMiddleBlock,
@@ -3682,7 +3698,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
// Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
// in the exit block, so update the builder.
State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI());
- for (auto &KV : Plan.getLiveOuts())
+ for (const auto &KV : Plan.getLiveOuts())
KV.second->fixPhi(Plan, State);
for (Instruction *PI : PredicatedInstructions)
@@ -3722,11 +3738,11 @@ void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
fixReduction(ReductionPhi, State);
else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
- fixFirstOrderRecurrence(FOR, State);
+ fixFixedOrderRecurrence(FOR, State);
}
}
-void InnerLoopVectorizer::fixFirstOrderRecurrence(
+void InnerLoopVectorizer::fixFixedOrderRecurrence(
VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
// This is the second phase of vectorizing first-order recurrences. An
// overview of the transformation is described below. Suppose we have the
@@ -4019,7 +4035,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
// We know that the loop is in LCSSA form. We need to update the PHI nodes
// in the exit blocks. See comment on analogous loop in
- // fixFirstOrderRecurrence for a more complete explaination of the logic.
+ // fixFixedOrderRecurrence for a more complete explaination of the logic.
if (!Cost->requiresScalarEpilogue(VF))
for (PHINode &LCSSAPhi : LoopExitBlock->phis())
if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
@@ -4146,8 +4162,7 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
VPTransformState &State) {
- auto Iter = depth_first(
- VPBlockRecursiveTraversalWrapper<VPBlockBase *>(Plan.getEntry()));
+ auto Iter = vp_depth_first_deep(Plan.getEntry());
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
for (VPRecipeBase &P : VPBB->phis()) {
VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
@@ -4170,78 +4185,6 @@ bool InnerLoopVectorizer::useOrderedReductions(
return Cost->useOrderedReductions(RdxDesc);
}
-void InnerLoopVectorizer::widenCallInstruction(CallInst &CI, VPValue *Def,
- VPUser &ArgOperands,
- VPTransformState &State) {
- assert(!isa<DbgInfoIntrinsic>(CI) &&
- "DbgInfoIntrinsic should have been dropped during VPlan construction");
- State.setDebugLocFromInst(&CI);
-
- SmallVector<Type *, 4> Tys;
- for (Value *ArgOperand : CI.args())
- Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
-
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(&CI, TLI);
-
- // The flag shows whether we use Intrinsic or a usual Call for vectorized
- // version of the instruction.
- // Is it beneficial to perform intrinsic call compared to lib call?
- bool NeedToScalarize = false;
- InstructionCost CallCost = Cost->getVectorCallCost(&CI, VF, NeedToScalarize);
- InstructionCost IntrinsicCost =
- ID ? Cost->getVectorIntrinsicCost(&CI, VF) : 0;
- bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
- assert((UseVectorIntrinsic || !NeedToScalarize) &&
- "Instruction should be scalarized elsewhere.");
- assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
- "Either the intrinsic cost or vector call cost must be valid");
-
- for (unsigned Part = 0; Part < UF; ++Part) {
- SmallVector<Type *, 2> TysForDecl = {CI.getType()};
- SmallVector<Value *, 4> Args;
- for (auto &I : enumerate(ArgOperands.operands())) {
- // Some intrinsics have a scalar argument - don't replace it with a
- // vector.
- Value *Arg;
- if (!UseVectorIntrinsic ||
- !isVectorIntrinsicWithScalarOpAtArg(ID, I.index()))
- Arg = State.get(I.value(), Part);
- else
- Arg = State.get(I.value(), VPIteration(0, 0));
- if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I.index()))
- TysForDecl.push_back(Arg->getType());
- Args.push_back(Arg);
- }
-
- Function *VectorF;
- if (UseVectorIntrinsic) {
- // Use vector version of the intrinsic.
- if (VF.isVector())
- TysForDecl[0] = VectorType::get(CI.getType()->getScalarType(), VF);
- Module *M = State.Builder.GetInsertBlock()->getModule();
- VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
- assert(VectorF && "Can't retrieve vector intrinsic.");
- } else {
- // Use vector version of the function call.
- const VFShape Shape = VFShape::get(CI, VF, false /*HasGlobalPred*/);
-#ifndef NDEBUG
- assert(VFDatabase(CI).getVectorizedFunction(Shape) != nullptr &&
- "Can't create vector function.");
-#endif
- VectorF = VFDatabase(CI).getVectorizedFunction(Shape);
- }
- SmallVector<OperandBundleDef, 1> OpBundles;
- CI.getOperandBundlesAsDefs(OpBundles);
- CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
-
- if (isa<FPMathOperator>(V))
- V->copyFastMathFlags(&CI);
-
- State.set(Def, V, Part);
- State.addMetadata(V, &CI);
- }
-}
-
void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
// We should not collect Scalars more than once per VF. Right now, this
// function is called from collectUniformsAndScalars(), which already does
@@ -4350,8 +4293,10 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
// induction variable when the PHI user is scalarized.
auto ForcedScalar = ForcedScalars.find(VF);
if (ForcedScalar != ForcedScalars.end())
- for (auto *I : ForcedScalar->second)
+ for (auto *I : ForcedScalar->second) {
+ LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
Worklist.insert(I);
+ }
// Expand the worklist by looking through any bitcasts and getelementptr
// instructions we've already identified as scalar. This is similar to the
@@ -4376,7 +4321,7 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
// An induction variable will remain scalar if all users of the induction
// variable and induction variable update remain scalar.
- for (auto &Induction : Legal->getInductionVars()) {
+ for (const auto &Induction : Legal->getInductionVars()) {
auto *Ind = Induction.first;
auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
@@ -4429,15 +4374,16 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
bool LoopVectorizationCostModel::isScalarWithPredication(
Instruction *I, ElementCount VF) const {
- if (!blockNeedsPredicationForAnyReason(I->getParent()))
+ if (!isPredicatedInst(I))
return false;
+
+ // Do we have a non-scalar lowering for this predicated
+ // instruction? No - it is scalar with predication.
switch(I->getOpcode()) {
default:
- break;
+ return true;
case Instruction::Load:
case Instruction::Store: {
- if (!Legal->isMaskRequired(I))
- return false;
auto *Ptr = getLoadStorePointerOperand(I);
auto *Ty = getLoadStoreType(I);
Type *VTy = Ty;
@@ -4452,12 +4398,119 @@ bool LoopVectorizationCostModel::isScalarWithPredication(
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::SRem:
+ case Instruction::URem: {
+ // We have the option to use the safe-divisor idiom to avoid predication.
+ // The cost based decision here will always select safe-divisor for
+ // scalable vectors as scalarization isn't legal.
+ const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
+ return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
+ }
+ }
+}
+
+bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
+ if (!blockNeedsPredicationForAnyReason(I->getParent()))
+ return false;
+
+ // Can we prove this instruction is safe to unconditionally execute?
+ // If not, we must use some form of predication.
+ switch(I->getOpcode()) {
+ default:
+ return false;
+ case Instruction::Load:
+ case Instruction::Store: {
+ if (!Legal->isMaskRequired(I))
+ return false;
+ // When we know the load's address is loop invariant and the instruction
+ // in the original scalar loop was unconditionally executed then we
+ // don't need to mark it as a predicated instruction. Tail folding may
+ // introduce additional predication, but we're guaranteed to always have
+ // at least one active lane. We call Legal->blockNeedsPredication here
+ // because it doesn't query tail-folding. For stores, we need to prove
+ // both speculation safety (which follows from the same argument as loads),
+ // but also must prove the value being stored is correct. The easiest
+ // form of the later is to require that all values stored are the same.
+ if (Legal->isUniformMemOp(*I) &&
+ (isa<LoadInst>(I) ||
+ (isa<StoreInst>(I) &&
+ TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
+ !Legal->blockNeedsPredication(I->getParent()))
+ return false;
+ return true;
+ }
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::SRem:
case Instruction::URem:
// TODO: We can use the loop-preheader as context point here and get
// context sensitive reasoning
return !isSafeToSpeculativelyExecute(I);
}
- return false;
+}
+
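The load/store case above is the behavioural change worth noting: a uniform load can still skip predication, but a uniform store now only does so when the stored value is loop-invariant. A small self-contained model of that predicate (all parameters are illustrative stand-ins for the Legal/TheLoop queries):

#include <cstdio>

// Mirrors the memory-op case of isPredicatedInst(): masked ops are predicated
// unless the address is uniform, the block was unconditionally executed in the
// scalar loop, and (for stores) the stored value is loop-invariant.
static bool memOpNeedsPredication(bool MaskRequired, bool IsLoad,
                                  bool UniformAddress,
                                  bool StoredValueLoopInvariant,
                                  bool BlockNeedsPredication) {
  if (!MaskRequired)
    return false;
  if (UniformAddress && (IsLoad || StoredValueLoopInvariant) &&
      !BlockNeedsPredication)
    return false;
  return true;
}

int main() {
  // A uniform store of a loop-varying value keeps its predication.
  std::printf("%d\n", memOpNeedsPredication(true, /*IsLoad=*/false,
                                            /*UniformAddress=*/true,
                                            /*StoredValueLoopInvariant=*/false,
                                            /*BlockNeedsPredication=*/false)); // 1
}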
+std::pair<InstructionCost, InstructionCost>
+LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
+ ElementCount VF) const {
+ assert(I->getOpcode() == Instruction::UDiv ||
+ I->getOpcode() == Instruction::SDiv ||
+ I->getOpcode() == Instruction::SRem ||
+ I->getOpcode() == Instruction::URem);
+ assert(!isSafeToSpeculativelyExecute(I));
+
+ const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+
+ // Scalarization isn't legal for scalable vector types
+ InstructionCost ScalarizationCost = InstructionCost::getInvalid();
+ if (!VF.isScalable()) {
+ // Get the scalarization cost and scale this amount by the probability of
+ // executing the predicated block. If the instruction is not predicated,
+ // we fall through to the next case.
+ ScalarizationCost = 0;
+
+ // These instructions have a non-void type, so account for the phi nodes
+ // that we will create. This cost is likely to be zero. The phi node
+ // cost, if any, should be scaled by the block probability because it
+ // models a copy at the end of each predicated block.
+ ScalarizationCost += VF.getKnownMinValue() *
+ TTI.getCFInstrCost(Instruction::PHI, CostKind);
+
+ // The cost of the non-predicated instruction.
+ ScalarizationCost += VF.getKnownMinValue() *
+ TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
+
+ // The cost of insertelement and extractelement instructions needed for
+ // scalarization.
+ ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
+
+ // Scale the cost by the probability of executing the predicated blocks.
+ // This assumes the predicated block for each vector lane is equally
+ // likely.
+ ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
+ }
+ InstructionCost SafeDivisorCost = 0;
+
+ auto *VecTy = ToVectorTy(I->getType(), VF);
+
+ // The cost of the select guard to ensure all lanes are well defined
+ // after we speculate above any internal control flow.
+ SafeDivisorCost += TTI.getCmpSelInstrCost(
+ Instruction::Select, VecTy,
+ ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
+ CmpInst::BAD_ICMP_PREDICATE, CostKind);
+
+ // Certain instructions can be cheaper to vectorize if they have a constant
+ // second vector operand. One example of this are shifts on x86.
+ Value *Op2 = I->getOperand(1);
+ auto Op2Info = TTI.getOperandInfo(Op2);
+ if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
+ Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
+
+ SmallVector<const Value *, 4> Operands(I->operand_values());
+ SafeDivisorCost += TTI.getArithmeticInstrCost(
+ I->getOpcode(), VecTy, CostKind,
+ {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+ Op2Info, Operands, I);
+ return {ScalarizationCost, SafeDivisorCost};
}
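To make the two returned costs concrete, here is a worked example with VF = 4 and made-up TTI numbers (every cost constant below is an assumption, not real target data); with these inputs the safe-divisor lowering wins:

#include <cstdio>

int main() {
  // Assumed inputs (illustrative only).
  const unsigned VF = 4;
  const unsigned PhiCost = 0, ScalarDivCost = 5, InsertExtractCost = 8;
  const unsigned RecipPredBlockProb = 2; // getReciprocalPredBlockProb()
  const unsigned SelectCost = 1, VectorDivCost = 10;

  // Scalarization: per-lane phi + div, plus insert/extract overhead, scaled by
  // the probability of executing the predicated block.
  unsigned ScalarizationCost =
      (VF * (PhiCost + ScalarDivCost) + InsertExtractCost) / RecipPredBlockProb; // 14

  // Safe divisor: one vector select guarding the divisor plus the vector div.
  unsigned SafeDivisorCost = SelectCost + VectorDivCost; // 11

  std::printf("scalarize=%u safe-divisor=%u -> %s\n", ScalarizationCost,
              SafeDivisorCost,
              ScalarizationCost < SafeDivisorCost ? "scalarize" : "safe divisor");
}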
bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
@@ -4610,17 +4663,26 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
addToWorklistIfAllowed(Cmp);
+ // Return true if all lanes perform the same memory operation, and we can
+ // thus chose to execute only one.
+ auto isUniformMemOpUse = [&](Instruction *I) {
+ if (!Legal->isUniformMemOp(*I))
+ return false;
+ if (isa<LoadInst>(I))
+ // Loading the same address always produces the same result - at least
+ // assuming aliasing and ordering which have already been checked.
+ return true;
+ // Storing the same value on every iteration.
+ return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
+ };
+
auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
InstWidening WideningDecision = getWideningDecision(I, VF);
assert(WideningDecision != CM_Unknown &&
"Widening decision should be ready at this moment");
- // A uniform memory op is itself uniform. We exclude uniform stores
- // here as they demand the last lane, not the first one.
- if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
- assert(WideningDecision == CM_Scalarize);
+ if (isUniformMemOpUse(I))
return true;
- }
return (WideningDecision == CM_Widen ||
WideningDecision == CM_Widen_Reverse ||
@@ -4674,9 +4736,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
if (!Ptr)
continue;
- // A uniform memory op is itself uniform. We exclude uniform stores
- // here as they demand the last lane, not the first one.
- if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
+ if (isUniformMemOpUse(&I))
addToWorklistIfAllowed(&I);
if (isUniformDecision(&I, VF)) {
@@ -4707,14 +4767,14 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
while (idx != Worklist.size()) {
Instruction *I = Worklist[idx++];
- for (auto OV : I->operand_values()) {
+ for (auto *OV : I->operand_values()) {
// isOutOfScope operands cannot be uniform instructions.
if (isOutOfScope(OV))
continue;
// First order recurrence Phi's should typically be considered
// non-uniform.
auto *OP = dyn_cast<PHINode>(OV);
- if (OP && Legal->isFirstOrderRecurrence(OP))
+ if (OP && Legal->isFixedOrderRecurrence(OP))
continue;
// If all the users of the operand are uniform, then add the
// operand into the uniform worklist.
@@ -4733,7 +4793,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
// nodes separately. An induction variable will remain uniform if all users
// of the induction variable and induction variable update remain uniform.
// The code below handles both pointer and non-pointer induction variables.
- for (auto &Induction : Legal->getInductionVars()) {
+ for (const auto &Induction : Legal->getInductionVars()) {
auto *Ind = Induction.first;
auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
@@ -4846,12 +4906,12 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
return MaxScalableVF;
// Limit MaxScalableVF by the maximum safe dependence distance.
- Optional<unsigned> MaxVScale = TTI.getMaxVScale();
+ std::optional<unsigned> MaxVScale = TTI.getMaxVScale();
if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
MaxVScale =
TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
- MaxScalableVF = ElementCount::getScalable(
- MaxVScale ? (MaxSafeElements / MaxVScale.value()) : 0);
+ MaxScalableVF =
+ ElementCount::getScalable(MaxVScale ? (MaxSafeElements / *MaxVScale) : 0);
if (!MaxScalableVF)
reportVectorizationInfo(
"Max legal vector width too small, scalable vectorization "
@@ -4991,7 +5051,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
case CM_ScalarEpilogueAllowed:
return computeFeasibleMaxVF(TC, UserVF, false);
case CM_ScalarEpilogueNotAllowedUsePredicate:
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
case CM_ScalarEpilogueNotNeededUsePredicate:
LLVM_DEBUG(
dbgs() << "LV: vector predicate hint/switch found.\n"
@@ -5113,7 +5173,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
ElementCount MaxSafeVF, bool FoldTailByMasking) {
bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
- TypeSize WidestRegister = TTI.getRegisterBitWidth(
+ const TypeSize WidestRegister = TTI.getRegisterBitWidth(
ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
: TargetTransformInfo::RGK_FixedWidthVector);
@@ -5127,7 +5187,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
// Ensure MaxVF is a power of 2; the dependence distance bound may not be.
// Note that both WidestRegister and WidestType may not be a powers of 2.
auto MaxVectorElementCount = ElementCount::get(
- PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
+ PowerOf2Floor(WidestRegister.getKnownMinValue() / WidestType),
ComputeScalableMaxVF);
MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
@@ -5140,9 +5200,14 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
return ElementCount::getFixed(1);
}
- const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
- if (ConstTripCount &&
- ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
+ unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
+ if (MaxVectorElementCount.isScalable() &&
+ TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
+ auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
+ auto Min = Attr.getVScaleRangeMin();
+ WidestRegisterMinEC *= Min;
+ }
+ if (ConstTripCount && ConstTripCount <= WidestRegisterMinEC &&
(!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
// If loop trip count (TC) is known at compile time there is no point in
// choosing VF greater than TC (as done in the loop below). Select maximum
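A quick numeric sketch of the new check (the vscale_range value is an assumption): with MaxVectorElementCount = vscale x 4 and a vscale_range minimum of 2, WidestRegisterMinEC is 8, so a loop whose constant trip count is at most 8 takes the clamp-to-trip-count path above.

#include <cstdio>

int main() {
  // Assumed: MaxVectorElementCount = vscale x 4, vscale_range minimum = 2.
  const unsigned KnownMinEC = 4, VScaleRangeMin = 2;
  unsigned WidestRegisterMinEC = KnownMinEC * VScaleRangeMin; // 8
  const unsigned ConstTripCount = 8;
  const bool FoldTailByMasking = false;
  bool ClampToTripCount =
      ConstTripCount != 0 && ConstTripCount <= WidestRegisterMinEC &&
      (!FoldTailByMasking || (ConstTripCount & (ConstTripCount - 1)) == 0);
  std::printf("clamp VF to trip count: %s\n", ClampToTripCount ? "yes" : "no");
}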
@@ -5163,7 +5228,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
TTI.shouldMaximizeVectorBandwidth(RegKind))) {
auto MaxVectorElementCountMaxBW = ElementCount::get(
- PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
+ PowerOf2Floor(WidestRegister.getKnownMinValue() / SmallestType),
ComputeScalableMaxVF);
MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
@@ -5208,7 +5273,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
return MaxVF;
}
-Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
+std::optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
auto Min = Attr.getVScaleRangeMin();
@@ -5244,11 +5309,11 @@ bool LoopVectorizationCostModel::isMoreProfitable(
// Improve estimate for the vector width if it is scalable.
unsigned EstimatedWidthA = A.Width.getKnownMinValue();
unsigned EstimatedWidthB = B.Width.getKnownMinValue();
- if (Optional<unsigned> VScale = getVScaleForTuning()) {
+ if (std::optional<unsigned> VScale = getVScaleForTuning()) {
if (A.Width.isScalable())
- EstimatedWidthA *= VScale.value();
+ EstimatedWidthA *= *VScale;
if (B.Width.isScalable())
- EstimatedWidthB *= VScale.value();
+ EstimatedWidthB *= *VScale;
}
// Assume vscale may be larger than 1 (or the value being tuned for),
@@ -5294,7 +5359,7 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
#ifndef NDEBUG
unsigned AssumedMinimumVscale = 1;
- if (Optional<unsigned> VScale = getVScaleForTuning())
+ if (std::optional<unsigned> VScale = getVScaleForTuning())
AssumedMinimumVscale = *VScale;
unsigned Width =
Candidate.Width.isScalable()
@@ -5365,7 +5430,7 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
raw_string_ostream OS(OutString);
assert(!Subset.empty() && "Unexpected empty range");
OS << "Instruction with invalid costs prevented vectorization at VF=(";
- for (auto &Pair : Subset)
+ for (const auto &Pair : Subset)
OS << (Pair.second == Subset.front().second ? "" : ", ")
<< Pair.second;
OS << "):";
@@ -5403,12 +5468,12 @@ bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
// Cross iteration phis such as reductions need special handling and are
// currently unsupported.
if (any_of(L.getHeader()->phis(),
- [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); }))
+ [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
return false;
// Phis with uses outside of the loop require special handling and are
// currently unsupported.
- for (auto &Entry : Legal->getInductionVars()) {
+ for (const auto &Entry : Legal->getInductionVars()) {
// Look for uses of the value of the induction at the last iteration.
Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
for (User *U : PostInc->users())
@@ -5420,14 +5485,6 @@ bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
return false;
}
- // Induction variables that are widened require special handling that is
- // currently not supported.
- if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
- return !(this->isScalarAfterVectorization(Entry.first, VF) ||
- this->isProfitableToScalarize(Entry.first, VF));
- }))
- return false;
-
// Epilogue vectorization code has not been audited to ensure it handles
// non-latch exits properly. It may be fine, but it needs to be audited and
// tested.
@@ -5443,6 +5500,11 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
// as register pressure, code size increase and cost of extra branches into
// account. For now we apply a very crude heuristic and only consider loops
// with vectorization factors larger than a certain value.
+
+ // Allow the target to opt out entirely.
+ if (!TTI.preferEpilogueVectorization())
+ return false;
+
// We also consider epilogue vectorization unprofitable for targets that don't
// consider interleaving beneficial (e.g., MVE).
if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
@@ -5512,7 +5574,7 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
ElementCount EstimatedRuntimeVF = MainLoopVF;
if (MainLoopVF.isScalable()) {
EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
- if (Optional<unsigned> VScale = getVScaleForTuning())
+ if (std::optional<unsigned> VScale = getVScaleForTuning())
EstimatedRuntimeVF *= *VScale;
}
@@ -5542,7 +5604,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
// Reset MaxWidth so that we can find the smallest type used by recurrences
// in the loop.
MaxWidth = -1U;
- for (auto &PhiDescriptorPair : Legal->getReductionVars()) {
+ for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
// When finding the min width used by the recurrence we need to account
// for casts on the input operands of the recurrence.
@@ -5554,9 +5616,9 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
} else {
for (Type *T : ElementTypesInLoop) {
MinWidth = std::min<unsigned>(
- MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
+ MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
MaxWidth = std::max<unsigned>(
- MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
+ MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
}
}
return {MinWidth, MaxWidth};
@@ -5605,8 +5667,9 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
}
}
-unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
- unsigned LoopCost) {
+unsigned
+LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
+ InstructionCost LoopCost) {
// -- The interleave heuristics --
// We interleave the loop in order to expose ILP and reduce the loop overhead.
// There are many micro-architectural considerations that we can't predict
@@ -5642,9 +5705,8 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
// If we did not calculate the cost for VF (because the user selected the VF)
// then we calculate the cost of VF here.
if (LoopCost == 0) {
- InstructionCost C = expectedCost(VF).first;
- assert(C.isValid() && "Expected to have chosen a VF with valid cost");
- LoopCost = *C.getValue();
+ LoopCost = expectedCost(VF).first;
+ assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
// Loop body is free and there is no need for interleaving.
if (LoopCost == 0)
@@ -5772,8 +5834,8 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
// We assume that the cost overhead is 1 and we use the cost model
// to estimate the cost of the loop and interleave until the cost of the
// loop overhead is about 5% of the cost of the loop.
- unsigned SmallIC =
- std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
+ unsigned SmallIC = std::min(
+ IC, (unsigned)PowerOf2Floor(SmallLoopCost / *LoopCost.getValue()));
// Interleave until store/load ports (estimated by max interleave count) are
// saturated.
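As a rough, standalone illustration of the small-loop clamp above (assuming SmallLoopCost keeps its usual default of 20; this is not the full heuristic):

#include <algorithm>
#include <cstdio>

// Largest power of two <= X (X > 0), mirroring what PowerOf2Floor computes.
static unsigned powerOf2Floor(unsigned X) {
  unsigned P = 1;
  while (P * 2 <= X)
    P *= 2;
  return P;
}

int main() {
  const unsigned SmallLoopCost = 20; // assumed default of -small-loop-cost
  const unsigned IC = 8;             // interleave count picked so far
  for (unsigned LoopCost : {1u, 3u, 7u, 20u}) {
    unsigned SmallIC = std::min(IC, powerOf2Floor(SmallLoopCost / LoopCost));
    std::printf("LoopCost=%2u -> SmallIC=%u\n", LoopCost, SmallIC);
  }
}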
@@ -5888,8 +5950,9 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
IntervalMap EndPoint;
// Saves the list of instruction indices that are used in the loop.
SmallPtrSet<Instruction *, 8> Ends;
- // Saves the list of values that are used in the loop but are
- // defined outside the loop, such as arguments and constants.
+ // Saves the list of values that are used in the loop but are defined outside
+ // the loop (not including non-instruction values such as arguments and
+ // constants).
SmallPtrSet<Value *, 8> LoopInvariants;
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
@@ -5901,6 +5964,9 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
auto *Instr = dyn_cast<Instruction>(U);
// Ignore non-instruction values such as arguments, constants, etc.
+ // FIXME: Might need some motivation as to why these values are ignored. If,
+ // for example, an argument is used inside the loop, it will increase the
+ // register pressure (so shouldn't we add it to LoopInvariants?).
if (!Instr)
continue;
@@ -5956,44 +6022,44 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
// For each VF find the maximum usage of registers.
for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
- // Count the number of live intervals.
+ // Count the number of registers used, per register class, given all open
+ // intervals.
+ // Note that elements in this SmallMapVector will be default constructed
+ // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
+ // there is no previous entry for ClassID.
SmallMapVector<unsigned, unsigned, 4> RegUsage;
if (VFs[j].isScalar()) {
- for (auto Inst : OpenIntervals) {
- unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
- if (RegUsage.find(ClassID) == RegUsage.end())
- RegUsage[ClassID] = 1;
- else
- RegUsage[ClassID] += 1;
+ for (auto *Inst : OpenIntervals) {
+ unsigned ClassID =
+ TTI.getRegisterClassForType(false, Inst->getType());
+ // FIXME: The target might use more than one register for the type
+ // even in the scalar case.
+ RegUsage[ClassID] += 1;
}
} else {
collectUniformsAndScalars(VFs[j]);
- for (auto Inst : OpenIntervals) {
+ for (auto *Inst : OpenIntervals) {
// Skip ignored values for VF > 1.
if (VecValuesToIgnore.count(Inst))
continue;
if (isScalarAfterVectorization(Inst, VFs[j])) {
- unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
- if (RegUsage.find(ClassID) == RegUsage.end())
- RegUsage[ClassID] = 1;
- else
- RegUsage[ClassID] += 1;
+ unsigned ClassID =
+ TTI.getRegisterClassForType(false, Inst->getType());
+ // FIXME: The target might use more than one register for the type
+ // even in the scalar case.
+ RegUsage[ClassID] += 1;
} else {
- unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
- if (RegUsage.find(ClassID) == RegUsage.end())
- RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
- else
- RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
+ unsigned ClassID =
+ TTI.getRegisterClassForType(true, Inst->getType());
+ RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
}
}
}
for (auto& pair : RegUsage) {
- if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
- MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
- else
- MaxUsages[j][pair.first] = pair.second;
+ auto &Entry = MaxUsages[j][pair.first];
+ Entry = std::max(Entry, pair.second);
}
}
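The default-construction behaviour the new comments rely on is ordinary map-like operator[] semantics; a small standalone example (using std::map as a stand-in for SmallMapVector) shows why no explicit find/insert dance is needed:

#include <cstdio>
#include <map>

int main() {
  std::map<unsigned, unsigned> RegUsage; // stand-in for SmallMapVector
  // operator[] value-initializes a missing entry to 0, so "+=" is safe even
  // for a ClassID that has never been seen before.
  RegUsage[0] += 1; // first use of class 0 -> becomes 1
  RegUsage[0] += 2; // accumulates -> 3
  RegUsage[3] += 4; // first use of class 3 -> becomes 4
  for (const auto &[ClassID, N] : RegUsage)
    std::printf("class %u -> %u registers\n", ClassID, N);
}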
@@ -6005,17 +6071,19 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
}
for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
+ // Note that elements in this SmallMapVector will be default constructed
+ // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
+ // there is no previous entry for ClassID.
SmallMapVector<unsigned, unsigned, 4> Invariant;
- for (auto Inst : LoopInvariants) {
+ for (auto *Inst : LoopInvariants) {
+ // FIXME: The target might use more than one register for the type
+ // even in the scalar case.
unsigned Usage =
VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
unsigned ClassID =
TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
- if (Invariant.find(ClassID) == Invariant.end())
- Invariant[ClassID] = Usage;
- else
- Invariant[ClassID] += Usage;
+ Invariant[ClassID] += Usage;
}
LLVM_DEBUG({
@@ -6054,7 +6122,7 @@ bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
// from moving "masked load/store" check from legality to cost model.
// Masked Load/Gather emulation was previously never allowed.
// Limited number of Masked Store/Scatter emulation was allowed.
- assert((isPredicatedInst(I, VF) || Legal->isUniformMemOp(*I)) &&
+ assert((isPredicatedInst(I)) &&
"Expecting a scalar emulated instruction");
return isa<LoadInst>(I) ||
(isa<StoreInst>(I) &&
@@ -6099,7 +6167,7 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
}
}
-int LoopVectorizationCostModel::computePredInstDiscount(
+InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
assert(!isUniformAfterVectorization(PredInst, VF) &&
"Instruction marked uniform-after-vectorization will be predicated");
@@ -6173,13 +6241,14 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// Compute the scalarization overhead of needed insertelement instructions
// and phi nodes.
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
ScalarCost += TTI.getScalarizationOverhead(
cast<VectorType>(ToVectorTy(I->getType(), VF)),
- APInt::getAllOnes(VF.getFixedValue()), true, false);
+ APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
+ /*Extract*/ false, CostKind);
ScalarCost +=
- VF.getFixedValue() *
- TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
+ VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
}
// Compute the scalarization overhead of needed extractelement
@@ -6195,7 +6264,8 @@ int LoopVectorizationCostModel::computePredInstDiscount(
else if (needsExtract(J, VF)) {
ScalarCost += TTI.getScalarizationOverhead(
cast<VectorType>(ToVectorTy(J->getType(), VF)),
- APInt::getAllOnes(VF.getFixedValue()), false, true);
+ APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
+ /*Extract*/ true, CostKind);
}
}
@@ -6208,7 +6278,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
ScalarCosts[I] = ScalarCost;
}
- return *Discount.getValue();
+ return Discount;
}
LoopVectorizationCostModel::VectorizationCostTy
@@ -6324,19 +6394,20 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
// Don't pass *I here, since it is scalar but will actually be part of a
// vectorized loop where the user of it is a vectorized instruction.
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
const Align Alignment = getLoadStoreAlignment(I);
- Cost += VF.getKnownMinValue() *
- TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
- AS, TTI::TCK_RecipThroughput);
+ Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
+ ValTy->getScalarType(),
+ Alignment, AS, CostKind);
// Get the overhead of the extractelement and insertelement instructions
// we might create due to scalarization.
- Cost += getScalarizationOverhead(I, VF);
+ Cost += getScalarizationOverhead(I, VF, CostKind);
// If we have a predicated load/store, it will need extra i1 extracts and
// conditional branches, but may not be executed for each vector lane. Scale
// the cost by the probability of executing the predicated block.
- if (isPredicatedInst(I, VF)) {
+ if (isPredicatedInst(I)) {
Cost /= getReciprocalPredBlockProb();
// Add the cost of an i1 extract and a branch
@@ -6344,8 +6415,8 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
Cost += TTI.getScalarizationOverhead(
Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
- /*Insert=*/false, /*Extract=*/true);
- Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
+ /*Insert=*/false, /*Extract=*/true, CostKind);
+ Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
if (useEmulatedMaskMemRefHack(I, VF))
// Artificially setting to a high enough value to practically disable
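A back-of-the-envelope sketch of the scaling above, in plain C++. The block probability of 1/2 (getReciprocalPredBlockProb() == 2) and the per-lane costs are assumptions for illustration only:

#include <cstdio>

int main() {
  const unsigned VF = 4;
  const unsigned ScalarMemOpCost = 1;         // assumed per-lane load/store cost
  const unsigned ScalarizationOverhead = 4;   // assumed insert/extract cost
  const unsigned ReciprocalPredBlockProb = 2; // predicated block taken ~50%

  unsigned Cost = VF * ScalarMemOpCost + ScalarizationOverhead;
  // The predicated block is not executed for every lane: scale by probability.
  Cost /= ReciprocalPredBlockProb;
  // Plus an i1 extract and a branch per lane (each assumed to cost 1).
  Cost += VF * (1 + 1);
  std::printf("estimated scalarized predicated cost: %u\n", Cost);
}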
@@ -6370,17 +6441,19 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
"Stride should be 1 or -1 for consecutive memory access");
const Align Alignment = getLoadStoreAlignment(I);
InstructionCost Cost = 0;
- if (Legal->isMaskRequired(I))
+ if (Legal->isMaskRequired(I)) {
Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
CostKind);
- else
+ } else {
+ TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
- CostKind, I);
+ CostKind, OpInfo, I);
+ }
bool Reverse = ConsecutiveStride < 0;
if (Reverse)
- Cost +=
- TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
+ Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
+ std::nullopt, CostKind, 0);
return Cost;
}
@@ -6409,7 +6482,7 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
(isLoopInvariantStoreValue
? 0
: TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
- VF.getKnownMinValue() - 1));
+ CostKind, VF.getKnownMinValue() - 1));
}
InstructionCost
@@ -6437,6 +6510,7 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
Type *ValTy = getLoadStoreType(I);
auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
unsigned AS = getLoadStoreAddressSpace(I);
+ enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
auto Group = getInterleavedAccessGroup(I);
assert(Group && "Fail to get an interleaved access group.");
@@ -6456,25 +6530,26 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
(isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
- AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
+ AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);
if (Group->isReverse()) {
// TODO: Add support for reversed masked interleaved access.
assert(!Legal->isMaskRequired(I) &&
"Reverse masked interleaved access not supported.");
- Cost +=
- Group->getNumMembers() *
- TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
+ Cost += Group->getNumMembers() *
+ TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
+ std::nullopt, CostKind, 0);
}
return Cost;
}
-Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
+std::optional<InstructionCost>
+LoopVectorizationCostModel::getReductionPatternCost(
Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
using namespace llvm::PatternMatch;
// Early exit for no inloop reductions
if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
- return None;
+ return std::nullopt;
auto *VectorTy = cast<VectorType>(Ty);
// We are looking for a pattern of, and finding the minimal acceptable cost:
@@ -6492,20 +6567,19 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
Instruction *RetI = I;
if (match(RetI, m_ZExtOrSExt(m_Value()))) {
if (!RetI->hasOneUser())
- return None;
+ return std::nullopt;
RetI = RetI->user_back();
}
- if (match(RetI, m_Mul(m_Value(), m_Value())) &&
+
+ if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
RetI->user_back()->getOpcode() == Instruction::Add) {
- if (!RetI->hasOneUser())
- return None;
RetI = RetI->user_back();
}
// Test if the found instruction is a reduction, and if not return an invalid
// cost specifying the parent to use the original cost modelling.
if (!InLoopReductionImmediateChains.count(RetI))
- return None;
+ return std::nullopt;
// Find the reduction this chain is a part of and calculate the basic cost of
// the reduction on its own.
@@ -6541,7 +6615,7 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
Instruction *Op0, *Op1;
- if (RedOp &&
+ if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
match(RedOp,
m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
match(Op0, m_ZExtOrSExt(m_Value())) &&
@@ -6550,7 +6624,7 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
!TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
(Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
- // Matched reduce(ext(mul(ext(A), ext(B)))
+ // Matched reduce.add(ext(mul(ext(A), ext(B))))
// Note that the extend opcodes need to all match, or if A==B they will have
// been converted to zext(mul(sext(A), sext(A))) as it is known positive,
// which is equally fine.
@@ -6567,9 +6641,8 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
TTI::CastContextHint::None, CostKind, RedOp);
- InstructionCost RedCost = TTI.getExtendedAddReductionCost(
- /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
- CostKind);
+ InstructionCost RedCost = TTI.getMulAccReductionCost(
+ IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
if (RedCost.isValid() &&
RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
@@ -6579,16 +6652,16 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
// Matched reduce(ext(A))
bool IsUnsigned = isa<ZExtInst>(RedOp);
auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
- InstructionCost RedCost = TTI.getExtendedAddReductionCost(
- /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
- CostKind);
+ InstructionCost RedCost = TTI.getExtendedReductionCost(
+ RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
+ RdxDesc.getFastMathFlags(), CostKind);
InstructionCost ExtCost =
TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
TTI::CastContextHint::None, CostKind, RedOp);
if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
return I == RetI ? RedCost : 0;
- } else if (RedOp &&
+ } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
if (match(Op0, m_ZExtOrSExt(m_Value())) &&
Op0->getOpcode() == Op1->getOpcode() &&
@@ -6601,7 +6674,7 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
: Op0Ty;
auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
- // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of
+ // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
// different sizes. We take the largest type as the ext to reduce, and add
// the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
InstructionCost ExtCost0 = TTI.getCastInstrCost(
@@ -6613,9 +6686,8 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
InstructionCost MulCost =
TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
- InstructionCost RedCost = TTI.getExtendedAddReductionCost(
- /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
- CostKind);
+ InstructionCost RedCost = TTI.getMulAccReductionCost(
+ IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
InstructionCost ExtraExtCost = 0;
if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
@@ -6629,20 +6701,19 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
(RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
return I == RetI ? RedCost : 0;
} else if (!match(I, m_ZExtOrSExt(m_Value()))) {
- // Matched reduce(mul())
+ // Matched reduce.add(mul())
InstructionCost MulCost =
TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
- InstructionCost RedCost = TTI.getExtendedAddReductionCost(
- /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
- CostKind);
+ InstructionCost RedCost = TTI.getMulAccReductionCost(
+ true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
if (RedCost.isValid() && RedCost < MulCost + BaseCost)
return I == RetI ? RedCost : 0;
}
}
- return I == RetI ? Optional<InstructionCost>(BaseCost) : None;
+ return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
}
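The reduce.add(ext(mul(ext(A), ext(B)))) shape being costed here corresponds to an ordinary widening multiply-accumulate loop in source code. A minimal example of the kind of scalar loop that produces it (illustrative only):

#include <cstdint>
#include <cstdio>

// Widening dot product: i8 operands are extended, multiplied, and the i32
// products are accumulated. Vectorized, this is reduce.add(ext(mul(ext,ext))).
static int32_t dot(const int8_t *A, const int8_t *B, int N) {
  int32_t Sum = 0;
  for (int I = 0; I < N; ++I)
    Sum += static_cast<int32_t>(A[I]) * static_cast<int32_t>(B[I]);
  return Sum;
}

int main() {
  int8_t A[] = {1, 2, 3, 4};
  int8_t B[] = {5, 6, 7, 8};
  std::printf("%d\n", static_cast<int>(dot(A, B, 4))); // 5 + 12 + 21 + 32 = 70
}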
InstructionCost
@@ -6655,9 +6726,10 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
const Align Alignment = getLoadStoreAlignment(I);
unsigned AS = getLoadStoreAddressSpace(I);
+ TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
return TTI.getAddressComputationCost(ValTy) +
TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
- TTI::TCK_RecipThroughput, I);
+ TTI::TCK_RecipThroughput, OpInfo, I);
}
return getWideningCost(I, VF);
}
@@ -6705,9 +6777,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
return VectorizationCostTy(C, TypeNotScalarized);
}
-InstructionCost
-LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
- ElementCount VF) const {
+InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
+ Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
// There is no mechanism yet to create a scalable scalarization loop,
// so this is currently Invalid.
@@ -6722,8 +6793,9 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
if (!RetTy->isVoidTy() &&
(!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
Cost += TTI.getScalarizationOverhead(
- cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true,
- false);
+ cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
+ /*Insert*/ true,
+ /*Extract*/ false, CostKind);
// Some targets keep addresses scalar.
if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
@@ -6743,7 +6815,7 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
for (auto *V : filterExtractingOperands(Ops, VF))
Tys.push_back(MaybeVectorizeType(V->getType(), VF));
return Cost + TTI.getOperandsScalarizationOverhead(
- filterExtractingOperands(Ops, VF), Tys);
+ filterExtractingOperands(Ops, VF), Tys, CostKind);
}
void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
@@ -6765,29 +6837,47 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
NumPredStores++;
if (Legal->isUniformMemOp(I)) {
- // Lowering story for uniform memory ops is currently a bit complicated.
- // Scalarization works for everything which isn't a store with scalable
- // VF. Fixed len VFs just scalarize and then DCE later; scalarization
- // knows how to handle uniform-per-part values (i.e. the first lane
- // in each unrolled VF) and can thus handle scalable loads too. For
- // scalable stores, we use a scatter if legal. If not, we have no way
- // to lower (currently) and thus have to abort vectorization.
- if (isa<StoreInst>(&I) && VF.isScalable()) {
- if (isLegalGatherOrScatter(&I, VF))
- setWideningDecision(&I, VF, CM_GatherScatter,
- getGatherScatterCost(&I, VF));
- else
- // Error case, abort vectorization
- setWideningDecision(&I, VF, CM_Scalarize,
- InstructionCost::getInvalid());
- continue;
- }
+ auto isLegalToScalarize = [&]() {
+ if (!VF.isScalable())
+ // Scalarization of fixed length vectors "just works".
+ return true;
+
+ // We have dedicated lowering for unpredicated uniform loads and
+ // stores. Note that even with tail folding we know that at least
+ // one lane is active (i.e. generalized predication is not possible
+ // here), and the logic below depends on this fact.
+ if (!foldTailByMasking())
+ return true;
+
+ // For scalable vectors, a uniform memop load is always
+ // uniform-by-parts and we know how to scalarize that.
+ if (isa<LoadInst>(I))
+ return true;
+
+ // A uniform store isn't necessarily uniform-by-parts
+ // and we can't assume scalarization.
+ auto &SI = cast<StoreInst>(I);
+ return TheLoop->isLoopInvariant(SI.getValueOperand());
+ };
+
+ const InstructionCost GatherScatterCost =
+ isLegalGatherOrScatter(&I, VF) ?
+ getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
+
// Load: Scalar load + broadcast
// Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
- // TODO: Avoid replicating loads and stores instead of relying on
- // instcombine to remove them.
- setWideningDecision(&I, VF, CM_Scalarize,
- getUniformMemOpCost(&I, VF));
+ // FIXME: This cost is a significant under-estimate for tail folded
+ // memory ops.
+ const InstructionCost ScalarizationCost = isLegalToScalarize() ?
+ getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid();
+
+ // Choose the better solution for the current VF. Note that Invalid
+ // costs compare as maximally large. If both are invalid, we get an
+ // Invalid cost, which signals a failure and a vectorization abort.
+ if (GatherScatterCost < ScalarizationCost)
+ setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
+ else
+ setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
continue;
}
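The "Invalid compares as maximally large" behaviour relied on above can be pictured with a tiny stand-in for InstructionCost (this is not the LLVM class, just a sketch of the comparison semantics):

#include <cstdio>
#include <limits>

// An invalid cost compares greater than any valid cost, so picking the
// smaller side never selects an invalid option unless both are invalid.
struct Cost {
  unsigned Value;
  bool Valid;
  static Cost invalid() { return {std::numeric_limits<unsigned>::max(), false}; }
};
static bool operator<(const Cost &L, const Cost &R) { return L.Value < R.Value; }

int main() {
  Cost GatherScatter = Cost::invalid(); // e.g. target has no gather/scatter
  Cost Scalarize = {6, true};
  bool UseGatherScatter = GatherScatter < Scalarize;
  const Cost &Chosen = UseGatherScatter ? GatherScatter : Scalarize;
  if (!Chosen.Valid)
    std::printf("both options invalid -> abort vectorization\n");
  else
    std::printf("chose %s (cost %u)\n",
                UseGatherScatter ? "gather/scatter" : "scalarize", Chosen.Value);
}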
@@ -6982,7 +7072,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
return (
TTI.getScalarizationOverhead(
- Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
+ Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
+ /*Insert*/ false, /*Extract*/ true, CostKind) +
(TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
} else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
// The back-edge branch will remain, as will all scalar branches.
@@ -6998,11 +7089,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
auto *Phi = cast<PHINode>(I);
// First-order recurrences are replaced by vector shuffles inside the loop.
- // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
- if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
- return TTI.getShuffleCost(
- TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
- None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
+ if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
+ SmallVector<int> Mask(VF.getKnownMinValue());
+ std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
+ return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
+ cast<VectorType>(VectorTy), Mask, CostKind,
+ VF.getKnownMinValue() - 1);
+ }
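For a concrete feel of the splice mask built above: with VF = 4 the std::iota call produces {3, 4, 5, 6}, i.e. the last lane of the previous vector followed by the first three lanes of the current one. A standalone sketch:

#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  const unsigned VF = 4; // known minimum number of lanes
  std::vector<int> Mask(VF);
  // Same construction as in the cost model: indices VF-1, VF, VF+1, ...
  std::iota(Mask.begin(), Mask.end(), static_cast<int>(VF) - 1);
  for (int Idx : Mask)
    std::printf("%d ", Idx); // prints: 3 4 5 6
  std::printf("\n");
}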
// Phi nodes in non-header blocks (not inductions, reductions, etc.) are
// converted into select instructions. We require N - 1 selects per phi
@@ -7020,34 +7113,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
case Instruction::SDiv:
case Instruction::URem:
case Instruction::SRem:
- // If we have a predicated instruction, it may not be executed for each
- // vector lane. Get the scalarization cost and scale this amount by the
- // probability of executing the predicated block. If the instruction is not
- // predicated, we fall through to the next case.
- if (VF.isVector() && isScalarWithPredication(I, VF)) {
- InstructionCost Cost = 0;
-
- // These instructions have a non-void type, so account for the phi nodes
- // that we will create. This cost is likely to be zero. The phi node
- // cost, if any, should be scaled by the block probability because it
- // models a copy at the end of each predicated block.
- Cost += VF.getKnownMinValue() *
- TTI.getCFInstrCost(Instruction::PHI, CostKind);
-
- // The cost of the non-predicated instruction.
- Cost += VF.getKnownMinValue() *
- TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
-
- // The cost of insertelement and extractelement instructions needed for
- // scalarization.
- Cost += getScalarizationOverhead(I, VF);
-
- // Scale the cost by the probability of executing the predicated blocks.
- // This assumes the predicated block for each vector lane is equally
- // likely.
- return Cost / getReciprocalPredBlockProb();
+ if (VF.isVector() && isPredicatedInst(I)) {
+ const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
+ return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
+ ScalarCost : SafeDivisorCost;
}
- LLVM_FALLTHROUGH;
+ // We've proven all lanes safe to speculate; fall through.
+ [[fallthrough]];
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
@@ -7073,22 +7145,22 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
// Certain instructions can be cheaper to vectorize if they have a constant
// second vector operand. One example of this is shifts on x86.
Value *Op2 = I->getOperand(1);
- TargetTransformInfo::OperandValueProperties Op2VP;
- TargetTransformInfo::OperandValueKind Op2VK =
- TTI.getOperandInfo(Op2, Op2VP);
- if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
- Op2VK = TargetTransformInfo::OK_UniformValue;
+ auto Op2Info = TTI.getOperandInfo(Op2);
+ if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
+ Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
SmallVector<const Value *, 4> Operands(I->operand_values());
return TTI.getArithmeticInstrCost(
- I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
- Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
+ I->getOpcode(), VectorTy, CostKind,
+ {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+ Op2Info, Operands, I);
}
case Instruction::FNeg: {
return TTI.getArithmeticInstrCost(
- I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
- TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None, I->getOperand(0), I);
+ I->getOpcode(), VectorTy, CostKind,
+ {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+ {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+ I->getOperand(0), I);
}
case Instruction::Select: {
SelectInst *SI = cast<SelectInst>(I);
@@ -7101,17 +7173,15 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
// select x, y, false --> x & y
// select x, true, y --> x | y
- TTI::OperandValueProperties Op1VP = TTI::OP_None;
- TTI::OperandValueProperties Op2VP = TTI::OP_None;
- TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP);
- TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP);
+ const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
+ const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
assert(Op0->getType()->getScalarSizeInBits() == 1 &&
Op1->getType()->getScalarSizeInBits() == 1);
SmallVector<const Value *, 2> Operands{Op0, Op1};
return TTI.getArithmeticInstrCost(
match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
- CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I);
+ CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
}
Type *CondTy = SI->getCondition()->getType();
@@ -7153,7 +7223,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
case Instruction::BitCast:
if (I->getType()->isPointerTy())
return 0;
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
@@ -7262,7 +7332,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
// the result would need to be a vector of pointers.
if (VF.isScalable())
return InstructionCost::getInvalid();
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
default:
// This opcode is unknown. Assume that it is the same as 'mul'.
return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
@@ -7276,7 +7346,6 @@ static const char lv_name[] = "Loop Vectorization";
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
@@ -7317,14 +7386,14 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
// Ignore type-promoting instructions we identified during reduction
// detection.
- for (auto &Reduction : Legal->getReductionVars()) {
+ for (const auto &Reduction : Legal->getReductionVars()) {
const RecurrenceDescriptor &RedDes = Reduction.second;
const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
VecValuesToIgnore.insert(Casts.begin(), Casts.end());
}
// Ignore type-casting instructions we identified during induction
// detection.
- for (auto &Induction : Legal->getInductionVars()) {
+ for (const auto &Induction : Legal->getInductionVars()) {
const InductionDescriptor &IndDes = Induction.second;
const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
VecValuesToIgnore.insert(Casts.begin(), Casts.end());
@@ -7332,7 +7401,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
}
void LoopVectorizationCostModel::collectInLoopReductions() {
- for (auto &Reduction : Legal->getReductionVars()) {
+ for (const auto &Reduction : Legal->getReductionVars()) {
PHINode *Phi = Reduction.first;
const RecurrenceDescriptor &RdxDesc = Reduction.second;
@@ -7394,7 +7463,7 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
if (UserVF.isZero()) {
VF = ElementCount::getFixed(determineVPlanVF(
TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
- .getFixedSize(),
+ .getFixedValue(),
CM));
LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
@@ -7425,12 +7494,12 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
return VectorizationFactor::Disabled();
}
-Optional<VectorizationFactor>
+std::optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
- return None;
+ return std::nullopt;
// Invalidate interleave groups if all blocks of the loop will be predicated.
if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
@@ -7550,9 +7619,26 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
InnerLoopVectorizer &ILV,
DominatorTree *DT,
bool IsEpilogueVectorization) {
+ assert(BestVPlan.hasVF(BestVF) &&
+ "Trying to execute plan with unsupported VF");
+ assert(BestVPlan.hasUF(BestUF) &&
+ "Trying to execute plan with unsupported UF");
+
LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
<< '\n');
+ // Workaround! Compute the trip count of the original loop and cache it
+ // before we start modifying the CFG. This code has a systemic problem
+ // wherein it tries to run analysis over partially constructed IR; this is
+ // wrong, and not simply for SCEV. The trip count of the original loop
+ // simply happens to be prone to hitting this in practice. In theory, we
+ // can hit the same issue for any SCEV, or ValueTracking query done during
+ // mutation. See PR49900.
+ ILV.getOrCreateTripCount(OrigLoop->getLoopPreheader());
+
+ if (!IsEpilogueVectorization)
+ VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
+
// Perform the actual loop transformation.
// 1. Set up the skeleton for vectorization, including vector pre-header and
@@ -7602,7 +7688,7 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
// replace the vectorizer-specific hints below).
MDNode *OrigLoopID = OrigLoop->getLoopID();
- Optional<MDNode *> VectorizedLoopID =
+ std::optional<MDNode *> VectorizedLoopID =
makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
LLVMLoopVectorizeFollowupVectorized});
@@ -7610,7 +7696,7 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
if (VectorizedLoopID)
- L->setLoopID(VectorizedLoopID.value());
+ L->setLoopID(*VectorizedLoopID);
else {
// Keep all loop hints from the original loop on the vector loop (we'll
// replace the vectorizer-specific hints below).
@@ -7620,9 +7706,7 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
LoopVectorizeHints Hints(L, true, *ORE);
Hints.setAlreadyVectorized();
}
- // Disable runtime unrolling when vectorizing the epilogue loop.
- if (CanonicalIVStartValue)
- AddRuntimeUnrollDisableMetaData(L);
+ AddRuntimeUnrollDisableMetaData(L);
// 3. Fix the vectorized code: take care of header phi's, live-outs,
// predication, updating analyses.
@@ -7651,16 +7735,6 @@ Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
std::pair<BasicBlock *, Value *>
EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
- MDNode *OrigLoopID = OrigLoop->getLoopID();
-
- // Workaround! Compute the trip count of the original loop and cache it
- // before we start modifying the CFG. This code has a systemic problem
- // wherein it tries to run analysis over partially constructed IR; this is
- // wrong, and not simply for SCEV. The trip count of the original loop
- // simply happens to be prone to hitting this in practice. In theory, we
- // can hit the same issue for any SCEV, or ValueTracking query done during
- // mutation. See PR49900.
- getOrCreateTripCount(OrigLoop->getLoopPreheader());
createVectorLoopSkeleton("");
// Generate the code to check the minimum iteration count of the vector
@@ -7691,11 +7765,11 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
// Skip induction resume value creation here because they will be created in
- // the second pass. If we created them here, they wouldn't be used anyway,
- // because the vplan in the second pass still contains the inductions from the
- // original loop.
+ // the second pass for the scalar loop. The induction resume values for the
+ // inductions in the epilogue loop are created before executing the plan for
+ // the epilogue loop.
- return {completeLoopSkeleton(OrigLoopID), nullptr};
+ return {completeLoopSkeleton(), nullptr};
}
void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
@@ -7779,7 +7853,6 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
std::pair<BasicBlock *, Value *>
EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
- MDNode *OrigLoopID = OrigLoop->getLoopID();
createVectorLoopSkeleton("vec.epilog.");
// Now, compare the remaining count and if there aren't enough iterations to
@@ -7825,31 +7898,40 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
DT->changeImmediateDominator(LoopExitBlock,
EPI.EpilogueIterationCountCheck);
- // Keep track of bypass blocks, as they feed start values to the induction
- // phis in the scalar loop preheader.
+ // Keep track of bypass blocks, as they feed start values to the induction and
+ // reduction phis in the scalar loop preheader.
if (EPI.SCEVSafetyCheck)
LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
if (EPI.MemSafetyCheck)
LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
- // The vec.epilog.iter.check block may contain Phi nodes from reductions which
- // merge control-flow from the latch block and the middle block. Update the
- // incoming values here and move the Phi into the preheader.
+ // The vec.epilog.iter.check block may contain Phi nodes from inductions or
+ // reductions which merge control-flow from the latch block and the middle
+ // block. Update the incoming values here and move the Phi into the preheader.
SmallVector<PHINode *, 4> PhisInBlock;
for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
PhisInBlock.push_back(&Phi);
for (PHINode *Phi : PhisInBlock) {
+ Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
Phi->replaceIncomingBlockWith(
VecEpilogueIterationCountCheck->getSinglePredecessor(),
VecEpilogueIterationCountCheck);
+
+ // If the phi doesn't have an incoming value from the
+ // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
+ // value and also those from other check blocks. This is needed for
+ // reduction phis only.
+ if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
+ return EPI.EpilogueIterationCountCheck == IncB;
+ }))
+ continue;
Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
if (EPI.SCEVSafetyCheck)
Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
if (EPI.MemSafetyCheck)
Phi->removeIncomingValue(EPI.MemSafetyCheck);
- Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
}
// Generate a resume induction for the vector epilogue and put it in the
@@ -7871,7 +7953,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
createInductionResumeValues({VecEpilogueIterationCountCheck,
EPI.VectorTripCount} /* AdditionalBypass */);
- return {completeLoopSkeleton(OrigLoopID), EPResumeVal};
+ return {completeLoopSkeleton(), EPResumeVal};
}
BasicBlock *
@@ -8149,9 +8231,18 @@ VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
*PSE.getSE(), *OrigLoop, Range);
// Check if this is pointer induction. If so, build the recipe for it.
- if (auto *II = Legal->getPointerInductionDescriptor(Phi))
- return new VPWidenPointerInductionRecipe(Phi, Operands[0], *II,
- *PSE.getSE());
+ if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
+ VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
+ *PSE.getSE());
+ assert(isa<SCEVConstant>(II->getStep()));
+ return new VPWidenPointerInductionRecipe(
+ Phi, Operands[0], Step, *II,
+ LoopVectorizationPlanner::getDecisionAndClampRange(
+ [&](ElementCount VF) {
+ return CM.isScalarAfterVectorization(Phi, VF);
+ },
+ Range));
+ }
return nullptr;
}
@@ -8188,12 +8279,8 @@ VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
VPlanPtr &Plan) {
// If all incoming values are equal, the incoming VPValue can be used directly
// instead of creating a new VPBlendRecipe.
- VPValue *FirstIncoming = Operands[0];
- if (all_of(Operands, [FirstIncoming](const VPValue *Inc) {
- return FirstIncoming == Inc;
- })) {
+ if (llvm::all_equal(Operands))
return Operands[0];
- }
unsigned NumIncoming = Phi->getNumIncomingValues();
// For in-loop reductions, we do not need to create an additional select.
@@ -8252,24 +8339,42 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
ID == Intrinsic::experimental_noalias_scope_decl))
return nullptr;
- auto willWiden = [&](ElementCount VF) -> bool {
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- // The following case may be scalarized depending on the VF.
- // The flag shows whether we use Intrinsic or a usual Call for vectorized
- // version of the instruction.
- // Is it beneficial to perform intrinsic call compared to lib call?
- bool NeedToScalarize = false;
- InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
- InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
- bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
- return UseVectorIntrinsic || !NeedToScalarize;
- };
+ ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
- if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
- return nullptr;
+ // Is it beneficial to perform intrinsic call compared to lib call?
+ bool ShouldUseVectorIntrinsic =
+ ID && LoopVectorizationPlanner::getDecisionAndClampRange(
+ [&](ElementCount VF) -> bool {
+ bool NeedToScalarize = false;
+ // Is it beneficial to perform intrinsic call compared to lib
+ // call?
+ InstructionCost CallCost =
+ CM.getVectorCallCost(CI, VF, NeedToScalarize);
+ InstructionCost IntrinsicCost =
+ CM.getVectorIntrinsicCost(CI, VF);
+ return IntrinsicCost <= CallCost;
+ },
+ Range);
+ if (ShouldUseVectorIntrinsic)
+ return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID);
+
+ // Is it better to call a vectorized version of the function than to scalarize
+ // the call?
+ auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
+ [&](ElementCount VF) -> bool {
+ // The following case may be scalarized depending on the VF.
+ // The flag shows whether we can use a regular Call for the vectorized
+ // version of the instruction.
+ bool NeedToScalarize = false;
+ CM.getVectorCallCost(CI, VF, NeedToScalarize);
+ return !NeedToScalarize;
+ },
+ Range);
+ if (ShouldUseVectorCall)
+ return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),
+ Intrinsic::not_intrinsic);
- ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
- return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
+ return nullptr;
}
bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
@@ -8286,55 +8391,65 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
Range);
}
-VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
- ArrayRef<VPValue *> Operands) const {
- auto IsVectorizableOpcode = [](unsigned Opcode) {
- switch (Opcode) {
- case Instruction::Add:
- case Instruction::And:
- case Instruction::AShr:
- case Instruction::BitCast:
- case Instruction::FAdd:
- case Instruction::FCmp:
- case Instruction::FDiv:
- case Instruction::FMul:
- case Instruction::FNeg:
- case Instruction::FPExt:
- case Instruction::FPToSI:
- case Instruction::FPToUI:
- case Instruction::FPTrunc:
- case Instruction::FRem:
- case Instruction::FSub:
- case Instruction::ICmp:
- case Instruction::IntToPtr:
- case Instruction::LShr:
- case Instruction::Mul:
- case Instruction::Or:
- case Instruction::PtrToInt:
- case Instruction::SDiv:
- case Instruction::Select:
- case Instruction::SExt:
- case Instruction::Shl:
- case Instruction::SIToFP:
- case Instruction::SRem:
- case Instruction::Sub:
- case Instruction::Trunc:
- case Instruction::UDiv:
- case Instruction::UIToFP:
- case Instruction::URem:
- case Instruction::Xor:
- case Instruction::ZExt:
- case Instruction::Freeze:
- return true;
+VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I,
+ ArrayRef<VPValue *> Operands,
+ VPBasicBlock *VPBB, VPlanPtr &Plan) {
+ switch (I->getOpcode()) {
+ default:
+ return nullptr;
+ case Instruction::SDiv:
+ case Instruction::UDiv:
+ case Instruction::SRem:
+ case Instruction::URem: {
+ // If not provably safe, use a select to form a safe divisor before widening the
+ // div/rem operation itself. Otherwise fall through to general handling below.
+ if (CM.isPredicatedInst(I)) {
+ SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
+ VPValue *Mask = createBlockInMask(I->getParent(), Plan);
+ VPValue *One =
+ Plan->getOrAddExternalDef(ConstantInt::get(I->getType(), 1u, false));
+ auto *SafeRHS =
+ new VPInstruction(Instruction::Select, {Mask, Ops[1], One},
+ I->getDebugLoc());
+ VPBB->appendRecipe(SafeRHS);
+ Ops[1] = SafeRHS;
+ return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
}
- return false;
+ LLVM_FALLTHROUGH;
+ }
+ case Instruction::Add:
+ case Instruction::And:
+ case Instruction::AShr:
+ case Instruction::BitCast:
+ case Instruction::FAdd:
+ case Instruction::FCmp:
+ case Instruction::FDiv:
+ case Instruction::FMul:
+ case Instruction::FNeg:
+ case Instruction::FPExt:
+ case Instruction::FPToSI:
+ case Instruction::FPToUI:
+ case Instruction::FPTrunc:
+ case Instruction::FRem:
+ case Instruction::FSub:
+ case Instruction::ICmp:
+ case Instruction::IntToPtr:
+ case Instruction::LShr:
+ case Instruction::Mul:
+ case Instruction::Or:
+ case Instruction::PtrToInt:
+ case Instruction::Select:
+ case Instruction::SExt:
+ case Instruction::Shl:
+ case Instruction::SIToFP:
+ case Instruction::Sub:
+ case Instruction::Trunc:
+ case Instruction::UIToFP:
+ case Instruction::Xor:
+ case Instruction::ZExt:
+ case Instruction::Freeze:
+ return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
};
-
- if (!IsVectorizableOpcode(I->getOpcode()))
- return nullptr;
-
- // Success: widen this instruction.
- return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
}
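Conceptually, the safe-divisor path in tryToWiden above rewrites a lane-predicated division so that inactive lanes divide by 1 instead of a possibly-trapping divisor; the per-lane effect is sketched below in plain C++ (not the VPlan recipes themselves):

#include <cstdio>

// Per-lane view of the safe-divisor transform: where the mask is false the
// divisor is replaced by 1, so the whole vector division can be executed
// unconditionally without risking a divide-by-zero on inactive lanes.
int main() {
  const int VF = 4;
  int A[VF] = {10, 20, 30, 40};
  int B[VF] = {2, 0, 5, 0};
  bool Mask[VF] = {true, false, true, false}; // lanes where the div executes
  int Q[VF];
  for (int I = 0; I < VF; ++I) {
    int SafeB = Mask[I] ? B[I] : 1; // select(mask, b, 1)
    Q[I] = A[I] / SafeB;            // now safe for every lane
  }
  for (int I = 0; I < VF; ++I)
    std::printf("lane %d: %d\n", I, Mask[I] ? Q[I] : 0 /* result unused */);
}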
void VPRecipeBuilder::fixHeaderPhis() {
@@ -8354,9 +8469,7 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
[&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
Range);
- bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
- [&](ElementCount VF) { return CM.isPredicatedInst(I, VF); },
- Range);
+ bool IsPredicated = CM.isPredicatedInst(I);
// Even if the instruction is not marked as uniform, there are certain
// intrinsic calls that can be effectively treated as such, so we check for
@@ -8396,11 +8509,12 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
// value. Avoid hoisting the insert-element which packs the scalar value into
// a vector value, as that happens iff all users use the vector value.
for (VPValue *Op : Recipe->operands()) {
- auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
+ auto *PredR =
+ dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDefiningRecipe());
if (!PredR)
continue;
- auto *RepR =
- cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
+ auto *RepR = cast<VPReplicateRecipe>(
+ PredR->getOperand(0)->getDefiningRecipe());
assert(RepR->isPredicated() &&
"expected Replicate recipe to be predicated");
RepR->setAlsoPack(false);
@@ -8469,20 +8583,26 @@ VPRecipeBuilder::createReplicateRegion(VPReplicateRecipe *PredRecipe,
VPRecipeOrVPValueTy
VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
ArrayRef<VPValue *> Operands,
- VFRange &Range, VPlanPtr &Plan) {
+ VFRange &Range, VPBasicBlock *VPBB,
+ VPlanPtr &Plan) {
// First, check for specific widening recipes that deal with inductions, Phi
// nodes, calls and memory operations.
VPRecipeBase *Recipe;
if (auto Phi = dyn_cast<PHINode>(Instr)) {
if (Phi->getParent() != OrigLoop->getHeader())
return tryToBlend(Phi, Operands, Plan);
+
+ // Always record recipes for header phis. Later first-order recurrence phis
+ // can have earlier phis as incoming values.
+ recordRecipeOf(Phi);
+
if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range)))
return toVPRecipeResult(Recipe);
VPHeaderPHIRecipe *PhiRecipe = nullptr;
assert((Legal->isReductionVariable(Phi) ||
- Legal->isFirstOrderRecurrence(Phi)) &&
- "can only widen reductions and first-order recurrences here");
+ Legal->isFixedOrderRecurrence(Phi)) &&
+ "can only widen reductions and fixed-order recurrences here");
VPValue *StartV = Operands[0];
if (Legal->isReductionVariable(Phi)) {
const RecurrenceDescriptor &RdxDesc =
@@ -8493,13 +8613,21 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
CM.isInLoopReduction(Phi),
CM.useOrderedReductions(RdxDesc));
} else {
+ // TODO: Currently fixed-order recurrences are modeled as chains of
+ // first-order recurrences. If there are no users of the intermediate
+ // recurrences in the chain, the fixed order recurrence should be modeled
+ // directly, enabling more efficient codegen.
PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
}
// Record the incoming value from the backedge, so we can add the incoming
// value from the backedge after all recipes have been created.
- recordRecipeOf(cast<Instruction>(
- Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
+ auto *Inc = cast<Instruction>(
+ Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
+ auto RecipeIter = Ingredient2Recipe.find(Inc);
+ if (RecipeIter == Ingredient2Recipe.end())
+ recordRecipeOf(Inc);
+
PhisToFix.push_back(PhiRecipe);
return toVPRecipeResult(PhiRecipe);
}
@@ -8534,7 +8662,7 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
*SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
}
- return toVPRecipeResult(tryToWiden(Instr, Operands));
+ return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan));
}
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
@@ -8564,7 +8692,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
assert(
SinkTarget != FirstInst &&
"Must find a live instruction (at least the one feeding the "
- "first-order recurrence PHI) before reaching beginning of the block");
+ "fixed-order recurrence PHI) before reaching beginning of the block");
SinkTarget = SinkTarget->getPrevNode();
assert(SinkTarget != P.first &&
"sink source equals target, no sinking required");
@@ -8696,18 +8824,18 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
// Mark instructions we'll need to sink later and their targets as
// ingredients whose recipe we'll need to record.
- for (auto &Entry : SinkAfter) {
+ for (const auto &Entry : SinkAfter) {
RecipeBuilder.recordRecipeOf(Entry.first);
RecipeBuilder.recordRecipeOf(Entry.second);
}
- for (auto &Reduction : CM.getInLoopReductionChains()) {
+ for (const auto &Reduction : CM.getInLoopReductionChains()) {
PHINode *Phi = Reduction.first;
RecurKind Kind =
Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
RecipeBuilder.recordRecipeOf(Phi);
- for (auto &R : ReductionOperations) {
+ for (const auto &R : ReductionOperations) {
RecipeBuilder.recordRecipeOf(R);
// For min/max reductions, where we have a pair of icmp/select, we also
// need to record the ICmp recipe, so it can be removed later.
@@ -8805,14 +8933,14 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
continue;
if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
- Instr, Operands, Range, Plan)) {
+ Instr, Operands, Range, VPBB, Plan)) {
// If Instr can be simplified to an existing VPValue, use it.
if (RecipeOrValue.is<VPValue *>()) {
auto *VPV = RecipeOrValue.get<VPValue *>();
Plan->addVPValue(Instr, VPV);
// If the re-used value is a recipe, register the recipe for the
// instruction, in case the recipe for Instr needs to be recorded.
- if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef()))
+ if (VPRecipeBase *R = VPV->getDefiningRecipe())
RecipeBuilder.setRecipe(Instr, R);
continue;
}
@@ -8854,11 +8982,6 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
}
- HeaderVPBB->setName("vector.body");
-
- // Fold the last, empty block into its predecessor.
- VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB);
- assert(VPBB && "expected to fold last (empty) block");
// After here, VPBB should not be used.
VPBB = nullptr;
@@ -8888,7 +9011,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
}
return nullptr;
};
- for (auto &Entry : SinkAfter) {
+ for (const auto &Entry : SinkAfter) {
VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
@@ -8949,14 +9072,19 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
RecipeBuilder, Range.Start);
// Introduce a recipe to combine the incoming and previous values of a
- // first-order recurrence.
+ // fixed-order recurrence.
for (VPRecipeBase &R :
Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
if (!RecurPhi)
continue;
- VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe();
+ VPRecipeBase *PrevRecipe = &RecurPhi->getBackedgeRecipe();
+ // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
+ // to terminate.
+ while (auto *PrevPhi =
+ dyn_cast<VPFirstOrderRecurrencePHIRecipe>(PrevRecipe))
+ PrevRecipe = &PrevPhi->getBackedgeRecipe();
VPBasicBlock *InsertBlock = PrevRecipe->getParent();
auto *Region = GetReplicateRegion(PrevRecipe);
if (Region)
@@ -8983,7 +9111,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
// Interleave memory: for each Interleave Group we marked earlier as relevant
// for this VPlan, replace the Recipes widening its memory instructions with a
// single VPInterleaveRecipe at its insertion point.
- for (auto IG : InterleaveGroups) {
+ for (const auto *IG : InterleaveGroups) {
auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
RecipeBuilder.getRecipe(IG->getInsertPos()));
SmallVector<VPValue *, 4> StoredValues;
@@ -9011,33 +9139,28 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
}
}
- std::string PlanName;
- raw_string_ostream RSO(PlanName);
- ElementCount VF = Range.Start;
- Plan->addVF(VF);
- RSO << "Initial VPlan for VF={" << VF;
- for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
+ for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
+ VF *= 2)
Plan->addVF(VF);
- RSO << "," << VF;
- }
- RSO << "},UF>=1";
- RSO.flush();
- Plan->setName(PlanName);
+ Plan->setName("Initial VPlan");
// From this point onwards, VPlan-to-VPlan transformations may change the plan
// in ways that accessing values using original IR values is incorrect.
Plan->disableValue2VPValue();
VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE());
- VPlanTransforms::sinkScalarOperands(*Plan);
VPlanTransforms::removeDeadRecipes(*Plan);
- VPlanTransforms::mergeReplicateRegions(*Plan);
- VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan);
- // Fold Exit block into its predecessor if possible.
- // TODO: Fold block earlier once all VPlan transforms properly maintain a
- // VPBasicBlock as exit.
- VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExiting());
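+  // Apply the sinking and merging transforms repeatedly until a fixed point
+  // is reached, since each transform can enable the others.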
+ bool ShouldSimplify = true;
+ while (ShouldSimplify) {
+ ShouldSimplify = VPlanTransforms::sinkScalarOperands(*Plan);
+ ShouldSimplify |=
+ VPlanTransforms::mergeReplicateRegionsIntoSuccessors(*Plan);
+ ShouldSimplify |= VPlanTransforms::mergeBlocksIntoPredecessors(*Plan);
+ }
+
+ VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan);
+ VPlanTransforms::mergeBlocksIntoPredecessors(*Plan);
assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
return Plan;
@@ -9066,7 +9189,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
VPlanTransforms::VPInstructionsToVPRecipes(
OrigLoop, Plan,
[this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
- DeadInstructions, *PSE.getSE());
+ DeadInstructions, *PSE.getSE(), *TLI);
// Remove the existing terminator of the exiting block of the top-most region.
// A BranchOnCount will be added instead when adding the canonical IV recipes.
@@ -9087,7 +9210,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
void LoopVectorizationPlanner::adjustRecipesForReductions(
VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
ElementCount MinVF) {
- for (auto &Reduction : CM.getInLoopReductionChains()) {
+ for (const auto &Reduction : CM.getInLoopReductionChains()) {
PHINode *Phi = Reduction.first;
const RecurrenceDescriptor &RdxDesc =
Legal->getReductionVars().find(Phi)->second;
@@ -9127,9 +9250,13 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
- auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent())
- ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
- : nullptr;
+ VPValue *CondOp = nullptr;
+ if (CM.blockNeedsPredicationForAnyReason(R->getParent())) {
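+        // Position the builder at the widened recipe so that any recipes
+        // created for the block-in mask are inserted before it.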
+ VPBuilder::InsertPointGuard Guard(Builder);
+ Builder.setInsertPoint(WidenRecipe->getParent(),
+ WidenRecipe->getIterator());
+ CondOp = RecipeBuilder.createBlockInMask(R->getParent(), Plan);
+ }
if (IsFMulAdd) {
// If the instruction is a call to the llvm.fmuladd intrinsic then we
@@ -9179,7 +9306,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
VPValue *Cond =
RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
VPValue *Red = PhiR->getBackedgeValue();
- assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB &&
+ assert(Red->getDefiningRecipe()->getParent() != LatchVPBB &&
"reduction recipe must be defined before latch");
Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
}
@@ -9217,11 +9344,6 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
-void VPWidenCallRecipe::execute(VPTransformState &State) {
- State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
- *this, State);
-}
-
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Int or FP induction being replicated.");
@@ -9353,8 +9475,7 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
PartStart, ConstantInt::get(PtrInd->getType(), Lane));
Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx);
- Value *Step = CreateStepValue(IndDesc.getStep(), SE,
- State.CFG.PrevBB->getTerminator());
+ Value *Step = State.get(getOperand(1), VPIteration(0, Part));
Value *SclrGep = emitTransformedIndex(
State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc);
SclrGep->setName("next.gep");
@@ -9378,12 +9499,9 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
// A pointer induction, performed by using a gep
- const DataLayout &DL = NewPointerPhi->getModule()->getDataLayout();
Instruction *InductionLoc = &*State.Builder.GetInsertPoint();
- const SCEV *ScalarStep = IndDesc.getStep();
- SCEVExpander Exp(SE, DL, "induction");
- Value *ScalarStepValue = Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
+ Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
Value *NumUnrolledElems =
State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
@@ -9411,6 +9529,8 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
StartOffset = State.Builder.CreateAdd(
StartOffset, State.Builder.CreateStepVector(VecPhiType));
+ assert(ScalarStepValue == State.get(getOperand(1), VPIteration(0, Part)) &&
+ "scalar step must be the same across all parts");
Value *GEP = State.Builder.CreateGEP(
IndDesc.getElementType(), NewPointerPhi,
State.Builder.CreateMul(
@@ -9421,8 +9541,8 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
}
}
-void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
- assert(!State.Instance && "VPScalarIVStepsRecipe being replicated.");
+void VPDerivedIVRecipe::execute(VPTransformState &State) {
+ assert(!State.Instance && "VPDerivedIVRecipe being replicated.");
// Fast-math-flags propagate from the original induction instruction.
IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
@@ -9432,52 +9552,33 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
IndDesc.getInductionBinOp()->getFastMathFlags());
Value *Step = State.get(getStepValue(), VPIteration(0, 0));
- auto CreateScalarIV = [&](Value *&Step) -> Value * {
- Value *ScalarIV = State.get(getCanonicalIV(), VPIteration(0, 0));
- auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0);
- if (!isCanonical() || CanonicalIV->getType() != Ty) {
- ScalarIV =
- Ty->isIntegerTy()
- ? State.Builder.CreateSExtOrTrunc(ScalarIV, Ty)
- : State.Builder.CreateCast(Instruction::SIToFP, ScalarIV, Ty);
- ScalarIV = emitTransformedIndex(State.Builder, ScalarIV,
- getStartValue()->getLiveInIRValue(), Step,
- IndDesc);
- ScalarIV->setName("offset.idx");
- }
- if (TruncToTy) {
- assert(Step->getType()->isIntegerTy() &&
- "Truncation requires an integer step");
- ScalarIV = State.Builder.CreateTrunc(ScalarIV, TruncToTy);
- Step = State.Builder.CreateTrunc(Step, TruncToTy);
- }
- return ScalarIV;
- };
-
- Value *ScalarIV = CreateScalarIV(Step);
- if (State.VF.isVector()) {
- buildScalarSteps(ScalarIV, Step, IndDesc, this, State);
- return;
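+  // Compute the derived IV by transforming the canonical IV using the start
+  // value and step of the original induction.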
+ Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0));
+ Value *DerivedIV =
+ emitTransformedIndex(State.Builder, CanonicalIV,
+ getStartValue()->getLiveInIRValue(), Step, IndDesc);
+ DerivedIV->setName("offset.idx");
+ if (ResultTy != DerivedIV->getType()) {
+ assert(Step->getType()->isIntegerTy() &&
+ "Truncation requires an integer step");
+ DerivedIV = State.Builder.CreateTrunc(DerivedIV, ResultTy);
}
+ assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
- for (unsigned Part = 0; Part < State.UF; ++Part) {
- assert(!State.VF.isScalable() && "scalable vectors not yet supported.");
- Value *EntryPart;
- if (Step->getType()->isFloatingPointTy()) {
- Value *StartIdx =
- getRuntimeVFAsFloat(State.Builder, Step->getType(), State.VF * Part);
- // Floating-point operations inherit FMF via the builder's flags.
- Value *MulOp = State.Builder.CreateFMul(StartIdx, Step);
- EntryPart = State.Builder.CreateBinOp(IndDesc.getInductionOpcode(),
- ScalarIV, MulOp);
- } else {
- Value *StartIdx =
- getRuntimeVF(State.Builder, Step->getType(), State.VF * Part);
- EntryPart = State.Builder.CreateAdd(
- ScalarIV, State.Builder.CreateMul(StartIdx, Step), "induction");
- }
- State.set(this, EntryPart, Part);
- }
+ State.set(this, DerivedIV, VPIteration(0, 0));
+}
+
+void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
+ // Fast-math-flags propagate from the original induction instruction.
+ IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
+ if (IndDesc.getInductionBinOp() &&
+ isa<FPMathOperator>(IndDesc.getInductionBinOp()))
+ State.Builder.setFastMathFlags(
+ IndDesc.getInductionBinOp()->getFastMathFlags());
+
+ Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
+ Value *Step = State.get(getStepValue(), VPIteration(0, 0));
+
+ buildScalarSteps(BaseIV, Step, IndDesc, this, State);
}
void VPInterleaveRecipe::execute(VPTransformState &State) {
@@ -9536,9 +9637,10 @@ void VPReductionRecipe::execute(VPTransformState &State) {
}
void VPReplicateRecipe::execute(VPTransformState &State) {
+ Instruction *UI = getUnderlyingInstr();
if (State.Instance) { // Generate a single instance.
assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
- State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
+ State.ILV->scalarizeInstruction(UI, this, *State.Instance,
IsPredicated, State);
// Insert scalar instance packing it into a vector.
if (AlsoPack && State.VF.isVector()) {
@@ -9546,7 +9648,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
if (State.Instance->Lane.isFirstLane()) {
assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
Value *Poison = PoisonValue::get(
- VectorType::get(getUnderlyingValue()->getType(), State.VF));
+ VectorType::get(UI->getType(), State.VF));
State.set(this, Poison, State.Instance->Part);
}
State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
@@ -9555,12 +9657,36 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
}
if (IsUniform) {
+ // If the recipe is uniform across all parts (instead of just per VF), only
+ // generate a single instance.
+ if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
+ all_of(operands(), [](VPValue *Op) {
+ return Op->isDefinedOutsideVectorRegions();
+ })) {
+ State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), IsPredicated,
+ State);
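+      // Propagate the single generated value to the remaining parts so that
+      // users of this recipe find a value for every unrolled copy.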
+ if (user_begin() != user_end()) {
+ for (unsigned Part = 1; Part < State.UF; ++Part)
+ State.set(this, State.get(this, VPIteration(0, 0)),
+ VPIteration(Part, 0));
+ }
+ return;
+ }
+
// Uniform within VL means we need to generate lane 0 only for each
// unrolled copy.
for (unsigned Part = 0; Part < State.UF; ++Part)
- State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
- VPIteration(Part, 0), IsPredicated,
- State);
+ State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0),
+ IsPredicated, State);
+ return;
+ }
+
+  // A store of a loop varying value to a loop invariant address only
+  // needs the last copy of the store.
+ if (isa<StoreInst>(UI) && !getOperand(1)->hasDefiningRecipe()) {
+ auto Lane = VPLane::getLastLaneForVF(State.VF);
+    State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane),
+                                    IsPredicated, State);
return;
}
@@ -9569,9 +9695,8 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
const unsigned EndLane = State.VF.getKnownMinValue();
for (unsigned Part = 0; Part < State.UF; ++Part)
for (unsigned Lane = 0; Lane < EndLane; ++Lane)
- State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
- VPIteration(Part, Lane), IsPredicated,
- State);
+ State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane),
+ IsPredicated, State);
}
void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
@@ -9709,7 +9834,7 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
- LoopVectorizationLegality &LVL) {
+ LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
// 1) OptSize takes precedence over all other options, i.e. if this is set,
// don't look at hints or options, and don't request a scalar epilogue.
// (For PGSO, as shouldOptimizeForSize isn't currently accessible from
@@ -9744,7 +9869,7 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
};
// 4) if the TTI hook indicates this is profitable, request predication.
- if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, &LVL))
+ if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, &LVL, IAI))
return CM_ScalarEpilogueNotNeededUsePredicate;
return CM_ScalarEpilogueAllowed;
@@ -9770,15 +9895,14 @@ Value *VPTransformState::get(VPValue *Def, unsigned Part) {
return ScalarValue;
}
- auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
- bool IsUniform = RepR && RepR->isUniform();
+ bool IsUniform = vputils::isUniformAfterVectorization(Def);
unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
// Check if there is a scalar value for the selected lane.
if (!hasScalarValue(Def, {Part, LastLane})) {
- // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
- assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) ||
- isa<VPScalarIVStepsRecipe>(Def->getDef())) &&
+    // At the moment, VPWidenIntOrFpInductionRecipes and
+    // VPScalarIVStepsRecipes can also be uniform.
+ assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDefiningRecipe()) ||
+ isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe())) &&
"unexpected recipe found to be invariant");
IsUniform = true;
LastLane = 0;
@@ -9839,7 +9963,7 @@ static bool processLoopInVPlanNativePath(
InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
- F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
+ F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL, &IAI);
LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
&Hints, IAI);
@@ -9927,7 +10051,7 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
VectorizationFactor &VF,
- Optional<unsigned> VScale, Loop *L,
+ std::optional<unsigned> VScale, Loop *L,
ScalarEvolution &SE) {
InstructionCost CheckCost = Checks.getCost();
if (!CheckCost.isValid())
@@ -10075,7 +10199,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Check if it is legal to vectorize the loop.
LoopVectorizationRequirements Requirements;
- LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
+ LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
&Requirements, &Hints, DB, AC, BFI, PSI);
if (!LVL.canVectorize(EnableVPlanNativePath)) {
LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
@@ -10083,11 +10207,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
return false;
}
- // Check the function attributes and profiles to find out if this function
- // should be optimized for size.
- ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
- F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
-
// Entrance to the VPlan-native vectorization path. Outer loops are processed
// here. They may require CFG and instruction level transformations before
// even evaluating whether vectorization is profitable. Since we cannot modify
@@ -10099,6 +10218,22 @@ bool LoopVectorizePass::processLoop(Loop *L) {
assert(L->isInnermost() && "Inner loop expected.");
+ InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
+ bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
+
+ // If an override option has been passed in for interleaved accesses, use it.
+ if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
+ UseInterleaved = EnableInterleavedMemAccesses;
+
+ // Analyze interleaved memory accesses.
+ if (UseInterleaved)
+ IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
+
+ // Check the function attributes and profiles to find out if this function
+ // should be optimized for size.
+ ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
+ F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL, &IAI);
+
// Check the loop for a trip count threshold: vectorize loops with a tiny trip
// count by optimizing for size, to minimize overheads.
auto ExpectedTC = getSmallBestKnownTC(*SE, L);
@@ -10109,15 +10244,24 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
else {
- LLVM_DEBUG(dbgs() << "\n");
- SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
+ if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
+ LLVM_DEBUG(dbgs() << "\n");
+ SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
+ } else {
+ LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
+                             "small to be worth vectorizing.\n");
+ reportVectorizationFailure(
+            "The trip count is below the minimal threshold value.",
+ "loop trip count is too low, avoiding vectorization",
+ "LowTripCount", ORE, L);
+ Hints.emitRemarkWithHints();
+ return false;
+ }
}
}
- // Check the function attributes to see if implicit floats are allowed.
- // FIXME: This check doesn't seem possibly correct -- what if the loop is
- // an integer loop and the vector instructions selected are purely integer
- // vector instructions?
+ // Check the function attributes to see if implicit floats or vectors are
+ // allowed.
if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
reportVectorizationFailure(
"Can't vectorize when the NoImplicitFloat attribute is used",
@@ -10162,18 +10306,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
return false;
}
- bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
- InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
-
- // If an override option has been passed in for interleaved accesses, use it.
- if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
- UseInterleaved = EnableInterleavedMemAccesses;
-
- // Analyze interleaved memory accesses.
- if (UseInterleaved) {
- IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
- }
-
// Use the cost model.
LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
F, &Hints, IAI);
@@ -10188,7 +10320,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
unsigned UserIC = Hints.getInterleave();
// Plan how to best vectorize, return the best VF and its cost.
- Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
+ std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
VectorizationFactor VF = VectorizationFactor::Disabled();
unsigned IC = 1;
@@ -10198,7 +10330,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (MaybeVF) {
VF = *MaybeVF;
// Select the interleave count.
- IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
+ IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
unsigned SelectedIC = std::max(IC, UserIC);
// Optimistically generate runtime checks if they are needed. Drop them if
@@ -10360,16 +10492,39 @@ bool LoopVectorizePass::processLoop(Loop *L) {
VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
Header->setName("vec.epilog.vector.body");
- // Ensure that the start values for any VPReductionPHIRecipes are
- // updated before vectorising the epilogue loop.
+ // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
+ // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
+ // before vectorizing the epilogue loop.
for (VPRecipeBase &R : Header->phis()) {
+ if (isa<VPCanonicalIVPHIRecipe>(&R))
+ continue;
+
+ Value *ResumeV = nullptr;
+ // TODO: Move setting of resume values to prepareToExecute.
if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
- if (auto *Resume = MainILV.getReductionResumeValue(
- ReductionPhi->getRecurrenceDescriptor())) {
- VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(Resume);
- ReductionPhi->setOperand(0, StartVal);
+ ResumeV = MainILV.getReductionResumeValue(
+ ReductionPhi->getRecurrenceDescriptor());
+ } else {
+ // Create induction resume values for both widened pointer and
+ // integer/fp inductions and update the start value of the induction
+ // recipes to use the resume value.
+ PHINode *IndPhi = nullptr;
+ const InductionDescriptor *ID;
+ if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
+ IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
+ ID = &Ind->getInductionDescriptor();
+ } else {
+ auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
+ IndPhi = WidenInd->getPHINode();
+ ID = &WidenInd->getInductionDescriptor();
}
+
+ ResumeV = MainILV.createInductionResumeValue(
+ IndPhi, *ID, {EPI.MainLoopIterationCountCheck});
}
+ assert(ResumeV && "Must have a resume value");
+ VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(ResumeV);
+ cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
}
LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
@@ -10407,11 +10562,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
checkMixedPrecision(L, ORE);
}
- Optional<MDNode *> RemainderLoopID =
+ std::optional<MDNode *> RemainderLoopID =
makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
LLVMLoopVectorizeFollowupEpilogue});
if (RemainderLoopID) {
- L->setLoopID(RemainderLoopID.value());
+ L->setLoopID(*RemainderLoopID);
} else {
if (DisableRuntimeUnroll)
AddRuntimeUnrollDisableMetaData(L);
@@ -10427,8 +10582,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
LoopVectorizeResult LoopVectorizePass::runImpl(
Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
- DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
- std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
+ DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_,
OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
SE = &SE_;
LI = &LI_;
@@ -10436,9 +10590,8 @@ LoopVectorizeResult LoopVectorizePass::runImpl(
DT = &DT_;
BFI = &BFI_;
TLI = TLI_;
- AA = &AA_;
AC = &AC_;
- GetLAA = &GetLAA_;
+ LAIs = &LAIs_;
DB = &DB_;
ORE = &ORE_;
PSI = PSI_;
@@ -10461,7 +10614,7 @@ LoopVectorizeResult LoopVectorizePass::runImpl(
// legality and profitability checks. This means running the loop vectorizer
// will simplify all loops, regardless of whether anything end up being
// vectorized.
- for (auto &L : *LI)
+ for (const auto &L : *LI)
Changed |= CFGChanged |=
simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
@@ -10484,6 +10637,9 @@ LoopVectorizeResult LoopVectorizePass::runImpl(
Changed |= formLCSSARecursively(*L, *DT, LI, SE);
Changed |= CFGChanged |= processLoop(L);
+
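+      // Cached LoopAccessInfo may be stale after the loop was transformed.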
+ if (Changed)
+ LAIs->clear();
}
// Process each loop nest in the function.
@@ -10502,23 +10658,16 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- auto &AA = AM.getResult<AAManager>(F);
auto &AC = AM.getResult<AssumptionAnalysis>(F);
auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
- std::function<const LoopAccessInfo &(Loop &)> GetLAA =
- [&](Loop &L) -> const LoopAccessInfo & {
- LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
- TLI, TTI, nullptr, nullptr, nullptr};
- return LAM.getResult<LoopAccessAnalysis>(L, AR);
- };
+ LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
ProfileSummaryInfo *PSI =
MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
LoopVectorizeResult Result =
- runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
+ runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
if (!Result.MadeAnyChange)
return PreservedAnalyses::all();
PreservedAnalyses PA;