path: root/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
author    Dimitry Andric <dim@FreeBSD.org>    2024-07-27 23:34:35 +0000
committer Dimitry Andric <dim@FreeBSD.org>    2024-10-23 18:26:01 +0000
commit    0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583 (patch)
tree      6cf5ab1f05330c6773b1f3f64799d56a9c7a1faa /contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
parent    6b9f7133aba44189d9625c352bc2c2a59baf18ef (diff)
parent    ac9a064cb179f3425b310fa2847f8764ac970a4d (diff)
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2905
1 files changed, 1369 insertions, 1536 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index dd596c567cd4..6d28b8fabe42 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -59,7 +59,9 @@
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanHCFGBuilder.h"
+#include "VPlanPatternMatch.h"
#include "VPlanTransforms.h"
+#include "VPlanVerifier.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
@@ -123,6 +125,7 @@
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/VectorBuilder.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
@@ -202,6 +205,11 @@ static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
"vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
cl::desc("The maximum allowed number of runtime memory checks"));
+static cl::opt<bool> UseLegacyCostModel(
+ "vectorize-use-legacy-cost-model", cl::init(false), cl::Hidden,
+ cl::desc("Use the legacy cost model instead of the VPlan-based cost model. "
+ "This option will be removed in the future."));
+
// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
// that predication is preferred, and this lists all options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
@@ -247,10 +255,12 @@ static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
"Create lane mask using active.lane.mask intrinsic, and use "
"it for both data and control flow"),
- clEnumValN(
- TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
- "data-and-control-without-rt-check",
- "Similar to data-and-control, but remove the runtime check")));
+ clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
+ "data-and-control-without-rt-check",
+ "Similar to data-and-control, but remove the runtime check"),
+ clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
+ "Use predicated EVL instructions for tail folding. If EVL "
+ "is unsupported, fallback to data-without-lane-mask.")));
static cl::opt<bool> MaximizeBandwidth(
"vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
@@ -267,11 +277,6 @@ static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
"enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
-static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
- "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
- cl::desc("We don't interleave loops with a estimated constant trip count "
- "below this number"));
-
static cl::opt<unsigned> ForceTargetNumScalarRegs(
"force-target-num-scalar-regs", cl::init(0), cl::Hidden,
cl::desc("A flag that overrides the target's number of scalar registers."));
@@ -290,7 +295,7 @@ static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
cl::desc("A flag that overrides the target's max interleave factor for "
"vectorized loops."));
-static cl::opt<unsigned> ForceTargetInstructionCost(
+cl::opt<unsigned> ForceTargetInstructionCost(
"force-target-instruction-cost", cl::init(0), cl::Hidden,
cl::desc("A flag that overrides the target's expected cost for "
"an instruction to a single constant value. Mostly "
@@ -319,12 +324,6 @@ static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
cl::desc(
"Enable runtime interleaving until load/store ports are saturated"));
-/// Interleave small loops with scalar reductions.
-static cl::opt<bool> InterleaveSmallLoopScalarReduction(
- "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
- cl::desc("Enable interleaving for loops with small iteration counts that "
- "contain scalar reductions to expose ILP."));
-
/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
"vectorize-num-stores-pred", cl::init(1), cl::Hidden,
@@ -418,14 +417,6 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
-/// A helper function that returns the reciprocal of the block probability of
-/// predicated blocks. If we return X, we are assuming the predicated block
-/// will execute once for every X iterations of the loop header.
-///
-/// TODO: We should use actual block probability here, if available. Currently,
-/// we always assume predicated blocks have a 50% chance of executing.
-static unsigned getReciprocalPredBlockProb() { return 2; }
-
/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
/// 1) Returns exact trip count if it is known.
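The helper removed in the hunk above encodes the assumption, noted in its TODO, that a predicated block executes on roughly half of the header iterations, so its cost is discounted by the reciprocal probability 2. A small sketch of that arithmetic, with made-up cost numbers:

#include <cassert>

// Assumption (from the removed comment): predicated blocks run with 50%
// probability, so the reciprocal of the block probability is 2.
static unsigned getReciprocalPredBlockProbModel() { return 2; }

int main() {
  unsigned ScalarizedBlockCost = 20; // hypothetical cost of the block
  unsigned Discounted =
      ScalarizedBlockCost / getReciprocalPredBlockProbModel(); // 20 / 2
  assert(Discounted == 10);
  return 0;
}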
@@ -450,37 +441,6 @@ static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
return std::nullopt;
}
-/// Return a vector containing interleaved elements from multiple
-/// smaller input vectors.
-static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
- const Twine &Name) {
- unsigned Factor = Vals.size();
- assert(Factor > 1 && "Tried to interleave invalid number of vectors");
-
- VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
-#ifndef NDEBUG
- for (Value *Val : Vals)
- assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
-#endif
-
- // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
- // must use intrinsics to interleave.
- if (VecTy->isScalableTy()) {
- VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
- return Builder.CreateIntrinsic(
- WideVecTy, Intrinsic::experimental_vector_interleave2, Vals,
- /*FMFSource=*/nullptr, Name);
- }
-
- // Fixed length. Start by concatenating all vectors into a wide vector.
- Value *WideVec = concatenateVectors(Builder, Vals);
-
- // Interleave the elements into the wide vector.
- const unsigned NumElts = VecTy->getElementCount().getFixedValue();
- return Builder.CreateShuffleVector(
- WideVec, createInterleaveMask(NumElts, Factor), Name);
-}
-
namespace {
// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;
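The removed interleaveVectors helper above relies on createInterleaveMask for fixed-width vectors. The standalone sketch below reproduces that mask shape (lane layout only; it is a model, not the LLVM utility itself) to make the shuffle concrete.

#include <cassert>
#include <vector>

// For VF lanes per input and Factor inputs concatenated into one wide
// vector, the interleave mask picks lane i of every input in turn:
// VF = 4, Factor = 2 -> <0, 4, 1, 5, 2, 6, 3, 7>.
std::vector<int> interleaveMaskModel(unsigned VF, unsigned Factor) {
  std::vector<int> Mask;
  for (unsigned I = 0; I < VF; ++I)
    for (unsigned J = 0; J < Factor; ++J)
      Mask.push_back(static_cast<int>(J * VF + I));
  return Mask;
}

int main() {
  std::vector<int> Expected = {0, 4, 1, 5, 2, 6, 3, 7};
  assert(interleaveMaskModel(4, 2) == Expected);
  return 0;
}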
@@ -552,11 +512,6 @@ public:
// Return true if any runtime check is added.
bool areSafetyChecksAdded() { return AddedSafetyChecks; }
- /// A type for vectorized values in the new loop. Each value from the
- /// original loop, when vectorized, is represented by UF vector values in the
- /// new unrolled loop, where UF is the unroll factor.
- using VectorParts = SmallVector<Value *, 2>;
-
/// A helper function to scalarize a single Instruction in the innermost loop.
/// Generates a sequence of scalar instances for each lane between \p MinLane
/// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
@@ -567,23 +522,9 @@ public:
const VPIteration &Instance,
VPTransformState &State);
- /// Try to vectorize interleaved access group \p Group with the base address
- /// given in \p Addr, optionally masking the vector operations if \p
- /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
- /// values in the vectorized loop.
- void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
- ArrayRef<VPValue *> VPDefs,
- VPTransformState &State, VPValue *Addr,
- ArrayRef<VPValue *> StoredValues,
- VPValue *BlockInMask, bool NeedsMaskForGaps);
-
/// Fix the non-induction PHIs in \p Plan.
void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
- /// Returns true if the reordering of FP operations is not allowed, but we are
- /// able to vectorize with strict in-order reductions for the given RdxDesc.
- bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
-
/// Create a new phi node for the induction variable \p OrigPhi to resume
/// iteration count in the scalar epilogue, from where the vectorized loop
/// left off. \p Step is the SCEV-expanded induction step to use. In cases
@@ -622,14 +563,6 @@ protected:
BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
VPlan &Plan, VPTransformState &State);
- /// Create the exit value of first order recurrences in the middle block and
- /// update their users.
- void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
- VPTransformState &State);
-
- /// Create code for the loop exit value of the reduction.
- void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
-
/// Iteratively sink the scalarized operands of a predicated instruction into
/// the block that was created for it.
void sinkScalarOperands(Instruction *PredInst);
@@ -637,11 +570,6 @@ protected:
/// Returns (and creates if needed) the trip count of the widened loop.
Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
- /// Returns a bitcasted value to the requested vector type.
- /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
- Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
- const DataLayout &DL);
-
/// Emit a bypass check to see if the vector trip count is zero, including if
/// it overflows.
void emitIterationCountCheck(BasicBlock *Bypass);
@@ -675,17 +603,6 @@ protected:
/// running the verifier. Return the preheader of the completed vector loop.
BasicBlock *completeLoopSkeleton();
- /// Collect poison-generating recipes that may generate a poison value that is
- /// used after vectorization, even when their operands are not poison. Those
- /// recipes meet the following conditions:
- /// * Contribute to the address computation of a recipe generating a widen
- /// memory load/store (VPWidenMemoryInstructionRecipe or
- /// VPInterleaveRecipe).
- /// * Such a widen memory load/store has at least one underlying Instruction
- /// that is in a basic block that needs predication and after vectorization
- /// the generated instruction won't be predicated.
- void collectPoisonGeneratingRecipes(VPTransformState &State);
-
/// Allow subclasses to override and print debug traces before/after vplan
/// execution, when trace information is requested.
virtual void printDebugTracesAtStart(){};
@@ -1028,9 +945,12 @@ void reportVectorizationFailure(const StringRef DebugMsg,
<< "loop not vectorized: " << OREMsg);
}
-void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
+/// Reports an informative message: print \p Msg for debugging purposes as well
+/// as an optimization remark. Uses either \p I as location of the remark, or
+/// otherwise \p TheLoop.
+static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
OptimizationRemarkEmitter *ORE, Loop *TheLoop,
- Instruction *I) {
+ Instruction *I = nullptr) {
LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
ORE->emit(
@@ -1057,108 +977,6 @@ static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
} // end namespace llvm
-#ifndef NDEBUG
-/// \return string containing a file name and a line # for the given loop.
-static std::string getDebugLocString(const Loop *L) {
- std::string Result;
- if (L) {
- raw_string_ostream OS(Result);
- if (const DebugLoc LoopDbgLoc = L->getStartLoc())
- LoopDbgLoc.print(OS);
- else
- // Just print the module name.
- OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
- OS.flush();
- }
- return Result;
-}
-#endif
-
-void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
- VPTransformState &State) {
-
- // Collect recipes in the backward slice of `Root` that may generate a poison
- // value that is used after vectorization.
- SmallPtrSet<VPRecipeBase *, 16> Visited;
- auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
- SmallVector<VPRecipeBase *, 16> Worklist;
- Worklist.push_back(Root);
-
- // Traverse the backward slice of Root through its use-def chain.
- while (!Worklist.empty()) {
- VPRecipeBase *CurRec = Worklist.back();
- Worklist.pop_back();
-
- if (!Visited.insert(CurRec).second)
- continue;
-
- // Prune search if we find another recipe generating a widen memory
- // instruction. Widen memory instructions involved in address computation
- // will lead to gather/scatter instructions, which don't need to be
- // handled.
- if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
- isa<VPInterleaveRecipe>(CurRec) ||
- isa<VPScalarIVStepsRecipe>(CurRec) ||
- isa<VPCanonicalIVPHIRecipe>(CurRec) ||
- isa<VPActiveLaneMaskPHIRecipe>(CurRec))
- continue;
-
- // This recipe contributes to the address computation of a widen
- // load/store. If the underlying instruction has poison-generating flags,
- // drop them directly.
- if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
- RecWithFlags->dropPoisonGeneratingFlags();
- } else {
- Instruction *Instr = dyn_cast_or_null<Instruction>(
- CurRec->getVPSingleValue()->getUnderlyingValue());
- (void)Instr;
- assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
- "found instruction with poison generating flags not covered by "
- "VPRecipeWithIRFlags");
- }
-
- // Add new definitions to the worklist.
- for (VPValue *operand : CurRec->operands())
- if (VPRecipeBase *OpDef = operand->getDefiningRecipe())
- Worklist.push_back(OpDef);
- }
- });
-
- // Traverse all the recipes in the VPlan and collect the poison-generating
- // recipes in the backward slice starting at the address of a VPWidenRecipe or
- // VPInterleaveRecipe.
- auto Iter = vp_depth_first_deep(State.Plan->getEntry());
- for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
- for (VPRecipeBase &Recipe : *VPBB) {
- if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
- Instruction &UnderlyingInstr = WidenRec->getIngredient();
- VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
- if (AddrDef && WidenRec->isConsecutive() &&
- Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
- collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
- } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
- VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
- if (AddrDef) {
- // Check if any member of the interleave group needs predication.
- const InterleaveGroup<Instruction> *InterGroup =
- InterleaveRec->getInterleaveGroup();
- bool NeedPredication = false;
- for (int I = 0, NumMembers = InterGroup->getNumMembers();
- I < NumMembers; ++I) {
- Instruction *Member = InterGroup->getMember(I);
- if (Member)
- NeedPredication |=
- Legal->blockNeedsPredication(Member->getParent());
- }
-
- if (NeedPredication)
- collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
- }
- }
- }
- }
-}
-
namespace llvm {
// Loop vectorization cost-model hints how the scalar epilogue loop should be
@@ -1222,7 +1040,7 @@ public:
bool selectUserVectorizationFactor(ElementCount UserVF) {
collectUniformsAndScalars(UserVF);
collectInstsToScalarize(UserVF);
- return expectedCost(UserVF).first.isValid();
+ return expectedCost(UserVF).isValid();
}
/// \return The size (in bits) of the smallest and widest types in the code
@@ -1298,11 +1116,9 @@ public:
bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
assert(VF.isVector() &&
"Profitable to scalarize relevant only for VF > 1.");
-
- // Cost model is not run in the VPlan-native path - return conservative
- // result until this changes.
- if (EnableVPlanNativePath)
- return false;
+ assert(
+ TheLoop->isInnermost() &&
+ "cost-model should not be used for outer loops (in VPlan-native path)");
auto Scalars = InstsToScalarize.find(VF);
assert(Scalars != InstsToScalarize.end() &&
@@ -1312,6 +1128,9 @@ public:
/// Returns true if \p I is known to be uniform after vectorization.
bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
+ assert(
+ TheLoop->isInnermost() &&
+ "cost-model should not be used for outer loops (in VPlan-native path)");
// Pseudo probe needs to be duplicated for each unrolled iteration and
// vector lane so that profiled loop trip count can be accurately
// accumulated instead of being under counted.
@@ -1321,11 +1140,6 @@ public:
if (VF.isScalar())
return true;
- // Cost model is not run in the VPlan-native path - return conservative
- // result until this changes.
- if (EnableVPlanNativePath)
- return false;
-
auto UniformsPerVF = Uniforms.find(VF);
assert(UniformsPerVF != Uniforms.end() &&
"VF not yet analyzed for uniformity");
@@ -1334,14 +1148,12 @@ public:
/// Returns true if \p I is known to be scalar after vectorization.
bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
+ assert(
+ TheLoop->isInnermost() &&
+ "cost-model should not be used for outer loops (in VPlan-native path)");
if (VF.isScalar())
return true;
- // Cost model is not run in the VPlan-native path - return conservative
- // result until this changes.
- if (EnableVPlanNativePath)
- return false;
-
auto ScalarsPerVF = Scalars.find(VF);
assert(ScalarsPerVF != Scalars.end() &&
"Scalar values are not calculated for VF");
@@ -1399,10 +1211,9 @@ public:
/// through the cost modeling.
InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
assert(VF.isVector() && "Expected VF to be a vector VF");
- // Cost model is not run in the VPlan-native path - return conservative
- // result until this changes.
- if (EnableVPlanNativePath)
- return CM_GatherScatter;
+ assert(
+ TheLoop->isInnermost() &&
+ "cost-model should not be used for outer loops (in VPlan-native path)");
std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
auto Itr = WideningDecisions.find(InstOnVF);
@@ -1570,29 +1381,40 @@ public:
/// Returns true if \p I is a memory instruction in an interleaved-group
/// of memory accesses that can be vectorized with wide vector loads/stores
/// and shuffles.
- bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF);
+ bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
/// Check if \p Instr belongs to any interleaved access group.
- bool isAccessInterleaved(Instruction *Instr) {
+ bool isAccessInterleaved(Instruction *Instr) const {
return InterleaveInfo.isInterleaved(Instr);
}
/// Get the interleaved access group that \p Instr belongs to.
const InterleaveGroup<Instruction> *
- getInterleavedAccessGroup(Instruction *Instr) {
+ getInterleavedAccessGroup(Instruction *Instr) const {
return InterleaveInfo.getInterleaveGroup(Instr);
}
/// Returns true if we're required to use a scalar epilogue for at least
/// the final iteration of the original loop.
bool requiresScalarEpilogue(bool IsVectorizing) const {
- if (!isScalarEpilogueAllowed())
+ if (!isScalarEpilogueAllowed()) {
+ LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
return false;
+ }
// If we might exit from anywhere but the latch, must run the exiting
// iteration in scalar form.
- if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
+ if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
+ LLVM_DEBUG(
+ dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
+ return true;
+ }
+ if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
+ LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
+ "interleaved group requires scalar epilogue\n");
return true;
- return IsVectorizing && InterleaveInfo.requiresScalarEpilogue();
+ }
+ LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
+ return false;
}
/// Returns true if we're required to use a scalar epilogue for at least
@@ -1617,19 +1439,67 @@ public:
}
/// Returns the TailFoldingStyle that is best for the current loop.
- TailFoldingStyle
- getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
- if (!CanFoldTailByMasking)
+ TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
+ if (!ChosenTailFoldingStyle)
return TailFoldingStyle::None;
+ return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
+ : ChosenTailFoldingStyle->second;
+ }
+
+ /// Selects and saves TailFoldingStyle for 2 options - if IV update may
+ /// overflow or not.
+ /// \param IsScalableVF true if scalable vector factors enabled.
+ /// \param UserIC User specific interleave count.
+ void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
+ assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
+ if (!Legal->canFoldTailByMasking()) {
+ ChosenTailFoldingStyle =
+ std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
+ return;
+ }
- if (ForceTailFoldingStyle.getNumOccurrences())
- return ForceTailFoldingStyle;
+ if (!ForceTailFoldingStyle.getNumOccurrences()) {
+ ChosenTailFoldingStyle = std::make_pair(
+ TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
+ TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
+ return;
+ }
- return TTI.getPreferredTailFoldingStyle(IVUpdateMayOverflow);
+ // Set styles when forced.
+ ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
+ ForceTailFoldingStyle.getValue());
+ if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL)
+ return;
+ // Override forced styles if needed.
+ // FIXME: use actual opcode/data type for analysis here.
+ // FIXME: Investigate opportunity for fixed vector factor.
+ bool EVLIsLegal =
+ IsScalableVF && UserIC <= 1 &&
+ TTI.hasActiveVectorLength(0, nullptr, Align()) &&
+ !EnableVPlanNativePath &&
+ // FIXME: implement support for max safe dependency distance.
+ Legal->isSafeForAnyVectorWidth();
+ if (!EVLIsLegal) {
+ // If for some reason EVL mode is unsupported, fallback to
+ // DataWithoutLaneMask to try to vectorize the loop with folded tail
+ // in a generic way.
+ ChosenTailFoldingStyle =
+ std::make_pair(TailFoldingStyle::DataWithoutLaneMask,
+ TailFoldingStyle::DataWithoutLaneMask);
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Preference for VP intrinsics indicated. Will "
+ "not try to generate VP Intrinsics "
+ << (UserIC > 1
+ ? "since interleave count specified is greater than 1.\n"
+ : "due to non-interleaving reasons.\n"));
+ }
}
/// Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailByMasking() const {
+ // TODO: check if it is possible to check for None style independent of
+ // IVUpdateMayOverflow flag in getTailFoldingStyle.
return getTailFoldingStyle() != TailFoldingStyle::None;
}
@@ -1640,6 +1510,12 @@ public:
return foldTailByMasking() || Legal->blockNeedsPredication(BB);
}
+ /// Returns true if VP intrinsics with explicit vector length support should
+ /// be generated in the tail folded loop.
+ bool foldTailWithEVL() const {
+ return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL;
+ }
+
/// Returns true if the Phi is part of an inloop reduction.
bool isInLoopReduction(PHINode *Phi) const {
return InLoopReductions.contains(Phi);
@@ -1663,20 +1539,13 @@ public:
Scalars.clear();
}
- /// The vectorization cost is a combination of the cost itself and a boolean
- /// indicating whether any of the contributing operations will actually
- /// operate on vector values after type legalization in the backend. If this
- /// latter value is false, then all operations will be scalarized (i.e. no
- /// vectorization has actually taken place).
- using VectorizationCostTy = std::pair<InstructionCost, bool>;
-
/// Returns the expected execution cost. The unit of the cost does
/// not matter because we use the 'cost' units to compare different
/// vector widths. The cost that is returned is *not* normalized by
/// the factor width. If \p Invalid is not nullptr, this function
/// will add a pair(Instruction*, ElementCount) to \p Invalid for
/// each instruction that has an Invalid cost for the given VF.
- VectorizationCostTy
+ InstructionCost
expectedCost(ElementCount VF,
SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
@@ -1687,6 +1556,16 @@ public:
/// \p VF is the vectorization factor chosen for the original loop.
bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
+ /// Returns the execution time cost of an instruction for a given vector
+ /// width. Vector width of one means scalar.
+ InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
+
+ /// Return the cost of instructions in an inloop reduction pattern, if I is
+ /// part of that pattern.
+ std::optional<InstructionCost>
+ getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
+ TTI::TargetCostKind CostKind) const;
+
private:
unsigned NumPredStores = 0;
@@ -1708,25 +1587,14 @@ private:
ElementCount MaxSafeVF,
bool FoldTailByMasking);
+ /// Checks if scalable vectorization is supported and enabled. Caches the
+ /// result to avoid repeated debug dumps for repeated queries.
+ bool isScalableVectorizationAllowed();
+
/// \return the maximum legal scalable VF, based on the safe max number
/// of elements.
ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
- /// Returns the execution time cost of an instruction for a given vector
- /// width. Vector width of one means scalar.
- VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
-
- /// The cost-computation logic from getInstructionCost which provides
- /// the vector type as an output parameter.
- InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
- Type *&VectorTy);
-
- /// Return the cost of instructions in an inloop reduction pattern, if I is
- /// part of that pattern.
- std::optional<InstructionCost>
- getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
- TTI::TargetCostKind CostKind) const;
-
/// Calculate vectorization cost of memory instruction \p I.
InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
@@ -1782,8 +1650,13 @@ private:
/// iterations to execute in the scalar loop.
ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
- /// All blocks of loop are to be masked to fold tail of scalar iterations.
- bool CanFoldTailByMasking = false;
+ /// Control finally chosen tail folding style. The first element is used if
+ /// the IV update may overflow, the second element - if it does not.
+ std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
+ ChosenTailFoldingStyle;
+
+ /// true if scalable vectorization is supported and enabled.
+ std::optional<bool> IsScalableVectorizationAllowed;
/// A map holding scalar costs for different vectorization factors. The
/// presence of a cost for an instruction in the mapping indicates that the
@@ -2118,16 +1991,18 @@ public:
BestTripCount = *EstimatedTC;
}
+ BestTripCount = std::max(BestTripCount, 1U);
InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
// Let's ensure the cost is always at least 1.
NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
(InstructionCost::CostType)1);
- LLVM_DEBUG(dbgs()
- << "We expect runtime memory checks to be hoisted "
- << "out of the outer loop. Cost reduced from "
- << MemCheckCost << " to " << NewMemCheckCost << '\n');
+ if (BestTripCount > 1)
+ LLVM_DEBUG(dbgs()
+ << "We expect runtime memory checks to be hoisted "
+ << "out of the outer loop. Cost reduced from "
+ << MemCheckCost << " to " << NewMemCheckCost << '\n');
MemCheckCost = NewMemCheckCost;
}
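A small worked example of the adjusted memory-check cost logic above (numbers are hypothetical): the new std::max(BestTripCount, 1U) clamp guards against a zero divisor when no trip count estimate is available.

#include <algorithm>
#include <cassert>

// Model of the amortisation above: the one-off memory-check cost is
// spread over the loop's estimated trip count, clamped to >= 1 both for
// the divisor and for the resulting cost.
unsigned amortisedMemCheckCost(unsigned MemCheckCost, unsigned BestTripCount) {
  BestTripCount = std::max(BestTripCount, 1u); // new clamp
  unsigned NewCost = MemCheckCost / BestTripCount;
  return std::max(NewCost, 1u);                // cost is always at least 1
}

int main() {
  assert(amortisedMemCheckCost(8, 4) == 2); // hoisted check amortised
  assert(amortisedMemCheckCost(8, 0) == 8); // clamp avoids divide-by-zero
  assert(amortisedMemCheckCost(3, 8) == 1); // never drops below 1
  return 0;
}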
@@ -2207,7 +2082,7 @@ public:
BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
if (AddBranchWeights)
- setBranchWeights(BI, SCEVCheckBypassWeights);
+ setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false);
ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
return SCEVCheckBlock;
}
@@ -2235,7 +2110,7 @@ public:
BranchInst &BI =
*BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
if (AddBranchWeights) {
- setBranchWeights(BI, MemCheckBypassWeights);
+ setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false);
}
ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
MemCheckBlock->getTerminator()->setDebugLoc(
@@ -2472,276 +2347,6 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
return TTI.enableMaskedInterleavedAccessVectorization();
}
-// Try to vectorize the interleave group that \p Instr belongs to.
-//
-// E.g. Translate following interleaved load group (factor = 3):
-// for (i = 0; i < N; i+=3) {
-// R = Pic[i]; // Member of index 0
-// G = Pic[i+1]; // Member of index 1
-// B = Pic[i+2]; // Member of index 2
-// ... // do something to R, G, B
-// }
-// To:
-// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
-// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
-// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
-// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
-//
-// Or translate following interleaved store group (factor = 3):
-// for (i = 0; i < N; i+=3) {
-// ... do something to R, G, B
-// Pic[i] = R; // Member of index 0
-// Pic[i+1] = G; // Member of index 1
-// Pic[i+2] = B; // Member of index 2
-// }
-// To:
-// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
-// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
-// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
-// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
-// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
-void InnerLoopVectorizer::vectorizeInterleaveGroup(
- const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
- VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
- VPValue *BlockInMask, bool NeedsMaskForGaps) {
- Instruction *Instr = Group->getInsertPos();
- const DataLayout &DL = Instr->getModule()->getDataLayout();
-
- // Prepare for the vector type of the interleaved load/store.
- Type *ScalarTy = getLoadStoreType(Instr);
- unsigned InterleaveFactor = Group->getFactor();
- auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
-
- // Prepare for the new pointers.
- SmallVector<Value *, 2> AddrParts;
- unsigned Index = Group->getIndex(Instr);
-
- // TODO: extend the masked interleaved-group support to reversed access.
- assert((!BlockInMask || !Group->isReverse()) &&
- "Reversed masked interleave-group not supported.");
-
- Value *Idx;
- // If the group is reverse, adjust the index to refer to the last vector lane
- // instead of the first. We adjust the index from the first vector lane,
- // rather than directly getting the pointer for lane VF - 1, because the
- // pointer operand of the interleaved access is supposed to be uniform. For
- // uniform instructions, we're only required to generate a value for the
- // first vector lane in each unroll iteration.
- if (Group->isReverse()) {
- Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
- Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
- Idx = Builder.CreateMul(Idx, Builder.getInt32(Group->getFactor()));
- Idx = Builder.CreateAdd(Idx, Builder.getInt32(Index));
- Idx = Builder.CreateNeg(Idx);
- } else
- Idx = Builder.getInt32(-Index);
-
- for (unsigned Part = 0; Part < UF; Part++) {
- Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
- if (auto *I = dyn_cast<Instruction>(AddrPart))
- State.setDebugLocFrom(I->getDebugLoc());
-
- // Notice current instruction could be any index. Need to adjust the address
- // to the member of index 0.
- //
- // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
- // b = A[i]; // Member of index 0
- // Current pointer is pointed to A[i+1], adjust it to A[i].
- //
- // E.g. A[i+1] = a; // Member of index 1
- // A[i] = b; // Member of index 0
- // A[i+2] = c; // Member of index 2 (Current instruction)
- // Current pointer is pointed to A[i+2], adjust it to A[i].
-
- bool InBounds = false;
- if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
- InBounds = gep->isInBounds();
- AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
- AddrParts.push_back(AddrPart);
- }
-
- State.setDebugLocFrom(Instr->getDebugLoc());
- Value *PoisonVec = PoisonValue::get(VecTy);
-
- auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor](
- unsigned Part, Value *MaskForGaps) -> Value * {
- if (VF.isScalable()) {
- assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
- assert(InterleaveFactor == 2 &&
- "Unsupported deinterleave factor for scalable vectors");
- auto *BlockInMaskPart = State.get(BlockInMask, Part);
- SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
- auto *MaskTy =
- VectorType::get(Builder.getInt1Ty(), VF.getKnownMinValue() * 2, true);
- return Builder.CreateIntrinsic(
- MaskTy, Intrinsic::experimental_vector_interleave2, Ops,
- /*FMFSource=*/nullptr, "interleaved.mask");
- }
-
- if (!BlockInMask)
- return MaskForGaps;
-
- Value *BlockInMaskPart = State.get(BlockInMask, Part);
- Value *ShuffledMask = Builder.CreateShuffleVector(
- BlockInMaskPart,
- createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
- "interleaved.mask");
- return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
- MaskForGaps)
- : ShuffledMask;
- };
-
- // Vectorize the interleaved load group.
- if (isa<LoadInst>(Instr)) {
- Value *MaskForGaps = nullptr;
- if (NeedsMaskForGaps) {
- MaskForGaps =
- createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
- assert(MaskForGaps && "Mask for Gaps is required but it is null");
- }
-
- // For each unroll part, create a wide load for the group.
- SmallVector<Value *, 2> NewLoads;
- for (unsigned Part = 0; Part < UF; Part++) {
- Instruction *NewLoad;
- if (BlockInMask || MaskForGaps) {
- assert(useMaskedInterleavedAccesses(*TTI) &&
- "masked interleaved groups are not allowed.");
- Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
- NewLoad =
- Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
- GroupMask, PoisonVec, "wide.masked.vec");
- }
- else
- NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
- Group->getAlign(), "wide.vec");
- Group->addMetadata(NewLoad);
- NewLoads.push_back(NewLoad);
- }
-
- if (VecTy->isScalableTy()) {
- assert(InterleaveFactor == 2 &&
- "Unsupported deinterleave factor for scalable vectors");
-
- for (unsigned Part = 0; Part < UF; ++Part) {
- // Scalable vectors cannot use arbitrary shufflevectors (only splats),
- // so must use intrinsics to deinterleave.
- Value *DI = Builder.CreateIntrinsic(
- Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part],
- /*FMFSource=*/nullptr, "strided.vec");
- unsigned J = 0;
- for (unsigned I = 0; I < InterleaveFactor; ++I) {
- Instruction *Member = Group->getMember(I);
-
- if (!Member)
- continue;
-
- Value *StridedVec = Builder.CreateExtractValue(DI, I);
- // If this member has different type, cast the result type.
- if (Member->getType() != ScalarTy) {
- VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
- StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
- }
-
- if (Group->isReverse())
- StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
-
- State.set(VPDefs[J], StridedVec, Part);
- ++J;
- }
- }
-
- return;
- }
-
- // For each member in the group, shuffle out the appropriate data from the
- // wide loads.
- unsigned J = 0;
- for (unsigned I = 0; I < InterleaveFactor; ++I) {
- Instruction *Member = Group->getMember(I);
-
- // Skip the gaps in the group.
- if (!Member)
- continue;
-
- auto StrideMask =
- createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
- for (unsigned Part = 0; Part < UF; Part++) {
- Value *StridedVec = Builder.CreateShuffleVector(
- NewLoads[Part], StrideMask, "strided.vec");
-
- // If this member has different type, cast the result type.
- if (Member->getType() != ScalarTy) {
- assert(!VF.isScalable() && "VF is assumed to be non scalable.");
- VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
- StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
- }
-
- if (Group->isReverse())
- StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
-
- State.set(VPDefs[J], StridedVec, Part);
- }
- ++J;
- }
- return;
- }
-
- // The sub vector type for current instruction.
- auto *SubVT = VectorType::get(ScalarTy, VF);
-
- // Vectorize the interleaved store group.
- Value *MaskForGaps =
- createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
- assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
- "masked interleaved groups are not allowed.");
- assert((!MaskForGaps || !VF.isScalable()) &&
- "masking gaps for scalable vectors is not yet supported.");
- for (unsigned Part = 0; Part < UF; Part++) {
- // Collect the stored vector from each member.
- SmallVector<Value *, 4> StoredVecs;
- unsigned StoredIdx = 0;
- for (unsigned i = 0; i < InterleaveFactor; i++) {
- assert((Group->getMember(i) || MaskForGaps) &&
- "Fail to get a member from an interleaved store group");
- Instruction *Member = Group->getMember(i);
-
- // Skip the gaps in the group.
- if (!Member) {
- Value *Undef = PoisonValue::get(SubVT);
- StoredVecs.push_back(Undef);
- continue;
- }
-
- Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
- ++StoredIdx;
-
- if (Group->isReverse())
- StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
-
- // If this member has different type, cast it to a unified type.
-
- if (StoredVec->getType() != SubVT)
- StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
-
- StoredVecs.push_back(StoredVec);
- }
-
- // Interleave all the smaller vectors into one wider vector.
- Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec");
- Instruction *NewStoreInstr;
- if (BlockInMask || MaskForGaps) {
- Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
- NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
- Group->getAlign(), GroupMask);
- } else
- NewStoreInstr =
- Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
-
- Group->addMetadata(NewStoreInstr);
- }
-}
-
void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
VPReplicateRecipe *RepRecipe,
const VPIteration &Instance,
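To make the RGB example in the removed comment above concrete, here is a standalone sketch of the stride masks used to de-interleave a factor-3 group at VF = 4 (mirroring the createStrideMask shape; the helper name here is illustrative).

#include <cassert>
#include <vector>

// Member I of an interleave group with factor F occupies lanes
// I, I + F, I + 2F, ... of the wide load; VF consecutive tuples are read.
// VF = 4, F = 3: R -> <0, 3, 6, 9>, G -> <1, 4, 7, 10>, B -> <2, 5, 8, 11>.
std::vector<int> strideMaskModel(unsigned Member, unsigned Factor,
                                 unsigned VF) {
  std::vector<int> Mask;
  for (unsigned I = 0; I < VF; ++I)
    Mask.push_back(static_cast<int>(Member + I * Factor));
  return Mask;
}

int main() {
  std::vector<int> RMask = {0, 3, 6, 9};
  std::vector<int> GMask = {1, 4, 7, 10};
  assert(strideMaskModel(0, 3, 4) == RMask);
  assert(strideMaskModel(1, 3, 4) == GMask);
  return 0;
}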
@@ -2822,9 +2427,8 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
if (Cost->foldTailByMasking()) {
assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
"VF*UF must be a power of 2 when folding tail by masking");
- Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
- TC = Builder.CreateAdd(
- TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
+ TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)),
+ "n.rnd.up");
}
// Now we need to generate the expression for the part of the loop that the
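A worked example of the rounded-up trip count computed above when folding the tail by masking (values are hypothetical): with TC = 10 and Step = VF * UF = 4, n.rnd.up = 10 + (4 - 1) = 13 and the vector trip count becomes 13 - 13 % 4 = 12, i.e. three masked vector iterations cover all ten elements. A minimal sketch:

#include <cassert>

// Model of getOrCreateVectorTripCount when the tail is folded:
// round the trip count up to the next multiple of Step = VF * UF.
unsigned vectorTripCountModel(unsigned TC, unsigned Step) {
  unsigned Rounded = TC + (Step - 1); // "n.rnd.up"
  return Rounded - Rounded % Step;    // N - N % Step
}

int main() {
  assert(vectorTripCountModel(10, 4) == 12); // 3 masked vector iterations
  assert(vectorTripCountModel(12, 4) == 12); // already a multiple of Step
  return 0;
}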
@@ -2850,37 +2454,6 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
return VectorTripCount;
}
-Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
- const DataLayout &DL) {
- // Verify that V is a vector type with same number of elements as DstVTy.
- auto *DstFVTy = cast<VectorType>(DstVTy);
- auto VF = DstFVTy->getElementCount();
- auto *SrcVecTy = cast<VectorType>(V->getType());
- assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
- Type *SrcElemTy = SrcVecTy->getElementType();
- Type *DstElemTy = DstFVTy->getElementType();
- assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
- "Vector elements must have same size");
-
- // Do a direct cast if element types are castable.
- if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
- return Builder.CreateBitOrPointerCast(V, DstFVTy);
- }
- // V cannot be directly casted to desired vector type.
- // May happen when V is a floating point vector but DstVTy is a vector of
- // pointers or vice-versa. Handle this using a two-step bitcast using an
- // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
- assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
- "Only one type should be a pointer type");
- assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
- "Only one type should be a floating point type");
- Type *IntTy =
- IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
- auto *VecIntTy = VectorType::get(IntTy, VF);
- Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
- return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
-}
-
void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
Value *Count = getTripCount();
// Reuse existing vector loop preheader for TC checks.
@@ -2943,16 +2516,10 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
// Update dominator for Bypass & LoopExit (if needed).
DT->changeImmediateDominator(Bypass, TCCheckBlock);
- if (!Cost->requiresScalarEpilogue(VF.isVector()))
- // If there is an epilogue which must run, there's no edge from the
- // middle block to exit blocks and thus no need to update the immediate
- // dominator of the exit blocks.
- DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
-
BranchInst &BI =
*BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
- setBranchWeights(BI, MinItersBypassWeights);
+ setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
LoopBypassBlocks.push_back(TCCheckBlock);
}
@@ -3034,33 +2601,6 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
LoopScalarPreHeader =
SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
nullptr, Twine(Prefix) + "scalar.ph");
-
- auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
-
- // Set up the middle block terminator. Two cases:
- // 1) If we know that we must execute the scalar epilogue, emit an
- // unconditional branch.
- // 2) Otherwise, we must have a single unique exit block (due to how we
- // implement the multiple exit case). In this case, set up a conditional
- // branch from the middle block to the loop scalar preheader, and the
- // exit block. completeLoopSkeleton will update the condition to use an
- // iteration check, if required to decide whether to execute the remainder.
- BranchInst *BrInst =
- Cost->requiresScalarEpilogue(VF.isVector())
- ? BranchInst::Create(LoopScalarPreHeader)
- : BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
- Builder.getTrue());
- BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
- ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
-
- // Update dominator for loop exit. During skeleton creation, only the vector
- // pre-header and the middle block are created. The vector loop is entirely
- // created during VPlan exection.
- if (!Cost->requiresScalarEpilogue(VF.isVector()))
- // If there is an epilogue which must run, there's no edge from the
- // middle block to exit blocks and thus no need to update the immediate
- // dominator of the exit blocks.
- DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
}
PHINode *InnerLoopVectorizer::createInductionResumeValue(
@@ -3100,7 +2640,7 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue(
// Create phi nodes to merge from the backedge-taken check block.
PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
- LoopScalarPreHeader->getTerminator());
+ LoopScalarPreHeader->getFirstNonPHI());
// Copy original phi DL over to the new one.
BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
@@ -3157,51 +2697,6 @@ void InnerLoopVectorizer::createInductionResumeValues(
}
}
-BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
- // The trip counts should be cached by now.
- Value *Count = getTripCount();
- Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
-
- auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
-
- // Add a check in the middle block to see if we have completed
- // all of the iterations in the first vector loop. Three cases:
- // 1) If we require a scalar epilogue, there is no conditional branch as
- // we unconditionally branch to the scalar preheader. Do nothing.
- // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
- // Thus if tail is to be folded, we know we don't need to run the
- // remainder and we can use the previous value for the condition (true).
- // 3) Otherwise, construct a runtime check.
- if (!Cost->requiresScalarEpilogue(VF.isVector()) &&
- !Cost->foldTailByMasking()) {
- // Here we use the same DebugLoc as the scalar loop latch terminator instead
- // of the corresponding compare because they may have ended up with
- // different line numbers and we want to avoid awkward line stepping while
- // debugging. Eg. if the compare has got a line number inside the loop.
- // TODO: At the moment, CreateICmpEQ will simplify conditions with constant
- // operands. Perform simplification directly on VPlan once the branch is
- // modeled there.
- IRBuilder<> B(LoopMiddleBlock->getTerminator());
- B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc());
- Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n");
- BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator());
- BI.setCondition(CmpN);
- if (hasBranchWeightMD(*ScalarLatchTerm)) {
- // Assume that `Count % VectorTripCount` is equally distributed.
- unsigned TripCount = UF * VF.getKnownMinValue();
- assert(TripCount > 0 && "trip count should not be zero");
- const uint32_t Weights[] = {1, TripCount - 1};
- setBranchWeights(BI, Weights);
- }
- }
-
-#ifdef EXPENSIVE_CHECKS
- assert(DT->verify(DominatorTree::VerificationLevel::Fast));
-#endif
-
- return LoopVectorPreHeader;
-}
-
std::pair<BasicBlock *, Value *>
InnerLoopVectorizer::createVectorizedLoopSkeleton(
const SCEV2ValueTy &ExpandedSCEVs) {
@@ -3224,17 +2719,18 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton(
| [ ]_| <-- vector loop (created during VPlan execution).
| |
| v
- \ -[ ] <--- middle-block.
+ \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to
+ | | successors created during VPlan execution)
\/ |
/\ v
- | ->[ ] <--- new preheader.
+ | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock).
| |
(opt) v <-- edge from middle to exit iff epilogue is not required.
| [ ] \
| [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
\ |
\ v
- >[ ] <-- exit block(s).
+ >[ ] <-- exit block(s). (wrapped in VPIRBasicBlock)
...
*/
@@ -3261,7 +2757,7 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton(
// Emit phis for the new starting index of the scalar loop.
createInductionResumeValues(ExpandedSCEVs);
- return {completeLoopSkeleton(), nullptr};
+ return {LoopVectorPreHeader, nullptr};
}
// Fix up external users of the induction variable. At this point, we are
@@ -3447,37 +2943,12 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
TargetTransformInfo::TCK_RecipThroughput);
}
-static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
- auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
- auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
- return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
-}
-
-static Type *largestIntegerVectorType(Type *T1, Type *T2) {
- auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
- auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
- return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
-}
-
void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
VPlan &Plan) {
// Fix widened non-induction PHIs by setting up the PHI operands.
if (EnableVPlanNativePath)
fixNonInductionPHIs(Plan, State);
- // At this point every instruction in the original loop is widened to a
- // vector form. Now we need to fix the recurrences in the loop. These PHI
- // nodes are currently empty because we did not want to introduce cycles.
- // This is the second stage of vectorizing recurrences. Note that fixing
- // reduction phis are already modeled in VPlan.
- // TODO: Also model fixing fixed-order recurrence phis in VPlan.
- VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
- VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
- for (VPRecipeBase &R : HeaderVPBB->phis()) {
- if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
- fixFixedOrderRecurrence(FOR, State);
- }
-
// Forget the original basic block.
PSE.getSE()->forgetLoop(OrigLoop);
PSE.getSE()->forgetBlockAndLoopDispositions();
@@ -3491,6 +2962,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
for (PHINode &PN : Exit->phis())
PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);
+ VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
if (Cost->requiresScalarEpilogue(VF.isVector())) {
@@ -3513,10 +2985,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
VectorLoop->getHeader(), Plan, State);
}
- // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
- // in the exit block, so update the builder.
- State.Builder.SetInsertPoint(State.CFG.ExitBB,
- State.CFG.ExitBB->getFirstNonPHIIt());
+ // Fix live-out phis not already fixed earlier.
for (const auto &KV : Plan.getLiveOuts())
KV.second->fixPhi(Plan, State);
@@ -3544,125 +3013,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
VF.getKnownMinValue() * UF);
}
-void InnerLoopVectorizer::fixFixedOrderRecurrence(
- VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
- // This is the second phase of vectorizing first-order recurrences. An
- // overview of the transformation is described below. Suppose we have the
- // following loop.
- //
- // for (int i = 0; i < n; ++i)
- // b[i] = a[i] - a[i - 1];
- //
- // There is a first-order recurrence on "a". For this loop, the shorthand
- // scalar IR looks like:
- //
- // scalar.ph:
- // s_init = a[-1]
- // br scalar.body
- //
- // scalar.body:
- // i = phi [0, scalar.ph], [i+1, scalar.body]
- // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
- // s2 = a[i]
- // b[i] = s2 - s1
- // br cond, scalar.body, ...
- //
- // In this example, s1 is a recurrence because it's value depends on the
- // previous iteration. In the first phase of vectorization, we created a
- // vector phi v1 for s1. We now complete the vectorization and produce the
- // shorthand vector IR shown below (for VF = 4, UF = 1).
- //
- // vector.ph:
- // v_init = vector(..., ..., ..., a[-1])
- // br vector.body
- //
- // vector.body
- // i = phi [0, vector.ph], [i+4, vector.body]
- // v1 = phi [v_init, vector.ph], [v2, vector.body]
- // v2 = a[i, i+1, i+2, i+3];
- // v3 = vector(v1(3), v2(0, 1, 2))
- // b[i, i+1, i+2, i+3] = v2 - v3
- // br cond, vector.body, middle.block
- //
- // middle.block:
- // x = v2(3)
- // br scalar.ph
- //
- // scalar.ph:
- // s_init = phi [x, middle.block], [a[-1], otherwise]
- // br scalar.body
- //
- // After execution completes the vector loop, we extract the next value of
- // the recurrence (x) to use as the initial value in the scalar loop.
-
- // Extract the last vector element in the middle block. This will be the
- // initial value for the recurrence when jumping to the scalar loop.
- VPValue *PreviousDef = PhiR->getBackedgeValue();
- Value *Incoming = State.get(PreviousDef, UF - 1);
- auto *ExtractForScalar = Incoming;
- auto *IdxTy = Builder.getInt32Ty();
- Value *RuntimeVF = nullptr;
- if (VF.isVector()) {
- auto *One = ConstantInt::get(IdxTy, 1);
- Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
- RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
- auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
- ExtractForScalar =
- Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract");
- }
-
- auto RecurSplice = cast<VPInstruction>(*PhiR->user_begin());
- assert(PhiR->getNumUsers() == 1 &&
- RecurSplice->getOpcode() ==
- VPInstruction::FirstOrderRecurrenceSplice &&
- "recurrence phi must have a single user: FirstOrderRecurrenceSplice");
- SmallVector<VPLiveOut *> LiveOuts;
- for (VPUser *U : RecurSplice->users())
- if (auto *LiveOut = dyn_cast<VPLiveOut>(U))
- LiveOuts.push_back(LiveOut);
-
- if (!LiveOuts.empty()) {
- // Extract the second last element in the middle block if the
- // Phi is used outside the loop. We need to extract the phi itself
- // and not the last element (the phi update in the current iteration). This
- // will be the value when jumping to the exit block from the
- // LoopMiddleBlock, when the scalar loop is not run at all.
- Value *ExtractForPhiUsedOutsideLoop = nullptr;
- if (VF.isVector()) {
- auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
- ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
- Incoming, Idx, "vector.recur.extract.for.phi");
- } else {
- assert(UF > 1 && "VF and UF cannot both be 1");
- // When loop is unrolled without vectorizing, initialize
- // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled
- // value of `Incoming`. This is analogous to the vectorized case above:
- // extracting the second last element when VF > 1.
- ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
- }
-
- for (VPLiveOut *LiveOut : LiveOuts) {
- assert(!Cost->requiresScalarEpilogue(VF.isVector()));
- PHINode *LCSSAPhi = LiveOut->getPhi();
- LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
- State.Plan->removeLiveOut(LCSSAPhi);
- }
- }
-
- // Fix the initial value of the original recurrence in the scalar loop.
- Builder.SetInsertPoint(LoopScalarPreHeader, LoopScalarPreHeader->begin());
- PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
- auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
- auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
- for (auto *BB : predecessors(LoopScalarPreHeader)) {
- auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
- Start->addIncoming(Incoming, BB);
- }
-
- Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
- Phi->setName("scalar.recur");
-}
-
void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
// The basic block and loop containing the predicated instruction.
auto *PredBB = PredInst->getParent();
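The long comment removed above describes the second phase of vectorizing a first-order recurrence; the removal suggests this is now modeled in VPlan (via FirstOrderRecurrenceSplice and the live-out handling). As a hedged illustration of the splice semantics for b[i] = a[i] - a[i-1] with VF = 4:

#include <array>
#include <cassert>

// Model of the splice for VF = 4: lane 0 of the result is the last lane
// of the previous vector iteration, lanes 1..3 are lanes 0..2 of the
// current one, i.e. v3 = vector(v1(3), v2(0, 1, 2)) in the removed
// comment's shorthand.
std::array<float, 4> spliceModel(const std::array<float, 4> &Prev,
                                 const std::array<float, 4> &Cur) {
  return {Prev[3], Cur[0], Cur[1], Cur[2]};
}

int main() {
  std::array<float, 4> Prev = {0, 1, 2, 3}; // a[i-4..i-1] from last iteration
  std::array<float, 4> Cur = {4, 5, 6, 7};  // a[i..i+3] for this iteration
  std::array<float, 4> Shifted = spliceModel(Prev, Cur);
  // b[i..i+3] = a[i..i+3] - a[i-1..i+2]
  for (unsigned L = 0; L < 4; ++L)
    assert(Cur[L] - Shifted[L] == 1.0f);
  return 0;
}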
@@ -3758,11 +3108,6 @@ void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
}
}
-bool InnerLoopVectorizer::useOrderedReductions(
- const RecurrenceDescriptor &RdxDesc) {
- return Cost->useOrderedReductions(RdxDesc);
-}
-
void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
// We should not collect Scalars more than once per VF. Right now, this
// function is called from collectUniformsAndScalars(), which already does
@@ -3928,6 +3273,13 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
if (!ScalarInd)
continue;
+ // If the induction variable update is a fixed-order recurrence, neither the
+ // induction variable or its update should be marked scalar after
+ // vectorization.
+ auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
+ if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi))
+ continue;
+
// Determine if all users of the induction variable update instruction are
// scalar after vectorization.
auto ScalarIndUpdate =
@@ -4100,7 +3452,7 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
}
bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
- Instruction *I, ElementCount VF) {
+ Instruction *I, ElementCount VF) const {
assert(isAccessInterleaved(I) && "Expecting interleaved access.");
assert(getWideningDecision(I, VF) == CM_Unknown &&
"Decision should not be set yet.");
@@ -4109,7 +3461,7 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
// If the instruction's allocated size doesn't equal it's type size, it
// requires padding and will be scalarized.
- auto &DL = I->getModule()->getDataLayout();
+ auto &DL = I->getDataLayout();
auto *ScalarTy = getLoadStoreType(I);
if (hasIrregularType(ScalarTy, DL))
return false;
@@ -4187,7 +3539,7 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
// If the instruction's allocated size doesn't equal it's type size, it
// requires padding and will be scalarized.
- auto &DL = I->getModule()->getDataLayout();
+ auto &DL = I->getDataLayout();
if (hasIrregularType(ScalarTy, DL))
return false;
@@ -4220,34 +3572,37 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
// Worklist containing uniform instructions demanding lane 0.
SetVector<Instruction *> Worklist;
- BasicBlock *Latch = TheLoop->getLoopLatch();
// Add uniform instructions demanding lane 0 to the worklist. Instructions
- // that are scalar with predication must not be considered uniform after
+ // that require predication must not be considered uniform after
// vectorization, because that would create an erroneous replicating region
// where only a single instance out of VF should be formed.
- // TODO: optimize such seldom cases if found important, see PR40816.
auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
if (isOutOfScope(I)) {
LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
<< *I << "\n");
return;
}
- if (isScalarWithPredication(I, VF)) {
- LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
- << *I << "\n");
+ if (isPredicatedInst(I)) {
+ LLVM_DEBUG(
+ dbgs() << "LV: Found not uniform due to requiring predication: " << *I
+ << "\n");
return;
}
LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
Worklist.insert(I);
};
- // Start with the conditional branch. If the branch condition is an
- // instruction contained in the loop that is only used by the branch, it is
- // uniform.
- auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
- if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
- addToWorklistIfAllowed(Cmp);
+ // Start with the conditional branches exiting the loop. If the branch
+ // condition is an instruction contained in the loop that is only used by the
+ // branch, it is uniform.
+ SmallVector<BasicBlock *> Exiting;
+ TheLoop->getExitingBlocks(Exiting);
+ for (BasicBlock *E : Exiting) {
+ auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
+ if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
+ addToWorklistIfAllowed(Cmp);
+ }
auto PrevVF = VF.divideCoefficientBy(2);
// Return true if all lanes perform the same memory operation, and we can
@@ -4388,6 +3743,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
// nodes separately. An induction variable will remain uniform if all users
// of the induction variable and induction variable update remain uniform.
// The code below handles both pointer and non-pointer induction variables.
+ BasicBlock *Latch = TheLoop->getLoopLatch();
for (const auto &Induction : Legal->getInductionVars()) {
auto *Ind = Induction.first;
auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
@@ -4454,15 +3810,18 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() {
return false;
}
-ElementCount
-LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
+bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
+ if (IsScalableVectorizationAllowed)
+ return *IsScalableVectorizationAllowed;
+
+ IsScalableVectorizationAllowed = false;
if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
- return ElementCount::getScalable(0);
+ return false;
if (Hints->isScalableVectorizationDisabled()) {
reportVectorizationInfo("Scalable vectorization is explicitly disabled",
"ScalableVectorizationDisabled", ORE, TheLoop);
- return ElementCount::getScalable(0);
+ return false;
}
LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
@@ -4482,7 +3841,7 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
"Scalable vectorization not supported for the reduction "
"operations found in this loop.",
"ScalableVFUnfeasible", ORE, TheLoop);
- return ElementCount::getScalable(0);
+ return false;
}
// Disable scalable vectorization if the loop contains any instructions
@@ -4494,17 +3853,33 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
reportVectorizationInfo("Scalable vectorization is not supported "
"for all element types found in this loop.",
"ScalableVFUnfeasible", ORE, TheLoop);
- return ElementCount::getScalable(0);
+ return false;
+ }
+
+ if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(*TheFunction, TTI)) {
+ reportVectorizationInfo("The target does not provide maximum vscale value "
+ "for safe distance analysis.",
+ "ScalableVFUnfeasible", ORE, TheLoop);
+ return false;
}
+ IsScalableVectorizationAllowed = true;
+ return true;
+}
+
+ElementCount
+LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
+ if (!isScalableVectorizationAllowed())
+ return ElementCount::getScalable(0);
+
+ auto MaxScalableVF = ElementCount::getScalable(
+ std::numeric_limits<ElementCount::ScalarTy>::max());
if (Legal->isSafeForAnyVectorWidth())
return MaxScalableVF;
+ std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
// Limit MaxScalableVF by the maximum safe dependence distance.
- if (std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI))
- MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
- else
- MaxScalableVF = ElementCount::getScalable(0);
+ MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
if (!MaxScalableVF)
reportVectorizationInfo(
@@ -4738,8 +4113,22 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
// found modulo the vectorization factor is not zero, try to fold the tail
// by masking.
// FIXME: look for a smaller MaxVF that does divide TC rather than masking.
- if (Legal->prepareToFoldTailByMasking()) {
- CanFoldTailByMasking = true;
+ setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
+ if (foldTailByMasking()) {
+ if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
+ "try to generate VP Intrinsics with scalable vector "
+ "factors only.\n");
+ // Tail folded loop using VP intrinsics restricts the VF to be scalable
+ // for now.
+ // TODO: extend it for fixed vectors, if required.
+ assert(MaxFactors.ScalableVF.isScalable() &&
+ "Expected scalable vector factor.");
+
+ MaxFactors.FixedVF = ElementCount::getFixed(1);
+ }
return MaxFactors;
}
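
A scalar sketch of what EVL-based tail folding amounts to conceptually: each vector iteration processes an explicit vector length of min(remaining, VLMAX) elements, so no mask or scalar remainder loop is needed. The saxpyEVL helper and VLMax parameter below are hypothetical; a real target would obtain the per-iteration length via something like RISC-V's vsetvli and execute the body with VP intrinsics.

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

// Conceptual model only: the inner loop stands in for a single predicated
// vector operation whose active length is EVL.
static void saxpyEVL(std::vector<float> &Y, const std::vector<float> &X,
                     float A, std::size_t VLMax) {
  std::size_t N = Y.size();
  for (std::size_t I = 0; I < N;) {
    std::size_t EVL = std::min(N - I, VLMax); // explicit vector length
    for (std::size_t L = 0; L < EVL; ++L)
      Y[I + L] += A * X[I + L];
    I += EVL; // one loop handles the tail, no scalar epilogue
  }
}

int main() {
  std::vector<float> Y(10, 1.0f), X(10, 2.0f);
  saxpyEVL(Y, X, 3.0f, /*VLMax=*/4); // 10 elements = 4 + 4 + 2
  for (float V : Y)
    assert(V == 7.0f);
  return 0;
}
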
@@ -4860,15 +4249,12 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
// Select the largest VF which doesn't require more registers than existing
// ones.
- for (int i = RUs.size() - 1; i >= 0; --i) {
- bool Selected = true;
- for (auto &pair : RUs[i].MaxLocalUsers) {
- unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
- if (pair.second > TargetNumRegisters)
- Selected = false;
- }
- if (Selected) {
- MaxVF = VFs[i];
+ for (int I = RUs.size() - 1; I >= 0; --I) {
+ const auto &MLU = RUs[I].MaxLocalUsers;
+ if (all_of(MLU, [&](decltype(MLU.front()) &LU) {
+ return LU.second <= TTI.getNumberOfRegisters(LU.first);
+ })) {
+ MaxVF = VFs[I];
break;
}
}
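
A toy version of the register-pressure scan above: walk the candidate VFs from widest to narrowest and keep the first one whose peak register usage fits the target. The VFs, per-VF usage numbers, and register-file size below are made up; the real values come from calculateRegisterUsage and TTI.getNumberOfRegisters.

#include <cassert>
#include <vector>

int main() {
  // Hypothetical candidates and their peak vector-register demand.
  std::vector<unsigned> VFs = {2, 4, 8};
  std::vector<unsigned> PeakRegs = {5, 9, 18};
  unsigned TargetNumRegisters = 16; // assumed register-file size

  unsigned MaxVF = 1;
  for (int I = static_cast<int>(VFs.size()) - 1; I >= 0; --I) {
    if (PeakRegs[I] <= TargetNumRegisters) {
      MaxVF = VFs[I]; // VF=8 would need 18 > 16 registers; VF=4 fits
      break;
    }
  }
  assert(MaxVF == 4);
  return 0;
}
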
@@ -4913,28 +4299,6 @@ bool LoopVectorizationPlanner::isMoreProfitable(
unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
- if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) {
- // If the trip count is a known (possibly small) constant, the trip count
- // will be rounded up to an integer number of iterations under
- // FoldTailByMasking. The total cost in that case will be
- // VecCost*ceil(TripCount/VF). When not folding the tail, the total
- // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
- // some extra overheads, but for the purpose of comparing the costs of
- // different VFs we can use this to compare the total loop-body cost
- // expected after vectorization.
- auto GetCostForTC = [MaxTripCount, this](unsigned VF,
- InstructionCost VectorCost,
- InstructionCost ScalarCost) {
- return CM.foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF)
- : VectorCost * (MaxTripCount / VF) +
- ScalarCost * (MaxTripCount % VF);
- };
- auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost);
- auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost);
-
- return RTCostA < RTCostB;
- }
-
// Improve estimate for the vector width if it is scalable.
unsigned EstimatedWidthA = A.Width.getKnownMinValue();
unsigned EstimatedWidthB = B.Width.getKnownMinValue();
@@ -4948,13 +4312,39 @@ bool LoopVectorizationPlanner::isMoreProfitable(
// Assume vscale may be larger than 1 (or the value being tuned for),
// so that scalable vectorization is slightly favorable over fixed-width
// vectorization.
- if (A.Width.isScalable() && !B.Width.isScalable())
- return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
+ bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
+ A.Width.isScalable() && !B.Width.isScalable();
+
+ auto CmpFn = [PreferScalable](const InstructionCost &LHS,
+ const InstructionCost &RHS) {
+ return PreferScalable ? LHS <= RHS : LHS < RHS;
+ };
// To avoid the need for FP division:
- // (CostA / A.Width) < (CostB / B.Width)
- // <=> (CostA * B.Width) < (CostB * A.Width)
- return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
+ // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
+ // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
+ if (!MaxTripCount)
+ return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
+
+ auto GetCostForTC = [MaxTripCount, this](unsigned VF,
+ InstructionCost VectorCost,
+ InstructionCost ScalarCost) {
+ // If the trip count is a known (possibly small) constant, the trip count
+ // will be rounded up to an integer number of iterations under
+ // FoldTailByMasking. The total cost in that case will be
+ // VecCost*ceil(TripCount/VF). When not folding the tail, the total
+ // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
+ // some extra overheads, but for the purpose of comparing the costs of
+ // different VFs we can use this to compare the total loop-body cost
+ // expected after vectorization.
+ if (CM.foldTailByMasking())
+ return VectorCost * divideCeil(MaxTripCount, VF);
+ return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF);
+ };
+
+ auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
+ auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
+ return CmpFn(RTCostA, RTCostB);
}
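
A rough illustration of the two comparisons above, with made-up costs and widths and plain integers in place of InstructionCost/ElementCount. Candidate A is assumed to be scalable with an estimated width of 8 (known min width 4, assumed vscale 2) and B fixed with width 8.

#include <cassert>
#include <cstdint>

int main() {
  uint64_t CostA = 10, WidthA = 8; // scalable candidate, estimated width
  uint64_t CostB = 18, WidthB = 8; // fixed-width candidate

  // No known max trip count: compare CostA/WidthA vs CostB/WidthB without
  // FP division by cross-multiplying; with PreferScalable the comparison
  // uses <= so that a tie favors the scalable candidate.
  assert(CostA * WidthB <= CostB * WidthA); // 80 <= 144, A wins

  // Known max trip count of 37, tail not folded:
  // cost = VecCost * floor(TC/VF) + ScalarCost * (TC % VF).
  uint64_t TC = 37, ScalarCost = 3;
  uint64_t RTCostA = CostA * (TC / WidthA) + ScalarCost * (TC % WidthA);
  uint64_t RTCostB = CostB * (TC / WidthB) + ScalarCost * (TC % WidthB);
  assert(RTCostA == 55 && RTCostB == 87); // A still wins
  return 0;
}
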
static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
@@ -4977,8 +4367,10 @@ static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
if (Numbering[A.first] != Numbering[B.first])
return Numbering[A.first] < Numbering[B.first];
- ElementCountComparator ECC;
- return ECC(A.second, B.second);
+ const auto &LHS = A.second;
+ const auto &RHS = B.second;
+ return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
+ std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
});
// For a list of ordered instruction-vf pairs:
@@ -5021,13 +4413,111 @@ static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
} while (!Tail.empty());
}
-VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor(
- const ElementCountSet &VFCandidates) {
- InstructionCost ExpectedCost =
- CM.expectedCost(ElementCount::getFixed(1)).first;
+/// Check if any recipe of \p Plan will generate a vector value, which will be
+/// assigned a vector register.
+static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
+ const TargetTransformInfo &TTI) {
+ assert(VF.isVector() && "Checking a scalar VF?");
+ VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(),
+ Plan.getCanonicalIV()->getScalarType()->getContext());
+ DenseSet<VPRecipeBase *> EphemeralRecipes;
+ collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes);
+ // Set of already visited types.
+ DenseSet<Type *> Visited;
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
+ for (VPRecipeBase &R : *VPBB) {
+ if (EphemeralRecipes.contains(&R))
+ continue;
+ // Continue early if the recipe is considered to not produce a vector
+ // result. Note that this includes VPInstruction where some opcodes may
+ // produce a vector, to preserve existing behavior as VPInstructions model
+ // aspects not directly mapped to existing IR instructions.
+ switch (R.getVPDefID()) {
+ case VPDef::VPDerivedIVSC:
+ case VPDef::VPScalarIVStepsSC:
+ case VPDef::VPScalarCastSC:
+ case VPDef::VPReplicateSC:
+ case VPDef::VPInstructionSC:
+ case VPDef::VPCanonicalIVPHISC:
+ case VPDef::VPVectorPointerSC:
+ case VPDef::VPExpandSCEVSC:
+ case VPDef::VPEVLBasedIVPHISC:
+ case VPDef::VPPredInstPHISC:
+ case VPDef::VPBranchOnMaskSC:
+ continue;
+ case VPDef::VPReductionSC:
+ case VPDef::VPActiveLaneMaskPHISC:
+ case VPDef::VPWidenCallSC:
+ case VPDef::VPWidenCanonicalIVSC:
+ case VPDef::VPWidenCastSC:
+ case VPDef::VPWidenGEPSC:
+ case VPDef::VPWidenSC:
+ case VPDef::VPWidenSelectSC:
+ case VPDef::VPBlendSC:
+ case VPDef::VPFirstOrderRecurrencePHISC:
+ case VPDef::VPWidenPHISC:
+ case VPDef::VPWidenIntOrFpInductionSC:
+ case VPDef::VPWidenPointerInductionSC:
+ case VPDef::VPReductionPHISC:
+ case VPDef::VPInterleaveSC:
+ case VPDef::VPWidenLoadEVLSC:
+ case VPDef::VPWidenLoadSC:
+ case VPDef::VPWidenStoreEVLSC:
+ case VPDef::VPWidenStoreSC:
+ break;
+ default:
+ llvm_unreachable("unhandled recipe");
+ }
+
+ auto WillWiden = [&TTI, VF](Type *ScalarTy) {
+ Type *VectorTy = ToVectorTy(ScalarTy, VF);
+ unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
+ if (!NumLegalParts)
+ return false;
+ if (VF.isScalable()) {
+ // <vscale x 1 x iN> is assumed to be profitable over iN because
+ // scalable registers are a distinct register class from scalar
+ // ones. If we ever find a target which wants to lower scalable
+ // vectors back to scalars, we'll need to update this code to
+ // explicitly ask TTI about the register class uses for each part.
+ return NumLegalParts <= VF.getKnownMinValue();
+ }
+ // For fixed VFs, the type is vectorized if two or more elements end up
+ // sharing a register, i.e. there are fewer legal parts than elements.
+ return NumLegalParts < VF.getKnownMinValue();
+ };
+
+ // If the recipe defines no values and is not a store (e.g., a branch),
+ // there is no value to check; continue.
+ if (R.getNumDefinedValues() == 0 &&
+ !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
+ &R))
+ continue;
+ // For multi-def recipes (currently only interleaved loads), it suffices
+ // to check the first def only. For stores, check the stored value; for
+ // interleaved stores, checking the first stored value suffices. In all
+ // cases this is the second operand.
+ VPValue *ToCheck =
+ R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
+ Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
+ if (!Visited.insert({ScalarTy}).second)
+ continue;
+ if (WillWiden(ScalarTy))
+ return true;
+ }
+ }
+
+ return false;
+}
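
A minimal sketch of the WillWiden test above, with the TTI.getNumberOfParts query replaced by hypothetical legalization results; only the decision logic is reproduced.

#include <cassert>

// NumLegalParts is stubbed with made-up legalization results; a real query
// would come from TargetTransformInfo::getNumberOfParts.
static bool willWiden(unsigned NumLegalParts, unsigned KnownMinVF,
                      bool Scalable) {
  if (!NumLegalParts)
    return false; // type cannot be legalized at all
  if (Scalable)
    // <vscale x 1 x iN> still occupies a scalable vector register class.
    return NumLegalParts <= KnownMinVF;
  // Fixed VF: widened only if two or more elements share a register.
  return NumLegalParts < KnownMinVF;
}

int main() {
  assert(willWiden(/*Parts=*/2, /*VF=*/8, /*Scalable=*/false));  // 8 elts in 2 regs
  assert(!willWiden(/*Parts=*/4, /*VF=*/4, /*Scalable=*/false)); // one elt per part
  assert(willWiden(/*Parts=*/1, /*VF=*/1, /*Scalable=*/true));   // <vscale x 1 x iN>
  return 0;
}
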
+
+VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
+ InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
- assert(VFCandidates.count(ElementCount::getFixed(1)) &&
+ assert(any_of(VPlans,
+ [](std::unique_ptr<VPlan> &P) {
+ return P->hasVF(ElementCount::getFixed(1));
+ }) &&
"Expected Scalar VF to be a candidate");
const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
@@ -5035,7 +4525,8 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor(
VectorizationFactor ChosenFactor = ScalarCost;
bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
- if (ForceVectorization && VFCandidates.size() > 1) {
+ if (ForceVectorization &&
+ (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
// Ignore scalar width, because the user explicitly wants vectorization.
// Initialize cost to max so that VF = 2 is, at least, chosen during cost
// evaluation.
@@ -5043,43 +4534,45 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor(
}
SmallVector<InstructionVFPair> InvalidCosts;
- for (const auto &i : VFCandidates) {
- // The cost for scalar VF=1 is already calculated, so ignore it.
- if (i.isScalar())
- continue;
+ for (auto &P : VPlans) {
+ for (ElementCount VF : P->vectorFactors()) {
+ // The cost for scalar VF=1 is already calculated, so ignore it.
+ if (VF.isScalar())
+ continue;
- LoopVectorizationCostModel::VectorizationCostTy C =
- CM.expectedCost(i, &InvalidCosts);
- VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
+ InstructionCost C = CM.expectedCost(VF, &InvalidCosts);
+ VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
#ifndef NDEBUG
- unsigned AssumedMinimumVscale =
- getVScaleForTuning(OrigLoop, TTI).value_or(1);
- unsigned Width =
- Candidate.Width.isScalable()
- ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
- : Candidate.Width.getFixedValue();
- LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
- << " costs: " << (Candidate.Cost / Width));
- if (i.isScalable())
- LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
- << AssumedMinimumVscale << ")");
- LLVM_DEBUG(dbgs() << ".\n");
+ unsigned AssumedMinimumVscale =
+ getVScaleForTuning(OrigLoop, TTI).value_or(1);
+ unsigned Width =
+ Candidate.Width.isScalable()
+ ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
+ : Candidate.Width.getFixedValue();
+ LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
+ << " costs: " << (Candidate.Cost / Width));
+ if (VF.isScalable())
+ LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
+ << AssumedMinimumVscale << ")");
+ LLVM_DEBUG(dbgs() << ".\n");
#endif
- if (!C.second && !ForceVectorization) {
- LLVM_DEBUG(
- dbgs() << "LV: Not considering vector loop of width " << i
- << " because it will not generate any vector instructions.\n");
- continue;
- }
+ if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Not considering vector loop of width " << VF
+ << " because it will not generate any vector instructions.\n");
+ continue;
+ }
- // If profitable add it to ProfitableVF list.
- if (isMoreProfitable(Candidate, ScalarCost))
- ProfitableVFs.push_back(Candidate);
+ // If profitable add it to ProfitableVF list.
+ if (isMoreProfitable(Candidate, ScalarCost))
+ ProfitableVFs.push_back(Candidate);
- if (isMoreProfitable(Candidate, ChosenFactor))
- ChosenFactor = Candidate;
+ if (isMoreProfitable(Candidate, ChosenFactor))
+ ChosenFactor = Candidate;
+ }
}
emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop);
@@ -5258,7 +4751,7 @@ std::pair<unsigned, unsigned>
LoopVectorizationCostModel::getSmallestAndWidestTypes() {
unsigned MinWidth = -1U;
unsigned MaxWidth = 8;
- const DataLayout &DL = TheFunction->getParent()->getDataLayout();
+ const DataLayout &DL = TheFunction->getDataLayout();
// For in-loop reductions, no element types are added to ElementTypesInLoop
// if there are no loads/stores in the loop. In this case, check through the
// reduction variables to determine the maximum width.
@@ -5349,25 +4842,24 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
if (!isScalarEpilogueAllowed())
return 1;
+ // Do not interleave if EVL is preferred and no User IC is specified.
+ if (foldTailWithEVL()) {
+ LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
+ "Unroll factor forced to be 1.\n");
+ return 1;
+ }
+
// We used the distance for the interleave count.
if (!Legal->isSafeForAnyVectorWidth())
return 1;
auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
const bool HasReductions = !Legal->getReductionVars().empty();
- // Do not interleave loops with a relatively small known or estimated trip
- // count. But we will interleave when InterleaveSmallLoopScalarReduction is
- // enabled, and the code has scalar reductions(HasReductions && VF = 1),
- // because with the above conditions interleaving can expose ILP and break
- // cross iteration dependences for reductions.
- if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
- !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
- return 1;
// If we did not calculate the cost for VF (because the user selected the VF)
// then we calculate the cost of VF here.
if (LoopCost == 0) {
- LoopCost = expectedCost(VF).first;
+ LoopCost = expectedCost(VF);
assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
// Loop body is free and there is no need for interleaving.
@@ -5443,7 +4935,12 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
- if (KnownTC) {
+ if (KnownTC > 0) {
+ // If a scalar epilogue is required, at least one iteration must remain
+ // scalar, so the maximum number of iterations available for interleaving
+ // is one less.
+ unsigned AvailableTC =
+ requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
+
// If trip count is known we select between two prospective ICs, where
// 1) the aggressive IC is capped by the trip count divided by VF
// 2) the conservative IC is capped by the trip count divided by (VF * 2)
@@ -5453,27 +4950,35 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
// we run the vector loop at least twice.
unsigned InterleaveCountUB = bit_floor(
- std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount)));
+ std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
unsigned InterleaveCountLB = bit_floor(std::max(
- 1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
+ 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
MaxInterleaveCount = InterleaveCountLB;
if (InterleaveCountUB != InterleaveCountLB) {
- unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB));
- unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB));
+ unsigned TailTripCountUB =
+ (AvailableTC % (EstimatedVF * InterleaveCountUB));
+ unsigned TailTripCountLB =
+ (AvailableTC % (EstimatedVF * InterleaveCountLB));
// If both produce same scalar tail, maximize the IC to do the same work
// in fewer vector loop iterations
if (TailTripCountUB == TailTripCountLB)
MaxInterleaveCount = InterleaveCountUB;
}
- } else if (BestKnownTC) {
+ } else if (BestKnownTC && *BestKnownTC > 0) {
+ // If a scalar epilogue is required, at least one iteration must remain
+ // scalar, so the maximum number of iterations available for interleaving
+ // is one less.
+ unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
+ ? (*BestKnownTC) - 1
+ : *BestKnownTC;
+
// If trip count is an estimated compile time constant, limit the
// IC to be capped by the trip count divided by VF * 2, such that the vector
// loop runs at least twice to make interleaving seem profitable when there
// is an epilogue loop present. Since exact Trip count is not known we
// choose to be conservative in our IC estimate.
MaxInterleaveCount = bit_floor(std::max(
- 1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
+ 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
}
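
A small worked example of the clamping above, using a hypothetical trip count and VF (requires C++20 for std::bit_floor).

#include <algorithm>
#include <bit>
#include <cassert>

int main() {
  // Hypothetical loop: known trip count 67, estimated VF 8, scalar epilogue
  // required, target max interleave count of 8.
  unsigned KnownTC = 67, EstimatedVF = 8, MaxIC = 8;
  unsigned AvailableTC = KnownTC - 1; // one iteration reserved for the epilogue

  unsigned ICUB = std::bit_floor(
      std::max(1u, std::min(AvailableTC / EstimatedVF, MaxIC)));
  unsigned ICLB = std::bit_floor(
      std::max(1u, std::min(AvailableTC / (EstimatedVF * 2), MaxIC)));
  assert(ICUB == 8 && ICLB == 4); // 66/8 = 8 -> 8, 66/16 = 4 -> 4

  // Both choices leave the same scalar tail, so the larger IC is taken.
  unsigned TailUB = AvailableTC % (EstimatedVF * ICUB); // 66 % 64 = 2
  unsigned TailLB = AvailableTC % (EstimatedVF * ICLB); // 66 % 32 = 2
  unsigned MaxInterleaveCount = (TailUB == TailLB) ? ICUB : ICLB;
  assert(MaxInterleaveCount == 8);
  return 0;
}
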
assert(MaxInterleaveCount > 0 &&
@@ -5577,8 +5082,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
// If there are scalar reductions and TTI has enabled aggressive
// interleaving for reductions, we will interleave to expose ILP.
- if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
- AggressivelyInterleaveReductions) {
+ if (VF.isScalar() && AggressivelyInterleaveReductions) {
LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
// Interleave no less than SmallIC but not as aggressive as the normal IC
// to satisfy the rare situation when resources are too limited.
@@ -5845,15 +5349,21 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
for (Instruction &I : *BB)
if (isScalarWithPredication(&I, VF)) {
ScalarCostsTy ScalarCosts;
- // Do not apply discount if scalable, because that would lead to
- // invalid scalarization costs.
- // Do not apply discount logic if hacked cost is needed
- // for emulated masked memrefs.
- if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
+ // Do not apply discount logic for:
+ // 1. Scalars after vectorization, as there will only be a single copy
+ // of the instruction.
+ // 2. Scalable VF, as that would lead to invalid scalarization costs.
+ // 3. Emulated masked memrefs, if a hacked cost is needed.
+ if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
+ !useEmulatedMaskMemRefHack(&I, VF) &&
computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
// Remember that BB will remain after vectorization.
PredicatedBBsAfterVectorization[VF].insert(BB);
+ for (auto *Pred : predecessors(BB)) {
+ if (Pred->getSingleSuccessor() == BB)
+ PredicatedBBsAfterVectorization[VF].insert(Pred);
+ }
}
}
}
@@ -5920,15 +5430,14 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
// Compute the cost of the vector instruction. Note that this cost already
// includes the scalarization overhead of the predicated instruction.
- InstructionCost VectorCost = getInstructionCost(I, VF).first;
+ InstructionCost VectorCost = getInstructionCost(I, VF);
// Compute the cost of the scalarized instruction. This cost is the cost of
// the instruction as if it wasn't if-converted and instead remained in the
// predicated block. We will scale this cost by block probability after
// computing the scalarization overhead.
InstructionCost ScalarCost =
- VF.getFixedValue() *
- getInstructionCost(I, ElementCount::getFixed(1)).first;
+ VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1));
// Compute the scalarization overhead of needed insertelement instructions
// and phi nodes.
@@ -5972,14 +5481,13 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
return Discount;
}
-LoopVectorizationCostModel::VectorizationCostTy
-LoopVectorizationCostModel::expectedCost(
+InstructionCost LoopVectorizationCostModel::expectedCost(
ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
- VectorizationCostTy Cost;
+ InstructionCost Cost;
// For each block.
for (BasicBlock *BB : TheLoop->blocks()) {
- VectorizationCostTy BlockCost;
+ InstructionCost BlockCost;
// For each instruction in the old loop.
for (Instruction &I : BB->instructionsWithoutDebug()) {
@@ -5988,22 +5496,19 @@ LoopVectorizationCostModel::expectedCost(
(VF.isVector() && VecValuesToIgnore.count(&I)))
continue;
- VectorizationCostTy C = getInstructionCost(&I, VF);
+ InstructionCost C = getInstructionCost(&I, VF);
// Check if we should override the cost.
- if (C.first.isValid() &&
- ForceTargetInstructionCost.getNumOccurrences() > 0)
- C.first = InstructionCost(ForceTargetInstructionCost);
+ if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
+ C = InstructionCost(ForceTargetInstructionCost);
// Keep a list of instructions with invalid costs.
- if (Invalid && !C.first.isValid())
+ if (Invalid && !C.isValid())
Invalid->emplace_back(&I, VF);
- BlockCost.first += C.first;
- BlockCost.second |= C.second;
- LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
- << " for VF " << VF << " For instruction: " << I
- << '\n');
+ BlockCost += C;
+ LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
+ << VF << " For instruction: " << I << '\n');
}
// If we are vectorizing a predicated block, it will have been
@@ -6014,10 +5519,9 @@ LoopVectorizationCostModel::expectedCost(
// cost by the probability of executing it. blockNeedsPredication from
// Legal is used so as to not include all blocks in tail folded loops.
if (VF.isScalar() && Legal->blockNeedsPredication(BB))
- BlockCost.first /= getReciprocalPredBlockProb();
+ BlockCost /= getReciprocalPredBlockProb();
- Cost.first += BlockCost.first;
- Cost.second |= BlockCost.second;
+ Cost += BlockCost;
}
return Cost;
@@ -6273,12 +5777,20 @@ LoopVectorizationCostModel::getReductionPatternCost(
const RecurrenceDescriptor &RdxDesc =
Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
- InstructionCost BaseCost = TTI.getArithmeticReductionCost(
- RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
+ InstructionCost BaseCost;
+ RecurKind RK = RdxDesc.getRecurrenceKind();
+ if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
+ Intrinsic::ID MinMaxID = getMinMaxReductionIntrinsicOp(RK);
+ BaseCost = TTI.getMinMaxReductionCost(MinMaxID, VectorTy,
+ RdxDesc.getFastMathFlags(), CostKind);
+ } else {
+ BaseCost = TTI.getArithmeticReductionCost(
+ RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
+ }
// For a call to the llvm.fmuladd intrinsic we need to add the cost of a
// normal fmul instruction to the cost of the fadd reduction.
- if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
+ if (RK == RecurKind::FMulAdd)
BaseCost +=
TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
@@ -6416,49 +5928,6 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
return getWideningCost(I, VF);
}
-LoopVectorizationCostModel::VectorizationCostTy
-LoopVectorizationCostModel::getInstructionCost(Instruction *I,
- ElementCount VF) {
- // If we know that this instruction will remain uniform, check the cost of
- // the scalar version.
- if (isUniformAfterVectorization(I, VF))
- VF = ElementCount::getFixed(1);
-
- if (VF.isVector() && isProfitableToScalarize(I, VF))
- return VectorizationCostTy(InstsToScalarize[VF][I], false);
-
- // Forced scalars do not have any scalarization overhead.
- auto ForcedScalar = ForcedScalars.find(VF);
- if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
- auto InstSet = ForcedScalar->second;
- if (InstSet.count(I))
- return VectorizationCostTy(
- (getInstructionCost(I, ElementCount::getFixed(1)).first *
- VF.getKnownMinValue()),
- false);
- }
-
- Type *VectorTy;
- InstructionCost C = getInstructionCost(I, VF, VectorTy);
-
- bool TypeNotScalarized = false;
- if (VF.isVector() && VectorTy->isVectorTy()) {
- if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
- if (VF.isScalable())
- // <vscale x 1 x iN> is assumed to be profitable over iN because
- // scalable registers are a distinct register class from scalar ones.
- // If we ever find a target which wants to lower scalable vectors
- // back to scalars, we'll need to update this code to explicitly
- // ask TTI about the register class uses for each part.
- TypeNotScalarized = NumParts <= VF.getKnownMinValue();
- else
- TypeNotScalarized = NumParts < VF.getKnownMinValue();
- } else
- C = InstructionCost::getInvalid();
- }
- return VectorizationCostTy(C, TypeNotScalarized);
-}
-
InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
@@ -6849,8 +6318,25 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
}
InstructionCost
-LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
- Type *&VectorTy) {
+LoopVectorizationCostModel::getInstructionCost(Instruction *I,
+ ElementCount VF) {
+ // If we know that this instruction will remain uniform, check the cost of
+ // the scalar version.
+ if (isUniformAfterVectorization(I, VF))
+ VF = ElementCount::getFixed(1);
+
+ if (VF.isVector() && isProfitableToScalarize(I, VF))
+ return InstsToScalarize[VF][I];
+
+ // Forced scalars do not have any scalarization overhead.
+ auto ForcedScalar = ForcedScalars.find(VF);
+ if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
+ auto InstSet = ForcedScalar->second;
+ if (InstSet.count(I))
+ return getInstructionCost(I, ElementCount::getFixed(1)) *
+ VF.getKnownMinValue();
+ }
+
Type *RetTy = I->getType();
if (canTruncateToMinimalBitwidth(I, VF))
RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
@@ -6873,6 +6359,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
};
(void) hasSingleCopyAfterVectorization;
+ Type *VectorTy;
if (isScalarAfterVectorization(I, VF)) {
// With the exception of GEPs and PHIs, after scalarization there should
// only be one copy of the instruction generated in the loop. This is
@@ -6888,6 +6375,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
} else
VectorTy = ToVectorTy(RetTy, VF);
+ if (VF.isVector() && VectorTy->isVectorTy() &&
+ !TTI.getNumberOfParts(VectorTy))
+ return InstructionCost::getInvalid();
+
// TODO: We need to estimate the cost of intrinsic calls.
switch (I->getOpcode()) {
case Instruction::GetElementPtr:
@@ -6900,11 +6391,15 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
// In cases of scalarized and predicated instructions, there will be VF
// predicated blocks in the vectorized loop. Each branch around these
// blocks requires also an extract of its vector compare i1 element.
+ // Note that the conditional branch from the loop latch will be replaced by
+ // a single branch controlling the loop, so there is no extra overhead from
+ // scalarization.
bool ScalarPredicatedBB = false;
BranchInst *BI = cast<BranchInst>(I);
if (VF.isVector() && BI->isConditional() &&
(PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
- PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))))
+ PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
+ BI->getParent() != TheLoop->getLoopLatch())
ScalarPredicatedBB = true;
if (ScalarPredicatedBB) {
@@ -6934,6 +6429,11 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
// First-order recurrences are replaced by vector shuffles inside the loop.
if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
+ // For <vscale x 1 x i64>, if vscale = 1 we are unable to extract the
+ // penultimate value of the recurrence.
+ // TODO: Consider vscale_range info.
+ if (VF.isScalable() && VF.getKnownMinValue() == 1)
+ return InstructionCost::getInvalid();
SmallVector<int> Mask(VF.getKnownMinValue());
std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
@@ -6999,25 +6499,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
SmallVector<const Value *, 4> Operands(I->operand_values());
- auto InstrCost = TTI.getArithmeticInstrCost(
+ return TTI.getArithmeticInstrCost(
I->getOpcode(), VectorTy, CostKind,
{TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
- Op2Info, Operands, I);
-
- // Some targets can replace frem with vector library calls.
- InstructionCost VecCallCost = InstructionCost::getInvalid();
- if (I->getOpcode() == Instruction::FRem) {
- LibFunc Func;
- if (TLI->getLibFunc(I->getOpcode(), I->getType(), Func) &&
- TLI->isFunctionVectorizable(TLI->getName(Func), VF)) {
- SmallVector<Type *, 4> OpTypes;
- for (auto &Op : I->operands())
- OpTypes.push_back(Op->getType());
- VecCallCost =
- TTI.getCallInstrCost(nullptr, VectorTy, OpTypes, CostKind);
- }
- }
- return std::min(InstrCost, VecCallCost);
+ Op2Info, Operands, I, TLI);
}
case Instruction::FNeg: {
return TTI.getArithmeticInstrCost(
@@ -7157,25 +6642,20 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
return *RedCost;
Type *SrcScalarTy = I->getOperand(0)->getType();
+ Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
+ if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
+ SrcScalarTy =
+ IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]);
Type *SrcVecTy =
VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
+
if (canTruncateToMinimalBitwidth(I, VF)) {
- // This cast is going to be shrunk. This may remove the cast or it might
- // turn it into slightly different cast. For example, if MinBW == 16,
- // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
- //
- // Calculate the modified src and dest types.
- Type *MinVecTy = VectorTy;
- if (Opcode == Instruction::Trunc) {
- SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
- VectorTy =
- largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
- } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
- // Leave SrcVecTy unchanged - we only shrink the destination element
- // type.
- VectorTy =
- smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
- }
+ // If the result type is <= the source type, there will be no extend
+ // after truncating the users to the minimal required bitwidth.
+ if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
+ (I->getOpcode() == Instruction::ZExt ||
+ I->getOpcode() == Instruction::SExt))
+ return 0;
}
return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
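
A toy model of the new rule above: once the destination has been narrowed to its minimal bitwidth and is no wider than the (possibly narrowed) source, a zext/sext costs nothing. The castCost helper below is a stand-in for TTI.getCastInstrCost, and the bit widths are chosen purely for illustration.

#include <cassert>

static unsigned castCost(unsigned DstBits, unsigned SrcBits, bool IsExt,
                         bool CanTruncateDst) {
  if (CanTruncateDst && IsExt && DstBits <= SrcBits)
    return 0; // the extend folds away after narrowing the users
  return 1;   // stand-in for a real TTI cast cost
}

int main() {
  // "zext i8 %x to i32" whose users only need 8 bits (MinBW == 8): the
  // destination shrinks to i8 and no extend remains.
  assert(castCost(/*DstBits=*/8, /*SrcBits=*/8, /*IsExt=*/true,
                  /*CanTruncateDst=*/true) == 0);
  // With MinBW == 16 a real "zext i8 to i16" remains and still costs.
  assert(castCost(/*DstBits=*/16, /*SrcBits=*/8, /*IsExt=*/true,
                  /*CanTruncateDst=*/true) == 1);
  return 0;
}
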
@@ -7200,16 +6680,43 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
// Ignore ephemeral values.
CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
- // Find all stores to invariant variables. Since they are going to sink
- // outside the loop we do not need calculate cost for them.
+ SmallVector<Value *, 4> DeadInterleavePointerOps;
for (BasicBlock *BB : TheLoop->blocks())
for (Instruction &I : *BB) {
+ // Find all stores to invariant variables. Since they are going to sink
+ // outside the loop, we do not need to calculate their cost.
StoreInst *SI;
if ((SI = dyn_cast<StoreInst>(&I)) &&
Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
ValuesToIgnore.insert(&I);
+
+ // For interleave groups, we only create a pointer for the start of the
+ // interleave group. Queue up addresses of group members except the insert
+ // position for further processing.
+ if (isAccessInterleaved(&I)) {
+ auto *Group = getInterleavedAccessGroup(&I);
+ if (Group->getInsertPos() == &I)
+ continue;
+ Value *PointerOp = getLoadStorePointerOperand(&I);
+ DeadInterleavePointerOps.push_back(PointerOp);
+ }
}
+ // Mark ops feeding interleave group members as free, if they are only used
+ // by other dead computations.
+ for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
+ auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]);
+ if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) {
+ Instruction *UI = cast<Instruction>(U);
+ return !VecValuesToIgnore.contains(U) &&
+ (!isAccessInterleaved(UI) ||
+ getInterleavedAccessGroup(UI)->getInsertPos() == UI);
+ }))
+ continue;
+ VecValuesToIgnore.insert(Op);
+ DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end());
+ }
+
// Ignore type-promoting instructions we identified during reduction
// detection.
for (const auto &Reduction : Legal->getReductionVars()) {
@@ -7370,6 +6877,9 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
CM.invalidateCostModelingDecisions();
}
+ if (CM.foldTailByMasking())
+ Legal->prepareToFoldTailByMasking();
+
ElementCount MaxUserVF =
UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
@@ -7395,14 +6905,14 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
"InvalidCost", ORE, OrigLoop);
}
- // Populate the set of Vectorization Factor Candidates.
- ElementCountSet VFCandidates;
+ // Collect the Vectorization Factor Candidates.
+ SmallVector<ElementCount> VFCandidates;
for (auto VF = ElementCount::getFixed(1);
ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
- VFCandidates.insert(VF);
+ VFCandidates.push_back(VF);
for (auto VF = ElementCount::getScalable(1);
ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
- VFCandidates.insert(VF);
+ VFCandidates.push_back(VF);
CM.collectInLoopReductions();
for (const auto &VF : VFCandidates) {
@@ -7419,11 +6929,17 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
LLVM_DEBUG(printPlans(dbgs()));
- if (!MaxFactors.hasVector())
+ if (VPlans.empty())
+ return std::nullopt;
+ if (all_of(VPlans,
+ [](std::unique_ptr<VPlan> &P) { return P->hasScalarVFOnly(); }))
return VectorizationFactor::Disabled();
- // Select the optimal vectorization factor.
- VectorizationFactor VF = selectVectorizationFactor(VFCandidates);
+ // Select the optimal vectorization factor according to the legacy cost-model.
+ // This is now only used to verify the decisions by the new VPlan-based
+ // cost-model and will be retired once the VPlan-based cost-model is
+ // stabilized.
+ VectorizationFactor VF = selectVectorizationFactor();
assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero.");
if (!hasPlanWithVF(VF.Width)) {
LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width
@@ -7433,6 +6949,212 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
return VF;
}
+InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
+ ElementCount VF) const {
+ return CM.getInstructionCost(UI, VF);
+}
+
+bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
+ return CM.ValuesToIgnore.contains(UI) ||
+ (IsVector && CM.VecValuesToIgnore.contains(UI)) ||
+ SkipCostComputation.contains(UI);
+}
+
+InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
+ ElementCount VF) const {
+ InstructionCost Cost = 0;
+ LLVMContext &LLVMCtx = OrigLoop->getHeader()->getContext();
+ VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), LLVMCtx, CM);
+
+ // Cost modeling for inductions is inaccurate in the legacy cost model
+ // compared to the recipes that are generated. To match the legacy model
+ // initially, during VPlan cost-model bring-up, directly use the induction
+ // costs from the legacy
+ // cost model. Note that we do this as pre-processing; the VPlan may not have
+ // any recipes associated with the original induction increment instruction
+ // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
+ // the cost of induction phis and increments (both that are represented by
+ // recipes and those that are not), to avoid distinguishing between them here,
+ // and skip all recipes that represent induction phis and increments (the
+ // former case) later on, if they exist, to avoid counting them twice.
+ // Similarly we pre-compute the cost of any optimized truncates.
+ // TODO: Switch to more accurate costing based on VPlan.
+ for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
+ Instruction *IVInc = cast<Instruction>(
+ IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
+ SmallVector<Instruction *> IVInsts = {IV, IVInc};
+ for (User *U : IV->users()) {
+ auto *CI = cast<Instruction>(U);
+ if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
+ continue;
+ IVInsts.push_back(CI);
+ }
+ for (Instruction *IVInst : IVInsts) {
+ if (!CostCtx.SkipCostComputation.insert(IVInst).second)
+ continue;
+ InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF);
+ LLVM_DEBUG({
+ dbgs() << "Cost of " << InductionCost << " for VF " << VF
+ << ": induction instruction " << *IVInst << "\n";
+ });
+ Cost += InductionCost;
+ }
+ }
+
+ // Compute the cost of all exiting conditions of the loop using the legacy
+ // cost model. This is to match the legacy behavior, which adds the cost of
+ // all exit conditions. Note that this over-estimates the cost, as there will
+ // be a single condition to control the vector loop.
+ SmallVector<BasicBlock *> Exiting;
+ CM.TheLoop->getExitingBlocks(Exiting);
+ SetVector<Instruction *> ExitInstrs;
+ // Collect all exit conditions.
+ for (BasicBlock *EB : Exiting) {
+ auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
+ if (!Term)
+ continue;
+ if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
+ ExitInstrs.insert(CondI);
+ }
+ }
+ // Compute the cost of all instructions only feeding the exit conditions.
+ for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
+ Instruction *CondI = ExitInstrs[I];
+ if (!OrigLoop->contains(CondI) ||
+ !CostCtx.SkipCostComputation.insert(CondI).second)
+ continue;
+ Cost += CostCtx.getLegacyCost(CondI, VF);
+ for (Value *Op : CondI->operands()) {
+ auto *OpI = dyn_cast<Instruction>(Op);
+ if (!OpI || any_of(OpI->users(), [&ExitInstrs, this](User *U) {
+ return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
+ !ExitInstrs.contains(cast<Instruction>(U));
+ }))
+ continue;
+ ExitInstrs.insert(OpI);
+ }
+ }
+
+ // The legacy cost model has special logic to compute the cost of in-loop
+ // reductions, which may be smaller than the sum of all instructions involved
+ // in the reduction. For AnyOf reductions, VPlan codegen may remove the select
+ // which the legacy cost model uses to assign cost. Pre-compute their costs
+ // for now.
+ // TODO: Switch to costing based on VPlan once the logic has been ported.
+ for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) {
+ if (!CM.isInLoopReduction(RedPhi) &&
+ !RecurrenceDescriptor::isAnyOfRecurrenceKind(
+ RdxDesc.getRecurrenceKind()))
+ continue;
+
+ // AnyOf reduction codegen may remove the select. To match the legacy cost
+ // model, pre-compute the cost for AnyOf reductions here.
+ if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
+ RdxDesc.getRecurrenceKind())) {
+ auto *Select = cast<SelectInst>(*find_if(
+ RedPhi->users(), [](User *U) { return isa<SelectInst>(U); }));
+ assert(!CostCtx.SkipCostComputation.contains(Select) &&
+ "reduction op visited multiple times");
+ CostCtx.SkipCostComputation.insert(Select);
+ auto ReductionCost = CostCtx.getLegacyCost(Select, VF);
+ LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
+ << ":\n any-of reduction " << *Select << "\n");
+ Cost += ReductionCost;
+ continue;
+ }
+
+ const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
+ SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(),
+ ChainOps.end());
+ // Also include the operands of instructions in the chain, as the cost-model
+ // may mark extends as free.
+ for (auto *ChainOp : ChainOps) {
+ for (Value *Op : ChainOp->operands()) {
+ if (auto *I = dyn_cast<Instruction>(Op))
+ ChainOpsAndOperands.insert(I);
+ }
+ }
+
+ // Pre-compute the cost for I, if it has a reduction pattern cost.
+ for (Instruction *I : ChainOpsAndOperands) {
+ auto ReductionCost = CM.getReductionPatternCost(
+ I, VF, ToVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput);
+ if (!ReductionCost)
+ continue;
+
+ assert(!CostCtx.SkipCostComputation.contains(I) &&
+ "reduction op visited multiple times");
+ CostCtx.SkipCostComputation.insert(I);
+ LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
+ << ":\n in-loop reduction " << *I << "\n");
+ Cost += *ReductionCost;
+ }
+ }
+
+ // Pre-compute the costs for branches except for the backedge, as the number
+ // of replicate regions in a VPlan may not directly match the number of
+ // branches, which would lead to different decisions.
+ // TODO: Compute cost of branches for each replicate region in the VPlan,
+ // which is more accurate than the legacy cost model.
+ for (BasicBlock *BB : OrigLoop->blocks()) {
+ if (BB == OrigLoop->getLoopLatch())
+ continue;
+ CostCtx.SkipCostComputation.insert(BB->getTerminator());
+ auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF);
+ Cost += BranchCost;
+ }
+ // Now compute and add the VPlan-based cost.
+ Cost += Plan.cost(VF, CostCtx);
+ LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n");
+ return Cost;
+}
+
+VPlan &LoopVectorizationPlanner::getBestPlan() const {
+ // If there is a single VPlan with a single VF, return it directly.
+ VPlan &FirstPlan = *VPlans[0];
+ if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
+ return FirstPlan;
+
+ VPlan *BestPlan = &FirstPlan;
+ ElementCount ScalarVF = ElementCount::getFixed(1);
+ assert(hasPlanWithVF(ScalarVF) &&
+ "More than a single plan/VF w/o any plan having scalar VF");
+
+ // TODO: Compute scalar cost using VPlan-based cost model.
+ InstructionCost ScalarCost = CM.expectedCost(ScalarVF);
+ VectorizationFactor BestFactor(ScalarVF, ScalarCost, ScalarCost);
+
+ bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
+ if (ForceVectorization) {
+ // Ignore scalar width, because the user explicitly wants vectorization.
+ // Initialize cost to max so that VF = 2 is, at least, chosen during cost
+ // evaluation.
+ BestFactor.Cost = InstructionCost::getMax();
+ }
+
+ for (auto &P : VPlans) {
+ for (ElementCount VF : P->vectorFactors()) {
+ if (VF.isScalar())
+ continue;
+ if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Not considering vector loop of width " << VF
+ << " because it will not generate any vector instructions.\n");
+ continue;
+ }
+
+ InstructionCost Cost = cost(*P, VF);
+ VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
+ if (isMoreProfitable(CurrentFactor, BestFactor)) {
+ BestFactor = CurrentFactor;
+ BestPlan = &*P;
+ }
+ }
+ }
+ BestPlan->setVF(BestFactor.Width);
+ return *BestPlan;
+}
+
VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
assert(count_if(VPlans,
[VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
@@ -7485,7 +7207,8 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) {
static void createAndCollectMergePhiForReduction(
VPInstruction *RedResult,
DenseMap<const RecurrenceDescriptor *, Value *> &ReductionResumeValues,
- VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock) {
+ VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock,
+ bool VectorizingEpilogue) {
if (!RedResult ||
RedResult->getOpcode() != VPInstruction::ComputeReductionResult)
return;
@@ -7493,19 +7216,29 @@ static void createAndCollectMergePhiForReduction(
auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
- TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
Value *FinalValue =
State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane()));
auto *ResumePhi =
dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
+ if (VectorizingEpilogue && RecurrenceDescriptor::isAnyOfRecurrenceKind(
+ RdxDesc.getRecurrenceKind())) {
+ auto *Cmp = cast<ICmpInst>(PhiR->getStartValue()->getUnderlyingValue());
+ assert(Cmp->getPredicate() == CmpInst::ICMP_NE);
+ assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue());
+ ResumePhi = cast<PHINode>(Cmp->getOperand(0));
+ }
+ assert((!VectorizingEpilogue || ResumePhi) &&
+ "when vectorizing the epilogue loop, we need a resume phi from main "
+ "vector loop");
// TODO: bc.merge.rdx should not be created here, instead it should be
// modeled in VPlan.
BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
// Create a phi node that merges control-flow from the backedge-taken check
// block and the middle block.
- auto *BCBlockPhi = PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx",
- LoopScalarPreHeader->getTerminator());
+ auto *BCBlockPhi =
+ PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx",
+ LoopScalarPreHeader->getTerminator()->getIterator());
// If we are fixing reductions in the epilogue loop then we should already
// have created a bc.merge.rdx Phi after the main vector body. Ensure that
@@ -7517,7 +7250,7 @@ static void createAndCollectMergePhiForReduction(
BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
Incoming);
else
- BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
+ BCBlockPhi->addIncoming(RdxDesc.getRecurrenceStartValue(), Incoming);
}
auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
@@ -7549,12 +7282,14 @@ LoopVectorizationPlanner::executePlan(
assert(
(IsEpilogueVectorization || !ExpandedSCEVs) &&
"expanded SCEVs to reuse can only be used during epilogue vectorization");
+ (void)IsEpilogueVectorization;
- LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
- << '\n');
+ VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
- if (!IsEpilogueVectorization)
- VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
+ LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
+ << ", UF=" << BestUF << '\n');
+ BestVPlan.setName("Final VPlan");
+ LLVM_DEBUG(BestVPlan.dump());
// Perform the actual loop transformation.
VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
@@ -7579,6 +7314,9 @@ LoopVectorizationPlanner::executePlan(
std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs
: State.ExpandedSCEVs);
+#ifdef EXPENSIVE_CHECKS
+ assert(DT->verify(DominatorTree::VerificationLevel::Fast));
+#endif
// Only use noalias metadata when using memory checks guaranteeing no overlap
// across all iterations.
@@ -7598,8 +7336,6 @@ LoopVectorizationPlanner::executePlan(
State.LVer->prepareNoAliasMetadata();
}
- ILV.collectPoisonGeneratingRecipes(State);
-
ILV.printDebugTracesAtStart();
//===------------------------------------------------===//
@@ -7622,9 +7358,9 @@ LoopVectorizationPlanner::executePlan(
auto *ExitVPBB =
cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
for (VPRecipeBase &R : *ExitVPBB) {
- createAndCollectMergePhiForReduction(dyn_cast<VPInstruction>(&R),
- ReductionResumeValues, State, OrigLoop,
- State.CFG.VPBB2IRBB[ExitVPBB]);
+ createAndCollectMergePhiForReduction(
+ dyn_cast<VPInstruction>(&R), ReductionResumeValues, State, OrigLoop,
+ State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
}
// 2.6. Maintain Loop Hints
@@ -7661,6 +7397,18 @@ LoopVectorizationPlanner::executePlan(
ILV.printDebugTracesAtEnd();
+ // 4. Adjust branch weight of the branch in the middle block.
+ auto *MiddleTerm =
+ cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
+ if (MiddleTerm->isConditional() &&
+ hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
+ // Assume that `Count % VectorTripCount` is equally distributed.
+ unsigned TripCount = State.UF * State.VF.getKnownMinValue();
+ assert(TripCount > 0 && "trip count should not be zero");
+ const uint32_t Weights[] = {1, TripCount - 1};
+ setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
+ }
+
return {State.ExpandedSCEVs, ReductionResumeValues};
}
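
A tiny numeric illustration of the middle-block branch weights set above, with hypothetical VF and UF; the assumption that the first successor is the one skipping the scalar remainder is mine, not taken from the source.

#include <cassert>
#include <cstdint>

int main() {
  // Assume VF = 4 and UF = 2, so each vector-loop trip covers 8 iterations.
  unsigned VF = 4, UF = 2;
  unsigned TripCount = UF * VF;
  assert(TripCount > 0 && "trip count should not be zero");

  // If Count % VectorTripCount is equally distributed, roughly 1 in 8 runs
  // has no remainder and skips the scalar loop; the other 7 fall through.
  const uint32_t Weights[] = {1, TripCount - 1};
  assert(Weights[0] == 1 && Weights[1] == 7);
  return 0;
}
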
@@ -7717,7 +7465,7 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
// inductions in the epilogue loop are created before executing the plan for
// the epilogue loop.
- return {completeLoopSkeleton(), nullptr};
+ return {LoopVectorPreHeader, nullptr};
}
void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
@@ -7772,14 +7520,8 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
DT->getNode(Bypass)->getIDom()) &&
"TC check is expected to dominate Bypass");
- // Update dominator for Bypass & LoopExit.
+ // Update dominator for Bypass.
DT->changeImmediateDominator(Bypass, TCCheckBlock);
- if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
- // For loops with multiple exits, there's no edge from the middle block
- // to exit blocks (as the epilogue must run) and thus no need to update
- // the immediate dominator of the exit blocks.
- DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
-
LoopBypassBlocks.push_back(TCCheckBlock);
// Save the trip count so we don't have to regenerate it in the
@@ -7791,7 +7533,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
BranchInst &BI =
*BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
- setBranchWeights(BI, MinItersBypassWeights);
+ setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
return TCCheckBlock;
@@ -7810,11 +7552,10 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
// Now, compare the remaining count and if there aren't enough iterations to
// execute the vectorized epilogue skip to the scalar part.
- BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
- VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
- LoopVectorPreHeader =
- SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
- LI, nullptr, "vec.epilog.ph");
+ LoopVectorPreHeader->setName("vec.epilog.ph");
+ BasicBlock *VecEpilogueIterationCountCheck =
+ SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->begin(), DT, LI,
+ nullptr, "vec.epilog.iter.check", true);
emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
VecEpilogueIterationCountCheck);
@@ -7907,7 +7648,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
{VecEpilogueIterationCountCheck,
EPI.VectorTripCount} /* AdditionalBypass */);
- return {completeLoopSkeleton(), EPResumeVal};
+ return {LoopVectorPreHeader, EPResumeVal};
}
BasicBlock *
@@ -7949,10 +7690,9 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
const uint32_t Weights[] = {EstimatedSkipCount,
MainLoopStep - EstimatedSkipCount};
- setBranchWeights(BI, Weights);
+ setBranchWeights(BI, Weights, /*IsExpected=*/false);
}
ReplaceInstWithInst(Insert->getTerminator(), &BI);
-
LoopBypassBlocks.push_back(Insert);
return Insert;
}
@@ -8000,8 +7740,19 @@ void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
}
}
-VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
- VPlan &Plan) {
+iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
+VPRecipeBuilder::mapToVPValues(User::op_range Operands) {
+ std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
+ if (auto *I = dyn_cast<Instruction>(Op)) {
+ if (auto *R = Ingredient2Recipe.lookup(I))
+ return R->getVPSingleValue();
+ }
+ return Plan.getOrAddLiveIn(Op);
+ };
+ return map_range(Operands, Fn);
+}
+
+VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
// Look for cached value.
@@ -8025,27 +7776,34 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
if (OrigLoop->isLoopExiting(Src))
return EdgeMaskCache[Edge] = SrcMask;
- VPValue *EdgeMask = Plan.getVPValueOrAddLiveIn(BI->getCondition());
+ VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition(), Plan);
assert(EdgeMask && "No Edge Mask found for condition");
if (BI->getSuccessor(0) != Dst)
EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
- // The condition is 'SrcMask && EdgeMask', which is equivalent to
- // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
- // The select version does not introduce new UB if SrcMask is false and
- // EdgeMask is poison. Using 'and' here introduces undefined behavior.
- VPValue *False = Plan.getVPValueOrAddLiveIn(
- ConstantInt::getFalse(BI->getCondition()->getType()));
- EdgeMask =
- Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
+ // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask
+ // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd'
+ // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'.
+ EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc());
}
return EdgeMaskCache[Edge] = EdgeMask;
}
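A toy model (not LLVM code) of the masking rule described in the comment above, using std::optional<bool> as a stand-in for a possibly-poison i1: a bitwise 'and' inspects both operands, so a poison EdgeMask poisons the result even when SrcMask is false, while the select form ('select i1 SrcMask, i1 EdgeMask, i1 false') never looks at EdgeMask in that case.

#include <cstdio>
#include <optional>

using MaybeBool = std::optional<bool>;          // nullopt stands in for poison

MaybeBool bitwiseAnd(MaybeBool A, MaybeBool B) {
  if (!A || !B)
    return std::nullopt;                        // poison propagates unconditionally
  return *A && *B;
}

MaybeBool logicalAnd(MaybeBool Src, MaybeBool Edge) {
  if (Src && !*Src)
    return false;                               // SrcMask == false hides a poison EdgeMask
  if (!Src || !Edge)
    return std::nullopt;
  return *Src && *Edge;
}

int main() {
  MaybeBool R1 = bitwiseAnd(false, std::nullopt);   // poison
  MaybeBool R2 = logicalAnd(false, std::nullopt);   // false
  std::printf("and: %s, select: %s\n", R1 ? "defined" : "poison",
              R2 ? "defined" : "poison");
  return 0;
}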
-void VPRecipeBuilder::createHeaderMask(VPlan &Plan) {
+VPValue *VPRecipeBuilder::getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const {
+ assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
+
+ // Look for cached value.
+ std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
+ EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge);
+ assert(ECEntryIt != EdgeMaskCache.end() &&
+ "looking up mask for edge which has not been created");
+ return ECEntryIt->second;
+}
+
+void VPRecipeBuilder::createHeaderMask() {
BasicBlock *Header = OrigLoop->getHeader();
// When not folding the tail, use nullptr to model all-true mask.
@@ -8080,7 +7838,7 @@ VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
return BCEntryIt->second;
}
-void VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
+void VPRecipeBuilder::createBlockInMask(BasicBlock *BB) {
assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
assert(OrigLoop->getHeader() != BB &&
@@ -8091,7 +7849,7 @@ void VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
VPValue *BlockMask = nullptr;
// This is the block mask. We OR all incoming edges.
for (auto *Predecessor : predecessors(BB)) {
- VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
+ VPValue *EdgeMask = createEdgeMask(Predecessor, BB);
if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
BlockMaskCache[BB] = EdgeMask;
return;
@@ -8108,10 +7866,9 @@ void VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
BlockMaskCache[BB] = BlockMask;
}
-VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
- ArrayRef<VPValue *> Operands,
- VFRange &Range,
- VPlanPtr &Plan) {
+VPWidenMemoryRecipe *
+VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
+ VFRange &Range) {
assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
"Must be called with either a load or store");
@@ -8154,12 +7911,12 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
Ptr = VectorPtr;
}
if (LoadInst *Load = dyn_cast<LoadInst>(I))
- return new VPWidenMemoryInstructionRecipe(*Load, Ptr, Mask, Consecutive,
- Reverse);
+ return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
+ I->getDebugLoc());
StoreInst *Store = cast<StoreInst>(I);
- return new VPWidenMemoryInstructionRecipe(*Store, Ptr, Operands[0], Mask,
- Consecutive, Reverse);
+ return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
+ Reverse, I->getDebugLoc());
}
/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
@@ -8167,8 +7924,7 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
static VPWidenIntOrFpInductionRecipe *
createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
VPValue *Start, const InductionDescriptor &IndDesc,
- VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop,
- VFRange &Range) {
+ VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) {
assert(IndDesc.getStartValue() ==
Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
@@ -8183,14 +7939,14 @@ createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc);
}
-VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
- PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) {
+VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
+ PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) {
// Check if this is an integer or fp induction. If so, build the recipe that
// produces its scalar and vector values.
if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
- *PSE.getSE(), *OrigLoop, Range);
+ *PSE.getSE(), *OrigLoop);
// Check if this is pointer induction. If so, build the recipe for it.
if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
@@ -8208,7 +7964,7 @@ VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
}
VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
- TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) {
+ TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) {
// Optimize the special case where the source is a constant integer
// induction variable. Notice that we can only optimize the 'trunc' case
// because (a) FP conversions lose precision, (b) sext/zext may wrap, and
@@ -8228,62 +7984,46 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
auto *Phi = cast<PHINode>(I->getOperand(0));
const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
- VPValue *Start = Plan.getVPValueOrAddLiveIn(II.getStartValue());
+ VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue());
return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
- *OrigLoop, Range);
+ *OrigLoop);
}
return nullptr;
}
-VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
- ArrayRef<VPValue *> Operands,
- VPlanPtr &Plan) {
- // If all incoming values are equal, the incoming VPValue can be used directly
- // instead of creating a new VPBlendRecipe.
- if (llvm::all_equal(Operands))
- return Operands[0];
-
+VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
+ ArrayRef<VPValue *> Operands) {
unsigned NumIncoming = Phi->getNumIncomingValues();
- // For in-loop reductions, we do not need to create an additional select.
- VPValue *InLoopVal = nullptr;
- for (unsigned In = 0; In < NumIncoming; In++) {
- PHINode *PhiOp =
- dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
- if (PhiOp && CM.isInLoopReduction(PhiOp)) {
- assert(!InLoopVal && "Found more than one in-loop reduction!");
- InLoopVal = Operands[In];
- }
- }
-
- assert((!InLoopVal || NumIncoming == 2) &&
- "Found an in-loop reduction for PHI with unexpected number of "
- "incoming values");
- if (InLoopVal)
- return Operands[Operands[0] == InLoopVal ? 1 : 0];
// We know that all PHIs in non-header blocks are converted into selects, so
// we don't have to worry about the insertion order and we can just use the
// builder. At this point we generate the predication tree. There may be
// duplications since this is a simple recursive scan, but future
// optimizations will clean it up.
+ // TODO: At the moment the first mask is always skipped, but it would be
+ // better to skip the most expensive mask.
SmallVector<VPValue *, 2> OperandsWithMask;
for (unsigned In = 0; In < NumIncoming; In++) {
- VPValue *EdgeMask =
- createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), *Plan);
- assert((EdgeMask || NumIncoming == 1) &&
- "Multiple predecessors with one having a full mask");
OperandsWithMask.push_back(Operands[In]);
- if (EdgeMask)
- OperandsWithMask.push_back(EdgeMask);
+ VPValue *EdgeMask =
+ getEdgeMask(Phi->getIncomingBlock(In), Phi->getParent());
+ if (!EdgeMask) {
+ assert(In == 0 && "Both null and non-null edge masks found");
+ assert(all_equal(Operands) &&
+ "Distinct incoming values with one having a full mask");
+ break;
+ }
+ if (In == 0)
+ continue;
+ OperandsWithMask.push_back(EdgeMask);
}
- return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
+ return new VPBlendRecipe(Phi, OperandsWithMask);
}
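A scalar sketch of the select chain a blend with three incoming values lowers to, under the operand layout built above ({V0, V1, Mask1, V2, Mask2}, the first edge mask being skipped); the function and names are illustrative only.

// Later incoming values win when their edge mask is set; V0 acts as the
// fall-through, which is why its mask can be omitted.
int blendOfThree(int V0, int V1, bool Mask1, int V2, bool Mask2) {
  int Result = V0;
  Result = Mask1 ? V1 : Result;
  Result = Mask2 ? V2 : Result;
  return Result;
}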
VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
ArrayRef<VPValue *> Operands,
- VFRange &Range,
- VPlanPtr &Plan) {
+ VFRange &Range) {
bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
[this, CI](ElementCount VF) {
return CM.isScalarWithPredication(CI, VF);
@@ -8301,6 +8041,7 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
return nullptr;
SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
+ Ops.push_back(Operands.back());
// Is it beneficial to perform intrinsic call compared to lib call?
bool ShouldUseVectorIntrinsic =
@@ -8311,7 +8052,7 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
},
Range);
if (ShouldUseVectorIntrinsic)
- return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID,
+ return new VPWidenCallRecipe(CI, make_range(Ops.begin(), Ops.end()), ID,
CI->getDebugLoc());
Function *Variant = nullptr;
@@ -8358,13 +8099,13 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
if (Legal->isMaskRequired(CI))
Mask = getBlockInMask(CI->getParent());
else
- Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue(
+ Mask = Plan.getOrAddLiveIn(ConstantInt::getTrue(
IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
Ops.insert(Ops.begin() + *MaskPos, Mask);
}
- return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),
+ return new VPWidenCallRecipe(CI, make_range(Ops.begin(), Ops.end()),
Intrinsic::not_intrinsic, CI->getDebugLoc(),
Variant);
}
@@ -8386,9 +8127,9 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
Range);
}
-VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I,
- ArrayRef<VPValue *> Operands,
- VPBasicBlock *VPBB, VPlanPtr &Plan) {
+VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
+ ArrayRef<VPValue *> Operands,
+ VPBasicBlock *VPBB) {
switch (I->getOpcode()) {
default:
return nullptr;
@@ -8401,12 +8142,9 @@ VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I,
if (CM.isPredicatedInst(I)) {
SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
VPValue *Mask = getBlockInMask(I->getParent());
- VPValue *One = Plan->getVPValueOrAddLiveIn(
- ConstantInt::get(I->getType(), 1u, false));
- auto *SafeRHS =
- new VPInstruction(Instruction::Select, {Mask, Ops[1], One},
- I->getDebugLoc());
- VPBB->appendRecipe(SafeRHS);
+ VPValue *One =
+ Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
+ auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
Ops[1] = SafeRHS;
return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
}
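A scalar model of the safe-divisor transformation above (illustrative, not the recipe's actual codegen): in masked-off lanes the divisor is replaced by 1 so the speculated division cannot trap, and that lane's result is discarded by the surrounding mask anyway.

int safeDivLane(int Numerator, int Divisor, bool LaneActive) {
  int SafeDivisor = LaneActive ? Divisor : 1;   // select(Mask, Ops[1], One)
  return Numerator / SafeDivisor;               // never divides by a masked-off lane's value
}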
@@ -8445,9 +8183,8 @@ void VPRecipeBuilder::fixHeaderPhis() {
}
}
-VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I,
- VFRange &Range,
- VPlan &Plan) {
+VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
+ VFRange &Range) {
bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
[&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
Range);
@@ -8497,29 +8234,30 @@ VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I,
BlockInMask = getBlockInMask(I->getParent());
}
- auto *Recipe = new VPReplicateRecipe(I, Plan.mapToVPValues(I->operands()),
+ // Note that there is some custom logic to mark some intrinsics as uniform
+ // manually above for scalable vectors, which this assert needs to account for
+ // as well.
+ assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
+ (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
+ "Should not predicate a uniform recipe");
+ auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(I->operands()),
IsUniform, BlockInMask);
- return toVPRecipeResult(Recipe);
+ return Recipe;
}
-VPRecipeOrVPValueTy
+VPRecipeBase *
VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
ArrayRef<VPValue *> Operands,
- VFRange &Range, VPBasicBlock *VPBB,
- VPlanPtr &Plan) {
+ VFRange &Range, VPBasicBlock *VPBB) {
// First, check for specific widening recipes that deal with inductions, Phi
// nodes, calls and memory operations.
VPRecipeBase *Recipe;
if (auto Phi = dyn_cast<PHINode>(Instr)) {
if (Phi->getParent() != OrigLoop->getHeader())
- return tryToBlend(Phi, Operands, Plan);
+ return tryToBlend(Phi, Operands);
- // Always record recipes for header phis. Later first-order recurrence phis
- // can have earlier phis as incoming values.
- recordRecipeOf(Phi);
-
- if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range)))
- return toVPRecipeResult(Recipe);
+ if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
+ return Recipe;
VPHeaderPHIRecipe *PhiRecipe = nullptr;
assert((Legal->isReductionVariable(Phi) ||
@@ -8542,22 +8280,13 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
}
- // Record the incoming value from the backedge, so we can add the incoming
- // value from the backedge after all recipes have been created.
- auto *Inc = cast<Instruction>(
- Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
- auto RecipeIter = Ingredient2Recipe.find(Inc);
- if (RecipeIter == Ingredient2Recipe.end())
- recordRecipeOf(Inc);
-
PhisToFix.push_back(PhiRecipe);
- return toVPRecipeResult(PhiRecipe);
+ return PhiRecipe;
}
- if (isa<TruncInst>(Instr) &&
- (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
- Range, *Plan)))
- return toVPRecipeResult(Recipe);
+ if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
+ cast<TruncInst>(Instr), Operands, Range)))
+ return Recipe;
// All widen recipes below deal only with VF > 1.
if (LoopVectorizationPlanner::getDecisionAndClampRange(
@@ -8565,29 +8294,29 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
return nullptr;
if (auto *CI = dyn_cast<CallInst>(Instr))
- return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan));
+ return tryToWidenCall(CI, Operands, Range);
if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
- return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
+ return tryToWidenMemory(Instr, Operands, Range);
if (!shouldWiden(Instr, Range))
return nullptr;
if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
- return toVPRecipeResult(new VPWidenGEPRecipe(
- GEP, make_range(Operands.begin(), Operands.end())));
+ return new VPWidenGEPRecipe(GEP,
+ make_range(Operands.begin(), Operands.end()));
if (auto *SI = dyn_cast<SelectInst>(Instr)) {
- return toVPRecipeResult(new VPWidenSelectRecipe(
- *SI, make_range(Operands.begin(), Operands.end())));
+ return new VPWidenSelectRecipe(
+ *SI, make_range(Operands.begin(), Operands.end()));
}
if (auto *CI = dyn_cast<CastInst>(Instr)) {
- return toVPRecipeResult(new VPWidenCastRecipe(CI->getOpcode(), Operands[0],
- CI->getType(), *CI));
+ return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
+ *CI);
}
- return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan));
+ return tryToWiden(Instr, Operands, VPBB);
}
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
@@ -8603,7 +8332,12 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
VPlanTransforms::truncateToMinimalBitwidths(
*Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
VPlanTransforms::optimize(*Plan, *PSE.getSE());
- assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
+ // TODO: try to put it close to addActiveLaneMask().
+ // Discard the plan if it is not EVL-compatible
+ if (CM.foldTailWithEVL() &&
+ !VPlanTransforms::tryAddExplicitVectorLength(*Plan))
+ break;
+ assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
VPlans.push_back(std::move(Plan));
}
VF = SubRange.End;
@@ -8615,7 +8349,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
DebugLoc DL) {
Value *StartIdx = ConstantInt::get(IdxTy, 0);
- auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx);
+ auto *StartV = Plan.getOrAddLiveIn(StartIdx);
// Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
@@ -8623,27 +8357,22 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
Header->insert(CanonicalIVPHI, Header->begin());
- // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar
- // IV by VF * UF.
- auto *CanonicalIVIncrement =
- new VPInstruction(Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()},
- {HasNUW, false}, DL, "index.next");
+ VPBuilder Builder(TopRegion->getExitingBasicBlock());
+ // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
+ auto *CanonicalIVIncrement = Builder.createOverflowingOp(
+ Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL,
+ "index.next");
CanonicalIVPHI->addOperand(CanonicalIVIncrement);
- VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
- EB->appendRecipe(CanonicalIVIncrement);
-
// Add the BranchOnCount VPInstruction to the latch.
- VPInstruction *BranchBack =
- new VPInstruction(VPInstruction::BranchOnCount,
- {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
- EB->appendRecipe(BranchBack);
+ Builder.createNaryOp(VPInstruction::BranchOnCount,
+ {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
}
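A rough scalar rendering of the control skeleton these canonical-IV recipes describe, assuming illustrative VF = 4 and UF = 2 and a vector trip count that is already a multiple of VF * UF.

void vectorLoopSkeleton(unsigned VectorTripCount) {
  const unsigned VFxUF = 4 * 2;                 // Plan.getVFxUF()
  unsigned Index = 0;                           // canonical IV, starts at 0
  do {
    // ... widened body for scalar iterations [Index, Index + VFxUF) ...
    Index += VFxUF;                             // index.next (CanonicalIVIncrement)
  } while (Index != VectorTripCount);           // BranchOnCount
}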
// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
// original exit block.
static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop,
- VPlan &Plan) {
+ VPRecipeBuilder &Builder, VPlan &Plan) {
BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
// Only handle single-exit loops with unique exit blocks for now.
@@ -8654,17 +8383,115 @@ static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop,
for (PHINode &ExitPhi : ExitBB->phis()) {
Value *IncomingValue =
ExitPhi.getIncomingValueForBlock(ExitingBB);
- VPValue *V = Plan.getVPValueOrAddLiveIn(IncomingValue);
+ VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue, Plan);
+ // Exit values for inductions are computed and updated outside of VPlan and
+ // independent of induction recipes.
+ // TODO: Compute induction exit values in VPlan, use VPLiveOuts to update
+ // live-outs.
+ if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
+ !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
+ isa<VPWidenPointerInductionRecipe>(V))
+ continue;
Plan.addLiveOut(&ExitPhi, V);
}
}
+/// If the middle block branches to the scalar preheader, feed a resume value
+/// for every first-order recurrence (FOR) from the vector loop to the scalar
+/// loop: introduce an ExtractFromEnd recipe in the middle block and a
+/// ResumePhi recipe in the scalar preheader, plus a VPLiveOut that uses the
+/// latter and corresponds to the scalar header phi.
+static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) {
+ VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
+
+ // Start by finding out whether the middle block branches to the scalar
+ // preheader which, unlike the exit block (the other possible successor of
+ // the middle block), is not a VPIRBasicBlock.
+ // TODO: Should be replaced by
+ // Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the
+ // scalar region is modeled as well.
+ VPBasicBlock *ScalarPHVPBB = nullptr;
+ auto *MiddleVPBB = cast<VPBasicBlock>(VectorRegion->getSingleSuccessor());
+ for (VPBlockBase *Succ : MiddleVPBB->getSuccessors()) {
+ if (isa<VPIRBasicBlock>(Succ))
+ continue;
+ assert(!ScalarPHVPBB && "Two candidates for ScalarPHVPBB?");
+ ScalarPHVPBB = cast<VPBasicBlock>(Succ);
+ }
+ if (!ScalarPHVPBB)
+ return;
+
+ VPBuilder ScalarPHBuilder(ScalarPHVPBB);
+ VPBuilder MiddleBuilder(MiddleVPBB);
+ // Reset the insert point so that new recipes are inserted before the
+ // terminator and, when the branch condition is defined in the middle block,
+ // before that condition as well.
+ if (auto *Terminator = MiddleVPBB->getTerminator()) {
+ auto *Condition = dyn_cast<VPInstruction>(Terminator->getOperand(0));
+ assert((!Condition || Condition->getParent() == MiddleVPBB) &&
+ "Condition expected in MiddleVPBB");
+ MiddleBuilder.setInsertPoint(Condition ? Condition : Terminator);
+ }
+ VPValue *OneVPV = Plan.getOrAddLiveIn(
+ ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
+
+ for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
+ auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
+ if (!FOR)
+ continue;
+
+ // Extract the resume value and create a new VPLiveOut for it.
+ auto *Resume = MiddleBuilder.createNaryOp(VPInstruction::ExtractFromEnd,
+ {FOR->getBackedgeValue(), OneVPV},
+ {}, "vector.recur.extract");
+ auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp(
+ VPInstruction::ResumePhi, {Resume, FOR->getStartValue()}, {},
+ "scalar.recur.init");
+ Plan.addLiveOut(cast<PHINode>(FOR->getUnderlyingInstr()), ResumePhiRecipe);
+ }
+}
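For context, a source loop containing a first-order recurrence; the last value the vector loop produces for `Prev` is what ExtractFromEnd pulls out ("vector.recur.extract") and what ResumePhi feeds into the scalar remainder loop ("scalar.recur.init"). The function below is only an illustration.

void firstOrderRecurrence(const int *A, int *B, int N) {
  int Prev = 0;                  // FOR start value
  for (int I = 0; I < N; ++I) {
    B[I] = A[I] + Prev;          // uses the value from the previous iteration
    Prev = A[I];                 // backedge value of the recurrence
  }
}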
+
VPlanPtr
LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
- VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
+ // ---------------------------------------------------------------------------
+ // Build initial VPlan: Scan the body of the loop in a topological order to
+ // visit each basic block after having visited its predecessor basic blocks.
+ // ---------------------------------------------------------------------------
+
+ // Create initial VPlan skeleton, having a basic block for the pre-header
+ // which contains SCEV expansions that need to happen before the CFG is
+ // modified; a basic block for the vector pre-header, followed by a region for
+ // the vector loop, followed by the middle basic block. The skeleton vector
+ // loop region contains a header and latch basic blocks.
+
+ bool RequiresScalarEpilogueCheck =
+ LoopVectorizationPlanner::getDecisionAndClampRange(
+ [this](ElementCount VF) {
+ return !CM.requiresScalarEpilogue(VF.isVector());
+ },
+ Range);
+ VPlanPtr Plan = VPlan::createInitialVPlan(
+ createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
+ *PSE.getSE(), RequiresScalarEpilogueCheck, CM.foldTailByMasking(),
+ OrigLoop);
+
+ // Don't use getDecisionAndClampRange here, because we don't know the UF, so
+ // it is better to be conservative here rather than to split this up into
+ // different VPlans.
+ // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
+ bool IVUpdateMayOverflow = false;
+ for (ElementCount VF : Range)
+ IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
+
+ DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
+ TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
+ // When not folding the tail, we know that the induction increment will not
+ // overflow.
+ bool HasNUW = Style == TailFoldingStyle::None;
+ addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
+
+ VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
// ---------------------------------------------------------------------------
// Pre-construction: record ingredients whose recipes we'll need to further
@@ -8690,55 +8517,26 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
if (!getDecisionAndClampRange(applyIG, Range))
continue;
InterleaveGroups.insert(IG);
- for (unsigned i = 0; i < IG->getFactor(); i++)
- if (Instruction *Member = IG->getMember(i))
- RecipeBuilder.recordRecipeOf(Member);
};
// ---------------------------------------------------------------------------
- // Build initial VPlan: Scan the body of the loop in a topological order to
- // visit each basic block after having visited its predecessor basic blocks.
+ // Construct recipes for the instructions in the loop
// ---------------------------------------------------------------------------
- // Create initial VPlan skeleton, having a basic block for the pre-header
- // which contains SCEV expansions that need to happen before the CFG is
- // modified; a basic block for the vector pre-header, followed by a region for
- // the vector loop, followed by the middle basic block. The skeleton vector
- // loop region contains a header and latch basic blocks.
- VPlanPtr Plan = VPlan::createInitialVPlan(
- createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
- *PSE.getSE());
- VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
- VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
- VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
- Plan->getVectorLoopRegion()->setEntry(HeaderVPBB);
- Plan->getVectorLoopRegion()->setExiting(LatchVPBB);
-
- // Don't use getDecisionAndClampRange here, because we don't know the UF
- // so this function is better to be conservative, rather than to split
- // it up into different VPlans.
- // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
- bool IVUpdateMayOverflow = false;
- for (ElementCount VF : Range)
- IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
-
- DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
- TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
- // When not folding the tail, we know that the induction increment will not
- // overflow.
- bool HasNUW = Style == TailFoldingStyle::None;
- addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
-
// Scan the body of the loop in a topological order to visit each basic block
// after having visited its predecessor basic blocks.
LoopBlocksDFS DFS(OrigLoop);
DFS.perform(LI);
+ VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock();
VPBasicBlock *VPBB = HeaderVPBB;
- bool NeedsMasks = CM.foldTailByMasking() ||
- any_of(OrigLoop->blocks(), [this](BasicBlock *BB) {
- return Legal->blockNeedsPredication(BB);
- });
+ BasicBlock *HeaderBB = OrigLoop->getHeader();
+ bool NeedsMasks =
+ CM.foldTailByMasking() ||
+ any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) {
+ bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
+ return Legal->blockNeedsPredication(BB) || NeedsBlends;
+ });
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
// Relevant instructions from basic block BB will be grouped into VPRecipe
// ingredients and fill a new VPBasicBlock.
@@ -8747,9 +8545,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
Builder.setInsertPoint(VPBB);
if (VPBB == HeaderVPBB)
- RecipeBuilder.createHeaderMask(*Plan);
+ RecipeBuilder.createHeaderMask();
else if (NeedsMasks)
- RecipeBuilder.createBlockInMask(BB, *Plan);
+ RecipeBuilder.createBlockInMask(BB);
// Introduce each ingredient into VPlan.
// TODO: Model and preserve debug intrinsics in VPlan.
@@ -8757,11 +8555,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
Instruction *Instr = &I;
SmallVector<VPValue *, 4> Operands;
auto *Phi = dyn_cast<PHINode>(Instr);
- if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
- Operands.push_back(Plan->getVPValueOrAddLiveIn(
+ if (Phi && Phi->getParent() == HeaderBB) {
+ Operands.push_back(Plan->getOrAddLiveIn(
Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
} else {
- auto OpRange = Plan->mapToVPValues(Instr->operands());
+ auto OpRange = RecipeBuilder.mapToVPValues(Instr->operands());
Operands = {OpRange.begin(), OpRange.end()};
}
@@ -8772,26 +8570,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
continue;
- auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
- Instr, Operands, Range, VPBB, Plan);
- if (!RecipeOrValue)
- RecipeOrValue = RecipeBuilder.handleReplication(Instr, Range, *Plan);
- // If Instr can be simplified to an existing VPValue, use it.
- if (isa<VPValue *>(RecipeOrValue)) {
- auto *VPV = cast<VPValue *>(RecipeOrValue);
- Plan->addVPValue(Instr, VPV);
- // If the re-used value is a recipe, register the recipe for the
- // instruction, in case the recipe for Instr needs to be recorded.
- if (VPRecipeBase *R = VPV->getDefiningRecipe())
- RecipeBuilder.setRecipe(Instr, R);
- continue;
- }
- // Otherwise, add the new recipe.
- VPRecipeBase *Recipe = cast<VPRecipeBase *>(RecipeOrValue);
- for (auto *Def : Recipe->definedValues()) {
- auto *UV = Def->getUnderlyingValue();
- Plan->addVPValue(UV, Def);
- }
+ VPRecipeBase *Recipe =
+ RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
+ if (!Recipe)
+ Recipe = RecipeBuilder.handleReplication(Instr, Range);
RecipeBuilder.setRecipe(Instr, Recipe);
if (isa<VPHeaderPHIRecipe>(Recipe)) {
@@ -8823,7 +8605,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
// and there is nothing to fix from vector loop; phis should have incoming
// from scalar loop only.
} else
- addUsersInExitBlock(HeaderVPBB, OrigLoop, *Plan);
+ addUsersInExitBlock(HeaderVPBB, OrigLoop, RecipeBuilder, *Plan);
assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
!Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
@@ -8831,30 +8613,33 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
"VPBasicBlock");
RecipeBuilder.fixHeaderPhis();
+ addLiveOutsForFirstOrderRecurrences(*Plan);
+
// ---------------------------------------------------------------------------
// Transform initial VPlan: Apply previously taken decisions, in order, to
// bring the VPlan to its final state.
// ---------------------------------------------------------------------------
// Adjust the recipes for any inloop reductions.
- adjustRecipesForReductions(LatchVPBB, Plan, RecipeBuilder, Range.Start);
+ adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
// Interleave memory: for each Interleave Group we marked earlier as relevant
// for this VPlan, replace the Recipes widening its memory instructions with a
// single VPInterleaveRecipe at its insertion point.
for (const auto *IG : InterleaveGroups) {
- auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
- RecipeBuilder.getRecipe(IG->getInsertPos()));
+ auto *Recipe =
+ cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getInsertPos()));
SmallVector<VPValue *, 4> StoredValues;
for (unsigned i = 0; i < IG->getFactor(); ++i)
if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
- auto *StoreR =
- cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
+ auto *StoreR = cast<VPWidenStoreRecipe>(RecipeBuilder.getRecipe(SI));
StoredValues.push_back(StoreR->getStoredValue());
}
bool NeedsMaskForGaps =
IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed();
+ assert((!NeedsMaskForGaps || useMaskedInterleavedAccesses(CM.TTI)) &&
+ "masked interleaved groups are not allowed.");
auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
Recipe->getMask(), NeedsMaskForGaps);
VPIG->insertBefore(Recipe);
@@ -8883,17 +8668,31 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
// Only handle constant strides for now.
if (!ScevStride)
continue;
- Constant *CI = ConstantInt::get(Stride->getType(), ScevStride->getAPInt());
- auto *ConstVPV = Plan->getVPValueOrAddLiveIn(CI);
- // The versioned value may not be used in the loop directly, so just add a
- // new live-in in those cases.
- Plan->getVPValueOrAddLiveIn(StrideV)->replaceAllUsesWith(ConstVPV);
+ auto *CI = Plan->getOrAddLiveIn(
+ ConstantInt::get(Stride->getType(), ScevStride->getAPInt()));
+ if (VPValue *StrideVPV = Plan->getLiveIn(StrideV))
+ StrideVPV->replaceAllUsesWith(CI);
+
+ // The versioned value may not be used in the loop directly but through a
+ // sext/zext. Add new live-ins in those cases.
+ for (Value *U : StrideV->users()) {
+ if (!isa<SExtInst, ZExtInst>(U))
+ continue;
+ VPValue *StrideVPV = Plan->getLiveIn(U);
+ if (!StrideVPV)
+ continue;
+ unsigned BW = U->getType()->getScalarSizeInBits();
+ APInt C = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW)
+ : ScevStride->getAPInt().zext(BW);
+ VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C));
+ StrideVPV->replaceAllUsesWith(CI);
+ }
}
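An illustrative loop for the sext/zext handling above: after versioning proves the stride is 1, both the 32-bit stride live-in and the 64-bit extended form through which it reaches the address computation can be folded to constants. Names below are made up.

void strideOne(int *A, long N, int Stride /* versioned: known to be 1 */) {
  for (long I = 0; I < N; ++I)
    A[I * (long)Stride] = 0;     // Stride only reaches the GEP via sext i32 -> i64
}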
- // From this point onwards, VPlan-to-VPlan transformations may change the plan
- // in ways that accessing values using original IR values is incorrect.
- Plan->disableValue2VPValue();
+ VPlanTransforms::dropPoisonGeneratingRecipes(*Plan, [this](BasicBlock *BB) {
+ return Legal->blockNeedsPredication(BB);
+ });
// Sink users of fixed-order recurrence past the recipe defining the previous
// value and introduce FirstOrderRecurrenceSplice VPInstructions.
@@ -8923,7 +8722,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
// Create new empty VPlan
auto Plan = VPlan::createInitialVPlan(
createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
- *PSE.getSE());
+ *PSE.getSE(), true, false, OrigLoop);
// Build hierarchical CFG
VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
@@ -8948,6 +8747,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
bool HasNUW = true;
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
DebugLoc());
+ assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
return Plan;
}
@@ -8960,9 +8760,12 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
// A ComputeReductionResult recipe is added to the middle block, also for
// in-loop reductions which compute their result in-loop, because generating
// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
+//
+// Adjust AnyOf reductions; replace the reduction phi for the selected value
+// with a boolean reduction phi node to check if the condition is true in any
+// iteration. The final value is selected by the final ComputeReductionResult.
void LoopVectorizationPlanner::adjustRecipesForReductions(
- VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
- ElementCount MinVF) {
+ VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
// Gather all VPReductionPHIRecipe and sort them so that Intermediate stores
@@ -9034,7 +8837,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
// the phi until LoopExitValue. We keep track of the previous item
// (PreviousLink) to tell which of the two operands of a Link will remain
// scalar and which will be reduced. For minmax by select(cmp), Link will be
- // the select instructions.
+ // the select instructions. Blend recipes of in-loop reduction phis will
+ // get folded to their non-phi operand, as the reduction recipe handles the
+ // condition directly.
VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
@@ -9065,6 +8870,20 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
VecOp = FMulRecipe;
} else {
+ auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink);
+ if (PhiR->isInLoop() && Blend) {
+ assert(Blend->getNumIncomingValues() == 2 &&
+ "Blend must have 2 incoming values");
+ if (Blend->getIncomingValue(0) == PhiR)
+ Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
+ else {
+ assert(Blend->getIncomingValue(1) == PhiR &&
+ "PhiR must be an operand of the blend");
+ Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
+ }
+ continue;
+ }
+
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
if (isa<VPWidenRecipe>(CurrentLink)) {
assert(isa<CmpInst>(CurrentLinkI) &&
@@ -9095,14 +8914,12 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
BasicBlock *BB = CurrentLinkI->getParent();
VPValue *CondOp = nullptr;
- if (CM.blockNeedsPredicationForAnyReason(BB)) {
- VPBuilder::InsertPointGuard Guard(Builder);
- Builder.setInsertPoint(CurrentLink);
+ if (CM.blockNeedsPredicationForAnyReason(BB))
CondOp = RecipeBuilder.getBlockInMask(BB);
- }
- VPReductionRecipe *RedRecipe = new VPReductionRecipe(
- RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp);
+ VPReductionRecipe *RedRecipe =
+ new VPReductionRecipe(RdxDesc, CurrentLinkI, PreviousLink, VecOp,
+ CondOp, CM.useOrderedReductions(RdxDesc));
// Append the recipe to the end of the VPBasicBlock because we need to
// ensure that it comes after all of its inputs, including CondOp.
// Note that this transformation may leave over dead recipes (including
@@ -9112,7 +8929,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
PreviousLink = RedRecipe;
}
}
+ VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
Builder.setInsertPoint(&*LatchVPBB->begin());
+ VPBasicBlock *MiddleVPBB =
+ cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor());
+ VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
for (VPRecipeBase &R :
Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
@@ -9120,6 +8941,41 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
continue;
const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
+ // Adjust AnyOf reductions; replace the reduction phi for the selected value
+ // with a boolean reduction phi node to check if the condition is true in
+ // any iteration. The final value is selected by the final
+ // ComputeReductionResult.
+ if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
+ RdxDesc.getRecurrenceKind())) {
+ auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) {
+ return isa<VPWidenSelectRecipe>(U) ||
+ (isa<VPReplicateRecipe>(U) &&
+ cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() ==
+ Instruction::Select);
+ }));
+ VPValue *Cmp = Select->getOperand(0);
+ // If the compare is checking the reduction PHI node, adjust it to check
+ // the start value.
+ if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) {
+ for (unsigned I = 0; I != CmpR->getNumOperands(); ++I)
+ if (CmpR->getOperand(I) == PhiR)
+ CmpR->setOperand(I, PhiR->getStartValue());
+ }
+ VPBuilder::InsertPointGuard Guard(Builder);
+ Builder.setInsertPoint(Select);
+
+ // If the true value of the select is the reduction phi, the new value is
+ // selected if the negated condition is true in any iteration.
+ if (Select->getOperand(1) == PhiR)
+ Cmp = Builder.createNot(Cmp);
+ VPValue *Or = Builder.createOr(PhiR, Cmp);
+ Select->getVPSingleValue()->replaceAllUsesWith(Or);
+
+ // Convert the reduction phi to operate on bools.
+ PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
+ OrigLoop->getHeader()->getContext())));
+ }
+
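A source-level picture of the AnyOf rewrite above (illustrative only): instead of carrying the selected value through the vector loop, the loop reduces the per-iteration conditions with 'or', and the final ComputeReductionResult picks between the start value and the invariant selected value once at the end.

int anyOfReduction(const int *A, int N) {
  int R = -1;                        // reduction start value
  for (int I = 0; I < N; ++I)
    if (A[I] > 42)
      R = 3;                         // AnyOf: the same invariant value every time
  return R;
  // Vectorized form: 'or' the conditions across all iterations, then select
  // once after the loop: R == (anyConditionHeld ? 3 : -1).
}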
// If tail is folded by masking, introduce selects between the phi
// and the live-out instruction of each reduction, at the beginning of the
// dedicated latch block.
@@ -9152,7 +9008,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
// then extend the loop exit value to enable InstCombine to evaluate the
// entire expression in the smaller type.
Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
- if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
+ if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
+ !RecurrenceDescriptor::isAnyOfRecurrenceKind(
+ RdxDesc.getRecurrenceKind())) {
assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
Type *RdxTy = RdxDesc.getRecurrenceType();
auto *Trunc =
@@ -9184,8 +9042,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
// also modeled in VPlan.
auto *FinalReductionResult = new VPInstruction(
VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
- cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor())
- ->appendRecipe(FinalReductionResult);
+ FinalReductionResult->insertBefore(*MiddleVPBB, IP);
OrigExitingVPV->replaceUsesWithIf(
FinalReductionResult,
[](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); });
@@ -9194,91 +9051,29 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
VPlanTransforms::clearReductionWrapFlags(*Plan);
}
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
- O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
- IG->getInsertPos()->printAsOperand(O, false);
- O << ", ";
- getAddr()->printAsOperand(O, SlotTracker);
- VPValue *Mask = getMask();
- if (Mask) {
- O << ", ";
- Mask->printAsOperand(O, SlotTracker);
- }
-
- unsigned OpIdx = 0;
- for (unsigned i = 0; i < IG->getFactor(); ++i) {
- if (!IG->getMember(i))
- continue;
- if (getNumStoreOperands() > 0) {
- O << "\n" << Indent << " store ";
- getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
- O << " to index " << i;
- } else {
- O << "\n" << Indent << " ";
- getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
- O << " = load from index " << i;
- }
- ++OpIdx;
- }
-}
-#endif
-
void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
"Not a pointer induction according to InductionDescriptor!");
assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
"Unexpected type.");
+ assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
+ "Recipe should have been replaced");
auto *IVR = getParent()->getPlan()->getCanonicalIV();
- PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
-
- if (onlyScalarsGenerated(State.VF)) {
- // This is the normalized GEP that starts counting at zero.
- Value *PtrInd = State.Builder.CreateSExtOrTrunc(
- CanonicalIV, IndDesc.getStep()->getType());
- // Determine the number of scalars we need to generate for each unroll
- // iteration. If the instruction is uniform, we only need to generate the
- // first lane. Otherwise, we generate all VF values.
- bool IsUniform = vputils::onlyFirstLaneUsed(this);
- assert((IsUniform || !State.VF.isScalable()) &&
- "Cannot scalarize a scalable VF");
- unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
-
- for (unsigned Part = 0; Part < State.UF; ++Part) {
- Value *PartStart =
- createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part);
-
- for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
- Value *Idx = State.Builder.CreateAdd(
- PartStart, ConstantInt::get(PtrInd->getType(), Lane));
- Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx);
-
- Value *Step = State.get(getOperand(1), VPIteration(Part, Lane));
- Value *SclrGep = emitTransformedIndex(
- State.Builder, GlobalIdx, IndDesc.getStartValue(), Step,
- IndDesc.getKind(), IndDesc.getInductionBinOp());
- SclrGep->setName("next.gep");
- State.set(this, SclrGep, VPIteration(Part, Lane));
- }
- }
- return;
- }
-
+ PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true));
Type *PhiType = IndDesc.getStep()->getType();
// Build a pointer phi
Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
Type *ScStValueType = ScalarStartValue->getType();
- PHINode *NewPointerPhi =
- PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
+ PHINode *NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi",
+ CanonicalIV->getIterator());
BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
// A pointer induction, performed by using a gep
- Instruction *InductionLoc = &*State.Builder.GetInsertPoint();
+ BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
@@ -9329,84 +9124,21 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) {
State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
Value *Step = State.get(getStepValue(), VPIteration(0, 0));
- Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0));
+ Value *CanonicalIV = State.get(getOperand(1), VPIteration(0, 0));
Value *DerivedIV = emitTransformedIndex(
State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
Kind, cast_if_present<BinaryOperator>(FPBinOp));
DerivedIV->setName("offset.idx");
- if (TruncResultTy) {
- assert(TruncResultTy != DerivedIV->getType() &&
- Step->getType()->isIntegerTy() &&
- "Truncation requires an integer step");
- DerivedIV = State.Builder.CreateTrunc(DerivedIV, TruncResultTy);
- }
assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
State.set(this, DerivedIV, VPIteration(0, 0));
}
-void VPInterleaveRecipe::execute(VPTransformState &State) {
- assert(!State.Instance && "Interleave group being replicated.");
- State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
- getStoredValues(), getMask(),
- NeedsMaskForGaps);
-}
-
-void VPReductionRecipe::execute(VPTransformState &State) {
- assert(!State.Instance && "Reduction being replicated.");
- Value *PrevInChain = State.get(getChainOp(), 0);
- RecurKind Kind = RdxDesc.getRecurrenceKind();
- bool IsOrdered = State.ILV->useOrderedReductions(RdxDesc);
- // Propagate the fast-math flags carried by the underlying instruction.
- IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
- State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
- for (unsigned Part = 0; Part < State.UF; ++Part) {
- Value *NewVecOp = State.get(getVecOp(), Part);
- if (VPValue *Cond = getCondOp()) {
- Value *NewCond = State.VF.isVector() ? State.get(Cond, Part)
- : State.get(Cond, {Part, 0});
- VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
- Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
- Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy,
- RdxDesc.getFastMathFlags());
- if (State.VF.isVector()) {
- Iden =
- State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
- }
-
- Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden);
- NewVecOp = Select;
- }
- Value *NewRed;
- Value *NextInChain;
- if (IsOrdered) {
- if (State.VF.isVector())
- NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp,
- PrevInChain);
- else
- NewRed = State.Builder.CreateBinOp(
- (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
- NewVecOp);
- PrevInChain = NewRed;
- } else {
- PrevInChain = State.get(getChainOp(), Part);
- NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp);
- }
- if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
- NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
- NewRed, PrevInChain);
- } else if (IsOrdered)
- NextInChain = NewRed;
- else
- NextInChain = State.Builder.CreateBinOp(
- (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
- State.set(this, NextInChain, Part);
- }
-}
-
void VPReplicateRecipe::execute(VPTransformState &State) {
Instruction *UI = getUnderlyingInstr();
if (State.Instance) { // Generate a single instance.
+ assert((State.VF.isScalar() || !isUniform()) &&
+ "uniform recipe shouldn't be predicated");
assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
State.ILV->scalarizeInstruction(UI, this, *State.Instance, State);
// Insert scalar instance packing it into a vector.
@@ -9464,98 +9196,180 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
}
-void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
- VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
-
- // Attempt to issue a wide load.
- LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
- StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
-
- assert((LI || SI) && "Invalid Load/Store instruction");
- assert((!SI || StoredValue) && "No stored value provided for widened store");
- assert((!LI || !StoredValue) && "Stored value provided for widened load");
+void VPWidenLoadRecipe::execute(VPTransformState &State) {
+ auto *LI = cast<LoadInst>(&Ingredient);
Type *ScalarDataTy = getLoadStoreType(&Ingredient);
-
auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
const Align Alignment = getLoadStoreAlignment(&Ingredient);
- bool CreateGatherScatter = !isConsecutive();
+ bool CreateGather = !isConsecutive();
auto &Builder = State.Builder;
- InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
- bool isMaskRequired = getMask();
- if (isMaskRequired) {
- // Mask reversal is only needed for non-all-one (null) masks, as reverse of
- // a null all-one mask is a null mask.
- for (unsigned Part = 0; Part < State.UF; ++Part) {
- Value *Mask = State.get(getMask(), Part);
+ State.setDebugLocFrom(getDebugLoc());
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value *NewLI;
+ Value *Mask = nullptr;
+ if (auto *VPMask = getMask()) {
+ // Mask reversal is only needed for non-all-one (null) masks, as reverse
+ // of a null all-one mask is a null mask.
+ Mask = State.get(VPMask, Part);
if (isReverse())
Mask = Builder.CreateVectorReverse(Mask, "reverse");
- BlockInMaskParts[Part] = Mask;
}
+
+ Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateGather);
+ if (CreateGather) {
+ NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
+ "wide.masked.gather");
+ } else if (Mask) {
+ NewLI = Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
+ PoisonValue::get(DataTy),
+ "wide.masked.load");
+ } else {
+ NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
+ }
+ // Add metadata to the load, but setVectorValue to the reverse shuffle.
+ State.addMetadata(NewLI, LI);
+ if (Reverse)
+ NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
+ State.set(this, NewLI, Part);
}
+}
- // Handle Stores:
- if (SI) {
- State.setDebugLocFrom(SI->getDebugLoc());
+/// Use an all-true mask for the reverse rather than the actual mask, as this
+/// avoids a dependence without affecting the result.
+static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
+ Value *EVL, const Twine &Name) {
+ VectorType *ValTy = cast<VectorType>(Operand->getType());
+ Value *AllTrueMask =
+ Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
+ return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
+ {Operand, AllTrueMask, EVL}, nullptr, Name);
+}
- for (unsigned Part = 0; Part < State.UF; ++Part) {
- Instruction *NewSI = nullptr;
- Value *StoredVal = State.get(StoredValue, Part);
- if (CreateGatherScatter) {
- Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
- Value *VectorGep = State.get(getAddr(), Part);
- NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
- MaskPart);
- } else {
- if (isReverse()) {
- // If we store to reverse consecutive memory locations, then we need
- // to reverse the order of elements in the stored value.
- StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
- // We don't want to update the value in the map as it might be used in
- // another expression. So don't call resetVectorValue(StoredVal).
- }
- auto *VecPtr = State.get(getAddr(), Part);
- if (isMaskRequired)
- NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
- BlockInMaskParts[Part]);
- else
- NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
- }
- State.addMetadata(NewSI, SI);
- }
- return;
+void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
+ assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
+ "explicit vector length.");
+ auto *LI = cast<LoadInst>(&Ingredient);
+
+ Type *ScalarDataTy = getLoadStoreType(&Ingredient);
+ auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
+ const Align Alignment = getLoadStoreAlignment(&Ingredient);
+ bool CreateGather = !isConsecutive();
+
+ auto &Builder = State.Builder;
+ State.setDebugLocFrom(getDebugLoc());
+ CallInst *NewLI;
+ Value *EVL = State.get(getEVL(), VPIteration(0, 0));
+ Value *Addr = State.get(getAddr(), 0, !CreateGather);
+ Value *Mask = nullptr;
+ if (VPValue *VPMask = getMask()) {
+ Mask = State.get(VPMask, 0);
+ if (isReverse())
+ Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
+ } else {
+ Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
+ }
+
+ if (CreateGather) {
+ NewLI =
+ Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
+ nullptr, "wide.masked.gather");
+ } else {
+ VectorBuilder VBuilder(Builder);
+ VBuilder.setEVL(EVL).setMask(Mask);
+ NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
+ Instruction::Load, DataTy, Addr, "vp.op.load"));
}
+ NewLI->addParamAttr(
+ 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
+ State.addMetadata(NewLI, LI);
+ Instruction *Res = NewLI;
+ if (isReverse())
+ Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
+ State.set(this, Res, 0);
+}
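A scalar model (not LLVM code) of the EVL-predicated load generated above: only lanes below EVL whose mask bit is set are read. This model simply skips the other lanes; what the real intrinsic produces for them is not specified here.

void vpLoadModel(const int *Addr, const bool *Mask, unsigned EVL, unsigned VF,
                 int *Result) {
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    if (Lane < EVL && Mask[Lane])
      Result[Lane] = Addr[Lane];
}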
+
+void VPWidenStoreRecipe::execute(VPTransformState &State) {
+ auto *SI = cast<StoreInst>(&Ingredient);
+
+ VPValue *StoredVPValue = getStoredValue();
+ bool CreateScatter = !isConsecutive();
+ const Align Alignment = getLoadStoreAlignment(&Ingredient);
+
+ auto &Builder = State.Builder;
+ State.setDebugLocFrom(getDebugLoc());
- // Handle loads.
- assert(LI && "Must have a load instruction");
- State.setDebugLocFrom(LI->getDebugLoc());
for (unsigned Part = 0; Part < State.UF; ++Part) {
- Value *NewLI;
- if (CreateGatherScatter) {
- Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
- Value *VectorGep = State.get(getAddr(), Part);
- NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
- nullptr, "wide.masked.gather");
- State.addMetadata(NewLI, LI);
- } else {
- auto *VecPtr = State.get(getAddr(), Part);
- if (isMaskRequired)
- NewLI = Builder.CreateMaskedLoad(
- DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
- PoisonValue::get(DataTy), "wide.masked.load");
- else
- NewLI =
- Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
+ Instruction *NewSI = nullptr;
+ Value *Mask = nullptr;
+ if (auto *VPMask = getMask()) {
+ // Mask reversal is only needed for non-all-one (null) masks, as reverse
+ // of a null all-one mask is a null mask.
+ Mask = State.get(VPMask, Part);
+ if (isReverse())
+ Mask = Builder.CreateVectorReverse(Mask, "reverse");
+ }
- // Add metadata to the load, but setVectorValue to the reverse shuffle.
- State.addMetadata(NewLI, LI);
- if (Reverse)
- NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
+ Value *StoredVal = State.get(StoredVPValue, Part);
+ if (isReverse()) {
+ // If we store to reverse consecutive memory locations, then we need
+ // to reverse the order of elements in the stored value.
+ StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
+ // We don't want to update the value in the map as it might be used in
+ // another expression. So don't call resetVectorValue(StoredVal).
}
+ Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateScatter);
+ if (CreateScatter)
+ NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
+ else if (Mask)
+ NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
+ else
+ NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
+ State.addMetadata(NewSI, SI);
+ }
+}
- State.set(getVPSingleValue(), NewLI, Part);
+void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
+ assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
+ "explicit vector length.");
+ auto *SI = cast<StoreInst>(&Ingredient);
+
+ VPValue *StoredValue = getStoredValue();
+ bool CreateScatter = !isConsecutive();
+ const Align Alignment = getLoadStoreAlignment(&Ingredient);
+
+ auto &Builder = State.Builder;
+ State.setDebugLocFrom(getDebugLoc());
+
+ CallInst *NewSI = nullptr;
+ Value *StoredVal = State.get(StoredValue, 0);
+ Value *EVL = State.get(getEVL(), VPIteration(0, 0));
+ if (isReverse())
+ StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
+ Value *Mask = nullptr;
+ if (VPValue *VPMask = getMask()) {
+ Mask = State.get(VPMask, 0);
+ if (isReverse())
+ Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
+ } else {
+ Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
}
+ Value *Addr = State.get(getAddr(), 0, !CreateScatter);
+ if (CreateScatter) {
+ NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
+ Intrinsic::vp_scatter,
+ {StoredVal, Addr, Mask, EVL});
+ } else {
+ VectorBuilder VBuilder(Builder);
+ VBuilder.setEVL(EVL).setMask(Mask);
+ NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
+ Instruction::Store, Type::getVoidTy(EVL->getContext()),
+ {StoredVal, Addr}));
+ }
+ NewSI->addParamAttr(
+ 1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
+ State.addMetadata(NewSI, SI);
}
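The same lane rule applies to the EVL store above; a matching sketch follows, again with an invented helper name and no claim to be the actual lowering.

#include <algorithm>
#include <cstdint>
#include <vector>

// Scalar model of the unit-stride llvm.vp.store emitted above: only lanes
// below EVL whose mask bit is set write memory; all other destination
// elements are left untouched.
void modelVPStore(int32_t *Addr, const std::vector<int32_t> &StoredVal,
                  const std::vector<bool> &Mask, unsigned EVL) {
  unsigned VF = static_cast<unsigned>(StoredVal.size());
  for (unsigned Lane = 0; Lane < std::min(VF, EVL); ++Lane)
    if (Mask[Lane])
      Addr[Lane] = StoredVal[Lane];
}
// Reverse accesses first reverse the stored value and the mask with
// vp.reverse, exactly as the recipe does before issuing the store.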
// Determine how to lower the scalar epilogue, which depends on 1) optimising
@@ -9658,7 +9472,7 @@ static bool processLoopInVPlanNativePath(
bool AddBranchWeights =
hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
- F->getParent()->getDataLayout(), AddBranchWeights);
+ F->getDataLayout(), AddBranchWeights);
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
@@ -9741,7 +9555,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
}
  // The scalar cost should only be 0 when vectorizing with a user specified
  // VF/IC. In those cases, runtime checks should always be generated.
- double ScalarC = *VF.ScalarCost.getValue();
+ uint64_t ScalarC = *VF.ScalarCost.getValue();
if (ScalarC == 0)
return true;
@@ -9768,7 +9582,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
// RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
//
// Now we can compute the minimum required trip count TC as
- // (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC
+ // VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
//
// For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
// the computations are performed on doubles, not integers and the result
@@ -9780,9 +9594,9 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
AssumedMinimumVscale = *VScale;
IntVF *= AssumedMinimumVscale;
}
- double VecCOverVF = double(*VF.Cost.getValue()) / IntVF;
- double RtC = *CheckCost.getValue();
- double MinTC1 = RtC / (ScalarC - VecCOverVF);
+ uint64_t RtC = *CheckCost.getValue();
+ uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
+ uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
// Second, compute a minimum iteration count so that the cost of the
// runtime checks is only a fraction of the total scalar loop cost. This
@@ -9791,12 +9605,12 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
// * TC. To bound the runtime check to be a fraction 1/X of the scalar
// cost, compute
// RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
- double MinTC2 = RtC * 10 / ScalarC;
+ uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC);
// Now pick the larger minimum. If it is not a multiple of VF and a scalar
// epilogue is allowed, choose the next closest multiple of VF. This should
// partly compensate for ignoring the epilogue cost.
- uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
+ uint64_t MinTC = std::max(MinTC1, MinTC2);
if (SEL == CM_ScalarEpilogueAllowed)
MinTC = alignTo(MinTC, IntVF);
VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);
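To make the integer arithmetic above concrete, here is a small standalone walk-through with invented cost numbers; divCeil mirrors llvm::divideCeil and none of the figures correspond to a real target.

#include <algorithm>
#include <cassert>
#include <cstdint>

// Rounded-up division, equivalent to llvm::divideCeil for positive operands.
static uint64_t divCeil(uint64_t Num, uint64_t Den) {
  return (Num + Den - 1) / Den;
}

int main() {
  // Assumed costs: scalar iteration cost 8, vector body cost 20 at IntVF = 4,
  // runtime-check cost 30, and the 1/10 bound on check overhead used above.
  uint64_t ScalarC = 8, VecC = 20, IntVF = 4, RtC = 30;

  uint64_t Div = ScalarC * IntVF - VecC;                      // 32 - 20 = 12
  uint64_t MinTC1 = Div == 0 ? 0 : divCeil(RtC * IntVF, Div); // ceil(120/12) = 10
  uint64_t MinTC2 = divCeil(RtC * 10, ScalarC);               // ceil(300/8)  = 38
  uint64_t MinTC = std::max(MinTC1, MinTC2);                  // 38
  assert(MinTC == 38);

  // With a scalar epilogue allowed, alignTo(38, IntVF) rounds this up to 40,
  // which becomes VF.MinProfitableTripCount.
  return 0;
}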
@@ -9831,13 +9645,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
assert((EnableVPlanNativePath || L->isInnermost()) &&
"VPlan-native path is not enabled. Only process inner loops.");
-#ifndef NDEBUG
- const std::string DebugLocStr = getDebugLocString(L);
-#endif /* NDEBUG */
-
LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
<< L->getHeader()->getParent()->getName() << "' from "
- << DebugLocStr << "\n");
+ << L->getLocStr() << "\n");
LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
@@ -10006,7 +9816,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
bool AddBranchWeights =
hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
- F->getParent()->getDataLayout(), AddBranchWeights);
+ F->getDataLayout(), AddBranchWeights);
if (MaybeVF) {
VF = *MaybeVF;
// Select the interleave count.
@@ -10107,7 +9917,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
});
} else if (VectorizeLoop && !InterleaveLoop) {
LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
- << ") in " << DebugLocStr << '\n');
+ << ") in " << L->getLocStr() << '\n');
ORE->emit([&]() {
return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
L->getStartLoc(), L->getHeader())
@@ -10115,7 +9925,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
});
} else if (VectorizeLoop && InterleaveLoop) {
LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
- << ") in " << DebugLocStr << '\n');
+ << ") in " << L->getLocStr() << '\n');
LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
}
@@ -10130,7 +9940,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
&CM, BFI, PSI, Checks);
- VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
+ VPlan &BestPlan =
+ UseLegacyCostModel ? LVP.getBestPlanFor(VF.Width) : LVP.getBestPlan();
+ assert((UseLegacyCostModel || BestPlan.hasScalarVFOnly()) &&
+ "VPlan cost model and legacy cost model disagreed");
LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
ORE->emit([&]() {
@@ -10154,9 +9967,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
EPI, &LVL, &CM, BFI, PSI, Checks);
- VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
+ std::unique_ptr<VPlan> BestMainPlan(
+ LVP.getBestPlanFor(EPI.MainLoopVF).duplicate());
const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
- EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, DT, true);
+ EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT, true);
++LoopsVectorized;
// Second pass vectorizes the epilogue and adjusts the control flow
@@ -10181,9 +9995,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
EpilogILV.setTripCount(MainILV.getTripCount());
for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) {
auto *ExpandR = cast<VPExpandSCEVRecipe>(&R);
- auto *ExpandedVal = BestEpiPlan.getVPValueOrAddLiveIn(
+ auto *ExpandedVal = BestEpiPlan.getOrAddLiveIn(
ExpandedSCEVs.find(ExpandR->getSCEV())->second);
ExpandR->replaceAllUsesWith(ExpandedVal);
+ if (BestEpiPlan.getTripCount() == ExpandR)
+ BestEpiPlan.resetTripCount(ExpandedVal);
ExpandR->eraseFromParent();
}
@@ -10197,9 +10013,19 @@ bool LoopVectorizePass::processLoop(Loop *L) {
Value *ResumeV = nullptr;
// TODO: Move setting of resume values to prepareToExecute.
if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
- ResumeV = ReductionResumeValues
- .find(&ReductionPhi->getRecurrenceDescriptor())
- ->second;
+ const RecurrenceDescriptor &RdxDesc =
+ ReductionPhi->getRecurrenceDescriptor();
+ RecurKind RK = RdxDesc.getRecurrenceKind();
+ ResumeV = ReductionResumeValues.find(&RdxDesc)->second;
+ if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
+ // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
+ // start value; compare the final value from the main vector loop
+ // to the start value.
+ IRBuilder<> Builder(
+ cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI());
+ ResumeV = Builder.CreateICmpNE(ResumeV,
+ RdxDesc.getRecurrenceStartValue());
+ }
} else {
// Create induction resume values for both widened pointer and
// integer/fp inductions and update the start value of the induction
@@ -10220,10 +10046,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
{EPI.MainLoopIterationCountCheck});
}
assert(ResumeV && "Must have a resume value");
- VPValue *StartVal = BestEpiPlan.getVPValueOrAddLiveIn(ResumeV);
+ VPValue *StartVal = BestEpiPlan.getOrAddLiveIn(ResumeV);
cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
}
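A brief note on the AnyOf fixup in the loop above: the epilogue's reduction phi starts from a boolean "did any lane match" flag rather than the selected value, which is why the resume value is rewritten with an ICmpNE against the recurrence start value. A minimal model with invented names:

// The main loop's AnyOf result equals the original start value only if no
// lane matched; comparing the two therefore reconstructs the boolean flag
// the epilogue's VPReductionPHIRecipe expects as its start value.
static bool anyOfResumeFlag(int MainLoopResult, int RecurrenceStartValue) {
  return MainLoopResult != RecurrenceStartValue;
}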
+ assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
+ "DT not preserved correctly");
LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
DT, true, &ExpandedSCEVs);
++LoopsEpilogueVectorized;
@@ -10231,12 +10059,22 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (!MainILV.areSafetyChecksAdded())
DisableRuntimeUnroll = true;
} else {
- InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
+ ElementCount Width = VF.Width;
+ VPlan &BestPlan =
+ UseLegacyCostModel ? LVP.getBestPlanFor(Width) : LVP.getBestPlan();
+ if (!UseLegacyCostModel) {
+ assert(size(BestPlan.vectorFactors()) == 1 &&
+ "Plan should have a single VF");
+ Width = *BestPlan.vectorFactors().begin();
+ LLVM_DEBUG(dbgs()
+ << "VF picked by VPlan cost model: " << Width << "\n");
+ assert(VF.Width == Width &&
+ "VPlan cost model and legacy cost model disagreed");
+ }
+ InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, Width,
VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
PSI, Checks);
-
- VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
- LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
+ LVP.executePlan(Width, IC, BestPlan, LB, DT, false);
++LoopsVectorized;
// Add metadata to disable runtime unrolling a scalar loop when there
@@ -10376,15 +10214,10 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
RemoveRedundantDbgInstrs(&BB);
}
- // We currently do not preserve loopinfo/dominator analyses with outer loop
- // vectorization. Until this is addressed, mark these analyses as preserved
- // only for non-VPlan-native path.
- // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
- if (!EnableVPlanNativePath) {
- PA.preserve<LoopAnalysis>();
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<ScalarEvolutionAnalysis>();
- }
+ PA.preserve<LoopAnalysis>();
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<ScalarEvolutionAnalysis>();
+ PA.preserve<LoopAccessAnalysis>();
if (Result.MadeCFGChange) {
// Making CFG changes likely means a loop got vectorized. Indicate that