author     Dimitry Andric <dim@FreeBSD.org>  2023-07-26 19:03:47 +0000
committer  Dimitry Andric <dim@FreeBSD.org>  2023-07-26 19:04:23 +0000
commit     7fa27ce4a07f19b07799a767fc29416f3b625afb (patch)
tree       27825c83636c4de341eb09a74f49f5d38a15d165 /llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
parent     e3b557809604d036af6e00c60f012c2025b59a5e (diff)
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r--  llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 4239
1 file changed, 2695 insertions, 1544 deletions
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e3eb6b1804e7..821a3fa22a85 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -87,7 +87,6 @@
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -126,6 +125,13 @@ static cl::opt<bool> ShouldStartVectorizeHorAtStore(
cl::desc(
"Attempt to vectorize horizontal reductions feeding into a store"));
+// NOTE: If AllowHorRdxIdenityOptimization is true, the optimization will run
+// even if we match a reduction but do not vectorize in the end.
+static cl::opt<bool> AllowHorRdxIdenityOptimization(
+ "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden,
+ cl::desc("Allow optimization of original scalar identity operations on "
+ "matched horizontal reductions."));
+
static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
cl::desc("Attempt to vectorize for this register size in bits"));
@@ -287,7 +293,7 @@ static bool isCommutative(Instruction *I) {
/// \returns inserting index of InsertElement or InsertValue instruction,
/// using Offset as base offset for index.
static std::optional<unsigned> getInsertIndex(const Value *InsertInst,
- unsigned Offset = 0) {
+ unsigned Offset = 0) {
int Index = Offset;
if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
@@ -342,16 +348,16 @@ enum class UseMask {
static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
UseMask MaskArg) {
SmallBitVector UseMask(VF, true);
- for (auto P : enumerate(Mask)) {
- if (P.value() == UndefMaskElem) {
+ for (auto [Idx, Value] : enumerate(Mask)) {
+ if (Value == PoisonMaskElem) {
if (MaskArg == UseMask::UndefsAsMask)
- UseMask.reset(P.index());
+ UseMask.reset(Idx);
continue;
}
- if (MaskArg == UseMask::FirstArg && P.value() < VF)
- UseMask.reset(P.value());
- else if (MaskArg == UseMask::SecondArg && P.value() >= VF)
- UseMask.reset(P.value() - VF);
+ if (MaskArg == UseMask::FirstArg && Value < VF)
+ UseMask.reset(Value);
+ else if (MaskArg == UseMask::SecondArg && Value >= VF)
+ UseMask.reset(Value - VF);
}
return UseMask;
}
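
buildUseMask, rewritten above with structured bindings and PoisonMaskElem, records which lanes of one shuffle input a mask actually reads. Below is a simplified, self-contained sketch of the UseMask::FirstArg case, using std::vector in place of SmallBitVector and -1 in place of PoisonMaskElem; it illustrates the idea and is not the LLVM implementation:

#include <cstdio>
#include <vector>

// Simplified stand-in for PoisonMaskElem.
constexpr int kPoisonMaskElem = -1;

// Returns one bit per lane of the *first* shuffle input (of width VF):
// true means the lane is still untouched by Mask, false means some mask
// element reads it (mirrors the UseMask::FirstArg case above).
static std::vector<bool> buildFirstArgUseMask(int VF, const std::vector<int> &Mask) {
  std::vector<bool> Unused(VF, true);
  for (int M : Mask) {
    if (M == kPoisonMaskElem)
      continue;          // poison lanes read nothing
    if (M < VF)
      Unused[M] = false; // lane M of the first input is consumed
    // M >= VF would index the second input and is ignored here.
  }
  return Unused;
}

int main() {
  // Shuffle of two 4-wide inputs: lanes 0 and 2 of input #1, one poison lane,
  // and lane 5 (i.e. lane 1 of input #2).
  std::vector<int> Mask = {0, 2, kPoisonMaskElem, 5};
  std::vector<bool> Unused = buildFirstArgUseMask(4, Mask);
  for (int I = 0; I < 4; ++I)
    std::printf("lane %d of input #1 %s\n", I, Unused[I] ? "unused" : "used");
  return 0;
}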
@@ -374,9 +380,9 @@ static SmallBitVector isUndefVector(const Value *V,
if (!UseMask.empty()) {
const Value *Base = V;
while (auto *II = dyn_cast<InsertElementInst>(Base)) {
+ Base = II->getOperand(0);
if (isa<T>(II->getOperand(1)))
continue;
- Base = II->getOperand(0);
std::optional<unsigned> Idx = getInsertIndex(II);
if (!Idx)
continue;
@@ -461,7 +467,7 @@ isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
Value *Vec2 = nullptr;
enum ShuffleMode { Unknown, Select, Permute };
ShuffleMode CommonShuffleMode = Unknown;
- Mask.assign(VL.size(), UndefMaskElem);
+ Mask.assign(VL.size(), PoisonMaskElem);
for (unsigned I = 0, E = VL.size(); I < E; ++I) {
// Undef can be represented as an undef element in a vector.
if (isa<UndefValue>(VL[I]))
@@ -533,6 +539,117 @@ static std::optional<unsigned> getExtractIndex(Instruction *E) {
return *EI->idx_begin();
}
+/// Tries to find extractelement instructions with constant indices from fixed
+/// vector type and gather such instructions into a bunch, which highly likely
+/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
+/// successful, the matched scalars are replaced by poison values in \p VL for
+/// future analysis.
+static std::optional<TTI::ShuffleKind>
+tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
+ SmallVectorImpl<int> &Mask) {
+ // Scan list of gathered scalars for extractelements that can be represented
+ // as shuffles.
+ MapVector<Value *, SmallVector<int>> VectorOpToIdx;
+ SmallVector<int> UndefVectorExtracts;
+ for (int I = 0, E = VL.size(); I < E; ++I) {
+ auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
+ if (!EI) {
+ if (isa<UndefValue>(VL[I]))
+ UndefVectorExtracts.push_back(I);
+ continue;
+ }
+ auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
+ if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
+ continue;
+ std::optional<unsigned> Idx = getExtractIndex(EI);
+ // Undefined index.
+ if (!Idx) {
+ UndefVectorExtracts.push_back(I);
+ continue;
+ }
+ SmallBitVector ExtractMask(VecTy->getNumElements(), true);
+ ExtractMask.reset(*Idx);
+ if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
+ UndefVectorExtracts.push_back(I);
+ continue;
+ }
+ VectorOpToIdx[EI->getVectorOperand()].push_back(I);
+ }
+ // Sort the vector operands by the maximum number of uses in extractelements.
+ MapVector<unsigned, SmallVector<Value *>> VFToVector;
+ for (const auto &Data : VectorOpToIdx)
+ VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()]
+ .push_back(Data.first);
+ for (auto &Data : VFToVector) {
+ stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) {
+ return VectorOpToIdx.find(V1)->second.size() >
+ VectorOpToIdx.find(V2)->second.size();
+ });
+ }
+ // Find the best pair of the vectors with the same number of elements or a
+ // single vector.
+ const int UndefSz = UndefVectorExtracts.size();
+ unsigned SingleMax = 0;
+ Value *SingleVec = nullptr;
+ unsigned PairMax = 0;
+ std::pair<Value *, Value *> PairVec(nullptr, nullptr);
+ for (auto &Data : VFToVector) {
+ Value *V1 = Data.second.front();
+ if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) {
+ SingleMax = VectorOpToIdx[V1].size() + UndefSz;
+ SingleVec = V1;
+ }
+ Value *V2 = nullptr;
+ if (Data.second.size() > 1)
+ V2 = *std::next(Data.second.begin());
+ if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() +
+ UndefSz) {
+ PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz;
+ PairVec = std::make_pair(V1, V2);
+ }
+ }
+ if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
+ return std::nullopt;
+ // Check if better to perform a shuffle of 2 vectors or just of a single
+ // vector.
+ SmallVector<Value *> SavedVL(VL.begin(), VL.end());
+ SmallVector<Value *> GatheredExtracts(
+ VL.size(), PoisonValue::get(VL.front()->getType()));
+ if (SingleMax >= PairMax && SingleMax) {
+ for (int Idx : VectorOpToIdx[SingleVec])
+ std::swap(GatheredExtracts[Idx], VL[Idx]);
+ } else {
+ for (Value *V : {PairVec.first, PairVec.second})
+ for (int Idx : VectorOpToIdx[V])
+ std::swap(GatheredExtracts[Idx], VL[Idx]);
+ }
+ // Add extracts from undefs too.
+ for (int Idx : UndefVectorExtracts)
+ std::swap(GatheredExtracts[Idx], VL[Idx]);
+ // Check that gather of extractelements can be represented as just a
+ // shuffle of a single/two vectors the scalars are extracted from.
+ std::optional<TTI::ShuffleKind> Res =
+ isFixedVectorShuffle(GatheredExtracts, Mask);
+ if (!Res) {
+ // TODO: try to check other subsets if possible.
+ // Restore the original VL if attempt was not successful.
+ VL.swap(SavedVL);
+ return std::nullopt;
+ }
+ // Restore unused scalars from mask, if some of the extractelements were not
+ // selected for shuffle.
+ for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
+ auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
+ if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
+ !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
+ is_contained(UndefVectorExtracts, I))
+ continue;
+ if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]))
+ std::swap(VL[I], GatheredExtracts[I]);
+ }
+ return Res;
+}
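
The newly added tryToGatherExtractElements groups gathered extractelement scalars by their source vector, then checks whether a single source (or a pair of equally sized sources), together with the undef lanes, covers enough of the node to be rebuilt as a shuffle. The sketch below reduces that selection step to integer ids for the source vectors and considers every pair rather than only the two most-used vectors per width, so it is a simplification of the logic above, not a drop-in equivalent:

#include <cstdio>
#include <iterator>
#include <map>
#include <utility>
#include <vector>

int main() {
  // One entry per gathered scalar: the id of the vector it is extracted from,
  // or -1 if the scalar is undef/poison (those lanes are free to reuse).
  std::vector<int> SourceVecOfLane = {7, 7, -1, 9, 7, 9};

  std::map<int, std::vector<int>> VectorOpToIdx; // source id -> lanes
  int UndefSz = 0;
  for (int Lane = 0; Lane < (int)SourceVecOfLane.size(); ++Lane) {
    if (SourceVecOfLane[Lane] == -1)
      ++UndefSz;
    else
      VectorOpToIdx[SourceVecOfLane[Lane]].push_back(Lane);
  }

  // Best single source and best pair of sources, counting undef lanes as free.
  int SingleMax = 0, SingleVec = -1;
  int PairMax = 0;
  std::pair<int, int> PairVec(-1, -1);
  for (auto It1 = VectorOpToIdx.begin(); It1 != VectorOpToIdx.end(); ++It1) {
    int Covered1 = (int)It1->second.size() + UndefSz;
    if (Covered1 > SingleMax) {
      SingleMax = Covered1;
      SingleVec = It1->first;
    }
    for (auto It2 = std::next(It1); It2 != VectorOpToIdx.end(); ++It2) {
      int Covered2 = (int)(It1->second.size() + It2->second.size()) + UndefSz;
      if (Covered2 > PairMax) {
        PairMax = Covered2;
        PairVec = {It1->first, It2->first};
      }
    }
  }

  std::printf("best single source: v%d covering %d lanes\n", SingleVec, SingleMax);
  std::printf("best pair: (v%d, v%d) covering %d lanes\n", PairVec.first,
              PairVec.second, PairMax);
  return 0;
}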
+
namespace {
/// Main data required for vectorization of instructions.
@@ -829,18 +946,29 @@ static bool isSimple(Instruction *I) {
}
/// Shuffles \p Mask in accordance with the given \p SubMask.
-static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) {
+/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
+/// one but two input vectors.
+static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
+ bool ExtendingManyInputs = false) {
if (SubMask.empty())
return;
+ assert(
+ (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
+ // Check if input scalars were extended to match the size of other node.
+ (SubMask.size() == Mask.size() &&
+ std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
+ [](int Idx) { return Idx == PoisonMaskElem; }))) &&
+ "SubMask with many inputs support must be larger than the mask.");
if (Mask.empty()) {
Mask.append(SubMask.begin(), SubMask.end());
return;
}
- SmallVector<int> NewMask(SubMask.size(), UndefMaskElem);
+ SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
int TermValue = std::min(Mask.size(), SubMask.size());
for (int I = 0, E = SubMask.size(); I < E; ++I) {
- if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem ||
- Mask[SubMask[I]] >= TermValue)
+ if (SubMask[I] == PoisonMaskElem ||
+ (!ExtendingManyInputs &&
+ (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
continue;
NewMask[I] = Mask[SubMask[I]];
}
@@ -887,7 +1015,7 @@ static void inversePermutation(ArrayRef<unsigned> Indices,
SmallVectorImpl<int> &Mask) {
Mask.clear();
const unsigned E = Indices.size();
- Mask.resize(E, UndefMaskElem);
+ Mask.resize(E, PoisonMaskElem);
for (unsigned I = 0; I < E; ++I)
Mask[Indices[I]] = I;
}
@@ -900,7 +1028,7 @@ static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
UndefValue::get(Scalars.front()->getType()));
Prev.swap(Scalars);
for (unsigned I = 0, E = Prev.size(); I < E; ++I)
- if (Mask[I] != UndefMaskElem)
+ if (Mask[I] != PoisonMaskElem)
Scalars[Mask[I]] = Prev[I];
}
@@ -962,6 +1090,7 @@ namespace slpvectorizer {
class BoUpSLP {
struct TreeEntry;
struct ScheduleData;
+ class ShuffleCostEstimator;
class ShuffleInstructionBuilder;
public:
@@ -1006,8 +1135,12 @@ public:
/// Vectorize the tree but with the list of externally used values \p
/// ExternallyUsedValues. Values in this MapVector can be replaced but the
/// generated extractvalue instructions.
- Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues,
- Instruction *ReductionRoot = nullptr);
+ /// \param ReplacedExternals contains the list of replaced external values
+ /// {scalar, replace} after emitting extractelement for external uses.
+ Value *
+ vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
+ SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
+ Instruction *ReductionRoot = nullptr);
/// \returns the cost incurred by unwanted spills and fills, caused by
/// holding live values over call sites.
@@ -1025,24 +1158,18 @@ public:
/// Construct a vectorizable tree that starts at \p Roots.
void buildTree(ArrayRef<Value *> Roots);
- /// Checks if the very first tree node is going to be vectorized.
- bool isVectorizedFirstNode() const {
- return !VectorizableTree.empty() &&
- VectorizableTree.front()->State == TreeEntry::Vectorize;
- }
-
- /// Returns the main instruction for the very first node.
- Instruction *getFirstNodeMainOp() const {
- assert(!VectorizableTree.empty() && "No tree to get the first node from");
- return VectorizableTree.front()->getMainOp();
- }
-
/// Returns whether the root node has in-tree uses.
bool doesRootHaveInTreeUses() const {
return !VectorizableTree.empty() &&
!VectorizableTree.front()->UserTreeIndices.empty();
}
+ /// Return the scalars of the root node.
+ ArrayRef<Value *> getRootNodeScalars() const {
+ assert(!VectorizableTree.empty() && "No graph to get the first node from");
+ return VectorizableTree.front()->Scalars;
+ }
+
/// Builds external uses of the vectorized scalars, i.e. the list of
/// vectorized scalars to be extracted, their lanes and their scalar users. \p
/// ExternallyUsedValues contains additional list of external uses to handle
@@ -1064,6 +1191,8 @@ public:
MinBWs.clear();
InstrElementSize.clear();
UserIgnoreList = nullptr;
+ PostponedGathers.clear();
+ ValueToGatherNodes.clear();
}
unsigned getTreeSize() const { return VectorizableTree.size(); }
@@ -1083,9 +1212,12 @@ public:
/// Gets reordering data for the given tree entry. If the entry is vectorized
/// - just return ReorderIndices, otherwise check if the scalars can be
/// reordered and return the most optimal order.
+ /// \return std::nullopt if ordering is not important, empty order, if
+ /// identity order is important, or the actual order.
/// \param TopToBottom If true, include the order of vectorized stores and
/// insertelement nodes, otherwise skip them.
- std::optional<OrdersType> getReorderingData(const TreeEntry &TE, bool TopToBottom);
+ std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
+ bool TopToBottom);
/// Reorders the current graph to the most profitable order starting from the
/// root node to the leaf nodes. The best order is chosen only from the nodes
@@ -1328,8 +1460,14 @@ public:
ConstantInt *Ex1Idx;
if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
// Undefs are always profitable for extractelements.
+ // Compiler can easily combine poison and extractelement <non-poison> or
+ // undef and extractelement <poison>. But combining undef +
+ // extractelement <non-poison-but-may-produce-poison> requires some
+ // extra operations.
if (isa<UndefValue>(V2))
- return LookAheadHeuristics::ScoreConsecutiveExtracts;
+ return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
+ ? LookAheadHeuristics::ScoreConsecutiveExtracts
+ : LookAheadHeuristics::ScoreSameOpcode;
Value *EV2 = nullptr;
ConstantInt *Ex2Idx = nullptr;
if (match(V2,
@@ -1683,9 +1821,10 @@ public:
// Search all operands in Ops[*][Lane] for the one that matches best
// Ops[OpIdx][LastLane] and return its operand index.
// If no good match can be found, return std::nullopt.
- std::optional<unsigned> getBestOperand(unsigned OpIdx, int Lane, int LastLane,
- ArrayRef<ReorderingMode> ReorderingModes,
- ArrayRef<Value *> MainAltOps) {
+ std::optional<unsigned>
+ getBestOperand(unsigned OpIdx, int Lane, int LastLane,
+ ArrayRef<ReorderingMode> ReorderingModes,
+ ArrayRef<Value *> MainAltOps) {
unsigned NumOperands = getNumOperands();
// The operand of the previous lane at OpIdx.
@@ -2299,7 +2438,8 @@ private:
/// \returns the cost of the vectorizable entry.
InstructionCost getEntryCost(const TreeEntry *E,
- ArrayRef<Value *> VectorizedVals);
+ ArrayRef<Value *> VectorizedVals,
+ SmallPtrSetImpl<Value *> &CheckedExtracts);
/// This is the recursive part of buildTree.
void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
@@ -2323,15 +2463,13 @@ private:
/// Create a new vector from a list of scalar values. Produces a sequence
/// which exploits values reused across lanes, and arranges the inserts
/// for ease of later optimization.
- Value *createBuildVector(const TreeEntry *E);
+ template <typename BVTy, typename ResTy, typename... Args>
+ ResTy processBuildVector(const TreeEntry *E, Args &...Params);
- /// \returns the scalarization cost for this type. Scalarization in this
- /// context means the creation of vectors from a group of scalars. If \p
- /// NeedToShuffle is true, need to add a cost of reshuffling some of the
- /// vector elements.
- InstructionCost getGatherCost(FixedVectorType *Ty,
- const APInt &ShuffledIndices,
- bool NeedToShuffle) const;
+ /// Create a new vector from a list of scalar values. Produces a sequence
+ /// which exploits values reused across lanes, and arranges the inserts
+ /// for ease of later optimization.
+ Value *createBuildVector(const TreeEntry *E);
/// Returns the instruction in the bundle, which can be used as a base point
/// for scheduling. Usually it is the last instruction in the bundle, except
@@ -2354,14 +2492,16 @@ private:
/// \returns the scalarization cost for this list of values. Assuming that
/// this subtree gets vectorized, we may need to extract the values from the
/// roots. This method calculates the cost of extracting the values.
- InstructionCost getGatherCost(ArrayRef<Value *> VL) const;
+ /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
+ InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc) const;
/// Set the Builder insert point to one after the last instruction in
/// the bundle
void setInsertPointAfterBundle(const TreeEntry *E);
- /// \returns a vector from a collection of scalars in \p VL.
- Value *gather(ArrayRef<Value *> VL);
+ /// \returns a vector from a collection of scalars in \p VL. If \p Root is not
+ /// specified, the starting vector value is poison.
+ Value *gather(ArrayRef<Value *> VL, Value *Root);
/// \returns whether the VectorizableTree is fully vectorizable and will
/// be beneficial even the tree height is tiny.
@@ -2400,6 +2540,14 @@ private:
using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
TreeEntry(VecTreeTy &Container) : Container(Container) {}
+ /// \returns Common mask for reorder indices and reused scalars.
+ SmallVector<int> getCommonMask() const {
+ SmallVector<int> Mask;
+ inversePermutation(ReorderIndices, Mask);
+ ::addMask(Mask, ReuseShuffleIndices);
+ return Mask;
+ }
+
/// \returns true if the scalars in VL are equal to this entry.
bool isSame(ArrayRef<Value *> VL) const {
auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
@@ -2409,8 +2557,8 @@ private:
std::equal(VL.begin(), VL.end(), Mask.begin(),
[Scalars](Value *V, int Idx) {
return (isa<UndefValue>(V) &&
- Idx == UndefMaskElem) ||
- (Idx != UndefMaskElem && V == Scalars[Idx]);
+ Idx == PoisonMaskElem) ||
+ (Idx != PoisonMaskElem && V == Scalars[Idx]);
});
};
if (!ReorderIndices.empty()) {
@@ -2471,7 +2619,7 @@ private:
ValueList Scalars;
/// The Scalars are vectorized into this value. It is initialized to Null.
- Value *VectorizedValue = nullptr;
+ WeakTrackingVH VectorizedValue = nullptr;
/// Do we need to gather this sequence or vectorize it
/// (either with vector instruction or with scatter/gather
@@ -2684,20 +2832,22 @@ private:
#ifndef NDEBUG
void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
- InstructionCost VecCost,
- InstructionCost ScalarCost) const {
- dbgs() << "SLP: Calculated costs for Tree:\n"; E->dump();
+ InstructionCost VecCost, InstructionCost ScalarCost,
+ StringRef Banner) const {
+ dbgs() << "SLP: " << Banner << ":\n";
+ E->dump();
dbgs() << "SLP: Costs:\n";
dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
dbgs() << "SLP: VectorCost = " << VecCost << "\n";
dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
- dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = " <<
- ReuseShuffleCost + VecCost - ScalarCost << "\n";
+ dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
+ << ReuseShuffleCost + VecCost - ScalarCost << "\n";
}
#endif
/// Create a new VectorizableTree entry.
- TreeEntry *newTreeEntry(ArrayRef<Value *> VL, std::optional<ScheduleData *> Bundle,
+ TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
+ std::optional<ScheduleData *> Bundle,
const InstructionsState &S,
const EdgeInfo &UserTreeIdx,
ArrayRef<int> ReuseShuffleIndices = std::nullopt,
@@ -2791,8 +2941,14 @@ private:
return ScalarToTreeEntry.lookup(V);
}
+ /// Checks if the specified list of the instructions/values can be vectorized
+ /// and fills required data before actual scheduling of the instructions.
+ TreeEntry::EntryState getScalarsVectorizationState(
+ InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
+ OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const;
+
/// Maps a specific scalar to its tree entry.
- SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry;
+ SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
/// Maps a value to the proposed vectorizable size.
SmallDenseMap<Value *, unsigned> InstrElementSize;
@@ -2808,6 +2964,15 @@ private:
/// pre-gather them before.
DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
+ /// List of gather nodes, depending on other gather/vector nodes, which should
+ /// be emitted after the vector instruction emission process to correctly
+ /// handle order of the vector instructions and shuffles.
+ SetVector<const TreeEntry *> PostponedGathers;
+
+ using ValueToGatherNodesMap =
+ DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
+ ValueToGatherNodesMap ValueToGatherNodes;
+
/// This POD struct describes one external user in the vectorized tree.
struct ExternalUser {
ExternalUser(Value *S, llvm::User *U, int L)
@@ -3235,7 +3400,6 @@ private:
<< "SLP: gets ready (ctl): " << *DepBundle << "\n");
}
}
-
}
}
@@ -3579,7 +3743,7 @@ static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
SmallVector<int> Prev(Reuses.begin(), Reuses.end());
Prev.swap(Reuses);
for (unsigned I = 0, E = Prev.size(); I < E; ++I)
- if (Mask[I] != UndefMaskElem)
+ if (Mask[I] != PoisonMaskElem)
Reuses[Mask[I]] = Prev[I];
}
@@ -3603,7 +3767,7 @@ static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask) {
}
Order.assign(Mask.size(), Mask.size());
for (unsigned I = 0, E = Mask.size(); I < E; ++I)
- if (MaskOrder[I] != UndefMaskElem)
+ if (MaskOrder[I] != PoisonMaskElem)
Order[MaskOrder[I]] = I;
fixupOrderingIndices(Order);
}
@@ -3653,10 +3817,8 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
return false;
return true;
};
- if (IsIdentityOrder(CurrentOrder)) {
- CurrentOrder.clear();
- return CurrentOrder;
- }
+ if (IsIdentityOrder(CurrentOrder))
+ return OrdersType();
auto *It = CurrentOrder.begin();
for (unsigned I = 0; I < NumScalars;) {
if (UsedPositions.test(I)) {
@@ -3669,7 +3831,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
}
++It;
}
- return CurrentOrder;
+ return std::move(CurrentOrder);
}
return std::nullopt;
}
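
findReusedOrderedScalars now returns an empty OrdersType for an identity order instead of clearing CurrentOrder in place, matching the convention documented on getReorderingData above: std::nullopt means ordering is unimportant, an empty order means identity, and anything else is the order to apply. A small, purely illustrative sketch of that three-way encoding:

#include <cstdio>
#include <optional>
#include <vector>

using OrdersType = std::vector<unsigned>;

// std::nullopt = ordering does not matter, empty = identity order,
// non-empty = the actual order to apply (the convention described above).
static std::optional<OrdersType> classifyOrder(OrdersType CurrentOrder) {
  if (CurrentOrder.empty())
    return std::nullopt;
  bool IsIdentity = true;
  for (unsigned I = 0; I < CurrentOrder.size(); ++I)
    IsIdentity &= (CurrentOrder[I] == I);
  if (IsIdentity)
    return OrdersType(); // identity: keep the node as-is, no shuffle needed
  return CurrentOrder;   // real reordering
}

int main() {
  auto Res = classifyOrder({0, 1, 2, 3});
  std::printf("identity? %s\n", (Res && Res->empty()) ? "yes" : "no");
  return 0;
}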
@@ -3779,9 +3941,9 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
return LoadsState::Gather;
}
-bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
- const DataLayout &DL, ScalarEvolution &SE,
- SmallVectorImpl<unsigned> &SortedIndices) {
+static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
+ const DataLayout &DL, ScalarEvolution &SE,
+ SmallVectorImpl<unsigned> &SortedIndices) {
assert(llvm::all_of(
VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
"Expected list of pointer operands.");
@@ -3825,7 +3987,7 @@ bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
return std::get<1>(X) < std::get<1>(Y);
});
int InitialOffset = std::get<1>(Vec[0]);
- AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](auto &P) {
+ AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) {
return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
});
}
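
clusterSortPtrAccesses (now static) buckets pointers by base object, sorts each bucket by its distance from the first pointer, and accepts the bucket only when the sorted distances are consecutive. A standalone sketch of that check on plain (index, element-offset) pairs, with the SCEV/DataLayout machinery stripped out:

#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // (original index, offset in elements from the common base) for one bucket.
  std::vector<std::pair<unsigned, int>> Vec = {{0, 3}, {1, 1}, {2, 0}, {3, 2}};

  // Sort by offset, as the lambda with std::get<1> does above.
  std::stable_sort(Vec.begin(), Vec.end(), [](const auto &X, const auto &Y) {
    return X.second < Y.second;
  });

  // Accept only if the offsets form InitialOffset, InitialOffset + 1, ...
  int InitialOffset = Vec.front().second;
  bool AnyConsecutive = true;
  for (size_t I = 0; I < Vec.size(); ++I)
    AnyConsecutive &= (Vec[I].second == InitialOffset + (int)I);

  std::printf("consecutive: %s, sorted original indices:",
              AnyConsecutive ? "yes" : "no");
  for (const auto &P : Vec)
    std::printf(" %u", P.first);
  std::printf("\n"); // prints: consecutive: yes, sorted original indices: 2 1 3 0
  return 0;
}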
@@ -3862,7 +4024,7 @@ BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
BoUpSLP::OrdersType Order;
if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
- return Order;
+ return std::move(Order);
return std::nullopt;
}
@@ -3888,31 +4050,35 @@ static bool areTwoInsertFromSameBuildVector(
// Go through the vector operand of insertelement instructions trying to find
// either VU as the original vector for IE2 or V as the original vector for
// IE1.
+ SmallSet<int, 8> ReusedIdx;
+ bool IsReusedIdx = false;
do {
- if (IE2 == VU)
+ if (IE2 == VU && !IE1)
return VU->hasOneUse();
- if (IE1 == V)
+ if (IE1 == V && !IE2)
return V->hasOneUse();
- if (IE1) {
- if ((IE1 != VU && !IE1->hasOneUse()) ||
- getInsertIndex(IE1).value_or(*Idx2) == *Idx2)
+ if (IE1 && IE1 != V) {
+ IsReusedIdx |=
+ !ReusedIdx.insert(getInsertIndex(IE1).value_or(*Idx2)).second;
+ if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
IE1 = nullptr;
else
IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
}
- if (IE2) {
- if ((IE2 != V && !IE2->hasOneUse()) ||
- getInsertIndex(IE2).value_or(*Idx1) == *Idx1)
+ if (IE2 && IE2 != VU) {
+ IsReusedIdx |=
+ !ReusedIdx.insert(getInsertIndex(IE2).value_or(*Idx1)).second;
+ if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
IE2 = nullptr;
else
IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
}
- } while (IE1 || IE2);
+ } while (!IsReusedIdx && (IE1 || IE2));
return false;
}
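
The rewritten loop above tracks every insert position it has seen in ReusedIdx and bails out as soon as a position repeats, because a repeated position means a later insertelement overwrites an earlier one and the two chains cannot be treated as a single buildvector. Below is a rough, self-contained sketch of that duplicate-position check, with the IE1/IE2 walk deliberately simplified to plain lists of insert indices:

#include <algorithm>
#include <cstdio>
#include <set>
#include <vector>

// Walk the insert positions of two buildvector-like chains in lock-step and
// report whether any position is used more than once across both chains.
static bool hasReusedInsertIndex(const std::vector<unsigned> &Chain1,
                                 const std::vector<unsigned> &Chain2) {
  std::set<unsigned> ReusedIdx;
  size_t N = std::max(Chain1.size(), Chain2.size());
  for (size_t I = 0; I < N; ++I) {
    if (I < Chain1.size() && !ReusedIdx.insert(Chain1[I]).second)
      return true; // position already written: later insert overwrites it
    if (I < Chain2.size() && !ReusedIdx.insert(Chain2[I]).second)
      return true;
  }
  return false;
}

int main() {
  std::printf("%d\n", hasReusedInsertIndex({0, 1}, {2, 3})); // 0: all distinct
  std::printf("%d\n", hasReusedInsertIndex({0, 1}, {1, 3})); // 1: index 1 reused
  return 0;
}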
-std::optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE,
- bool TopToBottom) {
+std::optional<BoUpSLP::OrdersType>
+BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
// No need to reorder if need to shuffle reuses, still need to shuffle the
// node.
if (!TE.ReuseShuffleIndices.empty()) {
@@ -3936,14 +4102,14 @@ std::optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &T
std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
return Idx && *Idx < Sz;
})) {
- SmallVector<int> ReorderMask(Sz, UndefMaskElem);
+ SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
if (TE.ReorderIndices.empty())
std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
else
inversePermutation(TE.ReorderIndices, ReorderMask);
for (unsigned I = 0; I < VF; ++I) {
int &Idx = ReusedMask[I];
- if (Idx == UndefMaskElem)
+ if (Idx == PoisonMaskElem)
continue;
Value *V = TE.Scalars[ReorderMask[Idx]];
std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
@@ -3958,7 +4124,7 @@ std::optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &T
for (unsigned K = 0; K < VF; K += Sz) {
OrdersType CurrentOrder(TE.ReorderIndices);
SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
- if (SubMask.front() == UndefMaskElem)
+ if (SubMask.front() == PoisonMaskElem)
std::iota(SubMask.begin(), SubMask.end(), 0);
reorderOrder(CurrentOrder, SubMask);
transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
@@ -3966,8 +4132,8 @@ std::optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &T
}
if (all_of(enumerate(ResOrder),
[](const auto &Data) { return Data.index() == Data.value(); }))
- return {}; // Use identity order.
- return ResOrder;
+ return std::nullopt; // No need to reorder.
+ return std::move(ResOrder);
}
if (TE.State == TreeEntry::Vectorize &&
(isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
@@ -3976,6 +4142,8 @@ std::optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &T
return TE.ReorderIndices;
if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
auto PHICompare = [](llvm::Value *V1, llvm::Value *V2) {
+ if (V1 == V2)
+ return false;
if (!V1->hasOneUse() || !V2->hasOneUse())
return false;
auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
@@ -4023,8 +4191,8 @@ std::optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &T
for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
ResOrder[Id] = PhiToId[Phis[Id]];
if (IsIdentityOrder(ResOrder))
- return {};
- return ResOrder;
+ return std::nullopt; // No need to reorder.
+ return std::move(ResOrder);
}
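
The new V1 == V2 early-out in PHICompare above makes the comparator explicitly irreflexive; std::sort-family algorithms require a strict weak ordering, and a comparator that can report true for two equal (or identical) arguments is undefined behavior. The toy comparator below only shows the contract: for plain integers the guard is redundant, but for PHICompare's use-based comparison it is what guarantees comp(V, V) == false.

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> V = {3, 1, 3, 2};
  // A valid strict weak ordering must return false when both arguments are
  // equal, which is exactly what the V1 == V2 early-out above guarantees.
  std::stable_sort(V.begin(), V.end(), [](int A, int B) {
    if (A == B)
      return false; // irreflexive: never "A < A"
    return A < B;
  });
  for (int X : V)
    std::printf("%d ", X);
  std::printf("\n");
  return 0;
}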
if (TE.State == TreeEntry::NeedToGather) {
// TODO: add analysis of other gather nodes with extractelement
@@ -4050,7 +4218,42 @@ std::optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &T
if (Reuse || !CurrentOrder.empty()) {
if (!CurrentOrder.empty())
fixupOrderingIndices(CurrentOrder);
- return CurrentOrder;
+ return std::move(CurrentOrder);
+ }
+ }
+ // If the gather node is <undef, v, .., poison> and
+ // insertelement poison, v, 0 [+ permute]
+ // is cheaper than
+ // insertelement poison, v, n - try to reorder.
+ // If rotating the whole graph, exclude the permute cost, the whole graph
+ // might be transformed.
+ int Sz = TE.Scalars.size();
+ if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
+ count_if(TE.Scalars, UndefValue::classof) == Sz - 1) {
+ const auto *It =
+ find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
+ if (It == TE.Scalars.begin())
+ return OrdersType();
+ auto *Ty = FixedVectorType::get(TE.Scalars.front()->getType(), Sz);
+ if (It != TE.Scalars.end()) {
+ OrdersType Order(Sz, Sz);
+ unsigned Idx = std::distance(TE.Scalars.begin(), It);
+ Order[Idx] = 0;
+ fixupOrderingIndices(Order);
+ SmallVector<int> Mask;
+ inversePermutation(Order, Mask);
+ InstructionCost PermuteCost =
+ TopToBottom
+ ? 0
+ : TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, Mask);
+ InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
+ Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
+ PoisonValue::get(Ty), *It);
+ InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
+ Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
+ PoisonValue::get(Ty), *It);
+ if (InsertFirstCost + PermuteCost < InsertIdxCost)
+ return std::move(Order);
}
}
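
The block above handles a gather node that is a splat of one non-constant value among undefs: it compares, via TTI, the cost of inserting the value at lane 0 (plus a single-source permute when only this node is being rotated) against inserting it directly at its original lane, and reorders when the former is cheaper. The numbers below are invented stand-ins for TTI costs and only illustrate the decision:

#include <cstdio>

int main() {
  // Hypothetical TTI-style costs for a <4 x float> node whose only defined
  // scalar sits at lane Idx = 3.
  int InsertFirstCost = 1; // insertelement poison, v, 0
  int InsertIdxCost   = 3; // insertelement poison, v, 3 (pricier on this target)
  int PermuteCost     = 1; // single-source shuffle to move lane 0 into lane 3

  bool TopToBottom = false; // rotating just this node, so pay for the permute
  int ReorderedCost = InsertFirstCost + (TopToBottom ? 0 : PermuteCost);

  if (ReorderedCost < InsertIdxCost)
    std::printf("reorder: insert at lane 0%s\n",
                TopToBottom ? "" : " and permute");
  else
    std::printf("keep original lane\n");
  return 0;
}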
if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
@@ -4260,7 +4463,7 @@ void BoUpSLP::reorderTopToBottom() {
unsigned E = Order.size();
OrdersType CurrentOrder(E, E);
transform(Mask, CurrentOrder.begin(), [E](int Idx) {
- return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx);
+ return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
});
fixupOrderingIndices(CurrentOrder);
++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
@@ -4285,10 +4488,10 @@ void BoUpSLP::reorderTopToBottom() {
continue;
SmallVector<int> Mask;
inversePermutation(BestOrder, Mask);
- SmallVector<int> MaskOrder(BestOrder.size(), UndefMaskElem);
+ SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
unsigned E = BestOrder.size();
transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
- return I < E ? static_cast<int>(I) : UndefMaskElem;
+ return I < E ? static_cast<int>(I) : PoisonMaskElem;
});
// Do an actual reordering, if profitable.
for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
@@ -4384,7 +4587,7 @@ bool BoUpSLP::canReorderOperands(
}
return false;
}) > 1 &&
- !all_of(UserTE->getOperand(I), isConstant))
+ !allConstant(UserTE->getOperand(I)))
return false;
if (Gather)
GatherOps.push_back(Gather);
@@ -4499,7 +4702,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
unsigned E = Order.size();
OrdersType CurrentOrder(E, E);
transform(Mask, CurrentOrder.begin(), [E](int Idx) {
- return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx);
+ return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
});
fixupOrderingIndices(CurrentOrder);
OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
@@ -4578,10 +4781,10 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
VisitedOps.clear();
SmallVector<int> Mask;
inversePermutation(BestOrder, Mask);
- SmallVector<int> MaskOrder(BestOrder.size(), UndefMaskElem);
+ SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
unsigned E = BestOrder.size();
transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
- return I < E ? static_cast<int>(I) : UndefMaskElem;
+ return I < E ? static_cast<int>(I) : PoisonMaskElem;
});
for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
TreeEntry *TE = Op.second;
@@ -4779,7 +4982,7 @@ bool BoUpSLP::canFormVector(const SmallVector<StoreInst *, 4> &StoresVec,
// Check if the stores are consecutive by checking if their difference is 1.
for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
- if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx-1].second + 1)
+ if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1)
return false;
// Calculate the shuffle indices according to their offset against the sorted
@@ -4976,6 +5179,309 @@ static bool isAlternateInstruction(const Instruction *I,
const Instruction *AltOp,
const TargetLibraryInfo &TLI);
+BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
+ InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
+ OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const {
+ assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");
+
+ unsigned ShuffleOrOp =
+ S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
+ auto *VL0 = cast<Instruction>(S.OpValue);
+ switch (ShuffleOrOp) {
+ case Instruction::PHI: {
+ // Check for terminator values (e.g. invoke).
+ for (Value *V : VL)
+ for (Value *Incoming : cast<PHINode>(V)->incoming_values()) {
+ Instruction *Term = dyn_cast<Instruction>(Incoming);
+ if (Term && Term->isTerminator()) {
+ LLVM_DEBUG(dbgs()
+ << "SLP: Need to swizzle PHINodes (terminator use).\n");
+ return TreeEntry::NeedToGather;
+ }
+ }
+
+ return TreeEntry::Vectorize;
+ }
+ case Instruction::ExtractValue:
+ case Instruction::ExtractElement: {
+ bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
+ if (Reuse || !CurrentOrder.empty())
+ return TreeEntry::Vectorize;
+ LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
+ return TreeEntry::NeedToGather;
+ }
+ case Instruction::InsertElement: {
+ // Check that we have a buildvector and not a shuffle of 2 or more
+ // different vectors.
+ ValueSet SourceVectors;
+ for (Value *V : VL) {
+ SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
+ assert(getInsertIndex(V) != std::nullopt &&
+ "Non-constant or undef index?");
+ }
+
+ if (count_if(VL, [&SourceVectors](Value *V) {
+ return !SourceVectors.contains(V);
+ }) >= 2) {
+ // Found 2nd source vector - cancel.
+ LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
+ "different source vectors.\n");
+ return TreeEntry::NeedToGather;
+ }
+
+ return TreeEntry::Vectorize;
+ }
+ case Instruction::Load: {
+ // Check that a vectorized load would load the same memory as a scalar
+ // load. For example, we don't want to vectorize loads that are smaller
+ // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
+ // treats loading/storing it as an i8 struct. If we vectorize loads/stores
+ // from such a struct, we read/write packed bits disagreeing with the
+ // unvectorized version.
+ switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, *LI, *TLI, CurrentOrder,
+ PointerOps)) {
+ case LoadsState::Vectorize:
+ return TreeEntry::Vectorize;
+ case LoadsState::ScatterVectorize:
+ return TreeEntry::ScatterVectorize;
+ case LoadsState::Gather:
+#ifndef NDEBUG
+ Type *ScalarTy = VL0->getType();
+ if (DL->getTypeSizeInBits(ScalarTy) !=
+ DL->getTypeAllocSizeInBits(ScalarTy))
+ LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
+ else if (any_of(VL,
+ [](Value *V) { return !cast<LoadInst>(V)->isSimple(); }))
+ LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
+ else
+ LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
+#endif // NDEBUG
+ return TreeEntry::NeedToGather;
+ }
+ llvm_unreachable("Unexpected state of loads");
+ }
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ Type *SrcTy = VL0->getOperand(0)->getType();
+ for (Value *V : VL) {
+ Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
+ if (Ty != SrcTy || !isValidElementType(Ty)) {
+ LLVM_DEBUG(
+ dbgs() << "SLP: Gathering casts with different src types.\n");
+ return TreeEntry::NeedToGather;
+ }
+ }
+ return TreeEntry::Vectorize;
+ }
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ // Check that all of the compares have the same predicate.
+ CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
+ CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
+ Type *ComparedTy = VL0->getOperand(0)->getType();
+ for (Value *V : VL) {
+ CmpInst *Cmp = cast<CmpInst>(V);
+ if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
+ Cmp->getOperand(0)->getType() != ComparedTy) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
+ return TreeEntry::NeedToGather;
+ }
+ }
+ return TreeEntry::Vectorize;
+ }
+ case Instruction::Select:
+ case Instruction::FNeg:
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ return TreeEntry::Vectorize;
+ case Instruction::GetElementPtr: {
+ // We don't combine GEPs with complicated (nested) indexing.
+ for (Value *V : VL) {
+ auto *I = dyn_cast<GetElementPtrInst>(V);
+ if (!I)
+ continue;
+ if (I->getNumOperands() != 2) {
+ LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
+ return TreeEntry::NeedToGather;
+ }
+ }
+
+ // We can't combine several GEPs into one vector if they operate on
+ // different types.
+ Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
+ for (Value *V : VL) {
+ auto *GEP = dyn_cast<GEPOperator>(V);
+ if (!GEP)
+ continue;
+ Type *CurTy = GEP->getSourceElementType();
+ if (Ty0 != CurTy) {
+ LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
+ return TreeEntry::NeedToGather;
+ }
+ }
+
+ // We don't combine GEPs with non-constant indexes.
+ Type *Ty1 = VL0->getOperand(1)->getType();
+ for (Value *V : VL) {
+ auto *I = dyn_cast<GetElementPtrInst>(V);
+ if (!I)
+ continue;
+ auto *Op = I->getOperand(1);
+ if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
+ (Op->getType() != Ty1 &&
+ ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
+ Op->getType()->getScalarSizeInBits() >
+ DL->getIndexSizeInBits(
+ V->getType()->getPointerAddressSpace())))) {
+ LLVM_DEBUG(
+ dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
+ return TreeEntry::NeedToGather;
+ }
+ }
+
+ return TreeEntry::Vectorize;
+ }
+ case Instruction::Store: {
+ // Check if the stores are consecutive or if we need to swizzle them.
+ llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
+ // Avoid types that are padded when being allocated as scalars, while
+ // being packed together in a vector (such as i1).
+ if (DL->getTypeSizeInBits(ScalarTy) !=
+ DL->getTypeAllocSizeInBits(ScalarTy)) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
+ return TreeEntry::NeedToGather;
+ }
+ // Make sure all stores in the bundle are simple - we can't vectorize
+ // atomic or volatile stores.
+ for (Value *V : VL) {
+ auto *SI = cast<StoreInst>(V);
+ if (!SI->isSimple()) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
+ return TreeEntry::NeedToGather;
+ }
+ PointerOps.push_back(SI->getPointerOperand());
+ }
+
+ // Check the order of pointer operands.
+ if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
+ Value *Ptr0;
+ Value *PtrN;
+ if (CurrentOrder.empty()) {
+ Ptr0 = PointerOps.front();
+ PtrN = PointerOps.back();
+ } else {
+ Ptr0 = PointerOps[CurrentOrder.front()];
+ PtrN = PointerOps[CurrentOrder.back()];
+ }
+ std::optional<int> Dist =
+ getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
+ // Check that the sorted pointer operands are consecutive.
+ if (static_cast<unsigned>(*Dist) == VL.size() - 1)
+ return TreeEntry::Vectorize;
+ }
+
+ LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
+ return TreeEntry::NeedToGather;
+ }
+ case Instruction::Call: {
+ // Check if the calls are all to the same vectorizable intrinsic or
+ // library function.
+ CallInst *CI = cast<CallInst>(VL0);
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+ VFShape Shape = VFShape::get(
+ *CI, ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
+ false /*HasGlobalPred*/);
+ Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+
+ if (!VecFunc && !isTriviallyVectorizable(ID)) {
+ LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
+ return TreeEntry::NeedToGather;
+ }
+ Function *F = CI->getCalledFunction();
+ unsigned NumArgs = CI->arg_size();
+ SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
+ for (unsigned J = 0; J != NumArgs; ++J)
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
+ ScalarArgs[J] = CI->getArgOperand(J);
+ for (Value *V : VL) {
+ CallInst *CI2 = dyn_cast<CallInst>(V);
+ if (!CI2 || CI2->getCalledFunction() != F ||
+ getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
+ (VecFunc &&
+ VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
+ !CI->hasIdenticalOperandBundleSchema(*CI2)) {
+ LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
+ << "\n");
+ return TreeEntry::NeedToGather;
+ }
+ // Some intrinsics have scalar arguments and should be same in order for
+ // them to be vectorized.
+ for (unsigned J = 0; J != NumArgs; ++J) {
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) {
+ Value *A1J = CI2->getArgOperand(J);
+ if (ScalarArgs[J] != A1J) {
+ LLVM_DEBUG(dbgs()
+ << "SLP: mismatched arguments in call:" << *CI
+ << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
+ return TreeEntry::NeedToGather;
+ }
+ }
+ }
+ // Verify that the bundle operands are identical between the two calls.
+ if (CI->hasOperandBundles() &&
+ !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
+ CI->op_begin() + CI->getBundleOperandsEndIndex(),
+ CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
+ LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
+ << "!=" << *V << '\n');
+ return TreeEntry::NeedToGather;
+ }
+ }
+
+ return TreeEntry::Vectorize;
+ }
+ case Instruction::ShuffleVector: {
+ // If this is not an alternate sequence of opcode like add-sub
+ // then do not vectorize this instruction.
+ if (!S.isAltShuffle()) {
+ LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
+ return TreeEntry::NeedToGather;
+ }
+ return TreeEntry::Vectorize;
+ }
+ default:
+ LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
+ return TreeEntry::NeedToGather;
+ }
+}
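
getScalarsVectorizationState, added above, pulls the per-opcode legality checks out of buildTree_rec and answers Vectorize / ScatterVectorize / NeedToGather before any bundle scheduling is attempted; a later hunk in buildTree_rec then emits a gather entry up front whenever the answer is NeedToGather. As a standalone sketch of one of those checks, here is the consecutive-store test reduced to element offsets (a simplification of the sortPtrAccesses/getPointersDiff logic in the Store case):

#include <algorithm>
#include <cstdio>
#include <vector>

enum class EntryState { Vectorize, NeedToGather };

// Stores are vectorizable as one wide store only if their (element-sized)
// offsets from a common base are a permutation of K, K+1, ..., K+N-1, i.e.
// the sorted offsets are consecutive with no gaps or duplicates.
static EntryState classifyStores(std::vector<int> Offsets) {
  std::sort(Offsets.begin(), Offsets.end());
  for (size_t I = 1; I < Offsets.size(); ++I)
    if (Offsets[I] != Offsets[I - 1] + 1)
      return EntryState::NeedToGather; // non-consecutive (or duplicate) slot
  return EntryState::Vectorize;
}

int main() {
  std::printf("%s\n", classifyStores({2, 0, 1, 3}) == EntryState::Vectorize
                          ? "Vectorize"
                          : "NeedToGather"); // jumbled but consecutive
  std::printf("%s\n", classifyStores({0, 1, 4, 5}) == EntryState::Vectorize
                          ? "Vectorize"
                          : "NeedToGather"); // gap -> gather
  return 0;
}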
+
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
const EdgeInfo &UserTreeIdx) {
assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
@@ -4990,7 +5496,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (Value *V : VL) {
if (isConstant(V)) {
ReuseShuffleIndicies.emplace_back(
- isa<UndefValue>(V) ? UndefMaskElem : UniqueValues.size());
+ isa<UndefValue>(V) ? PoisonMaskElem : UniqueValues.size());
UniqueValues.emplace_back(V);
continue;
}
@@ -5010,7 +5516,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return isa<UndefValue>(V) ||
!isConstant(V);
})) ||
- !llvm::isPowerOf2_32(NumUniqueScalarValues)) {
+ !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
return false;
@@ -5257,6 +5763,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (!TryToFindDuplicates(S))
return;
+ // Perform specific checks for each particular instruction kind.
+ OrdersType CurrentOrder;
+ SmallVector<Value *> PointerOps;
+ TreeEntry::EntryState State = getScalarsVectorizationState(
+ S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
+ if (State == TreeEntry::NeedToGather) {
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ return;
+ }
+
auto &BSRef = BlocksSchedules[BB];
if (!BSRef)
BSRef = std::make_unique<BlockScheduling>(BB);
@@ -5285,20 +5802,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::PHI: {
auto *PH = cast<PHINode>(VL0);
- // Check for terminator values (e.g. invoke).
- for (Value *V : VL)
- for (Value *Incoming : cast<PHINode>(V)->incoming_values()) {
- Instruction *Term = dyn_cast<Instruction>(Incoming);
- if (Term && Term->isTerminator()) {
- LLVM_DEBUG(dbgs()
- << "SLP: Need to swizzle PHINodes (terminator use).\n");
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- return;
- }
- }
-
TreeEntry *TE =
newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
@@ -5326,9 +5829,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
case Instruction::ExtractValue:
case Instruction::ExtractElement: {
- OrdersType CurrentOrder;
- bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
- if (Reuse) {
+ if (CurrentOrder.empty()) {
LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
@@ -5339,55 +5840,28 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
VectorizableTree.back()->setOperand(0, Op0);
return;
}
- if (!CurrentOrder.empty()) {
- LLVM_DEBUG({
- dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
- "with order";
- for (unsigned Idx : CurrentOrder)
- dbgs() << " " << Idx;
- dbgs() << "\n";
- });
- fixupOrderingIndices(CurrentOrder);
- // Insert new order with initial value 0, if it does not exist,
- // otherwise return the iterator to the existing one.
- newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies, CurrentOrder);
- // This is a special case, as it does not gather, but at the same time
- // we are not extending buildTree_rec() towards the operands.
- ValueList Op0;
- Op0.assign(VL.size(), VL0->getOperand(0));
- VectorizableTree.back()->setOperand(0, Op0);
- return;
- }
- LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- BS.cancelScheduling(VL, VL0);
+ LLVM_DEBUG({
+ dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
+ "with order";
+ for (unsigned Idx : CurrentOrder)
+ dbgs() << " " << Idx;
+ dbgs() << "\n";
+ });
+ fixupOrderingIndices(CurrentOrder);
+ // Insert new order with initial value 0, if it does not exist,
+ // otherwise return the iterator to the existing one.
+ newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies, CurrentOrder);
+ // This is a special case, as it does not gather, but at the same time
+ // we are not extending buildTree_rec() towards the operands.
+ ValueList Op0;
+ Op0.assign(VL.size(), VL0->getOperand(0));
+ VectorizableTree.back()->setOperand(0, Op0);
return;
}
case Instruction::InsertElement: {
assert(ReuseShuffleIndicies.empty() && "All inserts should be unique");
- // Check that we have a buildvector and not a shuffle of 2 or more
- // different vectors.
- ValueSet SourceVectors;
- for (Value *V : VL) {
- SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
- assert(getInsertIndex(V) != std::nullopt &&
- "Non-constant or undef index?");
- }
-
- if (count_if(VL, [&SourceVectors](Value *V) {
- return !SourceVectors.contains(V);
- }) >= 2) {
- // Found 2nd source vector - cancel.
- LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
- "different source vectors.\n");
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
- BS.cancelScheduling(VL, VL0);
- return;
- }
-
auto OrdCompare = [](const std::pair<int, int> &P1,
const std::pair<int, int> &P2) {
return P1.first > P2.first;
@@ -5430,12 +5904,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// treats loading/storing it as an i8 struct. If we vectorize loads/stores
// from such a struct, we read/write packed bits disagreeing with the
// unvectorized version.
- SmallVector<Value *> PointerOps;
- OrdersType CurrentOrder;
TreeEntry *TE = nullptr;
- switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, *LI, *TLI,
- CurrentOrder, PointerOps)) {
- case LoadsState::Vectorize:
+ switch (State) {
+ case TreeEntry::Vectorize:
if (CurrentOrder.empty()) {
// Original loads are consecutive and do not require reordering.
TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
@@ -5450,7 +5921,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
TE->setOperandsInOrder();
break;
- case LoadsState::ScatterVectorize:
+ case TreeEntry::ScatterVectorize:
// Vectorizing non-consecutive loads with `llvm.masked.gather`.
TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
UserTreeIdx, ReuseShuffleIndicies);
@@ -5458,23 +5929,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
buildTree_rec(PointerOps, Depth + 1, {TE, 0});
LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
break;
- case LoadsState::Gather:
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
-#ifndef NDEBUG
- Type *ScalarTy = VL0->getType();
- if (DL->getTypeSizeInBits(ScalarTy) !=
- DL->getTypeAllocSizeInBits(ScalarTy))
- LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
- else if (any_of(VL, [](Value *V) {
- return !cast<LoadInst>(V)->isSimple();
- }))
- LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
- else
- LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
-#endif // NDEBUG
- break;
+ case TreeEntry::NeedToGather:
+ llvm_unreachable("Unexpected loads state.");
}
return;
}
@@ -5490,18 +5946,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
- Type *SrcTy = VL0->getOperand(0)->getType();
- for (Value *V : VL) {
- Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
- if (Ty != SrcTy || !isValidElementType(Ty)) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs()
- << "SLP: Gathering casts with different src types.\n");
- return;
- }
- }
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
@@ -5521,21 +5965,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::FCmp: {
// Check that all of the compares have the same predicate.
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
- CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
- Type *ComparedTy = VL0->getOperand(0)->getType();
- for (Value *V : VL) {
- CmpInst *Cmp = cast<CmpInst>(V);
- if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
- Cmp->getOperand(0)->getType() != ComparedTy) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs()
- << "SLP: Gathering cmp with different predicate.\n");
- return;
- }
- }
-
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
@@ -5544,7 +5973,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (cast<CmpInst>(VL0)->isCommutative()) {
// Commutative predicate - collect + sort operands of the instructions
// so that each side is more likely to have the same opcode.
- assert(P0 == SwapP0 && "Commutative Predicate mismatch");
+ assert(P0 == CmpInst::getSwappedPredicate(P0) &&
+ "Commutative Predicate mismatch");
reorderInputsAccordingToOpcode(VL, Left, Right, *TLI, *DL, *SE, *this);
} else {
// Collect operands - commute if it uses the swapped predicate.
@@ -5612,60 +6042,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return;
}
case Instruction::GetElementPtr: {
- // We don't combine GEPs with complicated (nested) indexing.
- for (Value *V : VL) {
- auto *I = dyn_cast<GetElementPtrInst>(V);
- if (!I)
- continue;
- if (I->getNumOperands() != 2) {
- LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- return;
- }
- }
-
- // We can't combine several GEPs into one vector if they operate on
- // different types.
- Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
- for (Value *V : VL) {
- auto *GEP = dyn_cast<GEPOperator>(V);
- if (!GEP)
- continue;
- Type *CurTy = GEP->getSourceElementType();
- if (Ty0 != CurTy) {
- LLVM_DEBUG(dbgs()
- << "SLP: not-vectorizable GEP (different types).\n");
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- return;
- }
- }
-
- // We don't combine GEPs with non-constant indexes.
- Type *Ty1 = VL0->getOperand(1)->getType();
- for (Value *V : VL) {
- auto *I = dyn_cast<GetElementPtrInst>(V);
- if (!I)
- continue;
- auto *Op = I->getOperand(1);
- if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
- (Op->getType() != Ty1 &&
- ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
- Op->getType()->getScalarSizeInBits() >
- DL->getIndexSizeInBits(
- V->getType()->getPointerAddressSpace())))) {
- LLVM_DEBUG(dbgs()
- << "SLP: not-vectorizable GEP (non-constant indexes).\n");
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- return;
- }
- }
-
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
@@ -5722,78 +6098,29 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
case Instruction::Store: {
// Check if the stores are consecutive or if we need to swizzle them.
- llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
- // Avoid types that are padded when being allocated as scalars, while
- // being packed together in a vector (such as i1).
- if (DL->getTypeSizeInBits(ScalarTy) !=
- DL->getTypeAllocSizeInBits(ScalarTy)) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
- return;
- }
- // Make sure all stores in the bundle are simple - we can't vectorize
- // atomic or volatile stores.
- SmallVector<Value *, 4> PointerOps(VL.size());
ValueList Operands(VL.size());
- auto POIter = PointerOps.begin();
- auto OIter = Operands.begin();
+ auto *OIter = Operands.begin();
for (Value *V : VL) {
auto *SI = cast<StoreInst>(V);
- if (!SI->isSimple()) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
- return;
- }
- *POIter = SI->getPointerOperand();
*OIter = SI->getValueOperand();
- ++POIter;
++OIter;
}
-
- OrdersType CurrentOrder;
- // Check the order of pointer operands.
- if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
- Value *Ptr0;
- Value *PtrN;
- if (CurrentOrder.empty()) {
- Ptr0 = PointerOps.front();
- PtrN = PointerOps.back();
- } else {
- Ptr0 = PointerOps[CurrentOrder.front()];
- PtrN = PointerOps[CurrentOrder.back()];
- }
- std::optional<int> Dist =
- getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
- // Check that the sorted pointer operands are consecutive.
- if (static_cast<unsigned>(*Dist) == VL.size() - 1) {
- if (CurrentOrder.empty()) {
- // Original stores are consecutive and does not require reordering.
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
- UserTreeIdx, ReuseShuffleIndicies);
- TE->setOperandsInOrder();
- buildTree_rec(Operands, Depth + 1, {TE, 0});
- LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
- } else {
- fixupOrderingIndices(CurrentOrder);
- TreeEntry *TE =
- newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies, CurrentOrder);
- TE->setOperandsInOrder();
- buildTree_rec(Operands, Depth + 1, {TE, 0});
- LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
- }
- return;
- }
+ // Check that the sorted pointer operands are consecutive.
+ if (CurrentOrder.empty()) {
+ // Original stores are consecutive and do not require reordering.
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ TE->setOperandsInOrder();
+ buildTree_rec(Operands, Depth + 1, {TE, 0});
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
+ } else {
+ fixupOrderingIndices(CurrentOrder);
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies, CurrentOrder);
+ TE->setOperandsInOrder();
+ buildTree_rec(Operands, Depth + 1, {TE, 0});
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
}
-
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
return;
}
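The consecutive-store check that used to live in this case (now done earlier, before the bundle reaches this point) boils down to: sort the store pointers by offset and confirm that the sorted offsets form a run with stride one, in which case CurrentOrder is either empty (already consecutive) or the permutation to apply. A minimal standalone sketch of that idea, working on precomputed element offsets instead of LLVM pointers and SCEV (all names here are illustrative, not LLVM's):

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <optional>
#include <vector>

// Returns the sorting permutation if the offsets form a consecutive run
// (stride of exactly one element), std::nullopt otherwise. An identity
// permutation means the stores need no reordering.
std::optional<std::vector<unsigned>>
getConsecutiveOrder(const std::vector<int64_t> &ElemOffsets) {
  std::vector<unsigned> Order(ElemOffsets.size());
  std::iota(Order.begin(), Order.end(), 0u);
  std::sort(Order.begin(), Order.end(), [&](unsigned A, unsigned B) {
    return ElemOffsets[A] < ElemOffsets[B];
  });
  for (unsigned I = 1, E = Order.size(); I < E; ++I)
    if (ElemOffsets[Order[I]] != ElemOffsets[Order[I - 1]] + 1)
      return std::nullopt;
  return Order;
}

With offsets {0, 2, 1, 3} this returns {0, 2, 1, 3}, which plays the role of CurrentOrder in the jumbled-store path above; with {0, 1, 3, 4} it returns std::nullopt and the bundle would have to be gathered instead.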
case Instruction::Call: {
@@ -5802,68 +6129,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
CallInst *CI = cast<CallInst>(VL0);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- VFShape Shape = VFShape::get(
- *CI, ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
- false /*HasGlobalPred*/);
- Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
-
- if (!VecFunc && !isTriviallyVectorizable(ID)) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
- return;
- }
- Function *F = CI->getCalledFunction();
- unsigned NumArgs = CI->arg_size();
- SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr);
- for (unsigned j = 0; j != NumArgs; ++j)
- if (isVectorIntrinsicWithScalarOpAtArg(ID, j))
- ScalarArgs[j] = CI->getArgOperand(j);
- for (Value *V : VL) {
- CallInst *CI2 = dyn_cast<CallInst>(V);
- if (!CI2 || CI2->getCalledFunction() != F ||
- getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
- (VecFunc &&
- VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
- !CI->hasIdenticalOperandBundleSchema(*CI2)) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
- << "\n");
- return;
- }
- // Some intrinsics have scalar arguments and should be same in order for
- // them to be vectorized.
- for (unsigned j = 0; j != NumArgs; ++j) {
- if (isVectorIntrinsicWithScalarOpAtArg(ID, j)) {
- Value *A1J = CI2->getArgOperand(j);
- if (ScalarArgs[j] != A1J) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
- << " argument " << ScalarArgs[j] << "!=" << A1J
- << "\n");
- return;
- }
- }
- }
- // Verify that the bundle operands are identical between the two calls.
- if (CI->hasOperandBundles() &&
- !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
- CI->op_begin() + CI->getBundleOperandsEndIndex(),
- CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:"
- << *CI << "!=" << *V << '\n');
- return;
- }
- }
-
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
TE->setOperandsInOrder();
@@ -5883,15 +6148,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return;
}
case Instruction::ShuffleVector: {
- // If this is not an alternate sequence of opcode like add-sub
- // then do not vectorize this instruction.
- if (!S.isAltShuffle()) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
- return;
- }
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
@@ -5949,19 +6205,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return;
}
default:
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
- return;
+ break;
}
+ llvm_unreachable("Unexpected vectorization of the instructions.");
}
unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
unsigned N = 1;
Type *EltTy = T;
- while (isa<StructType, ArrayType, VectorType>(EltTy)) {
+ while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
if (auto *ST = dyn_cast<StructType>(EltTy)) {
// Check that struct is homogeneous.
for (const auto *Ty : ST->elements())
@@ -5982,7 +6235,8 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
if (!isValidElementType(EltTy))
return 0;
uint64_t VTSize = DL.getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N));
- if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T))
+ if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
+ VTSize != DL.getTypeStoreSizeInBits(T))
return 0;
return N;
}
@@ -6111,68 +6365,6 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
return {IntrinsicCost, LibCost};
}
-/// Compute the cost of creating a vector of type \p VecTy containing the
-/// extracted values from \p VL.
-static InstructionCost
-computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
- TargetTransformInfo::ShuffleKind ShuffleKind,
- ArrayRef<int> Mask, TargetTransformInfo &TTI) {
- unsigned NumOfParts = TTI.getNumberOfParts(VecTy);
-
- if (ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc || !NumOfParts ||
- VecTy->getNumElements() < NumOfParts)
- return TTI.getShuffleCost(ShuffleKind, VecTy, Mask);
-
- bool AllConsecutive = true;
- unsigned EltsPerVector = VecTy->getNumElements() / NumOfParts;
- unsigned Idx = -1;
- InstructionCost Cost = 0;
-
- // Process extracts in blocks of EltsPerVector to check if the source vector
- // operand can be re-used directly. If not, add the cost of creating a shuffle
- // to extract the values into a vector register.
- SmallVector<int> RegMask(EltsPerVector, UndefMaskElem);
- for (auto *V : VL) {
- ++Idx;
-
- // Reached the start of a new vector registers.
- if (Idx % EltsPerVector == 0) {
- RegMask.assign(EltsPerVector, UndefMaskElem);
- AllConsecutive = true;
- continue;
- }
-
- // Need to exclude undefs from analysis.
- if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem)
- continue;
-
- // Check all extracts for a vector register on the target directly
- // extract values in order.
- unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V));
- if (!isa<UndefValue>(VL[Idx - 1]) && Mask[Idx - 1] != UndefMaskElem) {
- unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1]));
- AllConsecutive &= PrevIdx + 1 == CurrentIdx &&
- CurrentIdx % EltsPerVector == Idx % EltsPerVector;
- RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector;
- }
-
- if (AllConsecutive)
- continue;
-
- // Skip all indices, except for the last index per vector block.
- if ((Idx + 1) % EltsPerVector != 0 && Idx + 1 != VL.size())
- continue;
-
- // If we have a series of extracts which are not consecutive and hence
- // cannot re-use the source vector register directly, compute the shuffle
- // cost to extract the vector with EltsPerVector elements.
- Cost += TTI.getShuffleCost(
- TargetTransformInfo::SK_PermuteSingleSrc,
- FixedVectorType::get(VecTy->getElementType(), EltsPerVector), RegMask);
- }
- return Cost;
-}
-
/// Build shuffle mask for shuffle graph entries and lists of main and alternate
/// operations operands.
static void
@@ -6183,7 +6375,7 @@ buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices,
SmallVectorImpl<Value *> *OpScalars = nullptr,
SmallVectorImpl<Value *> *AltScalars = nullptr) {
unsigned Sz = VL.size();
- Mask.assign(Sz, UndefMaskElem);
+ Mask.assign(Sz, PoisonMaskElem);
SmallVector<int> OrderMask;
if (!ReorderIndices.empty())
inversePermutation(ReorderIndices, OrderMask);
@@ -6203,9 +6395,9 @@ buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices,
}
}
if (!ReusesIndices.empty()) {
- SmallVector<int> NewMask(ReusesIndices.size(), UndefMaskElem);
+ SmallVector<int> NewMask(ReusesIndices.size(), PoisonMaskElem);
transform(ReusesIndices, NewMask.begin(), [&Mask](int Idx) {
- return Idx != UndefMaskElem ? Mask[Idx] : UndefMaskElem;
+ return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
});
Mask.swap(NewMask);
}
@@ -6325,13 +6517,13 @@ protected:
static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
ArrayRef<int> ExtMask) {
unsigned VF = Mask.size();
- SmallVector<int> NewMask(ExtMask.size(), UndefMaskElem);
+ SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
- if (ExtMask[I] == UndefMaskElem)
+ if (ExtMask[I] == PoisonMaskElem)
continue;
int MaskedIdx = Mask[ExtMask[I] % VF];
NewMask[I] =
- MaskedIdx == UndefMaskElem ? UndefMaskElem : MaskedIdx % LocalVF;
+ MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
}
Mask.swap(NewMask);
}
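combineMasks composes two shuffle masks: every surviving lane of the outer mask (ExtMask) is redirected to whichever source lane the inner mask already selected, and poison propagates through the composition. A self-contained sketch of the same operation, with -1 standing in for PoisonMaskElem (purely illustrative, not the LLVM helper itself):

#include <vector>

constexpr int Poison = -1;

// Result[I] is the source lane obtained by applying Mask first and then
// ExtMask on top of it; poison in either mask yields poison.
std::vector<int> composeMasks(unsigned LocalVF, const std::vector<int> &Mask,
                              const std::vector<int> &ExtMask) {
  unsigned VF = Mask.size();
  std::vector<int> NewMask(ExtMask.size(), Poison);
  for (unsigned I = 0, E = ExtMask.size(); I < E; ++I) {
    if (ExtMask[I] == Poison)
      continue;
    int Inner = Mask[ExtMask[I] % VF];
    NewMask[I] = Inner == Poison ? Poison : Inner % LocalVF;
  }
  return NewMask;
}

For example, Mask = {2, 3, -1, 0} followed by ExtMask = {1, 1, 3, -1} with LocalVF = 4 composes to {3, 3, 0, -1}.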
@@ -6418,11 +6610,12 @@ protected:
if (auto *SVOpTy =
dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
LocalVF = SVOpTy->getNumElements();
- SmallVector<int> ExtMask(Mask.size(), UndefMaskElem);
+ SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
for (auto [Idx, I] : enumerate(Mask)) {
- if (I == UndefMaskElem)
- continue;
- ExtMask[Idx] = SV->getMaskValue(I);
+ if (I == PoisonMaskElem ||
+ static_cast<unsigned>(I) >= SV->getShuffleMask().size())
+ continue;
+ ExtMask[Idx] = SV->getMaskValue(I);
}
bool IsOp1Undef =
isUndefVector(SV->getOperand(0),
@@ -6435,11 +6628,11 @@ protected:
if (!IsOp1Undef && !IsOp2Undef) {
// Update mask and mark undef elems.
for (int &I : Mask) {
- if (I == UndefMaskElem)
+ if (I == PoisonMaskElem)
continue;
if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
- UndefMaskElem)
- I = UndefMaskElem;
+ PoisonMaskElem)
+ I = PoisonMaskElem;
}
break;
}
@@ -6453,15 +6646,16 @@ protected:
Op = SV->getOperand(1);
}
if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
- !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute)) {
+ !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
+ ShuffleVectorInst::isZeroEltSplatMask(Mask)) {
if (IdentityOp) {
V = IdentityOp;
assert(Mask.size() == IdentityMask.size() &&
"Expected masks of same sizes.");
// Clear known poison elements.
for (auto [I, Idx] : enumerate(Mask))
- if (Idx == UndefMaskElem)
- IdentityMask[I] = UndefMaskElem;
+ if (Idx == PoisonMaskElem)
+ IdentityMask[I] = PoisonMaskElem;
Mask.swap(IdentityMask);
auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
return SinglePermute &&
@@ -6481,10 +6675,12 @@ protected:
/// Smart shuffle instruction emission, walks through shuffle trees and
/// tries to find the best matching vector for the actual shuffle
/// instruction.
- template <typename ShuffleBuilderTy>
- static Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
- ShuffleBuilderTy &Builder) {
+ template <typename T, typename ShuffleBuilderTy>
+ static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
+ ShuffleBuilderTy &Builder) {
assert(V1 && "Expected at least one vector value.");
+ if (V2)
+ Builder.resizeToMatch(V1, V2);
int VF = Mask.size();
if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
VF = FTy->getNumElements();
@@ -6495,8 +6691,8 @@ protected:
Value *Op2 = V2;
int VF =
cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
- SmallVector<int> CombinedMask1(Mask.size(), UndefMaskElem);
- SmallVector<int> CombinedMask2(Mask.size(), UndefMaskElem);
+ SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
+ SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
for (int I = 0, E = Mask.size(); I < E; ++I) {
if (Mask[I] < VF)
CombinedMask1[I] = Mask[I];
@@ -6514,9 +6710,9 @@ protected:
// again.
if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
- SmallVector<int> ExtMask1(Mask.size(), UndefMaskElem);
+ SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
for (auto [Idx, I] : enumerate(CombinedMask1)) {
- if (I == UndefMaskElem)
+ if (I == PoisonMaskElem)
continue;
ExtMask1[Idx] = SV1->getMaskValue(I);
}
@@ -6524,9 +6720,9 @@ protected:
cast<FixedVectorType>(SV1->getOperand(1)->getType())
->getNumElements(),
ExtMask1, UseMask::SecondArg);
- SmallVector<int> ExtMask2(CombinedMask2.size(), UndefMaskElem);
+ SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
for (auto [Idx, I] : enumerate(CombinedMask2)) {
- if (I == UndefMaskElem)
+ if (I == PoisonMaskElem)
continue;
ExtMask2[Idx] = SV2->getMaskValue(I);
}
@@ -6566,64 +6762,360 @@ protected:
->getElementCount()
.getKnownMinValue());
for (int I = 0, E = Mask.size(); I < E; ++I) {
- if (CombinedMask2[I] != UndefMaskElem) {
- assert(CombinedMask1[I] == UndefMaskElem &&
+ if (CombinedMask2[I] != PoisonMaskElem) {
+ assert(CombinedMask1[I] == PoisonMaskElem &&
"Expected undefined mask element");
CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
}
}
+ const int Limit = CombinedMask1.size() * 2;
+ if (Op1 == Op2 && Limit == 2 * VF &&
+ all_of(CombinedMask1, [=](int Idx) { return Idx < Limit; }) &&
+ (ShuffleVectorInst::isIdentityMask(CombinedMask1) ||
+ (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1) &&
+ isa<ShuffleVectorInst>(Op1) &&
+ cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
+ ArrayRef(CombinedMask1))))
+ return Builder.createIdentity(Op1);
return Builder.createShuffleVector(
Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
CombinedMask1);
}
if (isa<PoisonValue>(V1))
- return PoisonValue::get(FixedVectorType::get(
- cast<VectorType>(V1->getType())->getElementType(), Mask.size()));
+ return Builder.createPoison(
+ cast<VectorType>(V1->getType())->getElementType(), Mask.size());
SmallVector<int> NewMask(Mask.begin(), Mask.end());
bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
assert(V1 && "Expected non-null value after looking through shuffles.");
if (!IsIdentity)
return Builder.createShuffleVector(V1, NewMask);
- return V1;
+ return Builder.createIdentity(V1);
}
};
} // namespace
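The reworked createShuffle above is now parameterized both by the result type and by a builder object, so the same shuffle-tree walk can either emit IR or merely price it; the cost path below plugs in a builder whose createIdentity and createPoison are free. A toy, non-LLVM sketch of that design, with simplified "builders" and a trivial identity check standing in for the real analysis:

#include <cstdio>
#include <string>
#include <vector>

// Builder that "emits": returns a description of the instruction it would
// create (a stand-in for IRBuilder-based emission).
struct EmitBuilder {
  std::string createShuffleVector(const std::string &V,
                                  const std::vector<int> &) {
    return "shufflevector(" + V + ")";
  }
  std::string createIdentity(const std::string &V) { return V; }
};

// Builder that prices the same decisions instead of emitting anything.
struct CostBuilder {
  int createShuffleVector(const std::string &, const std::vector<int> &) {
    return 1; // a real shuffle has a cost
  }
  int createIdentity(const std::string &) { return 0; } // identity is free
};

// One driver, two result types: mirrors the split between
// BaseShuffleAnalysis::createShuffle<Value *> and <InstructionCost>.
template <typename T, typename BuilderTy>
T createShuffle(const std::string &V, const std::vector<int> &Mask,
                BuilderTy &Builder) {
  bool IsIdentity = true;
  for (int I = 0, E = Mask.size(); I < E; ++I)
    IsIdentity &= Mask[I] < 0 || Mask[I] == I;
  return IsIdentity ? Builder.createIdentity(V)
                    : Builder.createShuffleVector(V, Mask);
}

int main() {
  std::vector<int> Mask = {1, 0, 2, 3};
  EmitBuilder EB;
  CostBuilder CB;
  std::printf("%s\n", createShuffle<std::string>("%v", Mask, EB).c_str());
  std::printf("cost = %d\n", createShuffle<int>("%v", Mask, CB));
}

Swapping the builder is the whole trick: the ShuffleCostEstimator introduced below reuses the generator's combining logic while producing only an InstructionCost.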
-InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
- ArrayRef<Value *> VectorizedVals) {
- ArrayRef<Value *> VL = E->Scalars;
+/// Merges shuffle masks and emits the final shuffle instruction, if required.
+/// It supports shuffling of 2 input vectors. It implements lazy shuffle
+/// emission: the actual shuffle instruction is generated only if it is really
+/// required. Otherwise, shuffle instruction emission is delayed until the end
+/// of the process, to reduce the number of emitted instructions and to enable
+/// further analysis/transformations.
+class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
+ bool IsFinalized = false;
+ SmallVector<int> CommonMask;
+ SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
+ const TargetTransformInfo &TTI;
+ InstructionCost Cost = 0;
+ ArrayRef<Value *> VectorizedVals;
+ BoUpSLP &R;
+ SmallPtrSetImpl<Value *> &CheckedExtracts;
+ constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+
+ InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
+ if ((!Root && allConstant(VL)) || all_of(VL, UndefValue::classof))
+ return TTI::TCC_Free;
+ auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
+ InstructionCost GatherCost = 0;
+ SmallVector<Value *> Gathers(VL.begin(), VL.end());
+ // Improve gather cost for gather of loads, if we can group some of the
+ // loads into vector loads.
+ InstructionsState S = getSameOpcode(VL, *R.TLI);
+ if (VL.size() > 2 && S.getOpcode() == Instruction::Load &&
+ !S.isAltShuffle() &&
+ !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) &&
+ !isSplat(Gathers)) {
+ BoUpSLP::ValueSet VectorizedLoads;
+ unsigned StartIdx = 0;
+ unsigned VF = VL.size() / 2;
+ unsigned VectorizedCnt = 0;
+ unsigned ScatterVectorizeCnt = 0;
+ const unsigned Sz = R.DL->getTypeSizeInBits(S.MainOp->getType());
+ for (unsigned MinVF = R.getMinVF(2 * Sz); VF >= MinVF; VF /= 2) {
+ for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
+ Cnt += VF) {
+ ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
+ if (!VectorizedLoads.count(Slice.front()) &&
+ !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
+ SmallVector<Value *> PointerOps;
+ OrdersType CurrentOrder;
+ LoadsState LS =
+ canVectorizeLoads(Slice, Slice.front(), TTI, *R.DL, *R.SE,
+ *R.LI, *R.TLI, CurrentOrder, PointerOps);
+ switch (LS) {
+ case LoadsState::Vectorize:
+ case LoadsState::ScatterVectorize:
+ // Mark the vectorized loads so that we don't vectorize them
+ // again.
+ if (LS == LoadsState::Vectorize)
+ ++VectorizedCnt;
+ else
+ ++ScatterVectorizeCnt;
+ VectorizedLoads.insert(Slice.begin(), Slice.end());
+ // If we vectorized the initial block, no need to try to vectorize
+ // it again.
+ if (Cnt == StartIdx)
+ StartIdx += VF;
+ break;
+ case LoadsState::Gather:
+ break;
+ }
+ }
+ }
+ // Check if the whole array was vectorized already - exit.
+ if (StartIdx >= VL.size())
+ break;
+ // Found vectorizable parts - exit.
+ if (!VectorizedLoads.empty())
+ break;
+ }
+ if (!VectorizedLoads.empty()) {
+ unsigned NumParts = TTI.getNumberOfParts(VecTy);
+ bool NeedInsertSubvectorAnalysis =
+ !NumParts || (VL.size() / VF) > NumParts;
+ // Get the cost for gathered loads.
+ for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
+ if (VectorizedLoads.contains(VL[I]))
+ continue;
+ GatherCost += getBuildVectorCost(VL.slice(I, VF), Root);
+ }
+ // Exclude potentially vectorized loads from list of gathered
+ // scalars.
+ auto *LI = cast<LoadInst>(S.MainOp);
+ Gathers.assign(Gathers.size(), PoisonValue::get(LI->getType()));
+ // The cost for vectorized loads.
+ InstructionCost ScalarsCost = 0;
+ for (Value *V : VectorizedLoads) {
+ auto *LI = cast<LoadInst>(V);
+ ScalarsCost +=
+ TTI.getMemoryOpCost(Instruction::Load, LI->getType(),
+ LI->getAlign(), LI->getPointerAddressSpace(),
+ CostKind, TTI::OperandValueInfo(), LI);
+ }
+ auto *LoadTy = FixedVectorType::get(LI->getType(), VF);
+ Align Alignment = LI->getAlign();
+ GatherCost +=
+ VectorizedCnt *
+ TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
+ LI->getPointerAddressSpace(), CostKind,
+ TTI::OperandValueInfo(), LI);
+ GatherCost += ScatterVectorizeCnt *
+ TTI.getGatherScatterOpCost(
+ Instruction::Load, LoadTy, LI->getPointerOperand(),
+ /*VariableMask=*/false, Alignment, CostKind, LI);
+ if (NeedInsertSubvectorAnalysis) {
+ // Add the cost for the subvectors insert.
+ for (int I = VF, E = VL.size(); I < E; I += VF)
+ GatherCost += TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy,
+ std::nullopt, CostKind, I, LoadTy);
+ }
+ GatherCost -= ScalarsCost;
+ }
+ } else if (!Root && isSplat(VL)) {
+ // Found the broadcast of a single scalar, so calculate the cost as
+ // a broadcast.
+ const auto *It =
+ find_if(VL, [](Value *V) { return !isa<UndefValue>(V); });
+ assert(It != VL.end() && "Expected at least one non-undef value.");
+ // Add broadcast for non-identity shuffle only.
+ bool NeedShuffle =
+ count(VL, *It) > 1 &&
+ (VL.front() != *It || !all_of(VL.drop_front(), UndefValue::classof));
+ InstructionCost InsertCost = TTI.getVectorInstrCost(
+ Instruction::InsertElement, VecTy, CostKind,
+ NeedShuffle ? 0 : std::distance(VL.begin(), It),
+ PoisonValue::get(VecTy), *It);
+ return InsertCost +
+ (NeedShuffle ? TTI.getShuffleCost(
+ TargetTransformInfo::SK_Broadcast, VecTy,
+ /*Mask=*/std::nullopt, CostKind, /*Index=*/0,
+ /*SubTp=*/nullptr, /*Args=*/*It)
+ : TTI::TCC_Free);
+ }
+ return GatherCost +
+ (all_of(Gathers, UndefValue::classof)
+ ? TTI::TCC_Free
+ : R.getGatherCost(Gathers, !Root && VL.equals(Gathers)));
+ };
- Type *ScalarTy = VL[0]->getType();
- if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
- ScalarTy = SI->getValueOperand()->getType();
- else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
- ScalarTy = CI->getOperand(0)->getType();
- else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
- ScalarTy = IE->getOperand(1)->getType();
- auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ /// Compute the cost of creating a vector of type \p VecTy containing the
+ /// extracted values from \p VL.
+ InstructionCost computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
+ TTI::ShuffleKind ShuffleKind) {
+ auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
+ unsigned NumOfParts = TTI.getNumberOfParts(VecTy);
- // If we have computed a smaller type for the expression, update VecTy so
- // that the costs will be accurate.
- if (MinBWs.count(VL[0]))
- VecTy = FixedVectorType::get(
- IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
- unsigned EntryVF = E->getVectorFactor();
- auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF);
+ if (ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc ||
+ !NumOfParts || VecTy->getNumElements() < NumOfParts)
+ return TTI.getShuffleCost(ShuffleKind, VecTy, Mask);
- bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
- // FIXME: it tries to fix a problem with MSVC buildbots.
- TargetTransformInfo *TTI = this->TTI;
- auto AdjustExtractsCost = [=](InstructionCost &Cost) {
+ bool AllConsecutive = true;
+ unsigned EltsPerVector = VecTy->getNumElements() / NumOfParts;
+ unsigned Idx = -1;
+ InstructionCost Cost = 0;
+
+ // Process extracts in blocks of EltsPerVector to check if the source vector
+ // operand can be re-used directly. If not, add the cost of creating a
+ // shuffle to extract the values into a vector register.
+ SmallVector<int> RegMask(EltsPerVector, PoisonMaskElem);
+ for (auto *V : VL) {
+ ++Idx;
+
+ // Reached the start of a new vector register.
+ if (Idx % EltsPerVector == 0) {
+ RegMask.assign(EltsPerVector, PoisonMaskElem);
+ AllConsecutive = true;
+ continue;
+ }
+
+ // Need to exclude undefs from analysis.
+ if (isa<UndefValue>(V) || Mask[Idx] == PoisonMaskElem)
+ continue;
+
+ // Check that all extracts for a vector register on the target directly
+ // extract values in order.
+ unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V));
+ if (!isa<UndefValue>(VL[Idx - 1]) && Mask[Idx - 1] != PoisonMaskElem) {
+ unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1]));
+ AllConsecutive &= PrevIdx + 1 == CurrentIdx &&
+ CurrentIdx % EltsPerVector == Idx % EltsPerVector;
+ RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector;
+ }
+
+ if (AllConsecutive)
+ continue;
+
+ // Skip all indices, except for the last index per vector block.
+ if ((Idx + 1) % EltsPerVector != 0 && Idx + 1 != VL.size())
+ continue;
+
+ // If we have a series of extracts which are not consecutive and hence
+ // cannot re-use the source vector register directly, compute the shuffle
+ // cost to extract the vector with EltsPerVector elements.
+ Cost += TTI.getShuffleCost(
+ TargetTransformInfo::SK_PermuteSingleSrc,
+ FixedVectorType::get(VecTy->getElementType(), EltsPerVector),
+ RegMask);
+ }
+ return Cost;
+ }
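The loop above walks the extracts in EltsPerVector-sized blocks and only charges SK_PermuteSingleSrc for blocks whose source indices are not already consecutive and block-aligned. A stripped-down sketch of that per-block predicate, operating on raw extract indices with -1 for undef lanes (illustrative only):

#include <algorithm>
#include <vector>

constexpr int Undef = -1;

// Count the EltsPerVector-wide blocks that would need an extra
// single-source shuffle because their extract indices are not laid out
// consecutively (and aligned) in the source register.
unsigned countShuffledBlocks(const std::vector<int> &ExtractIdx,
                             unsigned EltsPerVector) {
  unsigned Shuffled = 0;
  for (unsigned Begin = 0; Begin < ExtractIdx.size(); Begin += EltsPerVector) {
    unsigned End =
        std::min<unsigned>(Begin + EltsPerVector, ExtractIdx.size());
    bool Consecutive = true;
    for (unsigned I = Begin; I < End; ++I) {
      if (ExtractIdx[I] == Undef)
        continue; // undef lanes do not constrain the layout
      bool Aligned = ExtractIdx[I] % EltsPerVector == I % EltsPerVector;
      bool FollowsPrev = I == Begin || ExtractIdx[I - 1] == Undef ||
                         ExtractIdx[I] == ExtractIdx[I - 1] + 1;
      Consecutive &= Aligned && FollowsPrev;
    }
    Shuffled += !Consecutive;
  }
  return Shuffled;
}

With EltsPerVector = 4, indices {4, 5, 6, 7} cost nothing extra, while {4, 6, 5, 7} makes that block pay for one permute.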
+
+ class ShuffleCostBuilder {
+ const TargetTransformInfo &TTI;
+
+ static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
+ int Limit = 2 * VF;
+ return Mask.empty() ||
+ (VF == Mask.size() &&
+ all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) &&
+ ShuffleVectorInst::isIdentityMask(Mask));
+ }
+
+ public:
+ ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
+ ~ShuffleCostBuilder() = default;
+ InstructionCost createShuffleVector(Value *V1, Value *,
+ ArrayRef<int> Mask) const {
+ // An empty mask or an identity mask is free.
+ unsigned VF =
+ cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
+ if (isEmptyOrIdentity(Mask, VF))
+ return TTI::TCC_Free;
+ return TTI.getShuffleCost(
+ TTI::SK_PermuteTwoSrc,
+ FixedVectorType::get(
+ cast<VectorType>(V1->getType())->getElementType(), Mask.size()),
+ Mask);
+ }
+ InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
+ // An empty mask or an identity mask is free.
+ if (isEmptyOrIdentity(Mask, Mask.size()))
+ return TTI::TCC_Free;
+ return TTI.getShuffleCost(
+ TTI::SK_PermuteSingleSrc,
+ FixedVectorType::get(
+ cast<VectorType>(V1->getType())->getElementType(), Mask.size()),
+ Mask);
+ }
+ InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
+ InstructionCost createPoison(Type *Ty, unsigned VF) const {
+ return TTI::TCC_Free;
+ }
+ void resizeToMatch(Value *&, Value *&) const {}
+ };
+
+ /// Smart shuffle instruction emission, walks through shuffle trees and
+ /// tries to find the best matching vector for the actual shuffle
+ /// instruction.
+ InstructionCost
+ createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
+ const PointerUnion<Value *, const TreeEntry *> &P2,
+ ArrayRef<int> Mask) {
+ ShuffleCostBuilder Builder(TTI);
+ Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
+ unsigned CommonVF = 0;
+ if (!V1) {
+ const TreeEntry *E = P1.get<const TreeEntry *>();
+ unsigned VF = E->getVectorFactor();
+ if (V2) {
+ unsigned V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
+ if (V2VF != VF && V2VF == E->Scalars.size())
+ VF = E->Scalars.size();
+ } else if (!P2.isNull()) {
+ const TreeEntry *E2 = P2.get<const TreeEntry *>();
+ if (E->Scalars.size() == E2->Scalars.size())
+ CommonVF = VF = E->Scalars.size();
+ } else {
+ // P2 is empty, check that we have the same node + reshuffle (if any).
+ if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
+ VF = E->Scalars.size();
+ SmallVector<int> CommonMask(Mask.begin(), Mask.end());
+ ::addMask(CommonMask, E->getCommonMask());
+ V1 = Constant::getNullValue(
+ FixedVectorType::get(E->Scalars.front()->getType(), VF));
+ return BaseShuffleAnalysis::createShuffle<InstructionCost>(
+ V1, nullptr, CommonMask, Builder);
+ }
+ }
+ V1 = Constant::getNullValue(
+ FixedVectorType::get(E->Scalars.front()->getType(), VF));
+ }
+ if (!V2 && !P2.isNull()) {
+ const TreeEntry *E = P2.get<const TreeEntry *>();
+ unsigned VF = E->getVectorFactor();
+ unsigned V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
+ if (!CommonVF && V1VF == E->Scalars.size())
+ CommonVF = E->Scalars.size();
+ if (CommonVF)
+ VF = CommonVF;
+ V2 = Constant::getNullValue(
+ FixedVectorType::get(E->Scalars.front()->getType(), VF));
+ }
+ return BaseShuffleAnalysis::createShuffle<InstructionCost>(V1, V2, Mask,
+ Builder);
+ }
+
+public:
+ ShuffleCostEstimator(TargetTransformInfo &TTI,
+ ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
+ SmallPtrSetImpl<Value *> &CheckedExtracts)
+ : TTI(TTI), VectorizedVals(VectorizedVals), R(R),
+ CheckedExtracts(CheckedExtracts) {}
+ Value *adjustExtracts(const TreeEntry *E, ArrayRef<int> Mask,
+ TTI::ShuffleKind ShuffleKind) {
+ if (Mask.empty())
+ return nullptr;
+ Value *VecBase = nullptr;
+ ArrayRef<Value *> VL = E->Scalars;
+ auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
// If the resulting type is scalarized, do not adjust the cost.
- unsigned VecNumParts = TTI->getNumberOfParts(VecTy);
+ unsigned VecNumParts = TTI.getNumberOfParts(VecTy);
if (VecNumParts == VecTy->getNumElements())
- return;
+ return nullptr;
DenseMap<Value *, int> ExtractVectorsTys;
- SmallPtrSet<Value *, 4> CheckedExtracts;
- for (auto *V : VL) {
- if (isa<UndefValue>(V))
+ for (auto [I, V] : enumerate(VL)) {
+ // Ignore non-extractelement scalars.
+ if (isa<UndefValue>(V) || (!Mask.empty() && Mask[I] == PoisonMaskElem))
continue;
// If all users of the instruction are going to be vectorized and this
// instruction itself is not going to be vectorized, consider this
@@ -6631,17 +7123,18 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
// vectorized tree.
// Also, avoid adjusting the cost for extractelements with multiple uses
// in different graph entries.
- const TreeEntry *VE = getTreeEntry(V);
+ const TreeEntry *VE = R.getTreeEntry(V);
if (!CheckedExtracts.insert(V).second ||
- !areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) ||
+ !R.areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) ||
(VE && VE != E))
continue;
auto *EE = cast<ExtractElementInst>(V);
+ VecBase = EE->getVectorOperand();
std::optional<unsigned> EEIdx = getExtractIndex(EE);
if (!EEIdx)
continue;
unsigned Idx = *EEIdx;
- if (VecNumParts != TTI->getNumberOfParts(EE->getVectorOperandType())) {
+ if (VecNumParts != TTI.getNumberOfParts(EE->getVectorOperandType())) {
auto It =
ExtractVectorsTys.try_emplace(EE->getVectorOperand(), Idx).first;
It->getSecond() = std::min<int>(It->second, Idx);
@@ -6654,18 +7147,17 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
})) {
// Use getExtractWithExtendCost() to calculate the cost of
// extractelement/ext pair.
- Cost -=
- TTI->getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
- EE->getVectorOperandType(), Idx);
+ Cost -= TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
+ EE->getVectorOperandType(), Idx);
// Add back the cost of s|zext which is subtracted separately.
- Cost += TTI->getCastInstrCost(
+ Cost += TTI.getCastInstrCost(
Ext->getOpcode(), Ext->getType(), EE->getType(),
TTI::getCastContextHint(Ext), CostKind, Ext);
continue;
}
}
- Cost -= TTI->getVectorInstrCost(*EE, EE->getVectorOperandType(), CostKind,
- Idx);
+ Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(), CostKind,
+ Idx);
}
// Add a cost for subvector extracts/inserts if required.
for (const auto &Data : ExtractVectorsTys) {
@@ -6673,34 +7165,148 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
unsigned NumElts = VecTy->getNumElements();
if (Data.second % NumElts == 0)
continue;
- if (TTI->getNumberOfParts(EEVTy) > VecNumParts) {
+ if (TTI.getNumberOfParts(EEVTy) > VecNumParts) {
unsigned Idx = (Data.second / NumElts) * NumElts;
unsigned EENumElts = EEVTy->getNumElements();
+ if (Idx % NumElts == 0)
+ continue;
if (Idx + NumElts <= EENumElts) {
- Cost +=
- TTI->getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
- EEVTy, std::nullopt, CostKind, Idx, VecTy);
+ Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
+ EEVTy, std::nullopt, CostKind, Idx, VecTy);
} else {
// Need to round up the subvector type vectorization factor to avoid a
// crash in cost model functions. Make SubVT so that Idx + VF of SubVT
// <= EENumElts.
auto *SubVT =
FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx);
- Cost +=
- TTI->getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
- EEVTy, std::nullopt, CostKind, Idx, SubVT);
+ Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
+ EEVTy, std::nullopt, CostKind, Idx, SubVT);
}
} else {
- Cost += TTI->getShuffleCost(TargetTransformInfo::SK_InsertSubvector,
- VecTy, std::nullopt, CostKind, 0, EEVTy);
+ Cost += TTI.getShuffleCost(TargetTransformInfo::SK_InsertSubvector,
+ VecTy, std::nullopt, CostKind, 0, EEVTy);
}
}
- };
+ // Check that the gather of extractelements can be represented as just a
+ // shuffle of one or two vectors that the scalars are extracted from.
+ // Found the bunch of extractelement instructions that must be gathered
+ // into a vector and can be represented as a permutation of elements in a
+ // single input vector or of two input vectors.
+ Cost += computeExtractCost(VL, Mask, ShuffleKind);
+ return VecBase;
+ }
+ void add(const TreeEntry *E1, const TreeEntry *E2, ArrayRef<int> Mask) {
+ CommonMask.assign(Mask.begin(), Mask.end());
+ InVectors.assign({E1, E2});
+ }
+ void add(const TreeEntry *E1, ArrayRef<int> Mask) {
+ CommonMask.assign(Mask.begin(), Mask.end());
+ InVectors.assign(1, E1);
+ }
+ /// Adds another one input vector and the mask for the shuffling.
+ void add(Value *V1, ArrayRef<int> Mask) {
+ assert(CommonMask.empty() && InVectors.empty() &&
+ "Expected empty input mask/vectors.");
+ CommonMask.assign(Mask.begin(), Mask.end());
+ InVectors.assign(1, V1);
+ }
+ Value *gather(ArrayRef<Value *> VL, Value *Root = nullptr) {
+ Cost += getBuildVectorCost(VL, Root);
+ if (!Root) {
+ assert(InVectors.empty() && "Unexpected input vectors for buildvector.");
+ // FIXME: Need to find a way to avoid use of getNullValue here.
+ SmallVector<Constant *> Vals;
+ for (Value *V : VL) {
+ if (isa<UndefValue>(V)) {
+ Vals.push_back(cast<Constant>(V));
+ continue;
+ }
+ Vals.push_back(Constant::getNullValue(V->getType()));
+ }
+ return ConstantVector::get(Vals);
+ }
+ return ConstantVector::getSplat(
+ ElementCount::getFixed(VL.size()),
+ Constant::getNullValue(VL.front()->getType()));
+ }
+ /// Finalize emission of the shuffles.
+ InstructionCost
+ finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
+ function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
+ IsFinalized = true;
+ if (Action) {
+ const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
+ if (InVectors.size() == 2) {
+ Cost += createShuffle(Vec, InVectors.back(), CommonMask);
+ InVectors.pop_back();
+ } else {
+ Cost += createShuffle(Vec, nullptr, CommonMask);
+ }
+ for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
+ if (CommonMask[Idx] != PoisonMaskElem)
+ CommonMask[Idx] = Idx;
+ assert(VF > 0 &&
+ "Expected vector length for the final value before action.");
+ Value *V = Vec.dyn_cast<Value *>();
+ if (!Vec.isNull() && !V)
+ V = Constant::getNullValue(FixedVectorType::get(
+ Vec.get<const TreeEntry *>()->Scalars.front()->getType(),
+ CommonMask.size()));
+ Action(V, CommonMask);
+ }
+ ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
+ if (CommonMask.empty())
+ return Cost;
+ int Limit = CommonMask.size() * 2;
+ if (all_of(CommonMask, [=](int Idx) { return Idx < Limit; }) &&
+ ShuffleVectorInst::isIdentityMask(CommonMask))
+ return Cost;
+ return Cost +
+ createShuffle(InVectors.front(),
+ InVectors.size() == 2 ? InVectors.back() : nullptr,
+ CommonMask);
+ }
+
+ ~ShuffleCostEstimator() {
+ assert((IsFinalized || CommonMask.empty()) &&
+ "Shuffle construction must be finalized.");
+ }
+};
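One branch of getBuildVectorCost above handles splats: if the repeated value already sits in lane 0 and every other lane is undef, a single insertelement is enough, otherwise an insert plus a broadcast shuffle is priced. A small standalone sketch of just that classification, using std::optional to play the role of UndefValue (toy types, not LLVM's):

#include <algorithm>
#include <optional>
#include <vector>

enum class SplatLowering { AllUndef, SingleInsert, InsertPlusBroadcast };

SplatLowering classifySplat(const std::vector<std::optional<int>> &VL) {
  auto It = std::find_if(
      VL.begin(), VL.end(),
      [](const std::optional<int> &V) { return V.has_value(); });
  if (It == VL.end())
    return SplatLowering::AllUndef; // nothing to materialize
  // A broadcast shuffle is needed only if the value is used in more than one
  // lane and the bundle is not already "value in lane 0, undef elsewhere".
  bool NeedShuffle =
      std::count(VL.begin(), VL.end(), *It) > 1 &&
      (VL.front() != *It ||
       !std::all_of(std::next(VL.begin()), VL.end(),
                    [](const std::optional<int> &V) { return !V.has_value(); }));
  return NeedShuffle ? SplatLowering::InsertPlusBroadcast
                     : SplatLowering::SingleInsert;
}

classifySplat({7, 7, 7, 7}) reports InsertPlusBroadcast, while classifySplat({7, nullopt, nullopt, nullopt}) reports SingleInsert, matching the NeedShuffle logic in the estimator.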
+
+InstructionCost
+BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
+ SmallPtrSetImpl<Value *> &CheckedExtracts) {
+ ArrayRef<Value *> VL = E->Scalars;
+
+ Type *ScalarTy = VL[0]->getType();
+ if (auto *SI = dyn_cast<StoreInst>(VL[0]))
+ ScalarTy = SI->getValueOperand()->getType();
+ else if (auto *CI = dyn_cast<CmpInst>(VL[0]))
+ ScalarTy = CI->getOperand(0)->getType();
+ else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
+ ScalarTy = IE->getOperand(1)->getType();
+ auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+
+ // If we have computed a smaller type for the expression, update VecTy so
+ // that the costs will be accurate.
+ if (MinBWs.count(VL[0]))
+ VecTy = FixedVectorType::get(
+ IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
+ unsigned EntryVF = E->getVectorFactor();
+ auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF);
+
+ bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
if (E->State == TreeEntry::NeedToGather) {
if (allConstant(VL))
return 0;
if (isa<InsertElementInst>(VL[0]))
return InstructionCost::getInvalid();
+ ShuffleCostEstimator Estimator(*TTI, VectorizedVals, *this,
+ CheckedExtracts);
+ unsigned VF = E->getVectorFactor();
+ SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
+ E->ReuseShuffleIndices.end());
SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
// Build a mask out of the reorder indices and reorder scalars per this
// mask.
@@ -6709,195 +7315,104 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
if (!ReorderMask.empty())
reorderScalars(GatheredScalars, ReorderMask);
SmallVector<int> Mask;
+ SmallVector<int> ExtractMask;
+ std::optional<TargetTransformInfo::ShuffleKind> ExtractShuffle;
std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle;
SmallVector<const TreeEntry *> Entries;
+ Type *ScalarTy = GatheredScalars.front()->getType();
+ // Check for gathered extracts.
+ ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask);
+ SmallVector<Value *> IgnoredVals;
+ if (UserIgnoreList)
+ IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end());
+
+ bool Resized = false;
+ if (Value *VecBase = Estimator.adjustExtracts(
+ E, ExtractMask, ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc)))
+ if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
+ if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) {
+ Resized = true;
+ GatheredScalars.append(VF - GatheredScalars.size(),
+ PoisonValue::get(ScalarTy));
+ }
+
// Do not try to look for reshuffled loads for gathered loads (they will be
// handled later), for vectorized scalars, and cases, which are definitely
- // not profitable (splats and small gather nodes.)
- if (E->getOpcode() != Instruction::Load || E->isAltShuffle() ||
+ if (ExtractShuffle || E->getOpcode() != Instruction::Load ||
+ E->isAltShuffle() ||
all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
isSplat(E->Scalars) ||
(E->Scalars != GatheredScalars && GatheredScalars.size() <= 2))
GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries);
if (GatherShuffle) {
- // Remove shuffled elements from list of gathers.
- for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
- if (Mask[I] != UndefMaskElem)
- GatheredScalars[I] = PoisonValue::get(ScalarTy);
- }
assert((Entries.size() == 1 || Entries.size() == 2) &&
"Expected shuffle of 1 or 2 entries.");
- InstructionCost GatherCost = 0;
- int Limit = Mask.size() * 2;
- if (all_of(Mask, [=](int Idx) { return Idx < Limit; }) &&
- ShuffleVectorInst::isIdentityMask(Mask)) {
+ if (*GatherShuffle == TTI::SK_PermuteSingleSrc &&
+ Entries.front()->isSame(E->Scalars)) {
// Perfect match in the graph, will reuse the previously vectorized
// node. Cost is 0.
LLVM_DEBUG(
dbgs()
<< "SLP: perfect diamond match for gather bundle that starts with "
<< *VL.front() << ".\n");
- if (NeedToShuffleReuses)
- GatherCost =
- TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
- FinalVecTy, E->ReuseShuffleIndices);
- } else {
- LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
- << " entries for bundle that starts with "
- << *VL.front() << ".\n");
- // Detected that instead of gather we can emit a shuffle of single/two
- // previously vectorized nodes. Add the cost of the permutation rather
- // than gather.
- ::addMask(Mask, E->ReuseShuffleIndices);
- GatherCost = TTI->getShuffleCost(*GatherShuffle, FinalVecTy, Mask);
- }
- if (!all_of(GatheredScalars, UndefValue::classof))
- GatherCost += getGatherCost(GatheredScalars);
- return GatherCost;
- }
- if ((E->getOpcode() == Instruction::ExtractElement ||
- all_of(E->Scalars,
- [](Value *V) {
- return isa<ExtractElementInst, UndefValue>(V);
- })) &&
- allSameType(VL)) {
- // Check that gather of extractelements can be represented as just a
- // shuffle of a single/two vectors the scalars are extracted from.
- SmallVector<int> Mask;
- std::optional<TargetTransformInfo::ShuffleKind> ShuffleKind =
- isFixedVectorShuffle(VL, Mask);
- if (ShuffleKind) {
- // Found the bunch of extractelement instructions that must be gathered
- // into a vector and can be represented as a permutation elements in a
- // single input vector or of 2 input vectors.
- InstructionCost Cost =
- computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI);
- AdjustExtractsCost(Cost);
- if (NeedToShuffleReuses)
- Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
- FinalVecTy, E->ReuseShuffleIndices);
- return Cost;
- }
- }
- if (isSplat(VL)) {
- // Found the broadcasting of the single scalar, calculate the cost as the
- // broadcast.
- assert(VecTy == FinalVecTy &&
- "No reused scalars expected for broadcast.");
- const auto *It =
- find_if(VL, [](Value *V) { return !isa<UndefValue>(V); });
- // If all values are undefs - consider cost free.
- if (It == VL.end())
- return TTI::TCC_Free;
- // Add broadcast for non-identity shuffle only.
- bool NeedShuffle =
- VL.front() != *It || !all_of(VL.drop_front(), UndefValue::classof);
- InstructionCost InsertCost =
- TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
- /*Index=*/0, PoisonValue::get(VecTy), *It);
- return InsertCost + (NeedShuffle
- ? TTI->getShuffleCost(
- TargetTransformInfo::SK_Broadcast, VecTy,
- /*Mask=*/std::nullopt, CostKind,
- /*Index=*/0,
- /*SubTp=*/nullptr, /*Args=*/VL[0])
- : TTI::TCC_Free);
- }
- InstructionCost ReuseShuffleCost = 0;
- if (NeedToShuffleReuses)
- ReuseShuffleCost = TTI->getShuffleCost(
- TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices);
- // Improve gather cost for gather of loads, if we can group some of the
- // loads into vector loads.
- if (VL.size() > 2 && E->getOpcode() == Instruction::Load &&
- !E->isAltShuffle()) {
- BoUpSLP::ValueSet VectorizedLoads;
- unsigned StartIdx = 0;
- unsigned VF = VL.size() / 2;
- unsigned VectorizedCnt = 0;
- unsigned ScatterVectorizeCnt = 0;
- const unsigned Sz = DL->getTypeSizeInBits(E->getMainOp()->getType());
- for (unsigned MinVF = getMinVF(2 * Sz); VF >= MinVF; VF /= 2) {
- for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
- Cnt += VF) {
- ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
- if (!VectorizedLoads.count(Slice.front()) &&
- !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
- SmallVector<Value *> PointerOps;
- OrdersType CurrentOrder;
- LoadsState LS =
- canVectorizeLoads(Slice, Slice.front(), *TTI, *DL, *SE, *LI,
- *TLI, CurrentOrder, PointerOps);
- switch (LS) {
- case LoadsState::Vectorize:
- case LoadsState::ScatterVectorize:
- // Mark the vectorized loads so that we don't vectorize them
- // again.
- if (LS == LoadsState::Vectorize)
- ++VectorizedCnt;
- else
- ++ScatterVectorizeCnt;
- VectorizedLoads.insert(Slice.begin(), Slice.end());
- // If we vectorized initial block, no need to try to vectorize it
- // again.
- if (Cnt == StartIdx)
- StartIdx += VF;
- break;
- case LoadsState::Gather:
- break;
- }
+ // Restore the mask for previous partially matched values.
+ for (auto [I, V] : enumerate(E->Scalars)) {
+ if (isa<PoisonValue>(V)) {
+ Mask[I] = PoisonMaskElem;
+ continue;
}
+ if (Mask[I] == PoisonMaskElem)
+ Mask[I] = Entries.front()->findLaneForValue(V);
}
- // Check if the whole array was vectorized already - exit.
- if (StartIdx >= VL.size())
- break;
- // Found vectorizable parts - exit.
- if (!VectorizedLoads.empty())
- break;
+ Estimator.add(Entries.front(), Mask);
+ return Estimator.finalize(E->ReuseShuffleIndices);
}
- if (!VectorizedLoads.empty()) {
- InstructionCost GatherCost = 0;
- unsigned NumParts = TTI->getNumberOfParts(VecTy);
- bool NeedInsertSubvectorAnalysis =
- !NumParts || (VL.size() / VF) > NumParts;
- // Get the cost for gathered loads.
- for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
- if (VectorizedLoads.contains(VL[I]))
- continue;
- GatherCost += getGatherCost(VL.slice(I, VF));
- }
- // The cost for vectorized loads.
- InstructionCost ScalarsCost = 0;
- for (Value *V : VectorizedLoads) {
- auto *LI = cast<LoadInst>(V);
- ScalarsCost +=
- TTI->getMemoryOpCost(Instruction::Load, LI->getType(),
- LI->getAlign(), LI->getPointerAddressSpace(),
- CostKind, TTI::OperandValueInfo(), LI);
- }
- auto *LI = cast<LoadInst>(E->getMainOp());
- auto *LoadTy = FixedVectorType::get(LI->getType(), VF);
- Align Alignment = LI->getAlign();
- GatherCost +=
- VectorizedCnt *
- TTI->getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
- LI->getPointerAddressSpace(), CostKind,
- TTI::OperandValueInfo(), LI);
- GatherCost += ScatterVectorizeCnt *
- TTI->getGatherScatterOpCost(
- Instruction::Load, LoadTy, LI->getPointerOperand(),
- /*VariableMask=*/false, Alignment, CostKind, LI);
- if (NeedInsertSubvectorAnalysis) {
- // Add the cost for the subvectors insert.
- for (int I = VF, E = VL.size(); I < E; I += VF)
- GatherCost +=
- TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy,
- std::nullopt, CostKind, I, LoadTy);
- }
- return ReuseShuffleCost + GatherCost - ScalarsCost;
+ if (!Resized) {
+ unsigned VF1 = Entries.front()->getVectorFactor();
+ unsigned VF2 = Entries.back()->getVectorFactor();
+ if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF)
+ GatheredScalars.append(VF - GatheredScalars.size(),
+ PoisonValue::get(ScalarTy));
}
+ // Remove shuffled elements from list of gathers.
+ for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
+ if (Mask[I] != PoisonMaskElem)
+ GatheredScalars[I] = PoisonValue::get(ScalarTy);
+ }
+ LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
+ << " entries for bundle that starts with "
+ << *VL.front() << ".\n";);
+ if (Entries.size() == 1)
+ Estimator.add(Entries.front(), Mask);
+ else
+ Estimator.add(Entries.front(), Entries.back(), Mask);
+ if (all_of(GatheredScalars, PoisonValue::classof))
+ return Estimator.finalize(E->ReuseShuffleIndices);
+ return Estimator.finalize(
+ E->ReuseShuffleIndices, E->Scalars.size(),
+ [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
+ Vec = Estimator.gather(GatheredScalars,
+ Constant::getNullValue(FixedVectorType::get(
+ GatheredScalars.front()->getType(),
+ GatheredScalars.size())));
+ });
}
- return ReuseShuffleCost + getGatherCost(VL);
+ if (!all_of(GatheredScalars, PoisonValue::classof)) {
+ auto Gathers = ArrayRef(GatheredScalars).take_front(VL.size());
+ bool SameGathers = VL.equals(Gathers);
+ Value *BV = Estimator.gather(
+ Gathers, SameGathers ? nullptr
+ : Constant::getNullValue(FixedVectorType::get(
+ GatheredScalars.front()->getType(),
+ GatheredScalars.size())));
+ SmallVector<int> ReuseMask(Gathers.size(), PoisonMaskElem);
+ std::iota(ReuseMask.begin(), ReuseMask.end(), 0);
+ Estimator.add(BV, ReuseMask);
+ }
+ if (ExtractShuffle)
+ Estimator.add(E, std::nullopt);
+ return Estimator.finalize(E->ReuseShuffleIndices);
}
InstructionCost CommonCost = 0;
SmallVector<int> Mask;
@@ -6945,48 +7460,89 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
}
InstructionCost VecCost = VectorCost(CommonCost);
- LLVM_DEBUG(
- dumpTreeCosts(E, CommonCost, VecCost - CommonCost, ScalarCost));
- // Disable warnings for `this` and `E` are unused. Required for
- // `dumpTreeCosts`.
- (void)this;
- (void)E;
+ LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
+ ScalarCost, "Calculated costs for Tree"));
return VecCost - ScalarCost;
};
// Calculate cost difference from vectorizing set of GEPs.
// Negative value means vectorizing is profitable.
auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
- InstructionCost CostSavings = 0;
- for (Value *V : Ptrs) {
- if (V == BasePtr)
- continue;
- auto *Ptr = dyn_cast<GetElementPtrInst>(V);
- // GEPs may contain just addresses without instructions, considered free.
- // GEPs with all constant indices also considered to have zero cost.
- if (!Ptr || Ptr->hasAllConstantIndices())
- continue;
-
- // Here we differentiate two cases: when GEPs represent a regular
- // vectorization tree node (and hence vectorized) and when the set is
- // arguments of a set of loads or stores being vectorized. In the former
- // case all the scalar GEPs will be removed as a result of vectorization.
+ InstructionCost ScalarCost = 0;
+ InstructionCost VecCost = 0;
+ // Here we differentiate two cases: (1) when Ptrs represent a regular
+ // vectorization tree node (as they are pointer arguments of scattered
+ // loads) or (2) when Ptrs are the arguments of loads or stores being
+ // vectorized as a plain wide unit-stride load/store since all the
+ // loads/stores are known to be from/to adjacent locations.
+ assert(E->State == TreeEntry::Vectorize &&
+ "Entry state expected to be Vectorize here.");
+ if (isa<LoadInst, StoreInst>(VL0)) {
+ // Case 2: estimate costs for pointer related costs when vectorizing to
+ // a wide load/store.
+ // Scalar cost is estimated as a set of pointers with known relationship
+ // between them.
+ // For vector code we will use BasePtr as argument for the wide load/store
+ // but we also need to account all the instructions which are going to
+ // stay in vectorized code due to uses outside of these scalar
+ // loads/stores.
+ ScalarCost = TTI->getPointersChainCost(
+ Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
+ CostKind);
+
+ SmallVector<const Value *> PtrsRetainedInVecCode;
+ for (Value *V : Ptrs) {
+ if (V == BasePtr) {
+ PtrsRetainedInVecCode.push_back(V);
+ continue;
+ }
+ auto *Ptr = dyn_cast<GetElementPtrInst>(V);
+ // For simplicity, assume Ptr stays in vectorized code if it is not a
+ // GEP instruction. We don't care, since its cost is considered free.
+ // TODO: We should check for any uses outside of vectorizable tree
+ // rather than just single use.
+ if (!Ptr || !Ptr->hasOneUse())
+ PtrsRetainedInVecCode.push_back(V);
+ }
+
+ if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
+ // If all pointers stay in vectorized code then we don't have
+ // any savings on that.
+ LLVM_DEBUG(dumpTreeCosts(E, 0, ScalarCost, ScalarCost,
+ "Calculated GEPs cost for Tree"));
+ return InstructionCost{TTI::TCC_Free};
+ }
+ VecCost = TTI->getPointersChainCost(
+ PtrsRetainedInVecCode, BasePtr,
+ TTI::PointersChainInfo::getKnownStride(), VecTy, CostKind);
+ } else {
+ // Case 1: Ptrs are the arguments of loads that we are going to transform
+ // into a masked gather load intrinsic.
+ // All the scalar GEPs will be removed as a result of vectorization.
// For any external uses of some lanes, extract element instructions will
- // be generated (which cost is estimated separately). For the latter case
- // since the set of GEPs itself is not vectorized those used more than
- // once will remain staying in vectorized code as well. So we should not
- // count them as savings.
- if (!Ptr->hasOneUse() && isa<LoadInst, StoreInst>(VL0))
- continue;
-
- // TODO: it is target dependent, so need to implement and then use a TTI
- // interface.
- CostSavings += TTI->getArithmeticInstrCost(Instruction::Add,
- Ptr->getType(), CostKind);
- }
- LLVM_DEBUG(dbgs() << "SLP: Calculated GEPs cost savings or Tree:\n";
- E->dump());
- LLVM_DEBUG(dbgs() << "SLP: GEP cost saving = " << CostSavings << "\n");
- return InstructionCost() - CostSavings;
+ // be generated (whose cost is estimated separately).
+ TTI::PointersChainInfo PtrsInfo =
+ all_of(Ptrs,
+ [](const Value *V) {
+ auto *Ptr = dyn_cast<GetElementPtrInst>(V);
+ return Ptr && !Ptr->hasAllConstantIndices();
+ })
+ ? TTI::PointersChainInfo::getUnknownStride()
+ : TTI::PointersChainInfo::getKnownStride();
+
+ ScalarCost = TTI->getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy,
+ CostKind);
+ if (auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr)) {
+ SmallVector<const Value *> Indices(BaseGEP->indices());
+ VecCost = TTI->getGEPCost(BaseGEP->getSourceElementType(),
+ BaseGEP->getPointerOperand(), Indices, VecTy,
+ CostKind);
+ }
+ }
+
+ LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
+ "Calculated GEPs cost for Tree"));
+
+ return VecCost - ScalarCost;
};
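For the wide load/store case the new lambda only credits savings for addresses that actually disappear: the base pointer, non-GEP addresses, and GEPs with additional uses are assumed to survive vectorization, and if everything survives the whole GEP term is treated as free. A toy sketch of that filtering decision (the Ptr record is a stand-in for the real Value/GetElementPtrInst queries, not an LLVM type):

#include <vector>

struct Ptr {
  bool IsBase;    // the pointer chosen as BasePtr for the wide access
  bool IsGEP;     // models isa<GetElementPtrInst>
  bool HasOneUse; // models Value::hasOneUse
};

// Addresses that still have to be materialized even after the scalar
// loads/stores are folded into a single wide access.
unsigned countRetainedInVectorCode(const std::vector<Ptr> &Ptrs) {
  unsigned Retained = 0;
  for (const Ptr &P : Ptrs)
    if (P.IsBase || !P.IsGEP || !P.HasOneUse)
      ++Retained;
  return Retained;
}

// If countRetainedInVectorCode(Ptrs) == Ptrs.size(), vectorization saves
// nothing on address computation and the cost difference is reported as
// TCC_Free, exactly as in the branch above.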
switch (ShuffleOrOp) {
@@ -7062,7 +7618,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
- SmallVector<int> InsertMask(NumElts, UndefMaskElem);
+ SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
unsigned OffsetBeg = *getInsertIndex(VL.front());
unsigned OffsetEnd = OffsetBeg;
InsertMask[OffsetBeg] = 0;
@@ -7099,13 +7655,13 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
SmallVector<int> Mask;
if (!E->ReorderIndices.empty()) {
inversePermutation(E->ReorderIndices, Mask);
- Mask.append(InsertVecSz - Mask.size(), UndefMaskElem);
+ Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
} else {
- Mask.assign(VecSz, UndefMaskElem);
+ Mask.assign(VecSz, PoisonMaskElem);
std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
}
bool IsIdentity = true;
- SmallVector<int> PrevMask(InsertVecSz, UndefMaskElem);
+ SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
Mask.swap(PrevMask);
for (unsigned I = 0; I < NumScalars; ++I) {
unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]);
@@ -7148,14 +7704,14 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
InsertVecTy);
} else {
for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
- Mask[I] = InMask.test(I) ? UndefMaskElem : I;
+ Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
I <= End; ++I)
- if (Mask[I] != UndefMaskElem)
+ if (Mask[I] != PoisonMaskElem)
Mask[I] = I + VecSz;
for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
Mask[I] =
- ((I >= InMask.size()) || InMask.test(I)) ? UndefMaskElem : I;
+ ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
Cost += TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
}
}
@@ -7422,11 +7978,11 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
VecCost +=
TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
} else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
- VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
- Builder.getInt1Ty(),
+ auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
+ VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
CI0->getPredicate(), CostKind, VL0);
VecCost += TTI->getCmpSelInstrCost(
- E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
+ E->getOpcode(), VecTy, MaskTy,
cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
E->getAltOp());
} else {
@@ -7615,7 +8171,7 @@ InstructionCost BoUpSLP::getSpillCost() const {
unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
InstructionCost Cost = 0;
- SmallPtrSet<Instruction*, 4> LiveValues;
+ SmallPtrSet<Instruction *, 4> LiveValues;
Instruction *PrevInst = nullptr;
// The entries in VectorizableTree are not necessarily ordered by their
@@ -7626,6 +8182,8 @@ InstructionCost BoUpSLP::getSpillCost() const {
// are grouped together. Using dominance ensures a deterministic order.
SmallVector<Instruction *, 16> OrderedScalars;
for (const auto &TEPtr : VectorizableTree) {
+ if (TEPtr->State != TreeEntry::Vectorize)
+ continue;
Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
if (!Inst)
continue;
@@ -7639,7 +8197,7 @@ InstructionCost BoUpSLP::getSpillCost() const {
assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
"Different nodes should have different DFS numbers");
if (NodeA != NodeB)
- return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
+ return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
return B->comesBefore(A);
});
@@ -7698,7 +8256,7 @@ InstructionCost BoUpSLP::getSpillCost() const {
};
// Debug information does not impact spill cost.
- if (isa<CallInst>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
+ if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
&*PrevInstIt != PrevInst)
NumCalls++;
@@ -7706,7 +8264,7 @@ InstructionCost BoUpSLP::getSpillCost() const {
}
if (NumCalls) {
- SmallVector<Type*, 4> V;
+ SmallVector<Type *, 4> V;
for (auto *II : LiveValues) {
auto *ScalarTy = II->getType();
if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
@@ -7797,8 +8355,8 @@ static T *performExtractsShuffleAction(
ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
- if (Mask[Idx] == UndefMaskElem)
- Mask[Idx] = IsBasePoison.test(Idx) ? UndefMaskElem : Idx;
+ if (Mask[Idx] == PoisonMaskElem)
+ Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
else
Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
}
@@ -7827,8 +8385,8 @@ static T *performExtractsShuffleAction(
// can shuffle them directly.
ArrayRef<int> SecMask = VMIt->second;
for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
- if (SecMask[I] != UndefMaskElem) {
- assert(Mask[I] == UndefMaskElem && "Multiple uses of scalars.");
+ if (SecMask[I] != PoisonMaskElem) {
+ assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
Mask[I] = SecMask[I] + Vec1VF;
}
}
@@ -7841,12 +8399,12 @@ static T *performExtractsShuffleAction(
ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
ArrayRef<int> SecMask = VMIt->second;
for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
- if (Mask[I] != UndefMaskElem) {
- assert(SecMask[I] == UndefMaskElem && "Multiple uses of scalars.");
+ if (Mask[I] != PoisonMaskElem) {
+ assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
if (Res1.second)
Mask[I] = I;
- } else if (SecMask[I] != UndefMaskElem) {
- assert(Mask[I] == UndefMaskElem && "Multiple uses of scalars.");
+ } else if (SecMask[I] != PoisonMaskElem) {
+ assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
}
}
@@ -7863,11 +8421,11 @@ static T *performExtractsShuffleAction(
ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
ArrayRef<int> SecMask = VMIt->second;
for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
- if (SecMask[I] != UndefMaskElem) {
- assert((Mask[I] == UndefMaskElem || IsBaseNotUndef) &&
+ if (SecMask[I] != PoisonMaskElem) {
+ assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
"Multiple uses of scalars.");
Mask[I] = (Res.second ? I : SecMask[I]) + VF;
- } else if (Mask[I] != UndefMaskElem) {
+ } else if (Mask[I] != PoisonMaskElem) {
Mask[I] = I;
}
}
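These hunks rename UndefMaskElem to PoisonMaskElem; both denote the -1 sentinel in a two-source shuffle mask, where entries below VF select from the first source, entries in [VF, 2*VF) select from the second, and -1 leaves the lane poison. A minimal standalone sketch of that convention (applyShuffleMask and PoisonLane are illustrative names, not LLVM API):

#include <optional>
#include <vector>

constexpr int PoisonLane = -1;

// e.g. Mask {0, 5, -1, 2} over two 4-wide sources yields
// {Src1[0], Src2[1], <poison>, Src1[2]}.
std::vector<std::optional<int>>
applyShuffleMask(const std::vector<int> &Src1, const std::vector<int> &Src2,
                 const std::vector<int> &Mask) {
  const int VF = static_cast<int>(Src1.size());
  std::vector<std::optional<int>> Result;
  Result.reserve(Mask.size());
  for (int Idx : Mask) {
    if (Idx == PoisonLane)
      Result.push_back(std::nullopt);   // poison lane, no defined value
    else if (Idx < VF)
      Result.push_back(Src1[Idx]);      // lane taken from the first source
    else
      Result.push_back(Src2[Idx - VF]); // lane taken from the second source
  }
  return Result;
}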
@@ -7877,12 +8435,23 @@ static T *performExtractsShuffleAction(
}
InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
+ // Build a map for gathered scalars to the nodes where they are used.
+ ValueToGatherNodes.clear();
+ for (const std::unique_ptr<TreeEntry> &EntryPtr : VectorizableTree) {
+ if (EntryPtr->State != TreeEntry::NeedToGather)
+ continue;
+ for (Value *V : EntryPtr->Scalars)
+ if (!isConstant(V))
+ ValueToGatherNodes.try_emplace(V).first->getSecond().insert(
+ EntryPtr.get());
+ }
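getTreeCost now pre-populates a map from every non-constant gathered scalar to the gather nodes that contain it, replacing the per-query scan that a later hunk removes from isGatherShuffledEntry. A standalone sketch of that grouping step with ordinary standard containers (GatherNode and isConstantLike are placeholders, not LLVM types):

#include <set>
#include <string>
#include <unordered_map>
#include <vector>

struct GatherNode {
  int Id;
  std::vector<std::string> Scalars; // scalar values, identified by name
};

// Placeholder: treat names starting with '#' as constants.
bool isConstantLike(const std::string &V) { return !V.empty() && V[0] == '#'; }

std::unordered_map<std::string, std::set<const GatherNode *>>
buildValueToGatherNodes(const std::vector<GatherNode> &Nodes) {
  std::unordered_map<std::string, std::set<const GatherNode *>> Map;
  for (const GatherNode &N : Nodes)
    for (const std::string &V : N.Scalars)
      if (!isConstantLike(V))
        Map[V].insert(&N); // try_emplace + insert in the original
  return Map;
}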
InstructionCost Cost = 0;
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
<< VectorizableTree.size() << ".\n");
unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
+ SmallPtrSet<Value *, 4> CheckedExtracts;
for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
TreeEntry &TE = *VectorizableTree[I];
if (TE.State == TreeEntry::NeedToGather) {
@@ -7898,7 +8467,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
}
}
- InstructionCost C = getEntryCost(&TE, VectorizedVals);
+ InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
Cost += C;
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
<< " for bundle that starts with " << *TE.Scalars[0]
@@ -7951,7 +8520,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
(void)ShuffleMasks.emplace_back();
SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE];
if (Mask.empty())
- Mask.assign(FTy->getNumElements(), UndefMaskElem);
+ Mask.assign(FTy->getNumElements(), PoisonMaskElem);
// Find the insertvector, vectorized in tree, if any.
Value *Base = VU;
while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
@@ -7965,7 +8534,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
do {
IEBase = cast<InsertElementInst>(Base);
int Idx = *getInsertIndex(IEBase);
- assert(Mask[Idx] == UndefMaskElem &&
+ assert(Mask[Idx] == PoisonMaskElem &&
"InsertElementInstruction used already.");
Mask[Idx] = Idx;
Base = IEBase->getOperand(0);
@@ -7985,7 +8554,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
int InIdx = *InsertIdx;
SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE];
if (Mask.empty())
- Mask.assign(FTy->getNumElements(), UndefMaskElem);
+ Mask.assign(FTy->getNumElements(), PoisonMaskElem);
Mask[InIdx] = EU.Lane;
DemandedElts[VecId].setBit(InIdx);
continue;
@@ -8024,7 +8593,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
(all_of(Mask,
[VF](int Idx) { return Idx < 2 * static_cast<int>(VF); }) &&
!ShuffleVectorInst::isIdentityMask(Mask)))) {
- SmallVector<int> OrigMask(VecVF, UndefMaskElem);
+ SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
OrigMask.begin());
C = TTI->getShuffleCost(
@@ -8110,17 +8679,23 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
// No need to check for the topmost gather node.
if (TE == VectorizableTree.front().get())
return std::nullopt;
- Mask.assign(VL.size(), UndefMaskElem);
+ Mask.assign(VL.size(), PoisonMaskElem);
assert(TE->UserTreeIndices.size() == 1 &&
"Expected only single user of the gather node.");
// TODO: currently checking only for Scalars in the tree entry, need to count
// reused elements too for better cost estimation.
Instruction &UserInst =
getLastInstructionInBundle(TE->UserTreeIndices.front().UserTE);
- auto *PHI = dyn_cast<PHINode>(&UserInst);
- auto *NodeUI = DT->getNode(
- PHI ? PHI->getIncomingBlock(TE->UserTreeIndices.front().EdgeIdx)
- : UserInst.getParent());
+ BasicBlock *ParentBB = nullptr;
+ // Main node of PHI entries keeps the correct order of operands/incoming
+ // blocks.
+ if (auto *PHI =
+ dyn_cast<PHINode>(TE->UserTreeIndices.front().UserTE->getMainOp())) {
+ ParentBB = PHI->getIncomingBlock(TE->UserTreeIndices.front().EdgeIdx);
+ } else {
+ ParentBB = UserInst.getParent();
+ }
+ auto *NodeUI = DT->getNode(ParentBB);
assert(NodeUI && "Should only process reachable instructions");
SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
auto CheckOrdering = [&](Instruction *LastEI) {
@@ -8147,45 +8722,6 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
return false;
return true;
};
- // Build a lists of values to tree entries.
- DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>> ValueToTEs;
- for (const std::unique_ptr<TreeEntry> &EntryPtr : VectorizableTree) {
- if (EntryPtr.get() == TE)
- continue;
- if (EntryPtr->State != TreeEntry::NeedToGather)
- continue;
- if (!any_of(EntryPtr->Scalars, [&GatheredScalars](Value *V) {
- return GatheredScalars.contains(V);
- }))
- continue;
- assert(EntryPtr->UserTreeIndices.size() == 1 &&
- "Expected only single user of the gather node.");
- Instruction &EntryUserInst =
- getLastInstructionInBundle(EntryPtr->UserTreeIndices.front().UserTE);
- if (&UserInst == &EntryUserInst) {
- // If 2 gathers are operands of the same entry, compare operands indices,
- // use the earlier one as the base.
- if (TE->UserTreeIndices.front().UserTE ==
- EntryPtr->UserTreeIndices.front().UserTE &&
- TE->UserTreeIndices.front().EdgeIdx <
- EntryPtr->UserTreeIndices.front().EdgeIdx)
- continue;
- }
- // Check if the user node of the TE comes after user node of EntryPtr,
- // otherwise EntryPtr depends on TE.
- auto *EntryPHI = dyn_cast<PHINode>(&EntryUserInst);
- auto *EntryI =
- EntryPHI
- ? EntryPHI
- ->getIncomingBlock(EntryPtr->UserTreeIndices.front().EdgeIdx)
- ->getTerminator()
- : &EntryUserInst;
- if (!CheckOrdering(EntryI))
- continue;
- for (Value *V : EntryPtr->Scalars)
- if (!isConstant(V))
- ValueToTEs.try_emplace(V).first->getSecond().insert(EntryPtr.get());
- }
// Find all tree entries used by the gathered values. If no common entries
// found - not a shuffle.
// Here we build a set of tree nodes for each gathered value and trying to
@@ -8195,16 +8731,58 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
// have a permutation of 2 input vectors.
SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
DenseMap<Value *, int> UsedValuesEntry;
- for (Value *V : TE->Scalars) {
+ for (Value *V : VL) {
if (isConstant(V))
continue;
// Build a list of tree entries where V is used.
SmallPtrSet<const TreeEntry *, 4> VToTEs;
- auto It = ValueToTEs.find(V);
- if (It != ValueToTEs.end())
- VToTEs = It->second;
- if (const TreeEntry *VTE = getTreeEntry(V))
+ for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
+ if (TEPtr == TE)
+ continue;
+ assert(any_of(TEPtr->Scalars,
+ [&](Value *V) { return GatheredScalars.contains(V); }) &&
+ "Must contain at least single gathered value.");
+ assert(TEPtr->UserTreeIndices.size() == 1 &&
+ "Expected only single user of the gather node.");
+ PHINode *EntryPHI =
+ dyn_cast<PHINode>(TEPtr->UserTreeIndices.front().UserTE->getMainOp());
+ Instruction *EntryUserInst =
+ EntryPHI ? nullptr
+ : &getLastInstructionInBundle(
+ TEPtr->UserTreeIndices.front().UserTE);
+ if (&UserInst == EntryUserInst) {
+ assert(!EntryPHI && "Unexpected phi node entry.");
+ // If 2 gathers are operands of the same entry, compare operands
+ // indices, use the earlier one as the base.
+ if (TE->UserTreeIndices.front().UserTE ==
+ TEPtr->UserTreeIndices.front().UserTE &&
+ TE->UserTreeIndices.front().EdgeIdx <
+ TEPtr->UserTreeIndices.front().EdgeIdx)
+ continue;
+ }
+ // Check if the user node of the TE comes after user node of EntryPtr,
+ // otherwise EntryPtr depends on TE.
+ auto *EntryI =
+ EntryPHI
+ ? EntryPHI
+ ->getIncomingBlock(TEPtr->UserTreeIndices.front().EdgeIdx)
+ ->getTerminator()
+ : EntryUserInst;
+ if ((ParentBB != EntryI->getParent() ||
+ TE->UserTreeIndices.front().EdgeIdx <
+ TEPtr->UserTreeIndices.front().EdgeIdx ||
+ TE->UserTreeIndices.front().UserTE !=
+ TEPtr->UserTreeIndices.front().UserTE) &&
+ !CheckOrdering(EntryI))
+ continue;
+ VToTEs.insert(TEPtr);
+ }
+ if (const TreeEntry *VTE = getTreeEntry(V)) {
+ Instruction &EntryUserInst = getLastInstructionInBundle(VTE);
+ if (&EntryUserInst == &UserInst || !CheckOrdering(&EntryUserInst))
+ continue;
VToTEs.insert(VTE);
+ }
if (VToTEs.empty())
continue;
if (UsedTEs.empty()) {
@@ -8260,13 +8838,13 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
});
- if (It != FirstEntries.end()) {
+ if (It != FirstEntries.end() && (*It)->getVectorFactor() == VL.size()) {
Entries.push_back(*It);
std::iota(Mask.begin(), Mask.end(), 0);
// Clear undef scalars.
for (int I = 0, Sz = VL.size(); I < Sz; ++I)
- if (isa<PoisonValue>(TE->Scalars[I]))
- Mask[I] = UndefMaskElem;
+ if (isa<PoisonValue>(VL[I]))
+ Mask[I] = PoisonMaskElem;
return TargetTransformInfo::SK_PermuteSingleSrc;
}
// No perfect match, just shuffle, so choose the first tree node from the
@@ -8302,10 +8880,18 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
break;
}
}
- // No 2 source vectors with the same vector factor - give up and do regular
- // gather.
- if (Entries.empty())
- return std::nullopt;
+ // No 2 source vectors with the same vector factor - just choose 2 with max
+ // index.
+ if (Entries.empty()) {
+ Entries.push_back(
+ *std::max_element(UsedTEs.front().begin(), UsedTEs.front().end(),
+ [](const TreeEntry *TE1, const TreeEntry *TE2) {
+ return TE1->Idx < TE2->Idx;
+ }));
+ Entries.push_back(SecondEntries.front());
+ VF = std::max(Entries.front()->getVectorFactor(),
+ Entries.back()->getVectorFactor());
+ }
}
bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, UndefValue::classof);
@@ -8427,19 +9013,8 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
return std::nullopt;
}
-InstructionCost BoUpSLP::getGatherCost(FixedVectorType *Ty,
- const APInt &ShuffledIndices,
- bool NeedToShuffle) const {
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
- InstructionCost Cost =
- TTI->getScalarizationOverhead(Ty, ~ShuffledIndices, /*Insert*/ true,
- /*Extract*/ false, CostKind);
- if (NeedToShuffle)
- Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
- return Cost;
-}
-
-InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
+InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL,
+ bool ForPoisonSrc) const {
// Find the type of the operands in VL.
Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
@@ -8451,20 +9026,36 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
// shuffle candidates.
APInt ShuffledElements = APInt::getZero(VL.size());
DenseSet<Value *> UniqueElements;
- // Iterate in reverse order to consider insert elements with the high cost.
- for (unsigned I = VL.size(); I > 0; --I) {
- unsigned Idx = I - 1;
+ constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ InstructionCost Cost;
+ auto EstimateInsertCost = [&](unsigned I, Value *V) {
+ if (!ForPoisonSrc)
+ Cost +=
+ TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
+ I, Constant::getNullValue(VecTy), V);
+ };
+ for (unsigned I = 0, E = VL.size(); I < E; ++I) {
+ Value *V = VL[I];
// No need to shuffle duplicates for constants.
- if (isConstant(VL[Idx])) {
- ShuffledElements.setBit(Idx);
+ if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
+ ShuffledElements.setBit(I);
continue;
}
- if (!UniqueElements.insert(VL[Idx]).second) {
+ if (!UniqueElements.insert(V).second) {
DuplicateNonConst = true;
- ShuffledElements.setBit(Idx);
+ ShuffledElements.setBit(I);
+ continue;
}
+ EstimateInsertCost(I, V);
}
- return getGatherCost(VecTy, ShuffledElements, DuplicateNonConst);
+ if (ForPoisonSrc)
+ Cost =
+ TTI->getScalarizationOverhead(VecTy, ~ShuffledElements, /*Insert*/ true,
+ /*Extract*/ false, CostKind);
+ if (DuplicateNonConst)
+ Cost +=
+ TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+ return Cost;
}
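The reworked getGatherCost makes a single forward pass over the scalars: constants are folded into the base vector, repeated scalars are marked for a later permute, and each remaining unique scalar pays an insert cost. A standalone sketch of the shape of that walk with placeholder unit costs; the patch itself distinguishes a poison base from a pre-existing base and queries TargetTransformInfo for the real numbers:

#include <string>
#include <unordered_set>
#include <vector>

struct GatherCostModel {
  int InsertCost = 1;  // placeholder per-lane insertelement cost
  int ShuffleCost = 2; // placeholder single-source permute cost
};

int estimateGatherCost(const std::vector<std::string> &Scalars,
                       const GatherCostModel &M) {
  std::unordered_set<std::string> Unique;
  int Cost = 0;
  bool HasDuplicates = false;
  for (const std::string &V : Scalars) {
    bool IsConstant = !V.empty() && V[0] == '#'; // placeholder constant test
    if (IsConstant)
      continue;                 // constants land in the initial vector
    if (!Unique.insert(V).second) {
      HasDuplicates = true;     // repeated scalar, materialized via shuffle
      continue;
    }
    Cost += M.InsertCost;       // one insert per unique non-constant lane
  }
  if (HasDuplicates)
    Cost += M.ShuffleCost;      // one permute to replicate the duplicates
  return Cost;
}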
// Perform operand reordering on the instructions in VL and return the reordered
@@ -8483,6 +9074,9 @@ void BoUpSLP::reorderInputsAccordingToOpcode(
}
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
+ auto &Res = EntryToLastInstruction.FindAndConstruct(E);
+ if (Res.second)
+ return *Res.second;
// Get the basic block this bundle is in. All instructions in the bundle
// should be in this block (except for extractelement-like instructions with
// constant indeces).
@@ -8497,7 +9091,7 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
isVectorLikeInstWithConstOps(I);
}));
- auto &&FindLastInst = [E, Front, this, &BB]() {
+ auto FindLastInst = [&]() {
Instruction *LastInst = Front;
for (Value *V : E->Scalars) {
auto *I = dyn_cast<Instruction>(V);
@@ -8508,9 +9102,11 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
LastInst = I;
continue;
}
- assert(isVectorLikeInstWithConstOps(LastInst) &&
- isVectorLikeInstWithConstOps(I) &&
- "Expected vector-like insts only.");
+ assert(((E->getOpcode() == Instruction::GetElementPtr &&
+ !isa<GetElementPtrInst>(I)) ||
+ (isVectorLikeInstWithConstOps(LastInst) &&
+ isVectorLikeInstWithConstOps(I))) &&
+ "Expected vector-like or non-GEP in GEP node insts only.");
if (!DT->isReachableFromEntry(LastInst->getParent())) {
LastInst = I;
continue;
@@ -8531,7 +9127,7 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
return LastInst;
};
- auto &&FindFirstInst = [E, Front, this]() {
+ auto FindFirstInst = [&]() {
Instruction *FirstInst = Front;
for (Value *V : E->Scalars) {
auto *I = dyn_cast<Instruction>(V);
@@ -8542,9 +9138,11 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
FirstInst = I;
continue;
}
- assert(isVectorLikeInstWithConstOps(FirstInst) &&
- isVectorLikeInstWithConstOps(I) &&
- "Expected vector-like insts only.");
+ assert(((E->getOpcode() == Instruction::GetElementPtr &&
+ !isa<GetElementPtrInst>(I)) ||
+ (isVectorLikeInstWithConstOps(FirstInst) &&
+ isVectorLikeInstWithConstOps(I))) &&
+ "Expected vector-like or non-GEP in GEP node insts only.");
if (!DT->isReachableFromEntry(FirstInst->getParent())) {
FirstInst = I;
continue;
@@ -8566,22 +9164,23 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
// Set the insert point to the beginning of the basic block if the entry
// should not be scheduled.
- if (E->State != TreeEntry::NeedToGather &&
- (doesNotNeedToSchedule(E->Scalars) ||
+ if (doesNotNeedToSchedule(E->Scalars) ||
+ (E->State != TreeEntry::NeedToGather &&
all_of(E->Scalars, isVectorLikeInstWithConstOps))) {
- Instruction *InsertInst;
- if (all_of(E->Scalars, [](Value *V) {
+ if ((E->getOpcode() == Instruction::GetElementPtr &&
+ any_of(E->Scalars,
+ [](Value *V) {
+ return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
+ })) ||
+ all_of(E->Scalars, [](Value *V) {
return !isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V);
}))
- InsertInst = FindLastInst();
+ Res.second = FindLastInst();
else
- InsertInst = FindFirstInst();
- return *InsertInst;
+ Res.second = FindFirstInst();
+ return *Res.second;
}
- // The last instruction in the bundle in program order.
- Instruction *LastInst = nullptr;
-
// Find the last instruction. The common case should be that BB has been
// scheduled, and the last instruction is VL.back(). So we start with
// VL.back() and iterate over schedule data until we reach the end of the
@@ -8594,7 +9193,7 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
if (Bundle && Bundle->isPartOfBundle())
for (; Bundle; Bundle = Bundle->NextInBundle)
if (Bundle->OpValue == Bundle->Inst)
- LastInst = Bundle->Inst;
+ Res.second = Bundle->Inst;
}
// LastInst can still be null at this point if there's either not an entry
@@ -8615,15 +9214,15 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
// not ideal. However, this should be exceedingly rare since it requires that
// we both exit early from buildTree_rec and that the bundle be out-of-order
// (causing us to iterate all the way to the end of the block).
- if (!LastInst)
- LastInst = FindLastInst();
- assert(LastInst && "Failed to find last instruction in bundle");
- return *LastInst;
+ if (!Res.second)
+ Res.second = FindLastInst();
+ assert(Res.second && "Failed to find last instruction in bundle");
+ return *Res.second;
}
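getLastInstructionInBundle is now memoized per tree entry: the cache slot is created (or found) up front and repeat queries return early. A standalone sketch of that pattern with an unordered_map standing in for the EntryToLastInstruction DenseMap (Entry and Instr are placeholder types):

#include <unordered_map>
#include <vector>

struct Instr { int Pos; };                      // stands in for Instruction
struct Entry { std::vector<Instr *> Scalars; }; // stands in for TreeEntry

Instr *computeLastInstruction(const Entry *E) { // the expensive scan
  Instr *Last = nullptr;
  for (Instr *I : E->Scalars)
    if (!Last || I->Pos > Last->Pos)
      Last = I;
  return Last;
}

Instr *getLastInstruction(const Entry *E,
                          std::unordered_map<const Entry *, Instr *> &Cache) {
  Instr *&Slot = Cache[E]; // insert-or-find, like FindAndConstruct above
  if (Slot)
    return Slot;           // already computed for this entry
  Slot = computeLastInstruction(E);
  return Slot;
}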
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
auto *Front = E->getMainOp();
- Instruction *LastInst = EntryToLastInstruction.lookup(E);
+ Instruction *LastInst = &getLastInstructionInBundle(E);
assert(LastInst && "Failed to find last instruction in bundle");
// If the instruction is PHI, set the insert point after all the PHIs.
bool IsPHI = isa<PHINode>(LastInst);
@@ -8641,7 +9240,7 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
-Value *BoUpSLP::gather(ArrayRef<Value *> VL) {
+Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) {
// List of instructions/lanes from current block and/or the blocks which are
// part of the current loop. These instructions will be inserted at the end to
// make it possible to optimize loops and hoist invariant instructions out of
@@ -8658,7 +9257,8 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL) {
for (int I = 0, E = VL.size(); I < E; ++I) {
if (auto *Inst = dyn_cast<Instruction>(VL[I]))
if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
- getTreeEntry(Inst) || (L && (L->contains(Inst)))) &&
+ getTreeEntry(Inst) ||
+ (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
PostponedIndices.insert(I).second)
PostponedInsts.emplace_back(Inst, I);
}
@@ -8681,7 +9281,7 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL) {
Value *Val0 =
isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0];
FixedVectorType *VecTy = FixedVectorType::get(Val0->getType(), VL.size());
- Value *Vec = PoisonValue::get(VecTy);
+ Value *Vec = Root ? Root : PoisonValue::get(VecTy);
SmallVector<int> NonConsts;
// Insert constant values at first.
for (int I = 0, E = VL.size(); I < E; ++I) {
@@ -8691,6 +9291,18 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL) {
NonConsts.push_back(I);
continue;
}
+ if (Root) {
+ if (!isa<UndefValue>(VL[I])) {
+ NonConsts.push_back(I);
+ continue;
+ }
+ if (isa<PoisonValue>(VL[I]))
+ continue;
+ if (auto *SV = dyn_cast<ShuffleVectorInst>(Root)) {
+ if (SV->getMaskValue(I) == PoisonMaskElem)
+ continue;
+ }
+ }
Vec = CreateInsertElement(Vec, VL[I], I);
}
// Insert non-constant values.
@@ -8789,6 +9401,10 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
}
return Vec;
}
+ Value *createIdentity(Value *V) { return V; }
+ Value *createPoison(Type *Ty, unsigned VF) {
+ return PoisonValue::get(FixedVectorType::get(Ty, VF));
+ }
/// Resizes 2 input vector to match the sizes, if the they are not equal
/// yet. The smallest vector is resized to the size of the larger vector.
void resizeToMatch(Value *&V1, Value *&V2) {
@@ -8798,7 +9414,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
int VF = std::max(V1VF, V2VF);
int MinVF = std::min(V1VF, V2VF);
- SmallVector<int> IdentityMask(VF, UndefMaskElem);
+ SmallVector<int> IdentityMask(VF, PoisonMaskElem);
std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
0);
Value *&Op = MinVF == V1VF ? V1 : V2;
@@ -8821,7 +9437,8 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
assert(V1 && "Expected at least one vector value.");
ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
R.CSEBlocks);
- return BaseShuffleAnalysis::createShuffle(V1, V2, Mask, ShuffleBuilder);
+ return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
+ ShuffleBuilder);
}
/// Transforms mask \p CommonMask per given \p Mask to make proper set after
@@ -8829,7 +9446,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
ArrayRef<int> Mask) {
for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
- if (Mask[Idx] != UndefMaskElem)
+ if (Mask[Idx] != PoisonMaskElem)
CommonMask[Idx] = Idx;
}
@@ -8837,6 +9454,39 @@ public:
ShuffleInstructionBuilder(IRBuilderBase &Builder, BoUpSLP &R)
: Builder(Builder), R(R) {}
+ /// Adjusts extractelements after reusing them.
+ Value *adjustExtracts(const TreeEntry *E, ArrayRef<int> Mask) {
+ Value *VecBase = nullptr;
+ for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
+ int Idx = Mask[I];
+ if (Idx == PoisonMaskElem)
+ continue;
+ auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
+ VecBase = EI->getVectorOperand();
+ // If the only one use is vectorized - can delete the extractelement
+ // itself.
+ if (!EI->hasOneUse() || any_of(EI->users(), [&](User *U) {
+ return !R.ScalarToTreeEntry.count(U);
+ }))
+ continue;
+ R.eraseInstruction(EI);
+ }
+ return VecBase;
+ }
+ /// Checks if the specified entry \p E needs to be delayed because of its
+ /// dependency nodes.
+ Value *needToDelay(const TreeEntry *E, ArrayRef<const TreeEntry *> Deps) {
+ // No need to delay emission if all deps are ready.
+ if (all_of(Deps, [](const TreeEntry *TE) { return TE->VectorizedValue; }))
+ return nullptr;
+ // Postpone gather emission, will be emitted after the end of the
+ // process to keep correct order.
+ auto *VecTy = FixedVectorType::get(E->Scalars.front()->getType(),
+ E->getVectorFactor());
+ return Builder.CreateAlignedLoad(
+ VecTy, PoisonValue::get(PointerType::getUnqual(VecTy->getContext())),
+ MaybeAlign());
+ }
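needToDelay lets the builder postpone a gather whose dependency nodes have no vectorized value yet: a stub is returned now and the real value is emitted once the dependencies are ready. A standalone sketch of that defer-and-revisit flow (Node, Emit, and the placeholder string are illustrative, not the actual stub load the patch emits):

#include <functional>
#include <string>
#include <vector>

struct Node {
  std::vector<Node *> Deps;
  std::string Value; // empty until "vectorized"
};

void processAll(std::vector<Node *> &Nodes,
                const std::function<std::string(Node *)> &Emit) {
  std::vector<Node *> Postponed;
  auto Ready = [](const Node *N) {
    for (const Node *D : N->Deps)
      if (D->Value.empty())
        return false;
    return true;
  };
  for (Node *N : Nodes) {
    if (!Ready(N)) {
      N->Value = "<placeholder>"; // stub result, patched later
      Postponed.push_back(N);
      continue;
    }
    N->Value = Emit(N);
  }
  for (Node *N : Postponed)
    N->Value = Emit(N);           // real emission once deps are available
}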
/// Adds 2 input vectors and the mask for their shuffling.
void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
@@ -8849,15 +9499,15 @@ public:
Value *Vec = InVectors.front();
if (InVectors.size() == 2) {
Vec = createShuffle(Vec, InVectors.back(), CommonMask);
- transformMaskAfterShuffle(CommonMask, Mask);
+ transformMaskAfterShuffle(CommonMask, CommonMask);
} else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
Mask.size()) {
Vec = createShuffle(Vec, nullptr, CommonMask);
- transformMaskAfterShuffle(CommonMask, Mask);
+ transformMaskAfterShuffle(CommonMask, CommonMask);
}
V1 = createShuffle(V1, V2, Mask);
for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
- if (Mask[Idx] != UndefMaskElem)
+ if (Mask[Idx] != PoisonMaskElem)
CommonMask[Idx] = Idx + Sz;
InVectors.front() = Vec;
if (InVectors.size() == 2)
@@ -8870,7 +9520,7 @@ public:
if (InVectors.empty()) {
if (!isa<FixedVectorType>(V1->getType())) {
V1 = createShuffle(V1, nullptr, CommonMask);
- CommonMask.assign(Mask.size(), UndefMaskElem);
+ CommonMask.assign(Mask.size(), PoisonMaskElem);
transformMaskAfterShuffle(CommonMask, Mask);
}
InVectors.push_back(V1);
@@ -8892,7 +9542,7 @@ public:
transformMaskAfterShuffle(CommonMask, CommonMask);
}
for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
- if (CommonMask[Idx] == UndefMaskElem && Mask[Idx] != UndefMaskElem)
+ if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
CommonMask[Idx] =
V->getType() != V1->getType()
? Idx + Sz
@@ -8910,7 +9560,7 @@ public:
// Check if second vector is required if the used elements are already
// used from the first one.
for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
- if (Mask[Idx] != UndefMaskElem && CommonMask[Idx] == UndefMaskElem) {
+ if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
InVectors.push_back(V1);
break;
}
@@ -8919,7 +9569,7 @@ public:
if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
VF = FTy->getNumElements();
for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
- if (Mask[Idx] != UndefMaskElem && CommonMask[Idx] == UndefMaskElem)
+ if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
}
/// Adds another one input vector and the mask for the shuffling.
@@ -8928,17 +9578,46 @@ public:
inversePermutation(Order, NewMask);
add(V1, NewMask);
}
+ Value *gather(ArrayRef<Value *> VL, Value *Root = nullptr) {
+ return R.gather(VL, Root);
+ }
+ Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
/// Finalize emission of the shuffles.
+ /// \param Action the action (if any) to be performed before final applying of
+ /// the \p ExtMask mask.
Value *
- finalize(ArrayRef<int> ExtMask = std::nullopt) {
+ finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
+ function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
IsFinalized = true;
+ if (Action) {
+ Value *Vec = InVectors.front();
+ if (InVectors.size() == 2) {
+ Vec = createShuffle(Vec, InVectors.back(), CommonMask);
+ InVectors.pop_back();
+ } else {
+ Vec = createShuffle(Vec, nullptr, CommonMask);
+ }
+ for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
+ if (CommonMask[Idx] != PoisonMaskElem)
+ CommonMask[Idx] = Idx;
+ assert(VF > 0 &&
+ "Expected vector length for the final value before action.");
+ unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
+ if (VecVF < VF) {
+ SmallVector<int> ResizeMask(VF, PoisonMaskElem);
+ std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
+ Vec = createShuffle(Vec, nullptr, ResizeMask);
+ }
+ Action(Vec, CommonMask);
+ InVectors.front() = Vec;
+ }
if (!ExtMask.empty()) {
if (CommonMask.empty()) {
CommonMask.assign(ExtMask.begin(), ExtMask.end());
} else {
- SmallVector<int> NewMask(ExtMask.size(), UndefMaskElem);
+ SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
- if (ExtMask[I] == UndefMaskElem)
+ if (ExtMask[I] == PoisonMaskElem)
continue;
NewMask[I] = CommonMask[ExtMask[I]];
}
@@ -9009,18 +9688,18 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
// ... (use %2)
// %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
// br %block
- SmallVector<int> UniqueIdxs(VF, UndefMaskElem);
+ SmallVector<int> UniqueIdxs(VF, PoisonMaskElem);
SmallSet<int, 4> UsedIdxs;
int Pos = 0;
for (int Idx : VE->ReuseShuffleIndices) {
- if (Idx != static_cast<int>(VF) && Idx != UndefMaskElem &&
+ if (Idx != static_cast<int>(VF) && Idx != PoisonMaskElem &&
UsedIdxs.insert(Idx).second)
UniqueIdxs[Idx] = Pos;
++Pos;
}
assert(VF >= UsedIdxs.size() && "Expected vectorization factor "
"less than original vector size.");
- UniqueIdxs.append(VF - UsedIdxs.size(), UndefMaskElem);
+ UniqueIdxs.append(VF - UsedIdxs.size(), PoisonMaskElem);
V = FinalShuffle(V, UniqueIdxs);
} else {
assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
@@ -9031,6 +9710,21 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
V = FinalShuffle(V, UniformMask);
}
}
+ // Need to update the operand gather node, if actually the operand is not a
+ // vectorized node, but the buildvector/gather node, which matches one of
+ // the vectorized nodes.
+ if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
+ return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
+ }) == VE->UserTreeIndices.end()) {
+ auto *It = find_if(
+ VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
+ return TE->State == TreeEntry::NeedToGather &&
+ TE->UserTreeIndices.front().UserTE == E &&
+ TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
+ });
+ assert(It != VectorizableTree.end() && "Expected gather node operand.");
+ (*It)->VectorizedValue = V;
+ }
return V;
}
}
@@ -9049,108 +9743,370 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
IRBuilder<>::InsertPointGuard Guard(Builder);
if (E->getOpcode() != Instruction::InsertElement &&
E->getOpcode() != Instruction::PHI) {
- Instruction *LastInst = EntryToLastInstruction.lookup(E);
+ Instruction *LastInst = &getLastInstructionInBundle(E);
assert(LastInst && "Failed to find last instruction in bundle");
Builder.SetInsertPoint(LastInst);
}
return vectorizeTree(I->get());
}
-Value *BoUpSLP::createBuildVector(const TreeEntry *E) {
+template <typename BVTy, typename ResTy, typename... Args>
+ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
assert(E->State == TreeEntry::NeedToGather && "Expected gather node.");
unsigned VF = E->getVectorFactor();
- ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
- SmallVector<Value *> Gathered(
- VF, PoisonValue::get(E->Scalars.front()->getType()));
bool NeedFreeze = false;
- SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
- // Build a mask out of the redorder indices and reorder scalars per this mask.
+ SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
+ E->ReuseShuffleIndices.end());
+ SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
+ // Build a mask out of the reorder indices and reorder scalars per this
+ // mask.
SmallVector<int> ReorderMask;
inversePermutation(E->ReorderIndices, ReorderMask);
if (!ReorderMask.empty())
- reorderScalars(VL, ReorderMask);
- SmallVector<int> ReuseMask(VF, UndefMaskElem);
- if (!allConstant(VL)) {
+ reorderScalars(GatheredScalars, ReorderMask);
+ auto FindReusedSplat = [&](SmallVectorImpl<int> &Mask) {
+ if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
+ return isa<UndefValue>(V) && !isa<PoisonValue>(V);
+ }))
+ return false;
+ TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
+ unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
+ if (UserTE->getNumOperands() != 2)
+ return false;
+ auto *It =
+ find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
+ return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
+ return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
+ }) != TE->UserTreeIndices.end();
+ });
+ if (It == VectorizableTree.end())
+ return false;
+ unsigned I =
+ *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
+ int Sz = Mask.size();
+ if (all_of(Mask, [Sz](int Idx) { return Idx < 2 * Sz; }) &&
+ ShuffleVectorInst::isIdentityMask(Mask))
+ std::iota(Mask.begin(), Mask.end(), 0);
+ else
+ std::fill(Mask.begin(), Mask.end(), I);
+ return true;
+ };
+ BVTy ShuffleBuilder(Params...);
+ ResTy Res = ResTy();
+ SmallVector<int> Mask;
+ SmallVector<int> ExtractMask;
+ std::optional<TargetTransformInfo::ShuffleKind> ExtractShuffle;
+ std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle;
+ SmallVector<const TreeEntry *> Entries;
+ Type *ScalarTy = GatheredScalars.front()->getType();
+ if (!all_of(GatheredScalars, UndefValue::classof)) {
+ // Check for gathered extracts.
+ ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask);
+ SmallVector<Value *> IgnoredVals;
+ if (UserIgnoreList)
+ IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end());
+ bool Resized = false;
+ if (Value *VecBase = ShuffleBuilder.adjustExtracts(E, ExtractMask))
+ if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
+ if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) {
+ Resized = true;
+ GatheredScalars.append(VF - GatheredScalars.size(),
+ PoisonValue::get(ScalarTy));
+ }
+ // Gather extracts after we check for full matched gathers only.
+ if (ExtractShuffle || E->getOpcode() != Instruction::Load ||
+ E->isAltShuffle() ||
+ all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
+ isSplat(E->Scalars) ||
+ (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
+ GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries);
+ }
+ if (GatherShuffle) {
+ if (Value *Delayed = ShuffleBuilder.needToDelay(E, Entries)) {
+ // Delay emission of gathers which are not ready yet.
+ PostponedGathers.insert(E);
+ // Postpone gather emission, will be emitted after the end of the
+ // process to keep correct order.
+ return Delayed;
+ }
+ assert((Entries.size() == 1 || Entries.size() == 2) &&
+ "Expected shuffle of 1 or 2 entries.");
+ if (*GatherShuffle == TTI::SK_PermuteSingleSrc &&
+ Entries.front()->isSame(E->Scalars)) {
+ // Perfect match in the graph, will reuse the previously vectorized
+ // node. Cost is 0.
+ LLVM_DEBUG(
+ dbgs()
+ << "SLP: perfect diamond match for gather bundle that starts with "
+ << *E->Scalars.front() << ".\n");
+ // Restore the mask for previous partially matched values.
+ if (Entries.front()->ReorderIndices.empty() &&
+ ((Entries.front()->ReuseShuffleIndices.empty() &&
+ E->Scalars.size() == Entries.front()->Scalars.size()) ||
+ (E->Scalars.size() ==
+ Entries.front()->ReuseShuffleIndices.size()))) {
+ std::iota(Mask.begin(), Mask.end(), 0);
+ } else {
+ for (auto [I, V] : enumerate(E->Scalars)) {
+ if (isa<PoisonValue>(V)) {
+ Mask[I] = PoisonMaskElem;
+ continue;
+ }
+ Mask[I] = Entries.front()->findLaneForValue(V);
+ }
+ }
+ ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask);
+ Res = ShuffleBuilder.finalize(E->getCommonMask());
+ return Res;
+ }
+ if (!Resized) {
+ unsigned VF1 = Entries.front()->getVectorFactor();
+ unsigned VF2 = Entries.back()->getVectorFactor();
+ if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF)
+ GatheredScalars.append(VF - GatheredScalars.size(),
+ PoisonValue::get(ScalarTy));
+ }
+ // Remove shuffled elements from list of gathers.
+ for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
+ if (Mask[I] != PoisonMaskElem)
+ GatheredScalars[I] = PoisonValue::get(ScalarTy);
+ }
+ }
+ }
+ auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
+ SmallVectorImpl<int> &ReuseMask,
+ bool IsRootPoison) {
// For splats with can emit broadcasts instead of gathers, so try to find
// such sequences.
- bool IsSplat = isSplat(VL) && (VL.size() > 2 || VL.front() == VL.back());
+ bool IsSplat = IsRootPoison && isSplat(Scalars) &&
+ (Scalars.size() > 2 || Scalars.front() == Scalars.back());
+ Scalars.append(VF - Scalars.size(), PoisonValue::get(ScalarTy));
SmallVector<int> UndefPos;
DenseMap<Value *, unsigned> UniquePositions;
// Gather unique non-const values and all constant values.
// For repeated values, just shuffle them.
- for (auto [I, V] : enumerate(VL)) {
+ int NumNonConsts = 0;
+ int SinglePos = 0;
+ for (auto [I, V] : enumerate(Scalars)) {
if (isa<UndefValue>(V)) {
if (!isa<PoisonValue>(V)) {
- Gathered[I] = V;
ReuseMask[I] = I;
UndefPos.push_back(I);
}
continue;
}
if (isConstant(V)) {
- Gathered[I] = V;
ReuseMask[I] = I;
continue;
}
+ ++NumNonConsts;
+ SinglePos = I;
+ Value *OrigV = V;
+ Scalars[I] = PoisonValue::get(ScalarTy);
if (IsSplat) {
- Gathered.front() = V;
+ Scalars.front() = OrigV;
ReuseMask[I] = 0;
} else {
- const auto Res = UniquePositions.try_emplace(V, I);
- Gathered[Res.first->second] = V;
+ const auto Res = UniquePositions.try_emplace(OrigV, I);
+ Scalars[Res.first->second] = OrigV;
ReuseMask[I] = Res.first->second;
}
}
- if (!UndefPos.empty() && IsSplat) {
+ if (NumNonConsts == 1) {
+ // Restore single insert element.
+ if (IsSplat) {
+ ReuseMask.assign(VF, PoisonMaskElem);
+ std::swap(Scalars.front(), Scalars[SinglePos]);
+ if (!UndefPos.empty() && UndefPos.front() == 0)
+ Scalars.front() = UndefValue::get(ScalarTy);
+ }
+ ReuseMask[SinglePos] = SinglePos;
+ } else if (!UndefPos.empty() && IsSplat) {
// For undef values, try to replace them with the simple broadcast.
// We can do it if the broadcasted value is guaranteed to be
// non-poisonous, or by freezing the incoming scalar value first.
- auto *It = find_if(Gathered, [this, E](Value *V) {
+ auto *It = find_if(Scalars, [this, E](Value *V) {
return !isa<UndefValue>(V) &&
(getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
- any_of(V->uses(), [E](const Use &U) {
- // Check if the value already used in the same operation in
- // one of the nodes already.
- return E->UserTreeIndices.size() == 1 &&
- is_contained(
- E->UserTreeIndices.front().UserTE->Scalars,
- U.getUser()) &&
- E->UserTreeIndices.front().EdgeIdx != U.getOperandNo();
- }));
+ (E->UserTreeIndices.size() == 1 &&
+ any_of(V->uses(), [E](const Use &U) {
+ // Check if the value already used in the same operation in
+ // one of the nodes already.
+ return E->UserTreeIndices.front().EdgeIdx !=
+ U.getOperandNo() &&
+ is_contained(
+ E->UserTreeIndices.front().UserTE->Scalars,
+ U.getUser());
+ })));
});
- if (It != Gathered.end()) {
+ if (It != Scalars.end()) {
// Replace undefs by the non-poisoned scalars and emit broadcast.
- int Pos = std::distance(Gathered.begin(), It);
+ int Pos = std::distance(Scalars.begin(), It);
for_each(UndefPos, [&](int I) {
// Set the undef position to the non-poisoned scalar.
ReuseMask[I] = Pos;
- // Replace the undef by the poison, in the mask it is replaced by non-poisoned scalar already.
+ // Replace the undef by the poison, in the mask it is replaced by
+ // non-poisoned scalar already.
if (I != Pos)
- Gathered[I] = PoisonValue::get(Gathered[I]->getType());
+ Scalars[I] = PoisonValue::get(ScalarTy);
});
} else {
// Replace undefs by the poisons, emit broadcast and then emit
// freeze.
for_each(UndefPos, [&](int I) {
- ReuseMask[I] = UndefMaskElem;
- if (isa<UndefValue>(Gathered[I]))
- Gathered[I] = PoisonValue::get(Gathered[I]->getType());
+ ReuseMask[I] = PoisonMaskElem;
+ if (isa<UndefValue>(Scalars[I]))
+ Scalars[I] = PoisonValue::get(ScalarTy);
});
NeedFreeze = true;
}
}
+ };
+ if (ExtractShuffle || GatherShuffle) {
+ bool IsNonPoisoned = true;
+ bool IsUsedInExpr = false;
+ Value *Vec1 = nullptr;
+ if (ExtractShuffle) {
+ // Gather of extractelements can be represented as just a shuffle of
+ // a single/two vectors the scalars are extracted from.
+ // Find input vectors.
+ Value *Vec2 = nullptr;
+ for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
+ if (ExtractMask[I] == PoisonMaskElem ||
+ (!Mask.empty() && Mask[I] != PoisonMaskElem)) {
+ ExtractMask[I] = PoisonMaskElem;
+ continue;
+ }
+ if (isa<UndefValue>(E->Scalars[I]))
+ continue;
+ auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
+ if (!Vec1) {
+ Vec1 = EI->getVectorOperand();
+ } else if (Vec1 != EI->getVectorOperand()) {
+ assert((!Vec2 || Vec2 == EI->getVectorOperand()) &&
+ "Expected only 1 or 2 vectors shuffle.");
+ Vec2 = EI->getVectorOperand();
+ }
+ }
+ if (Vec2) {
+ IsNonPoisoned &=
+ isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2);
+ ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
+ } else if (Vec1) {
+ IsUsedInExpr = FindReusedSplat(ExtractMask);
+ ShuffleBuilder.add(Vec1, ExtractMask);
+ IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
+ } else {
+ ShuffleBuilder.add(PoisonValue::get(FixedVectorType::get(
+ ScalarTy, GatheredScalars.size())),
+ ExtractMask);
+ }
+ }
+ if (GatherShuffle) {
+ if (Entries.size() == 1) {
+ IsUsedInExpr = FindReusedSplat(Mask);
+ ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask);
+ IsNonPoisoned &=
+ isGuaranteedNotToBePoison(Entries.front()->VectorizedValue);
+ } else {
+ ShuffleBuilder.add(Entries.front()->VectorizedValue,
+ Entries.back()->VectorizedValue, Mask);
+ IsNonPoisoned &=
+ isGuaranteedNotToBePoison(Entries.front()->VectorizedValue) &&
+ isGuaranteedNotToBePoison(Entries.back()->VectorizedValue);
+ }
+ }
+ // Try to figure out best way to combine values: build a shuffle and insert
+ // elements or just build several shuffles.
+ // Insert non-constant scalars.
+ SmallVector<Value *> NonConstants(GatheredScalars);
+ int EMSz = ExtractMask.size();
+ int MSz = Mask.size();
+ // Try to build constant vector and shuffle with it only if currently we
+ // have a single permutation and more than 1 scalar constants.
+ bool IsSingleShuffle = !ExtractShuffle || !GatherShuffle;
+ bool IsIdentityShuffle =
+ (ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc) ==
+ TTI::SK_PermuteSingleSrc &&
+ none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
+ ShuffleVectorInst::isIdentityMask(ExtractMask)) ||
+ (GatherShuffle.value_or(TTI::SK_PermuteTwoSrc) ==
+ TTI::SK_PermuteSingleSrc &&
+ none_of(Mask, [&](int I) { return I >= MSz; }) &&
+ ShuffleVectorInst::isIdentityMask(Mask));
+ bool EnoughConstsForShuffle =
+ IsSingleShuffle &&
+ (none_of(GatheredScalars,
+ [](Value *V) {
+ return isa<UndefValue>(V) && !isa<PoisonValue>(V);
+ }) ||
+ any_of(GatheredScalars,
+ [](Value *V) {
+ return isa<Constant>(V) && !isa<UndefValue>(V);
+ })) &&
+ (!IsIdentityShuffle ||
+ (GatheredScalars.size() == 2 &&
+ any_of(GatheredScalars,
+ [](Value *V) { return !isa<UndefValue>(V); })) ||
+ count_if(GatheredScalars, [](Value *V) {
+ return isa<Constant>(V) && !isa<PoisonValue>(V);
+ }) > 1);
+ // NonConstants array contains just non-constant values, GatheredScalars
+ // contains only constant to build final vector and then shuffle.
+ for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
+ if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
+ NonConstants[I] = PoisonValue::get(ScalarTy);
+ else
+ GatheredScalars[I] = PoisonValue::get(ScalarTy);
+ }
+ // Generate constants for final shuffle and build a mask for them.
+ if (!all_of(GatheredScalars, PoisonValue::classof)) {
+ SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
+ TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
+ Value *BV = ShuffleBuilder.gather(GatheredScalars);
+ ShuffleBuilder.add(BV, BVMask);
+ }
+ if (all_of(NonConstants, [=](Value *V) {
+ return isa<PoisonValue>(V) ||
+ (IsSingleShuffle && ((IsIdentityShuffle &&
+ IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
+ }))
+ Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
+ else
+ Res = ShuffleBuilder.finalize(
+ E->ReuseShuffleIndices, E->Scalars.size(),
+ [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
+ TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
+ Vec = ShuffleBuilder.gather(NonConstants, Vec);
+ });
+ } else if (!allConstant(GatheredScalars)) {
+ // Gather unique scalars and all constants.
+ SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
+ TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
+ Value *BV = ShuffleBuilder.gather(GatheredScalars);
+ ShuffleBuilder.add(BV, ReuseMask);
+ Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
} else {
- ReuseMask.clear();
- copy(VL, Gathered.begin());
+ // Gather all constants.
+ SmallVector<int> Mask(E->Scalars.size(), PoisonMaskElem);
+ for (auto [I, V] : enumerate(E->Scalars)) {
+ if (!isa<PoisonValue>(V))
+ Mask[I] = I;
+ }
+ Value *BV = ShuffleBuilder.gather(E->Scalars);
+ ShuffleBuilder.add(BV, Mask);
+ Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
}
- // Gather unique scalars and all constants.
- Value *Vec = gather(Gathered);
- ShuffleBuilder.add(Vec, ReuseMask);
- Vec = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
+
if (NeedFreeze)
- Vec = Builder.CreateFreeze(Vec);
- return Vec;
+ Res = ShuffleBuilder.createFreeze(Res);
+ return Res;
+}
+
+Value *BoUpSLP::createBuildVector(const TreeEntry *E) {
+ return processBuildVector<ShuffleInstructionBuilder, Value *>(E, Builder,
+ *this);
}
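createBuildVector is now a thin wrapper over the processBuildVector template, so the same buildvector analysis can drive different builder policies (for example the IR-emitting ShuffleInstructionBuilder here and a cost-estimating counterpart). A standalone sketch of that policy-template shape; the builder classes, their members, and the driver body are illustrative only:

#include <iostream>
#include <string>
#include <vector>

struct CostBuilder {
  int PerLane;
  explicit CostBuilder(int PerLaneCost) : PerLane(PerLaneCost) {}
  int add(const std::vector<int> &Vals) {
    return PerLane * static_cast<int>(Vals.size());
  }
  int finalize(int Acc) { return Acc; }
};

struct CodeGenBuilder {
  std::string Prefix;
  explicit CodeGenBuilder(const std::string &P) : Prefix(P) {}
  std::string add(const std::vector<int> &Vals) {
    std::string S = Prefix + "(";
    for (size_t I = 0; I < Vals.size(); ++I)
      S += (I ? ", " : "") + std::to_string(Vals[I]);
    return S + ")";
  }
  std::string finalize(std::string S) { return S; }
};

template <typename BuilderTy, typename ResTy, typename... Args>
ResTy processBuild(const std::vector<int> &Scalars, Args &...Params) {
  BuilderTy Builder(Params...);
  ResTy Res = Builder.add(Scalars); // the shared analysis would live here
  return Builder.finalize(Res);
}

int main() {
  std::vector<int> Scalars = {1, 2, 3};
  int PerLane = 2;
  std::string Prefix = "buildvector";
  std::cout << processBuild<CostBuilder, int>(Scalars, PerLane) << "\n";
  std::cout << processBuild<CodeGenBuilder, std::string>(Scalars, Prefix) << "\n";
}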
Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
@@ -9161,10 +10117,17 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return E->VectorizedValue;
}
+ if (E->State == TreeEntry::NeedToGather) {
+ if (E->getMainOp() && E->Idx == 0)
+ setInsertPointAfterBundle(E);
+ Value *Vec = createBuildVector(E);
+ E->VectorizedValue = Vec;
+ return Vec;
+ }
+
auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
- if (E->State != TreeEntry::NeedToGather &&
- E->getOpcode() == Instruction::Store) {
+ if (E->getOpcode() == Instruction::Store) {
ArrayRef<int> Mask =
ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
E->ReorderIndices.size());
@@ -9175,45 +10138,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
};
- if (E->State == TreeEntry::NeedToGather) {
- if (E->Idx > 0) {
- // We are in the middle of a vectorizable chain. We need to gather the
- // scalars from the users.
- Value *Vec = createBuildVector(E);
- E->VectorizedValue = Vec;
- return Vec;
- }
- if (E->getMainOp())
- setInsertPointAfterBundle(E);
- SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
- // Build a mask out of the reorder indices and reorder scalars per this
- // mask.
- SmallVector<int> ReorderMask;
- inversePermutation(E->ReorderIndices, ReorderMask);
- if (!ReorderMask.empty())
- reorderScalars(GatheredScalars, ReorderMask);
- Value *Vec;
- SmallVector<int> Mask;
- SmallVector<const TreeEntry *> Entries;
- std::optional<TargetTransformInfo::ShuffleKind> Shuffle =
- isGatherShuffledEntry(E, GatheredScalars, Mask, Entries);
- if (Shuffle) {
- assert((Entries.size() == 1 || Entries.size() == 2) &&
- "Expected shuffle of 1 or 2 entries.");
- Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue,
- Entries.back()->VectorizedValue, Mask);
- if (auto *I = dyn_cast<Instruction>(Vec)) {
- GatherShuffleExtractSeq.insert(I);
- CSEBlocks.insert(I->getParent());
- }
- } else {
- Vec = gather(E->Scalars);
- }
- Vec = FinalShuffle(Vec, E);
- E->VectorizedValue = Vec;
- return Vec;
- }
-
assert((E->State == TreeEntry::Vectorize ||
E->State == TreeEntry::ScatterVectorize) &&
"Unhandled state");
@@ -9248,7 +10172,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
// PHINodes may have multiple entries from the same block. We want to
// visit every block once.
- SmallPtrSet<BasicBlock*, 4> VisitedBBs;
+ SmallPtrSet<BasicBlock *, 4> VisitedBBs;
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
ValueList Operands;
@@ -9314,14 +10238,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
SmallVector<int> Mask;
if (!E->ReorderIndices.empty()) {
inversePermutation(E->ReorderIndices, Mask);
- Mask.append(NumElts - NumScalars, UndefMaskElem);
+ Mask.append(NumElts - NumScalars, PoisonMaskElem);
} else {
- Mask.assign(NumElts, UndefMaskElem);
+ Mask.assign(NumElts, PoisonMaskElem);
std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
}
// Create InsertVector shuffle if necessary
bool IsIdentity = true;
- SmallVector<int> PrevMask(NumElts, UndefMaskElem);
+ SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
Mask.swap(PrevMask);
for (unsigned I = 0; I < NumScalars; ++I) {
Value *Scalar = E->Scalars[PrevMask[I]];
@@ -9337,9 +10261,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
}
- SmallVector<int> InsertMask(NumElts, UndefMaskElem);
+ SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
for (unsigned I = 0; I < NumElts; I++) {
- if (Mask[I] != UndefMaskElem)
+ if (Mask[I] != PoisonMaskElem)
InsertMask[Offset + I] = I;
}
SmallBitVector UseMask =
@@ -9354,10 +10278,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
if (!IsFirstPoison.all()) {
for (unsigned I = 0; I < NumElts; I++) {
- if (InsertMask[I] == UndefMaskElem && !IsFirstPoison.test(I))
+ if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
InsertMask[I] = I + NumElts;
}
- }
+ }
V = Builder.CreateShuffleVector(
V,
IsFirstPoison.all() ? PoisonValue::get(V->getType())
@@ -9372,8 +10296,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
SmallBitVector IsFirstPoison =
isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
for (unsigned I = 0; I < NumElts; I++) {
- if (InsertMask[I] == UndefMaskElem)
- InsertMask[I] = IsFirstPoison.test(I) ? UndefMaskElem : I;
+ if (InsertMask[I] == PoisonMaskElem)
+ InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
else
InsertMask[I] += NumElts;
}
@@ -9544,20 +10468,17 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
LoadInst *LI = cast<LoadInst>(VL0);
Instruction *NewLI;
- unsigned AS = LI->getPointerAddressSpace();
Value *PO = LI->getPointerOperand();
if (E->State == TreeEntry::Vectorize) {
- Value *VecPtr = Builder.CreateBitCast(PO, VecTy->getPointerTo(AS));
- NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign());
+ NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
- // The pointer operand uses an in-tree scalar so we add the new BitCast
- // or LoadInst to ExternalUses list to make sure that an extract will
+ // The pointer operand uses an in-tree scalar so we add the new
+ // LoadInst to ExternalUses list to make sure that an extract will
// be generated in the future.
if (TreeEntry *Entry = getTreeEntry(PO)) {
// Find which lane we need to extract.
unsigned FoundLane = Entry->findLaneForValue(PO);
- ExternalUses.emplace_back(
- PO, PO != VecPtr ? cast<User>(VecPtr) : NewLI, FoundLane);
+ ExternalUses.emplace_back(PO, NewLI, FoundLane);
}
} else {
assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
@@ -9653,7 +10574,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
CallInst *CI = cast<CallInst>(VL0);
setInsertPointAfterBundle(E);
- Intrinsic::ID IID = Intrinsic::not_intrinsic;
+ Intrinsic::ID IID = Intrinsic::not_intrinsic;
if (Function *FI = CI->getCalledFunction())
IID = FI->getIntrinsicID();
@@ -9665,8 +10586,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *ScalarArg = nullptr;
std::vector<Value *> OpVecs;
- SmallVector<Type *, 2> TysForDecl =
- {FixedVectorType::get(CI->getType(), E->Scalars.size())};
+ SmallVector<Type *, 2> TysForDecl;
+ // Add return type if intrinsic is overloaded on it.
+ if (isVectorIntrinsicWithOverloadTypeAtArg(IID, -1))
+ TysForDecl.push_back(
+ FixedVectorType::get(CI->getType(), E->Scalars.size()));
for (int j = 0, e = CI->arg_size(); j < e; ++j) {
ValueList OpVL;
// Some intrinsics have scalar arguments. This argument should not be
@@ -9808,14 +10732,15 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return V;
}
default:
- llvm_unreachable("unknown inst");
+ llvm_unreachable("unknown inst");
}
return nullptr;
}
Value *BoUpSLP::vectorizeTree() {
ExtraValueToDebugLocsMap ExternallyUsedValues;
- return vectorizeTree(ExternallyUsedValues);
+ SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
+ return vectorizeTree(ExternallyUsedValues, ReplacedExternals);
}
namespace {
@@ -9829,28 +10754,51 @@ struct ShuffledInsertData {
};
} // namespace
-Value *BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues,
- Instruction *ReductionRoot) {
+Value *BoUpSLP::vectorizeTree(
+ const ExtraValueToDebugLocsMap &ExternallyUsedValues,
+ SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
+ Instruction *ReductionRoot) {
// All blocks must be scheduled before any instructions are inserted.
for (auto &BSIter : BlocksSchedules) {
scheduleBlock(BSIter.second.get());
}
-
- // Pre-gather last instructions.
- for (const std::unique_ptr<TreeEntry> &E : VectorizableTree) {
- if ((E->State == TreeEntry::NeedToGather &&
- (!E->getMainOp() || E->Idx > 0)) ||
- (E->State != TreeEntry::NeedToGather &&
- E->getOpcode() == Instruction::ExtractValue) ||
- E->getOpcode() == Instruction::InsertElement)
- continue;
- Instruction *LastInst = &getLastInstructionInBundle(E.get());
- EntryToLastInstruction.try_emplace(E.get(), LastInst);
- }
+ // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
+ // need to rebuild it.
+ EntryToLastInstruction.clear();
Builder.SetInsertPoint(ReductionRoot ? ReductionRoot
: &F->getEntryBlock().front());
auto *VectorRoot = vectorizeTree(VectorizableTree[0].get());
+ // Run through the list of postponed gathers and emit them, replacing the temp
+ // emitted allocas with actual vector instructions.
+ ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
+ DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
+ for (const TreeEntry *E : PostponedNodes) {
+ auto *TE = const_cast<TreeEntry *>(E);
+ if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
+ if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
+ TE->UserTreeIndices.front().EdgeIdx)))
+ // Found gather node which is absolutely the same as one of the
+ // vectorized nodes. It may happen after reordering.
+ continue;
+ auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
+ TE->VectorizedValue = nullptr;
+ auto *UserI =
+ cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
+ Builder.SetInsertPoint(PrevVec);
+ Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
+ Value *Vec = vectorizeTree(TE);
+ PrevVec->replaceAllUsesWith(Vec);
+ PostponedValues.try_emplace(Vec).first->second.push_back(TE);
+ // Replace the stub vector node, if it was used before for one of the
+ // buildvector nodes already.
+ auto It = PostponedValues.find(PrevVec);
+ if (It != PostponedValues.end()) {
+ for (TreeEntry *VTE : It->getSecond())
+ VTE->VectorizedValue = Vec;
+ }
+ eraseInstruction(PrevVec);
+ }
// If the vectorized tree can be rewritten in a smaller type, we truncate the
// vectorized root. InstCombine will then rewrite the entire expression. We
@@ -9968,14 +10916,9 @@ Value *BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues,
Builder.SetInsertPoint(&F->getEntryBlock().front());
}
Value *NewInst = ExtractAndExtendIfNeeded(Vec);
- auto &NewInstLocs = ExternallyUsedValues[NewInst];
- auto It = ExternallyUsedValues.find(Scalar);
- assert(It != ExternallyUsedValues.end() &&
- "Externally used scalar is not found in ExternallyUsedValues");
- NewInstLocs.append(It->second);
- ExternallyUsedValues.erase(Scalar);
// Required to update internally referenced instructions.
Scalar->replaceAllUsesWith(NewInst);
+ ReplacedExternals.emplace_back(Scalar, NewInst);
continue;
}
@@ -10004,7 +10947,7 @@ Value *BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues,
ShuffledInserts.size() - 1);
SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
if (Mask.empty())
- Mask.assign(FTy->getNumElements(), UndefMaskElem);
+ Mask.assign(FTy->getNumElements(), PoisonMaskElem);
// Find the insertvector, vectorized in tree, if any.
Value *Base = VU;
while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
@@ -10017,7 +10960,7 @@ Value *BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues,
do {
IEBase = cast<InsertElementInst>(Base);
int IEIdx = *getInsertIndex(IEBase);
- assert(Mask[Idx] == UndefMaskElem &&
+ assert(Mask[Idx] == PoisonMaskElem &&
"InsertElementInstruction used already.");
Mask[IEIdx] = IEIdx;
Base = IEBase->getOperand(0);
@@ -10035,7 +10978,7 @@ Value *BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues,
}
SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
if (Mask.empty())
- Mask.assign(FTy->getNumElements(), UndefMaskElem);
+ Mask.assign(FTy->getNumElements(), PoisonMaskElem);
Mask[Idx] = ExternalUse.Lane;
It->InsertElements.push_back(cast<InsertElementInst>(User));
continue;
@@ -10077,8 +11020,8 @@ Value *BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues,
}
auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
- SmallVector<int> CombinedMask1(Mask.size(), UndefMaskElem);
- SmallVector<int> CombinedMask2(Mask.size(), UndefMaskElem);
+ SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
+ SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
for (int I = 0, E = Mask.size(); I < E; ++I) {
if (Mask[I] < VF)
@@ -10103,9 +11046,9 @@ Value *BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues,
return std::make_pair(Vec, true);
}
if (!ForSingleMask) {
- SmallVector<int> ResizeMask(VF, UndefMaskElem);
+ SmallVector<int> ResizeMask(VF, PoisonMaskElem);
for (unsigned I = 0; I < VF; ++I) {
- if (Mask[I] != UndefMaskElem)
+ if (Mask[I] != PoisonMaskElem)
ResizeMask[Mask[I]] = Mask[I];
}
Vec = CreateShuffle(Vec, nullptr, ResizeMask);
@@ -10308,14 +11251,14 @@ void BoUpSLP::optimizeGatherSequence() {
// registers.
unsigned LastUndefsCnt = 0;
for (int I = 0, E = NewMask.size(); I < E; ++I) {
- if (SM1[I] == UndefMaskElem)
+ if (SM1[I] == PoisonMaskElem)
++LastUndefsCnt;
else
LastUndefsCnt = 0;
- if (NewMask[I] != UndefMaskElem && SM1[I] != UndefMaskElem &&
+ if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
NewMask[I] != SM1[I])
return false;
- if (NewMask[I] == UndefMaskElem)
+ if (NewMask[I] == PoisonMaskElem)
NewMask[I] = SM1[I];
}
// Check if the last undefs actually change the final number of used vector
@@ -10590,11 +11533,20 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
}
// Search up and down at the same time, because we don't know if the new
// instruction is above or below the existing scheduling region.
+ // Ignore debug info (and other "AssumeLike" intrinsics) so that's not counted
+ // against the budget. Otherwise debug info could affect codegen.
BasicBlock::reverse_iterator UpIter =
++ScheduleStart->getIterator().getReverse();
BasicBlock::reverse_iterator UpperEnd = BB->rend();
BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
BasicBlock::iterator LowerEnd = BB->end();
+ auto IsAssumeLikeIntr = [](const Instruction &I) {
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ return II->isAssumeLikeIntrinsic();
+ return false;
+ };
+ UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
+ DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
&*DownIter != I) {
if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
@@ -10604,6 +11556,9 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
++UpIter;
++DownIter;
+
+ UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
+ DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
}
if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
assert(I->getParent() == ScheduleStart->getParent() &&
@@ -10804,7 +11759,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
unsigned numAliased = 0;
unsigned DistToSrc = 1;
- for ( ; DepDest; DepDest = DepDest->NextLoadStore) {
+ for (; DepDest; DepDest = DepDest->NextLoadStore) {
assert(isInSchedulingRegion(DepDest));
// We have two limits to reduce the complexity:
@@ -11163,8 +12118,8 @@ void BoUpSLP::computeMinimumValueSizes() {
// we can truncate the roots to this narrower type.
for (auto *Root : TreeRoot) {
auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
- MaxBitWidth = std::max<unsigned>(
- Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
+ MaxBitWidth = std::max<unsigned>(Mask.getBitWidth() - Mask.countl_zero(),
+ MaxBitWidth);
}
// True if the roots can be zero-extended back to their original type, rather
@@ -11223,8 +12178,7 @@ void BoUpSLP::computeMinimumValueSizes() {
}
// Round MaxBitWidth up to the next power-of-two.
- if (!isPowerOf2_64(MaxBitWidth))
- MaxBitWidth = NextPowerOf2(MaxBitWidth);
+ MaxBitWidth = llvm::bit_ceil(MaxBitWidth);
// If the maximum bit width we compute is less than the width of the roots'
// type, we can proceed with the narrowing. Otherwise, do nothing.
@@ -11242,60 +12196,6 @@ void BoUpSLP::computeMinimumValueSizes() {
MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive);
}
-namespace {
-
-/// The SLPVectorizer Pass.
-struct SLPVectorizer : public FunctionPass {
- SLPVectorizerPass Impl;
-
- /// Pass identification, replacement for typeid
- static char ID;
-
- explicit SLPVectorizer() : FunctionPass(ID) {
- initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
- }
-
- bool doInitialization(Module &M) override { return false; }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
- auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
- auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
- auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
-
- return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- FunctionPass::getAnalysisUsage(AU);
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<DemandedBitsWrapperPass>();
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
- AU.addRequired<InjectTLIMappingsLegacy>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.setPreservesCFG();
- }
-};
-
-} // end anonymous namespace
-
PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
@@ -11536,7 +12436,7 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
unsigned MaxVecRegSize = R.getMaxVecRegSize();
unsigned EltSize = R.getVectorElementSize(Operands[0]);
- unsigned MaxElts = llvm::PowerOf2Floor(MaxVecRegSize / EltSize);
+ unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store),
MaxElts);
@@ -11618,17 +12518,8 @@ void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
}
}
-bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
- if (!A || !B)
- return false;
- if (isa<InsertElementInst>(A) || isa<InsertElementInst>(B))
- return false;
- Value *VL[] = {A, B};
- return tryToVectorizeList(VL, R);
-}
-
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
- bool LimitForRegisterSize) {
+ bool MaxVFOnly) {
if (VL.size() < 2)
return false;
@@ -11663,7 +12554,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
unsigned Sz = R.getVectorElementSize(I0);
unsigned MinVF = R.getMinVF(Sz);
- unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
+ unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
if (MaxVF < 2) {
R.getORE()->emit([&]() {
@@ -11690,21 +12581,17 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
if (TTI->getNumberOfParts(VecTy) == VF)
continue;
for (unsigned I = NextInst; I < MaxInst; ++I) {
- unsigned OpsWidth = 0;
+ unsigned ActualVF = std::min(MaxInst - I, VF);
- if (I + VF > MaxInst)
- OpsWidth = MaxInst - I;
- else
- OpsWidth = VF;
-
- if (!isPowerOf2_32(OpsWidth))
+ if (!isPowerOf2_32(ActualVF))
continue;
- if ((LimitForRegisterSize && OpsWidth < MaxVF) ||
- (VF > MinVF && OpsWidth <= VF / 2) || (VF == MinVF && OpsWidth < 2))
+ if (MaxVFOnly && ActualVF < MaxVF)
+ break;
+ if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
break;
- ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
+ ArrayRef<Value *> Ops = VL.slice(I, ActualVF);
// Check that a previous iteration of this loop did not delete the Value.
if (llvm::any_of(Ops, [&R](Value *V) {
auto *I = dyn_cast<Instruction>(V);
@@ -11712,7 +12599,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
}))
continue;
- LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
<< "\n");
R.buildTree(Ops);
@@ -11730,7 +12617,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
MinCost = std::min(MinCost, Cost);
LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
- << " for VF=" << OpsWidth << "\n");
+ << " for VF=" << ActualVF << "\n");
if (Cost < -SLPCostThreshold) {
LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
@@ -11806,14 +12693,14 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
}
if (Candidates.size() == 1)
- return tryToVectorizePair(Op0, Op1, R);
+ return tryToVectorizeList({Op0, Op1}, R);
// We have multiple options. Try to pick the single best.
std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
if (!BestCandidate)
return false;
- return tryToVectorizePair(Candidates[*BestCandidate].first,
- Candidates[*BestCandidate].second, R);
+ return tryToVectorizeList(
+ {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
}
namespace {
@@ -11857,6 +12744,9 @@ class HorizontalReduction {
WeakTrackingVH ReductionRoot;
/// The type of reduction operation.
RecurKind RdxKind;
+ /// Checks if the optimization of original scalar identity operations on
+ /// matched horizontal reductions is enabled and allowed.
+ bool IsSupportedHorRdxIdentityOp = false;
static bool isCmpSelMinMax(Instruction *I) {
return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
@@ -11888,6 +12778,9 @@ class HorizontalReduction {
return I->getFastMathFlags().noNaNs();
}
+ if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
+ return true;
+
return I->isAssociative();
}
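
The FMaximum/FMinimum case added above can return true unconditionally: llvm.maximum/llvm.minimum define NaN and signed-zero behaviour completely, so the reduction result does not depend on association order, unlike FMax/FMin (maxnum/minnum), which the check just above only accepts under noNaNs. A rough scalar model of the difference, illustration only and not code from this patch (helper names are invented):

#include <cmath>
// maxnum-like (FMax/FMin): a NaN operand is dropped, so reassociating the
// reduction is only safe when the noNaNs flag checked above is present.
double maxnumLike(double A, double B) {
  if (std::isnan(A)) return B;
  if (std::isnan(B)) return A;
  return A > B ? A : B;
}
// maximum-like (FMaximum/FMinimum): a NaN operand propagates to the result
// (and -0.0 orders below +0.0, omitted here), so any grouping of the
// reduction gives the same answer and no fast-math check is needed.
double maximumLike(double A, double B) {
  if (std::isnan(A) || std::isnan(B)) return std::nan("");
  return A > B ? A : B;
}
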
@@ -11905,6 +12798,7 @@ class HorizontalReduction {
static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS,
Value *RHS, const Twine &Name, bool UseSelect) {
unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
+ bool IsConstant = isConstant(LHS) && isConstant(RHS);
switch (Kind) {
case RecurKind::Or:
if (UseSelect &&
@@ -11926,29 +12820,49 @@ class HorizontalReduction {
return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
Name);
case RecurKind::FMax:
+ if (IsConstant)
+ return ConstantFP::get(LHS->getType(),
+ maxnum(cast<ConstantFP>(LHS)->getValueAPF(),
+ cast<ConstantFP>(RHS)->getValueAPF()));
return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
case RecurKind::FMin:
+ if (IsConstant)
+ return ConstantFP::get(LHS->getType(),
+ minnum(cast<ConstantFP>(LHS)->getValueAPF(),
+ cast<ConstantFP>(RHS)->getValueAPF()));
return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
+ case RecurKind::FMaximum:
+ if (IsConstant)
+ return ConstantFP::get(LHS->getType(),
+ maximum(cast<ConstantFP>(LHS)->getValueAPF(),
+ cast<ConstantFP>(RHS)->getValueAPF()));
+ return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS);
+ case RecurKind::FMinimum:
+ if (IsConstant)
+ return ConstantFP::get(LHS->getType(),
+ minimum(cast<ConstantFP>(LHS)->getValueAPF(),
+ cast<ConstantFP>(RHS)->getValueAPF()));
+ return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS);
case RecurKind::SMax:
- if (UseSelect) {
+ if (IsConstant || UseSelect) {
Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
return Builder.CreateSelect(Cmp, LHS, RHS, Name);
}
return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);
case RecurKind::SMin:
- if (UseSelect) {
+ if (IsConstant || UseSelect) {
Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
return Builder.CreateSelect(Cmp, LHS, RHS, Name);
}
return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS);
case RecurKind::UMax:
- if (UseSelect) {
+ if (IsConstant || UseSelect) {
Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
return Builder.CreateSelect(Cmp, LHS, RHS, Name);
}
return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS);
case RecurKind::UMin:
- if (UseSelect) {
+ if (IsConstant || UseSelect) {
Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
return Builder.CreateSelect(Cmp, LHS, RHS, Name);
}
@@ -11984,6 +12898,7 @@ class HorizontalReduction {
return Op;
}
+public:
static RecurKind getRdxKind(Value *V) {
auto *I = dyn_cast<Instruction>(V);
if (!I)
@@ -12010,6 +12925,10 @@ class HorizontalReduction {
if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
return RecurKind::FMin;
+ if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
+ return RecurKind::FMaximum;
+ if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
+ return RecurKind::FMinimum;
// This matches either cmp+select or intrinsics. SLP is expected to handle
// either form.
// TODO: If we are canonicalizing to intrinsics, we can remove several
@@ -12086,6 +13005,7 @@ class HorizontalReduction {
return isCmpSelMinMax(I) ? 1 : 0;
}
+private:
/// Total number of operands in the reduction operation.
static unsigned getNumberOfOperands(Instruction *I) {
return isCmpSelMinMax(I) ? 3 : 2;
@@ -12134,17 +13054,6 @@ class HorizontalReduction {
}
}
- static Value *getLHS(RecurKind Kind, Instruction *I) {
- if (Kind == RecurKind::None)
- return nullptr;
- return I->getOperand(getFirstOperandIndex(I));
- }
- static Value *getRHS(RecurKind Kind, Instruction *I) {
- if (Kind == RecurKind::None)
- return nullptr;
- return I->getOperand(getFirstOperandIndex(I) + 1);
- }
-
static bool isGoodForReduction(ArrayRef<Value *> Data) {
int Sz = Data.size();
auto *I = dyn_cast<Instruction>(Data.front());
@@ -12156,65 +13065,39 @@ public:
HorizontalReduction() = default;
/// Try to find a reduction tree.
- bool matchAssociativeReduction(PHINode *Phi, Instruction *Inst,
+ bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
ScalarEvolution &SE, const DataLayout &DL,
const TargetLibraryInfo &TLI) {
- assert((!Phi || is_contained(Phi->operands(), Inst)) &&
- "Phi needs to use the binary operator");
- assert((isa<BinaryOperator>(Inst) || isa<SelectInst>(Inst) ||
- isa<IntrinsicInst>(Inst)) &&
- "Expected binop, select, or intrinsic for reduction matching");
- RdxKind = getRdxKind(Inst);
-
- // We could have a initial reductions that is not an add.
- // r *= v1 + v2 + v3 + v4
- // In such a case start looking for a tree rooted in the first '+'.
- if (Phi) {
- if (getLHS(RdxKind, Inst) == Phi) {
- Phi = nullptr;
- Inst = dyn_cast<Instruction>(getRHS(RdxKind, Inst));
- if (!Inst)
- return false;
- RdxKind = getRdxKind(Inst);
- } else if (getRHS(RdxKind, Inst) == Phi) {
- Phi = nullptr;
- Inst = dyn_cast<Instruction>(getLHS(RdxKind, Inst));
- if (!Inst)
- return false;
- RdxKind = getRdxKind(Inst);
- }
- }
-
- if (!isVectorizable(RdxKind, Inst))
+ RdxKind = HorizontalReduction::getRdxKind(Root);
+ if (!isVectorizable(RdxKind, Root))
return false;
// Analyze "regular" integer/FP types for reductions - no target-specific
// types or pointers.
- Type *Ty = Inst->getType();
+ Type *Ty = Root->getType();
if (!isValidElementType(Ty) || Ty->isPointerTy())
return false;
// Though the ultimate reduction may have multiple uses, its condition must
// have only single use.
- if (auto *Sel = dyn_cast<SelectInst>(Inst))
+ if (auto *Sel = dyn_cast<SelectInst>(Root))
if (!Sel->getCondition()->hasOneUse())
return false;
- ReductionRoot = Inst;
+ ReductionRoot = Root;
// Iterate through all the operands of the possible reduction tree and
// gather all the reduced values, sorting them by their value id.
- BasicBlock *BB = Inst->getParent();
- bool IsCmpSelMinMax = isCmpSelMinMax(Inst);
- SmallVector<Instruction *> Worklist(1, Inst);
+ BasicBlock *BB = Root->getParent();
+ bool IsCmpSelMinMax = isCmpSelMinMax(Root);
+ SmallVector<Instruction *> Worklist(1, Root);
// Checks if the operands of the \p TreeN instruction are also reduction
// operations or should be treated as reduced values or an extra argument,
// which is not part of the reduction.
- auto &&CheckOperands = [this, IsCmpSelMinMax,
- BB](Instruction *TreeN,
- SmallVectorImpl<Value *> &ExtraArgs,
- SmallVectorImpl<Value *> &PossibleReducedVals,
- SmallVectorImpl<Instruction *> &ReductionOps) {
+ auto CheckOperands = [&](Instruction *TreeN,
+ SmallVectorImpl<Value *> &ExtraArgs,
+ SmallVectorImpl<Value *> &PossibleReducedVals,
+ SmallVectorImpl<Instruction *> &ReductionOps) {
for (int I = getFirstOperandIndex(TreeN),
End = getNumberOfOperands(TreeN);
I < End; ++I) {
@@ -12229,10 +13112,14 @@ public:
}
// If the edge is not an instruction, or it is different from the main
// reduction opcode or has too many uses - possible reduced value.
+ // Also, do not try to reduce constant values if the operation is not
+ // foldable.
if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||
IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
!hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
- !isVectorizable(getRdxKind(EdgeInst), EdgeInst)) {
+ !isVectorizable(RdxKind, EdgeInst) ||
+ (R.isAnalyzedReductionRoot(EdgeInst) &&
+ all_of(EdgeInst->operands(), Constant::classof))) {
PossibleReducedVals.push_back(EdgeVal);
continue;
}
@@ -12246,10 +13133,43 @@ public:
// instructions (grouping them by the predicate).
MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
PossibleReducedVals;
- initReductionOps(Inst);
+ initReductionOps(Root);
DenseMap<Value *, SmallVector<LoadInst *>> LoadsMap;
SmallSet<size_t, 2> LoadKeyUsed;
SmallPtrSet<Value *, 4> DoNotReverseVals;
+
+ auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
+ Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
+ if (LoadKeyUsed.contains(Key)) {
+ auto LIt = LoadsMap.find(Ptr);
+ if (LIt != LoadsMap.end()) {
+ for (LoadInst *RLI : LIt->second) {
+ if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
+ LI->getType(), LI->getPointerOperand(), DL, SE,
+ /*StrictCheck=*/true))
+ return hash_value(RLI->getPointerOperand());
+ }
+ for (LoadInst *RLI : LIt->second) {
+ if (arePointersCompatible(RLI->getPointerOperand(),
+ LI->getPointerOperand(), TLI)) {
+ hash_code SubKey = hash_value(RLI->getPointerOperand());
+ DoNotReverseVals.insert(RLI);
+ return SubKey;
+ }
+ }
+ if (LIt->second.size() > 2) {
+ hash_code SubKey =
+ hash_value(LIt->second.back()->getPointerOperand());
+ DoNotReverseVals.insert(LIt->second.back());
+ return SubKey;
+ }
+ }
+ }
+ LoadKeyUsed.insert(Key);
+ LoadsMap.try_emplace(Ptr).first->second.push_back(LI);
+ return hash_value(LI->getPointerOperand());
+ };
+
while (!Worklist.empty()) {
Instruction *TreeN = Worklist.pop_back_val();
SmallVector<Value *> Args;
@@ -12269,41 +13189,8 @@ public:
// results.
for (Value *V : PossibleRedVals) {
size_t Key, Idx;
- std::tie(Key, Idx) = generateKeySubkey(
- V, &TLI,
- [&](size_t Key, LoadInst *LI) {
- Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
- if (LoadKeyUsed.contains(Key)) {
- auto LIt = LoadsMap.find(Ptr);
- if (LIt != LoadsMap.end()) {
- for (LoadInst *RLI: LIt->second) {
- if (getPointersDiff(
- RLI->getType(), RLI->getPointerOperand(),
- LI->getType(), LI->getPointerOperand(), DL, SE,
- /*StrictCheck=*/true))
- return hash_value(RLI->getPointerOperand());
- }
- for (LoadInst *RLI : LIt->second) {
- if (arePointersCompatible(RLI->getPointerOperand(),
- LI->getPointerOperand(), TLI)) {
- hash_code SubKey = hash_value(RLI->getPointerOperand());
- DoNotReverseVals.insert(RLI);
- return SubKey;
- }
- }
- if (LIt->second.size() > 2) {
- hash_code SubKey =
- hash_value(LIt->second.back()->getPointerOperand());
- DoNotReverseVals.insert(LIt->second.back());
- return SubKey;
- }
- }
- }
- LoadKeyUsed.insert(Key);
- LoadsMap.try_emplace(Ptr).first->second.push_back(LI);
- return hash_value(LI->getPointerOperand());
- },
- /*AllowAlternate=*/false);
+ std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
+ /*AllowAlternate=*/false);
++PossibleReducedVals[Key][Idx]
.insert(std::make_pair(V, 0))
.first->second;
@@ -12312,40 +13199,8 @@ public:
PossibleReductionOps.rend());
} else {
size_t Key, Idx;
- std::tie(Key, Idx) = generateKeySubkey(
- TreeN, &TLI,
- [&](size_t Key, LoadInst *LI) {
- Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
- if (LoadKeyUsed.contains(Key)) {
- auto LIt = LoadsMap.find(Ptr);
- if (LIt != LoadsMap.end()) {
- for (LoadInst *RLI: LIt->second) {
- if (getPointersDiff(RLI->getType(),
- RLI->getPointerOperand(), LI->getType(),
- LI->getPointerOperand(), DL, SE,
- /*StrictCheck=*/true))
- return hash_value(RLI->getPointerOperand());
- }
- for (LoadInst *RLI : LIt->second) {
- if (arePointersCompatible(RLI->getPointerOperand(),
- LI->getPointerOperand(), TLI)) {
- hash_code SubKey = hash_value(RLI->getPointerOperand());
- DoNotReverseVals.insert(RLI);
- return SubKey;
- }
- }
- if (LIt->second.size() > 2) {
- hash_code SubKey = hash_value(LIt->second.back()->getPointerOperand());
- DoNotReverseVals.insert(LIt->second.back());
- return SubKey;
- }
- }
- }
- LoadKeyUsed.insert(Key);
- LoadsMap.try_emplace(Ptr).first->second.push_back(LI);
- return hash_value(LI->getPointerOperand());
- },
- /*AllowAlternate=*/false);
+ std::tie(Key, Idx) = generateKeySubkey(TreeN, &TLI, GenerateLoadsSubkey,
+ /*AllowAlternate=*/false);
++PossibleReducedVals[Key][Idx]
.insert(std::make_pair(TreeN, 0))
.first->second;
@@ -12407,14 +13262,18 @@ public:
// If there are a sufficient number of reduction values, reduce
// to a nearby power-of-2. We can safely generate oversized
// vectors and rely on the backend to split them to legal sizes.
- size_t NumReducedVals =
+ unsigned NumReducedVals =
std::accumulate(ReducedVals.begin(), ReducedVals.end(), 0,
- [](size_t Num, ArrayRef<Value *> Vals) {
+ [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
if (!isGoodForReduction(Vals))
return Num;
return Num + Vals.size();
});
- if (NumReducedVals < ReductionLimit) {
+ if (NumReducedVals < ReductionLimit &&
+ (!AllowHorRdxIdenityOptimization ||
+ all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
+ return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
+ }))) {
for (ReductionOpsType &RdxOps : ReductionOps)
for (Value *RdxOp : RdxOps)
V.analyzedReductionRoot(cast<Instruction>(RdxOp));
@@ -12428,6 +13287,7 @@ public:
DenseMap<Value *, WeakTrackingVH> TrackedVals(
ReducedVals.size() * ReducedVals.front().size() + ExtraArgs.size());
BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
+ SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
ExternallyUsedValues.reserve(ExtraArgs.size() + 1);
// The same extra argument may be used several times, so log each attempt
// to use it.
@@ -12448,6 +13308,18 @@ public:
return cast<Instruction>(ScalarCond);
};
+ // Return new VectorizedTree, based on previous value.
+ auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
+ if (VectorizedTree) {
+ // Update the final value in the reduction.
+ Builder.SetCurrentDebugLocation(
+ cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
+ return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
+ ReductionOps);
+ }
+ // Initialize the final value in the reduction.
+ return Res;
+ };
// The reduction root is used as the insertion point for new instructions,
// so set it as externally used to prevent it from being deleted.
ExternallyUsedValues[ReductionRoot];
@@ -12459,6 +13331,12 @@ public:
continue;
IgnoreList.insert(RdxOp);
}
+ // Intersect the fast-math-flags from all reduction operations.
+ FastMathFlags RdxFMF;
+ RdxFMF.set();
+ for (Value *U : IgnoreList)
+ if (auto *FPMO = dyn_cast<FPMathOperator>(U))
+ RdxFMF &= FPMO->getFastMathFlags();
bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
// Need to track reduced vals, they may be changed during vectorization of
@@ -12519,16 +13397,82 @@ public:
}
}
}
+
+ // Emit code for constant values.
+ if (AllowHorRdxIdenityOptimization && Candidates.size() > 1 &&
+ allConstant(Candidates)) {
+ Value *Res = Candidates.front();
+ ++VectorizedVals.try_emplace(Candidates.front(), 0).first->getSecond();
+ for (Value *VC : ArrayRef(Candidates).drop_front()) {
+ Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
+ ++VectorizedVals.try_emplace(VC, 0).first->getSecond();
+ if (auto *ResI = dyn_cast<Instruction>(Res))
+ V.analyzedReductionRoot(ResI);
+ }
+ VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
+ continue;
+ }
+
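
As a purely illustrative view of the constant path above: when every candidate is a constant, the chain built through createOp folds at compile time and no vector code is needed. A sketch with invented names:

// Illustration only: what the folded chain computes for an fmax-style
// reduction over known constants {2.0, 8.0, 1.0, 8.0}; the result (8.0)
// is known at compile time, so only the scalar constant survives.
constexpr double foldedFMaxExample() {
  const double Vals[] = {2.0, 8.0, 1.0, 8.0};
  double Res = Vals[0];
  for (double V : Vals)
    Res = V > Res ? V : Res;
  return Res; // 8.0
}
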
unsigned NumReducedVals = Candidates.size();
- if (NumReducedVals < ReductionLimit)
+ if (NumReducedVals < ReductionLimit &&
+ (NumReducedVals < 2 || !AllowHorRdxIdenityOptimization ||
+ !isSplat(Candidates)))
continue;
+ // Check if we support processing of repeated scalar values (optimization of
+ // original scalar identity operations on matched horizontal reductions).
+ IsSupportedHorRdxIdentityOp =
+ AllowHorRdxIdenityOptimization && RdxKind != RecurKind::Mul &&
+ RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
+ // Gather same values.
+ MapVector<Value *, unsigned> SameValuesCounter;
+ if (IsSupportedHorRdxIdentityOp)
+ for (Value *V : Candidates)
+ ++SameValuesCounter.insert(std::make_pair(V, 0)).first->second;
+ // Used to check if the reduced values are used the same number of times. In
+ // this case the compiler may produce better code. E.g. if the reduced values
+ // are aabbccdd (8 x values), then the first node of the tree will have a
+ // node for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
+ // Plus, the final reduction will be performed on <8 x aabbccdd>.
+ // Instead, the compiler may build a <4 x abcd> tree immediately and compute
+ // reduction(4 x abcd) * 2.
+ // Currently this only handles add/fadd/xor; and/or/min/max do not require
+ // this analysis, and other operations may require an extra estimation of
+ // the profitability.
+ bool SameScaleFactor = false;
+ bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
+ SameValuesCounter.size() != Candidates.size();
+ if (OptReusedScalars) {
+ SameScaleFactor =
+ (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
+ RdxKind == RecurKind::Xor) &&
+ all_of(drop_begin(SameValuesCounter),
+ [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
+ return P.second == SameValuesCounter.front().second;
+ });
+ Candidates.resize(SameValuesCounter.size());
+ transform(SameValuesCounter, Candidates.begin(),
+ [](const auto &P) { return P.first; });
+ NumReducedVals = Candidates.size();
+ // Have a reduction of the same element.
+ if (NumReducedVals == 1) {
+ Value *OrigV = TrackedToOrig.find(Candidates.front())->second;
+ unsigned Cnt = SameValuesCounter.lookup(OrigV);
+ Value *RedVal =
+ emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
+ VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
+ VectorizedVals.try_emplace(OrigV, Cnt);
+ continue;
+ }
+ }
+
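
A scalar illustration of the reuse analysis described in the comment above (not taken from the patch; names are invented): when every reduced value repeats the same number of times, reducing the unique values once and scaling the result afterwards is equivalent.

// reduce_add(a,a,b,b,c,c,d,d) == 2 * reduce_add(a,b,c,d)
int reduceAdd(const int *Vals, unsigned N) {
  int Sum = 0;
  for (unsigned I = 0; I < N; ++I)
    Sum += Vals[I];
  return Sum;
}
// So for the aabbccdd example SLP can build a 4-wide tree over {a,b,c,d} and
// multiply the reduced result by the common repeat count (the SameScaleFactor
// path above); xor uses the parity of the count, fadd an fmul, instead.
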
unsigned MaxVecRegSize = V.getMaxVecRegSize();
unsigned EltSize = V.getVectorElementSize(Candidates[0]);
- unsigned MaxElts = RegMaxNumber * PowerOf2Floor(MaxVecRegSize / EltSize);
+ unsigned MaxElts =
+ RegMaxNumber * llvm::bit_floor(MaxVecRegSize / EltSize);
unsigned ReduxWidth = std::min<unsigned>(
- PowerOf2Floor(NumReducedVals), std::max(RedValsMaxNumber, MaxElts));
+ llvm::bit_floor(NumReducedVals), std::max(RedValsMaxNumber, MaxElts));
unsigned Start = 0;
unsigned Pos = Start;
// Restarts vectorization attempt with lower vector factor.
@@ -12551,6 +13495,7 @@ public:
ReduxWidth /= 2;
return IsAnyRedOpGathered;
};
+ bool AnyVectorized = false;
while (Pos < NumReducedVals - ReduxWidth + 1 &&
ReduxWidth >= ReductionLimit) {
// Dependency in tree of the reduction ops - drop this attempt, try
@@ -12603,34 +13548,24 @@ public:
LocalExternallyUsedValues[TrackedVals[V]];
});
}
- // Number of uses of the candidates in the vector of values.
- SmallDenseMap<Value *, unsigned> NumUses(Candidates.size());
- for (unsigned Cnt = 0; Cnt < Pos; ++Cnt) {
- Value *V = Candidates[Cnt];
- ++NumUses.try_emplace(V, 0).first->getSecond();
- }
- for (unsigned Cnt = Pos + ReduxWidth; Cnt < NumReducedVals; ++Cnt) {
- Value *V = Candidates[Cnt];
- ++NumUses.try_emplace(V, 0).first->getSecond();
+ if (!IsSupportedHorRdxIdentityOp) {
+ // Number of uses of the candidates in the vector of values.
+ assert(SameValuesCounter.empty() &&
+ "Reused values counter map is not empty");
+ for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
+ if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
+ continue;
+ Value *V = Candidates[Cnt];
+ Value *OrigV = TrackedToOrig.find(V)->second;
+ ++SameValuesCounter[OrigV];
+ }
}
SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
// Gather externally used values.
SmallPtrSet<Value *, 4> Visited;
- for (unsigned Cnt = 0; Cnt < Pos; ++Cnt) {
- Value *RdxVal = Candidates[Cnt];
- if (!Visited.insert(RdxVal).second)
+ for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
+ if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
continue;
- // Check if the scalar was vectorized as part of the vectorization
- // tree but not the top node.
- if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
- LocalExternallyUsedValues[RdxVal];
- continue;
- }
- unsigned NumOps = VectorizedVals.lookup(RdxVal) + NumUses[RdxVal];
- if (NumOps != ReducedValsToOps.find(RdxVal)->second.size())
- LocalExternallyUsedValues[RdxVal];
- }
- for (unsigned Cnt = Pos + ReduxWidth; Cnt < NumReducedVals; ++Cnt) {
Value *RdxVal = Candidates[Cnt];
if (!Visited.insert(RdxVal).second)
continue;
@@ -12640,42 +13575,34 @@ public:
LocalExternallyUsedValues[RdxVal];
continue;
}
- unsigned NumOps = VectorizedVals.lookup(RdxVal) + NumUses[RdxVal];
- if (NumOps != ReducedValsToOps.find(RdxVal)->second.size())
+ Value *OrigV = TrackedToOrig.find(RdxVal)->second;
+ unsigned NumOps =
+ VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV];
+ if (NumOps != ReducedValsToOps.find(OrigV)->second.size())
LocalExternallyUsedValues[RdxVal];
}
+ // Do not need the list of reused scalars in regular mode anymore.
+ if (!IsSupportedHorRdxIdentityOp)
+ SameValuesCounter.clear();
for (Value *RdxVal : VL)
if (RequiredExtract.contains(RdxVal))
LocalExternallyUsedValues[RdxVal];
+ // Update LocalExternallyUsedValues for the scalars replaced by
+ // extractelement instructions.
+ for (const std::pair<Value *, Value *> &Pair : ReplacedExternals) {
+ auto It = ExternallyUsedValues.find(Pair.first);
+ if (It == ExternallyUsedValues.end())
+ continue;
+ LocalExternallyUsedValues[Pair.second].append(It->second);
+ }
V.buildExternalUses(LocalExternallyUsedValues);
V.computeMinimumValueSizes();
- // Intersect the fast-math-flags from all reduction operations.
- FastMathFlags RdxFMF;
- RdxFMF.set();
- for (Value *U : IgnoreList)
- if (auto *FPMO = dyn_cast<FPMathOperator>(U))
- RdxFMF &= FPMO->getFastMathFlags();
// Estimate cost.
InstructionCost TreeCost = V.getTreeCost(VL);
InstructionCost ReductionCost =
- getReductionCost(TTI, VL, ReduxWidth, RdxFMF);
- if (V.isVectorizedFirstNode() && isa<LoadInst>(VL.front())) {
- Instruction *MainOp = V.getFirstNodeMainOp();
- for (Value *V : VL) {
- auto *VI = dyn_cast<LoadInst>(V);
- // Add the costs of scalar GEP pointers, to be removed from the
- // code.
- if (!VI || VI == MainOp)
- continue;
- auto *Ptr = dyn_cast<GetElementPtrInst>(VI->getPointerOperand());
- if (!Ptr || !Ptr->hasOneUse() || Ptr->hasAllConstantIndices())
- continue;
- TreeCost -= TTI->getArithmeticInstrCost(
- Instruction::Add, Ptr->getType(), TTI::TCK_RecipThroughput);
- }
- }
+ getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
InstructionCost Cost = TreeCost + ReductionCost;
LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for reduction\n");
if (!Cost.isValid())
@@ -12716,8 +13643,8 @@ public:
InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
// Vectorize a tree.
- Value *VectorizedRoot =
- V.vectorizeTree(LocalExternallyUsedValues, InsertPt);
+ Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues,
+ ReplacedExternals, InsertPt);
Builder.SetInsertPoint(InsertPt);
@@ -12727,29 +13654,48 @@ public:
if (isBoolLogicOp(RdxRootInst))
VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
+ // Emit code to correctly handle reused reduced values, if required.
+ if (OptReusedScalars && !SameScaleFactor) {
+ VectorizedRoot =
+ emitReusedOps(VectorizedRoot, Builder, V.getRootNodeScalars(),
+ SameValuesCounter, TrackedToOrig);
+ }
+
Value *ReducedSubTree =
emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
- if (!VectorizedTree) {
- // Initialize the final value in the reduction.
- VectorizedTree = ReducedSubTree;
- } else {
- // Update the final value in the reduction.
- Builder.SetCurrentDebugLocation(
- cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
- VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
- ReducedSubTree, "op.rdx", ReductionOps);
- }
+ // Improved analysis for add/fadd/xor reductions with the same scale factor
+ // for all operands of the reduction. We can emit scalar ops for them
+ // instead.
+ if (OptReusedScalars && SameScaleFactor)
+ ReducedSubTree = emitScaleForReusedOps(
+ ReducedSubTree, Builder, SameValuesCounter.front().second);
+
+ VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
// Count vectorized reduced values to exclude them from final reduction.
for (Value *RdxVal : VL) {
- ++VectorizedVals.try_emplace(TrackedToOrig.find(RdxVal)->second, 0)
- .first->getSecond();
+ Value *OrigV = TrackedToOrig.find(RdxVal)->second;
+ if (IsSupportedHorRdxIdentityOp) {
+ VectorizedVals.try_emplace(OrigV, SameValuesCounter[RdxVal]);
+ continue;
+ }
+ ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond();
if (!V.isVectorized(RdxVal))
RequiredExtract.insert(RdxVal);
}
Pos += ReduxWidth;
Start = Pos;
- ReduxWidth = PowerOf2Floor(NumReducedVals - Pos);
+ ReduxWidth = llvm::bit_floor(NumReducedVals - Pos);
+ AnyVectorized = true;
+ }
+ if (OptReusedScalars && !AnyVectorized) {
+ for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
+ Value *RedVal = emitScaleForReusedOps(P.first, Builder, P.second);
+ VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
+ Value *OrigV = TrackedToOrig.find(P.first)->second;
+ VectorizedVals.try_emplace(OrigV, P.second);
+ }
+ continue;
}
}
if (VectorizedTree) {
@@ -12757,7 +13703,7 @@ public:
// possible problem with poison propagation. If not possible to reorder
// (both operands are originally RHS), emit an extra freeze instruction
// for the LHS operand.
- //I.e., if we have original code like this:
+ // I.e., if we have original code like this:
// RedOp1 = select i1 ?, i1 LHS, i1 false
// RedOp2 = select i1 RHS, i1 ?, i1 false
@@ -12892,7 +13838,8 @@ private:
/// Calculate the cost of a reduction.
InstructionCost getReductionCost(TargetTransformInfo *TTI,
ArrayRef<Value *> ReducedVals,
- unsigned ReduxWidth, FastMathFlags FMF) {
+ bool IsCmpSelMinMax, unsigned ReduxWidth,
+ FastMathFlags FMF) {
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
Value *FirstReducedVal = ReducedVals.front();
Type *ScalarTy = FirstReducedVal->getType();
@@ -12900,7 +13847,36 @@ private:
InstructionCost VectorCost = 0, ScalarCost;
// If all of the reduced values are constant, the vector cost is 0, since
// the reduction value can be calculated at the compile time.
- bool AllConsts = all_of(ReducedVals, isConstant);
+ bool AllConsts = allConstant(ReducedVals);
+ auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
+ InstructionCost Cost = 0;
+ // Scalar cost is repeated for N-1 elements.
+ int Cnt = ReducedVals.size();
+ for (Value *RdxVal : ReducedVals) {
+ if (Cnt == 1)
+ break;
+ --Cnt;
+ if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
+ Cost += GenCostFn();
+ continue;
+ }
+ InstructionCost ScalarCost = 0;
+ for (User *U : RdxVal->users()) {
+ auto *RdxOp = cast<Instruction>(U);
+ if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
+ ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
+ continue;
+ }
+ ScalarCost = InstructionCost::getInvalid();
+ break;
+ }
+ if (ScalarCost.isValid())
+ Cost += ScalarCost;
+ else
+ Cost += GenCostFn();
+ }
+ return Cost;
+ };
switch (RdxKind) {
case RecurKind::Add:
case RecurKind::Mul:
@@ -12913,52 +13889,32 @@ private:
if (!AllConsts)
VectorCost =
TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
- ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
+ ScalarCost = EvaluateScalarCost([&]() {
+ return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
+ });
break;
}
case RecurKind::FMax:
- case RecurKind::FMin: {
- auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy);
- if (!AllConsts) {
- auto *VecCondTy =
- cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
- VectorCost =
- TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
- /*IsUnsigned=*/false, CostKind);
- }
- CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
- ScalarCost = TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy,
- SclCondTy, RdxPred, CostKind) +
- TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
- SclCondTy, RdxPred, CostKind);
- break;
- }
+ case RecurKind::FMin:
+ case RecurKind::FMaximum:
+ case RecurKind::FMinimum:
case RecurKind::SMax:
case RecurKind::SMin:
case RecurKind::UMax:
case RecurKind::UMin: {
- auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy);
- if (!AllConsts) {
- auto *VecCondTy =
- cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
- bool IsUnsigned =
- RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin;
- VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
- IsUnsigned, CostKind);
- }
- CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
- ScalarCost = TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
- SclCondTy, RdxPred, CostKind) +
- TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
- SclCondTy, RdxPred, CostKind);
+ Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
+ if (!AllConsts)
+ VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
+ ScalarCost = EvaluateScalarCost([&]() {
+ IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
+ return TTI->getIntrinsicInstrCost(ICA, CostKind);
+ });
break;
}
default:
llvm_unreachable("Expected arithmetic or min/max reduction operation");
}
- // Scalar cost is repeated for N-1 elements.
- ScalarCost *= (ReduxWidth - 1);
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
<< " for reduction that starts with " << *FirstReducedVal
<< " (It is a splitting reduction)\n");
@@ -12977,8 +13933,148 @@ private:
++NumVectorInstructions;
return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind);
}
-};
+ /// Emits optimized code for a unique scalar value reused \p Cnt times.
+ Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
+ unsigned Cnt) {
+ assert(IsSupportedHorRdxIdentityOp &&
+ "The optimization of matched scalar identity horizontal reductions "
+ "must be supported.");
+ switch (RdxKind) {
+ case RecurKind::Add: {
+ // res = mul vv, n
+ Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
+ LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
+ << VectorizedValue << ". (HorRdx)\n");
+ return Builder.CreateMul(VectorizedValue, Scale);
+ }
+ case RecurKind::Xor: {
+ // res = n % 2 ? vv : 0
+ LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
+ << ". (HorRdx)\n");
+ if (Cnt % 2 == 0)
+ return Constant::getNullValue(VectorizedValue->getType());
+ return VectorizedValue;
+ }
+ case RecurKind::FAdd: {
+ // res = fmul v, n
+ Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
+ LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
+ << VectorizedValue << ". (HorRdx)\n");
+ return Builder.CreateFMul(VectorizedValue, Scale);
+ }
+ case RecurKind::And:
+ case RecurKind::Or:
+ case RecurKind::SMax:
+ case RecurKind::SMin:
+ case RecurKind::UMax:
+ case RecurKind::UMin:
+ case RecurKind::FMax:
+ case RecurKind::FMin:
+ case RecurKind::FMaximum:
+ case RecurKind::FMinimum:
+ // res = vv
+ return VectorizedValue;
+ case RecurKind::Mul:
+ case RecurKind::FMul:
+ case RecurKind::FMulAdd:
+ case RecurKind::SelectICmp:
+ case RecurKind::SelectFCmp:
+ case RecurKind::None:
+ llvm_unreachable("Unexpected reduction kind for repeated scalar.");
+ }
+ return nullptr;
+ }
+
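
The per-kind cases in emitScaleForReusedOps above correspond to simple scalar identities. A minimal sketch, illustration only (helper names invented; the fadd case assumes the reassociation fast-math flags that reduction matching already requires):

// x + x + ... + x   (Cnt times) == x * Cnt
int scaleAdd(int X, unsigned Cnt) { return X * static_cast<int>(Cnt); }
// x ^ x ^ ... ^ x   (Cnt times) == x if Cnt is odd, 0 if Cnt is even
int scaleXor(int X, unsigned Cnt) { return (Cnt % 2) ? X : 0; }
// x + x + ... + x   (fadd, Cnt times) ~= x * Cnt under reassociation
double scaleFAdd(double X, unsigned Cnt) { return X * static_cast<double>(Cnt); }
// and/or/min/max of a value with itself is the value itself, so those kinds
// simply return the vectorized value unchanged.
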
+ /// Emits the actual operation for the scalar identity values, found during
+ /// horizontal reduction analysis.
+ Value *emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
+ ArrayRef<Value *> VL,
+ const MapVector<Value *, unsigned> &SameValuesCounter,
+ const DenseMap<Value *, Value *> &TrackedToOrig) {
+ assert(IsSupportedHorRdxIdentityOp &&
+ "The optimization of matched scalar identity horizontal reductions "
+ "must be supported.");
+ switch (RdxKind) {
+ case RecurKind::Add: {
+ // root = mul prev_root, <1, 1, n, 1>
+ SmallVector<Constant *> Vals;
+ for (Value *V : VL) {
+ unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
+ Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
+ }
+ auto *Scale = ConstantVector::get(Vals);
+ LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
+ << VectorizedValue << ". (HorRdx)\n");
+ return Builder.CreateMul(VectorizedValue, Scale);
+ }
+ case RecurKind::And:
+ case RecurKind::Or:
+ // No need for multiple or/and(s).
+ LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
+ << ". (HorRdx)\n");
+ return VectorizedValue;
+ case RecurKind::SMax:
+ case RecurKind::SMin:
+ case RecurKind::UMax:
+ case RecurKind::UMin:
+ case RecurKind::FMax:
+ case RecurKind::FMin:
+ case RecurKind::FMaximum:
+ case RecurKind::FMinimum:
+ // No need for multiple min/max(s) of the same value.
+ LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
+ << ". (HorRdx)\n");
+ return VectorizedValue;
+ case RecurKind::Xor: {
+ // Replace values with an even number of repeats with 0, since
+ // x xor x = 0.
+ // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 6, 7>,
+ // if the 4th and 6th elements have an even number of repeats.
+ SmallVector<int> Mask(
+ cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
+ PoisonMaskElem);
+ std::iota(Mask.begin(), Mask.end(), 0);
+ bool NeedShuffle = false;
+ for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
+ Value *V = VL[I];
+ unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
+ if (Cnt % 2 == 0) {
+ Mask[I] = VF;
+ NeedShuffle = true;
+ }
+ }
+ LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
+ : Mask) dbgs()
+ << I << " ";
+ dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
+ if (NeedShuffle)
+ VectorizedValue = Builder.CreateShuffleVector(
+ VectorizedValue,
+ ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
+ return VectorizedValue;
+ }
+ case RecurKind::FAdd: {
+ // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
+ SmallVector<Constant *> Vals;
+ for (Value *V : VL) {
+ unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
+ Vals.push_back(ConstantFP::get(V->getType(), Cnt));
+ }
+ auto *Scale = ConstantVector::get(Vals);
+ return Builder.CreateFMul(VectorizedValue, Scale);
+ }
+ case RecurKind::Mul:
+ case RecurKind::FMul:
+ case RecurKind::FMulAdd:
+ case RecurKind::SelectICmp:
+ case RecurKind::SelectFCmp:
+ case RecurKind::None:
+ llvm_unreachable("Unexpected reduction kind for reused scalars.");
+ }
+ return nullptr;
+ }
+};
} // end anonymous namespace
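
For the general case handled by emitReusedOps above, where repeat counts differ between values, a rough scalar model (illustration only; not code from the patch):

#include <cstddef>
// Add-reduce a list such as {a, b, b, b, c, c} by reducing the unique values
// once and weighting each lane by its repeat count, the scalar analogue of
// "root = mul prev_root, <1, 3, 2>" followed by the usual vector reduce.
int reduceWithRepeats(const int *Unique, const unsigned *RepeatCount,
                      std::size_t NumUnique) {
  int Sum = 0;
  for (std::size_t I = 0; I < NumUnique; ++I)
    Sum += Unique[I] * static_cast<int>(RepeatCount[I]);
  return Sum;
}
// For xor the per-lane weight degenerates to a parity mask: lanes whose value
// repeats an even number of times are zeroed via the shuffle shown above.
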
static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
@@ -13075,15 +14171,15 @@ static bool findBuildAggregate(Instruction *LastInsertInst,
return false;
}
-/// Try and get a reduction value from a phi node.
+/// Try and get a reduction instruction from a phi node.
///
/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
/// if they come from either \p ParentBB or a containing loop latch.
///
/// \returns A candidate reduction value if possible, or \code nullptr \endcode
/// if not possible.
-static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
- BasicBlock *ParentBB, LoopInfo *LI) {
+static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
+ BasicBlock *ParentBB, LoopInfo *LI) {
// There are situations where the reduction value is not dominated by the
// reduction phi. Vectorizing such cases has been reported to cause
// miscompiles. See PR25787.
@@ -13092,13 +14188,13 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
};
- Value *Rdx = nullptr;
+ Instruction *Rdx = nullptr;
// Return the incoming value if it comes from the same BB as the phi node.
if (P->getIncomingBlock(0) == ParentBB) {
- Rdx = P->getIncomingValue(0);
+ Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
} else if (P->getIncomingBlock(1) == ParentBB) {
- Rdx = P->getIncomingValue(1);
+ Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
}
if (Rdx && DominatedReduxValue(Rdx))
@@ -13115,9 +14211,9 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
// There is a loop latch, return the incoming value if it comes from
// that. This reduction pattern occasionally turns up.
if (P->getIncomingBlock(0) == BBLatch) {
- Rdx = P->getIncomingValue(0);
+ Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
} else if (P->getIncomingBlock(1) == BBLatch) {
- Rdx = P->getIncomingValue(1);
+ Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
}
if (Rdx && DominatedReduxValue(Rdx))
@@ -13133,6 +14229,10 @@ static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
return true;
if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
return true;
+ if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V0), m_Value(V1))))
+ return true;
+ if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0), m_Value(V1))))
+ return true;
if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
return true;
if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
@@ -13144,21 +14244,63 @@ static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
return false;
}
+/// We could have an initial reduction that is not an add.
+/// r *= v1 + v2 + v3 + v4
+/// In such a case start looking for a tree rooted in the first '+'.
+/// \returns the new root if found, which may be nullptr if not an instruction.
+static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
+ Instruction *Root) {
+ assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
+ isa<IntrinsicInst>(Root)) &&
+ "Expected binop, select, or intrinsic for reduction matching");
+ Value *LHS =
+ Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
+ Value *RHS =
+ Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
+ if (LHS == Phi)
+ return dyn_cast<Instruction>(RHS);
+ if (RHS == Phi)
+ return dyn_cast<Instruction>(LHS);
+ return nullptr;
+}
+
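
A scalar shape of the pattern tryGetSecondaryReductionRoot handles (illustrative source only, not from the patch):

// The value merged into the loop-carried phi is a multiply, but the
// vectorizable reduction tree is rooted at the chain of adds feeding it.
float secondaryRootExample(float R, float V1, float V2, float V3, float V4) {
  R *= V1 + V2 + V3 + V4; // tree rooted at the first '+', not at '*'
  return R;
}
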
+/// \returns the first operand of \p I that does not match \p Phi. If the
+/// operand is not an instruction it returns nullptr.
+static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
+ Value *Op0 = nullptr;
+ Value *Op1 = nullptr;
+ if (!matchRdxBop(I, Op0, Op1))
+ return nullptr;
+ return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
+}
+
+/// \returns true if \p I is a candidate instruction for reduction vectorization.
+static bool isReductionCandidate(Instruction *I) {
+ bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
+ Value *B0 = nullptr, *B1 = nullptr;
+ bool IsBinop = matchRdxBop(I, B0, B1);
+ return IsBinop || IsSelect;
+}
+
bool SLPVectorizerPass::vectorizeHorReduction(
- PHINode *P, Value *V, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI,
+ PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI,
SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
if (!ShouldVectorizeHor)
return false;
+ bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
- auto *Root = dyn_cast_or_null<Instruction>(V);
- if (!Root)
+ if (Root->getParent() != BB || isa<PHINode>(Root))
return false;
- if (!isa<BinaryOperator>(Root))
- P = nullptr;
+ // If we can find a secondary reduction root, use that instead.
+ auto SelectRoot = [&]() {
+ if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
+ HorizontalReduction::getRdxKind(Root) != RecurKind::None)
+ if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
+ return NewRoot;
+ return Root;
+ };
- if (Root->getParent() != BB || isa<PHINode>(Root))
- return false;
// Start analysis starting from Root instruction. If horizontal reduction is
// found, try to vectorize it. If it is not a horizontal reduction or
// vectorization is not possible or not effective, and currently analyzed
@@ -13171,22 +14313,32 @@ bool SLPVectorizerPass::vectorizeHorReduction(
// If a horizontal reduction was not matched or vectorized we collect
// instructions for possible later attempts for vectorization.
std::queue<std::pair<Instruction *, unsigned>> Stack;
- Stack.emplace(Root, 0);
+ Stack.emplace(SelectRoot(), 0);
SmallPtrSet<Value *, 8> VisitedInstrs;
bool Res = false;
- auto &&TryToReduce = [this, TTI, &P, &R](Instruction *Inst, Value *&B0,
- Value *&B1) -> Value * {
+ auto &&TryToReduce = [this, TTI, &R](Instruction *Inst) -> Value * {
if (R.isAnalyzedReductionRoot(Inst))
return nullptr;
- bool IsBinop = matchRdxBop(Inst, B0, B1);
- bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value()));
- if (IsBinop || IsSelect) {
- HorizontalReduction HorRdx;
- if (HorRdx.matchAssociativeReduction(P, Inst, *SE, *DL, *TLI))
- return HorRdx.tryToReduce(R, TTI, *TLI);
+ if (!isReductionCandidate(Inst))
+ return nullptr;
+ HorizontalReduction HorRdx;
+ if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
+ return nullptr;
+ return HorRdx.tryToReduce(R, TTI, *TLI);
+ };
+ auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
+ if (TryOperandsAsNewSeeds && FutureSeed == Root) {
+ FutureSeed = getNonPhiOperand(Root, P);
+ if (!FutureSeed)
+ return false;
}
- return nullptr;
+ // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
+ // analysis is done separately.
+ if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
+ PostponedInsts.push_back(FutureSeed);
+ return true;
};
+
while (!Stack.empty()) {
Instruction *Inst;
unsigned Level;
@@ -13197,37 +14349,19 @@ bool SLPVectorizerPass::vectorizeHorReduction(
// iteration while stack was populated before that happened.
if (R.isDeleted(Inst))
continue;
- Value *B0 = nullptr, *B1 = nullptr;
- if (Value *V = TryToReduce(Inst, B0, B1)) {
+ if (Value *VectorizedV = TryToReduce(Inst)) {
Res = true;
- // Set P to nullptr to avoid re-analysis of phi node in
- // matchAssociativeReduction function unless this is the root node.
- P = nullptr;
- if (auto *I = dyn_cast<Instruction>(V)) {
+ if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
// Try to find another reduction.
Stack.emplace(I, Level);
continue;
}
} else {
- bool IsBinop = B0 && B1;
- if (P && IsBinop) {
- Inst = dyn_cast<Instruction>(B0);
- if (Inst == P)
- Inst = dyn_cast<Instruction>(B1);
- if (!Inst) {
- // Set P to nullptr to avoid re-analysis of phi node in
- // matchAssociativeReduction function unless this is the root node.
- P = nullptr;
- continue;
- }
+ // We could not vectorize `Inst` so try to use it as a future seed.
+ if (!TryAppendToPostponedInsts(Inst)) {
+ assert(Stack.empty() && "Expected empty stack");
+ break;
}
- // Set P to nullptr to avoid re-analysis of phi node in
- // matchAssociativeReduction function unless this is the root node.
- P = nullptr;
- // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
- // analysis is done separately.
- if (!isa<CmpInst, InsertElementInst, InsertValueInst>(Inst))
- PostponedInsts.push_back(Inst);
}
// Try to vectorize operands.
@@ -13246,11 +14380,11 @@ bool SLPVectorizerPass::vectorizeHorReduction(
return Res;
}
-bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
+bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
BasicBlock *BB, BoUpSLP &R,
TargetTransformInfo *TTI) {
SmallVector<WeakTrackingVH> PostponedInsts;
- bool Res = vectorizeHorReduction(P, V, BB, R, TTI, PostponedInsts);
+ bool Res = vectorizeHorReduction(P, Root, BB, R, TTI, PostponedInsts);
Res |= tryToVectorize(PostponedInsts, R);
return Res;
}
@@ -13297,13 +14431,11 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
}
template <typename T>
-static bool
-tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming,
- function_ref<unsigned(T *)> Limit,
- function_ref<bool(T *, T *)> Comparator,
- function_ref<bool(T *, T *)> AreCompatible,
- function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
- bool LimitForRegisterSize) {
+static bool tryToVectorizeSequence(
+ SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
+ function_ref<bool(T *, T *)> AreCompatible,
+ function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
+ bool MaxVFOnly, BoUpSLP &R) {
bool Changed = false;
// Sort by type, parent, operands.
stable_sort(Incoming, Comparator);
@@ -13331,21 +14463,29 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming,
// same/alternate ops only, this may result in some extra final
// vectorization.
if (NumElts > 1 &&
- TryToVectorizeHelper(ArrayRef(IncIt, NumElts), LimitForRegisterSize)) {
+ TryToVectorizeHelper(ArrayRef(IncIt, NumElts), MaxVFOnly)) {
// Success start over because instructions might have been changed.
Changed = true;
- } else if (NumElts < Limit(*IncIt) &&
- (Candidates.empty() ||
- Candidates.front()->getType() == (*IncIt)->getType())) {
- Candidates.append(IncIt, std::next(IncIt, NumElts));
+ } else {
+ /// \returns the minimum number of elements that we will attempt to
+ /// vectorize.
+ auto GetMinNumElements = [&R](Value *V) {
+ unsigned EltSize = R.getVectorElementSize(V);
+ return std::max(2U, R.getMaxVecRegSize() / EltSize);
+ };
+ if (NumElts < GetMinNumElements(*IncIt) &&
+ (Candidates.empty() ||
+ Candidates.front()->getType() == (*IncIt)->getType())) {
+ Candidates.append(IncIt, std::next(IncIt, NumElts));
+ }
}
// Final attempt to vectorize instructions with the same types.
if (Candidates.size() > 1 &&
(SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
- if (TryToVectorizeHelper(Candidates, /*LimitForRegisterSize=*/false)) {
+ if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
// Success start over because instructions might have been changed.
Changed = true;
- } else if (LimitForRegisterSize) {
+ } else if (MaxVFOnly) {
// Try to vectorize using small vectors.
for (auto *It = Candidates.begin(), *End = Candidates.end();
It != End;) {
@@ -13353,9 +14493,8 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming,
while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It))
++SameTypeIt;
unsigned NumElts = (SameTypeIt - It);
- if (NumElts > 1 &&
- TryToVectorizeHelper(ArrayRef(It, NumElts),
- /*LimitForRegisterSize=*/false))
+ if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(It, NumElts),
+ /*MaxVFOnly=*/false))
Changed = true;
It = SameTypeIt;
}
@@ -13378,11 +14517,12 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming,
/// of the second cmp instruction.
template <bool IsCompatibility>
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
- function_ref<bool(Instruction *)> IsDeleted) {
+ const DominatorTree &DT) {
+ assert(isValidElementType(V->getType()) &&
+ isValidElementType(V2->getType()) &&
+ "Expected valid element types only.");
auto *CI1 = cast<CmpInst>(V);
auto *CI2 = cast<CmpInst>(V2);
- if (IsDeleted(CI2) || !isValidElementType(CI2->getType()))
- return false;
if (CI1->getOperand(0)->getType()->getTypeID() <
CI2->getOperand(0)->getType()->getTypeID())
return !IsCompatibility;
@@ -13411,31 +14551,102 @@ static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
return false;
if (auto *I1 = dyn_cast<Instruction>(Op1))
if (auto *I2 = dyn_cast<Instruction>(Op2)) {
- if (I1->getParent() != I2->getParent())
- return false;
+ if (IsCompatibility) {
+ if (I1->getParent() != I2->getParent())
+ return false;
+ } else {
+ // Try to compare nodes with same parent.
+ DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
+ DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
+ if (!NodeI1)
+ return NodeI2 != nullptr;
+ if (!NodeI2)
+ return false;
+ assert((NodeI1 == NodeI2) ==
+ (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
+ "Different nodes should have different DFS numbers");
+ if (NodeI1 != NodeI2)
+ return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
+ }
InstructionsState S = getSameOpcode({I1, I2}, TLI);
- if (S.getOpcode())
+ if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
continue;
- return false;
+ return !IsCompatibility && I1->getOpcode() < I2->getOpcode();
}
}
return IsCompatibility;
}
-bool SLPVectorizerPass::vectorizeSimpleInstructions(InstSetVector &Instructions,
- BasicBlock *BB, BoUpSLP &R,
- bool AtTerminator) {
+template <typename ItT>
+bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
+ BasicBlock *BB, BoUpSLP &R) {
+ bool Changed = false;
+ // Try to find reductions first.
+ for (CmpInst *I : CmpInsts) {
+ if (R.isDeleted(I))
+ continue;
+ for (Value *Op : I->operands())
+ if (auto *RootOp = dyn_cast<Instruction>(Op))
+ Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R, TTI);
+ }
+ // Try to vectorize operands as vector bundles.
+ for (CmpInst *I : CmpInsts) {
+ if (R.isDeleted(I))
+ continue;
+ Changed |= tryToVectorize(I, R);
+ }
+ // Try to vectorize list of compares.
+ // Sort by type, compare predicate, etc.
+ auto CompareSorter = [&](Value *V, Value *V2) {
+ if (V == V2)
+ return false;
+ return compareCmp<false>(V, V2, *TLI, *DT);
+ };
+
+ auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
+ if (V1 == V2)
+ return true;
+ return compareCmp<true>(V1, V2, *TLI, *DT);
+ };
+
+ SmallVector<Value *> Vals;
+ for (Instruction *V : CmpInsts)
+ if (!R.isDeleted(V) && isValidElementType(V->getType()))
+ Vals.push_back(V);
+ if (Vals.size() <= 1)
+ return Changed;
+ Changed |= tryToVectorizeSequence<Value>(
+ Vals, CompareSorter, AreCompatibleCompares,
+ [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
+ // Exclude possible reductions from other blocks.
+ bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
+ return any_of(V->users(), [V](User *U) {
+ auto *Select = dyn_cast<SelectInst>(U);
+ return Select &&
+ Select->getParent() != cast<Instruction>(V)->getParent();
+ });
+ });
+ if (ArePossiblyReducedInOtherBlock)
+ return false;
+ return tryToVectorizeList(Candidates, R, MaxVFOnly);
+ },
+ /*MaxVFOnly=*/true, R);
+ return Changed;
+}
+
+bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
+ BasicBlock *BB, BoUpSLP &R) {
+ assert(all_of(Instructions,
+ [](auto *I) {
+ return isa<InsertElementInst, InsertValueInst>(I);
+ }) &&
+ "This function only accepts Insert instructions");
bool OpsChanged = false;
- SmallVector<Instruction *, 4> PostponedCmps;
SmallVector<WeakTrackingVH> PostponedInsts;
// pass1 - try to vectorize reductions only
for (auto *I : reverse(Instructions)) {
if (R.isDeleted(I))
continue;
- if (isa<CmpInst>(I)) {
- PostponedCmps.push_back(I);
- continue;
- }
OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, TTI, PostponedInsts);
}
// pass2 - try to match and vectorize a buildvector sequence.
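
The new vectorizeCmpInsts above feeds CompareSorter and AreCompatibleCompares into tryToVectorizeSequence. The self-contained model below sketches that sort-then-group-adjacent driver using plain ints and hypothetical names (processSequence, Less, Compatible, TryVectorize); the real helper also handles retries and the MaxVFOnly hint, which this rough sketch omits.

#include <algorithm>
#include <cstddef>
#include <functional>
#include <iostream>
#include <vector>

// Sort candidates with a strict weak ordering, then walk the sorted list and
// hand each run of mutually "compatible" adjacent elements to a callback.
static bool processSequence(
    std::vector<int> &Vals, const std::function<bool(int, int)> &Less,
    const std::function<bool(int, int)> &Compatible,
    const std::function<bool(const std::vector<int> &)> &TryVectorize) {
  bool Changed = false;
  std::stable_sort(Vals.begin(), Vals.end(), Less);
  for (std::size_t I = 0, E = Vals.size(); I < E;) {
    std::size_t GroupEnd = I + 1;
    while (GroupEnd < E && Compatible(Vals[I], Vals[GroupEnd]))
      ++GroupEnd;
    if (GroupEnd - I > 1)
      Changed |= TryVectorize(
          std::vector<int>(Vals.begin() + I, Vals.begin() + GroupEnd));
    I = GroupEnd;
  }
  return Changed;
}

int main() {
  // "Compatible" here just means equal parity; the pass compares operand
  // types, predicates and opcodes instead.
  std::vector<int> Vals = {7, 2, 9, 4, 3, 8};
  bool Changed = processSequence(
      Vals,
      [](int A, int B) { return A % 2 != B % 2 ? A % 2 < B % 2 : A < B; },
      [](int A, int B) { return A % 2 == B % 2; },
      [](const std::vector<int> &Group) {
        std::cout << "candidate group of size " << Group.size() << '\n';
        return true;
      });
  std::cout << (Changed ? "changed\n" : "unchanged\n");
  return 0;
}
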
@@ -13451,63 +14662,7 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions(InstSetVector &Instructions,
// Now try to vectorize postponed instructions.
OpsChanged |= tryToVectorize(PostponedInsts, R);
- if (AtTerminator) {
- // Try to find reductions first.
- for (Instruction *I : PostponedCmps) {
- if (R.isDeleted(I))
- continue;
- for (Value *Op : I->operands())
- OpsChanged |= vectorizeRootInstruction(nullptr, Op, BB, R, TTI);
- }
- // Try to vectorize operands as vector bundles.
- for (Instruction *I : PostponedCmps) {
- if (R.isDeleted(I))
- continue;
- OpsChanged |= tryToVectorize(I, R);
- }
- // Try to vectorize list of compares.
- // Sort by type, compare predicate, etc.
- auto CompareSorter = [&](Value *V, Value *V2) {
- return compareCmp<false>(V, V2, *TLI,
- [&R](Instruction *I) { return R.isDeleted(I); });
- };
-
- auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
- if (V1 == V2)
- return true;
- return compareCmp<true>(V1, V2, *TLI,
- [&R](Instruction *I) { return R.isDeleted(I); });
- };
- auto Limit = [&R](Value *V) {
- unsigned EltSize = R.getVectorElementSize(V);
- return std::max(2U, R.getMaxVecRegSize() / EltSize);
- };
-
- SmallVector<Value *> Vals(PostponedCmps.begin(), PostponedCmps.end());
- OpsChanged |= tryToVectorizeSequence<Value>(
- Vals, Limit, CompareSorter, AreCompatibleCompares,
- [this, &R](ArrayRef<Value *> Candidates, bool LimitForRegisterSize) {
- // Exclude possible reductions from other blocks.
- bool ArePossiblyReducedInOtherBlock =
- any_of(Candidates, [](Value *V) {
- return any_of(V->users(), [V](User *U) {
- return isa<SelectInst>(U) &&
- cast<SelectInst>(U)->getParent() !=
- cast<Instruction>(V)->getParent();
- });
- });
- if (ArePossiblyReducedInOtherBlock)
- return false;
- return tryToVectorizeList(Candidates, R, LimitForRegisterSize);
- },
- /*LimitForRegisterSize=*/true);
- Instructions.clear();
- } else {
- Instructions.clear();
- // Insert in reverse order since the PostponedCmps vector was filled in
- // reverse order.
- Instructions.insert(PostponedCmps.rbegin(), PostponedCmps.rend());
- }
+ Instructions.clear();
return OpsChanged;
}
@@ -13603,10 +14758,6 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
}
return true;
};
- auto Limit = [&R](Value *V) {
- unsigned EltSize = R.getVectorElementSize(V);
- return std::max(2U, R.getMaxVecRegSize() / EltSize);
- };
bool HaveVectorizedPhiNodes = false;
do {
@@ -13648,19 +14799,44 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
}
HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
- Incoming, Limit, PHICompare, AreCompatiblePHIs,
- [this, &R](ArrayRef<Value *> Candidates, bool LimitForRegisterSize) {
- return tryToVectorizeList(Candidates, R, LimitForRegisterSize);
+ Incoming, PHICompare, AreCompatiblePHIs,
+ [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
+ return tryToVectorizeList(Candidates, R, MaxVFOnly);
},
- /*LimitForRegisterSize=*/true);
+ /*MaxVFOnly=*/true, R);
Changed |= HaveVectorizedPhiNodes;
VisitedInstrs.insert(Incoming.begin(), Incoming.end());
} while (HaveVectorizedPhiNodes);
VisitedInstrs.clear();
- InstSetVector PostProcessInstructions;
- SmallDenseSet<Instruction *, 4> KeyNodes;
+ InstSetVector PostProcessInserts;
+ SmallSetVector<CmpInst *, 8> PostProcessCmps;
+ // Vectorizes Inserts in `PostProcessInserts` and, if `VectorizeCmps` is
+ // true, also vectorizes `PostProcessCmps`.
+ auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
+ bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
+ if (VectorizeCmps) {
+ Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
+ PostProcessCmps.clear();
+ }
+ PostProcessInserts.clear();
+ return Changed;
+ };
+ // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
+ auto IsInPostProcessInstrs = [&](Instruction *I) {
+ if (auto *Cmp = dyn_cast<CmpInst>(I))
+ return PostProcessCmps.contains(Cmp);
+ return isa<InsertElementInst, InsertValueInst>(I) &&
+ PostProcessInserts.contains(I);
+ };
+ // Returns true if `I` is an instruction without users, e.g. a terminator,
+ // a store, or a call whose result is ignored. Only void-typed instructions
+ // qualify, except for CallInst and InvokeInst, which qualify even when
+ // their non-void result is unused.
+ auto HasNoUsers = [](Instruction *I) {
+ return I->use_empty() &&
+ (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
+ };
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
// Skip instructions with scalable type. The num of elements is unknown at
// compile-time for scalable type.
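
The PostProcessInserts / PostProcessCmps split introduced above implements a simple scheduling policy: postponed insert candidates are flushed at every user-less "key" instruction, while postponed compares are flushed only once the block terminator is reached. The toy model below, with invented names (FakeInst, Flush) and no LLVM types, is only a rough illustration of that policy, not of the pass itself.

#include <iostream>
#include <string>
#include <vector>

struct FakeInst {
  std::string Name;
  bool IsCmp = false;
  bool IsInsert = false;
  bool IsTerminator = false;
};

int main() {
  std::vector<FakeInst> Block = {
      {"insert1", false, true}, {"cmp1", true}, {"store1"},
      {"insert2", false, true}, {"cmp2", true}, {"ret", false, false, true}};

  std::vector<std::string> PostponedInserts, PostponedCmps;
  auto Flush = [&](bool VectorizeCmps) {
    for (const std::string &N : PostponedInserts)
      std::cout << "try insert bundle around " << N << '\n';
    PostponedInserts.clear();
    if (VectorizeCmps) {
      for (const std::string &N : PostponedCmps)
        std::cout << "try cmp bundle around " << N << '\n';
      PostponedCmps.clear();
    }
  };

  for (const FakeInst &I : Block) {
    if (I.IsInsert) {
      PostponedInserts.push_back(I.Name);
      continue;
    }
    if (I.IsCmp) {
      PostponedCmps.push_back(I.Name);
      continue;
    }
    // A user-less "key" instruction triggers a flush; compares are only
    // flushed when that key instruction is the terminator.
    Flush(/*VectorizeCmps=*/I.IsTerminator);
  }
  return 0;
}
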
@@ -13672,9 +14848,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
continue;
// We may go through BB multiple times so skip the one we have checked.
if (!VisitedInstrs.insert(&*it).second) {
- if (it->use_empty() && KeyNodes.contains(&*it) &&
- vectorizeSimpleInstructions(PostProcessInstructions, BB, R,
- it->isTerminator())) {
+ if (HasNoUsers(&*it) &&
+ VectorizeInsertsAndCmps(/*VectorizeCmps=*/it->isTerminator())) {
// We would like to start over since some instructions are deleted
// and the iterator may become invalid value.
Changed = true;
@@ -13692,8 +14867,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// Check that the PHI is a reduction PHI.
if (P->getNumIncomingValues() == 2) {
// Try to match and vectorize a horizontal reduction.
- if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
- TTI)) {
+ Instruction *Root = getReductionInstr(DT, P, BB, LI);
+ if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) {
Changed = true;
it = BB->begin();
e = BB->end();
@@ -13714,19 +14889,14 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// Postponed instructions should not be vectorized here, delay their
// vectorization.
if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
- PI && !PostProcessInstructions.contains(PI))
- Changed |= vectorizeRootInstruction(nullptr, P->getIncomingValue(I),
+ PI && !IsInPostProcessInstrs(PI))
+ Changed |= vectorizeRootInstruction(nullptr, PI,
P->getIncomingBlock(I), R, TTI);
}
continue;
}
- // Ran into an instruction without users, like terminator, or function call
- // with ignored return value, store. Ignore unused instructions (basing on
- // instruction type, except for CallInst and InvokeInst).
- if (it->use_empty() &&
- (it->getType()->isVoidTy() || isa<CallInst, InvokeInst>(it))) {
- KeyNodes.insert(&*it);
+ if (HasNoUsers(&*it)) {
bool OpsChanged = false;
auto *SI = dyn_cast<StoreInst>(it);
bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
@@ -13746,16 +14916,16 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// Postponed instructions should not be vectorized here, delay their
// vectorization.
if (auto *VI = dyn_cast<Instruction>(V);
- VI && !PostProcessInstructions.contains(VI))
+ VI && !IsInPostProcessInstrs(VI))
// Try to match and vectorize a horizontal reduction.
- OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI);
+ OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R, TTI);
}
}
// Start vectorization of post-process list of instructions from the
// top-tree instructions to try to vectorize as many instructions as
// possible.
- OpsChanged |= vectorizeSimpleInstructions(PostProcessInstructions, BB, R,
- it->isTerminator());
+ OpsChanged |=
+ VectorizeInsertsAndCmps(/*VectorizeCmps=*/it->isTerminator());
if (OpsChanged) {
// We would like to start over since some instructions are deleted
// and the iterator may become invalid value.
@@ -13766,8 +14936,10 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
}
}
- if (isa<CmpInst, InsertElementInst, InsertValueInst>(it))
- PostProcessInstructions.insert(&*it);
+ if (isa<InsertElementInst, InsertValueInst>(it))
+ PostProcessInserts.insert(&*it);
+ else if (isa<CmpInst>(it))
+ PostProcessCmps.insert(cast<CmpInst>(&*it));
}
return Changed;
@@ -13928,10 +15100,6 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
return V1->getValueOperand()->getValueID() ==
V2->getValueOperand()->getValueID();
};
- auto Limit = [&R, this](StoreInst *SI) {
- unsigned EltSize = DL->getTypeSizeInBits(SI->getValueOperand()->getType());
- return R.getMinVF(EltSize);
- };
// Attempt to sort and vectorize each of the store-groups.
for (auto &Pair : Stores) {
@@ -13945,28 +15113,11 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
continue;
Changed |= tryToVectorizeSequence<StoreInst>(
- Pair.second, Limit, StoreSorter, AreCompatibleStores,
+ Pair.second, StoreSorter, AreCompatibleStores,
[this, &R](ArrayRef<StoreInst *> Candidates, bool) {
return vectorizeStores(Candidates, R);
},
- /*LimitForRegisterSize=*/false);
+ /*MaxVFOnly=*/false, R);
}
return Changed;
}
-
-char SLPVectorizer::ID = 0;
-
-static const char lv_name[] = "SLP Vectorizer";
-
-INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
-INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
-INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
-
-Pass *llvm::createSLPVectorizerPass() { return new SLPVectorizer(); }
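
A small C++ detail from the vectorizeStoreChains hunk above: the callback handed to tryToVectorizeSequence receives the MaxVFOnly flag but leaves the bool parameter unnamed, because vectorizeStores() does not consult it. A minimal, hypothetical standalone example of the same idiom, using plain std::function in place of the pass's callback type:

#include <functional>
#include <iostream>
#include <vector>

int main() {
  // The callback signature requires (candidates, flag); this callee does not
  // need the flag, so the parameter is left unnamed, which also silences
  // -Wunused-parameter.
  std::function<bool(const std::vector<int> &, bool)> TryVectorize =
      [](const std::vector<int> &Candidates, bool /*MaxVFOnly*/) {
        std::cout << "trying a chain of " << Candidates.size() << " stores\n";
        return Candidates.size() > 1;
      };

  std::vector<int> Chain = {1, 2, 3, 4};
  bool Changed = TryVectorize(Chain, /*MaxVFOnly=*/false);
  std::cout << (Changed ? "changed\n" : "unchanged\n");
  return 0;
}
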