author    | Dimitry Andric <dim@FreeBSD.org> | 2023-12-18 20:30:12 +0000
committer | Dimitry Andric <dim@FreeBSD.org> | 2024-04-06 20:11:55 +0000
commit    | 5f757f3ff9144b609b3c433dfd370cc6bdc191ad (patch)
tree      | 1b4e980b866cd26a00af34c0a653eb640bd09caf /contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
parent    | 3e1c8a35f741a5d114d0ba670b15191355711fe9 (diff)
parent    | 312c0ed19cc5276a17bacf2120097bec4515b0f1 (diff)
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 4231
1 file changed, 2733 insertions, 1498 deletions
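For readers skimming the diff below: the SLP (superword-level parallelism) vectorizer bundles groups of isomorphic, independent scalar operations into single vector instructions. A minimal illustrative C++ sketch of the straight-line pattern the pass targets (the function name and values are hypothetical, not taken from this commit):

```cpp
// Illustrative only: four adjacent, independent scalar operations of the same
// shape. The SLP vectorizer can bundle them into a single <4 x float>
// multiply and add.
void axpy4(float *a, const float *b, const float *c) {
  a[0] = 2.0f * b[0] + c[0];
  a[1] = 2.0f * b[1] + c[1];
  a[2] = 2.0f * b[2] + c[2];
  a[3] = 2.0f * b[3] + c[3];
}
```

The internal `vectorize-slp` option touched in the first hunk below gates this transformation.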
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 9870ffbb586c..9d799124074c 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -19,7 +19,6 @@
 #include "llvm/Transforms/Vectorize/SLPVectorizer.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/PriorityQueue.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetOperations.h"
@@ -34,6 +33,7 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/DemandedBits.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/IVDescriptors.h"
@@ -97,7 +97,6 @@
 #include <string>
 #include <tuple>
 #include <utility>
-#include <vector>
 
 using namespace llvm;
 using namespace llvm::PatternMatch;
@@ -108,8 +107,9 @@ using namespace slpvectorizer;
 
 STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
 
-cl::opt<bool> RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
-                                  cl::desc("Run the SLP vectorization passes"));
+static cl::opt<bool>
+    RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
+                        cl::desc("Run the SLP vectorization passes"));
 
 static cl::opt<int>
     SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
@@ -140,10 +140,6 @@
 static cl::opt<unsigned> MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
     cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
 
-static cl::opt<int>
-MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden,
-               cl::desc("Maximum depth of the lookup for consecutive stores."));
-
 /// Limits the size of scheduling regions in a block.
 /// It avoid long compile times for _very_ large blocks where vector
 /// instructions are spread over a wide range.
@@ -232,6 +228,17 @@ static bool isVectorLikeInstWithConstOps(Value *V) {
   return isConstant(I->getOperand(2));
 }
 
+#if !defined(NDEBUG)
+/// Print a short descriptor of the instruction bundle suitable for debug output.
+static std::string shortBundleName(ArrayRef<Value *> VL) {
+  std::string Result;
+  raw_string_ostream OS(Result);
+  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
+  OS.flush();
+  return Result;
+}
+#endif
+
 /// \returns true if all of the instructions in \p VL are in the same block or
 /// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) { @@ -384,8 +391,10 @@ static SmallBitVector isUndefVector(const Value *V, if (isa<T>(II->getOperand(1))) continue; std::optional<unsigned> Idx = getInsertIndex(II); - if (!Idx) - continue; + if (!Idx) { + Res.reset(); + return Res; + } if (*Idx < UseMask.size() && !UseMask.test(*Idx)) Res.reset(*Idx); } @@ -429,26 +438,6 @@ static SmallBitVector isUndefVector(const Value *V, /// i32 6> /// %2 = mul <4 x i8> %1, %1 /// ret <4 x i8> %2 -/// We convert this initially to something like: -/// %x0 = extractelement <4 x i8> %x, i32 0 -/// %x3 = extractelement <4 x i8> %x, i32 3 -/// %y1 = extractelement <4 x i8> %y, i32 1 -/// %y2 = extractelement <4 x i8> %y, i32 2 -/// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0 -/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1 -/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2 -/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3 -/// %5 = mul <4 x i8> %4, %4 -/// %6 = extractelement <4 x i8> %5, i32 0 -/// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0 -/// %7 = extractelement <4 x i8> %5, i32 1 -/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1 -/// %8 = extractelement <4 x i8> %5, i32 2 -/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2 -/// %9 = extractelement <4 x i8> %5, i32 3 -/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3 -/// ret <4 x i8> %ins4 -/// InstCombiner transforms this into a shuffle and vector mul /// Mask will return the Shuffle Mask equivalent to the extracted elements. /// TODO: Can we split off and reuse the shuffle mask detection from /// ShuffleVectorInst/getShuffleCost? @@ -539,117 +528,6 @@ static std::optional<unsigned> getExtractIndex(Instruction *E) { return *EI->idx_begin(); } -/// Tries to find extractelement instructions with constant indices from fixed -/// vector type and gather such instructions into a bunch, which highly likely -/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was -/// successful, the matched scalars are replaced by poison values in \p VL for -/// future analysis. -static std::optional<TTI::ShuffleKind> -tryToGatherExtractElements(SmallVectorImpl<Value *> &VL, - SmallVectorImpl<int> &Mask) { - // Scan list of gathered scalars for extractelements that can be represented - // as shuffles. - MapVector<Value *, SmallVector<int>> VectorOpToIdx; - SmallVector<int> UndefVectorExtracts; - for (int I = 0, E = VL.size(); I < E; ++I) { - auto *EI = dyn_cast<ExtractElementInst>(VL[I]); - if (!EI) { - if (isa<UndefValue>(VL[I])) - UndefVectorExtracts.push_back(I); - continue; - } - auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType()); - if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand())) - continue; - std::optional<unsigned> Idx = getExtractIndex(EI); - // Undefined index. - if (!Idx) { - UndefVectorExtracts.push_back(I); - continue; - } - SmallBitVector ExtractMask(VecTy->getNumElements(), true); - ExtractMask.reset(*Idx); - if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) { - UndefVectorExtracts.push_back(I); - continue; - } - VectorOpToIdx[EI->getVectorOperand()].push_back(I); - } - // Sort the vector operands by the maximum number of uses in extractelements. 
- MapVector<unsigned, SmallVector<Value *>> VFToVector; - for (const auto &Data : VectorOpToIdx) - VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()] - .push_back(Data.first); - for (auto &Data : VFToVector) { - stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) { - return VectorOpToIdx.find(V1)->second.size() > - VectorOpToIdx.find(V2)->second.size(); - }); - } - // Find the best pair of the vectors with the same number of elements or a - // single vector. - const int UndefSz = UndefVectorExtracts.size(); - unsigned SingleMax = 0; - Value *SingleVec = nullptr; - unsigned PairMax = 0; - std::pair<Value *, Value *> PairVec(nullptr, nullptr); - for (auto &Data : VFToVector) { - Value *V1 = Data.second.front(); - if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) { - SingleMax = VectorOpToIdx[V1].size() + UndefSz; - SingleVec = V1; - } - Value *V2 = nullptr; - if (Data.second.size() > 1) - V2 = *std::next(Data.second.begin()); - if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + - UndefSz) { - PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz; - PairVec = std::make_pair(V1, V2); - } - } - if (SingleMax == 0 && PairMax == 0 && UndefSz == 0) - return std::nullopt; - // Check if better to perform a shuffle of 2 vectors or just of a single - // vector. - SmallVector<Value *> SavedVL(VL.begin(), VL.end()); - SmallVector<Value *> GatheredExtracts( - VL.size(), PoisonValue::get(VL.front()->getType())); - if (SingleMax >= PairMax && SingleMax) { - for (int Idx : VectorOpToIdx[SingleVec]) - std::swap(GatheredExtracts[Idx], VL[Idx]); - } else { - for (Value *V : {PairVec.first, PairVec.second}) - for (int Idx : VectorOpToIdx[V]) - std::swap(GatheredExtracts[Idx], VL[Idx]); - } - // Add extracts from undefs too. - for (int Idx : UndefVectorExtracts) - std::swap(GatheredExtracts[Idx], VL[Idx]); - // Check that gather of extractelements can be represented as just a - // shuffle of a single/two vectors the scalars are extracted from. - std::optional<TTI::ShuffleKind> Res = - isFixedVectorShuffle(GatheredExtracts, Mask); - if (!Res) { - // TODO: try to check other subsets if possible. - // Restore the original VL if attempt was not successful. - VL.swap(SavedVL); - return std::nullopt; - } - // Restore unused scalars from mask, if some of the extractelements were not - // selected for shuffle. - for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) { - auto *EI = dyn_cast<ExtractElementInst>(VL[I]); - if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) || - !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) || - is_contained(UndefVectorExtracts, I)) - continue; - if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I])) - std::swap(VL[I], GatheredExtracts[I]); - } - return Res; -} - namespace { /// Main data required for vectorization of instructions. @@ -695,7 +573,7 @@ static Value *isOneOf(const InstructionsState &S, Value *Op) { return S.OpValue; } -/// \returns true if \p Opcode is allowed as part of of the main/alternate +/// \returns true if \p Opcode is allowed as part of the main/alternate /// instruction for SLP vectorization. /// /// Example of unsupported opcode is SDIV that can potentially cause UB if the @@ -889,18 +767,14 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL, /// \returns true if all of the values in \p VL have the same type or false /// otherwise. 
static bool allSameType(ArrayRef<Value *> VL) { - Type *Ty = VL[0]->getType(); - for (int i = 1, e = VL.size(); i < e; i++) - if (VL[i]->getType() != Ty) - return false; - - return true; + Type *Ty = VL.front()->getType(); + return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; }); } /// \returns True if in-tree use also needs extract. This refers to /// possible scalar operand in vectorized instruction. -static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, - TargetLibraryInfo *TLI) { +static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, + TargetLibraryInfo *TLI) { unsigned Opcode = UserInst->getOpcode(); switch (Opcode) { case Instruction::Load: { @@ -914,11 +788,10 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, case Instruction::Call: { CallInst *CI = cast<CallInst>(UserInst); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) { - if (isVectorIntrinsicWithScalarOpAtArg(ID, i)) - return (CI->getArgOperand(i) == Scalar); - } - [[fallthrough]]; + return any_of(enumerate(CI->args()), [&](auto &&Arg) { + return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) && + Arg.value().get() == Scalar; + }); } default: return false; @@ -1181,6 +1054,7 @@ public: void deleteTree() { VectorizableTree.clear(); ScalarToTreeEntry.clear(); + MultiNodeScalars.clear(); MustGather.clear(); EntryToLastInstruction.clear(); ExternalUses.clear(); @@ -1273,7 +1147,7 @@ public: /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on. /// /// \returns number of elements in vector if isomorphism exists, 0 otherwise. - unsigned canMapToVector(Type *T, const DataLayout &DL) const; + unsigned canMapToVector(Type *T) const; /// \returns True if the VectorizableTree is both tiny and not fully /// vectorizable. We do not vectorize such trees. @@ -1324,6 +1198,9 @@ public: } LLVM_DUMP_METHOD void dump() const { dump(dbgs()); } #endif + bool operator == (const EdgeInfo &Other) const { + return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx; + } }; /// A helper class used for scoring candidates for two consecutive lanes. @@ -1764,7 +1641,7 @@ public: auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV); if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV)) return 0; - return R.areAllUsersVectorized(IdxLaneI, std::nullopt) + return R.areAllUsersVectorized(IdxLaneI) ? LookAheadHeuristics::ScoreAllUserVectorized : 0; } @@ -1941,7 +1818,7 @@ public: HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane); } else if (NumFreeOpsHash.NumOfAPOs == Min && NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) { - auto It = HashMap.find(NumFreeOpsHash.Hash); + auto *It = HashMap.find(NumFreeOpsHash.Hash); if (It == HashMap.end()) HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane); else @@ -2203,7 +2080,7 @@ public: for (int Pass = 0; Pass != 2; ++Pass) { // Check if no need to reorder operands since they're are perfect or // shuffled diamond match. - // Need to to do it to avoid extra external use cost counting for + // Need to do it to avoid extra external use cost counting for // shuffled matches, which may cause regressions. if (SkipReordering()) break; @@ -2388,6 +2265,18 @@ public: ~BoUpSLP(); private: + /// Determine if a vectorized value \p V in can be demoted to + /// a smaller type with a truncation. We collect the values that will be + /// demoted in ToDemote and additional roots that require investigating in + /// Roots. 
+ /// \param DemotedConsts list of Instruction/OperandIndex pairs that are + /// constant and to be demoted. Required to correctly identify constant nodes + /// to be demoted. + bool collectValuesToDemote( + Value *V, SmallVectorImpl<Value *> &ToDemote, + DenseMap<Instruction *, SmallVector<unsigned>> &DemotedConsts, + SmallVectorImpl<Value *> &Roots, DenseSet<Value *> &Visited) const; + /// Check if the operands on the edges \p Edges of the \p UserTE allows /// reordering (i.e. the operands can be reordered because they have only one /// user and reordarable). @@ -2410,12 +2299,25 @@ private: TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) { ArrayRef<Value *> VL = UserTE->getOperand(OpIdx); TreeEntry *TE = nullptr; - const auto *It = find_if(VL, [this, &TE](Value *V) { + const auto *It = find_if(VL, [&](Value *V) { TE = getTreeEntry(V); - return TE; + if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) + return true; + auto It = MultiNodeScalars.find(V); + if (It != MultiNodeScalars.end()) { + for (TreeEntry *E : It->second) { + if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) { + TE = E; + return true; + } + } + } + return false; }); - if (It != VL.end() && TE->isSame(VL)) + if (It != VL.end()) { + assert(TE->isSame(VL) && "Expected same scalars."); return TE; + } return nullptr; } @@ -2428,13 +2330,16 @@ private: } /// Checks if all users of \p I are the part of the vectorization tree. - bool areAllUsersVectorized(Instruction *I, - ArrayRef<Value *> VectorizedVals) const; + bool areAllUsersVectorized( + Instruction *I, + const SmallDenseSet<Value *> *VectorizedVals = nullptr) const; /// Return information about the vector formed for the specified index /// of a vector of (the same) instruction. - TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> VL, - unsigned OpIdx); + TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops); + + /// \ returns the graph entry for the \p Idx operand of the \p E entry. + const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const; /// \returns the cost of the vectorizable entry. InstructionCost getEntryCost(const TreeEntry *E, @@ -2450,15 +2355,22 @@ private: /// vector) and sets \p CurrentOrder to the identity permutation; otherwise /// returns false, setting \p CurrentOrder to either an empty vector or a /// non-identity permutation that allows to reuse extract instructions. + /// \param ResizeAllowed indicates whether it is allowed to handle subvector + /// extract order. bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, - SmallVectorImpl<unsigned> &CurrentOrder) const; + SmallVectorImpl<unsigned> &CurrentOrder, + bool ResizeAllowed = false) const; /// Vectorize a single entry in the tree. - Value *vectorizeTree(TreeEntry *E); + /// \param PostponedPHIs true, if need to postpone emission of phi nodes to + /// avoid issues with def-use order. + Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs); /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry /// \p E. - Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx); + /// \param PostponedPHIs true, if need to postpone emission of phi nodes to + /// avoid issues with def-use order. + Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs); /// Create a new vector from a list of scalar values. Produces a sequence /// which exploits values reused across lanes, and arranges the inserts @@ -2477,17 +2389,50 @@ private: /// instruction in the list). 
Instruction &getLastInstructionInBundle(const TreeEntry *E); - /// Checks if the gathered \p VL can be represented as shuffle(s) of previous - /// tree entries. + /// Tries to find extractelement instructions with constant indices from fixed + /// vector type and gather such instructions into a bunch, which highly likely + /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt + /// was successful, the matched scalars are replaced by poison values in \p VL + /// for future analysis. + std::optional<TargetTransformInfo::ShuffleKind> + tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL, + SmallVectorImpl<int> &Mask) const; + + /// Tries to find extractelement instructions with constant indices from fixed + /// vector type and gather such instructions into a bunch, which highly likely + /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt + /// was successful, the matched scalars are replaced by poison values in \p VL + /// for future analysis. + SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> + tryToGatherExtractElements(SmallVectorImpl<Value *> &VL, + SmallVectorImpl<int> &Mask, + unsigned NumParts) const; + + /// Checks if the gathered \p VL can be represented as a single register + /// shuffle(s) of previous tree entries. /// \param TE Tree entry checked for permutation. /// \param VL List of scalars (a subset of the TE scalar), checked for - /// permutations. + /// permutations. Must form single-register vector. /// \returns ShuffleKind, if gathered values can be represented as shuffles of - /// previous tree entries. \p Mask is filled with the shuffle mask. + /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask. std::optional<TargetTransformInfo::ShuffleKind> - isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, - SmallVectorImpl<int> &Mask, - SmallVectorImpl<const TreeEntry *> &Entries); + isGatherShuffledSingleRegisterEntry( + const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask, + SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part); + + /// Checks if the gathered \p VL can be represented as multi-register + /// shuffle(s) of previous tree entries. + /// \param TE Tree entry checked for permutation. + /// \param VL List of scalars (a subset of the TE scalar), checked for + /// permutations. + /// \returns per-register series of ShuffleKind, if gathered values can be + /// represented as shuffles of previous tree entries. \p Mask is filled with + /// the shuffle mask (also on per-register base). + SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> + isGatherShuffledEntry( + const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask, + SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, + unsigned NumParts); /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the @@ -2517,14 +2462,14 @@ private: /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the /// users of \p TE and collects the stores. It returns the map from the store /// pointers to the collected stores. - DenseMap<Value *, SmallVector<StoreInst *, 4>> + DenseMap<Value *, SmallVector<StoreInst *>> collectUserStores(const BoUpSLP::TreeEntry *TE) const; /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the - /// stores in \p StoresVec can form a vector instruction. 
If so it returns true - /// and populates \p ReorderIndices with the shuffle indices of the the stores - /// when compared to the sorted vector. - bool canFormVector(const SmallVector<StoreInst *, 4> &StoresVec, + /// stores in \p StoresVec can form a vector instruction. If so it returns + /// true and populates \p ReorderIndices with the shuffle indices of the + /// stores when compared to the sorted vector. + bool canFormVector(ArrayRef<StoreInst *> StoresVec, OrdersType &ReorderIndices) const; /// Iterates through the users of \p TE, looking for scalar stores that can be @@ -2621,10 +2566,18 @@ private: /// The Scalars are vectorized into this value. It is initialized to Null. WeakTrackingVH VectorizedValue = nullptr; + /// New vector phi instructions emitted for the vectorized phi nodes. + PHINode *PHI = nullptr; + /// Do we need to gather this sequence or vectorize it /// (either with vector instruction or with scatter/gather /// intrinsics for store/load)? - enum EntryState { Vectorize, ScatterVectorize, NeedToGather }; + enum EntryState { + Vectorize, + ScatterVectorize, + PossibleStridedVectorize, + NeedToGather + }; EntryState State; /// Does this sequence require some shuffling? @@ -2772,6 +2725,14 @@ private: return FoundLane; } + /// Build a shuffle mask for graph entry which represents a merge of main + /// and alternate operations. + void + buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp, + SmallVectorImpl<int> &Mask, + SmallVectorImpl<Value *> *OpScalars = nullptr, + SmallVectorImpl<Value *> *AltScalars = nullptr) const; + #ifndef NDEBUG /// Debug printer. LLVM_DUMP_METHOD void dump() const { @@ -2792,6 +2753,9 @@ private: case ScatterVectorize: dbgs() << "ScatterVectorize\n"; break; + case PossibleStridedVectorize: + dbgs() << "PossibleStridedVectorize\n"; + break; case NeedToGather: dbgs() << "NeedToGather\n"; break; @@ -2892,7 +2856,14 @@ private: } if (Last->State != TreeEntry::NeedToGather) { for (Value *V : VL) { - assert(!getTreeEntry(V) && "Scalar already in tree!"); + const TreeEntry *TE = getTreeEntry(V); + assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) && + "Scalar already in tree!"); + if (TE) { + if (TE != Last) + MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last); + continue; + } ScalarToTreeEntry[V] = Last; } // Update the scheduler bundle to point to this TreeEntry. @@ -2905,7 +2876,8 @@ private: for (Value *V : VL) { if (doesNotNeedToBeScheduled(V)) continue; - assert(BundleMember && "Unexpected end of bundle."); + if (!BundleMember) + continue; BundleMember->TE = Last; BundleMember = BundleMember->NextInBundle; } @@ -2913,6 +2885,10 @@ private: assert(!BundleMember && "Bundle and VL out of sync"); } else { MustGather.insert(VL.begin(), VL.end()); + // Build a map for gathered scalars to the nodes where they are used. + for (Value *V : VL) + if (!isConstant(V)) + ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last); } if (UserTreeIdx.UserTE) @@ -2950,6 +2926,10 @@ private: /// Maps a specific scalar to its tree entry. SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry; + /// List of scalars, used in several vectorize nodes, and the list of the + /// nodes. + SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars; + /// Maps a value to the proposed vectorizable size. SmallDenseMap<Value *, unsigned> InstrElementSize; @@ -2995,25 +2975,25 @@ private: /// is invariant in the calling loop. 
bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1, Instruction *Inst2) { + if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2)) + return true; // First check if the result is already in the cache. - AliasCacheKey key = std::make_pair(Inst1, Inst2); - std::optional<bool> &result = AliasCache[key]; - if (result) { - return *result; - } - bool aliased = true; - if (Loc1.Ptr && isSimple(Inst1)) - aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1)); + AliasCacheKey Key = std::make_pair(Inst1, Inst2); + auto It = AliasCache.find(Key); + if (It != AliasCache.end()) + return It->second; + bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1)); // Store the result in the cache. - result = aliased; - return aliased; + AliasCache.try_emplace(Key, Aliased); + AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased); + return Aliased; } using AliasCacheKey = std::pair<Instruction *, Instruction *>; /// Cache for alias results. /// TODO: consider moving this to the AliasAnalysis itself. - DenseMap<AliasCacheKey, std::optional<bool>> AliasCache; + DenseMap<AliasCacheKey, bool> AliasCache; // Cache for pointerMayBeCaptured calls inside AA. This is preserved // globally through SLP because we don't perform any action which @@ -3047,7 +3027,7 @@ private: SetVector<Instruction *> GatherShuffleExtractSeq; /// A list of blocks that we are going to CSE. - SetVector<BasicBlock *> CSEBlocks; + DenseSet<BasicBlock *> CSEBlocks; /// Contains all scheduling relevant data for an instruction. /// A ScheduleData either represents a single instruction or a member of an @@ -3497,7 +3477,7 @@ private: BasicBlock *BB; /// Simple memory allocation for ScheduleData. - std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks; + SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks; /// The size of a ScheduleData array in ScheduleDataChunks. int ChunkSize; @@ -3607,7 +3587,7 @@ private: /// where "width" indicates the minimum bit width and "signed" is True if the /// value must be signed-extended, rather than zero-extended, back to its /// original width. 
- MapVector<Value *, std::pair<uint64_t, bool>> MinBWs; + DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs; }; } // end namespace slpvectorizer @@ -3676,7 +3656,7 @@ template <> struct GraphTraits<BoUpSLP *> { template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { using TreeEntry = BoUpSLP::TreeEntry; - DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} + DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) { std::string Str; @@ -3699,7 +3679,8 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { const BoUpSLP *) { if (Entry->State == TreeEntry::NeedToGather) return "color=red"; - if (Entry->State == TreeEntry::ScatterVectorize) + if (Entry->State == TreeEntry::ScatterVectorize || + Entry->State == TreeEntry::PossibleStridedVectorize) return "color=blue"; return ""; } @@ -3761,7 +3742,7 @@ static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask) { inversePermutation(Order, MaskOrder); } reorderReuses(MaskOrder, Mask); - if (ShuffleVectorInst::isIdentityMask(MaskOrder)) { + if (ShuffleVectorInst::isIdentityMask(MaskOrder, MaskOrder.size())) { Order.clear(); return; } @@ -3779,7 +3760,40 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { OrdersType CurrentOrder(NumScalars, NumScalars); SmallVector<int> Positions; SmallBitVector UsedPositions(NumScalars); - const TreeEntry *STE = nullptr; + DenseMap<const TreeEntry *, unsigned> UsedEntries; + DenseMap<Value *, std::pair<const TreeEntry *, unsigned>> ValueToEntryPos; + for (Value *V : TE.Scalars) { + if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(V)) + continue; + const auto *LocalSTE = getTreeEntry(V); + if (!LocalSTE) + continue; + unsigned Lane = + std::distance(LocalSTE->Scalars.begin(), find(LocalSTE->Scalars, V)); + if (Lane >= NumScalars) + continue; + ++UsedEntries.try_emplace(LocalSTE, 0).first->getSecond(); + ValueToEntryPos.try_emplace(V, LocalSTE, Lane); + } + if (UsedEntries.empty()) + return std::nullopt; + const TreeEntry &BestSTE = + *std::max_element(UsedEntries.begin(), UsedEntries.end(), + [](const std::pair<const TreeEntry *, unsigned> &P1, + const std::pair<const TreeEntry *, unsigned> &P2) { + return P1.second < P2.second; + }) + ->first; + UsedEntries.erase(&BestSTE); + const TreeEntry *SecondBestSTE = nullptr; + if (!UsedEntries.empty()) + SecondBestSTE = + std::max_element(UsedEntries.begin(), UsedEntries.end(), + [](const std::pair<const TreeEntry *, unsigned> &P1, + const std::pair<const TreeEntry *, unsigned> &P2) { + return P1.second < P2.second; + }) + ->first; // Try to find all gathered scalars that are gets vectorized in other // vectorize node. Here we can have only one single tree vector node to // correctly identify order of the gathered scalars. @@ -3787,58 +3801,56 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { Value *V = TE.Scalars[I]; if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(V)) continue; - if (const auto *LocalSTE = getTreeEntry(V)) { - if (!STE) - STE = LocalSTE; - else if (STE != LocalSTE) - // Take the order only from the single vector node. 
- return std::nullopt; - unsigned Lane = - std::distance(STE->Scalars.begin(), find(STE->Scalars, V)); - if (Lane >= NumScalars) - return std::nullopt; - if (CurrentOrder[Lane] != NumScalars) { - if (Lane != I) - continue; - UsedPositions.reset(CurrentOrder[Lane]); - } - // The partial identity (where only some elements of the gather node are - // in the identity order) is good. - CurrentOrder[Lane] = I; - UsedPositions.set(I); + const auto [LocalSTE, Lane] = ValueToEntryPos.lookup(V); + if (!LocalSTE || (LocalSTE != &BestSTE && LocalSTE != SecondBestSTE)) + continue; + if (CurrentOrder[Lane] != NumScalars) { + if ((CurrentOrder[Lane] >= BestSTE.Scalars.size() || + BestSTE.Scalars[CurrentOrder[Lane]] == V) && + (Lane != I || LocalSTE == SecondBestSTE)) + continue; + UsedPositions.reset(CurrentOrder[Lane]); } + // The partial identity (where only some elements of the gather node are + // in the identity order) is good. + CurrentOrder[Lane] = I; + UsedPositions.set(I); } // Need to keep the order if we have a vector entry and at least 2 scalars or // the vectorized entry has just 2 scalars. - if (STE && (UsedPositions.count() > 1 || STE->Scalars.size() == 2)) { - auto &&IsIdentityOrder = [NumScalars](ArrayRef<unsigned> CurrentOrder) { - for (unsigned I = 0; I < NumScalars; ++I) - if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars) - return false; - return true; - }; - if (IsIdentityOrder(CurrentOrder)) - return OrdersType(); - auto *It = CurrentOrder.begin(); - for (unsigned I = 0; I < NumScalars;) { - if (UsedPositions.test(I)) { - ++I; - continue; - } - if (*It == NumScalars) { - *It = I; - ++I; - } - ++It; + if (BestSTE.Scalars.size() != 2 && UsedPositions.count() <= 1) + return std::nullopt; + auto IsIdentityOrder = [&](ArrayRef<unsigned> CurrentOrder) { + for (unsigned I = 0; I < NumScalars; ++I) + if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars) + return false; + return true; + }; + if (IsIdentityOrder(CurrentOrder)) + return OrdersType(); + auto *It = CurrentOrder.begin(); + for (unsigned I = 0; I < NumScalars;) { + if (UsedPositions.test(I)) { + ++I; + continue; } - return std::move(CurrentOrder); + if (*It == NumScalars) { + *It = I; + ++I; + } + ++It; } - return std::nullopt; + return std::move(CurrentOrder); } namespace { /// Tracks the state we can represent the loads in the given sequence. -enum class LoadsState { Gather, Vectorize, ScatterVectorize }; +enum class LoadsState { + Gather, + Vectorize, + ScatterVectorize, + PossibleStridedVectorize +}; } // anonymous namespace static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, @@ -3898,6 +3910,7 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0, if (IsSorted || all_of(PointerOps, [&](Value *P) { return arePointersCompatible(P, PointerOps.front(), TLI); })) { + bool IsPossibleStrided = false; if (IsSorted) { Value *Ptr0; Value *PtrN; @@ -3913,6 +3926,8 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0, // Check that the sorted loads are consecutive. if (static_cast<unsigned>(*Diff) == VL.size() - 1) return LoadsState::Vectorize; + // Simple check if not a strided access - clear order. 
+ IsPossibleStrided = *Diff % (VL.size() - 1) == 0; } // TODO: need to improve analysis of the pointers, if not all of them are // GEPs or have > 2 operands, we end up with a gather node, which just @@ -3934,7 +3949,8 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0, auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) && !TTI.forceScalarizeMaskedGather(VecTy, CommonAlignment)) - return LoadsState::ScatterVectorize; + return IsPossibleStrided ? LoadsState::PossibleStridedVectorize + : LoadsState::ScatterVectorize; } } @@ -4050,7 +4066,8 @@ static bool areTwoInsertFromSameBuildVector( // Go through the vector operand of insertelement instructions trying to find // either VU as the original vector for IE2 or V as the original vector for // IE1. - SmallSet<int, 8> ReusedIdx; + SmallBitVector ReusedIdx( + cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue()); bool IsReusedIdx = false; do { if (IE2 == VU && !IE1) @@ -4058,16 +4075,18 @@ static bool areTwoInsertFromSameBuildVector( if (IE1 == V && !IE2) return V->hasOneUse(); if (IE1 && IE1 != V) { - IsReusedIdx |= - !ReusedIdx.insert(getInsertIndex(IE1).value_or(*Idx2)).second; + unsigned Idx1 = getInsertIndex(IE1).value_or(*Idx2); + IsReusedIdx |= ReusedIdx.test(Idx1); + ReusedIdx.set(Idx1); if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx) IE1 = nullptr; else IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1)); } if (IE2 && IE2 != VU) { - IsReusedIdx |= - !ReusedIdx.insert(getInsertIndex(IE2).value_or(*Idx1)).second; + unsigned Idx2 = getInsertIndex(IE2).value_or(*Idx1); + IsReusedIdx |= ReusedIdx.test(Idx2); + ReusedIdx.set(Idx2); if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx) IE2 = nullptr; else @@ -4135,13 +4154,16 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { return std::nullopt; // No need to reorder. 
return std::move(ResOrder); } - if (TE.State == TreeEntry::Vectorize && + if ((TE.State == TreeEntry::Vectorize || + TE.State == TreeEntry::PossibleStridedVectorize) && (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) || (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) && !TE.isAltShuffle()) return TE.ReorderIndices; if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) { - auto PHICompare = [](llvm::Value *V1, llvm::Value *V2) { + auto PHICompare = [&](unsigned I1, unsigned I2) { + Value *V1 = TE.Scalars[I1]; + Value *V2 = TE.Scalars[I2]; if (V1 == V2) return false; if (!V1->hasOneUse() || !V2->hasOneUse()) @@ -4180,14 +4202,13 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { }; if (!TE.ReorderIndices.empty()) return TE.ReorderIndices; - DenseMap<Value *, unsigned> PhiToId; - SmallVector<Value *, 4> Phis; + DenseMap<unsigned, unsigned> PhiToId; + SmallVector<unsigned> Phis(TE.Scalars.size()); + std::iota(Phis.begin(), Phis.end(), 0); OrdersType ResOrder(TE.Scalars.size()); - for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id) { - PhiToId[TE.Scalars[Id]] = Id; - Phis.push_back(TE.Scalars[Id]); - } - llvm::stable_sort(Phis, PHICompare); + for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id) + PhiToId[Id] = Id; + stable_sort(Phis, PHICompare); for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id) ResOrder[Id] = PhiToId[Phis[Id]]; if (IsIdentityOrder(ResOrder)) @@ -4214,7 +4235,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { // Check that gather of extractelements can be represented as // just a shuffle of a single vector. OrdersType CurrentOrder; - bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder); + bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder, + /*ResizeAllowed=*/true); if (Reuse || !CurrentOrder.empty()) { if (!CurrentOrder.empty()) fixupOrderingIndices(CurrentOrder); @@ -4270,7 +4292,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask, unsigned Sz) { ArrayRef<int> FirstCluster = Mask.slice(0, Sz); - if (ShuffleVectorInst::isIdentityMask(FirstCluster)) + if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz)) return false; for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) { ArrayRef<int> Cluster = Mask.slice(I, Sz); @@ -4386,7 +4408,9 @@ void BoUpSLP::reorderTopToBottom() { ++Cnt; } VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get()); - if (TE->State != TreeEntry::Vectorize || !TE->ReuseShuffleIndices.empty()) + if (!(TE->State == TreeEntry::Vectorize || + TE->State == TreeEntry::PossibleStridedVectorize) || + !TE->ReuseShuffleIndices.empty()) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); if (TE->State == TreeEntry::Vectorize && TE->getOpcode() == Instruction::PHI) @@ -4409,6 +4433,9 @@ void BoUpSLP::reorderTopToBottom() { MapVector<OrdersType, unsigned, DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>> OrdersUses; + // Last chance orders - scatter vectorize. Try to use their orders if no + // other orders or the order is counted already. + SmallVector<OrdersType> StridedVectorizeOrders; SmallPtrSet<const TreeEntry *, 4> VisitedOps; for (const TreeEntry *OpTE : OrderedEntries) { // No need to reorder this nodes, still need to extend and to use shuffle, @@ -4455,6 +4482,11 @@ void BoUpSLP::reorderTopToBottom() { if (Order.empty()) continue; } + // Postpone scatter orders. 
+ if (OpTE->State == TreeEntry::PossibleStridedVectorize) { + StridedVectorizeOrders.push_back(Order); + continue; + } // Stores actually store the mask, not the order, need to invert. if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() && OpTE->getOpcode() == Instruction::Store && !Order.empty()) { @@ -4472,8 +4504,21 @@ void BoUpSLP::reorderTopToBottom() { } } // Set order of the user node. - if (OrdersUses.empty()) - continue; + if (OrdersUses.empty()) { + if (StridedVectorizeOrders.empty()) + continue; + // Add (potentially!) strided vectorize orders. + for (OrdersType &Order : StridedVectorizeOrders) + ++OrdersUses.insert(std::make_pair(Order, 0)).first->second; + } else { + // Account (potentially!) strided vectorize orders only if it was used + // already. + for (OrdersType &Order : StridedVectorizeOrders) { + auto *It = OrdersUses.find(Order); + if (It != OrdersUses.end()) + ++It->second; + } + } // Choose the most used order. ArrayRef<unsigned> BestOrder = OrdersUses.front().first; unsigned Cnt = OrdersUses.front().second; @@ -4514,7 +4559,8 @@ void BoUpSLP::reorderTopToBottom() { } continue; } - if (TE->State == TreeEntry::Vectorize && + if ((TE->State == TreeEntry::Vectorize || + TE->State == TreeEntry::PossibleStridedVectorize) && isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst, InsertElementInst>(TE->getMainOp()) && !TE->isAltShuffle()) { @@ -4555,6 +4601,10 @@ bool BoUpSLP::canReorderOperands( })) continue; if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) { + // FIXME: Do not reorder (possible!) strided vectorized nodes, they + // require reordering of the operands, which is not implemented yet. + if (TE->State == TreeEntry::PossibleStridedVectorize) + return false; // Do not reorder if operand node is used by many user nodes. if (any_of(TE->UserTreeIndices, [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; })) @@ -4567,7 +4617,8 @@ bool BoUpSLP::canReorderOperands( // simply add to the list of gathered ops. // If there are reused scalars, process this node as a regular vectorize // node, just reorder reuses mask. - if (TE->State != TreeEntry::Vectorize && TE->ReuseShuffleIndices.empty()) + if (TE->State != TreeEntry::Vectorize && + TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty()) GatherOps.push_back(TE); continue; } @@ -4602,18 +4653,19 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { // Currently the are vectorized loads,extracts without alternate operands + // some gathering of extracts. SmallVector<TreeEntry *> NonVectorized; - for_each(VectorizableTree, [this, &OrderedEntries, &GathersToOrders, - &NonVectorized]( - const std::unique_ptr<TreeEntry> &TE) { - if (TE->State != TreeEntry::Vectorize) + for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) { + if (TE->State != TreeEntry::Vectorize && + TE->State != TreeEntry::PossibleStridedVectorize) NonVectorized.push_back(TE.get()); if (std::optional<OrdersType> CurrentOrder = getReorderingData(*TE, /*TopToBottom=*/false)) { OrderedEntries.insert(TE.get()); - if (TE->State != TreeEntry::Vectorize || !TE->ReuseShuffleIndices.empty()) + if (!(TE->State == TreeEntry::Vectorize || + TE->State == TreeEntry::PossibleStridedVectorize) || + !TE->ReuseShuffleIndices.empty()) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); } - }); + } // 1. Propagate order to the graph nodes, which use only reordered nodes. 
// I.e., if the node has operands, that are reordered, try to make at least @@ -4627,6 +4679,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { SmallVector<TreeEntry *> Filtered; for (TreeEntry *TE : OrderedEntries) { if (!(TE->State == TreeEntry::Vectorize || + TE->State == TreeEntry::PossibleStridedVectorize || (TE->State == TreeEntry::NeedToGather && GathersToOrders.count(TE))) || TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() || @@ -4649,8 +4702,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { } } // Erase filtered entries. - for_each(Filtered, - [&OrderedEntries](TreeEntry *TE) { OrderedEntries.remove(TE); }); + for (TreeEntry *TE : Filtered) + OrderedEntries.remove(TE); SmallVector< std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>> UsersVec(Users.begin(), Users.end()); @@ -4662,10 +4715,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { SmallVector<TreeEntry *> GatherOps; if (!canReorderOperands(Data.first, Data.second, NonVectorized, GatherOps)) { - for_each(Data.second, - [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) { - OrderedEntries.remove(Op.second); - }); + for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) + OrderedEntries.remove(Op.second); continue; } // All operands are reordered and used only in this node - propagate the @@ -4673,6 +4724,9 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { MapVector<OrdersType, unsigned, DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>> OrdersUses; + // Last chance orders - scatter vectorize. Try to use their orders if no + // other orders or the order is counted already. + SmallVector<std::pair<OrdersType, unsigned>> StridedVectorizeOrders; // Do the analysis for each tree entry only once, otherwise the order of // the same node my be considered several times, though might be not // profitable. @@ -4694,6 +4748,11 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) { return P.second == OpTE; }); + // Postpone scatter orders. + if (OpTE->State == TreeEntry::PossibleStridedVectorize) { + StridedVectorizeOrders.emplace_back(Order, NumOps); + continue; + } // Stores actually store the mask, not the order, need to invert. if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() && OpTE->getOpcode() == Instruction::Store && !Order.empty()) { @@ -4754,11 +4813,27 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { } // If no orders - skip current nodes and jump to the next one, if any. if (OrdersUses.empty()) { - for_each(Data.second, - [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) { - OrderedEntries.remove(Op.second); - }); - continue; + if (StridedVectorizeOrders.empty() || + (Data.first->ReorderIndices.empty() && + Data.first->ReuseShuffleIndices.empty() && + !(IgnoreReorder && + Data.first == VectorizableTree.front().get()))) { + for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) + OrderedEntries.remove(Op.second); + continue; + } + // Add (potentially!) strided vectorize orders. + for (std::pair<OrdersType, unsigned> &Pair : StridedVectorizeOrders) + OrdersUses.insert(std::make_pair(Pair.first, 0)).first->second += + Pair.second; + } else { + // Account (potentially!) strided vectorize orders only if it was used + // already. 
+ for (std::pair<OrdersType, unsigned> &Pair : StridedVectorizeOrders) { + auto *It = OrdersUses.find(Pair.first); + if (It != OrdersUses.end()) + It->second += Pair.second; + } } // Choose the best order. ArrayRef<unsigned> BestOrder = OrdersUses.front().first; @@ -4771,10 +4846,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { } // Set order of the user node (reordering of operands and user nodes). if (BestOrder.empty()) { - for_each(Data.second, - [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) { - OrderedEntries.remove(Op.second); - }); + for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) + OrderedEntries.remove(Op.second); continue; } // Erase operands from OrderedEntries list and adjust their orders. @@ -4796,7 +4869,10 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { continue; } // Gathers are processed separately. - if (TE->State != TreeEntry::Vectorize) + if (TE->State != TreeEntry::Vectorize && + TE->State != TreeEntry::PossibleStridedVectorize && + (TE->State != TreeEntry::ScatterVectorize || + TE->ReorderIndices.empty())) continue; assert((BestOrder.size() == TE->ReorderIndices.size() || TE->ReorderIndices.empty()) && @@ -4825,7 +4901,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { Data.first->isAltShuffle()) Data.first->reorderOperands(Mask); if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) || - Data.first->isAltShuffle()) { + Data.first->isAltShuffle() || + Data.first->State == TreeEntry::PossibleStridedVectorize) { reorderScalars(Data.first->Scalars, Mask); reorderOrder(Data.first->ReorderIndices, MaskOrder); if (Data.first->ReuseShuffleIndices.empty() && @@ -4859,10 +4936,12 @@ void BoUpSLP::buildExternalUses( // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; + if (!isa<Instruction>(Scalar)) + continue; int FoundLane = Entry->findLaneForValue(Scalar); // Check if the scalar is externally used as an extra arg. - auto ExtI = ExternallyUsedValues.find(Scalar); + const auto *ExtI = ExternallyUsedValues.find(Scalar); if (ExtI != ExternallyUsedValues.end()) { LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " << Lane << " from " << *Scalar << ".\n"); @@ -4886,7 +4965,8 @@ void BoUpSLP::buildExternalUses( // be used. if (UseScalar != U || UseEntry->State == TreeEntry::ScatterVectorize || - !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { + UseEntry->State == TreeEntry::PossibleStridedVectorize || + !doesInTreeUserNeedToExtract(Scalar, UserInst, TLI)) { LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U << ".\n"); assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state"); @@ -4906,9 +4986,9 @@ void BoUpSLP::buildExternalUses( } } -DenseMap<Value *, SmallVector<StoreInst *, 4>> +DenseMap<Value *, SmallVector<StoreInst *>> BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const { - DenseMap<Value *, SmallVector<StoreInst *, 4>> PtrToStoresMap; + DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap; for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) { Value *V = TE->Scalars[Lane]; // To save compilation time we don't visit if we have too many users. 
@@ -4947,14 +5027,14 @@ BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const { return PtrToStoresMap; } -bool BoUpSLP::canFormVector(const SmallVector<StoreInst *, 4> &StoresVec, +bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec, OrdersType &ReorderIndices) const { // We check whether the stores in StoreVec can form a vector by sorting them // and checking whether they are consecutive. // To avoid calling getPointersDiff() while sorting we create a vector of // pairs {store, offset from first} and sort this instead. - SmallVector<std::pair<StoreInst *, int>, 4> StoreOffsetVec(StoresVec.size()); + SmallVector<std::pair<StoreInst *, int>> StoreOffsetVec(StoresVec.size()); StoreInst *S0 = StoresVec[0]; StoreOffsetVec[0] = {S0, 0}; Type *S0Ty = S0->getValueOperand()->getType(); @@ -5023,7 +5103,7 @@ SmallVector<BoUpSLP::OrdersType, 1> BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const { unsigned NumLanes = TE->Scalars.size(); - DenseMap<Value *, SmallVector<StoreInst *, 4>> PtrToStoresMap = + DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap = collectUserStores(TE); // Holds the reorder indices for each candidate store vector that is a user of @@ -5244,6 +5324,8 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( return TreeEntry::Vectorize; case LoadsState::ScatterVectorize: return TreeEntry::ScatterVectorize; + case LoadsState::PossibleStridedVectorize: + return TreeEntry::PossibleStridedVectorize; case LoadsState::Gather: #ifndef NDEBUG Type *ScalarTy = VL0->getType(); @@ -5416,7 +5498,8 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); VFShape Shape = VFShape::get( - *CI, ElementCount::getFixed(static_cast<unsigned int>(VL.size())), + CI->getFunctionType(), + ElementCount::getFixed(static_cast<unsigned int>(VL.size())), false /*HasGlobalPred*/); Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); @@ -5488,9 +5571,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, SmallVector<int> ReuseShuffleIndicies; SmallVector<Value *> UniqueValues; - auto &&TryToFindDuplicates = [&VL, &ReuseShuffleIndicies, &UniqueValues, - &UserTreeIdx, - this](const InstructionsState &S) { + SmallVector<Value *> NonUniqueValueVL; + auto TryToFindDuplicates = [&](const InstructionsState &S, + bool DoNotFail = false) { // Check that every instruction appears once in this bundle. 
DenseMap<Value *, unsigned> UniquePositions(VL.size()); for (Value *V : VL) { @@ -5517,6 +5600,24 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, !isConstant(V); })) || !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) { + if (DoNotFail && UniquePositions.size() > 1 && + NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() && + all_of(UniqueValues, [=](Value *V) { + return isa<ExtractElementInst>(V) || + areAllUsersVectorized(cast<Instruction>(V), + UserIgnoreList); + })) { + unsigned PWSz = PowerOf2Ceil(UniqueValues.size()); + if (PWSz == VL.size()) { + ReuseShuffleIndicies.clear(); + } else { + NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end()); + NonUniqueValueVL.append(PWSz - UniqueValues.size(), + UniqueValues.back()); + VL = NonUniqueValueVL; + } + return true; + } LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); return false; @@ -5528,6 +5629,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, InstructionsState S = getSameOpcode(VL, *TLI); + // Don't vectorize ephemeral values. + if (!EphValues.empty()) { + for (Value *V : VL) { + if (EphValues.count(V)) { + LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V + << ") is ephemeral.\n"); + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); + return; + } + } + } + // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of // a load), in which case peek through to include it in the tree, without // ballooning over-budget. @@ -5633,7 +5746,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, BasicBlock *BB = nullptr; bool IsScatterVectorizeUserTE = UserTreeIdx.UserTE && - UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize; + (UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize || + UserTreeIdx.UserTE->State == TreeEntry::PossibleStridedVectorize); bool AreAllSameInsts = (S.getOpcode() && allSameBlock(VL)) || (S.OpValue->getType()->isPointerTy() && IsScatterVectorizeUserTE && @@ -5665,39 +5779,44 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // We now know that this is a vector of instructions of the same type from // the same block. - // Don't vectorize ephemeral values. - if (!EphValues.empty()) { - for (Value *V : VL) { - if (EphValues.count(V)) { - LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V - << ") is ephemeral.\n"); - newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); - return; - } - } - } - // Check if this is a duplicate of another entry. if (TreeEntry *E = getTreeEntry(S.OpValue)) { LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); if (!E->isSame(VL)) { - LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - if (TryToFindDuplicates(S)) - newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + auto It = MultiNodeScalars.find(S.OpValue); + if (It != MultiNodeScalars.end()) { + auto *TEIt = find_if(It->getSecond(), + [&](TreeEntry *ME) { return ME->isSame(VL); }); + if (TEIt != It->getSecond().end()) + E = *TEIt; + else + E = nullptr; + } else { + E = nullptr; + } + } + if (!E) { + if (!doesNotNeedToBeScheduled(S.OpValue)) { + LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); + if (TryToFindDuplicates(S)) + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + return; + } + } else { + // Record the reuse of the tree node. 
FIXME, currently this is only used + // to properly draw the graph rather than for the actual vectorization. + E->UserTreeIndices.push_back(UserTreeIdx); + LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue + << ".\n"); return; } - // Record the reuse of the tree node. FIXME, currently this is only used to - // properly draw the graph rather than for the actual vectorization. - E->UserTreeIndices.push_back(UserTreeIdx); - LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue - << ".\n"); - return; } // Check that none of the instructions in the bundle are already in the tree. for (Value *V : VL) { - if (!IsScatterVectorizeUserTE && !isa<Instruction>(V)) + if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) || + doesNotNeedToBeScheduled(V)) continue; if (getTreeEntry(V)) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V @@ -5725,7 +5844,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Special processing for sorted pointers for ScatterVectorize node with // constant indeces only. if (AreAllSameInsts && UserTreeIdx.UserTE && - UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize && + (UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize || + UserTreeIdx.UserTE->State == TreeEntry::PossibleStridedVectorize) && !(S.getOpcode() && allSameBlock(VL))) { assert(S.OpValue->getType()->isPointerTy() && count_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); }) >= @@ -5760,7 +5880,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } // Check that every instruction appears once in this bundle. - if (!TryToFindDuplicates(S)) + if (!TryToFindDuplicates(S, /*DoNotFail=*/true)) return; // Perform specific checks for each particular instruction kind. @@ -5780,7 +5900,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, BlockScheduling &BS = *BSRef; - std::optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S); + std::optional<ScheduleData *> Bundle = + BS.tryScheduleBundle(UniqueValues, this, S); #ifdef EXPENSIVE_CHECKS // Make sure we didn't break any internal invariants BS.verify(); @@ -5905,6 +6026,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // from such a struct, we read/write packed bits disagreeing with the // unvectorized version. TreeEntry *TE = nullptr; + fixupOrderingIndices(CurrentOrder); switch (State) { case TreeEntry::Vectorize: if (CurrentOrder.empty()) { @@ -5913,7 +6035,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { - fixupOrderingIndices(CurrentOrder); // Need to reorder. TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies, CurrentOrder); @@ -5921,6 +6042,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } TE->setOperandsInOrder(); break; + case TreeEntry::PossibleStridedVectorize: + // Vectorizing non-consecutive loads with `llvm.masked.gather`. + if (CurrentOrder.empty()) { + TE = newTreeEntry(VL, TreeEntry::PossibleStridedVectorize, Bundle, S, + UserTreeIdx, ReuseShuffleIndicies); + } else { + TE = newTreeEntry(VL, TreeEntry::PossibleStridedVectorize, Bundle, S, + UserTreeIdx, ReuseShuffleIndicies, CurrentOrder); + } + TE->setOperandsInOrder(); + buildTree_rec(PointerOps, Depth + 1, {TE, 0}); + LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n"); + break; case TreeEntry::ScatterVectorize: // Vectorizing non-consecutive loads with `llvm.masked.gather`. 
TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S, @@ -5951,13 +6085,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); TE->setOperandsInOrder(); - for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) { ValueList Operands; // Prepare the operand vector. for (Value *V : VL) - Operands.push_back(cast<Instruction>(V)->getOperand(i)); + Operands.push_back(cast<Instruction>(V)->getOperand(I)); - buildTree_rec(Operands, Depth + 1, {TE, i}); + buildTree_rec(Operands, Depth + 1, {TE, I}); } return; } @@ -6031,13 +6165,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } TE->setOperandsInOrder(); - for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) { ValueList Operands; // Prepare the operand vector. for (Value *V : VL) - Operands.push_back(cast<Instruction>(V)->getOperand(i)); + Operands.push_back(cast<Instruction>(V)->getOperand(I)); - buildTree_rec(Operands, Depth + 1, {TE, i}); + buildTree_rec(Operands, Depth + 1, {TE, I}); } return; } @@ -6087,8 +6221,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (!CI) Operands.back().push_back(Op); else - Operands.back().push_back(ConstantExpr::getIntegerCast( - CI, Ty, CI->getValue().isSignBitSet())); + Operands.back().push_back(ConstantFoldIntegerCast( + CI, Ty, CI->getValue().isSignBitSet(), *DL)); } TE->setOperand(IndexIdx, Operands.back()); @@ -6132,18 +6266,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); TE->setOperandsInOrder(); - for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) { - // For scalar operands no need to to create an entry since no need to + for (unsigned I : seq<unsigned>(0, CI->arg_size())) { + // For scalar operands no need to create an entry since no need to // vectorize it. - if (isVectorIntrinsicWithScalarOpAtArg(ID, i)) + if (isVectorIntrinsicWithScalarOpAtArg(ID, I)) continue; ValueList Operands; // Prepare the operand vector. for (Value *V : VL) { auto *CI2 = cast<CallInst>(V); - Operands.push_back(CI2->getArgOperand(i)); + Operands.push_back(CI2->getArgOperand(I)); } - buildTree_rec(Operands, Depth + 1, {TE, i}); + buildTree_rec(Operands, Depth + 1, {TE, I}); } return; } @@ -6194,13 +6328,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } TE->setOperandsInOrder(); - for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) { ValueList Operands; // Prepare the operand vector. 
for (Value *V : VL) - Operands.push_back(cast<Instruction>(V)->getOperand(i)); + Operands.push_back(cast<Instruction>(V)->getOperand(I)); - buildTree_rec(Operands, Depth + 1, {TE, i}); + buildTree_rec(Operands, Depth + 1, {TE, I}); } return; } @@ -6210,7 +6344,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, llvm_unreachable("Unexpected vectorization of the instructions."); } -unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { +unsigned BoUpSLP::canMapToVector(Type *T) const { unsigned N = 1; Type *EltTy = T; @@ -6234,15 +6368,16 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { if (!isValidElementType(EltTy)) return 0; - uint64_t VTSize = DL.getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N)); + uint64_t VTSize = DL->getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N)); if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || - VTSize != DL.getTypeStoreSizeInBits(T)) + VTSize != DL->getTypeStoreSizeInBits(T)) return 0; return N; } bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, - SmallVectorImpl<unsigned> &CurrentOrder) const { + SmallVectorImpl<unsigned> &CurrentOrder, + bool ResizeAllowed) const { const auto *It = find_if(VL, [](Value *V) { return isa<ExtractElementInst, ExtractValueInst>(V); }); @@ -6263,8 +6398,7 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, // We have to extract from a vector/aggregate with the same number of elements. unsigned NElts; if (E0->getOpcode() == Instruction::ExtractValue) { - const DataLayout &DL = E0->getModule()->getDataLayout(); - NElts = canMapToVector(Vec->getType(), DL); + NElts = canMapToVector(Vec->getType()); if (!NElts) return false; // Check if load can be rewritten as load of vector. @@ -6275,46 +6409,55 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, NElts = cast<FixedVectorType>(Vec->getType())->getNumElements(); } - if (NElts != VL.size()) - return false; - - // Check that all of the indices extract from the correct offset. - bool ShouldKeepOrder = true; unsigned E = VL.size(); - // Assign to all items the initial value E + 1 so we can check if the extract - // instruction index was used already. - // Also, later we can check that all the indices are used and we have a - // consecutive access in the extract instructions, by checking that no - // element of CurrentOrder still has value E + 1. 
- CurrentOrder.assign(E, E); - unsigned I = 0; - for (; I < E; ++I) { - auto *Inst = dyn_cast<Instruction>(VL[I]); + if (!ResizeAllowed && NElts != E) + return false; + SmallVector<int> Indices(E, PoisonMaskElem); + unsigned MinIdx = NElts, MaxIdx = 0; + for (auto [I, V] : enumerate(VL)) { + auto *Inst = dyn_cast<Instruction>(V); if (!Inst) continue; if (Inst->getOperand(0) != Vec) - break; + return false; if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) if (isa<UndefValue>(EE->getIndexOperand())) continue; std::optional<unsigned> Idx = getExtractIndex(Inst); if (!Idx) - break; + return false; const unsigned ExtIdx = *Idx; - if (ExtIdx != I) { - if (ExtIdx >= E || CurrentOrder[ExtIdx] != E) - break; - ShouldKeepOrder = false; - CurrentOrder[ExtIdx] = I; - } else { - if (CurrentOrder[I] != E) - break; - CurrentOrder[I] = I; - } + if (ExtIdx >= NElts) + continue; + Indices[I] = ExtIdx; + if (MinIdx > ExtIdx) + MinIdx = ExtIdx; + if (MaxIdx < ExtIdx) + MaxIdx = ExtIdx; } - if (I < E) { - CurrentOrder.clear(); + if (MaxIdx - MinIdx + 1 > E) return false; + if (MaxIdx + 1 <= E) + MinIdx = 0; + + // Check that all of the indices extract from the correct offset. + bool ShouldKeepOrder = true; + // Assign to all items the initial value E + 1 so we can check if the extract + // instruction index was used already. + // Also, later we can check that all the indices are used and we have a + // consecutive access in the extract instructions, by checking that no + // element of CurrentOrder still has value E + 1. + CurrentOrder.assign(E, E); + for (unsigned I = 0; I < E; ++I) { + if (Indices[I] == PoisonMaskElem) + continue; + const unsigned ExtIdx = Indices[I] - MinIdx; + if (CurrentOrder[ExtIdx] != E) { + CurrentOrder.clear(); + return false; + } + ShouldKeepOrder &= ExtIdx == I; + CurrentOrder[ExtIdx] = I; } if (ShouldKeepOrder) CurrentOrder.clear(); @@ -6322,9 +6465,9 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, return ShouldKeepOrder; } -bool BoUpSLP::areAllUsersVectorized(Instruction *I, - ArrayRef<Value *> VectorizedVals) const { - return (I->hasOneUse() && is_contained(VectorizedVals, I)) || +bool BoUpSLP::areAllUsersVectorized( + Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const { + return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) || all_of(I->users(), [this](User *U) { return ScalarToTreeEntry.count(U) > 0 || isVectorLikeInstWithConstOps(U) || @@ -6351,8 +6494,8 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, auto IntrinsicCost = TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput); - auto Shape = VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>( - VecTy->getNumElements())), + auto Shape = VFShape::get(CI->getFunctionType(), + ElementCount::getFixed(VecTy->getNumElements()), false /*HasGlobalPred*/); Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); auto LibCost = IntrinsicCost; @@ -6365,16 +6508,11 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, return {IntrinsicCost, LibCost}; } -/// Build shuffle mask for shuffle graph entries and lists of main and alternate -/// operations operands. 
-static void -buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices, - ArrayRef<int> ReusesIndices, - const function_ref<bool(Instruction *)> IsAltOp, - SmallVectorImpl<int> &Mask, - SmallVectorImpl<Value *> *OpScalars = nullptr, - SmallVectorImpl<Value *> *AltScalars = nullptr) { - unsigned Sz = VL.size(); +void BoUpSLP::TreeEntry::buildAltOpShuffleMask( + const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask, + SmallVectorImpl<Value *> *OpScalars, + SmallVectorImpl<Value *> *AltScalars) const { + unsigned Sz = Scalars.size(); Mask.assign(Sz, PoisonMaskElem); SmallVector<int> OrderMask; if (!ReorderIndices.empty()) @@ -6383,7 +6521,7 @@ buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices, unsigned Idx = I; if (!ReorderIndices.empty()) Idx = OrderMask[I]; - auto *OpInst = cast<Instruction>(VL[Idx]); + auto *OpInst = cast<Instruction>(Scalars[Idx]); if (IsAltOp(OpInst)) { Mask[I] = Sz + Idx; if (AltScalars) @@ -6394,9 +6532,9 @@ buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices, OpScalars->push_back(OpInst); } } - if (!ReusesIndices.empty()) { - SmallVector<int> NewMask(ReusesIndices.size(), PoisonMaskElem); - transform(ReusesIndices, NewMask.begin(), [&Mask](int Idx) { + if (!ReuseShuffleIndices.empty()) { + SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem); + transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) { return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem; }); Mask.swap(NewMask); @@ -6429,52 +6567,27 @@ static bool isAlternateInstruction(const Instruction *I, return I->getOpcode() == AltOp->getOpcode(); } -TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> VL, - unsigned OpIdx) { - assert(!VL.empty()); - const auto *I0 = cast<Instruction>(*find_if(VL, Instruction::classof)); - const auto *Op0 = I0->getOperand(OpIdx); +TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) { + assert(!Ops.empty()); + const auto *Op0 = Ops.front(); - const bool IsConstant = all_of(VL, [&](Value *V) { + const bool IsConstant = all_of(Ops, [](Value *V) { // TODO: We should allow undef elements here - const auto *I = dyn_cast<Instruction>(V); - if (!I) - return true; - auto *Op = I->getOperand(OpIdx); - return isConstant(Op) && !isa<UndefValue>(Op); + return isConstant(V) && !isa<UndefValue>(V); }); - const bool IsUniform = all_of(VL, [&](Value *V) { + const bool IsUniform = all_of(Ops, [=](Value *V) { // TODO: We should allow undef elements here - const auto *I = dyn_cast<Instruction>(V); - if (!I) - return false; - return I->getOperand(OpIdx) == Op0; + return V == Op0; }); - const bool IsPowerOfTwo = all_of(VL, [&](Value *V) { + const bool IsPowerOfTwo = all_of(Ops, [](Value *V) { // TODO: We should allow undef elements here - const auto *I = dyn_cast<Instruction>(V); - if (!I) { - assert((isa<UndefValue>(V) || - I0->getOpcode() == Instruction::GetElementPtr) && - "Expected undef or GEP."); - return true; - } - auto *Op = I->getOperand(OpIdx); - if (auto *CI = dyn_cast<ConstantInt>(Op)) + if (auto *CI = dyn_cast<ConstantInt>(V)) return CI->getValue().isPowerOf2(); return false; }); - const bool IsNegatedPowerOfTwo = all_of(VL, [&](Value *V) { + const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) { // TODO: We should allow undef elements here - const auto *I = dyn_cast<Instruction>(V); - if (!I) { - assert((isa<UndefValue>(V) || - I0->getOpcode() == Instruction::GetElementPtr) && - "Expected undef or GEP."); - return true; - } - 
const auto *Op = I->getOperand(OpIdx); - if (auto *CI = dyn_cast<ConstantInt>(Op)) + if (auto *CI = dyn_cast<ConstantInt>(V)) return CI->getValue().isNegatedPowerOf2(); return false; }); @@ -6505,9 +6618,24 @@ protected: bool IsStrict) { int Limit = Mask.size(); int VF = VecTy->getNumElements(); - return (VF == Limit || !IsStrict) && - all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) && - ShuffleVectorInst::isIdentityMask(Mask); + int Index = -1; + if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit)) + return true; + if (!IsStrict) { + // Consider extract subvector starting from index 0. + if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) && + Index == 0) + return true; + // All VF-size submasks are identity (e.g. + // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4). + if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) { + ArrayRef<int> Slice = Mask.slice(Idx * VF, VF); + return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) || + ShuffleVectorInst::isIdentityMask(Slice, VF); + })) + return true; + } + return false; } /// Tries to combine 2 different masks into single one. @@ -6577,7 +6705,8 @@ protected: if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) { if (!IdentityOp || !SinglePermute || (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) && - !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask))) { + !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask, + IdentityMask.size()))) { IdentityOp = SV; // Store current mask in the IdentityMask so later we did not lost // this info if IdentityOp is selected as the best candidate for the @@ -6647,7 +6776,7 @@ protected: } if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType()); !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) || - ShuffleVectorInst::isZeroEltSplatMask(Mask)) { + ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) { if (IdentityOp) { V = IdentityOp; assert(Mask.size() == IdentityMask.size() && @@ -6663,7 +6792,7 @@ protected: /*IsStrict=*/true) || (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() && Shuffle->isZeroEltSplat() && - ShuffleVectorInst::isZeroEltSplatMask(Mask))); + ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size()))); } V = Op; return false; @@ -6768,11 +6897,9 @@ protected: CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF); } } - const int Limit = CombinedMask1.size() * 2; - if (Op1 == Op2 && Limit == 2 * VF && - all_of(CombinedMask1, [=](int Idx) { return Idx < Limit; }) && - (ShuffleVectorInst::isIdentityMask(CombinedMask1) || - (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1) && + if (Op1 == Op2 && + (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) || + (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) && isa<ShuffleVectorInst>(Op1) && cast<ShuffleVectorInst>(Op1)->getShuffleMask() == ArrayRef(CombinedMask1)))) @@ -6807,10 +6934,29 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors; const TargetTransformInfo &TTI; InstructionCost Cost = 0; - ArrayRef<Value *> VectorizedVals; + SmallDenseSet<Value *> VectorizedVals; BoUpSLP &R; SmallPtrSetImpl<Value *> &CheckedExtracts; constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + /// While set, still trying to estimate the cost for the same nodes and we + /// can delay actual cost estimation (virtual shuffle instruction emission). 
+ /// May help better estimate the cost if same nodes must be permuted + allows + /// to move most of the long shuffles cost estimation to TTI. + bool SameNodesEstimated = true; + + static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) { + if (Ty->getScalarType()->isPointerTy()) { + Constant *Res = ConstantExpr::getIntToPtr( + ConstantInt::getAllOnesValue( + IntegerType::get(Ty->getContext(), + DL.getTypeStoreSizeInBits(Ty->getScalarType()))), + Ty->getScalarType()); + if (auto *VTy = dyn_cast<VectorType>(Ty)) + Res = ConstantVector::getSplat(VTy->getElementCount(), Res); + return Res; + } + return Constant::getAllOnesValue(Ty); + } InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) { if ((!Root && allConstant(VL)) || all_of(VL, UndefValue::classof)) @@ -6821,20 +6967,35 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { // Improve gather cost for gather of loads, if we can group some of the // loads into vector loads. InstructionsState S = getSameOpcode(VL, *R.TLI); - if (VL.size() > 2 && S.getOpcode() == Instruction::Load && - !S.isAltShuffle() && + const unsigned Sz = R.DL->getTypeSizeInBits(VL.front()->getType()); + unsigned MinVF = R.getMinVF(2 * Sz); + if (VL.size() > 2 && + ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) || + (InVectors.empty() && + any_of(seq<unsigned>(0, VL.size() / MinVF), + [&](unsigned Idx) { + ArrayRef<Value *> SubVL = VL.slice(Idx * MinVF, MinVF); + InstructionsState S = getSameOpcode(SubVL, *R.TLI); + return S.getOpcode() == Instruction::Load && + !S.isAltShuffle(); + }))) && !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) && !isSplat(Gathers)) { - BoUpSLP::ValueSet VectorizedLoads; + SetVector<Value *> VectorizedLoads; + SmallVector<LoadInst *> VectorizedStarts; + SmallVector<std::pair<unsigned, unsigned>> ScatterVectorized; unsigned StartIdx = 0; unsigned VF = VL.size() / 2; - unsigned VectorizedCnt = 0; - unsigned ScatterVectorizeCnt = 0; - const unsigned Sz = R.DL->getTypeSizeInBits(S.MainOp->getType()); - for (unsigned MinVF = R.getMinVF(2 * Sz); VF >= MinVF; VF /= 2) { + for (; VF >= MinVF; VF /= 2) { for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End; Cnt += VF) { ArrayRef<Value *> Slice = VL.slice(Cnt, VF); + if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) { + InstructionsState SliceS = getSameOpcode(Slice, *R.TLI); + if (SliceS.getOpcode() != Instruction::Load || + SliceS.isAltShuffle()) + continue; + } if (!VectorizedLoads.count(Slice.front()) && !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) { SmallVector<Value *> PointerOps; @@ -6845,12 +7006,14 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { switch (LS) { case LoadsState::Vectorize: case LoadsState::ScatterVectorize: + case LoadsState::PossibleStridedVectorize: // Mark the vectorized loads so that we don't vectorize them // again. - if (LS == LoadsState::Vectorize) - ++VectorizedCnt; + // TODO: better handling of loads with reorders. + if (LS == LoadsState::Vectorize && CurrentOrder.empty()) + VectorizedStarts.push_back(cast<LoadInst>(Slice.front())); else - ++ScatterVectorizeCnt; + ScatterVectorized.emplace_back(Cnt, VF); VectorizedLoads.insert(Slice.begin(), Slice.end()); // If we vectorized initial block, no need to try to vectorize // it again. @@ -6881,8 +7044,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { } // Exclude potentially vectorized loads from list of gathered // scalars. 
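The relaxed isIdentityMask above also accepts, in the non-strict case, masks whose VF-sized slices are each either all-poison or an identity. A small self-contained sketch of that slice check (PoisonElem stands in for PoisonMaskElem, allSubmasksIdentity is an illustrative name):

#include <cstddef>
#include <vector>

constexpr int PoisonElem = -1; // stand-in for PoisonMaskElem

// True if every VF-wide slice of Mask is either entirely poison or the
// identity <0, 1, ..., VF-1> (poison lanes allowed inside an identity slice).
bool allSubmasksIdentity(const std::vector<int> &Mask, unsigned VF) {
  if (VF == 0 || Mask.size() % VF != 0)
    return false;
  for (size_t Slice = 0; Slice < Mask.size(); Slice += VF) {
    bool AllPoison = true, Identity = true;
    for (unsigned I = 0; I < VF; ++I) {
      int Elt = Mask[Slice + I];
      AllPoison &= (Elt == PoisonElem);
      Identity &= (Elt == PoisonElem || Elt == static_cast<int>(I));
    }
    if (!AllPoison && !Identity)
      return false;
  }
  return true;
}
// e.g. {P,P,P,P, 0,1,2,P, P,1,2,3} with VF = 4 is accepted.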
- auto *LI = cast<LoadInst>(S.MainOp); - Gathers.assign(Gathers.size(), PoisonValue::get(LI->getType())); + Gathers.assign(Gathers.size(), PoisonValue::get(VL.front()->getType())); // The cost for vectorized loads. InstructionCost ScalarsCost = 0; for (Value *V : VectorizedLoads) { @@ -6892,17 +7054,24 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { LI->getAlign(), LI->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo(), LI); } - auto *LoadTy = FixedVectorType::get(LI->getType(), VF); - Align Alignment = LI->getAlign(); - GatherCost += - VectorizedCnt * - TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, - LI->getPointerAddressSpace(), CostKind, - TTI::OperandValueInfo(), LI); - GatherCost += ScatterVectorizeCnt * - TTI.getGatherScatterOpCost( - Instruction::Load, LoadTy, LI->getPointerOperand(), - /*VariableMask=*/false, Alignment, CostKind, LI); + auto *LoadTy = FixedVectorType::get(VL.front()->getType(), VF); + for (LoadInst *LI : VectorizedStarts) { + Align Alignment = LI->getAlign(); + GatherCost += + TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, + LI->getPointerAddressSpace(), CostKind, + TTI::OperandValueInfo(), LI); + } + for (std::pair<unsigned, unsigned> P : ScatterVectorized) { + auto *LI0 = cast<LoadInst>(VL[P.first]); + Align CommonAlignment = LI0->getAlign(); + for (Value *V : VL.slice(P.first + 1, VF - 1)) + CommonAlignment = + std::min(CommonAlignment, cast<LoadInst>(V)->getAlign()); + GatherCost += TTI.getGatherScatterOpCost( + Instruction::Load, LoadTy, LI0->getPointerOperand(), + /*VariableMask=*/false, CommonAlignment, CostKind, LI0); + } if (NeedInsertSubvectorAnalysis) { // Add the cost for the subvectors insert. for (int I = VF, E = VL.size(); I < E; I += VF) @@ -6938,77 +7107,137 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { : R.getGatherCost(Gathers, !Root && VL.equals(Gathers))); }; - /// Compute the cost of creating a vector of type \p VecTy containing the - /// extracted values from \p VL. - InstructionCost computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask, - TTI::ShuffleKind ShuffleKind) { - auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size()); - unsigned NumOfParts = TTI.getNumberOfParts(VecTy); - - if (ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc || - !NumOfParts || VecTy->getNumElements() < NumOfParts) - return TTI.getShuffleCost(ShuffleKind, VecTy, Mask); - - bool AllConsecutive = true; - unsigned EltsPerVector = VecTy->getNumElements() / NumOfParts; - unsigned Idx = -1; + /// Compute the cost of creating a vector containing the extracted values from + /// \p VL. + InstructionCost + computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask, + ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds, + unsigned NumParts) { + assert(VL.size() > NumParts && "Unexpected scalarized shuffle."); + unsigned NumElts = + std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) { + auto *EE = dyn_cast<ExtractElementInst>(V); + if (!EE) + return Sz; + auto *VecTy = cast<FixedVectorType>(EE->getVectorOperandType()); + return std::max(Sz, VecTy->getNumElements()); + }); + unsigned NumSrcRegs = TTI.getNumberOfParts( + FixedVectorType::get(VL.front()->getType(), NumElts)); + if (NumSrcRegs == 0) + NumSrcRegs = 1; + // FIXME: this must be moved to TTI for better estimation. 
+ unsigned EltsPerVector = PowerOf2Ceil(std::max( + divideCeil(VL.size(), NumParts), divideCeil(NumElts, NumSrcRegs))); + auto CheckPerRegistersShuffle = + [&](MutableArrayRef<int> Mask) -> std::optional<TTI::ShuffleKind> { + DenseSet<int> RegIndices; + // Check that if trying to permute same single/2 input vectors. + TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc; + int FirstRegId = -1; + for (int &I : Mask) { + if (I == PoisonMaskElem) + continue; + int RegId = (I / NumElts) * NumParts + (I % NumElts) / EltsPerVector; + if (FirstRegId < 0) + FirstRegId = RegId; + RegIndices.insert(RegId); + if (RegIndices.size() > 2) + return std::nullopt; + if (RegIndices.size() == 2) + ShuffleKind = TTI::SK_PermuteTwoSrc; + I = (I % NumElts) % EltsPerVector + + (RegId == FirstRegId ? 0 : EltsPerVector); + } + return ShuffleKind; + }; InstructionCost Cost = 0; // Process extracts in blocks of EltsPerVector to check if the source vector // operand can be re-used directly. If not, add the cost of creating a // shuffle to extract the values into a vector register. - SmallVector<int> RegMask(EltsPerVector, PoisonMaskElem); - for (auto *V : VL) { - ++Idx; - - // Reached the start of a new vector registers. - if (Idx % EltsPerVector == 0) { - RegMask.assign(EltsPerVector, PoisonMaskElem); - AllConsecutive = true; + for (unsigned Part = 0; Part < NumParts; ++Part) { + if (!ShuffleKinds[Part]) continue; - } - - // Need to exclude undefs from analysis. - if (isa<UndefValue>(V) || Mask[Idx] == PoisonMaskElem) + ArrayRef<int> MaskSlice = + Mask.slice(Part * EltsPerVector, + (Part == NumParts - 1 && Mask.size() % EltsPerVector != 0) + ? Mask.size() % EltsPerVector + : EltsPerVector); + SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem); + copy(MaskSlice, SubMask.begin()); + std::optional<TTI::ShuffleKind> RegShuffleKind = + CheckPerRegistersShuffle(SubMask); + if (!RegShuffleKind) { + Cost += TTI.getShuffleCost( + *ShuffleKinds[Part], + FixedVectorType::get(VL.front()->getType(), NumElts), MaskSlice); continue; - - // Check all extracts for a vector register on the target directly - // extract values in order. - unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V)); - if (!isa<UndefValue>(VL[Idx - 1]) && Mask[Idx - 1] != PoisonMaskElem) { - unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1])); - AllConsecutive &= PrevIdx + 1 == CurrentIdx && - CurrentIdx % EltsPerVector == Idx % EltsPerVector; - RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector; } - - if (AllConsecutive) - continue; - - // Skip all indices, except for the last index per vector block. - if ((Idx + 1) % EltsPerVector != 0 && Idx + 1 != VL.size()) - continue; - - // If we have a series of extracts which are not consecutive and hence - // cannot re-use the source vector register directly, compute the shuffle - // cost to extract the vector with EltsPerVector elements. - Cost += TTI.getShuffleCost( - TargetTransformInfo::SK_PermuteSingleSrc, - FixedVectorType::get(VecTy->getElementType(), EltsPerVector), - RegMask); + if (*RegShuffleKind != TTI::SK_PermuteSingleSrc || + !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) { + Cost += TTI.getShuffleCost( + *RegShuffleKind, + FixedVectorType::get(VL.front()->getType(), EltsPerVector), + SubMask); + } } return Cost; } + /// Transforms mask \p CommonMask per given \p Mask to make proper set after + /// shuffle emission. 
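A standalone sketch of the classification performed by the CheckPerRegistersShuffle lambda above: map every mask element to the register it reads, give up once more than two distinct registers are involved, and rewrite the submask into lane indices local to the chosen register pair (classifyPerRegister and the Kind enum are illustrative names):

#include <cstddef>
#include <optional>
#include <set>
#include <vector>

constexpr int PoisonElem = -1;
enum class Kind { SingleSrc, TwoSrc };

// NumElts: lanes per source vector; NumParts/EltsPerVector: how each source
// is split into registers. SubMask is rewritten in place.
std::optional<Kind> classifyPerRegister(std::vector<int> &SubMask,
                                        unsigned NumElts, unsigned NumParts,
                                        unsigned EltsPerVector) {
  std::set<int> Regs;
  Kind K = Kind::SingleSrc;
  int FirstReg = -1;
  for (int &I : SubMask) {
    if (I == PoisonElem)
      continue;
    unsigned U = static_cast<unsigned>(I);
    int Reg = static_cast<int>((U / NumElts) * NumParts +
                               (U % NumElts) / EltsPerVector);
    if (FirstReg < 0)
      FirstReg = Reg;
    Regs.insert(Reg);
    if (Regs.size() > 2)
      return std::nullopt; // would need more than two registers
    if (Regs.size() == 2)
      K = Kind::TwoSrc;
    // Lane index local to the first or second chosen register.
    I = static_cast<int>((U % NumElts) % EltsPerVector +
                         (Reg == FirstReg ? 0u : EltsPerVector));
  }
  return K;
}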
+ static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask, + ArrayRef<int> Mask) { + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (Mask[Idx] != PoisonMaskElem) + CommonMask[Idx] = Idx; + } + /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given + /// mask \p Mask, register number \p Part, that includes \p SliceSize + /// elements. + void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2, + ArrayRef<int> Mask, unsigned Part, + unsigned SliceSize) { + if (SameNodesEstimated) { + // Delay the cost estimation if the same nodes are reshuffling. + // If we already requested the cost of reshuffling of E1 and E2 before, no + // need to estimate another cost with the sub-Mask, instead include this + // sub-Mask into the CommonMask to estimate it later and avoid double cost + // estimation. + if ((InVectors.size() == 2 && + InVectors.front().get<const TreeEntry *>() == &E1 && + InVectors.back().get<const TreeEntry *>() == E2) || + (!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) { + assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, SliceSize), + [](int Idx) { return Idx == PoisonMaskElem; }) && + "Expected all poisoned elements."); + ArrayRef<int> SubMask = + ArrayRef(Mask).slice(Part * SliceSize, SliceSize); + copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part)); + return; + } + // Found non-matching nodes - need to estimate the cost for the matched + // and transform mask. + Cost += createShuffle(InVectors.front(), + InVectors.size() == 1 ? nullptr : InVectors.back(), + CommonMask); + transformMaskAfterShuffle(CommonMask, CommonMask); + } + SameNodesEstimated = false; + Cost += createShuffle(&E1, E2, Mask); + transformMaskAfterShuffle(CommonMask, Mask); + } class ShuffleCostBuilder { const TargetTransformInfo &TTI; static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) { - int Limit = 2 * VF; + int Index = -1; return Mask.empty() || (VF == Mask.size() && - all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) && - ShuffleVectorInst::isIdentityMask(Mask)); + ShuffleVectorInst::isIdentityMask(Mask, VF)) || + (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) && + Index == 0); } public: @@ -7021,21 +7250,17 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue(); if (isEmptyOrIdentity(Mask, VF)) return TTI::TCC_Free; - return TTI.getShuffleCost( - TTI::SK_PermuteTwoSrc, - FixedVectorType::get( - cast<VectorType>(V1->getType())->getElementType(), Mask.size()), - Mask); + return TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, + cast<VectorType>(V1->getType()), Mask); } InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const { // Empty mask or identity mask are free. 
- if (isEmptyOrIdentity(Mask, Mask.size())) + unsigned VF = + cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue(); + if (isEmptyOrIdentity(Mask, VF)) return TTI::TCC_Free; - return TTI.getShuffleCost( - TTI::SK_PermuteSingleSrc, - FixedVectorType::get( - cast<VectorType>(V1->getType())->getElementType(), Mask.size()), - Mask); + return TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, + cast<VectorType>(V1->getType()), Mask); } InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; } InstructionCost createPoison(Type *Ty, unsigned VF) const { @@ -7052,139 +7277,226 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { const PointerUnion<Value *, const TreeEntry *> &P2, ArrayRef<int> Mask) { ShuffleCostBuilder Builder(TTI); + SmallVector<int> CommonMask(Mask.begin(), Mask.end()); Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>(); - unsigned CommonVF = 0; - if (!V1) { + unsigned CommonVF = Mask.size(); + if (!V1 && !V2 && !P2.isNull()) { + // Shuffle 2 entry nodes. const TreeEntry *E = P1.get<const TreeEntry *>(); unsigned VF = E->getVectorFactor(); - if (V2) { - unsigned V2VF = cast<FixedVectorType>(V2->getType())->getNumElements(); - if (V2VF != VF && V2VF == E->Scalars.size()) - VF = E->Scalars.size(); - } else if (!P2.isNull()) { - const TreeEntry *E2 = P2.get<const TreeEntry *>(); - if (E->Scalars.size() == E2->Scalars.size()) - CommonVF = VF = E->Scalars.size(); - } else { - // P2 is empty, check that we have same node + reshuffle (if any). - if (E->Scalars.size() == Mask.size() && VF != Mask.size()) { - VF = E->Scalars.size(); - SmallVector<int> CommonMask(Mask.begin(), Mask.end()); - ::addMask(CommonMask, E->getCommonMask()); - V1 = Constant::getNullValue( - FixedVectorType::get(E->Scalars.front()->getType(), VF)); - return BaseShuffleAnalysis::createShuffle<InstructionCost>( - V1, nullptr, CommonMask, Builder); + const TreeEntry *E2 = P2.get<const TreeEntry *>(); + CommonVF = std::max(VF, E2->getVectorFactor()); + assert(all_of(Mask, + [=](int Idx) { + return Idx < 2 * static_cast<int>(CommonVF); + }) && + "All elements in mask must be less than 2 * CommonVF."); + if (E->Scalars.size() == E2->Scalars.size()) { + SmallVector<int> EMask = E->getCommonMask(); + SmallVector<int> E2Mask = E2->getCommonMask(); + if (!EMask.empty() || !E2Mask.empty()) { + for (int &Idx : CommonMask) { + if (Idx == PoisonMaskElem) + continue; + if (Idx < static_cast<int>(CommonVF) && !EMask.empty()) + Idx = EMask[Idx]; + else if (Idx >= static_cast<int>(CommonVF)) + Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) + + E->Scalars.size(); + } } + CommonVF = E->Scalars.size(); } V1 = Constant::getNullValue( - FixedVectorType::get(E->Scalars.front()->getType(), VF)); - } - if (!V2 && !P2.isNull()) { - const TreeEntry *E = P2.get<const TreeEntry *>(); + FixedVectorType::get(E->Scalars.front()->getType(), CommonVF)); + V2 = getAllOnesValue( + *R.DL, FixedVectorType::get(E->Scalars.front()->getType(), CommonVF)); + } else if (!V1 && P2.isNull()) { + // Shuffle single entry node. 
+ const TreeEntry *E = P1.get<const TreeEntry *>(); unsigned VF = E->getVectorFactor(); - unsigned V1VF = cast<FixedVectorType>(V1->getType())->getNumElements(); - if (!CommonVF && V1VF == E->Scalars.size()) + CommonVF = VF; + assert( + all_of(Mask, + [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) && + "All elements in mask must be less than CommonVF."); + if (E->Scalars.size() == Mask.size() && VF != Mask.size()) { + SmallVector<int> EMask = E->getCommonMask(); + assert(!EMask.empty() && "Expected non-empty common mask."); + for (int &Idx : CommonMask) { + if (Idx != PoisonMaskElem) + Idx = EMask[Idx]; + } CommonVF = E->Scalars.size(); - if (CommonVF) - VF = CommonVF; - V2 = Constant::getNullValue( - FixedVectorType::get(E->Scalars.front()->getType(), VF)); - } - return BaseShuffleAnalysis::createShuffle<InstructionCost>(V1, V2, Mask, - Builder); + } + V1 = Constant::getNullValue( + FixedVectorType::get(E->Scalars.front()->getType(), CommonVF)); + } else if (V1 && P2.isNull()) { + // Shuffle single vector. + CommonVF = cast<FixedVectorType>(V1->getType())->getNumElements(); + assert( + all_of(Mask, + [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) && + "All elements in mask must be less than CommonVF."); + } else if (V1 && !V2) { + // Shuffle vector and tree node. + unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements(); + const TreeEntry *E2 = P2.get<const TreeEntry *>(); + CommonVF = std::max(VF, E2->getVectorFactor()); + assert(all_of(Mask, + [=](int Idx) { + return Idx < 2 * static_cast<int>(CommonVF); + }) && + "All elements in mask must be less than 2 * CommonVF."); + if (E2->Scalars.size() == VF && VF != CommonVF) { + SmallVector<int> E2Mask = E2->getCommonMask(); + assert(!E2Mask.empty() && "Expected non-empty common mask."); + for (int &Idx : CommonMask) { + if (Idx == PoisonMaskElem) + continue; + if (Idx >= static_cast<int>(CommonVF)) + Idx = E2Mask[Idx - CommonVF] + VF; + } + CommonVF = VF; + } + V1 = Constant::getNullValue( + FixedVectorType::get(E2->Scalars.front()->getType(), CommonVF)); + V2 = getAllOnesValue( + *R.DL, + FixedVectorType::get(E2->Scalars.front()->getType(), CommonVF)); + } else if (!V1 && V2) { + // Shuffle vector and tree node. 
+ unsigned VF = cast<FixedVectorType>(V2->getType())->getNumElements(); + const TreeEntry *E1 = P1.get<const TreeEntry *>(); + CommonVF = std::max(VF, E1->getVectorFactor()); + assert(all_of(Mask, + [=](int Idx) { + return Idx < 2 * static_cast<int>(CommonVF); + }) && + "All elements in mask must be less than 2 * CommonVF."); + if (E1->Scalars.size() == VF && VF != CommonVF) { + SmallVector<int> E1Mask = E1->getCommonMask(); + assert(!E1Mask.empty() && "Expected non-empty common mask."); + for (int &Idx : CommonMask) { + if (Idx == PoisonMaskElem) + continue; + if (Idx >= static_cast<int>(CommonVF)) + Idx = E1Mask[Idx - CommonVF] + VF; + } + CommonVF = VF; + } + V1 = Constant::getNullValue( + FixedVectorType::get(E1->Scalars.front()->getType(), CommonVF)); + V2 = getAllOnesValue( + *R.DL, + FixedVectorType::get(E1->Scalars.front()->getType(), CommonVF)); + } else { + assert(V1 && V2 && "Expected both vectors."); + unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements(); + CommonVF = + std::max(VF, cast<FixedVectorType>(V2->getType())->getNumElements()); + assert(all_of(Mask, + [=](int Idx) { + return Idx < 2 * static_cast<int>(CommonVF); + }) && + "All elements in mask must be less than 2 * CommonVF."); + if (V1->getType() != V2->getType()) { + V1 = Constant::getNullValue(FixedVectorType::get( + cast<FixedVectorType>(V1->getType())->getElementType(), CommonVF)); + V2 = getAllOnesValue( + *R.DL, FixedVectorType::get( + cast<FixedVectorType>(V1->getType())->getElementType(), + CommonVF)); + } + } + InVectors.front() = Constant::getNullValue(FixedVectorType::get( + cast<FixedVectorType>(V1->getType())->getElementType(), + CommonMask.size())); + if (InVectors.size() == 2) + InVectors.pop_back(); + return BaseShuffleAnalysis::createShuffle<InstructionCost>( + V1, V2, CommonMask, Builder); } public: ShuffleCostEstimator(TargetTransformInfo &TTI, ArrayRef<Value *> VectorizedVals, BoUpSLP &R, SmallPtrSetImpl<Value *> &CheckedExtracts) - : TTI(TTI), VectorizedVals(VectorizedVals), R(R), - CheckedExtracts(CheckedExtracts) {} - Value *adjustExtracts(const TreeEntry *E, ArrayRef<int> Mask, - TTI::ShuffleKind ShuffleKind) { + : TTI(TTI), VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), + R(R), CheckedExtracts(CheckedExtracts) {} + Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask, + ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds, + unsigned NumParts, bool &UseVecBaseAsInput) { + UseVecBaseAsInput = false; if (Mask.empty()) return nullptr; Value *VecBase = nullptr; ArrayRef<Value *> VL = E->Scalars; - auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size()); // If the resulting type is scalarized, do not adjust the cost. - unsigned VecNumParts = TTI.getNumberOfParts(VecTy); - if (VecNumParts == VecTy->getNumElements()) + if (NumParts == VL.size()) return nullptr; - DenseMap<Value *, int> ExtractVectorsTys; - for (auto [I, V] : enumerate(VL)) { - // Ignore non-extractelement scalars. - if (isa<UndefValue>(V) || (!Mask.empty() && Mask[I] == PoisonMaskElem)) - continue; - // If all users of instruction are going to be vectorized and this - // instruction itself is not going to be vectorized, consider this - // instruction as dead and remove its cost from the final cost of the - // vectorized tree. - // Also, avoid adjusting the cost for extractelements with multiple uses - // in different graph entries. 
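All of the CommonVF normalization above leans on the two-operand shuffle convention: a mask index below the source width selects a lane of the first input, an index at or above it selects a lane of the second. A tiny reference evaluator, assuming both sources have already been widened to the same VF and that a default-constructed T stands in for a poison lane:

#include <vector>

constexpr int PoisonElem = -1;

template <typename T>
std::vector<T> applyTwoSourceShuffle(const std::vector<T> &V1,
                                     const std::vector<T> &V2,
                                     const std::vector<int> &Mask) {
  const int VF = static_cast<int>(V1.size()); // == V2.size() by assumption
  std::vector<T> Result;
  Result.reserve(Mask.size());
  for (int Idx : Mask) {
    if (Idx == PoisonElem)
      Result.push_back(T());          // poison lane
    else if (Idx < VF)
      Result.push_back(V1[Idx]);      // first source
    else
      Result.push_back(V2[Idx - VF]); // second source
  }
  return Result;
}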
- const TreeEntry *VE = R.getTreeEntry(V); - if (!CheckedExtracts.insert(V).second || - !R.areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) || - (VE && VE != E)) - continue; - auto *EE = cast<ExtractElementInst>(V); - VecBase = EE->getVectorOperand(); - std::optional<unsigned> EEIdx = getExtractIndex(EE); - if (!EEIdx) - continue; - unsigned Idx = *EEIdx; - if (VecNumParts != TTI.getNumberOfParts(EE->getVectorOperandType())) { - auto It = - ExtractVectorsTys.try_emplace(EE->getVectorOperand(), Idx).first; - It->getSecond() = std::min<int>(It->second, Idx); - } - // Take credit for instruction that will become dead. - if (EE->hasOneUse()) { - Instruction *Ext = EE->user_back(); - if (isa<SExtInst, ZExtInst>(Ext) && all_of(Ext->users(), [](User *U) { - return isa<GetElementPtrInst>(U); - })) { - // Use getExtractWithExtendCost() to calculate the cost of - // extractelement/ext pair. - Cost -= TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(), - EE->getVectorOperandType(), Idx); - // Add back the cost of s|zext which is subtracted separately. - Cost += TTI.getCastInstrCost( - Ext->getOpcode(), Ext->getType(), EE->getType(), - TTI::getCastContextHint(Ext), CostKind, Ext); + // Check if it can be considered reused if same extractelements were + // vectorized already. + bool PrevNodeFound = any_of( + ArrayRef(R.VectorizableTree).take_front(E->Idx), + [&](const std::unique_ptr<TreeEntry> &TE) { + return ((!TE->isAltShuffle() && + TE->getOpcode() == Instruction::ExtractElement) || + TE->State == TreeEntry::NeedToGather) && + all_of(enumerate(TE->Scalars), [&](auto &&Data) { + return VL.size() > Data.index() && + (Mask[Data.index()] == PoisonMaskElem || + isa<UndefValue>(VL[Data.index()]) || + Data.value() == VL[Data.index()]); + }); + }); + SmallPtrSet<Value *, 4> UniqueBases; + unsigned SliceSize = VL.size() / NumParts; + for (unsigned Part = 0; Part < NumParts; ++Part) { + ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize); + for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, SliceSize))) { + // Ignore non-extractelement scalars. + if (isa<UndefValue>(V) || + (!SubMask.empty() && SubMask[I] == PoisonMaskElem)) continue; - } - } - Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(), CostKind, - Idx); - } - // Add a cost for subvector extracts/inserts if required. - for (const auto &Data : ExtractVectorsTys) { - auto *EEVTy = cast<FixedVectorType>(Data.first->getType()); - unsigned NumElts = VecTy->getNumElements(); - if (Data.second % NumElts == 0) - continue; - if (TTI.getNumberOfParts(EEVTy) > VecNumParts) { - unsigned Idx = (Data.second / NumElts) * NumElts; - unsigned EENumElts = EEVTy->getNumElements(); - if (Idx % NumElts == 0) + // If all users of instruction are going to be vectorized and this + // instruction itself is not going to be vectorized, consider this + // instruction as dead and remove its cost from the final cost of the + // vectorized tree. + // Also, avoid adjusting the cost for extractelements with multiple uses + // in different graph entries. 
+ auto *EE = cast<ExtractElementInst>(V); + VecBase = EE->getVectorOperand(); + UniqueBases.insert(VecBase); + const TreeEntry *VE = R.getTreeEntry(V); + if (!CheckedExtracts.insert(V).second || + !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) || + (VE && VE != E)) continue; - if (Idx + NumElts <= EENumElts) { - Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - EEVTy, std::nullopt, CostKind, Idx, VecTy); - } else { - // Need to round up the subvector type vectorization factor to avoid a - // crash in cost model functions. Make SubVT so that Idx + VF of SubVT - // <= EENumElts. - auto *SubVT = - FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx); - Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - EEVTy, std::nullopt, CostKind, Idx, SubVT); + std::optional<unsigned> EEIdx = getExtractIndex(EE); + if (!EEIdx) + continue; + unsigned Idx = *EEIdx; + // Take credit for instruction that will become dead. + if (EE->hasOneUse() || !PrevNodeFound) { + Instruction *Ext = EE->user_back(); + if (isa<SExtInst, ZExtInst>(Ext) && all_of(Ext->users(), [](User *U) { + return isa<GetElementPtrInst>(U); + })) { + // Use getExtractWithExtendCost() to calculate the cost of + // extractelement/ext pair. + Cost -= + TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(), + EE->getVectorOperandType(), Idx); + // Add back the cost of s|zext which is subtracted separately. + Cost += TTI.getCastInstrCost( + Ext->getOpcode(), Ext->getType(), EE->getType(), + TTI::getCastContextHint(Ext), CostKind, Ext); + continue; + } } - } else { - Cost += TTI.getShuffleCost(TargetTransformInfo::SK_InsertSubvector, - VecTy, std::nullopt, CostKind, 0, EEVTy); + Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(), + CostKind, Idx); } } // Check that gather of extractelements can be represented as just a @@ -7192,31 +7504,152 @@ public: // Found the bunch of extractelement instructions that must be gathered // into a vector and can be represented as a permutation elements in a // single input vector or of 2 input vectors. - Cost += computeExtractCost(VL, Mask, ShuffleKind); + // Done for reused if same extractelements were vectorized already. + if (!PrevNodeFound) + Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts); + InVectors.assign(1, E); + CommonMask.assign(Mask.begin(), Mask.end()); + transformMaskAfterShuffle(CommonMask, CommonMask); + SameNodesEstimated = false; + if (NumParts != 1 && UniqueBases.size() != 1) { + UseVecBaseAsInput = true; + VecBase = Constant::getNullValue( + FixedVectorType::get(VL.front()->getType(), CommonMask.size())); + } return VecBase; } - void add(const TreeEntry *E1, const TreeEntry *E2, ArrayRef<int> Mask) { - CommonMask.assign(Mask.begin(), Mask.end()); - InVectors.assign({E1, E2}); + /// Checks if the specified entry \p E needs to be delayed because of its + /// dependency nodes. + std::optional<InstructionCost> + needToDelay(const TreeEntry *, + ArrayRef<SmallVector<const TreeEntry *>>) const { + // No need to delay the cost estimation during analysis. 
+ return std::nullopt; } - void add(const TreeEntry *E1, ArrayRef<int> Mask) { - CommonMask.assign(Mask.begin(), Mask.end()); - InVectors.assign(1, E1); + void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) { + if (&E1 == &E2) { + assert(all_of(Mask, + [&](int Idx) { + return Idx < static_cast<int>(E1.getVectorFactor()); + }) && + "Expected single vector shuffle mask."); + add(E1, Mask); + return; + } + if (InVectors.empty()) { + CommonMask.assign(Mask.begin(), Mask.end()); + InVectors.assign({&E1, &E2}); + return; + } + assert(!CommonMask.empty() && "Expected non-empty common mask."); + auto *MaskVecTy = + FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size()); + unsigned NumParts = TTI.getNumberOfParts(MaskVecTy); + if (NumParts == 0 || NumParts >= Mask.size()) + NumParts = 1; + unsigned SliceSize = Mask.size() / NumParts; + const auto *It = + find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; }); + unsigned Part = std::distance(Mask.begin(), It) / SliceSize; + estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize); + } + void add(const TreeEntry &E1, ArrayRef<int> Mask) { + if (InVectors.empty()) { + CommonMask.assign(Mask.begin(), Mask.end()); + InVectors.assign(1, &E1); + return; + } + assert(!CommonMask.empty() && "Expected non-empty common mask."); + auto *MaskVecTy = + FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size()); + unsigned NumParts = TTI.getNumberOfParts(MaskVecTy); + if (NumParts == 0 || NumParts >= Mask.size()) + NumParts = 1; + unsigned SliceSize = Mask.size() / NumParts; + const auto *It = + find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; }); + unsigned Part = std::distance(Mask.begin(), It) / SliceSize; + estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize); + if (!SameNodesEstimated && InVectors.size() == 1) + InVectors.emplace_back(&E1); + } + /// Adds 2 input vectors and the mask for their shuffling. + void add(Value *V1, Value *V2, ArrayRef<int> Mask) { + // May come only for shuffling of 2 vectors with extractelements, already + // handled in adjustExtracts. + assert(InVectors.size() == 1 && + all_of(enumerate(CommonMask), + [&](auto P) { + if (P.value() == PoisonMaskElem) + return Mask[P.index()] == PoisonMaskElem; + auto *EI = + cast<ExtractElementInst>(InVectors.front() + .get<const TreeEntry *>() + ->Scalars[P.index()]); + return EI->getVectorOperand() == V1 || + EI->getVectorOperand() == V2; + }) && + "Expected extractelement vectors."); } /// Adds another one input vector and the mask for the shuffling. - void add(Value *V1, ArrayRef<int> Mask) { - assert(CommonMask.empty() && InVectors.empty() && - "Expected empty input mask/vectors."); - CommonMask.assign(Mask.begin(), Mask.end()); - InVectors.assign(1, V1); + void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) { + if (InVectors.empty()) { + assert(CommonMask.empty() && !ForExtracts && + "Expected empty input mask/vectors."); + CommonMask.assign(Mask.begin(), Mask.end()); + InVectors.assign(1, V1); + return; + } + if (ForExtracts) { + // No need to add vectors here, already handled them in adjustExtracts. 
+ assert(InVectors.size() == 1 && + InVectors.front().is<const TreeEntry *>() && !CommonMask.empty() && + all_of(enumerate(CommonMask), + [&](auto P) { + Value *Scalar = InVectors.front() + .get<const TreeEntry *>() + ->Scalars[P.index()]; + if (P.value() == PoisonMaskElem) + return P.value() == Mask[P.index()] || + isa<UndefValue>(Scalar); + if (isa<Constant>(V1)) + return true; + auto *EI = cast<ExtractElementInst>(Scalar); + return EI->getVectorOperand() == V1; + }) && + "Expected only tree entry for extractelement vectors."); + return; + } + assert(!InVectors.empty() && !CommonMask.empty() && + "Expected only tree entries from extracts/reused buildvectors."); + unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements(); + if (InVectors.size() == 2) { + Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask); + transformMaskAfterShuffle(CommonMask, CommonMask); + VF = std::max<unsigned>(VF, CommonMask.size()); + } else if (const auto *InTE = + InVectors.front().dyn_cast<const TreeEntry *>()) { + VF = std::max(VF, InTE->getVectorFactor()); + } else { + VF = std::max( + VF, cast<FixedVectorType>(InVectors.front().get<Value *>()->getType()) + ->getNumElements()); + } + InVectors.push_back(V1); + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) + CommonMask[Idx] = Mask[Idx] + VF; } - Value *gather(ArrayRef<Value *> VL, Value *Root = nullptr) { + Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0, + Value *Root = nullptr) { Cost += getBuildVectorCost(VL, Root); if (!Root) { - assert(InVectors.empty() && "Unexpected input vectors for buildvector."); // FIXME: Need to find a way to avoid use of getNullValue here. SmallVector<Constant *> Vals; - for (Value *V : VL) { + unsigned VF = VL.size(); + if (MaskVF != 0) + VF = std::min(VF, MaskVF); + for (Value *V : VL.take_front(VF)) { if (isa<UndefValue>(V)) { Vals.push_back(cast<Constant>(V)); continue; @@ -7226,9 +7659,11 @@ public: return ConstantVector::get(Vals); } return ConstantVector::getSplat( - ElementCount::getFixed(VL.size()), - Constant::getNullValue(VL.front()->getType())); + ElementCount::getFixed( + cast<FixedVectorType>(Root->getType())->getNumElements()), + getAllOnesValue(*R.DL, VL.front()->getType())); } + InstructionCost createFreeze(InstructionCost Cost) { return Cost; } /// Finalize emission of the shuffles. 
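When the add overload above appends another vector input, only lanes that the running CommonMask has not claimed yet are redirected to the new input, offset by the accumulated width VF. A minimal sketch of that merge step (mergeNewInput is an illustrative name):

#include <cstddef>
#include <vector>

constexpr int PoisonElem = -1;

// CommonMask: mask over the inputs added so far; Mask: mask of the new input;
// AccumVF: combined width of the inputs that precede the new one.
void mergeNewInput(std::vector<int> &CommonMask, const std::vector<int> &Mask,
                   unsigned AccumVF) {
  for (size_t Idx = 0; Idx < CommonMask.size(); ++Idx)
    if (Mask[Idx] != PoisonElem && CommonMask[Idx] == PoisonElem)
      CommonMask[Idx] = Mask[Idx] + static_cast<int>(AccumVF);
}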
InstructionCost finalize(ArrayRef<int> ExtMask, unsigned VF = 0, @@ -7236,31 +7671,24 @@ public: IsFinalized = true; if (Action) { const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front(); - if (InVectors.size() == 2) { + if (InVectors.size() == 2) Cost += createShuffle(Vec, InVectors.back(), CommonMask); - InVectors.pop_back(); - } else { + else Cost += createShuffle(Vec, nullptr, CommonMask); - } for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) if (CommonMask[Idx] != PoisonMaskElem) CommonMask[Idx] = Idx; assert(VF > 0 && "Expected vector length for the final value before action."); - Value *V = Vec.dyn_cast<Value *>(); - if (!Vec.isNull() && !V) - V = Constant::getNullValue(FixedVectorType::get( - Vec.get<const TreeEntry *>()->Scalars.front()->getType(), - CommonMask.size())); + Value *V = Vec.get<Value *>(); Action(V, CommonMask); + InVectors.front() = V; } ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true); - if (CommonMask.empty()) - return Cost; - int Limit = CommonMask.size() * 2; - if (all_of(CommonMask, [=](int Idx) { return Idx < Limit; }) && - ShuffleVectorInst::isIdentityMask(CommonMask)) + if (CommonMask.empty()) { + assert(InVectors.size() == 1 && "Expected only one vector with no mask"); return Cost; + } return Cost + createShuffle(InVectors.front(), InVectors.size() == 2 ? InVectors.back() : nullptr, @@ -7273,28 +7701,63 @@ public: } }; +const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E, + unsigned Idx) const { + Value *Op = E->getOperand(Idx).front(); + if (const TreeEntry *TE = getTreeEntry(Op)) { + if (find_if(E->UserTreeIndices, [&](const EdgeInfo &EI) { + return EI.EdgeIdx == Idx && EI.UserTE == E; + }) != TE->UserTreeIndices.end()) + return TE; + auto MIt = MultiNodeScalars.find(Op); + if (MIt != MultiNodeScalars.end()) { + for (const TreeEntry *TE : MIt->second) { + if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) { + return EI.EdgeIdx == Idx && EI.UserTE == E; + }) != TE->UserTreeIndices.end()) + return TE; + } + } + } + const auto *It = + find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) { + return TE->State == TreeEntry::NeedToGather && + find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) { + return EI.EdgeIdx == Idx && EI.UserTE == E; + }) != TE->UserTreeIndices.end(); + }); + assert(It != VectorizableTree.end() && "Expected vectorizable entry."); + return It->get(); +} + InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, SmallPtrSetImpl<Value *> &CheckedExtracts) { ArrayRef<Value *> VL = E->Scalars; Type *ScalarTy = VL[0]->getType(); - if (auto *SI = dyn_cast<StoreInst>(VL[0])) - ScalarTy = SI->getValueOperand()->getType(); - else if (auto *CI = dyn_cast<CmpInst>(VL[0])) - ScalarTy = CI->getOperand(0)->getType(); - else if (auto *IE = dyn_cast<InsertElementInst>(VL[0])) - ScalarTy = IE->getOperand(1)->getType(); + if (E->State != TreeEntry::NeedToGather) { + if (auto *SI = dyn_cast<StoreInst>(VL[0])) + ScalarTy = SI->getValueOperand()->getType(); + else if (auto *CI = dyn_cast<CmpInst>(VL[0])) + ScalarTy = CI->getOperand(0)->getType(); + else if (auto *IE = dyn_cast<InsertElementInst>(VL[0])) + ScalarTy = IE->getOperand(1)->getType(); + } + if (!FixedVectorType::isValidElementType(ScalarTy)) + return InstructionCost::getInvalid(); auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; // If we have computed a smaller type for the expression, update VecTy so // that the costs 
will be accurate. - if (MinBWs.count(VL[0])) - VecTy = FixedVectorType::get( - IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); + auto It = MinBWs.find(E); + if (It != MinBWs.end()) { + ScalarTy = IntegerType::get(F->getContext(), It->second.first); + VecTy = FixedVectorType::get(ScalarTy, VL.size()); + } unsigned EntryVF = E->getVectorFactor(); - auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF); + auto *FinalVecTy = FixedVectorType::get(ScalarTy, EntryVF); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); if (E->State == TreeEntry::NeedToGather) { @@ -7302,121 +7765,13 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, return 0; if (isa<InsertElementInst>(VL[0])) return InstructionCost::getInvalid(); - ShuffleCostEstimator Estimator(*TTI, VectorizedVals, *this, - CheckedExtracts); - unsigned VF = E->getVectorFactor(); - SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(), - E->ReuseShuffleIndices.end()); - SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end()); - // Build a mask out of the reorder indices and reorder scalars per this - // mask. - SmallVector<int> ReorderMask; - inversePermutation(E->ReorderIndices, ReorderMask); - if (!ReorderMask.empty()) - reorderScalars(GatheredScalars, ReorderMask); - SmallVector<int> Mask; - SmallVector<int> ExtractMask; - std::optional<TargetTransformInfo::ShuffleKind> ExtractShuffle; - std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle; - SmallVector<const TreeEntry *> Entries; - Type *ScalarTy = GatheredScalars.front()->getType(); - // Check for gathered extracts. - ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask); - SmallVector<Value *> IgnoredVals; - if (UserIgnoreList) - IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end()); - - bool Resized = false; - if (Value *VecBase = Estimator.adjustExtracts( - E, ExtractMask, ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc))) - if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType())) - if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) { - Resized = true; - GatheredScalars.append(VF - GatheredScalars.size(), - PoisonValue::get(ScalarTy)); - } - - // Do not try to look for reshuffled loads for gathered loads (they will be - // handled later), for vectorized scalars, and cases, which are definitely - // not profitable (splats and small gather nodes.) - if (ExtractShuffle || E->getOpcode() != Instruction::Load || - E->isAltShuffle() || - all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || - isSplat(E->Scalars) || - (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) - GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries); - if (GatherShuffle) { - assert((Entries.size() == 1 || Entries.size() == 2) && - "Expected shuffle of 1 or 2 entries."); - if (*GatherShuffle == TTI::SK_PermuteSingleSrc && - Entries.front()->isSame(E->Scalars)) { - // Perfect match in the graph, will reuse the previously vectorized - // node. Cost is 0. - LLVM_DEBUG( - dbgs() - << "SLP: perfect diamond match for gather bundle that starts with " - << *VL.front() << ".\n"); - // Restore the mask for previous partially matched values. 
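The ReorderIndices handling above (and the removed gather path) converts between an order and a shuffle mask via inversePermutation; a sketch of that relation, assuming the convention used in this file that Mask[Indices[I]] = I:

#include <cstddef>
#include <vector>

constexpr int PoisonElem = -1;

// Indices[I] is the destination lane of source element I; the inverse mask
// then tells each destination lane which source element to read, so that
// applyMask(Src, invertOrder(Indices))[Indices[I]] == Src[I].
std::vector<int> invertOrder(const std::vector<unsigned> &Indices) {
  std::vector<int> Mask(Indices.size(), PoisonElem);
  for (size_t I = 0; I < Indices.size(); ++I)
    Mask[Indices[I]] = static_cast<int>(I);
  return Mask;
}

template <typename T>
std::vector<T> applyMask(const std::vector<T> &Src,
                         const std::vector<int> &Mask) {
  std::vector<T> Dst(Mask.size());
  for (size_t I = 0; I < Mask.size(); ++I)
    if (Mask[I] != PoisonElem)
      Dst[I] = Src[Mask[I]];
  return Dst;
}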
- for (auto [I, V] : enumerate(E->Scalars)) { - if (isa<PoisonValue>(V)) { - Mask[I] = PoisonMaskElem; - continue; - } - if (Mask[I] == PoisonMaskElem) - Mask[I] = Entries.front()->findLaneForValue(V); - } - Estimator.add(Entries.front(), Mask); - return Estimator.finalize(E->ReuseShuffleIndices); - } - if (!Resized) { - unsigned VF1 = Entries.front()->getVectorFactor(); - unsigned VF2 = Entries.back()->getVectorFactor(); - if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF) - GatheredScalars.append(VF - GatheredScalars.size(), - PoisonValue::get(ScalarTy)); - } - // Remove shuffled elements from list of gathers. - for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { - if (Mask[I] != PoisonMaskElem) - GatheredScalars[I] = PoisonValue::get(ScalarTy); - } - LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size() - << " entries for bundle that starts with " - << *VL.front() << ".\n";); - if (Entries.size() == 1) - Estimator.add(Entries.front(), Mask); - else - Estimator.add(Entries.front(), Entries.back(), Mask); - if (all_of(GatheredScalars, PoisonValue ::classof)) - return Estimator.finalize(E->ReuseShuffleIndices); - return Estimator.finalize( - E->ReuseShuffleIndices, E->Scalars.size(), - [&](Value *&Vec, SmallVectorImpl<int> &Mask) { - Vec = Estimator.gather(GatheredScalars, - Constant::getNullValue(FixedVectorType::get( - GatheredScalars.front()->getType(), - GatheredScalars.size()))); - }); - } - if (!all_of(GatheredScalars, PoisonValue::classof)) { - auto Gathers = ArrayRef(GatheredScalars).take_front(VL.size()); - bool SameGathers = VL.equals(Gathers); - Value *BV = Estimator.gather( - Gathers, SameGathers ? nullptr - : Constant::getNullValue(FixedVectorType::get( - GatheredScalars.front()->getType(), - GatheredScalars.size()))); - SmallVector<int> ReuseMask(Gathers.size(), PoisonMaskElem); - std::iota(ReuseMask.begin(), ReuseMask.end(), 0); - Estimator.add(BV, ReuseMask); - } - if (ExtractShuffle) - Estimator.add(E, std::nullopt); - return Estimator.finalize(E->ReuseShuffleIndices); + return processBuildVector<ShuffleCostEstimator, InstructionCost>( + E, *TTI, VectorizedVals, *this, CheckedExtracts); } InstructionCost CommonCost = 0; SmallVector<int> Mask; - if (!E->ReorderIndices.empty()) { + if (!E->ReorderIndices.empty() && + E->State != TreeEntry::PossibleStridedVectorize) { SmallVector<int> NewMask; if (E->getOpcode() == Instruction::Store) { // For stores the order is actually a mask. @@ -7429,11 +7784,12 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, } if (NeedToShuffleReuses) ::addMask(Mask, E->ReuseShuffleIndices); - if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask)) + if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size())) CommonCost = TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask); assert((E->State == TreeEntry::Vectorize || - E->State == TreeEntry::ScatterVectorize) && + E->State == TreeEntry::ScatterVectorize || + E->State == TreeEntry::PossibleStridedVectorize) && "Unhandled state"); assert(E->getOpcode() && ((allSameType(VL) && allSameBlock(VL)) || @@ -7443,7 +7799,34 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, Instruction *VL0 = E->getMainOp(); unsigned ShuffleOrOp = E->isAltShuffle() ? 
(unsigned)Instruction::ShuffleVector : E->getOpcode(); - const unsigned Sz = VL.size(); + SetVector<Value *> UniqueValues(VL.begin(), VL.end()); + const unsigned Sz = UniqueValues.size(); + SmallBitVector UsedScalars(Sz, false); + for (unsigned I = 0; I < Sz; ++I) { + if (getTreeEntry(UniqueValues[I]) == E) + continue; + UsedScalars.set(I); + } + auto GetCastContextHint = [&](Value *V) { + if (const TreeEntry *OpTE = getTreeEntry(V)) { + if (OpTE->State == TreeEntry::ScatterVectorize) + return TTI::CastContextHint::GatherScatter; + if (OpTE->State == TreeEntry::Vectorize && + OpTE->getOpcode() == Instruction::Load && !OpTE->isAltShuffle()) { + if (OpTE->ReorderIndices.empty()) + return TTI::CastContextHint::Normal; + SmallVector<int> Mask; + inversePermutation(OpTE->ReorderIndices, Mask); + if (ShuffleVectorInst::isReverseMask(Mask, Mask.size())) + return TTI::CastContextHint::Reversed; + } + } else { + InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI); + if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle()) + return TTI::CastContextHint::GatherScatter; + } + return TTI::CastContextHint::None; + }; auto GetCostDiff = [=](function_ref<InstructionCost(unsigned)> ScalarEltCost, function_ref<InstructionCost(InstructionCost)> VectorCost) { @@ -7453,13 +7836,49 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, // For some of the instructions no need to calculate cost for each // particular instruction, we can use the cost of the single // instruction x total number of scalar instructions. - ScalarCost = Sz * ScalarEltCost(0); + ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0); } else { - for (unsigned I = 0; I < Sz; ++I) + for (unsigned I = 0; I < Sz; ++I) { + if (UsedScalars.test(I)) + continue; ScalarCost += ScalarEltCost(I); + } } InstructionCost VecCost = VectorCost(CommonCost); + // Check if the current node must be resized, if the parent node is not + // resized. + if (!UnaryInstruction::isCast(E->getOpcode()) && E->Idx != 0) { + const EdgeInfo &EI = E->UserTreeIndices.front(); + if ((EI.UserTE->getOpcode() != Instruction::Select || + EI.EdgeIdx != 0) && + It != MinBWs.end()) { + auto UserBWIt = MinBWs.find(EI.UserTE); + Type *UserScalarTy = + EI.UserTE->getOperand(EI.EdgeIdx).front()->getType(); + if (UserBWIt != MinBWs.end()) + UserScalarTy = IntegerType::get(ScalarTy->getContext(), + UserBWIt->second.first); + if (ScalarTy != UserScalarTy) { + unsigned BWSz = DL->getTypeSizeInBits(ScalarTy); + unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy); + unsigned VecOpcode; + auto *SrcVecTy = + FixedVectorType::get(UserScalarTy, E->getVectorFactor()); + if (BWSz > SrcBWSz) + VecOpcode = Instruction::Trunc; + else + VecOpcode = + It->second.second ? Instruction::SExt : Instruction::ZExt; + TTI::CastContextHint CCH = GetCastContextHint(VL0); + VecCost += TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, + CostKind); + ScalarCost += + Sz * TTI->getCastInstrCost(VecOpcode, ScalarTy, UserScalarTy, + CCH, CostKind); + } + } + } LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost, ScalarCost, "Calculated costs for Tree")); return VecCost - ScalarCost; @@ -7550,7 +7969,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, // Count reused scalars. 
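The hunk above restricts the scalar side of the cost to lanes that are not already vectorized as part of another node (the UsedScalars bitvector), and, when a MinBWs entry disagrees with the width the user node expects, adds the cost of a resizing cast: equal widths are a free bitcast, a narrower destination is a trunc, a wider one is a sext or zext depending on the recorded signedness. A standalone sketch of that opcode choice follows; CastKind and pickCast are invented names for illustration only.

#include <cstdio>

enum class CastKind { BitCast, Trunc, SExt, ZExt };

// Selects the cast needed to move between the demoted width and the width a
// user actually requires, mirroring the rule used in the hunk above.
static CastKind pickCast(unsigned DstBits, unsigned SrcBits, bool IsSigned) {
  if (DstBits == SrcBits)
    return CastKind::BitCast; // a no-op as far as the cost model is concerned
  if (DstBits < SrcBits)
    return CastKind::Trunc;
  return IsSigned ? CastKind::SExt : CastKind::ZExt;
}

int main() {
  // A value demoted to i16 that must be widened back to i32, signed.
  CastKind K = pickCast(/*DstBits=*/32, /*SrcBits=*/16, /*IsSigned=*/true);
  std::puts(K == CastKind::SExt ? "extend with sext" : "other cast");
  return 0;
}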
InstructionCost ScalarCost = 0; SmallPtrSet<const TreeEntry *, 4> CountedOps; - for (Value *V : VL) { + for (Value *V : UniqueValues) { auto *PHI = dyn_cast<PHINode>(V); if (!PHI) continue; @@ -7571,8 +7990,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, } case Instruction::ExtractValue: case Instruction::ExtractElement: { - auto GetScalarCost = [=](unsigned Idx) { - auto *I = cast<Instruction>(VL[Idx]); + auto GetScalarCost = [&](unsigned Idx) { + auto *I = cast<Instruction>(UniqueValues[Idx]); VectorType *SrcVecTy; if (ShuffleOrOp == Instruction::ExtractElement) { auto *EE = cast<ExtractElementInst>(I); @@ -7680,8 +8099,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, // need to shift the vector. // Do not calculate the cost if the actual size is the register size and // we can merge this shuffle with the following SK_Select. - auto *InsertVecTy = - FixedVectorType::get(SrcVecTy->getElementType(), InsertVecSz); + auto *InsertVecTy = FixedVectorType::get(ScalarTy, InsertVecSz); if (!IsIdentity) Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, InsertVecTy, Mask); @@ -7697,8 +8115,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask)); if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) { if (InsertVecSz != VecSz) { - auto *ActualVecTy = - FixedVectorType::get(SrcVecTy->getElementType(), VecSz); + auto *ActualVecTy = FixedVectorType::get(ScalarTy, VecSz); Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy, std::nullopt, CostKind, OffsetBeg - Offset, InsertVecTy); @@ -7729,22 +8146,52 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { - auto GetScalarCost = [=](unsigned Idx) { - auto *VI = cast<Instruction>(VL[Idx]); - return TTI->getCastInstrCost(E->getOpcode(), ScalarTy, - VI->getOperand(0)->getType(), + auto SrcIt = MinBWs.find(getOperandEntry(E, 0)); + Type *SrcScalarTy = VL0->getOperand(0)->getType(); + auto *SrcVecTy = FixedVectorType::get(SrcScalarTy, VL.size()); + unsigned Opcode = ShuffleOrOp; + unsigned VecOpcode = Opcode; + if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() && + (SrcIt != MinBWs.end() || It != MinBWs.end())) { + // Check if the values are candidates to demote. + unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy); + if (SrcIt != MinBWs.end()) { + SrcBWSz = SrcIt->second.first; + SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz); + SrcVecTy = FixedVectorType::get(SrcScalarTy, VL.size()); + } + unsigned BWSz = DL->getTypeSizeInBits(ScalarTy); + if (BWSz == SrcBWSz) { + VecOpcode = Instruction::BitCast; + } else if (BWSz < SrcBWSz) { + VecOpcode = Instruction::Trunc; + } else if (It != MinBWs.end()) { + assert(BWSz > SrcBWSz && "Invalid cast!"); + VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt; + } + } + auto GetScalarCost = [&](unsigned Idx) -> InstructionCost { + // Do not count cost here if minimum bitwidth is in effect and it is just + // a bitcast (here it is just a noop). + if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast) + return TTI::TCC_Free; + auto *VI = VL0->getOpcode() == Opcode + ? 
cast<Instruction>(UniqueValues[Idx]) + : nullptr; + return TTI->getCastInstrCost(Opcode, VL0->getType(), + VL0->getOperand(0)->getType(), TTI::getCastContextHint(VI), CostKind, VI); }; auto GetVectorCost = [=](InstructionCost CommonCost) { - Type *SrcTy = VL0->getOperand(0)->getType(); - auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size()); - InstructionCost VecCost = CommonCost; - // Check if the values are candidates to demote. - if (!MinBWs.count(VL0) || VecTy != SrcVecTy) - VecCost += - TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy, - TTI::getCastContextHint(VL0), CostKind, VL0); - return VecCost; + // Do not count cost here if minimum bitwidth is in effect and it is just + // a bitcast (here it is just a noop). + if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast) + return CommonCost; + auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr; + TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0)); + return CommonCost + + TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind, + VecOpcode == Opcode ? VI : nullptr); }; return GetCostDiff(GetScalarCost, GetVectorCost); } @@ -7761,7 +8208,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, ? CmpInst::BAD_FCMP_PREDICATE : CmpInst::BAD_ICMP_PREDICATE; auto GetScalarCost = [&](unsigned Idx) { - auto *VI = cast<Instruction>(VL[Idx]); + auto *VI = cast<Instruction>(UniqueValues[Idx]); CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy() ? CmpInst::BAD_FCMP_PREDICATE : CmpInst::BAD_ICMP_PREDICATE; @@ -7821,8 +8268,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, case Instruction::And: case Instruction::Or: case Instruction::Xor: { - auto GetScalarCost = [=](unsigned Idx) { - auto *VI = cast<Instruction>(VL[Idx]); + auto GetScalarCost = [&](unsigned Idx) { + auto *VI = cast<Instruction>(UniqueValues[Idx]); unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1; TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0)); TTI::OperandValueInfo Op2Info = @@ -7833,8 +8280,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, }; auto GetVectorCost = [=](InstructionCost CommonCost) { unsigned OpIdx = isa<UnaryOperator>(VL0) ? 
0 : 1; - TTI::OperandValueInfo Op1Info = getOperandInfo(VL, 0); - TTI::OperandValueInfo Op2Info = getOperandInfo(VL, OpIdx); + TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0)); + TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx)); return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info, Op2Info) + CommonCost; @@ -7845,23 +8292,25 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, return CommonCost + GetGEPCostDiff(VL, VL0); } case Instruction::Load: { - auto GetScalarCost = [=](unsigned Idx) { - auto *VI = cast<LoadInst>(VL[Idx]); + auto GetScalarCost = [&](unsigned Idx) { + auto *VI = cast<LoadInst>(UniqueValues[Idx]); return TTI->getMemoryOpCost(Instruction::Load, ScalarTy, VI->getAlign(), VI->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo(), VI); }; auto *LI0 = cast<LoadInst>(VL0); - auto GetVectorCost = [=](InstructionCost CommonCost) { + auto GetVectorCost = [&](InstructionCost CommonCost) { InstructionCost VecLdCost; if (E->State == TreeEntry::Vectorize) { VecLdCost = TTI->getMemoryOpCost( Instruction::Load, VecTy, LI0->getAlign(), LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo()); } else { - assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState"); + assert((E->State == TreeEntry::ScatterVectorize || + E->State == TreeEntry::PossibleStridedVectorize) && + "Unknown EntryState"); Align CommonAlignment = LI0->getAlign(); - for (Value *V : VL) + for (Value *V : UniqueValues) CommonAlignment = std::min(CommonAlignment, cast<LoadInst>(V)->getAlign()); VecLdCost = TTI->getGatherScatterOpCost( @@ -7874,7 +8323,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost); // If this node generates masked gather load then it is not a terminal node. // Hence address operand cost is estimated separately. - if (E->State == TreeEntry::ScatterVectorize) + if (E->State == TreeEntry::ScatterVectorize || + E->State == TreeEntry::PossibleStridedVectorize) return Cost; // Estimate cost of GEPs since this tree node is a terminator. @@ -7887,7 +8337,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, bool IsReorder = !E->ReorderIndices.empty(); auto GetScalarCost = [=](unsigned Idx) { auto *VI = cast<StoreInst>(VL[Idx]); - TTI::OperandValueInfo OpInfo = getOperandInfo(VI, 0); + TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand()); return TTI->getMemoryOpCost(Instruction::Store, ScalarTy, VI->getAlign(), VI->getPointerAddressSpace(), CostKind, OpInfo, VI); @@ -7896,7 +8346,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0); auto GetVectorCost = [=](InstructionCost CommonCost) { // We know that we can merge the stores. Calculate the cost. 
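The load-cost hunk above prices a contiguous bundle as one wide vector load, while scatter or possibly-strided entries are priced as a gather whose alignment is the weakest (minimum) alignment among the member loads. A standalone sketch of that choice, with purely illustrative names and no real cost numbers:

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  std::vector<unsigned> MemberAlign = {16, 8, 16, 4}; // bytes, per scalar load
  bool Contiguous = false; // scatter/strided entries take the gather path
  unsigned CommonAlign = MemberAlign.front();
  for (unsigned A : MemberAlign)
    CommonAlign = std::min(CommonAlign, A); // weakest member alignment wins
  if (Contiguous)
    std::printf("price one wide load\n");
  else
    std::printf("price a masked gather at align %u\n", CommonAlign);
  return 0;
}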
- TTI::OperandValueInfo OpInfo = getOperandInfo(VL, 0); + TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0)); return TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(), BaseSI->getPointerAddressSpace(), CostKind, OpInfo) + @@ -7912,8 +8362,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand()); } case Instruction::Call: { - auto GetScalarCost = [=](unsigned Idx) { - auto *CI = cast<CallInst>(VL[Idx]); + auto GetScalarCost = [&](unsigned Idx) { + auto *CI = cast<CallInst>(UniqueValues[Idx]); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (ID != Intrinsic::not_intrinsic) { IntrinsicCostAttributes CostAttrs(ID, *CI, 1); @@ -7954,8 +8404,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, } return false; }; - auto GetScalarCost = [=](unsigned Idx) { - auto *VI = cast<Instruction>(VL[Idx]); + auto GetScalarCost = [&](unsigned Idx) { + auto *VI = cast<Instruction>(UniqueValues[Idx]); assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode"); (void)E; return TTI->getInstructionCost(VI, CostKind); @@ -7995,21 +8445,15 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty, TTI::CastContextHint::None, CostKind); } - if (E->ReuseShuffleIndices.empty()) { - VecCost += - TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy); - } else { - SmallVector<int> Mask; - buildShuffleEntryMask( - E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, - [E](Instruction *I) { - assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); - return I->getOpcode() == E->getAltOpcode(); - }, - Mask); - VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, - FinalVecTy, Mask); - } + SmallVector<int> Mask; + E->buildAltOpShuffleMask( + [E](Instruction *I) { + assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); + return I->getOpcode() == E->getAltOpcode(); + }, + Mask); + VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, + FinalVecTy, Mask); return VecCost; }; return GetCostDiff(GetScalarCost, GetVectorCost); @@ -8065,7 +8509,8 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const { // Gathering cost would be too much for tiny trees. if (VectorizableTree[0]->State == TreeEntry::NeedToGather || (VectorizableTree[1]->State == TreeEntry::NeedToGather && - VectorizableTree[0]->State != TreeEntry::ScatterVectorize)) + VectorizableTree[0]->State != TreeEntry::ScatterVectorize && + VectorizableTree[0]->State != TreeEntry::PossibleStridedVectorize)) return false; return true; @@ -8144,6 +8589,23 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { allConstant(VectorizableTree[1]->Scalars)))) return true; + // If the graph includes only PHI nodes and gathers, it is defnitely not + // profitable for the vectorization, we can skip it, if the cost threshold is + // default. The cost of vectorized PHI nodes is almost always 0 + the cost of + // gathers/buildvectors. 
+ constexpr int Limit = 4; + if (!ForReduction && !SLPCostThreshold.getNumOccurrences() && + !VectorizableTree.empty() && + all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) { + return (TE->State == TreeEntry::NeedToGather && + TE->getOpcode() != Instruction::ExtractElement && + count_if(TE->Scalars, + [](Value *V) { return isa<ExtractElementInst>(V); }) <= + Limit) || + TE->getOpcode() == Instruction::PHI; + })) + return true; + // We can vectorize the tree if its size is greater than or equal to the // minimum size specified by the MinTreeSize command line option. if (VectorizableTree.size() >= MinTreeSize) @@ -8435,16 +8897,6 @@ static T *performExtractsShuffleAction( } InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { - // Build a map for gathered scalars to the nodes where they are used. - ValueToGatherNodes.clear(); - for (const std::unique_ptr<TreeEntry> &EntryPtr : VectorizableTree) { - if (EntryPtr->State != TreeEntry::NeedToGather) - continue; - for (Value *V : EntryPtr->Scalars) - if (!isConstant(V)) - ValueToGatherNodes.try_emplace(V).first->getSecond().insert( - EntryPtr.get()); - } InstructionCost Cost = 0; LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << VectorizableTree.size() << ".\n"); @@ -8460,8 +8912,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { E->isSame(TE.Scalars)) { // Some gather nodes might be absolutely the same as some vectorizable // nodes after reordering, need to handle it. - LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle that starts with " - << *TE.Scalars[0] << ".\n" + LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle " + << shortBundleName(TE.Scalars) << ".\n" << "SLP: Current total cost = " << Cost << "\n"); continue; } @@ -8469,9 +8921,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts); Cost += C; - LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C - << " for bundle that starts with " << *TE.Scalars[0] - << ".\n" + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle " + << shortBundleName(TE.Scalars) << ".\n" << "SLP: Current total cost = " << Cost << "\n"); } @@ -8480,6 +8931,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks; SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers; SmallVector<APInt> DemandedElts; + SmallDenseSet<Value *, 4> UsedInserts; + DenseSet<Value *> VectorCasts; for (ExternalUser &EU : ExternalUses) { // We only add extract cost once for the same scalar. if (!isa_and_nonnull<InsertElementInst>(EU.User) && @@ -8500,6 +8953,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { // to detect it as a final shuffled/identity match. 
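The check added at the top of this hunk treats a tree as not worth vectorizing when, under the default cost threshold, every node is either a PHI or a gather containing at most a handful of extractelements. A simplified standalone sketch of that predicate (the structs and names are invented; only the shape of the condition and the Limit of 4 come from the patch):

#include <algorithm>
#include <cstdio>
#include <vector>

struct NodeDesc {
  bool IsGather;
  bool IsPHI;
  int NumExtractElements; // extractelement scalars inside a gather node
};

static bool looksUnprofitable(const std::vector<NodeDesc> &Tree) {
  constexpr int Limit = 4;
  return !Tree.empty() &&
         std::all_of(Tree.begin(), Tree.end(), [](const NodeDesc &N) {
           return (N.IsGather && N.NumExtractElements <= Limit) || N.IsPHI;
         });
}

int main() {
  std::vector<NodeDesc> Tree = {{false, true, 0}, {true, false, 2}};
  std::puts(looksUnprofitable(Tree) ? "skip: only PHIs and cheap gathers"
                                    : "keep: something real to vectorize");
  return 0;
}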
if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) { if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) { + if (!UsedInserts.insert(VU).second) + continue; std::optional<unsigned> InsertIdx = getInsertIndex(VU); if (InsertIdx) { const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar); @@ -8546,6 +9001,28 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { FirstUsers.emplace_back(VU, ScalarTE); DemandedElts.push_back(APInt::getZero(FTy->getNumElements())); VecId = FirstUsers.size() - 1; + auto It = MinBWs.find(ScalarTE); + if (It != MinBWs.end() && VectorCasts.insert(EU.Scalar).second) { + unsigned BWSz = It->second.second; + unsigned SrcBWSz = DL->getTypeSizeInBits(FTy->getElementType()); + unsigned VecOpcode; + if (BWSz < SrcBWSz) + VecOpcode = Instruction::Trunc; + else + VecOpcode = + It->second.second ? Instruction::SExt : Instruction::ZExt; + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + InstructionCost C = TTI->getCastInstrCost( + VecOpcode, FTy, + FixedVectorType::get( + IntegerType::get(FTy->getContext(), It->second.first), + FTy->getNumElements()), + TTI::CastContextHint::None, CostKind); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + << " for extending externally used vector with " + "non-equal minimum bitwidth.\n"); + Cost += C; + } } else { if (isFirstInsertElement(VU, cast<InsertElementInst>(It->first))) It->first = VU; @@ -8567,11 +9044,11 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { // for the extract and the added cost of the sign extend if needed. auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth); TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - auto *ScalarRoot = VectorizableTree[0]->Scalars[0]; - if (MinBWs.count(ScalarRoot)) { - auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); - auto Extend = - MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt; + auto It = MinBWs.find(getTreeEntry(EU.Scalar)); + if (It != MinBWs.end()) { + auto *MinTy = IntegerType::get(F->getContext(), It->second.first); + unsigned Extend = + It->second.second ? Instruction::SExt : Instruction::ZExt; VecTy = FixedVectorType::get(MinTy, BundleWidth); ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), VecTy, EU.Lane); @@ -8580,6 +9057,21 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { CostKind, EU.Lane); } } + // Add reduced value cost, if resized. + if (!VectorizedVals.empty()) { + auto BWIt = MinBWs.find(VectorizableTree.front().get()); + if (BWIt != MinBWs.end()) { + Type *DstTy = VectorizableTree.front()->Scalars.front()->getType(); + unsigned OriginalSz = DL->getTypeSizeInBits(DstTy); + unsigned Opcode = Instruction::Trunc; + if (OriginalSz < BWIt->second.first) + Opcode = BWIt->second.second ? 
Instruction::SExt : Instruction::ZExt; + Type *SrcTy = IntegerType::get(DstTy->getContext(), BWIt->second.first); + Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy, + TTI::CastContextHint::None, + TTI::TCK_RecipThroughput); + } + } InstructionCost SpillCost = getSpillCost(); Cost += SpillCost + ExtractCost; @@ -8590,9 +9082,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { unsigned VecVF = TE->getVectorFactor(); if (VF != VecVF && (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) || - (all_of(Mask, - [VF](int Idx) { return Idx < 2 * static_cast<int>(VF); }) && - !ShuffleVectorInst::isIdentityMask(Mask)))) { + !ShuffleVectorInst::isIdentityMask(Mask, VF))) { SmallVector<int> OrigMask(VecVF, PoisonMaskElem); std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)), OrigMask.begin()); @@ -8611,19 +9101,23 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { // Calculate the cost of the reshuffled vectors, if any. for (int I = 0, E = FirstUsers.size(); I < E; ++I) { Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0); - unsigned VF = ShuffleMasks[I].begin()->second.size(); - auto *FTy = FixedVectorType::get( - cast<VectorType>(FirstUsers[I].first->getType())->getElementType(), VF); auto Vector = ShuffleMasks[I].takeVector(); - auto &&EstimateShufflesCost = [this, FTy, - &Cost](ArrayRef<int> Mask, - ArrayRef<const TreeEntry *> TEs) { + unsigned VF = 0; + auto EstimateShufflesCost = [&](ArrayRef<int> Mask, + ArrayRef<const TreeEntry *> TEs) { assert((TEs.size() == 1 || TEs.size() == 2) && "Expected exactly 1 or 2 tree entries."); if (TEs.size() == 1) { - int Limit = 2 * Mask.size(); - if (!all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) || - !ShuffleVectorInst::isIdentityMask(Mask)) { + if (VF == 0) + VF = TEs.front()->getVectorFactor(); + auto *FTy = + FixedVectorType::get(TEs.back()->Scalars.front()->getType(), VF); + if (!ShuffleVectorInst::isIdentityMask(Mask, VF) && + !all_of(enumerate(Mask), [=](const auto &Data) { + return Data.value() == PoisonMaskElem || + (Data.index() < VF && + static_cast<int>(Data.index()) == Data.value()); + })) { InstructionCost C = TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FTy, Mask); LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C @@ -8634,6 +9128,15 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { Cost += C; } } else { + if (VF == 0) { + if (TEs.front() && + TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor()) + VF = TEs.front()->getVectorFactor(); + else + VF = Mask.size(); + } + auto *FTy = + FixedVectorType::get(TEs.back()->Scalars.front()->getType(), VF); InstructionCost C = TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, FTy, Mask); LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C @@ -8643,6 +9146,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { dbgs() << "SLP: Current total cost = " << Cost << "\n"); Cost += C; } + VF = Mask.size(); return TEs.back(); }; (void)performExtractsShuffleAction<const TreeEntry>( @@ -8671,54 +9175,198 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { return Cost; } -std::optional<TargetTransformInfo::ShuffleKind> -BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, - SmallVectorImpl<int> &Mask, - SmallVectorImpl<const TreeEntry *> &Entries) { - Entries.clear(); - // No need to check for the topmost gather node. 
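The external-use walk above charges one extract (plus an extend when the scalar was demoted) for every value that escapes the tree, but deduplicates so that the same scalar, or the same insertelement user, is only counted once. A heavily simplified, standalone sketch of that bookkeeping; the flat cost of 1 and all names are illustrative only.

#include <cstdio>
#include <set>
#include <string>
#include <utility>
#include <vector>

int main() {
  // (scalar, user) pairs recorded as external uses.
  std::vector<std::pair<std::string, std::string>> ExternalUses = {
      {"%a", "%user1"}, {"%a", "%user2"}, {"%b", "%user1"}};
  std::set<std::string> Counted;
  int ExtractCost = 0;
  for (const auto &EU : ExternalUses) {
    if (!Counted.insert(EU.first).second)
      continue; // already paid for extracting this scalar
    ExtractCost += 1; // one extractelement, plus an extend if demoted
  }
  std::printf("modeled extract cost = %d\n", ExtractCost);
  return 0;
}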
- if (TE == VectorizableTree.front().get()) +/// Tries to find extractelement instructions with constant indices from fixed +/// vector type and gather such instructions into a bunch, which highly likely +/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was +/// successful, the matched scalars are replaced by poison values in \p VL for +/// future analysis. +std::optional<TTI::ShuffleKind> +BoUpSLP::tryToGatherSingleRegisterExtractElements( + MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const { + // Scan list of gathered scalars for extractelements that can be represented + // as shuffles. + MapVector<Value *, SmallVector<int>> VectorOpToIdx; + SmallVector<int> UndefVectorExtracts; + for (int I = 0, E = VL.size(); I < E; ++I) { + auto *EI = dyn_cast<ExtractElementInst>(VL[I]); + if (!EI) { + if (isa<UndefValue>(VL[I])) + UndefVectorExtracts.push_back(I); + continue; + } + auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType()); + if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand())) + continue; + std::optional<unsigned> Idx = getExtractIndex(EI); + // Undefined index. + if (!Idx) { + UndefVectorExtracts.push_back(I); + continue; + } + SmallBitVector ExtractMask(VecTy->getNumElements(), true); + ExtractMask.reset(*Idx); + if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) { + UndefVectorExtracts.push_back(I); + continue; + } + VectorOpToIdx[EI->getVectorOperand()].push_back(I); + } + // Sort the vector operands by the maximum number of uses in extractelements. + MapVector<unsigned, SmallVector<Value *>> VFToVector; + for (const auto &Data : VectorOpToIdx) + VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()] + .push_back(Data.first); + for (auto &Data : VFToVector) { + stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) { + return VectorOpToIdx.find(V1)->second.size() > + VectorOpToIdx.find(V2)->second.size(); + }); + } + // Find the best pair of the vectors with the same number of elements or a + // single vector. + const int UndefSz = UndefVectorExtracts.size(); + unsigned SingleMax = 0; + Value *SingleVec = nullptr; + unsigned PairMax = 0; + std::pair<Value *, Value *> PairVec(nullptr, nullptr); + for (auto &Data : VFToVector) { + Value *V1 = Data.second.front(); + if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) { + SingleMax = VectorOpToIdx[V1].size() + UndefSz; + SingleVec = V1; + } + Value *V2 = nullptr; + if (Data.second.size() > 1) + V2 = *std::next(Data.second.begin()); + if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + + UndefSz) { + PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz; + PairVec = std::make_pair(V1, V2); + } + } + if (SingleMax == 0 && PairMax == 0 && UndefSz == 0) + return std::nullopt; + // Check if better to perform a shuffle of 2 vectors or just of a single + // vector. + SmallVector<Value *> SavedVL(VL.begin(), VL.end()); + SmallVector<Value *> GatheredExtracts( + VL.size(), PoisonValue::get(VL.front()->getType())); + if (SingleMax >= PairMax && SingleMax) { + for (int Idx : VectorOpToIdx[SingleVec]) + std::swap(GatheredExtracts[Idx], VL[Idx]); + } else { + for (Value *V : {PairVec.first, PairVec.second}) + for (int Idx : VectorOpToIdx[V]) + std::swap(GatheredExtracts[Idx], VL[Idx]); + } + // Add extracts from undefs too. 
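The new single-register routine above groups the gathered extractelement lanes by their source vector and then keeps either the single vector covering the most lanes or the best pair (the patch additionally requires the pair to have the same vector width), with undef lanes counting toward either choice. A standalone sketch of that selection, with invented names and without the same-width restriction:

#include <cstdio>
#include <map>
#include <string>
#include <utility>
#include <vector>

int main() {
  // Source vector name -> lanes of the gather that extract from it.
  std::map<std::string, std::vector<int>> VectorOpToIdx = {
      {"%x", {0, 3}}, {"%y", {1, 2}}};
  int UndefSz = 0; // undef lanes are free to cover with either choice

  size_t SingleMax = 0, PairMax = 0;
  std::string SingleVec;
  std::pair<std::string, std::string> PairVec;
  for (const auto &A : VectorOpToIdx) {
    if (A.second.size() + UndefSz > SingleMax) {
      SingleMax = A.second.size() + UndefSz;
      SingleVec = A.first;
    }
    for (const auto &B : VectorOpToIdx) {
      if (&A == &B)
        continue;
      size_t Covered = A.second.size() + B.second.size() + UndefSz;
      if (Covered > PairMax) {
        PairMax = Covered;
        PairVec = {A.first, B.first};
      }
    }
  }
  if (SingleMax >= PairMax)
    std::printf("use one source: %s (%zu lanes)\n", SingleVec.c_str(),
                SingleMax);
  else
    std::printf("use two sources: %s + %s (%zu lanes)\n", PairVec.first.c_str(),
                PairVec.second.c_str(), PairMax); // picked in this example
  return 0;
}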
+ for (int Idx : UndefVectorExtracts) + std::swap(GatheredExtracts[Idx], VL[Idx]); + // Check that gather of extractelements can be represented as just a + // shuffle of a single/two vectors the scalars are extracted from. + std::optional<TTI::ShuffleKind> Res = + isFixedVectorShuffle(GatheredExtracts, Mask); + if (!Res) { + // TODO: try to check other subsets if possible. + // Restore the original VL if attempt was not successful. + copy(SavedVL, VL.begin()); return std::nullopt; + } + // Restore unused scalars from mask, if some of the extractelements were not + // selected for shuffle. + for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) { + if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) && + isa<UndefValue>(GatheredExtracts[I])) { + std::swap(VL[I], GatheredExtracts[I]); + continue; + } + auto *EI = dyn_cast<ExtractElementInst>(VL[I]); + if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) || + !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) || + is_contained(UndefVectorExtracts, I)) + continue; + } + return Res; +} + +/// Tries to find extractelement instructions with constant indices from fixed +/// vector type and gather such instructions into a bunch, which highly likely +/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was +/// successful, the matched scalars are replaced by poison values in \p VL for +/// future analysis. +SmallVector<std::optional<TTI::ShuffleKind>> +BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL, + SmallVectorImpl<int> &Mask, + unsigned NumParts) const { + assert(NumParts > 0 && "NumParts expected be greater than or equal to 1."); + SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts); Mask.assign(VL.size(), PoisonMaskElem); - assert(TE->UserTreeIndices.size() == 1 && - "Expected only single user of the gather node."); + unsigned SliceSize = VL.size() / NumParts; + for (unsigned Part = 0; Part < NumParts; ++Part) { + // Scan list of gathered scalars for extractelements that can be represented + // as shuffles. + MutableArrayRef<Value *> SubVL = + MutableArrayRef(VL).slice(Part * SliceSize, SliceSize); + SmallVector<int> SubMask; + std::optional<TTI::ShuffleKind> Res = + tryToGatherSingleRegisterExtractElements(SubVL, SubMask); + ShufflesRes[Part] = Res; + copy(SubMask, std::next(Mask.begin(), Part * SliceSize)); + } + if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) { + return Res.has_value(); + })) + ShufflesRes.clear(); + return ShufflesRes; +} + +std::optional<TargetTransformInfo::ShuffleKind> +BoUpSLP::isGatherShuffledSingleRegisterEntry( + const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask, + SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part) { + Entries.clear(); // TODO: currently checking only for Scalars in the tree entry, need to count // reused elements too for better cost estimation. - Instruction &UserInst = - getLastInstructionInBundle(TE->UserTreeIndices.front().UserTE); - BasicBlock *ParentBB = nullptr; + const EdgeInfo &TEUseEI = TE->UserTreeIndices.front(); + const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE); + const BasicBlock *TEInsertBlock = nullptr; // Main node of PHI entries keeps the correct order of operands/incoming // blocks. 
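The multi-register wrapper above divides the gathered scalars into NumParts register-sized slices, matches each slice independently, and copies the per-slice sub-mask into the combined mask at offset Part * SliceSize. The standalone sketch below shows only that bookkeeping; MatchSlice is a dummy stand-in for the per-register match, not anything in the patch.

#include <algorithm>
#include <cstdio>
#include <vector>

constexpr int PoisonMaskElem = -1;

int main() {
  const unsigned NumScalars = 8, NumParts = 2;
  const unsigned SliceSize = NumScalars / NumParts;
  std::vector<int> Mask(NumScalars, PoisonMaskElem);

  // Dummy per-register match: pretend only the second slice matched, as an
  // identity over its own lanes.
  auto MatchSlice = [&](unsigned Part, std::vector<int> &SubMask) {
    if (Part != 1)
      return false;
    for (unsigned I = 0; I < SliceSize; ++I)
      SubMask[I] = static_cast<int>(I);
    return true;
  };

  for (unsigned Part = 0; Part < NumParts; ++Part) {
    std::vector<int> SubMask(SliceSize, PoisonMaskElem);
    if (!MatchSlice(Part, SubMask))
      continue; // this register stays a plain gather
    std::copy(SubMask.begin(), SubMask.end(),
              Mask.begin() + Part * SliceSize);
  }
  for (int M : Mask)
    std::printf("%d ", M);
  std::printf("\n"); // -1 -1 -1 -1 0 1 2 3
  return 0;
}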
- if (auto *PHI = - dyn_cast<PHINode>(TE->UserTreeIndices.front().UserTE->getMainOp())) { - ParentBB = PHI->getIncomingBlock(TE->UserTreeIndices.front().EdgeIdx); + if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) { + TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx); + TEInsertPt = TEInsertBlock->getTerminator(); } else { - ParentBB = UserInst.getParent(); + TEInsertBlock = TEInsertPt->getParent(); } - auto *NodeUI = DT->getNode(ParentBB); + auto *NodeUI = DT->getNode(TEInsertBlock); assert(NodeUI && "Should only process reachable instructions"); SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end()); - auto CheckOrdering = [&](Instruction *LastEI) { - // Check if the user node of the TE comes after user node of EntryPtr, - // otherwise EntryPtr depends on TE. - // Gather nodes usually are not scheduled and inserted before their first - // user node. So, instead of checking dependency between the gather nodes - // themselves, we check the dependency between their user nodes. - // If one user node comes before the second one, we cannot use the second - // gather node as the source vector for the first gather node, because in - // the list of instructions it will be emitted later. - auto *EntryParent = LastEI->getParent(); - auto *NodeEUI = DT->getNode(EntryParent); + auto CheckOrdering = [&](const Instruction *InsertPt) { + // Argument InsertPt is an instruction where vector code for some other + // tree entry (one that shares one or more scalars with TE) is going to be + // generated. This lambda returns true if insertion point of vector code + // for the TE dominates that point (otherwise dependency is the other way + // around). The other node is not limited to be of a gather kind. Gather + // nodes are not scheduled and their vector code is inserted before their + // first user. If user is PHI, that is supposed to be at the end of a + // predecessor block. Otherwise it is the last instruction among scalars of + // the user node. So, instead of checking dependency between instructions + // themselves, we check dependency between their insertion points for vector + // code (since each scalar instruction ends up as a lane of a vector + // instruction). + const BasicBlock *InsertBlock = InsertPt->getParent(); + auto *NodeEUI = DT->getNode(InsertBlock); if (!NodeEUI) return false; assert((NodeUI == NodeEUI) == (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) && "Different nodes should have different DFS numbers"); // Check the order of the gather nodes users. - if (UserInst.getParent() != EntryParent && + if (TEInsertPt->getParent() != InsertBlock && (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI))) return false; - if (UserInst.getParent() == EntryParent && UserInst.comesBefore(LastEI)) + if (TEInsertPt->getParent() == InsertBlock && + TEInsertPt->comesBefore(InsertPt)) return false; return true; }; @@ -8743,43 +9391,42 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, [&](Value *V) { return GatheredScalars.contains(V); }) && "Must contain at least single gathered value."); assert(TEPtr->UserTreeIndices.size() == 1 && - "Expected only single user of the gather node."); - PHINode *EntryPHI = - dyn_cast<PHINode>(TEPtr->UserTreeIndices.front().UserTE->getMainOp()); - Instruction *EntryUserInst = - EntryPHI ? 
nullptr - : &getLastInstructionInBundle( - TEPtr->UserTreeIndices.front().UserTE); - if (&UserInst == EntryUserInst) { - assert(!EntryPHI && "Unexpected phi node entry."); - // If 2 gathers are operands of the same entry, compare operands - // indices, use the earlier one as the base. - if (TE->UserTreeIndices.front().UserTE == - TEPtr->UserTreeIndices.front().UserTE && - TE->UserTreeIndices.front().EdgeIdx < - TEPtr->UserTreeIndices.front().EdgeIdx) + "Expected only single user of a gather node."); + const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front(); + + PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp()); + const Instruction *InsertPt = + UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator() + : &getLastInstructionInBundle(UseEI.UserTE); + if (TEInsertPt == InsertPt) { + // If 2 gathers are operands of the same entry (regardless of whether + // user is PHI or else), compare operands indices, use the earlier one + // as the base. + if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx) + continue; + // If the user instruction is used for some reason in different + // vectorized nodes - make it depend on index. + if (TEUseEI.UserTE != UseEI.UserTE && + TEUseEI.UserTE->Idx < UseEI.UserTE->Idx) continue; } - // Check if the user node of the TE comes after user node of EntryPtr, - // otherwise EntryPtr depends on TE. - auto *EntryI = - EntryPHI - ? EntryPHI - ->getIncomingBlock(TEPtr->UserTreeIndices.front().EdgeIdx) - ->getTerminator() - : EntryUserInst; - if ((ParentBB != EntryI->getParent() || - TE->UserTreeIndices.front().EdgeIdx < - TEPtr->UserTreeIndices.front().EdgeIdx || - TE->UserTreeIndices.front().UserTE != - TEPtr->UserTreeIndices.front().UserTE) && - !CheckOrdering(EntryI)) + + // Check if the user node of the TE comes after user node of TEPtr, + // otherwise TEPtr depends on TE. + if ((TEInsertBlock != InsertPt->getParent() || + TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) && + !CheckOrdering(InsertPt)) continue; VToTEs.insert(TEPtr); } if (const TreeEntry *VTE = getTreeEntry(V)) { - Instruction &EntryUserInst = getLastInstructionInBundle(VTE); - if (&EntryUserInst == &UserInst || !CheckOrdering(&EntryUserInst)) + Instruction &LastBundleInst = getLastInstructionInBundle(VTE); + if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst)) + continue; + auto It = MinBWs.find(VTE); + // If vectorize node is demoted - do not match. 
+ if (It != MinBWs.end() && + It->second.first != DL->getTypeSizeInBits(V->getType())) continue; VToTEs.insert(VTE); } @@ -8823,8 +9470,10 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, } } - if (UsedTEs.empty()) + if (UsedTEs.empty()) { + Entries.clear(); return std::nullopt; + } unsigned VF = 0; if (UsedTEs.size() == 1) { @@ -8838,9 +9487,19 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) { return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars); }); - if (It != FirstEntries.end() && (*It)->getVectorFactor() == VL.size()) { + if (It != FirstEntries.end() && + ((*It)->getVectorFactor() == VL.size() || + ((*It)->getVectorFactor() == TE->Scalars.size() && + TE->ReuseShuffleIndices.size() == VL.size() && + (*It)->isSame(TE->Scalars)))) { Entries.push_back(*It); - std::iota(Mask.begin(), Mask.end(), 0); + if ((*It)->getVectorFactor() == VL.size()) { + std::iota(std::next(Mask.begin(), Part * VL.size()), + std::next(Mask.begin(), (Part + 1) * VL.size()), 0); + } else { + SmallVector<int> CommonMask = TE->getCommonMask(); + copy(CommonMask, Mask.begin()); + } // Clear undef scalars. for (int I = 0, Sz = VL.size(); I < Sz; ++I) if (isa<PoisonValue>(VL[I])) @@ -8923,12 +9582,9 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, // by extractelements processing) or may form vector node in future. auto MightBeIgnored = [=](Value *V) { auto *I = dyn_cast<Instruction>(V); - SmallVector<Value *> IgnoredVals; - if (UserIgnoreList) - IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end()); return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) && !isVectorLikeInstWithConstOps(I) && - !areAllUsersVectorized(I, IgnoredVals) && isSimple(I); + !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I); }; // Check that the neighbor instruction may form a full vector node with the // current instruction V. It is possible, if they have same/alternate opcode @@ -8980,7 +9636,10 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, TempEntries.push_back(Entries[I]); } Entries.swap(TempEntries); - if (EntryLanes.size() == Entries.size() && !VL.equals(TE->Scalars)) { + if (EntryLanes.size() == Entries.size() && + !VL.equals(ArrayRef(TE->Scalars) + .slice(Part * VL.size(), + std::min<int>(VL.size(), TE->Scalars.size())))) { // We may have here 1 or 2 entries only. If the number of scalars is equal // to the number of entries, no need to do the analysis, it is not very // profitable. Since VL is not the same as TE->Scalars, it means we already @@ -8993,9 +9652,10 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, // Pair.first is the offset to the vector, while Pair.second is the index of // scalar in the list. for (const std::pair<unsigned, int> &Pair : EntryLanes) { - Mask[Pair.second] = Pair.first * VF + - Entries[Pair.first]->findLaneForValue(VL[Pair.second]); - IsIdentity &= Mask[Pair.second] == Pair.second; + unsigned Idx = Part * VL.size() + Pair.second; + Mask[Idx] = Pair.first * VF + + Entries[Pair.first]->findLaneForValue(VL[Pair.second]); + IsIdentity &= Mask[Idx] == Pair.second; } switch (Entries.size()) { case 1: @@ -9010,9 +9670,64 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, break; } Entries.clear(); + // Clear the corresponding mask elements. 
+ std::fill(std::next(Mask.begin(), Part * VL.size()), + std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem); return std::nullopt; } +SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> +BoUpSLP::isGatherShuffledEntry( + const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask, + SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, + unsigned NumParts) { + assert(NumParts > 0 && NumParts < VL.size() && + "Expected positive number of registers."); + Entries.clear(); + // No need to check for the topmost gather node. + if (TE == VectorizableTree.front().get()) + return {}; + Mask.assign(VL.size(), PoisonMaskElem); + assert(TE->UserTreeIndices.size() == 1 && + "Expected only single user of the gather node."); + assert(VL.size() % NumParts == 0 && + "Number of scalars must be divisible by NumParts."); + unsigned SliceSize = VL.size() / NumParts; + SmallVector<std::optional<TTI::ShuffleKind>> Res; + for (unsigned Part = 0; Part < NumParts; ++Part) { + ArrayRef<Value *> SubVL = VL.slice(Part * SliceSize, SliceSize); + SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back(); + std::optional<TTI::ShuffleKind> SubRes = + isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part); + if (!SubRes) + SubEntries.clear(); + Res.push_back(SubRes); + if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc && + SubEntries.front()->getVectorFactor() == VL.size() && + (SubEntries.front()->isSame(TE->Scalars) || + SubEntries.front()->isSame(VL))) { + SmallVector<const TreeEntry *> LocalSubEntries; + LocalSubEntries.swap(SubEntries); + Entries.clear(); + Res.clear(); + std::iota(Mask.begin(), Mask.end(), 0); + // Clear undef scalars. + for (int I = 0, Sz = VL.size(); I < Sz; ++I) + if (isa<PoisonValue>(VL[I])) + Mask[I] = PoisonMaskElem; + Entries.emplace_back(1, LocalSubEntries.front()); + Res.push_back(TargetTransformInfo::SK_PermuteSingleSrc); + return Res; + } + } + if (all_of(Res, + [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) { + Entries.clear(); + return {}; + } + return Res; +} + InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc) const { // Find the type of the operands in VL. @@ -9224,18 +9939,20 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { auto *Front = E->getMainOp(); Instruction *LastInst = &getLastInstructionInBundle(E); assert(LastInst && "Failed to find last instruction in bundle"); + BasicBlock::iterator LastInstIt = LastInst->getIterator(); // If the instruction is PHI, set the insert point after all the PHIs. bool IsPHI = isa<PHINode>(LastInst); if (IsPHI) - LastInst = LastInst->getParent()->getFirstNonPHI(); + LastInstIt = LastInst->getParent()->getFirstNonPHIIt(); if (IsPHI || (E->State != TreeEntry::NeedToGather && doesNotNeedToSchedule(E->Scalars))) { - Builder.SetInsertPoint(LastInst); + Builder.SetInsertPoint(LastInst->getParent(), LastInstIt); } else { // Set the insertion point after the last instruction in the bundle. Set the // debug location to Front. - Builder.SetInsertPoint(LastInst->getParent(), - std::next(LastInst->getIterator())); + Builder.SetInsertPoint( + LastInst->getParent(), + LastInst->getNextNonDebugInstruction()->getIterator()); } Builder.SetCurrentDebugLocation(Front->getDebugLoc()); } @@ -9271,10 +9988,12 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) { GatherShuffleExtractSeq.insert(InsElt); CSEBlocks.insert(InsElt->getParent()); // Add to our 'need-to-extract' list. 
- if (TreeEntry *Entry = getTreeEntry(V)) { - // Find which lane we need to extract. - unsigned FoundLane = Entry->findLaneForValue(V); - ExternalUses.emplace_back(V, InsElt, FoundLane); + if (isa<Instruction>(V)) { + if (TreeEntry *Entry = getTreeEntry(V)) { + // Find which lane we need to extract. + unsigned FoundLane = Entry->findLaneForValue(V); + ExternalUses.emplace_back(V, InsElt, FoundLane); + } } return Vec; }; @@ -9367,12 +10086,12 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { /// Holds all of the instructions that we gathered. SetVector<Instruction *> &GatherShuffleExtractSeq; /// A list of blocks that we are going to CSE. - SetVector<BasicBlock *> &CSEBlocks; + DenseSet<BasicBlock *> &CSEBlocks; public: ShuffleIRBuilder(IRBuilderBase &Builder, SetVector<Instruction *> &GatherShuffleExtractSeq, - SetVector<BasicBlock *> &CSEBlocks) + DenseSet<BasicBlock *> &CSEBlocks) : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq), CSEBlocks(CSEBlocks) {} ~ShuffleIRBuilder() = default; @@ -9392,7 +10111,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { return V1; unsigned VF = Mask.size(); unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements(); - if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask)) + if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF)) return V1; Value *Vec = Builder.CreateShuffleVector(V1, Mask); if (auto *I = dyn_cast<Instruction>(Vec)) { @@ -9455,7 +10174,11 @@ public: : Builder(Builder), R(R) {} /// Adjusts extractelements after reusing them. - Value *adjustExtracts(const TreeEntry *E, ArrayRef<int> Mask) { + Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask, + ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds, + unsigned NumParts, bool &UseVecBaseAsInput) { + UseVecBaseAsInput = false; + SmallPtrSet<Value *, 4> UniqueBases; Value *VecBase = nullptr; for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { int Idx = Mask[I]; @@ -9463,6 +10186,10 @@ public: continue; auto *EI = cast<ExtractElementInst>(E->Scalars[I]); VecBase = EI->getVectorOperand(); + if (const TreeEntry *TE = R.getTreeEntry(VecBase)) + VecBase = TE->VectorizedValue; + assert(VecBase && "Expected vectorized value."); + UniqueBases.insert(VecBase); // If the only one use is vectorized - can delete the extractelement // itself. if (!EI->hasOneUse() || any_of(EI->users(), [&](User *U) { @@ -9471,14 +10198,97 @@ public: continue; R.eraseInstruction(EI); } - return VecBase; + if (NumParts == 1 || UniqueBases.size() == 1) + return VecBase; + UseVecBaseAsInput = true; + auto TransformToIdentity = [](MutableArrayRef<int> Mask) { + for (auto [I, Idx] : enumerate(Mask)) + if (Idx != PoisonMaskElem) + Idx = I; + }; + // Perform multi-register vector shuffle, joining them into a single virtual + // long vector. + // Need to shuffle each part independently and then insert all this parts + // into a long virtual vector register, forming the original vector. 
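The comment just above describes the multi-register join that the loop which follows implements: each register's sub-mask indexes its own sources, so before a new part is folded into the accumulated "virtual long vector" its indices are shifted past the lanes already present, and once folded the sub-mask becomes an identity. The standalone sketch below works purely on integer masks and simplifies the real code (which uses the actual vector widths); it is not part of the patch.

#include <cstdio>
#include <vector>

constexpr int PoisonMaskElem = -1;

int main() {
  const unsigned SliceSize = 4;
  // Sub-masks produced per register (indices into that register's own source).
  std::vector<std::vector<int>> PartMasks = {{0, 1, PoisonMaskElem, 3},
                                             {2, 2, 0, 1}};
  std::vector<int> VecMask(SliceSize * PartMasks.size(), PoisonMaskElem);
  unsigned AccumulatedVF = 0;
  for (unsigned Part = 0; Part < PartMasks.size(); ++Part) {
    std::vector<int> &SubMask = PartMasks[Part];
    for (unsigned I = 0; I < SliceSize; ++I) {
      if (SubMask[I] == PoisonMaskElem)
        continue;
      // Lanes of later parts come after the lanes already accumulated.
      VecMask[Part * SliceSize + I] = SubMask[I] + AccumulatedVF;
      // After this part is folded in, its own mask is an identity.
      SubMask[I] = static_cast<int>(I);
    }
    AccumulatedVF += SliceSize;
  }
  for (int M : VecMask)
    std::printf("%d ", M); // 0 1 -1 3 6 6 4 5
  std::printf("\n");
  return 0;
}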
+ Value *Vec = nullptr; + SmallVector<int> VecMask(Mask.size(), PoisonMaskElem); + unsigned SliceSize = E->Scalars.size() / NumParts; + for (unsigned Part = 0; Part < NumParts; ++Part) { + ArrayRef<Value *> VL = + ArrayRef(E->Scalars).slice(Part * SliceSize, SliceSize); + MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize); + constexpr int MaxBases = 2; + SmallVector<Value *, MaxBases> Bases(MaxBases); +#ifndef NDEBUG + int PrevSize = 0; +#endif // NDEBUG + for (const auto [I, V]: enumerate(VL)) { + if (SubMask[I] == PoisonMaskElem) + continue; + Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand(); + if (const TreeEntry *TE = R.getTreeEntry(VecOp)) + VecOp = TE->VectorizedValue; + assert(VecOp && "Expected vectorized value."); + const int Size = + cast<FixedVectorType>(VecOp->getType())->getNumElements(); +#ifndef NDEBUG + assert((PrevSize == Size || PrevSize == 0) && + "Expected vectors of the same size."); + PrevSize = Size; +#endif // NDEBUG + Bases[SubMask[I] < Size ? 0 : 1] = VecOp; + } + if (!Bases.front()) + continue; + Value *SubVec; + if (Bases.back()) { + SubVec = createShuffle(Bases.front(), Bases.back(), SubMask); + TransformToIdentity(SubMask); + } else { + SubVec = Bases.front(); + } + if (!Vec) { + Vec = SubVec; + assert((Part == 0 || all_of(seq<unsigned>(0, Part), + [&](unsigned P) { + ArrayRef<int> SubMask = + Mask.slice(P * SliceSize, SliceSize); + return all_of(SubMask, [](int Idx) { + return Idx == PoisonMaskElem; + }); + })) && + "Expected first part or all previous parts masked."); + copy(SubMask, std::next(VecMask.begin(), Part * SliceSize)); + } else { + unsigned VF = cast<FixedVectorType>(Vec->getType())->getNumElements(); + if (Vec->getType() != SubVec->getType()) { + unsigned SubVecVF = + cast<FixedVectorType>(SubVec->getType())->getNumElements(); + VF = std::max(VF, SubVecVF); + } + // Adjust SubMask. + for (auto [I, Idx] : enumerate(SubMask)) + if (Idx != PoisonMaskElem) + Idx += VF; + copy(SubMask, std::next(VecMask.begin(), Part * SliceSize)); + Vec = createShuffle(Vec, SubVec, VecMask); + TransformToIdentity(VecMask); + } + } + copy(VecMask, Mask.begin()); + return Vec; } /// Checks if the specified entry \p E needs to be delayed because of its /// dependency nodes. - Value *needToDelay(const TreeEntry *E, ArrayRef<const TreeEntry *> Deps) { + std::optional<Value *> + needToDelay(const TreeEntry *E, + ArrayRef<SmallVector<const TreeEntry *>> Deps) const { // No need to delay emission if all deps are ready. - if (all_of(Deps, [](const TreeEntry *TE) { return TE->VectorizedValue; })) - return nullptr; + if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) { + return all_of( + TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; }); + })) + return std::nullopt; // Postpone gather emission, will be emitted after the end of the // process to keep correct order. auto *VecTy = FixedVectorType::get(E->Scalars.front()->getType(), @@ -9487,6 +10297,16 @@ public: VecTy, PoisonValue::get(PointerType::getUnqual(VecTy->getContext())), MaybeAlign()); } + /// Adds 2 input vectors (in form of tree entries) and the mask for their + /// shuffling. + void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) { + add(E1.VectorizedValue, E2.VectorizedValue, Mask); + } + /// Adds single input vector (in form of tree entry) and the mask for its + /// shuffling. + void add(const TreeEntry &E1, ArrayRef<int> Mask) { + add(E1.VectorizedValue, Mask); + } /// Adds 2 input vectors and the mask for their shuffling. 
void add(Value *V1, Value *V2, ArrayRef<int> Mask) { assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors."); @@ -9516,7 +10336,7 @@ public: InVectors.push_back(V1); } /// Adds another one input vector and the mask for the shuffling. - void add(Value *V1, ArrayRef<int> Mask) { + void add(Value *V1, ArrayRef<int> Mask, bool = false) { if (InVectors.empty()) { if (!isa<FixedVectorType>(V1->getType())) { V1 = createShuffle(V1, nullptr, CommonMask); @@ -9578,7 +10398,8 @@ public: inversePermutation(Order, NewMask); add(V1, NewMask); } - Value *gather(ArrayRef<Value *> VL, Value *Root = nullptr) { + Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0, + Value *Root = nullptr) { return R.gather(VL, Root); } Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); } @@ -9639,8 +10460,14 @@ public: } }; -Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) { - ArrayRef<Value *> VL = E->getOperand(NodeIdx); +Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx, + bool PostponedPHIs) { + ValueList &VL = E->getOperand(NodeIdx); + if (E->State == TreeEntry::PossibleStridedVectorize && + !E->ReorderIndices.empty()) { + SmallVector<int> Mask(E->ReorderIndices.begin(), E->ReorderIndices.end()); + reorderScalars(VL, Mask); + } const unsigned VF = VL.size(); InstructionsState S = getSameOpcode(VL, *TLI); // Special processing for GEPs bundle, which may include non-gep values. @@ -9651,23 +10478,39 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) { S = getSameOpcode(*It, *TLI); } if (S.getOpcode()) { - if (TreeEntry *VE = getTreeEntry(S.OpValue); - VE && VE->isSame(VL) && - (any_of(VE->UserTreeIndices, - [E, NodeIdx](const EdgeInfo &EI) { - return EI.UserTE == E && EI.EdgeIdx == NodeIdx; - }) || - any_of(VectorizableTree, - [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) { - return TE->isOperandGatherNode({E, NodeIdx}) && - VE->isSame(TE->Scalars); - }))) { + auto CheckSameVE = [&](const TreeEntry *VE) { + return VE->isSame(VL) && + (any_of(VE->UserTreeIndices, + [E, NodeIdx](const EdgeInfo &EI) { + return EI.UserTE == E && EI.EdgeIdx == NodeIdx; + }) || + any_of(VectorizableTree, + [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) { + return TE->isOperandGatherNode({E, NodeIdx}) && + VE->isSame(TE->Scalars); + })); + }; + TreeEntry *VE = getTreeEntry(S.OpValue); + bool IsSameVE = VE && CheckSameVE(VE); + if (!IsSameVE) { + auto It = MultiNodeScalars.find(S.OpValue); + if (It != MultiNodeScalars.end()) { + auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) { + return TE != VE && CheckSameVE(TE); + }); + if (I != It->getSecond().end()) { + VE = *I; + IsSameVE = true; + } + } + } + if (IsSameVE) { auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) { ShuffleInstructionBuilder ShuffleBuilder(Builder, *this); ShuffleBuilder.add(V, Mask); return ShuffleBuilder.finalize(std::nullopt); }; - Value *V = vectorizeTree(VE); + Value *V = vectorizeTree(VE, PostponedPHIs); if (VF != cast<FixedVectorType>(V->getType())->getNumElements()) { if (!VE->ReuseShuffleIndices.empty()) { // Reshuffle to get only unique values. 
@@ -9740,14 +10583,7 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) { assert(I->get()->UserTreeIndices.size() == 1 && "Expected only single user for the gather node."); assert(I->get()->isSame(VL) && "Expected same list of scalars."); - IRBuilder<>::InsertPointGuard Guard(Builder); - if (E->getOpcode() != Instruction::InsertElement && - E->getOpcode() != Instruction::PHI) { - Instruction *LastInst = &getLastInstructionInBundle(E); - assert(LastInst && "Failed to find last instruction in bundle"); - Builder.SetInsertPoint(LastInst); - } - return vectorizeTree(I->get()); + return vectorizeTree(I->get(), PostponedPHIs); } template <typename BVTy, typename ResTy, typename... Args> @@ -9765,7 +10601,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { inversePermutation(E->ReorderIndices, ReorderMask); if (!ReorderMask.empty()) reorderScalars(GatheredScalars, ReorderMask); - auto FindReusedSplat = [&](SmallVectorImpl<int> &Mask) { + auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF) { if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) { return isa<UndefValue>(V) && !isa<PoisonValue>(V); })) @@ -9782,70 +10618,102 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { }); if (It == VectorizableTree.end()) return false; - unsigned I = - *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; }); - int Sz = Mask.size(); - if (all_of(Mask, [Sz](int Idx) { return Idx < 2 * Sz; }) && - ShuffleVectorInst::isIdentityMask(Mask)) + int Idx; + if ((Mask.size() < InputVF && + ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) && + Idx == 0) || + (Mask.size() == InputVF && + ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) { std::iota(Mask.begin(), Mask.end(), 0); - else + } else { + unsigned I = + *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; }); std::fill(Mask.begin(), Mask.end(), I); + } return true; }; BVTy ShuffleBuilder(Params...); ResTy Res = ResTy(); SmallVector<int> Mask; - SmallVector<int> ExtractMask; - std::optional<TargetTransformInfo::ShuffleKind> ExtractShuffle; - std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle; - SmallVector<const TreeEntry *> Entries; + SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem); + SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles; + Value *ExtractVecBase = nullptr; + bool UseVecBaseAsInput = false; + SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles; + SmallVector<SmallVector<const TreeEntry *>> Entries; Type *ScalarTy = GatheredScalars.front()->getType(); + auto *VecTy = FixedVectorType::get(ScalarTy, GatheredScalars.size()); + unsigned NumParts = TTI->getNumberOfParts(VecTy); + if (NumParts == 0 || NumParts >= GatheredScalars.size()) + NumParts = 1; if (!all_of(GatheredScalars, UndefValue::classof)) { // Check for gathered extracts. 
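FindReusedSplat in the hunk above normalizes the mask used for a reused vector: if it already selects the leading lanes in order (an identity, or an extract of the leading subvector), it is rewritten as a plain identity; otherwise it is a broadcast and every lane is filled with the single source lane in use. A standalone sketch of that normalization, with invented helper names:

#include <algorithm>
#include <cstdio>
#include <vector>

constexpr int PoisonMaskElem = -1;

static void normalizeSplatOrIdentity(std::vector<int> &Mask) {
  bool Identity = true;
  for (size_t I = 0; I < Mask.size(); ++I)
    if (Mask[I] != PoisonMaskElem && Mask[I] != static_cast<int>(I))
      Identity = false;
  if (Identity) {
    for (size_t I = 0; I < Mask.size(); ++I)
      Mask[I] = static_cast<int>(I); // fill the poison lanes as well
    return;
  }
  // Broadcast: reuse the first defined source lane everywhere.
  auto It = std::find_if(Mask.begin(), Mask.end(),
                         [](int Idx) { return Idx != PoisonMaskElem; });
  std::fill(Mask.begin(), Mask.end(), It == Mask.end() ? 0 : *It);
}

int main() {
  std::vector<int> Splat = {2, PoisonMaskElem, 2, 2};
  normalizeSplatOrIdentity(Splat);
  for (int M : Splat)
    std::printf("%d ", M); // 2 2 2 2
  std::printf("\n");
  return 0;
}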
- ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask); - SmallVector<Value *> IgnoredVals; - if (UserIgnoreList) - IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end()); bool Resized = false; - if (Value *VecBase = ShuffleBuilder.adjustExtracts(E, ExtractMask)) - if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType())) - if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) { - Resized = true; - GatheredScalars.append(VF - GatheredScalars.size(), - PoisonValue::get(ScalarTy)); - } + ExtractShuffles = + tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts); + if (!ExtractShuffles.empty()) { + SmallVector<const TreeEntry *> ExtractEntries; + for (auto [Idx, I] : enumerate(ExtractMask)) { + if (I == PoisonMaskElem) + continue; + if (const auto *TE = getTreeEntry( + cast<ExtractElementInst>(E->Scalars[Idx])->getVectorOperand())) + ExtractEntries.push_back(TE); + } + if (std::optional<ResTy> Delayed = + ShuffleBuilder.needToDelay(E, ExtractEntries)) { + // Delay emission of gathers which are not ready yet. + PostponedGathers.insert(E); + // Postpone gather emission, will be emitted after the end of the + // process to keep correct order. + return *Delayed; + } + if (Value *VecBase = ShuffleBuilder.adjustExtracts( + E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) { + ExtractVecBase = VecBase; + if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType())) + if (VF == VecBaseTy->getNumElements() && + GatheredScalars.size() != VF) { + Resized = true; + GatheredScalars.append(VF - GatheredScalars.size(), + PoisonValue::get(ScalarTy)); + } + } + } // Gather extracts after we check for full matched gathers only. - if (ExtractShuffle || E->getOpcode() != Instruction::Load || + if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load || E->isAltShuffle() || all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || isSplat(E->Scalars) || (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) { - GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries); + GatherShuffles = + isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts); } - if (GatherShuffle) { - if (Value *Delayed = ShuffleBuilder.needToDelay(E, Entries)) { + if (!GatherShuffles.empty()) { + if (std::optional<ResTy> Delayed = + ShuffleBuilder.needToDelay(E, Entries)) { // Delay emission of gathers which are not ready yet. PostponedGathers.insert(E); // Postpone gather emission, will be emitted after the end of the // process to keep correct order. - return Delayed; + return *Delayed; } - assert((Entries.size() == 1 || Entries.size() == 2) && - "Expected shuffle of 1 or 2 entries."); - if (*GatherShuffle == TTI::SK_PermuteSingleSrc && - Entries.front()->isSame(E->Scalars)) { + if (GatherShuffles.size() == 1 && + *GatherShuffles.front() == TTI::SK_PermuteSingleSrc && + Entries.front().front()->isSame(E->Scalars)) { // Perfect match in the graph, will reuse the previously vectorized // node. Cost is 0. LLVM_DEBUG( dbgs() - << "SLP: perfect diamond match for gather bundle that starts with " - << *E->Scalars.front() << ".\n"); + << "SLP: perfect diamond match for gather bundle " + << shortBundleName(E->Scalars) << ".\n"); // Restore the mask for previous partially matched values. 
- if (Entries.front()->ReorderIndices.empty() && - ((Entries.front()->ReuseShuffleIndices.empty() && - E->Scalars.size() == Entries.front()->Scalars.size()) || - (E->Scalars.size() == - Entries.front()->ReuseShuffleIndices.size()))) { + Mask.resize(E->Scalars.size()); + const TreeEntry *FrontTE = Entries.front().front(); + if (FrontTE->ReorderIndices.empty() && + ((FrontTE->ReuseShuffleIndices.empty() && + E->Scalars.size() == FrontTE->Scalars.size()) || + (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) { std::iota(Mask.begin(), Mask.end(), 0); } else { for (auto [I, V] : enumerate(E->Scalars)) { @@ -9853,17 +10721,20 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { Mask[I] = PoisonMaskElem; continue; } - Mask[I] = Entries.front()->findLaneForValue(V); + Mask[I] = FrontTE->findLaneForValue(V); } } - ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask); + ShuffleBuilder.add(*FrontTE, Mask); Res = ShuffleBuilder.finalize(E->getCommonMask()); return Res; } if (!Resized) { - unsigned VF1 = Entries.front()->getVectorFactor(); - unsigned VF2 = Entries.back()->getVectorFactor(); - if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF) + if (GatheredScalars.size() != VF && + any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) { + return any_of(TEs, [&](const TreeEntry *TE) { + return TE->getVectorFactor() == VF; + }); + })) GatheredScalars.append(VF - GatheredScalars.size(), PoisonValue::get(ScalarTy)); } @@ -9943,78 +10814,108 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { if (It != Scalars.end()) { // Replace undefs by the non-poisoned scalars and emit broadcast. int Pos = std::distance(Scalars.begin(), It); - for_each(UndefPos, [&](int I) { + for (int I : UndefPos) { // Set the undef position to the non-poisoned scalar. ReuseMask[I] = Pos; // Replace the undef by the poison, in the mask it is replaced by // non-poisoned scalar already. if (I != Pos) Scalars[I] = PoisonValue::get(ScalarTy); - }); + } } else { // Replace undefs by the poisons, emit broadcast and then emit // freeze. - for_each(UndefPos, [&](int I) { + for (int I : UndefPos) { ReuseMask[I] = PoisonMaskElem; if (isa<UndefValue>(Scalars[I])) Scalars[I] = PoisonValue::get(ScalarTy); - }); + } NeedFreeze = true; } } }; - if (ExtractShuffle || GatherShuffle) { + if (!ExtractShuffles.empty() || !GatherShuffles.empty()) { bool IsNonPoisoned = true; - bool IsUsedInExpr = false; + bool IsUsedInExpr = true; Value *Vec1 = nullptr; - if (ExtractShuffle) { + if (!ExtractShuffles.empty()) { // Gather of extractelements can be represented as just a shuffle of // a single/two vectors the scalars are extracted from. // Find input vectors. 
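The diamond-match path above either keeps an identity mask (when the gather bundle is the same scalar list as the matched entry) or looks up each scalar's lane in that entry via findLaneForValue. A simplified standalone sketch of that mask construction follows; it is my own model, with a linear search standing in for the lane lookup, and buildReuseMask is an illustrative name.

#include <cstddef>
#include <vector>

constexpr int kPoison = -1;

// Identical scalar lists get an identity mask; otherwise each bundle scalar
// is located in the matched entry, and unmatched scalars stay poison.
template <typename T>
std::vector<int> buildReuseMask(const std::vector<T> &BundleScalars,
                                const std::vector<T> &EntryScalars) {
  std::vector<int> Mask(BundleScalars.size(), kPoison);
  if (BundleScalars == EntryScalars) {
    for (std::size_t I = 0; I < Mask.size(); ++I)
      Mask[I] = static_cast<int>(I);
    return Mask;
  }
  for (std::size_t I = 0; I < BundleScalars.size(); ++I)
    for (std::size_t Lane = 0; Lane < EntryScalars.size(); ++Lane)
      if (EntryScalars[Lane] == BundleScalars[I]) {
        Mask[I] = static_cast<int>(Lane);
        break;
      }
  return Mask;
}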
Value *Vec2 = nullptr; for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) { - if (ExtractMask[I] == PoisonMaskElem || - (!Mask.empty() && Mask[I] != PoisonMaskElem)) { + if (!Mask.empty() && Mask[I] != PoisonMaskElem) ExtractMask[I] = PoisonMaskElem; - continue; - } - if (isa<UndefValue>(E->Scalars[I])) - continue; - auto *EI = cast<ExtractElementInst>(E->Scalars[I]); - if (!Vec1) { - Vec1 = EI->getVectorOperand(); - } else if (Vec1 != EI->getVectorOperand()) { - assert((!Vec2 || Vec2 == EI->getVectorOperand()) && - "Expected only 1 or 2 vectors shuffle."); - Vec2 = EI->getVectorOperand(); + } + if (UseVecBaseAsInput) { + Vec1 = ExtractVecBase; + } else { + for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) { + if (ExtractMask[I] == PoisonMaskElem) + continue; + if (isa<UndefValue>(E->Scalars[I])) + continue; + auto *EI = cast<ExtractElementInst>(E->Scalars[I]); + Value *VecOp = EI->getVectorOperand(); + if (const auto *TE = getTreeEntry(VecOp)) + if (TE->VectorizedValue) + VecOp = TE->VectorizedValue; + if (!Vec1) { + Vec1 = VecOp; + } else if (Vec1 != EI->getVectorOperand()) { + assert((!Vec2 || Vec2 == EI->getVectorOperand()) && + "Expected only 1 or 2 vectors shuffle."); + Vec2 = VecOp; + } } } if (Vec2) { + IsUsedInExpr = false; IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2); ShuffleBuilder.add(Vec1, Vec2, ExtractMask); } else if (Vec1) { - IsUsedInExpr = FindReusedSplat(ExtractMask); - ShuffleBuilder.add(Vec1, ExtractMask); + IsUsedInExpr &= FindReusedSplat( + ExtractMask, + cast<FixedVectorType>(Vec1->getType())->getNumElements()); + ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true); IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1); } else { + IsUsedInExpr = false; ShuffleBuilder.add(PoisonValue::get(FixedVectorType::get( ScalarTy, GatheredScalars.size())), - ExtractMask); + ExtractMask, /*ForExtracts=*/true); } } - if (GatherShuffle) { - if (Entries.size() == 1) { - IsUsedInExpr = FindReusedSplat(Mask); - ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask); - IsNonPoisoned &= - isGuaranteedNotToBePoison(Entries.front()->VectorizedValue); - } else { - ShuffleBuilder.add(Entries.front()->VectorizedValue, - Entries.back()->VectorizedValue, Mask); - IsNonPoisoned &= - isGuaranteedNotToBePoison(Entries.front()->VectorizedValue) && - isGuaranteedNotToBePoison(Entries.back()->VectorizedValue); + if (!GatherShuffles.empty()) { + unsigned SliceSize = E->Scalars.size() / NumParts; + SmallVector<int> VecMask(Mask.size(), PoisonMaskElem); + for (const auto [I, TEs] : enumerate(Entries)) { + if (TEs.empty()) { + assert(!GatherShuffles[I] && + "No shuffles with empty entries list expected."); + continue; + } + assert((TEs.size() == 1 || TEs.size() == 2) && + "Expected shuffle of 1 or 2 entries."); + auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize); + VecMask.assign(VecMask.size(), PoisonMaskElem); + copy(SubMask, std::next(VecMask.begin(), I * SliceSize)); + if (TEs.size() == 1) { + IsUsedInExpr &= + FindReusedSplat(VecMask, TEs.front()->getVectorFactor()); + ShuffleBuilder.add(*TEs.front(), VecMask); + if (TEs.front()->VectorizedValue) + IsNonPoisoned &= + isGuaranteedNotToBePoison(TEs.front()->VectorizedValue); + } else { + IsUsedInExpr = false; + ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask); + if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue) + IsNonPoisoned &= + isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) && + isGuaranteedNotToBePoison(TEs.back()->VectorizedValue); + } } } 
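With multiple register-sized parts, the gather shuffles above are handled per part: each slice of the combined mask is scattered into a full-width mask (poison everywhere else) and emitted as its own shuffle of at most two source entries. A standalone sketch of that slicing, under the assumption that NumParts evenly divides the mask size; emitPerPartShuffles and its parameters are illustrative, not LLVM names.

#include <algorithm>
#include <cassert>
#include <functional>
#include <vector>

constexpr int kPoison = -1;

// Parts with no matched source entries are skipped and stay gathered.
void emitPerPartShuffles(
    const std::vector<int> &Mask, unsigned NumParts,
    const std::vector<unsigned> &NumSourcesPerPart,
    const std::function<void(unsigned Part, const std::vector<int> &)> &Add) {
  assert(NumParts != 0 && Mask.size() % NumParts == 0 && "bad partitioning");
  unsigned SliceSize = Mask.size() / NumParts;
  std::vector<int> VecMask(Mask.size(), kPoison);
  for (unsigned Part = 0; Part < NumParts; ++Part) {
    if (NumSourcesPerPart[Part] == 0)
      continue;
    assert(NumSourcesPerPart[Part] <= 2 &&
           "Expected shuffle of 1 or 2 entries per part.");
    std::fill(VecMask.begin(), VecMask.end(), kPoison);
    for (unsigned I = 0; I < SliceSize; ++I)
      VecMask[Part * SliceSize + I] = Mask[Part * SliceSize + I];
    Add(Part, VecMask); // the builder uses 1 or 2 source entries here
  }
}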
// Try to figure out best way to combine values: build a shuffle and insert @@ -10025,16 +10926,24 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { int MSz = Mask.size(); // Try to build constant vector and shuffle with it only if currently we // have a single permutation and more than 1 scalar constants. - bool IsSingleShuffle = !ExtractShuffle || !GatherShuffle; + bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty(); bool IsIdentityShuffle = - (ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc) == - TTI::SK_PermuteSingleSrc && + ((UseVecBaseAsInput || + all_of(ExtractShuffles, + [](const std::optional<TTI::ShuffleKind> &SK) { + return SK.value_or(TTI::SK_PermuteTwoSrc) == + TTI::SK_PermuteSingleSrc; + })) && none_of(ExtractMask, [&](int I) { return I >= EMSz; }) && - ShuffleVectorInst::isIdentityMask(ExtractMask)) || - (GatherShuffle.value_or(TTI::SK_PermuteTwoSrc) == - TTI::SK_PermuteSingleSrc && + ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) || + (!GatherShuffles.empty() && + all_of(GatherShuffles, + [](const std::optional<TTI::ShuffleKind> &SK) { + return SK.value_or(TTI::SK_PermuteTwoSrc) == + TTI::SK_PermuteSingleSrc; + }) && none_of(Mask, [&](int I) { return I >= MSz; }) && - ShuffleVectorInst::isIdentityMask(Mask)); + ShuffleVectorInst::isIdentityMask(Mask, MSz)); bool EnoughConstsForShuffle = IsSingleShuffle && (none_of(GatheredScalars, @@ -10064,7 +10973,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { if (!all_of(GatheredScalars, PoisonValue::classof)) { SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem); TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true); - Value *BV = ShuffleBuilder.gather(GatheredScalars); + Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size()); ShuffleBuilder.add(BV, BVMask); } if (all_of(NonConstants, [=](Value *V) { @@ -10078,13 +10987,13 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { E->ReuseShuffleIndices, E->Scalars.size(), [&](Value *&Vec, SmallVectorImpl<int> &Mask) { TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false); - Vec = ShuffleBuilder.gather(NonConstants, Vec); + Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec); }); } else if (!allConstant(GatheredScalars)) { // Gather unique scalars and all constants. 
SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem); TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true); - Value *BV = ShuffleBuilder.gather(GatheredScalars); + Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size()); ShuffleBuilder.add(BV, ReuseMask); Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices); } else { @@ -10109,10 +11018,12 @@ Value *BoUpSLP::createBuildVector(const TreeEntry *E) { *this); } -Value *BoUpSLP::vectorizeTree(TreeEntry *E) { +Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { IRBuilder<>::InsertPointGuard Guard(Builder); - if (E->VectorizedValue) { + if (E->VectorizedValue && + (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI || + E->isAltShuffle())) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); return E->VectorizedValue; } @@ -10126,13 +11037,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return Vec; } - auto FinalShuffle = [&](Value *V, const TreeEntry *E) { + auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy, + bool IsSigned) { + if (V->getType() != VecTy) + V = Builder.CreateIntCast(V, VecTy, IsSigned); ShuffleInstructionBuilder ShuffleBuilder(Builder, *this); if (E->getOpcode() == Instruction::Store) { ArrayRef<int> Mask = ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()), E->ReorderIndices.size()); ShuffleBuilder.add(V, Mask); + } else if (E->State == TreeEntry::PossibleStridedVectorize) { + ShuffleBuilder.addOrdered(V, std::nullopt); } else { ShuffleBuilder.addOrdered(V, E->ReorderIndices); } @@ -10140,7 +11056,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { }; assert((E->State == TreeEntry::Vectorize || - E->State == TreeEntry::ScatterVectorize) && + E->State == TreeEntry::ScatterVectorize || + E->State == TreeEntry::PossibleStridedVectorize) && "Unhandled state"); unsigned ShuffleOrOp = E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); @@ -10150,6 +11067,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ScalarTy = Store->getValueOperand()->getType(); else if (auto *IE = dyn_cast<InsertElementInst>(VL0)) ScalarTy = IE->getOperand(1)->getType(); + bool IsSigned = false; + auto It = MinBWs.find(E); + if (It != MinBWs.end()) { + ScalarTy = IntegerType::get(F->getContext(), It->second.first); + IsSigned = It->second.second; + } auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size()); switch (ShuffleOrOp) { case Instruction::PHI: { @@ -10157,32 +11080,45 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { E != VectorizableTree.front().get() || !E->UserTreeIndices.empty()) && "PHI reordering is free."); + if (PostponedPHIs && E->VectorizedValue) + return E->VectorizedValue; auto *PH = cast<PHINode>(VL0); - Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); - Builder.SetCurrentDebugLocation(PH->getDebugLoc()); - PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); - Value *V = NewPhi; - - // Adjust insertion point once all PHI's have been generated. - Builder.SetInsertPoint(&*PH->getParent()->getFirstInsertionPt()); + Builder.SetInsertPoint(PH->getParent(), + PH->getParent()->getFirstNonPHIIt()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); + if (PostponedPHIs || !E->VectorizedValue) { + PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); + E->PHI = NewPhi; + Value *V = NewPhi; + + // Adjust insertion point once all PHI's have been generated. 
+ Builder.SetInsertPoint(PH->getParent(), + PH->getParent()->getFirstInsertionPt()); + Builder.SetCurrentDebugLocation(PH->getDebugLoc()); - V = FinalShuffle(V, E); + V = FinalShuffle(V, E, VecTy, IsSigned); - E->VectorizedValue = V; + E->VectorizedValue = V; + if (PostponedPHIs) + return V; + } + PHINode *NewPhi = cast<PHINode>(E->PHI); + // If phi node is fully emitted - exit. + if (NewPhi->getNumIncomingValues() != 0) + return NewPhi; // PHINodes may have multiple entries from the same block. We want to // visit every block once. SmallPtrSet<BasicBlock *, 4> VisitedBBs; - for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { + for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) { ValueList Operands; - BasicBlock *IBB = PH->getIncomingBlock(i); + BasicBlock *IBB = PH->getIncomingBlock(I); // Stop emission if all incoming values are generated. if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); - return V; + return NewPhi; } if (!VisitedBBs.insert(IBB).second) { @@ -10192,37 +11128,54 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Builder.SetInsertPoint(IBB->getTerminator()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); - Value *Vec = vectorizeOperand(E, i); + Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true); + if (VecTy != Vec->getType()) { + assert(MinBWs.contains(getOperandEntry(E, I)) && + "Expected item in MinBWs."); + Vec = Builder.CreateIntCast(Vec, VecTy, It->second.second); + } NewPhi->addIncoming(Vec, IBB); } assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() && "Invalid number of incoming values"); - return V; + return NewPhi; } case Instruction::ExtractElement: { Value *V = E->getSingleOperand(0); setInsertPointAfterBundle(E); - V = FinalShuffle(V, E); + V = FinalShuffle(V, E, VecTy, IsSigned); E->VectorizedValue = V; return V; } case Instruction::ExtractValue: { auto *LI = cast<LoadInst>(E->getSingleOperand(0)); Builder.SetInsertPoint(LI); - auto *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); - Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy); + Value *Ptr = LI->getPointerOperand(); LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign()); Value *NewV = propagateMetadata(V, E->Scalars); - NewV = FinalShuffle(NewV, E); + NewV = FinalShuffle(NewV, E, VecTy, IsSigned); E->VectorizedValue = NewV; return NewV; } case Instruction::InsertElement: { assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique"); Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back())); - Value *V = vectorizeOperand(E, 1); + Value *V = vectorizeOperand(E, 1, PostponedPHIs); + ArrayRef<Value *> Op = E->getOperand(1); + Type *ScalarTy = Op.front()->getType(); + if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) { + assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs."); + std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1)); + assert(Res.first > 0 && "Expected item in MinBWs."); + V = Builder.CreateIntCast( + V, + FixedVectorType::get( + ScalarTy, + cast<FixedVectorType>(V->getType())->getNumElements()), + Res.second); + } // Create InsertVector shuffle if necessary auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) { @@ -10255,7 +11208,57 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Mask[InsertIdx - Offset] = I; } if (!IsIdentity || NumElts != NumScalars) { - V = Builder.CreateShuffleVector(V, Mask); + Value *V2 = nullptr; + bool 
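The PHI handling above is deliberately two-phased: the first visit only creates an empty vector PHI, and the incoming values are vectorized on a later visit, which breaks the cyclic dependency a PHI can have on its own operands. A minimal standalone sketch of that shape, using toy types rather than LLVM's; vectorizePHI, PHINodeModel and VecPHI are illustrative names.

#include <cstddef>
#include <vector>

struct VecPHI {
  std::vector<int> Incoming; // one vectorized value per predecessor block
  bool Created = false;
};

struct PHINodeModel {
  std::vector<int> OperandNodes; // one operand tree node per predecessor
  VecPHI Vec;
};

// First pass (PostponedPHIs == true): only create the empty vector PHI.
// Second pass: vectorize the operands and attach the incoming values.
void vectorizePHI(PHINodeModel &N, bool PostponedPHIs,
                  int (*VectorizeOperand)(int NodeId)) {
  if (!N.Vec.Created) {
    N.Vec.Created = true; // placeholder PHI with no incoming values yet
    if (PostponedPHIs)
      return;             // operands are resolved in the later pass
  }
  if (!N.Vec.Incoming.empty())
    return;               // already fully emitted
  for (std::size_t I = 0; I < N.OperandNodes.size(); ++I)
    N.Vec.Incoming.push_back(VectorizeOperand(N.OperandNodes[I]));
}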
IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V); + SmallVector<int> InsertMask(Mask); + if (NumElts != NumScalars && Offset == 0) { + // Follow all insert element instructions from the current buildvector + // sequence. + InsertElementInst *Ins = cast<InsertElementInst>(VL0); + do { + std::optional<unsigned> InsertIdx = getInsertIndex(Ins); + if (!InsertIdx) + break; + if (InsertMask[*InsertIdx] == PoisonMaskElem) + InsertMask[*InsertIdx] = *InsertIdx; + if (!Ins->hasOneUse()) + break; + Ins = dyn_cast_or_null<InsertElementInst>( + Ins->getUniqueUndroppableUser()); + } while (Ins); + SmallBitVector UseMask = + buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask); + SmallBitVector IsFirstPoison = + isUndefVector<true>(FirstInsert->getOperand(0), UseMask); + SmallBitVector IsFirstUndef = + isUndefVector(FirstInsert->getOperand(0), UseMask); + if (!IsFirstPoison.all()) { + unsigned Idx = 0; + for (unsigned I = 0; I < NumElts; I++) { + if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) && + IsFirstUndef.test(I)) { + if (IsVNonPoisonous) { + InsertMask[I] = I < NumScalars ? I : 0; + continue; + } + if (!V2) + V2 = UndefValue::get(V->getType()); + if (Idx >= NumScalars) + Idx = NumScalars - 1; + InsertMask[I] = NumScalars + Idx; + ++Idx; + } else if (InsertMask[I] != PoisonMaskElem && + Mask[I] == PoisonMaskElem) { + InsertMask[I] = PoisonMaskElem; + } + } + } else { + InsertMask = Mask; + } + } + if (!V2) + V2 = PoisonValue::get(V->getType()); + V = Builder.CreateShuffleVector(V, V2, InsertMask); if (auto *I = dyn_cast<Instruction>(V)) { GatherShuffleExtractSeq.insert(I); CSEBlocks.insert(I->getParent()); @@ -10274,15 +11277,15 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) && NumElts != NumScalars) { if (IsFirstUndef.all()) { - if (!ShuffleVectorInst::isIdentityMask(InsertMask)) { - SmallBitVector IsFirstPoison = - isUndefVector<true>(FirstInsert->getOperand(0), UseMask); - if (!IsFirstPoison.all()) { - for (unsigned I = 0; I < NumElts; I++) { - if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I)) - InsertMask[I] = I + NumElts; + if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) { + SmallBitVector IsFirstPoison = + isUndefVector<true>(FirstInsert->getOperand(0), UseMask); + if (!IsFirstPoison.all()) { + for (unsigned I = 0; I < NumElts; I++) { + if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I)) + InsertMask[I] = I + NumElts; + } } - } V = Builder.CreateShuffleVector( V, IsFirstPoison.all() ? PoisonValue::get(V->getType()) @@ -10330,15 +11333,36 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::BitCast: { setInsertPointAfterBundle(E); - Value *InVec = vectorizeOperand(E, 0); + Value *InVec = vectorizeOperand(E, 0, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } auto *CI = cast<CastInst>(VL0); - Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); - V = FinalShuffle(V, E); + Instruction::CastOps VecOpcode = CI->getOpcode(); + Type *SrcScalarTy = VL0->getOperand(0)->getType(); + auto SrcIt = MinBWs.find(getOperandEntry(E, 0)); + if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() && + (SrcIt != MinBWs.end() || It != MinBWs.end())) { + // Check if the values are candidates to demote. 
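The insert-element path above follows the whole buildvector chain from the vectorized insert so that lanes later inserts will define are not taken from the original operand. A simplified standalone sketch of that walk with a toy node type; InsertModel and followInsertChain are illustrative, and the single-use restriction is modelled by a null SingleUser link.

#include <optional>
#include <vector>

constexpr int kPoison = -1;

struct InsertModel {
  std::optional<unsigned> Index;     // constant insert index, if known
  InsertModel *SingleUser = nullptr; // next insert, if this one has one use
};

// Mark every lane the chain defines; stop at a non-constant index,
// an out-of-range index, or a multi-use link.
void followInsertChain(const InsertModel *Ins, std::vector<int> &InsertMask) {
  while (Ins) {
    if (!Ins->Index || *Ins->Index >= InsertMask.size())
      break;
    if (InsertMask[*Ins->Index] == kPoison)
      InsertMask[*Ins->Index] = static_cast<int>(*Ins->Index);
    Ins = Ins->SingleUser;
  }
}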
+ unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy); + if (SrcIt != MinBWs.end()) + SrcBWSz = SrcIt->second.first; + unsigned BWSz = DL->getTypeSizeInBits(ScalarTy); + if (BWSz == SrcBWSz) { + VecOpcode = Instruction::BitCast; + } else if (BWSz < SrcBWSz) { + VecOpcode = Instruction::Trunc; + } else if (It != MinBWs.end()) { + assert(BWSz > SrcBWSz && "Invalid cast!"); + VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt; + } + } + Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast) + ? InVec + : Builder.CreateCast(VecOpcode, InVec, VecTy); + V = FinalShuffle(V, E, VecTy, IsSigned); E->VectorizedValue = V; ++NumVectorInstructions; @@ -10348,21 +11372,30 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::ICmp: { setInsertPointAfterBundle(E); - Value *L = vectorizeOperand(E, 0); + Value *L = vectorizeOperand(E, 0, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } - Value *R = vectorizeOperand(E, 1); + Value *R = vectorizeOperand(E, 1, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } + if (L->getType() != R->getType()) { + assert((MinBWs.contains(getOperandEntry(E, 0)) || + MinBWs.contains(getOperandEntry(E, 1))) && + "Expected item in MinBWs."); + L = Builder.CreateIntCast(L, VecTy, IsSigned); + R = Builder.CreateIntCast(R, VecTy, IsSigned); + } CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); Value *V = Builder.CreateCmp(P0, L, R); propagateIRFlags(V, E->Scalars, VL0); - V = FinalShuffle(V, E); + // Do not cast for cmps. + VecTy = cast<FixedVectorType>(V->getType()); + V = FinalShuffle(V, E, VecTy, IsSigned); E->VectorizedValue = V; ++NumVectorInstructions; @@ -10371,24 +11404,31 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::Select: { setInsertPointAfterBundle(E); - Value *Cond = vectorizeOperand(E, 0); + Value *Cond = vectorizeOperand(E, 0, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } - Value *True = vectorizeOperand(E, 1); + Value *True = vectorizeOperand(E, 1, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } - Value *False = vectorizeOperand(E, 2); + Value *False = vectorizeOperand(E, 2, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } + if (True->getType() != False->getType()) { + assert((MinBWs.contains(getOperandEntry(E, 1)) || + MinBWs.contains(getOperandEntry(E, 2))) && + "Expected item in MinBWs."); + True = Builder.CreateIntCast(True, VecTy, IsSigned); + False = Builder.CreateIntCast(False, VecTy, IsSigned); + } Value *V = Builder.CreateSelect(Cond, True, False); - V = FinalShuffle(V, E); + V = FinalShuffle(V, E, VecTy, IsSigned); E->VectorizedValue = V; ++NumVectorInstructions; @@ -10397,7 +11437,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::FNeg: { setInsertPointAfterBundle(E); - Value *Op = vectorizeOperand(E, 0); + Value *Op = vectorizeOperand(E, 0, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -10410,7 +11450,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (auto *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); - V = FinalShuffle(V, 
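The cast-opcode selection above, used when minimum bit widths are in play, boils down to a width comparison: equal widths degenerate to a bitcast (effectively no cast), a narrower destination truncates, and a wider destination sign- or zero-extends depending on the recorded signedness. A small standalone sketch of that rule; CastOp and pickIntCast are illustrative names.

#include <cassert>

enum class CastOp { None, Trunc, SExt, ZExt };

CastOp pickIntCast(unsigned SrcBits, unsigned DstBits, bool DstIsSigned) {
  if (DstBits == SrcBits)
    return CastOp::None;  // widths match: bitcast / reuse the value as-is
  if (DstBits < SrcBits)
    return CastOp::Trunc; // demoting to a narrower integer
  assert(DstBits > SrcBits && "Invalid cast!");
  return DstIsSigned ? CastOp::SExt : CastOp::ZExt;
}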
E); + V = FinalShuffle(V, E, VecTy, IsSigned); E->VectorizedValue = V; ++NumVectorInstructions; @@ -10437,16 +11477,23 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::Xor: { setInsertPointAfterBundle(E); - Value *LHS = vectorizeOperand(E, 0); + Value *LHS = vectorizeOperand(E, 0, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } - Value *RHS = vectorizeOperand(E, 1); + Value *RHS = vectorizeOperand(E, 1, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } + if (LHS->getType() != RHS->getType()) { + assert((MinBWs.contains(getOperandEntry(E, 0)) || + MinBWs.contains(getOperandEntry(E, 1))) && + "Expected item in MinBWs."); + LHS = Builder.CreateIntCast(LHS, VecTy, IsSigned); + RHS = Builder.CreateIntCast(RHS, VecTy, IsSigned); + } Value *V = Builder.CreateBinOp( static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, @@ -10455,7 +11502,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (auto *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); - V = FinalShuffle(V, E); + V = FinalShuffle(V, E, VecTy, IsSigned); E->VectorizedValue = V; ++NumVectorInstructions; @@ -10476,14 +11523,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { // The pointer operand uses an in-tree scalar so we add the new // LoadInst to ExternalUses list to make sure that an extract will // be generated in the future. - if (TreeEntry *Entry = getTreeEntry(PO)) { - // Find which lane we need to extract. - unsigned FoundLane = Entry->findLaneForValue(PO); - ExternalUses.emplace_back(PO, NewLI, FoundLane); + if (isa<Instruction>(PO)) { + if (TreeEntry *Entry = getTreeEntry(PO)) { + // Find which lane we need to extract. + unsigned FoundLane = Entry->findLaneForValue(PO); + ExternalUses.emplace_back(PO, NewLI, FoundLane); + } } } else { - assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state"); - Value *VecPtr = vectorizeOperand(E, 0); + assert((E->State == TreeEntry::ScatterVectorize || + E->State == TreeEntry::PossibleStridedVectorize) && + "Unhandled state"); + Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; @@ -10497,35 +11548,32 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *V = propagateMetadata(NewLI, E->Scalars); - V = FinalShuffle(V, E); + V = FinalShuffle(V, E, VecTy, IsSigned); E->VectorizedValue = V; ++NumVectorInstructions; return V; } case Instruction::Store: { auto *SI = cast<StoreInst>(VL0); - unsigned AS = SI->getPointerAddressSpace(); setInsertPointAfterBundle(E); - Value *VecValue = vectorizeOperand(E, 0); - VecValue = FinalShuffle(VecValue, E); + Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs); + VecValue = FinalShuffle(VecValue, E, VecTy, IsSigned); - Value *ScalarPtr = SI->getPointerOperand(); - Value *VecPtr = Builder.CreateBitCast( - ScalarPtr, VecValue->getType()->getPointerTo(AS)); + Value *Ptr = SI->getPointerOperand(); StoreInst *ST = - Builder.CreateAlignedStore(VecValue, VecPtr, SI->getAlign()); + Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign()); - // The pointer operand uses an in-tree scalar, so add the new BitCast or - // StoreInst to ExternalUses to make sure that an extract will be - // generated in the future. - if (TreeEntry *Entry = getTreeEntry(ScalarPtr)) { - // Find which lane we need to extract. 
- unsigned FoundLane = Entry->findLaneForValue(ScalarPtr); - ExternalUses.push_back(ExternalUser( - ScalarPtr, ScalarPtr != VecPtr ? cast<User>(VecPtr) : ST, - FoundLane)); + // The pointer operand uses an in-tree scalar, so add the new StoreInst to + // ExternalUses to make sure that an extract will be generated in the + // future. + if (isa<Instruction>(Ptr)) { + if (TreeEntry *Entry = getTreeEntry(Ptr)) { + // Find which lane we need to extract. + unsigned FoundLane = Entry->findLaneForValue(Ptr); + ExternalUses.push_back(ExternalUser(Ptr, ST, FoundLane)); + } } Value *V = propagateMetadata(ST, E->Scalars); @@ -10538,7 +11586,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { auto *GEP0 = cast<GetElementPtrInst>(VL0); setInsertPointAfterBundle(E); - Value *Op0 = vectorizeOperand(E, 0); + Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; @@ -10546,7 +11594,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { SmallVector<Value *> OpVecs; for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) { - Value *OpVec = vectorizeOperand(E, J); + Value *OpVec = vectorizeOperand(E, J, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; @@ -10564,7 +11612,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { V = propagateMetadata(I, GEPs); } - V = FinalShuffle(V, E); + V = FinalShuffle(V, E, VecTy, IsSigned); E->VectorizedValue = V; ++NumVectorInstructions; @@ -10586,41 +11634,42 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { VecCallCosts.first <= VecCallCosts.second; Value *ScalarArg = nullptr; - std::vector<Value *> OpVecs; + SmallVector<Value *> OpVecs; SmallVector<Type *, 2> TysForDecl; // Add return type if intrinsic is overloaded on it. if (isVectorIntrinsicWithOverloadTypeAtArg(IID, -1)) TysForDecl.push_back( FixedVectorType::get(CI->getType(), E->Scalars.size())); - for (int j = 0, e = CI->arg_size(); j < e; ++j) { + for (unsigned I : seq<unsigned>(0, CI->arg_size())) { ValueList OpVL; // Some intrinsics have scalar arguments. This argument should not be // vectorized. 
- if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(IID, j)) { + if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(IID, I)) { CallInst *CEI = cast<CallInst>(VL0); - ScalarArg = CEI->getArgOperand(j); - OpVecs.push_back(CEI->getArgOperand(j)); - if (isVectorIntrinsicWithOverloadTypeAtArg(IID, j)) + ScalarArg = CEI->getArgOperand(I); + OpVecs.push_back(CEI->getArgOperand(I)); + if (isVectorIntrinsicWithOverloadTypeAtArg(IID, I)) TysForDecl.push_back(ScalarArg->getType()); continue; } - Value *OpVec = vectorizeOperand(E, j); + Value *OpVec = vectorizeOperand(E, I, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } - LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); + LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n"); OpVecs.push_back(OpVec); - if (isVectorIntrinsicWithOverloadTypeAtArg(IID, j)) + if (isVectorIntrinsicWithOverloadTypeAtArg(IID, I)) TysForDecl.push_back(OpVec->getType()); } Function *CF; if (!UseIntrinsic) { VFShape Shape = - VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>( - VecTy->getNumElements())), + VFShape::get(CI->getFunctionType(), + ElementCount::getFixed( + static_cast<unsigned>(VecTy->getNumElements())), false /*HasGlobalPred*/); CF = VFDatabase(*CI).getVectorizedFunction(Shape); } else { @@ -10634,7 +11683,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { // The scalar argument uses an in-tree scalar so we add the new vectorized // call to ExternalUses list to make sure that an extract will be // generated in the future. - if (ScalarArg) { + if (isa_and_present<Instruction>(ScalarArg)) { if (TreeEntry *Entry = getTreeEntry(ScalarArg)) { // Find which lane we need to extract. unsigned FoundLane = Entry->findLaneForValue(ScalarArg); @@ -10644,7 +11693,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } propagateIRFlags(V, E->Scalars, VL0); - V = FinalShuffle(V, E); + V = FinalShuffle(V, E, VecTy, IsSigned); E->VectorizedValue = V; ++NumVectorInstructions; @@ -10662,20 +11711,27 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *LHS = nullptr, *RHS = nullptr; if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) { setInsertPointAfterBundle(E); - LHS = vectorizeOperand(E, 0); + LHS = vectorizeOperand(E, 0, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } - RHS = vectorizeOperand(E, 1); + RHS = vectorizeOperand(E, 1, PostponedPHIs); } else { setInsertPointAfterBundle(E); - LHS = vectorizeOperand(E, 0); + LHS = vectorizeOperand(E, 0, PostponedPHIs); } if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } + if (LHS && RHS && LHS->getType() != RHS->getType()) { + assert((MinBWs.contains(getOperandEntry(E, 0)) || + MinBWs.contains(getOperandEntry(E, 1))) && + "Expected item in MinBWs."); + LHS = Builder.CreateIntCast(LHS, VecTy, IsSigned); + RHS = Builder.CreateIntCast(RHS, VecTy, IsSigned); + } Value *V0, *V1; if (Instruction::isBinaryOp(E->getOpcode())) { @@ -10708,8 +11764,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { // each vector operation. 
ValueList OpScalars, AltScalars; SmallVector<int> Mask; - buildShuffleEntryMask( - E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, + E->buildAltOpShuffleMask( [E, this](Instruction *I) { assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(), @@ -10727,6 +11782,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { CSEBlocks.insert(I->getParent()); } + if (V->getType() != VecTy && !isa<CmpInst>(VL0)) + V = Builder.CreateIntCast( + V, FixedVectorType::get(ScalarTy, E->getVectorFactor()), IsSigned); E->VectorizedValue = V; ++NumVectorInstructions; @@ -10767,9 +11825,19 @@ Value *BoUpSLP::vectorizeTree( // need to rebuild it. EntryToLastInstruction.clear(); - Builder.SetInsertPoint(ReductionRoot ? ReductionRoot - : &F->getEntryBlock().front()); - auto *VectorRoot = vectorizeTree(VectorizableTree[0].get()); + if (ReductionRoot) + Builder.SetInsertPoint(ReductionRoot->getParent(), + ReductionRoot->getIterator()); + else + Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin()); + + // Postpone emission of PHIs operands to avoid cyclic dependencies issues. + (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true); + for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) + if (TE->State == TreeEntry::Vectorize && + TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() && + TE->VectorizedValue) + (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false); // Run through the list of postponed gathers and emit them, replacing the temp // emitted allocas with actual vector instructions. ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef(); @@ -10786,9 +11854,32 @@ Value *BoUpSLP::vectorizeTree( TE->VectorizedValue = nullptr; auto *UserI = cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue); - Builder.SetInsertPoint(PrevVec); + // If user is a PHI node, its vector code have to be inserted right before + // block terminator. Since the node was delayed, there were some unresolved + // dependencies at the moment when stab instruction was emitted. In a case + // when any of these dependencies turn out an operand of another PHI, coming + // from this same block, position of a stab instruction will become invalid. + // The is because source vector that supposed to feed this gather node was + // inserted at the end of the block [after stab instruction]. So we need + // to adjust insertion point again to the end of block. + if (isa<PHINode>(UserI)) { + // Insert before all users. + Instruction *InsertPt = PrevVec->getParent()->getTerminator(); + for (User *U : PrevVec->users()) { + if (U == UserI) + continue; + auto *UI = dyn_cast<Instruction>(U); + if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent()) + continue; + if (UI->comesBefore(InsertPt)) + InsertPt = UI; + } + Builder.SetInsertPoint(InsertPt); + } else { + Builder.SetInsertPoint(PrevVec); + } Builder.SetCurrentDebugLocation(UserI->getDebugLoc()); - Value *Vec = vectorizeTree(TE); + Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false); PrevVec->replaceAllUsesWith(Vec); PostponedValues.try_emplace(Vec).first->second.push_back(TE); // Replace the stub vector node, if it was used before for one of the @@ -10801,26 +11892,6 @@ Value *BoUpSLP::vectorizeTree( eraseInstruction(PrevVec); } - // If the vectorized tree can be rewritten in a smaller type, we truncate the - // vectorized root. InstCombine will then rewrite the entire expression. 
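The tree-level driver above now emits code in stages to keep ordering correct: everything is emitted once with PHI operands postponed, every vector PHI is then revisited to fill its incoming values, and finally the postponed gather nodes replace their temporary stub values. A standalone sketch of that ordering, assuming a non-empty tree whose root is the first entry; EntryModel and emitTree are illustrative names, not LLVM's.

#include <vector>

struct EntryModel {
  bool IsPHI = false;
  bool IsPostponedGather = false;
};

// Pass 1 emits the whole tree with PHI operands postponed, pass 2 revisits
// each vector PHI to fill its incoming values, pass 3 re-emits postponed
// gathers so they replace their stubs.
void emitTree(std::vector<EntryModel> &Tree,
              void (*Vectorize)(EntryModel &E, bool PostponedPHIs)) {
  Vectorize(Tree.front(), /*PostponedPHIs=*/true);
  for (EntryModel &E : Tree)
    if (E.IsPHI)
      Vectorize(E, /*PostponedPHIs=*/false);
  for (EntryModel &E : Tree)
    if (E.IsPostponedGather)
      Vectorize(E, /*PostponedPHIs=*/false);
}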
We - // sign extend the extracted values below. - auto *ScalarRoot = VectorizableTree[0]->Scalars[0]; - if (MinBWs.count(ScalarRoot)) { - if (auto *I = dyn_cast<Instruction>(VectorRoot)) { - // If current instr is a phi and not the last phi, insert it after the - // last phi node. - if (isa<PHINode>(I)) - Builder.SetInsertPoint(&*I->getParent()->getFirstInsertionPt()); - else - Builder.SetInsertPoint(&*++BasicBlock::iterator(I)); - } - auto BundleWidth = VectorizableTree[0]->Scalars.size(); - auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); - auto *VecTy = FixedVectorType::get(MinTy, BundleWidth); - auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy); - VectorizableTree[0]->VectorizedValue = Trunc; - } - LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n"); @@ -10830,6 +11901,8 @@ Value *BoUpSLP::vectorizeTree( // Maps extract Scalar to the corresponding extractelement instruction in the // basic block. Only one extractelement per block should be emitted. DenseMap<Value *, DenseMap<BasicBlock *, Instruction *>> ScalarToEEs; + SmallDenseSet<Value *, 4> UsedInserts; + DenseMap<Value *, Value *> VectorCasts; // Extract all of the elements with the external uses. for (const auto &ExternalUse : ExternalUses) { Value *Scalar = ExternalUse.Scalar; @@ -10864,7 +11937,8 @@ Value *BoUpSLP::vectorizeTree( Instruction *I = EEIt->second; if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() && Builder.GetInsertPoint()->comesBefore(I)) - I->moveBefore(&*Builder.GetInsertPoint()); + I->moveBefore(*Builder.GetInsertPoint()->getParent(), + Builder.GetInsertPoint()); Ex = I; } } @@ -10887,11 +11961,10 @@ Value *BoUpSLP::vectorizeTree( } // If necessary, sign-extend or zero-extend ScalarRoot // to the larger type. - if (!MinBWs.count(ScalarRoot)) - return Ex; - if (MinBWs[ScalarRoot].second) - return Builder.CreateSExt(Ex, Scalar->getType()); - return Builder.CreateZExt(Ex, Scalar->getType()); + if (Scalar->getType() != Ex->getType()) + return Builder.CreateIntCast(Ex, Scalar->getType(), + MinBWs.find(E)->second.second); + return Ex; } assert(isa<FixedVectorType>(Scalar->getType()) && isa<InsertElementInst>(Scalar) && @@ -10909,12 +11982,13 @@ Value *BoUpSLP::vectorizeTree( "ExternallyUsedValues map"); if (auto *VecI = dyn_cast<Instruction>(Vec)) { if (auto *PHI = dyn_cast<PHINode>(VecI)) - Builder.SetInsertPoint(PHI->getParent()->getFirstNonPHI()); + Builder.SetInsertPoint(PHI->getParent(), + PHI->getParent()->getFirstNonPHIIt()); else Builder.SetInsertPoint(VecI->getParent(), std::next(VecI->getIterator())); } else { - Builder.SetInsertPoint(&F->getEntryBlock().front()); + Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin()); } Value *NewInst = ExtractAndExtendIfNeeded(Vec); // Required to update internally referenced instructions. @@ -10927,12 +12001,26 @@ Value *BoUpSLP::vectorizeTree( // Skip if the scalar is another vector op or Vec is not an instruction. if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) { if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) { + if (!UsedInserts.insert(VU).second) + continue; + // Need to use original vector, if the root is truncated. 
+ auto BWIt = MinBWs.find(E); + if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) { + auto VecIt = VectorCasts.find(Scalar); + if (VecIt == VectorCasts.end()) { + IRBuilder<>::InsertPointGuard Guard(Builder); + if (auto *IVec = dyn_cast<Instruction>(Vec)) + Builder.SetInsertPoint(IVec->getNextNonDebugInstruction()); + Vec = Builder.CreateIntCast(Vec, VU->getType(), + BWIt->second.second); + VectorCasts.try_emplace(Scalar, Vec); + } else { + Vec = VecIt->second; + } + } + std::optional<unsigned> InsertIdx = getInsertIndex(VU); if (InsertIdx) { - // Need to use original vector, if the root is truncated. - if (MinBWs.count(Scalar) && - VectorizableTree[0]->VectorizedValue == Vec) - Vec = VectorRoot; auto *It = find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) { // Checks if 2 insertelements are from the same buildvector. @@ -10992,18 +12080,18 @@ Value *BoUpSLP::vectorizeTree( // Find the insertion point for the extractelement lane. if (auto *VecI = dyn_cast<Instruction>(Vec)) { if (PHINode *PH = dyn_cast<PHINode>(User)) { - for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) { - if (PH->getIncomingValue(i) == Scalar) { + for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) { + if (PH->getIncomingValue(I) == Scalar) { Instruction *IncomingTerminator = - PH->getIncomingBlock(i)->getTerminator(); + PH->getIncomingBlock(I)->getTerminator(); if (isa<CatchSwitchInst>(IncomingTerminator)) { Builder.SetInsertPoint(VecI->getParent(), std::next(VecI->getIterator())); } else { - Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator()); + Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator()); } Value *NewInst = ExtractAndExtendIfNeeded(Vec); - PH->setOperand(i, NewInst); + PH->setOperand(I, NewInst); } } } else { @@ -11012,7 +12100,7 @@ Value *BoUpSLP::vectorizeTree( User->replaceUsesOfWith(Scalar, NewInst); } } else { - Builder.SetInsertPoint(&F->getEntryBlock().front()); + Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin()); Value *NewInst = ExtractAndExtendIfNeeded(Vec); User->replaceUsesOfWith(Scalar, NewInst); } @@ -11085,7 +12173,7 @@ Value *BoUpSLP::vectorizeTree( // non-resizing mask. if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType()) ->getNumElements() || - !ShuffleVectorInst::isIdentityMask(Mask)) + !ShuffleVectorInst::isIdentityMask(Mask, Mask.size())) return CreateShuffle(Vals.front(), nullptr, Mask); return Vals.front(); } @@ -11676,7 +12764,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, } } - auto makeControlDependent = [&](Instruction *I) { + auto MakeControlDependent = [&](Instruction *I) { auto *DepDest = getScheduleData(I); assert(DepDest && "must be in schedule window"); DepDest->ControlDependencies.push_back(BundleMember); @@ -11698,7 +12786,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, continue; // Add the dependency - makeControlDependent(I); + MakeControlDependent(I); if (!isGuaranteedToTransferExecutionToSuccessor(I)) // Everything past here must be control dependent on I. 
@@ -11724,7 +12812,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, continue; // Add the dependency - makeControlDependent(I); + MakeControlDependent(I); } } @@ -11742,7 +12830,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, continue; // Add the dependency - makeControlDependent(I); + MakeControlDependent(I); break; } } @@ -11757,7 +12845,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, "NextLoadStore list for non memory effecting bundle?"); MemoryLocation SrcLoc = getLocation(SrcInst); bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory(); - unsigned numAliased = 0; + unsigned NumAliased = 0; unsigned DistToSrc = 1; for (; DepDest; DepDest = DepDest->NextLoadStore) { @@ -11772,13 +12860,13 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, // check this limit even between two read-only instructions. if (DistToSrc >= MaxMemDepDistance || ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) && - (numAliased >= AliasedCheckLimit || + (NumAliased >= AliasedCheckLimit || SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) { // We increment the counter only if the locations are aliased // (instead of counting all alias checks). This gives a better // balance between reduced runtime and accurate dependencies. - numAliased++; + NumAliased++; DepDest->MemoryDependencies.push_back(BundleMember); BundleMember->Dependencies++; @@ -11880,20 +12968,20 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { // Do the "real" scheduling. while (!ReadyInsts.empty()) { - ScheduleData *picked = *ReadyInsts.begin(); + ScheduleData *Picked = *ReadyInsts.begin(); ReadyInsts.erase(ReadyInsts.begin()); // Move the scheduled instruction(s) to their dedicated places, if not // there yet. - for (ScheduleData *BundleMember = picked; BundleMember; + for (ScheduleData *BundleMember = Picked; BundleMember; BundleMember = BundleMember->NextInBundle) { - Instruction *pickedInst = BundleMember->Inst; - if (pickedInst->getNextNode() != LastScheduledInst) - pickedInst->moveBefore(LastScheduledInst); - LastScheduledInst = pickedInst; + Instruction *PickedInst = BundleMember->Inst; + if (PickedInst->getNextNode() != LastScheduledInst) + PickedInst->moveBefore(LastScheduledInst); + LastScheduledInst = PickedInst; } - BS->schedule(picked, ReadyInsts); + BS->schedule(Picked, ReadyInsts); } // Check that we didn't break any of our invariants. @@ -11994,21 +13082,22 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) { // Determine if a value V in a vectorizable expression Expr can be demoted to a // smaller type with a truncation. We collect the values that will be demoted // in ToDemote and additional roots that require investigating in Roots. -static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr, - SmallVectorImpl<Value *> &ToDemote, - SmallVectorImpl<Value *> &Roots) { +bool BoUpSLP::collectValuesToDemote( + Value *V, SmallVectorImpl<Value *> &ToDemote, + DenseMap<Instruction *, SmallVector<unsigned>> &DemotedConsts, + SmallVectorImpl<Value *> &Roots, DenseSet<Value *> &Visited) const { // We can always demote constants. - if (isa<Constant>(V)) { - ToDemote.push_back(V); + if (isa<Constant>(V)) return true; - } - // If the value is not an instruction in the expression with only one use, it - // cannot be demoted. + // If the value is not a vectorized instruction in the expression with only + // one use, it cannot be demoted. 
auto *I = dyn_cast<Instruction>(V); - if (!I || !I->hasOneUse() || !Expr.count(I)) + if (!I || !I->hasOneUse() || !getTreeEntry(I) || !Visited.insert(I).second) return false; + unsigned Start = 0; + unsigned End = I->getNumOperands(); switch (I->getOpcode()) { // We can always demote truncations and extensions. Since truncations can @@ -12030,16 +13119,21 @@ static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr, case Instruction::And: case Instruction::Or: case Instruction::Xor: - if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) || - !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots)) + if (!collectValuesToDemote(I->getOperand(0), ToDemote, DemotedConsts, Roots, + Visited) || + !collectValuesToDemote(I->getOperand(1), ToDemote, DemotedConsts, Roots, + Visited)) return false; break; // We can demote selects if we can demote their true and false values. case Instruction::Select: { + Start = 1; SelectInst *SI = cast<SelectInst>(I); - if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) || - !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots)) + if (!collectValuesToDemote(SI->getTrueValue(), ToDemote, DemotedConsts, + Roots, Visited) || + !collectValuesToDemote(SI->getFalseValue(), ToDemote, DemotedConsts, + Roots, Visited)) return false; break; } @@ -12049,7 +13143,8 @@ static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr, case Instruction::PHI: { PHINode *PN = cast<PHINode>(I); for (Value *IncValue : PN->incoming_values()) - if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots)) + if (!collectValuesToDemote(IncValue, ToDemote, DemotedConsts, Roots, + Visited)) return false; break; } @@ -12059,6 +13154,10 @@ static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr, return false; } + // Gather demoted constant operands. + for (unsigned Idx : seq<unsigned>(Start, End)) + if (isa<Constant>(I->getOperand(Idx))) + DemotedConsts.try_emplace(I).first->getSecond().push_back(Idx); // Record the value that we can demote. ToDemote.push_back(V); return true; @@ -12076,44 +13175,26 @@ void BoUpSLP::computeMinimumValueSizes() { if (!TreeRootIT) return; - // If the expression is not rooted by a store, these roots should have - // external uses. We will rely on InstCombine to rewrite the expression in - // the narrower type. However, InstCombine only rewrites single-use values. - // This means that if a tree entry other than a root is used externally, it - // must have multiple uses and InstCombine will not rewrite it. The code - // below ensures that only the roots are used externally. - SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end()); - for (auto &EU : ExternalUses) - if (!Expr.erase(EU.Scalar)) - return; - if (!Expr.empty()) + // Ensure the roots of the vectorizable tree don't form a cycle. + if (!VectorizableTree.front()->UserTreeIndices.empty()) return; - // Collect the scalar values of the vectorizable expression. We will use this - // context to determine which values can be demoted. If we see a truncation, - // we mark it as seeding another demotion. - for (auto &EntryPtr : VectorizableTree) - Expr.insert(EntryPtr->Scalars.begin(), EntryPtr->Scalars.end()); - - // Ensure the roots of the vectorizable tree don't form a cycle. They must - // have a single external user that is not in the vectorizable tree. 
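The demotion walk above is a conservative recursion: constants are always demotable, and an instruction is demotable only if it belongs to the vectorized tree, has a single use, has not been visited yet, and is a trunc/ext, a bitwise-safe binary operation, a select, or a PHI whose relevant operands are all demotable themselves. A standalone sketch over a toy expression type; Expr, collectToDemote and the opcode strings are illustrative, and the select condition (which the real pass skips) is simply not listed among the operands here.

#include <set>
#include <string>
#include <vector>

struct Expr {
  std::string Op;               // "const", "trunc", "add", "select", "phi"...
  std::vector<Expr *> Operands; // for "select": only true/false values here
  bool InTree = true;           // has a vectorized tree entry
  int NumUses = 1;
};

bool collectToDemote(Expr *V, std::vector<Expr *> &ToDemote,
                     std::set<Expr *> &Visited) {
  if (V->Op == "const")
    return true; // constants can always be demoted
  if (!V->InTree || V->NumUses != 1 || !Visited.insert(V).second)
    return false;
  if (V->Op == "trunc" || V->Op == "zext" || V->Op == "sext") {
    // Casts are fine as-is; in the real pass they also seed further roots.
  } else if (V->Op == "add" || V->Op == "sub" || V->Op == "mul" ||
             V->Op == "and" || V->Op == "or" || V->Op == "xor" ||
             V->Op == "select" || V->Op == "phi") {
    for (Expr *Op : V->Operands)
      if (!collectToDemote(Op, ToDemote, Visited))
        return false;
  } else {
    return false; // conservatively refuse everything else
  }
  ToDemote.push_back(V);
  return true;
}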
- for (auto *Root : TreeRoot) - if (!Root->hasOneUse() || Expr.count(*Root->user_begin())) - return; - // Conservatively determine if we can actually truncate the roots of the // expression. Collect the values that can be demoted in ToDemote and // additional roots that require investigating in Roots. SmallVector<Value *, 32> ToDemote; + DenseMap<Instruction *, SmallVector<unsigned>> DemotedConsts; SmallVector<Value *, 4> Roots; - for (auto *Root : TreeRoot) - if (!collectValuesToDemote(Root, Expr, ToDemote, Roots)) + for (auto *Root : TreeRoot) { + DenseSet<Value *> Visited; + if (!collectValuesToDemote(Root, ToDemote, DemotedConsts, Roots, Visited)) return; + } // The maximum bit width required to represent all the values that can be // demoted without loss of precision. It would be safe to truncate the roots // of the expression to this width. - auto MaxBitWidth = 8u; + auto MaxBitWidth = 1u; // We first check if all the bits of the roots are demanded. If they're not, // we can truncate the roots to this narrower type. @@ -12138,9 +13219,9 @@ void BoUpSLP::computeMinimumValueSizes() { // maximum bit width required to store the scalar by using ValueTracking to // compute the number of high-order bits we can truncate. if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) && - llvm::all_of(TreeRoot, [](Value *R) { - assert(R->hasOneUse() && "Root should have only one use!"); - return isa<GetElementPtrInst>(R->user_back()); + all_of(TreeRoot, [](Value *V) { + return all_of(V->users(), + [](User *U) { return isa<GetElementPtrInst>(U); }); })) { MaxBitWidth = 8u; @@ -12189,12 +13270,39 @@ void BoUpSLP::computeMinimumValueSizes() { // If we can truncate the root, we must collect additional values that might // be demoted as a result. That is, those seeded by truncations we will // modify. - while (!Roots.empty()) - collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots); + while (!Roots.empty()) { + DenseSet<Value *> Visited; + collectValuesToDemote(Roots.pop_back_val(), ToDemote, DemotedConsts, Roots, + Visited); + } // Finally, map the values we can demote to the maximum bit with we computed. - for (auto *Scalar : ToDemote) - MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive); + for (auto *Scalar : ToDemote) { + auto *TE = getTreeEntry(Scalar); + assert(TE && "Expected vectorized scalar."); + if (MinBWs.contains(TE)) + continue; + bool IsSigned = any_of(TE->Scalars, [&](Value *R) { + KnownBits Known = computeKnownBits(R, *DL); + return !Known.isNonNegative(); + }); + MinBWs.try_emplace(TE, MaxBitWidth, IsSigned); + const auto *I = cast<Instruction>(Scalar); + auto DCIt = DemotedConsts.find(I); + if (DCIt != DemotedConsts.end()) { + for (unsigned Idx : DCIt->getSecond()) { + // Check that all instructions operands are demoted. 
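The root-width computation above now starts from a width of 1 and grows it to the maximum number of bits any root actually needs; if that maximum ends up equal to the original type width but the roots are only ever used as GEP indices, an 8-bit payload is assumed. A standalone sketch of just that heuristic, with illustrative types; the real pass additionally rounds the result and validates it with demanded-bits and known-bits analyses.

#include <algorithm>
#include <vector>

struct Root {
  unsigned RequiredBits; // e.g. from demanded-bits / known-bits analysis
  bool OnlyFeedsGEPs;    // every user is a getelementptr index
};

unsigned computeMaxBitWidth(const std::vector<Root> &Roots,
                            unsigned OriginalWidth) {
  unsigned MaxBitWidth = 1;
  for (const Root &R : Roots)
    MaxBitWidth = std::max(MaxBitWidth, R.RequiredBits);
  if (MaxBitWidth == OriginalWidth &&
      std::all_of(Roots.begin(), Roots.end(),
                  [](const Root &R) { return R.OnlyFeedsGEPs; }))
    MaxBitWidth = 8;
  return MaxBitWidth;
}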
+ if (all_of(TE->Scalars, [&](Value *V) { + auto SIt = DemotedConsts.find(cast<Instruction>(V)); + return SIt != DemotedConsts.end() && + is_contained(SIt->getSecond(), Idx); + })) { + const TreeEntry *CTE = getOperandEntry(TE, Idx); + MinBWs.try_emplace(CTE, MaxBitWidth, IsSigned); + } + } + } + } } PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) { @@ -12348,139 +13456,206 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, BoUpSLP::ValueSet VectorizedStores; bool Changed = false; - int E = Stores.size(); - SmallBitVector Tails(E, false); - int MaxIter = MaxStoreLookup.getValue(); - SmallVector<std::pair<int, int>, 16> ConsecutiveChain( - E, std::make_pair(E, INT_MAX)); - SmallVector<SmallBitVector, 4> CheckedPairs(E, SmallBitVector(E, false)); - int IterCnt; - auto &&FindConsecutiveAccess = [this, &Stores, &Tails, &IterCnt, MaxIter, - &CheckedPairs, - &ConsecutiveChain](int K, int Idx) { - if (IterCnt >= MaxIter) - return true; - if (CheckedPairs[Idx].test(K)) - return ConsecutiveChain[K].second == 1 && - ConsecutiveChain[K].first == Idx; - ++IterCnt; - CheckedPairs[Idx].set(K); - CheckedPairs[K].set(Idx); - std::optional<int> Diff = getPointersDiff( - Stores[K]->getValueOperand()->getType(), Stores[K]->getPointerOperand(), - Stores[Idx]->getValueOperand()->getType(), - Stores[Idx]->getPointerOperand(), *DL, *SE, /*StrictCheck=*/true); - if (!Diff || *Diff == 0) - return false; - int Val = *Diff; - if (Val < 0) { - if (ConsecutiveChain[Idx].second > -Val) { - Tails.set(K); - ConsecutiveChain[Idx] = std::make_pair(K, -Val); - } - return false; + // Stores the pair of stores (first_store, last_store) in a range, that were + // already tried to be vectorized. Allows to skip the store ranges that were + // already tried to be vectorized but the attempts were unsuccessful. + DenseSet<std::pair<Value *, Value *>> TriedSequences; + struct StoreDistCompare { + bool operator()(const std::pair<unsigned, int> &Op1, + const std::pair<unsigned, int> &Op2) const { + return Op1.second < Op2.second; } - if (ConsecutiveChain[K].second <= Val) - return false; - - Tails.set(Idx); - ConsecutiveChain[K] = std::make_pair(Idx, Val); - return Val == 1; }; - // Do a quadratic search on all of the given stores in reverse order and find - // all of the pairs of stores that follow each other. - for (int Idx = E - 1; Idx >= 0; --Idx) { - // If a store has multiple consecutive store candidates, search according - // to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ... - // This is because usually pairing with immediate succeeding or preceding - // candidate create the best chance to find slp vectorization opportunity. - const int MaxLookDepth = std::max(E - Idx, Idx + 1); - IterCnt = 0; - for (int Offset = 1, F = MaxLookDepth; Offset < F; ++Offset) - if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) || - (Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx))) - break; - } - - // Tracks if we tried to vectorize stores starting from the given tail - // already. - SmallBitVector TriedTails(E, false); - // For stores that start but don't end a link in the chain: - for (int Cnt = E; Cnt > 0; --Cnt) { - int I = Cnt - 1; - if (ConsecutiveChain[I].first == E || Tails.test(I)) - continue; - // We found a store instr that starts a chain. Now follow the chain and try - // to vectorize it. + // A set of pairs (index of store in Stores array ref, Distance of the store + // address relative to base store address in units). 
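The signedness choice above is per bundle: a demoted tree entry is treated as signed if any of its scalars might be negative, i.e. is not known to be non-negative, since only then a later widening has to sign-extend. A one-function standalone sketch of that test; Scalar and bundleNeedsSignExtend are illustrative names.

#include <algorithm>
#include <vector>

struct Scalar { bool KnownNonNegative; };

bool bundleNeedsSignExtend(const std::vector<Scalar> &Scalars) {
  return std::any_of(Scalars.begin(), Scalars.end(),
                     [](const Scalar &S) { return !S.KnownNonNegative; });
}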
+ using StoreIndexToDistSet = + std::set<std::pair<unsigned, int>, StoreDistCompare>; + auto TryToVectorize = [&](const StoreIndexToDistSet &Set) { + int PrevDist = -1; BoUpSLP::ValueList Operands; // Collect the chain into a list. - while (I != E && !VectorizedStores.count(Stores[I])) { - Operands.push_back(Stores[I]); - Tails.set(I); - if (ConsecutiveChain[I].second != 1) { - // Mark the new end in the chain and go back, if required. It might be - // required if the original stores come in reversed order, for example. - if (ConsecutiveChain[I].first != E && - Tails.test(ConsecutiveChain[I].first) && !TriedTails.test(I) && - !VectorizedStores.count(Stores[ConsecutiveChain[I].first])) { - TriedTails.set(I); - Tails.reset(ConsecutiveChain[I].first); - if (Cnt < ConsecutiveChain[I].first + 2) - Cnt = ConsecutiveChain[I].first + 2; + for (auto [Idx, Data] : enumerate(Set)) { + if (Operands.empty() || Data.second - PrevDist == 1) { + Operands.push_back(Stores[Data.first]); + PrevDist = Data.second; + if (Idx != Set.size() - 1) + continue; + } + if (Operands.size() <= 1) { + Operands.clear(); + Operands.push_back(Stores[Data.first]); + PrevDist = Data.second; + continue; + } + + unsigned MaxVecRegSize = R.getMaxVecRegSize(); + unsigned EltSize = R.getVectorElementSize(Operands[0]); + unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize); + + unsigned MaxVF = + std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts); + auto *Store = cast<StoreInst>(Operands[0]); + Type *StoreTy = Store->getValueOperand()->getType(); + Type *ValueTy = StoreTy; + if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand())) + ValueTy = Trunc->getSrcTy(); + unsigned MinVF = TTI->getStoreMinimumVF( + R.getMinVF(DL->getTypeSizeInBits(ValueTy)), StoreTy, ValueTy); + + if (MaxVF <= MinVF) { + LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF + << ") <= " + << "MinVF (" << MinVF << ")\n"); + } + + // FIXME: Is division-by-2 the correct step? Should we assert that the + // register size is a power-of-2? + unsigned StartIdx = 0; + for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) { + for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) { + ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size); + assert( + all_of( + Slice, + [&](Value *V) { + return cast<StoreInst>(V)->getValueOperand()->getType() == + cast<StoreInst>(Slice.front()) + ->getValueOperand() + ->getType(); + }) && + "Expected all operands of same type."); + if (!VectorizedStores.count(Slice.front()) && + !VectorizedStores.count(Slice.back()) && + TriedSequences.insert(std::make_pair(Slice.front(), Slice.back())) + .second && + vectorizeStoreChain(Slice, R, Cnt, MinVF)) { + // Mark the vectorized stores so that we don't vectorize them again. + VectorizedStores.insert(Slice.begin(), Slice.end()); + Changed = true; + // If we vectorized initial block, no need to try to vectorize it + // again. + if (Cnt == StartIdx) + StartIdx += Size; + Cnt += Size; + continue; + } + ++Cnt; } - break; + // Check if the whole array was vectorized already - exit. + if (StartIdx >= Operands.size()) + break; } - // Move to the next value in the chain. 
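The TryToVectorize lambda above walks the distance-sorted set, cuts it into runs whose distances grow by exactly one (only those form consecutive store chains), and tries each run at decreasing vector factors, halving from MaxVF down to MinVF and skipping past any slice that vectorizes. A standalone sketch of those two steps; splitIntoRuns, tryToVectorizeRun and TrySlice are illustrative names, and MinVF is assumed to be at least 2.

#include <cassert>
#include <set>
#include <utility>
#include <vector>

using IdxDist = std::pair<unsigned, int>; // {index into Stores, distance}
struct DistCompare {
  bool operator()(const IdxDist &A, const IdxDist &B) const {
    return A.second < B.second; // order stores by distance from the base
  }
};
using DistSet = std::set<IdxDist, DistCompare>;

// Split a distance-sorted set into runs whose distances grow by exactly 1.
std::vector<std::vector<unsigned>> splitIntoRuns(const DistSet &Set) {
  std::vector<std::vector<unsigned>> Runs;
  int PrevDist = 0;
  for (const IdxDist &P : Set) {
    if (Runs.empty() || P.second != PrevDist + 1)
      Runs.emplace_back();
    Runs.back().push_back(P.first);
    PrevDist = P.second;
  }
  return Runs;
}

// Try each run at decreasing vector factors and advance past every slice
// that vectorizes successfully.
void tryToVectorizeRun(const std::vector<unsigned> &Run, unsigned MinVF,
                       unsigned MaxVF,
                       bool (*TrySlice)(const unsigned *Begin, unsigned Size)) {
  assert(MinVF >= 2 && "vector factors below 2 make no sense here");
  unsigned StartIdx = 0;
  for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) {
    for (unsigned Cnt = StartIdx; Cnt + Size <= Run.size();) {
      if (TrySlice(&Run[Cnt], Size)) {
        if (Cnt == StartIdx)
          StartIdx += Size; // the leading block is done, never retry it
        Cnt += Size;
        continue;
      }
      ++Cnt;
    }
    if (StartIdx >= Run.size())
      break; // the whole run was vectorized already
  }
}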
- I = ConsecutiveChain[I].first; + Operands.clear(); + Operands.push_back(Stores[Data.first]); + PrevDist = Data.second; } - assert(!Operands.empty() && "Expected non-empty list of stores."); + }; - unsigned MaxVecRegSize = R.getMaxVecRegSize(); - unsigned EltSize = R.getVectorElementSize(Operands[0]); - unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize); - - unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store), - MaxElts); - auto *Store = cast<StoreInst>(Operands[0]); - Type *StoreTy = Store->getValueOperand()->getType(); - Type *ValueTy = StoreTy; - if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand())) - ValueTy = Trunc->getSrcTy(); - unsigned MinVF = TTI->getStoreMinimumVF( - R.getMinVF(DL->getTypeSizeInBits(ValueTy)), StoreTy, ValueTy); - - if (MaxVF <= MinVF) { - LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF << ") <= " - << "MinVF (" << MinVF << ")\n"); - } - - // FIXME: Is division-by-2 the correct step? Should we assert that the - // register size is a power-of-2? - unsigned StartIdx = 0; - for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) { - for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) { - ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size); - if (!VectorizedStores.count(Slice.front()) && - !VectorizedStores.count(Slice.back()) && - vectorizeStoreChain(Slice, R, Cnt, MinVF)) { - // Mark the vectorized stores so that we don't vectorize them again. - VectorizedStores.insert(Slice.begin(), Slice.end()); - Changed = true; - // If we vectorized initial block, no need to try to vectorize it - // again. - if (Cnt == StartIdx) - StartIdx += Size; - Cnt += Size; - continue; - } - ++Cnt; + // Stores pair (first: index of the store into Stores array ref, address of + // which taken as base, second: sorted set of pairs {index, dist}, which are + // indices of stores in the set and their store location distances relative to + // the base address). + + // Need to store the index of the very first store separately, since the set + // may be reordered after the insertion and the first store may be moved. This + // container allows to reduce number of calls of getPointersDiff() function. + SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores; + // Inserts the specified store SI with the given index Idx to the set of the + // stores. If the store with the same distance is found already - stop + // insertion, try to vectorize already found stores. If some stores from this + // sequence were not vectorized - try to vectorize them with the new store + // later. But this logic is applied only to the stores, that come before the + // previous store with the same distance. + // Example: + // 1. store x, %p + // 2. store y, %p+1 + // 3. store z, %p+2 + // 4. store a, %p + // 5. store b, %p+3 + // - Scan this from the last to first store. The very first bunch of stores is + // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores + // vector). + // - The next store in the list - #1 - has the same distance from store #5 as + // the store #4. + // - Try to vectorize sequence of stores 4,2,3,5. + // - If all these stores are vectorized - just drop them. + // - If some of them are not vectorized (say, #3 and #5), do extra analysis. + // - Start new stores sequence. + // The new bunch of stores is {1, {1, 0}}. + // - Add the stores from previous sequence, that were not vectorized. 
+ // Here we consider the stores in the reversed order, rather they are used in + // the IR (Stores are reversed already, see vectorizeStoreChains() function). + // Store #3 can be added -> comes after store #4 with the same distance as + // store #1. + // Store #5 cannot be added - comes before store #4. + // This logic allows to improve the compile time, we assume that the stores + // after previous store with the same distance most likely have memory + // dependencies and no need to waste compile time to try to vectorize them. + // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}. + auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) { + for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) { + std::optional<int> Diff = getPointersDiff( + Stores[Set.first]->getValueOperand()->getType(), + Stores[Set.first]->getPointerOperand(), + SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE, + /*StrictCheck=*/true); + if (!Diff) + continue; + auto It = Set.second.find(std::make_pair(Idx, *Diff)); + if (It == Set.second.end()) { + Set.second.emplace(Idx, *Diff); + return; } - // Check if the whole array was vectorized already - exit. - if (StartIdx >= Operands.size()) - break; + // Try to vectorize the first found set to avoid duplicate analysis. + TryToVectorize(Set.second); + StoreIndexToDistSet PrevSet; + PrevSet.swap(Set.second); + Set.first = Idx; + Set.second.emplace(Idx, 0); + // Insert stores that followed previous match to try to vectorize them + // with this store. + unsigned StartIdx = It->first + 1; + SmallBitVector UsedStores(Idx - StartIdx); + // Distances to previously found dup store (or this store, since they + // store to the same addresses). + SmallVector<int> Dists(Idx - StartIdx, 0); + for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) { + // Do not try to vectorize sequences, we already tried. + if (Pair.first <= It->first || + VectorizedStores.contains(Stores[Pair.first])) + break; + unsigned BI = Pair.first - StartIdx; + UsedStores.set(BI); + Dists[BI] = Pair.second - It->second; + } + for (unsigned I = StartIdx; I < Idx; ++I) { + unsigned BI = I - StartIdx; + if (UsedStores.test(BI)) + Set.second.emplace(I, Dists[BI]); + } + return; } + auto &Res = SortedStores.emplace_back(); + Res.first = Idx; + Res.second.emplace(Idx, 0); + }; + StoreInst *PrevStore = Stores.front(); + for (auto [I, SI] : enumerate(Stores)) { + // Check that we do not try to vectorize stores of different types. + if (PrevStore->getValueOperand()->getType() != + SI->getValueOperand()->getType()) { + for (auto &Set : SortedStores) + TryToVectorize(Set.second); + SortedStores.clear(); + PrevStore = SI; + } + FillStoresSet(I, SI); } + // Final vectorization attempt. + for (auto &Set : SortedStores) + TryToVectorize(Set.second); + return Changed; } @@ -12507,8 +13682,10 @@ void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) { // constant index, or a pointer operand that doesn't point to a scalar // type. else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { - auto Idx = GEP->idx_begin()->get(); - if (GEP->getNumIndices() > 1 || isa<Constant>(Idx)) + if (GEP->getNumIndices() != 1) + continue; + Value *Idx = GEP->idx_begin()->get(); + if (isa<Constant>(Idx)) continue; if (!isValidElementType(Idx->getType())) continue; @@ -12542,8 +13719,8 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, // NOTE: the following will give user internal llvm type name, which may // not be useful. 
R.getORE()->emit([&]() { - std::string type_str; - llvm::raw_string_ostream rso(type_str); + std::string TypeStr; + llvm::raw_string_ostream rso(TypeStr); Ty->print(rso); return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0) << "Cannot SLP vectorize list: type " @@ -12878,10 +14055,12 @@ class HorizontalReduction { static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS, Value *RHS, const Twine &Name, const ReductionOpsListType &ReductionOps) { - bool UseSelect = ReductionOps.size() == 2 || - // Logical or/and. - (ReductionOps.size() == 1 && - isa<SelectInst>(ReductionOps.front().front())); + bool UseSelect = + ReductionOps.size() == 2 || + // Logical or/and. + (ReductionOps.size() == 1 && any_of(ReductionOps.front(), [](Value *V) { + return isa<SelectInst>(V); + })); assert((!UseSelect || ReductionOps.size() != 2 || isa<SelectInst>(ReductionOps[1][0])) && "Expected cmp + select pairs for reduction"); @@ -13315,12 +14494,26 @@ public: // Update the final value in the reduction. Builder.SetCurrentDebugLocation( cast<Instruction>(ReductionOps.front().front())->getDebugLoc()); + if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) || + (isGuaranteedNotToBePoison(Res) && + !isGuaranteedNotToBePoison(VectorizedTree))) { + auto It = ReducedValsToOps.find(Res); + if (It != ReducedValsToOps.end() && + any_of(It->getSecond(), + [](Instruction *I) { return isBoolLogicOp(I); })) + std::swap(VectorizedTree, Res); + } + return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx", ReductionOps); } // Initialize the final value in the reduction. return Res; }; + bool AnyBoolLogicOp = + any_of(ReductionOps.back(), [](Value *V) { + return isBoolLogicOp(cast<Instruction>(V)); + }); // The reduction root is used as the insertion point for new instructions, // so set it as externally used to prevent it from being deleted. ExternallyUsedValues[ReductionRoot]; @@ -13364,10 +14557,12 @@ public: // Check if the reduction value was not overriden by the extractelement // instruction because of the vectorization and exclude it, if it is not // compatible with other values. - if (auto *Inst = dyn_cast<Instruction>(RdxVal)) - if (isVectorLikeInstWithConstOps(Inst) && - (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) - continue; + // Also check if the instruction was folded to constant/other value. + auto *Inst = dyn_cast<Instruction>(RdxVal); + if ((Inst && isVectorLikeInstWithConstOps(Inst) && + (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) || + (S.getOpcode() && !Inst)) + continue; Candidates.push_back(RdxVal); TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]); } @@ -13543,11 +14738,9 @@ public: for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) { if (Cnt == I || (ShuffledExtracts && Cnt == I - 1)) continue; - for_each(ReducedVals[Cnt], - [&LocalExternallyUsedValues, &TrackedVals](Value *V) { - if (isa<Instruction>(V)) - LocalExternallyUsedValues[TrackedVals[V]]; - }); + for (Value *V : ReducedVals[Cnt]) + if (isa<Instruction>(V)) + LocalExternallyUsedValues[TrackedVals[V]]; } if (!IsSupportedHorRdxIdentityOp) { // Number of uses of the candidates in the vector of values. @@ -13591,7 +14784,7 @@ public: // Update LocalExternallyUsedValues for the scalar, replaced by // extractelement instructions. 
for (const std::pair<Value *, Value *> &Pair : ReplacedExternals) { - auto It = ExternallyUsedValues.find(Pair.first); + auto *It = ExternallyUsedValues.find(Pair.first); if (It == ExternallyUsedValues.end()) continue; LocalExternallyUsedValues[Pair.second].append(It->second); @@ -13605,7 +14798,8 @@ public: InstructionCost ReductionCost = getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF); InstructionCost Cost = TreeCost + ReductionCost; - LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for reduction\n"); + LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost + << " for reduction\n"); if (!Cost.isValid()) return nullptr; if (Cost >= -SLPCostThreshold) { @@ -13652,7 +14846,9 @@ public: // To prevent poison from leaking across what used to be sequential, // safe, scalar boolean logic operations, the reduction operand must be // frozen. - if (isBoolLogicOp(RdxRootInst)) + if ((isBoolLogicOp(RdxRootInst) || + (AnyBoolLogicOp && VL.size() != TrackedVals.size())) && + !isGuaranteedNotToBePoison(VectorizedRoot)) VectorizedRoot = Builder.CreateFreeze(VectorizedRoot); // Emit code to correctly handle reused reduced values, if required. @@ -13664,6 +14860,16 @@ public: Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI); + if (ReducedSubTree->getType() != VL.front()->getType()) { + ReducedSubTree = Builder.CreateIntCast( + ReducedSubTree, VL.front()->getType(), any_of(VL, [&](Value *R) { + KnownBits Known = computeKnownBits( + R, cast<Instruction>(ReductionOps.front().front()) + ->getModule() + ->getDataLayout()); + return !Known.isNonNegative(); + })); + } // Improved analysis for add/fadd/xor reductions with same scale factor // for all operands of reductions. We can emit scalar ops for them @@ -13716,31 +14922,33 @@ public: // RedOp2 = select i1 ?, i1 RHS, i1 false // Then, we must freeze LHS in the new op. - auto &&FixBoolLogicalOps = - [&Builder, VectorizedTree](Value *&LHS, Value *&RHS, - Instruction *RedOp1, Instruction *RedOp2) { - if (!isBoolLogicOp(RedOp1)) - return; - if (LHS == VectorizedTree || getRdxOperand(RedOp1, 0) == LHS || - isGuaranteedNotToBePoison(LHS)) - return; - if (!isBoolLogicOp(RedOp2)) - return; - if (RHS == VectorizedTree || getRdxOperand(RedOp2, 0) == RHS || - isGuaranteedNotToBePoison(RHS)) { - std::swap(LHS, RHS); - return; - } - LHS = Builder.CreateFreeze(LHS); - }; + auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS, + Instruction *RedOp1, + Instruction *RedOp2, + bool InitStep) { + if (!AnyBoolLogicOp) + return; + if (isBoolLogicOp(RedOp1) && + ((!InitStep && LHS == VectorizedTree) || + getRdxOperand(RedOp1, 0) == LHS || isGuaranteedNotToBePoison(LHS))) + return; + if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) || + getRdxOperand(RedOp2, 0) == RHS || + isGuaranteedNotToBePoison(RHS))) { + std::swap(LHS, RHS); + return; + } + if (LHS != VectorizedTree) + LHS = Builder.CreateFreeze(LHS); + }; // Finish the reduction. // Need to add extra arguments and not vectorized possible reduction // values. // Try to avoid dependencies between the scalar remainders after // reductions. 
- auto &&FinalGen = - [this, &Builder, &TrackedVals, &FixBoolLogicalOps]( - ArrayRef<std::pair<Instruction *, Value *>> InstVals) { + auto FinalGen = + [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals, + bool InitStep) { unsigned Sz = InstVals.size(); SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 + Sz % 2); @@ -13761,7 +14969,7 @@ public: // sequential, safe, scalar boolean logic operations, the // reduction operand must be frozen. FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first, - RedOp); + RedOp, InitStep); Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1, StableRdxVal2, "op.rdx", ReductionOps); ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed); @@ -13791,11 +14999,13 @@ public: ExtraReductions.emplace_back(I, Pair.first); } // Iterate through all not-vectorized reduction values/extra arguments. + bool InitStep = true; while (ExtraReductions.size() > 1) { VectorizedTree = ExtraReductions.front().second; SmallVector<std::pair<Instruction *, Value *>> NewReds = - FinalGen(ExtraReductions); + FinalGen(ExtraReductions, InitStep); ExtraReductions.swap(NewReds); + InitStep = false; } VectorizedTree = ExtraReductions.front().second; @@ -13842,8 +15052,7 @@ private: bool IsCmpSelMinMax, unsigned ReduxWidth, FastMathFlags FMF) { TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - Value *FirstReducedVal = ReducedVals.front(); - Type *ScalarTy = FirstReducedVal->getType(); + Type *ScalarTy = ReducedVals.front()->getType(); FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth); InstructionCost VectorCost = 0, ScalarCost; // If all of the reduced values are constant, the vector cost is 0, since @@ -13917,7 +15126,7 @@ private: } LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost - << " for reduction that starts with " << *FirstReducedVal + << " for reduction of " << shortBundleName(ReducedVals) << " (It is a splitting reduction)\n"); return VectorCost - ScalarCost; } @@ -13932,7 +15141,7 @@ private: "A call to the llvm.fmuladd intrinsic is not handled yet"); ++NumVectorInstructions; - return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind); + return createSimpleTargetReduction(Builder, VectorizedValue, RdxKind); } /// Emits optimized code for unique scalar value reused \p Cnt times. 
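Note on the FinalGen change above: the leftover (not-vectorized) reduction values are now combined pairwise, halving the work list on every round, rather than being folded strictly left to right, and the InitStep flag lets FixBoolLogicalOps freeze operands only where poison could actually leak. The following is a minimal standalone sketch of that pairwise-combining shape only, using plain integers and an arbitrary binary operation in place of IR values and createOp; all names here are illustrative, not LLVM API.

#include <cstddef>
#include <functional>
#include <vector>

// Combine a list of scalar leftovers pairwise until one value remains.
// Mirrors the shape of the FinalGen/ExtraReductions loop in the diff:
// each round produces one "op.rdx"-style result per pair and carries an
// odd trailing element over to the next round.
int combinePairwise(std::vector<int> Vals,
                    const std::function<int(int, int)> &Op) {
  while (Vals.size() > 1) {
    std::vector<int> Next;
    Next.reserve(Vals.size() / 2 + Vals.size() % 2);
    for (std::size_t I = 0; I + 1 < Vals.size(); I += 2)
      Next.push_back(Op(Vals[I], Vals[I + 1])); // one combined value per pair
    if (Vals.size() % 2)
      Next.push_back(Vals.back());              // odd element carried over
    Vals = std::move(Next);
  }
  return Vals.front();
}

Under this scheme a list of N leftovers is reduced in about log2(N) rounds, which is why the surrounding code threads an InitStep flag through each round instead of special-casing only the first element of a linear chain.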
@@ -13979,8 +15188,8 @@ private: case RecurKind::Mul: case RecurKind::FMul: case RecurKind::FMulAdd: - case RecurKind::SelectICmp: - case RecurKind::SelectFCmp: + case RecurKind::IAnyOf: + case RecurKind::FAnyOf: case RecurKind::None: llvm_unreachable("Unexpected reduction kind for repeated scalar."); } @@ -14068,8 +15277,8 @@ private: case RecurKind::Mul: case RecurKind::FMul: case RecurKind::FMulAdd: - case RecurKind::SelectICmp: - case RecurKind::SelectFCmp: + case RecurKind::IAnyOf: + case RecurKind::FAnyOf: case RecurKind::None: llvm_unreachable("Unexpected reduction kind for reused scalars."); } @@ -14164,8 +15373,8 @@ static bool findBuildAggregate(Instruction *LastInsertInst, InsertElts.resize(*AggregateSize); findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0); - llvm::erase_value(BuildVectorOpds, nullptr); - llvm::erase_value(InsertElts, nullptr); + llvm::erase(BuildVectorOpds, nullptr); + llvm::erase(InsertElts, nullptr); if (BuildVectorOpds.size() >= 2) return true; @@ -14401,8 +15610,7 @@ bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts, bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, BasicBlock *BB, BoUpSLP &R) { - const DataLayout &DL = BB->getModule()->getDataLayout(); - if (!R.canMapToVector(IVI->getType(), DL)) + if (!R.canMapToVector(IVI->getType())) return false; SmallVector<Value *, 16> BuildVectorOpds; @@ -14541,11 +15749,11 @@ static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, if (BasePred1 > BasePred2) return false; // Compare operands. - bool LEPreds = Pred1 <= Pred2; - bool GEPreds = Pred1 >= Pred2; + bool CI1Preds = Pred1 == BasePred1; + bool CI2Preds = Pred2 == BasePred1; for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) { - auto *Op1 = CI1->getOperand(LEPreds ? I : E - I - 1); - auto *Op2 = CI2->getOperand(GEPreds ? I : E - I - 1); + auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1); + auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1); if (Op1->getValueID() < Op2->getValueID()) return !IsCompatibility; if (Op1->getValueID() > Op2->getValueID()) @@ -14691,14 +15899,20 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { return true; if (Opcodes1.size() > Opcodes2.size()) return false; - std::optional<bool> ConstOrder; for (int I = 0, E = Opcodes1.size(); I < E; ++I) { // Undefs are compatible with any other value. 
if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I])) { - if (!ConstOrder) - ConstOrder = - !isa<UndefValue>(Opcodes1[I]) && isa<UndefValue>(Opcodes2[I]); - continue; + if (isa<Instruction>(Opcodes1[I])) + return true; + if (isa<Instruction>(Opcodes2[I])) + return false; + if (isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I])) + return true; + if (isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I])) + return false; + if (isa<UndefValue>(Opcodes1[I]) && isa<UndefValue>(Opcodes2[I])) + continue; + return isa<UndefValue>(Opcodes2[I]); } if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I])) if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) { @@ -14714,21 +15928,26 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { if (NodeI1 != NodeI2) return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn(); InstructionsState S = getSameOpcode({I1, I2}, *TLI); - if (S.getOpcode()) + if (S.getOpcode() && !S.isAltShuffle()) continue; return I1->getOpcode() < I2->getOpcode(); } - if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I])) { - if (!ConstOrder) - ConstOrder = Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID(); - continue; - } + if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I])) + return Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID(); + if (isa<Instruction>(Opcodes1[I])) + return true; + if (isa<Instruction>(Opcodes2[I])) + return false; + if (isa<Constant>(Opcodes1[I])) + return true; + if (isa<Constant>(Opcodes2[I])) + return false; if (Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID()) return true; if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID()) return false; } - return ConstOrder && *ConstOrder; + return false; }; auto AreCompatiblePHIs = [&PHIToOpcodes, this](Value *V1, Value *V2) { if (V1 == V2) @@ -14776,6 +15995,9 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { Incoming.push_back(P); } + if (Incoming.size() <= 1) + break; + // Find the corresponding non-phi nodes for better matching when trying to // build the tree. for (Value *V : Incoming) { @@ -14838,41 +16060,41 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { return I->use_empty() && (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I)); }; - for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) { // Skip instructions with scalable type. The num of elements is unknown at // compile-time for scalable type. - if (isa<ScalableVectorType>(it->getType())) + if (isa<ScalableVectorType>(It->getType())) continue; // Skip instructions marked for the deletion. - if (R.isDeleted(&*it)) + if (R.isDeleted(&*It)) continue; // We may go through BB multiple times so skip the one we have checked. - if (!VisitedInstrs.insert(&*it).second) { - if (HasNoUsers(&*it) && - VectorizeInsertsAndCmps(/*VectorizeCmps=*/it->isTerminator())) { + if (!VisitedInstrs.insert(&*It).second) { + if (HasNoUsers(&*It) && + VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) { // We would like to start over since some instructions are deleted // and the iterator may become invalid value. Changed = true; - it = BB->begin(); - e = BB->end(); + It = BB->begin(); + E = BB->end(); } continue; } - if (isa<DbgInfoIntrinsic>(it)) + if (isa<DbgInfoIntrinsic>(It)) continue; // Try to vectorize reductions that use PHINodes. 
- if (PHINode *P = dyn_cast<PHINode>(it)) { + if (PHINode *P = dyn_cast<PHINode>(It)) { // Check that the PHI is a reduction PHI. if (P->getNumIncomingValues() == 2) { // Try to match and vectorize a horizontal reduction. Instruction *Root = getReductionInstr(DT, P, BB, LI); if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) { Changed = true; - it = BB->begin(); - e = BB->end(); + It = BB->begin(); + E = BB->end(); continue; } } @@ -14897,23 +16119,23 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { continue; } - if (HasNoUsers(&*it)) { + if (HasNoUsers(&*It)) { bool OpsChanged = false; - auto *SI = dyn_cast<StoreInst>(it); + auto *SI = dyn_cast<StoreInst>(It); bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI; if (SI) { - auto I = Stores.find(getUnderlyingObject(SI->getPointerOperand())); + auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand())); // Try to vectorize chain in store, if this is the only store to the // address in the block. // TODO: This is just a temporarily solution to save compile time. Need // to investigate if we can safely turn on slp-vectorize-hor-store // instead to allow lookup for reduction chains in all non-vectorized // stores (need to check side effects and compile time). - TryToVectorizeRoot = (I == Stores.end() || I->second.size() == 1) && - SI->getValueOperand()->hasOneUse(); + TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) && + SI->getValueOperand()->hasOneUse(); } if (TryToVectorizeRoot) { - for (auto *V : it->operand_values()) { + for (auto *V : It->operand_values()) { // Postponed instructions should not be vectorized here, delay their // vectorization. if (auto *VI = dyn_cast<Instruction>(V); @@ -14926,21 +16148,21 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // top-tree instructions to try to vectorize as many instructions as // possible. OpsChanged |= - VectorizeInsertsAndCmps(/*VectorizeCmps=*/it->isTerminator()); + VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator()); if (OpsChanged) { // We would like to start over since some instructions are deleted // and the iterator may become invalid value. Changed = true; - it = BB->begin(); - e = BB->end(); + It = BB->begin(); + E = BB->end(); continue; } } - if (isa<InsertElementInst, InsertValueInst>(it)) - PostProcessInserts.insert(&*it); - else if (isa<CmpInst>(it)) - PostProcessCmps.insert(cast<CmpInst>(&*it)); + if (isa<InsertElementInst, InsertValueInst>(It)) + PostProcessInserts.insert(&*It); + else if (isa<CmpInst>(It)) + PostProcessCmps.insert(cast<CmpInst>(&*It)); } return Changed; @@ -15044,6 +16266,12 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { // compatible (have the same opcode, same parent), otherwise it is // definitely not profitable to try to vectorize them. 
auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) { + if (V->getValueOperand()->getType()->getTypeID() < + V2->getValueOperand()->getType()->getTypeID()) + return true; + if (V->getValueOperand()->getType()->getTypeID() > + V2->getValueOperand()->getType()->getTypeID()) + return false; if (V->getPointerOperandType()->getTypeID() < V2->getPointerOperandType()->getTypeID()) return true; @@ -15082,6 +16310,8 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) { if (V1 == V2) return true; + if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType()) + return false; if (V1->getPointerOperandType() != V2->getPointerOperandType()) return false; // Undefs are compatible with any other value. @@ -15113,8 +16343,13 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { if (!isValidElementType(Pair.second.front()->getValueOperand()->getType())) continue; + // Reverse stores to do bottom-to-top analysis. This is important if the + // values are stores to the same addresses several times, in this case need + // to follow the stores order (reversed to meet the memory dependecies). + SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(), + Pair.second.rend()); Changed |= tryToVectorizeSequence<StoreInst>( - Pair.second, StoreSorter, AreCompatibleStores, + ReversedStores, StoreSorter, AreCompatibleStores, [this, &R](ArrayRef<StoreInst *> Candidates, bool) { return vectorizeStores(Candidates, R); }, |
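The store-chain rework earlier in this diff (TryToVectorize / FillStoresSet, together with the reversed store order added here) replaces the old quadratic consecutive-store search with per-base-pointer sets ordered by pointer distance, from which runs of stores whose distances increase by exactly one are harvested and handed to vectorizeStoreChain at decreasing vector factors. The following is a self-contained sketch of that grouping idea only, with plain integers standing in for stores; the names are illustrative and nothing here is the LLVM API itself.

#include <set>
#include <utility>
#include <vector>

using IndexDist = std::pair<unsigned, int>; // (index into Stores, distance from base)

struct DistCompare {
  bool operator()(const IndexDist &A, const IndexDist &B) const {
    return A.second < B.second; // order purely by distance from the base store
  }
};

// Walk the distance-ordered set and collect maximal runs whose distances
// form a consecutive sequence; each run of two or more stores is a
// candidate chain for vectorization.
std::vector<std::vector<unsigned>>
collectConsecutiveRuns(const std::set<IndexDist, DistCompare> &Sorted) {
  std::vector<std::vector<unsigned>> Runs;
  std::vector<unsigned> Cur;
  int PrevDist = 0;
  for (const IndexDist &Entry : Sorted) {
    if (Cur.empty() || Entry.second - PrevDist == 1) {
      Cur.push_back(Entry.first); // still consecutive, extend the run
    } else {
      if (Cur.size() > 1)
        Runs.push_back(Cur);
      Cur = {Entry.first};        // gap in distances, start a new run
    }
    PrevDist = Entry.second;
  }
  if (Cur.size() > 1)
    Runs.push_back(Cur);
  return Runs;
}

In the actual patch the set's comparator looks only at the distance, so inserting a second store with an already-seen distance fails, and that collision is what triggers an immediate vectorization attempt on the set gathered so far; the sketch above covers only the run-extraction step that follows.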