path: root/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
author    Dimitry Andric <dim@FreeBSD.org>    2023-12-18 20:30:12 +0000
committer Dimitry Andric <dim@FreeBSD.org>    2024-04-06 20:11:55 +0000
commit    5f757f3ff9144b609b3c433dfd370cc6bdc191ad (patch)
tree      1b4e980b866cd26a00af34c0a653eb640bd09caf /contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
parent    3e1c8a35f741a5d114d0ba670b15191355711fe9 (diff)
parent    312c0ed19cc5276a17bacf2120097bec4515b0f1 (diff)
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp  4231
1 file changed, 2733 insertions, 1498 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 9870ffbb586c..9d799124074c 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -19,7 +19,6 @@
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/PriorityQueue.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetOperations.h"
@@ -34,6 +33,7 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/IVDescriptors.h"
@@ -97,7 +97,6 @@
#include <string>
#include <tuple>
#include <utility>
-#include <vector>
using namespace llvm;
using namespace llvm::PatternMatch;
@@ -108,8 +107,9 @@ using namespace slpvectorizer;
STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
-cl::opt<bool> RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
- cl::desc("Run the SLP vectorization passes"));
+static cl::opt<bool>
+ RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
+ cl::desc("Run the SLP vectorization passes"));
static cl::opt<int>
SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
@@ -140,10 +140,6 @@ static cl::opt<unsigned>
MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
-static cl::opt<int>
-MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden,
- cl::desc("Maximum depth of the lookup for consecutive stores."));
-
/// Limits the size of scheduling regions in a block.
/// It avoid long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
@@ -232,6 +228,17 @@ static bool isVectorLikeInstWithConstOps(Value *V) {
return isConstant(I->getOperand(2));
}
+#if !defined(NDEBUG)
+/// Print a short descriptor of the instruction bundle suitable for debug output.
+static std::string shortBundleName(ArrayRef<Value *> VL) {
+ std::string Result;
+ raw_string_ostream OS(Result);
+ OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
+ OS.flush();
+ return Result;
+}
+#endif
+
/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
@@ -384,8 +391,10 @@ static SmallBitVector isUndefVector(const Value *V,
if (isa<T>(II->getOperand(1)))
continue;
std::optional<unsigned> Idx = getInsertIndex(II);
- if (!Idx)
- continue;
+ if (!Idx) {
+ Res.reset();
+ return Res;
+ }
if (*Idx < UseMask.size() && !UseMask.test(*Idx))
Res.reset(*Idx);
}
@@ -429,26 +438,6 @@ static SmallBitVector isUndefVector(const Value *V,
/// i32 6>
/// %2 = mul <4 x i8> %1, %1
/// ret <4 x i8> %2
-/// We convert this initially to something like:
-/// %x0 = extractelement <4 x i8> %x, i32 0
-/// %x3 = extractelement <4 x i8> %x, i32 3
-/// %y1 = extractelement <4 x i8> %y, i32 1
-/// %y2 = extractelement <4 x i8> %y, i32 2
-/// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0
-/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
-/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
-/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
-/// %5 = mul <4 x i8> %4, %4
-/// %6 = extractelement <4 x i8> %5, i32 0
-/// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0
-/// %7 = extractelement <4 x i8> %5, i32 1
-/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
-/// %8 = extractelement <4 x i8> %5, i32 2
-/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2
-/// %9 = extractelement <4 x i8> %5, i32 3
-/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
-/// ret <4 x i8> %ins4
-/// InstCombiner transforms this into a shuffle and vector mul
/// Mask will return the Shuffle Mask equivalent to the extracted elements.
/// TODO: Can we split off and reuse the shuffle mask detection from
/// ShuffleVectorInst/getShuffleCost?
@@ -539,117 +528,6 @@ static std::optional<unsigned> getExtractIndex(Instruction *E) {
return *EI->idx_begin();
}
-/// Tries to find extractelement instructions with constant indices from fixed
-/// vector type and gather such instructions into a bunch, which highly likely
-/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
-/// successful, the matched scalars are replaced by poison values in \p VL for
-/// future analysis.
-static std::optional<TTI::ShuffleKind>
-tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
- SmallVectorImpl<int> &Mask) {
- // Scan list of gathered scalars for extractelements that can be represented
- // as shuffles.
- MapVector<Value *, SmallVector<int>> VectorOpToIdx;
- SmallVector<int> UndefVectorExtracts;
- for (int I = 0, E = VL.size(); I < E; ++I) {
- auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
- if (!EI) {
- if (isa<UndefValue>(VL[I]))
- UndefVectorExtracts.push_back(I);
- continue;
- }
- auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
- if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
- continue;
- std::optional<unsigned> Idx = getExtractIndex(EI);
- // Undefined index.
- if (!Idx) {
- UndefVectorExtracts.push_back(I);
- continue;
- }
- SmallBitVector ExtractMask(VecTy->getNumElements(), true);
- ExtractMask.reset(*Idx);
- if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
- UndefVectorExtracts.push_back(I);
- continue;
- }
- VectorOpToIdx[EI->getVectorOperand()].push_back(I);
- }
- // Sort the vector operands by the maximum number of uses in extractelements.
- MapVector<unsigned, SmallVector<Value *>> VFToVector;
- for (const auto &Data : VectorOpToIdx)
- VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()]
- .push_back(Data.first);
- for (auto &Data : VFToVector) {
- stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) {
- return VectorOpToIdx.find(V1)->second.size() >
- VectorOpToIdx.find(V2)->second.size();
- });
- }
- // Find the best pair of the vectors with the same number of elements or a
- // single vector.
- const int UndefSz = UndefVectorExtracts.size();
- unsigned SingleMax = 0;
- Value *SingleVec = nullptr;
- unsigned PairMax = 0;
- std::pair<Value *, Value *> PairVec(nullptr, nullptr);
- for (auto &Data : VFToVector) {
- Value *V1 = Data.second.front();
- if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) {
- SingleMax = VectorOpToIdx[V1].size() + UndefSz;
- SingleVec = V1;
- }
- Value *V2 = nullptr;
- if (Data.second.size() > 1)
- V2 = *std::next(Data.second.begin());
- if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() +
- UndefSz) {
- PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz;
- PairVec = std::make_pair(V1, V2);
- }
- }
- if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
- return std::nullopt;
- // Check if better to perform a shuffle of 2 vectors or just of a single
- // vector.
- SmallVector<Value *> SavedVL(VL.begin(), VL.end());
- SmallVector<Value *> GatheredExtracts(
- VL.size(), PoisonValue::get(VL.front()->getType()));
- if (SingleMax >= PairMax && SingleMax) {
- for (int Idx : VectorOpToIdx[SingleVec])
- std::swap(GatheredExtracts[Idx], VL[Idx]);
- } else {
- for (Value *V : {PairVec.first, PairVec.second})
- for (int Idx : VectorOpToIdx[V])
- std::swap(GatheredExtracts[Idx], VL[Idx]);
- }
- // Add extracts from undefs too.
- for (int Idx : UndefVectorExtracts)
- std::swap(GatheredExtracts[Idx], VL[Idx]);
- // Check that gather of extractelements can be represented as just a
- // shuffle of a single/two vectors the scalars are extracted from.
- std::optional<TTI::ShuffleKind> Res =
- isFixedVectorShuffle(GatheredExtracts, Mask);
- if (!Res) {
- // TODO: try to check other subsets if possible.
- // Restore the original VL if attempt was not successful.
- VL.swap(SavedVL);
- return std::nullopt;
- }
- // Restore unused scalars from mask, if some of the extractelements were not
- // selected for shuffle.
- for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
- auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
- if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
- !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
- is_contained(UndefVectorExtracts, I))
- continue;
- if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]))
- std::swap(VL[I], GatheredExtracts[I]);
- }
- return Res;
-}
-
namespace {
/// Main data required for vectorization of instructions.
@@ -695,7 +573,7 @@ static Value *isOneOf(const InstructionsState &S, Value *Op) {
return S.OpValue;
}
-/// \returns true if \p Opcode is allowed as part of of the main/alternate
+/// \returns true if \p Opcode is allowed as part of the main/alternate
/// instruction for SLP vectorization.
///
/// Example of unsupported opcode is SDIV that can potentially cause UB if the
@@ -889,18 +767,14 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
/// \returns true if all of the values in \p VL have the same type or false
/// otherwise.
static bool allSameType(ArrayRef<Value *> VL) {
- Type *Ty = VL[0]->getType();
- for (int i = 1, e = VL.size(); i < e; i++)
- if (VL[i]->getType() != Ty)
- return false;
-
- return true;
+ Type *Ty = VL.front()->getType();
+ return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
}
/// \returns True if in-tree use also needs extract. This refers to
/// possible scalar operand in vectorized instruction.
-static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
- TargetLibraryInfo *TLI) {
+static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
+ TargetLibraryInfo *TLI) {
unsigned Opcode = UserInst->getOpcode();
switch (Opcode) {
case Instruction::Load: {
@@ -914,11 +788,10 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
case Instruction::Call: {
CallInst *CI = cast<CallInst>(UserInst);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) {
- if (isVectorIntrinsicWithScalarOpAtArg(ID, i))
- return (CI->getArgOperand(i) == Scalar);
- }
- [[fallthrough]];
+ return any_of(enumerate(CI->args()), [&](auto &&Arg) {
+ return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
+ Arg.value().get() == Scalar;
+ });
}
default:
return false;
@@ -1181,6 +1054,7 @@ public:
void deleteTree() {
VectorizableTree.clear();
ScalarToTreeEntry.clear();
+ MultiNodeScalars.clear();
MustGather.clear();
EntryToLastInstruction.clear();
ExternalUses.clear();
@@ -1273,7 +1147,7 @@ public:
/// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
///
/// \returns number of elements in vector if isomorphism exists, 0 otherwise.
- unsigned canMapToVector(Type *T, const DataLayout &DL) const;
+ unsigned canMapToVector(Type *T) const;
/// \returns True if the VectorizableTree is both tiny and not fully
/// vectorizable. We do not vectorize such trees.
@@ -1324,6 +1198,9 @@ public:
}
LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
#endif
+ bool operator == (const EdgeInfo &Other) const {
+ return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
+ }
};
/// A helper class used for scoring candidates for two consecutive lanes.
@@ -1764,7 +1641,7 @@ public:
auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
return 0;
- return R.areAllUsersVectorized(IdxLaneI, std::nullopt)
+ return R.areAllUsersVectorized(IdxLaneI)
? LookAheadHeuristics::ScoreAllUserVectorized
: 0;
}
@@ -1941,7 +1818,7 @@ public:
HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
} else if (NumFreeOpsHash.NumOfAPOs == Min &&
NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
- auto It = HashMap.find(NumFreeOpsHash.Hash);
+ auto *It = HashMap.find(NumFreeOpsHash.Hash);
if (It == HashMap.end())
HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
else
@@ -2203,7 +2080,7 @@ public:
for (int Pass = 0; Pass != 2; ++Pass) {
// Check if no need to reorder operands since they're are perfect or
// shuffled diamond match.
- // Need to to do it to avoid extra external use cost counting for
+ // Need to do it to avoid extra external use cost counting for
// shuffled matches, which may cause regressions.
if (SkipReordering())
break;
@@ -2388,6 +2265,18 @@ public:
~BoUpSLP();
private:
+ /// Determine if a vectorized value \p V in can be demoted to
+ /// a smaller type with a truncation. We collect the values that will be
+ /// demoted in ToDemote and additional roots that require investigating in
+ /// Roots.
+ /// \param DemotedConsts list of Instruction/OperandIndex pairs that are
+ /// constant and to be demoted. Required to correctly identify constant nodes
+ /// to be demoted.
+ bool collectValuesToDemote(
+ Value *V, SmallVectorImpl<Value *> &ToDemote,
+ DenseMap<Instruction *, SmallVector<unsigned>> &DemotedConsts,
+ SmallVectorImpl<Value *> &Roots, DenseSet<Value *> &Visited) const;
+
/// Check if the operands on the edges \p Edges of the \p UserTE allows
/// reordering (i.e. the operands can be reordered because they have only one
/// user and reordarable).
@@ -2410,12 +2299,25 @@ private:
TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
TreeEntry *TE = nullptr;
- const auto *It = find_if(VL, [this, &TE](Value *V) {
+ const auto *It = find_if(VL, [&](Value *V) {
TE = getTreeEntry(V);
- return TE;
+ if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
+ return true;
+ auto It = MultiNodeScalars.find(V);
+ if (It != MultiNodeScalars.end()) {
+ for (TreeEntry *E : It->second) {
+ if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
+ TE = E;
+ return true;
+ }
+ }
+ }
+ return false;
});
- if (It != VL.end() && TE->isSame(VL))
+ if (It != VL.end()) {
+ assert(TE->isSame(VL) && "Expected same scalars.");
return TE;
+ }
return nullptr;
}
@@ -2428,13 +2330,16 @@ private:
}
/// Checks if all users of \p I are the part of the vectorization tree.
- bool areAllUsersVectorized(Instruction *I,
- ArrayRef<Value *> VectorizedVals) const;
+ bool areAllUsersVectorized(
+ Instruction *I,
+ const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
/// Return information about the vector formed for the specified index
/// of a vector of (the same) instruction.
- TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> VL,
- unsigned OpIdx);
+ TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
+
+ /// \ returns the graph entry for the \p Idx operand of the \p E entry.
+ const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
/// \returns the cost of the vectorizable entry.
InstructionCost getEntryCost(const TreeEntry *E,
@@ -2450,15 +2355,22 @@ private:
/// vector) and sets \p CurrentOrder to the identity permutation; otherwise
/// returns false, setting \p CurrentOrder to either an empty vector or a
/// non-identity permutation that allows to reuse extract instructions.
+ /// \param ResizeAllowed indicates whether it is allowed to handle subvector
+ /// extract order.
bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
- SmallVectorImpl<unsigned> &CurrentOrder) const;
+ SmallVectorImpl<unsigned> &CurrentOrder,
+ bool ResizeAllowed = false) const;
/// Vectorize a single entry in the tree.
- Value *vectorizeTree(TreeEntry *E);
+ /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
+ /// avoid issues with def-use order.
+ Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
/// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
/// \p E.
- Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
+ /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
+ /// avoid issues with def-use order.
+ Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
/// Create a new vector from a list of scalar values. Produces a sequence
/// which exploits values reused across lanes, and arranges the inserts
@@ -2477,17 +2389,50 @@ private:
/// instruction in the list).
Instruction &getLastInstructionInBundle(const TreeEntry *E);
- /// Checks if the gathered \p VL can be represented as shuffle(s) of previous
- /// tree entries.
+ /// Tries to find extractelement instructions with constant indices from fixed
+ /// vector type and gather such instructions into a bunch, which highly likely
+ /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
+ /// was successful, the matched scalars are replaced by poison values in \p VL
+ /// for future analysis.
+ std::optional<TargetTransformInfo::ShuffleKind>
+ tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
+ SmallVectorImpl<int> &Mask) const;
+
+ /// Tries to find extractelement instructions with constant indices from fixed
+ /// vector type and gather such instructions into a bunch, which highly likely
+ /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
+ /// was successful, the matched scalars are replaced by poison values in \p VL
+ /// for future analysis.
+ SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
+ tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
+ SmallVectorImpl<int> &Mask,
+ unsigned NumParts) const;
+
+ /// Checks if the gathered \p VL can be represented as a single register
+ /// shuffle(s) of previous tree entries.
/// \param TE Tree entry checked for permutation.
/// \param VL List of scalars (a subset of the TE scalar), checked for
- /// permutations.
+ /// permutations. Must form single-register vector.
/// \returns ShuffleKind, if gathered values can be represented as shuffles of
- /// previous tree entries. \p Mask is filled with the shuffle mask.
+ /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
std::optional<TargetTransformInfo::ShuffleKind>
- isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
- SmallVectorImpl<int> &Mask,
- SmallVectorImpl<const TreeEntry *> &Entries);
+ isGatherShuffledSingleRegisterEntry(
+ const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
+ SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part);
+
+ /// Checks if the gathered \p VL can be represented as multi-register
+ /// shuffle(s) of previous tree entries.
+ /// \param TE Tree entry checked for permutation.
+ /// \param VL List of scalars (a subset of the TE scalar), checked for
+ /// permutations.
+ /// \returns per-register series of ShuffleKind, if gathered values can be
+ /// represented as shuffles of previous tree entries. \p Mask is filled with
+ /// the shuffle mask (also on per-register base).
+ SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
+ isGatherShuffledEntry(
+ const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
+ SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
+ unsigned NumParts);
/// \returns the scalarization cost for this list of values. Assuming that
/// this subtree gets vectorized, we may need to extract the values from the
@@ -2517,14 +2462,14 @@ private:
/// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
/// users of \p TE and collects the stores. It returns the map from the store
/// pointers to the collected stores.
- DenseMap<Value *, SmallVector<StoreInst *, 4>>
+ DenseMap<Value *, SmallVector<StoreInst *>>
collectUserStores(const BoUpSLP::TreeEntry *TE) const;
/// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
- /// stores in \p StoresVec can form a vector instruction. If so it returns true
- /// and populates \p ReorderIndices with the shuffle indices of the the stores
- /// when compared to the sorted vector.
- bool canFormVector(const SmallVector<StoreInst *, 4> &StoresVec,
+ /// stores in \p StoresVec can form a vector instruction. If so it returns
+ /// true and populates \p ReorderIndices with the shuffle indices of the
+ /// stores when compared to the sorted vector.
+ bool canFormVector(ArrayRef<StoreInst *> StoresVec,
OrdersType &ReorderIndices) const;
/// Iterates through the users of \p TE, looking for scalar stores that can be
@@ -2621,10 +2566,18 @@ private:
/// The Scalars are vectorized into this value. It is initialized to Null.
WeakTrackingVH VectorizedValue = nullptr;
+ /// New vector phi instructions emitted for the vectorized phi nodes.
+ PHINode *PHI = nullptr;
+
/// Do we need to gather this sequence or vectorize it
/// (either with vector instruction or with scatter/gather
/// intrinsics for store/load)?
- enum EntryState { Vectorize, ScatterVectorize, NeedToGather };
+ enum EntryState {
+ Vectorize,
+ ScatterVectorize,
+ PossibleStridedVectorize,
+ NeedToGather
+ };
EntryState State;
/// Does this sequence require some shuffling?
@@ -2772,6 +2725,14 @@ private:
return FoundLane;
}
+ /// Build a shuffle mask for graph entry which represents a merge of main
+ /// and alternate operations.
+ void
+ buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
+ SmallVectorImpl<int> &Mask,
+ SmallVectorImpl<Value *> *OpScalars = nullptr,
+ SmallVectorImpl<Value *> *AltScalars = nullptr) const;
+
#ifndef NDEBUG
/// Debug printer.
LLVM_DUMP_METHOD void dump() const {
@@ -2792,6 +2753,9 @@ private:
case ScatterVectorize:
dbgs() << "ScatterVectorize\n";
break;
+ case PossibleStridedVectorize:
+ dbgs() << "PossibleStridedVectorize\n";
+ break;
case NeedToGather:
dbgs() << "NeedToGather\n";
break;
@@ -2892,7 +2856,14 @@ private:
}
if (Last->State != TreeEntry::NeedToGather) {
for (Value *V : VL) {
- assert(!getTreeEntry(V) && "Scalar already in tree!");
+ const TreeEntry *TE = getTreeEntry(V);
+ assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
+ "Scalar already in tree!");
+ if (TE) {
+ if (TE != Last)
+ MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
+ continue;
+ }
ScalarToTreeEntry[V] = Last;
}
// Update the scheduler bundle to point to this TreeEntry.
@@ -2905,7 +2876,8 @@ private:
for (Value *V : VL) {
if (doesNotNeedToBeScheduled(V))
continue;
- assert(BundleMember && "Unexpected end of bundle.");
+ if (!BundleMember)
+ continue;
BundleMember->TE = Last;
BundleMember = BundleMember->NextInBundle;
}
@@ -2913,6 +2885,10 @@ private:
assert(!BundleMember && "Bundle and VL out of sync");
} else {
MustGather.insert(VL.begin(), VL.end());
+ // Build a map for gathered scalars to the nodes where they are used.
+ for (Value *V : VL)
+ if (!isConstant(V))
+ ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
}
if (UserTreeIdx.UserTE)
@@ -2950,6 +2926,10 @@ private:
/// Maps a specific scalar to its tree entry.
SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
+ /// List of scalars, used in several vectorize nodes, and the list of the
+ /// nodes.
+ SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;
+
/// Maps a value to the proposed vectorizable size.
SmallDenseMap<Value *, unsigned> InstrElementSize;
@@ -2995,25 +2975,25 @@ private:
/// is invariant in the calling loop.
bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
Instruction *Inst2) {
+ if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
+ return true;
// First check if the result is already in the cache.
- AliasCacheKey key = std::make_pair(Inst1, Inst2);
- std::optional<bool> &result = AliasCache[key];
- if (result) {
- return *result;
- }
- bool aliased = true;
- if (Loc1.Ptr && isSimple(Inst1))
- aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
+ AliasCacheKey Key = std::make_pair(Inst1, Inst2);
+ auto It = AliasCache.find(Key);
+ if (It != AliasCache.end())
+ return It->second;
+ bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
// Store the result in the cache.
- result = aliased;
- return aliased;
+ AliasCache.try_emplace(Key, Aliased);
+ AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
+ return Aliased;
}
using AliasCacheKey = std::pair<Instruction *, Instruction *>;
/// Cache for alias results.
/// TODO: consider moving this to the AliasAnalysis itself.
- DenseMap<AliasCacheKey, std::optional<bool>> AliasCache;
+ DenseMap<AliasCacheKey, bool> AliasCache;
// Cache for pointerMayBeCaptured calls inside AA. This is preserved
// globally through SLP because we don't perform any action which
@@ -3047,7 +3027,7 @@ private:
SetVector<Instruction *> GatherShuffleExtractSeq;
/// A list of blocks that we are going to CSE.
- SetVector<BasicBlock *> CSEBlocks;
+ DenseSet<BasicBlock *> CSEBlocks;
/// Contains all scheduling relevant data for an instruction.
/// A ScheduleData either represents a single instruction or a member of an
@@ -3497,7 +3477,7 @@ private:
BasicBlock *BB;
/// Simple memory allocation for ScheduleData.
- std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
+ SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
/// The size of a ScheduleData array in ScheduleDataChunks.
int ChunkSize;
@@ -3607,7 +3587,7 @@ private:
/// where "width" indicates the minimum bit width and "signed" is True if the
/// value must be signed-extended, rather than zero-extended, back to its
/// original width.
- MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
+ DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
};
} // end namespace slpvectorizer
@@ -3676,7 +3656,7 @@ template <> struct GraphTraits<BoUpSLP *> {
template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
using TreeEntry = BoUpSLP::TreeEntry;
- DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
+ DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
std::string Str;
@@ -3699,7 +3679,8 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
const BoUpSLP *) {
if (Entry->State == TreeEntry::NeedToGather)
return "color=red";
- if (Entry->State == TreeEntry::ScatterVectorize)
+ if (Entry->State == TreeEntry::ScatterVectorize ||
+ Entry->State == TreeEntry::PossibleStridedVectorize)
return "color=blue";
return "";
}
@@ -3761,7 +3742,7 @@ static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask) {
inversePermutation(Order, MaskOrder);
}
reorderReuses(MaskOrder, Mask);
- if (ShuffleVectorInst::isIdentityMask(MaskOrder)) {
+ if (ShuffleVectorInst::isIdentityMask(MaskOrder, MaskOrder.size())) {
Order.clear();
return;
}
@@ -3779,7 +3760,40 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
OrdersType CurrentOrder(NumScalars, NumScalars);
SmallVector<int> Positions;
SmallBitVector UsedPositions(NumScalars);
- const TreeEntry *STE = nullptr;
+ DenseMap<const TreeEntry *, unsigned> UsedEntries;
+ DenseMap<Value *, std::pair<const TreeEntry *, unsigned>> ValueToEntryPos;
+ for (Value *V : TE.Scalars) {
+ if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(V))
+ continue;
+ const auto *LocalSTE = getTreeEntry(V);
+ if (!LocalSTE)
+ continue;
+ unsigned Lane =
+ std::distance(LocalSTE->Scalars.begin(), find(LocalSTE->Scalars, V));
+ if (Lane >= NumScalars)
+ continue;
+ ++UsedEntries.try_emplace(LocalSTE, 0).first->getSecond();
+ ValueToEntryPos.try_emplace(V, LocalSTE, Lane);
+ }
+ if (UsedEntries.empty())
+ return std::nullopt;
+ const TreeEntry &BestSTE =
+ *std::max_element(UsedEntries.begin(), UsedEntries.end(),
+ [](const std::pair<const TreeEntry *, unsigned> &P1,
+ const std::pair<const TreeEntry *, unsigned> &P2) {
+ return P1.second < P2.second;
+ })
+ ->first;
+ UsedEntries.erase(&BestSTE);
+ const TreeEntry *SecondBestSTE = nullptr;
+ if (!UsedEntries.empty())
+ SecondBestSTE =
+ std::max_element(UsedEntries.begin(), UsedEntries.end(),
+ [](const std::pair<const TreeEntry *, unsigned> &P1,
+ const std::pair<const TreeEntry *, unsigned> &P2) {
+ return P1.second < P2.second;
+ })
+ ->first;
// Try to find all gathered scalars that are gets vectorized in other
// vectorize node. Here we can have only one single tree vector node to
// correctly identify order of the gathered scalars.
@@ -3787,58 +3801,56 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
Value *V = TE.Scalars[I];
if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(V))
continue;
- if (const auto *LocalSTE = getTreeEntry(V)) {
- if (!STE)
- STE = LocalSTE;
- else if (STE != LocalSTE)
- // Take the order only from the single vector node.
- return std::nullopt;
- unsigned Lane =
- std::distance(STE->Scalars.begin(), find(STE->Scalars, V));
- if (Lane >= NumScalars)
- return std::nullopt;
- if (CurrentOrder[Lane] != NumScalars) {
- if (Lane != I)
- continue;
- UsedPositions.reset(CurrentOrder[Lane]);
- }
- // The partial identity (where only some elements of the gather node are
- // in the identity order) is good.
- CurrentOrder[Lane] = I;
- UsedPositions.set(I);
+ const auto [LocalSTE, Lane] = ValueToEntryPos.lookup(V);
+ if (!LocalSTE || (LocalSTE != &BestSTE && LocalSTE != SecondBestSTE))
+ continue;
+ if (CurrentOrder[Lane] != NumScalars) {
+ if ((CurrentOrder[Lane] >= BestSTE.Scalars.size() ||
+ BestSTE.Scalars[CurrentOrder[Lane]] == V) &&
+ (Lane != I || LocalSTE == SecondBestSTE))
+ continue;
+ UsedPositions.reset(CurrentOrder[Lane]);
}
+ // The partial identity (where only some elements of the gather node are
+ // in the identity order) is good.
+ CurrentOrder[Lane] = I;
+ UsedPositions.set(I);
}
// Need to keep the order if we have a vector entry and at least 2 scalars or
// the vectorized entry has just 2 scalars.
- if (STE && (UsedPositions.count() > 1 || STE->Scalars.size() == 2)) {
- auto &&IsIdentityOrder = [NumScalars](ArrayRef<unsigned> CurrentOrder) {
- for (unsigned I = 0; I < NumScalars; ++I)
- if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars)
- return false;
- return true;
- };
- if (IsIdentityOrder(CurrentOrder))
- return OrdersType();
- auto *It = CurrentOrder.begin();
- for (unsigned I = 0; I < NumScalars;) {
- if (UsedPositions.test(I)) {
- ++I;
- continue;
- }
- if (*It == NumScalars) {
- *It = I;
- ++I;
- }
- ++It;
+ if (BestSTE.Scalars.size() != 2 && UsedPositions.count() <= 1)
+ return std::nullopt;
+ auto IsIdentityOrder = [&](ArrayRef<unsigned> CurrentOrder) {
+ for (unsigned I = 0; I < NumScalars; ++I)
+ if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars)
+ return false;
+ return true;
+ };
+ if (IsIdentityOrder(CurrentOrder))
+ return OrdersType();
+ auto *It = CurrentOrder.begin();
+ for (unsigned I = 0; I < NumScalars;) {
+ if (UsedPositions.test(I)) {
+ ++I;
+ continue;
}
- return std::move(CurrentOrder);
+ if (*It == NumScalars) {
+ *It = I;
+ ++I;
+ }
+ ++It;
}
- return std::nullopt;
+ return std::move(CurrentOrder);
}
namespace {
/// Tracks the state we can represent the loads in the given sequence.
-enum class LoadsState { Gather, Vectorize, ScatterVectorize };
+enum class LoadsState {
+ Gather,
+ Vectorize,
+ ScatterVectorize,
+ PossibleStridedVectorize
+};
} // anonymous namespace
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
@@ -3898,6 +3910,7 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
if (IsSorted || all_of(PointerOps, [&](Value *P) {
return arePointersCompatible(P, PointerOps.front(), TLI);
})) {
+ bool IsPossibleStrided = false;
if (IsSorted) {
Value *Ptr0;
Value *PtrN;
@@ -3913,6 +3926,8 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
// Check that the sorted loads are consecutive.
if (static_cast<unsigned>(*Diff) == VL.size() - 1)
return LoadsState::Vectorize;
+ // Simple check if not a strided access - clear order.
+ IsPossibleStrided = *Diff % (VL.size() - 1) == 0;
}
// TODO: need to improve analysis of the pointers, if not all of them are
// GEPs or have > 2 operands, we end up with a gather node, which just
@@ -3934,7 +3949,8 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) &&
!TTI.forceScalarizeMaskedGather(VecTy, CommonAlignment))
- return LoadsState::ScatterVectorize;
+ return IsPossibleStrided ? LoadsState::PossibleStridedVectorize
+ : LoadsState::ScatterVectorize;
}
}
@@ -4050,7 +4066,8 @@ static bool areTwoInsertFromSameBuildVector(
// Go through the vector operand of insertelement instructions trying to find
// either VU as the original vector for IE2 or V as the original vector for
// IE1.
- SmallSet<int, 8> ReusedIdx;
+ SmallBitVector ReusedIdx(
+ cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
bool IsReusedIdx = false;
do {
if (IE2 == VU && !IE1)
@@ -4058,16 +4075,18 @@ static bool areTwoInsertFromSameBuildVector(
if (IE1 == V && !IE2)
return V->hasOneUse();
if (IE1 && IE1 != V) {
- IsReusedIdx |=
- !ReusedIdx.insert(getInsertIndex(IE1).value_or(*Idx2)).second;
+ unsigned Idx1 = getInsertIndex(IE1).value_or(*Idx2);
+ IsReusedIdx |= ReusedIdx.test(Idx1);
+ ReusedIdx.set(Idx1);
if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
IE1 = nullptr;
else
IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
}
if (IE2 && IE2 != VU) {
- IsReusedIdx |=
- !ReusedIdx.insert(getInsertIndex(IE2).value_or(*Idx1)).second;
+ unsigned Idx2 = getInsertIndex(IE2).value_or(*Idx1);
+ IsReusedIdx |= ReusedIdx.test(Idx2);
+ ReusedIdx.set(Idx2);
if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
IE2 = nullptr;
else
@@ -4135,13 +4154,16 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
return std::nullopt; // No need to reorder.
return std::move(ResOrder);
}
- if (TE.State == TreeEntry::Vectorize &&
+ if ((TE.State == TreeEntry::Vectorize ||
+ TE.State == TreeEntry::PossibleStridedVectorize) &&
(isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
(TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
!TE.isAltShuffle())
return TE.ReorderIndices;
if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
- auto PHICompare = [](llvm::Value *V1, llvm::Value *V2) {
+ auto PHICompare = [&](unsigned I1, unsigned I2) {
+ Value *V1 = TE.Scalars[I1];
+ Value *V2 = TE.Scalars[I2];
if (V1 == V2)
return false;
if (!V1->hasOneUse() || !V2->hasOneUse())
@@ -4180,14 +4202,13 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
};
if (!TE.ReorderIndices.empty())
return TE.ReorderIndices;
- DenseMap<Value *, unsigned> PhiToId;
- SmallVector<Value *, 4> Phis;
+ DenseMap<unsigned, unsigned> PhiToId;
+ SmallVector<unsigned> Phis(TE.Scalars.size());
+ std::iota(Phis.begin(), Phis.end(), 0);
OrdersType ResOrder(TE.Scalars.size());
- for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id) {
- PhiToId[TE.Scalars[Id]] = Id;
- Phis.push_back(TE.Scalars[Id]);
- }
- llvm::stable_sort(Phis, PHICompare);
+ for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
+ PhiToId[Id] = Id;
+ stable_sort(Phis, PHICompare);
for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
ResOrder[Id] = PhiToId[Phis[Id]];
if (IsIdentityOrder(ResOrder))
@@ -4214,7 +4235,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
// Check that gather of extractelements can be represented as
// just a shuffle of a single vector.
OrdersType CurrentOrder;
- bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder);
+ bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
+ /*ResizeAllowed=*/true);
if (Reuse || !CurrentOrder.empty()) {
if (!CurrentOrder.empty())
fixupOrderingIndices(CurrentOrder);
@@ -4270,7 +4292,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
unsigned Sz) {
ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
- if (ShuffleVectorInst::isIdentityMask(FirstCluster))
+ if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
return false;
for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
ArrayRef<int> Cluster = Mask.slice(I, Sz);
@@ -4386,7 +4408,9 @@ void BoUpSLP::reorderTopToBottom() {
++Cnt;
}
VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
- if (TE->State != TreeEntry::Vectorize || !TE->ReuseShuffleIndices.empty())
+ if (!(TE->State == TreeEntry::Vectorize ||
+ TE->State == TreeEntry::PossibleStridedVectorize) ||
+ !TE->ReuseShuffleIndices.empty())
GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
if (TE->State == TreeEntry::Vectorize &&
TE->getOpcode() == Instruction::PHI)
@@ -4409,6 +4433,9 @@ void BoUpSLP::reorderTopToBottom() {
MapVector<OrdersType, unsigned,
DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
OrdersUses;
+ // Last chance orders - scatter vectorize. Try to use their orders if no
+ // other orders or the order is counted already.
+ SmallVector<OrdersType> StridedVectorizeOrders;
SmallPtrSet<const TreeEntry *, 4> VisitedOps;
for (const TreeEntry *OpTE : OrderedEntries) {
// No need to reorder this nodes, still need to extend and to use shuffle,
@@ -4455,6 +4482,11 @@ void BoUpSLP::reorderTopToBottom() {
if (Order.empty())
continue;
}
+ // Postpone scatter orders.
+ if (OpTE->State == TreeEntry::PossibleStridedVectorize) {
+ StridedVectorizeOrders.push_back(Order);
+ continue;
+ }
// Stores actually store the mask, not the order, need to invert.
if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
@@ -4472,8 +4504,21 @@ void BoUpSLP::reorderTopToBottom() {
}
}
// Set order of the user node.
- if (OrdersUses.empty())
- continue;
+ if (OrdersUses.empty()) {
+ if (StridedVectorizeOrders.empty())
+ continue;
+ // Add (potentially!) strided vectorize orders.
+ for (OrdersType &Order : StridedVectorizeOrders)
+ ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
+ } else {
+ // Account (potentially!) strided vectorize orders only if it was used
+ // already.
+ for (OrdersType &Order : StridedVectorizeOrders) {
+ auto *It = OrdersUses.find(Order);
+ if (It != OrdersUses.end())
+ ++It->second;
+ }
+ }
// Choose the most used order.
ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
unsigned Cnt = OrdersUses.front().second;
@@ -4514,7 +4559,8 @@ void BoUpSLP::reorderTopToBottom() {
}
continue;
}
- if (TE->State == TreeEntry::Vectorize &&
+ if ((TE->State == TreeEntry::Vectorize ||
+ TE->State == TreeEntry::PossibleStridedVectorize) &&
isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
InsertElementInst>(TE->getMainOp()) &&
!TE->isAltShuffle()) {
@@ -4555,6 +4601,10 @@ bool BoUpSLP::canReorderOperands(
}))
continue;
if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
+ // FIXME: Do not reorder (possible!) strided vectorized nodes, they
+ // require reordering of the operands, which is not implemented yet.
+ if (TE->State == TreeEntry::PossibleStridedVectorize)
+ return false;
// Do not reorder if operand node is used by many user nodes.
if (any_of(TE->UserTreeIndices,
[UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
@@ -4567,7 +4617,8 @@ bool BoUpSLP::canReorderOperands(
// simply add to the list of gathered ops.
// If there are reused scalars, process this node as a regular vectorize
// node, just reorder reuses mask.
- if (TE->State != TreeEntry::Vectorize && TE->ReuseShuffleIndices.empty())
+ if (TE->State != TreeEntry::Vectorize &&
+ TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
GatherOps.push_back(TE);
continue;
}
@@ -4602,18 +4653,19 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
// Currently the are vectorized loads,extracts without alternate operands +
// some gathering of extracts.
SmallVector<TreeEntry *> NonVectorized;
- for_each(VectorizableTree, [this, &OrderedEntries, &GathersToOrders,
- &NonVectorized](
- const std::unique_ptr<TreeEntry> &TE) {
- if (TE->State != TreeEntry::Vectorize)
+ for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+ if (TE->State != TreeEntry::Vectorize &&
+ TE->State != TreeEntry::PossibleStridedVectorize)
NonVectorized.push_back(TE.get());
if (std::optional<OrdersType> CurrentOrder =
getReorderingData(*TE, /*TopToBottom=*/false)) {
OrderedEntries.insert(TE.get());
- if (TE->State != TreeEntry::Vectorize || !TE->ReuseShuffleIndices.empty())
+ if (!(TE->State == TreeEntry::Vectorize ||
+ TE->State == TreeEntry::PossibleStridedVectorize) ||
+ !TE->ReuseShuffleIndices.empty())
GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
}
- });
+ }
// 1. Propagate order to the graph nodes, which use only reordered nodes.
// I.e., if the node has operands, that are reordered, try to make at least
@@ -4627,6 +4679,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
SmallVector<TreeEntry *> Filtered;
for (TreeEntry *TE : OrderedEntries) {
if (!(TE->State == TreeEntry::Vectorize ||
+ TE->State == TreeEntry::PossibleStridedVectorize ||
(TE->State == TreeEntry::NeedToGather &&
GathersToOrders.count(TE))) ||
TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
@@ -4649,8 +4702,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
}
}
// Erase filtered entries.
- for_each(Filtered,
- [&OrderedEntries](TreeEntry *TE) { OrderedEntries.remove(TE); });
+ for (TreeEntry *TE : Filtered)
+ OrderedEntries.remove(TE);
SmallVector<
std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
UsersVec(Users.begin(), Users.end());
@@ -4662,10 +4715,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
SmallVector<TreeEntry *> GatherOps;
if (!canReorderOperands(Data.first, Data.second, NonVectorized,
GatherOps)) {
- for_each(Data.second,
- [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
- OrderedEntries.remove(Op.second);
- });
+ for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
+ OrderedEntries.remove(Op.second);
continue;
}
// All operands are reordered and used only in this node - propagate the
@@ -4673,6 +4724,9 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
MapVector<OrdersType, unsigned,
DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
OrdersUses;
+ // Last chance orders - scatter vectorize. Try to use their orders if no
+ // other orders or the order is counted already.
+ SmallVector<std::pair<OrdersType, unsigned>> StridedVectorizeOrders;
// Do the analysis for each tree entry only once, otherwise the order of
// the same node my be considered several times, though might be not
// profitable.
@@ -4694,6 +4748,11 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
return P.second == OpTE;
});
+ // Postpone scatter orders.
+ if (OpTE->State == TreeEntry::PossibleStridedVectorize) {
+ StridedVectorizeOrders.emplace_back(Order, NumOps);
+ continue;
+ }
// Stores actually store the mask, not the order, need to invert.
if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
@@ -4754,11 +4813,27 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
}
// If no orders - skip current nodes and jump to the next one, if any.
if (OrdersUses.empty()) {
- for_each(Data.second,
- [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
- OrderedEntries.remove(Op.second);
- });
- continue;
+ if (StridedVectorizeOrders.empty() ||
+ (Data.first->ReorderIndices.empty() &&
+ Data.first->ReuseShuffleIndices.empty() &&
+ !(IgnoreReorder &&
+ Data.first == VectorizableTree.front().get()))) {
+ for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
+ OrderedEntries.remove(Op.second);
+ continue;
+ }
+ // Add (potentially!) strided vectorize orders.
+ for (std::pair<OrdersType, unsigned> &Pair : StridedVectorizeOrders)
+ OrdersUses.insert(std::make_pair(Pair.first, 0)).first->second +=
+ Pair.second;
+ } else {
+ // Account (potentially!) strided vectorize orders only if it was used
+ // already.
+ for (std::pair<OrdersType, unsigned> &Pair : StridedVectorizeOrders) {
+ auto *It = OrdersUses.find(Pair.first);
+ if (It != OrdersUses.end())
+ It->second += Pair.second;
+ }
}
// Choose the best order.
ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
@@ -4771,10 +4846,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
}
// Set order of the user node (reordering of operands and user nodes).
if (BestOrder.empty()) {
- for_each(Data.second,
- [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
- OrderedEntries.remove(Op.second);
- });
+ for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
+ OrderedEntries.remove(Op.second);
continue;
}
// Erase operands from OrderedEntries list and adjust their orders.
@@ -4796,7 +4869,10 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
continue;
}
// Gathers are processed separately.
- if (TE->State != TreeEntry::Vectorize)
+ if (TE->State != TreeEntry::Vectorize &&
+ TE->State != TreeEntry::PossibleStridedVectorize &&
+ (TE->State != TreeEntry::ScatterVectorize ||
+ TE->ReorderIndices.empty()))
continue;
assert((BestOrder.size() == TE->ReorderIndices.size() ||
TE->ReorderIndices.empty()) &&
@@ -4825,7 +4901,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
Data.first->isAltShuffle())
Data.first->reorderOperands(Mask);
if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
- Data.first->isAltShuffle()) {
+ Data.first->isAltShuffle() ||
+ Data.first->State == TreeEntry::PossibleStridedVectorize) {
reorderScalars(Data.first->Scalars, Mask);
reorderOrder(Data.first->ReorderIndices, MaskOrder);
if (Data.first->ReuseShuffleIndices.empty() &&
@@ -4859,10 +4936,12 @@ void BoUpSLP::buildExternalUses(
// For each lane:
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
Value *Scalar = Entry->Scalars[Lane];
+ if (!isa<Instruction>(Scalar))
+ continue;
int FoundLane = Entry->findLaneForValue(Scalar);
// Check if the scalar is externally used as an extra arg.
- auto ExtI = ExternallyUsedValues.find(Scalar);
+ const auto *ExtI = ExternallyUsedValues.find(Scalar);
if (ExtI != ExternallyUsedValues.end()) {
LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
<< Lane << " from " << *Scalar << ".\n");
@@ -4886,7 +4965,8 @@ void BoUpSLP::buildExternalUses(
// be used.
if (UseScalar != U ||
UseEntry->State == TreeEntry::ScatterVectorize ||
- !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
+ UseEntry->State == TreeEntry::PossibleStridedVectorize ||
+ !doesInTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
<< ".\n");
assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
@@ -4906,9 +4986,9 @@ void BoUpSLP::buildExternalUses(
}
}
-DenseMap<Value *, SmallVector<StoreInst *, 4>>
+DenseMap<Value *, SmallVector<StoreInst *>>
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
- DenseMap<Value *, SmallVector<StoreInst *, 4>> PtrToStoresMap;
+ DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap;
for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
Value *V = TE->Scalars[Lane];
// To save compilation time we don't visit if we have too many users.
@@ -4947,14 +5027,14 @@ BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
return PtrToStoresMap;
}
-bool BoUpSLP::canFormVector(const SmallVector<StoreInst *, 4> &StoresVec,
+bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
OrdersType &ReorderIndices) const {
// We check whether the stores in StoreVec can form a vector by sorting them
// and checking whether they are consecutive.
// To avoid calling getPointersDiff() while sorting we create a vector of
// pairs {store, offset from first} and sort this instead.
- SmallVector<std::pair<StoreInst *, int>, 4> StoreOffsetVec(StoresVec.size());
+ SmallVector<std::pair<StoreInst *, int>> StoreOffsetVec(StoresVec.size());
StoreInst *S0 = StoresVec[0];
StoreOffsetVec[0] = {S0, 0};
Type *S0Ty = S0->getValueOperand()->getType();
@@ -5023,7 +5103,7 @@ SmallVector<BoUpSLP::OrdersType, 1>
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
unsigned NumLanes = TE->Scalars.size();
- DenseMap<Value *, SmallVector<StoreInst *, 4>> PtrToStoresMap =
+ DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap =
collectUserStores(TE);
// Holds the reorder indices for each candidate store vector that is a user of
@@ -5244,6 +5324,8 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
return TreeEntry::Vectorize;
case LoadsState::ScatterVectorize:
return TreeEntry::ScatterVectorize;
+ case LoadsState::PossibleStridedVectorize:
+ return TreeEntry::PossibleStridedVectorize;
case LoadsState::Gather:
#ifndef NDEBUG
Type *ScalarTy = VL0->getType();
@@ -5416,7 +5498,8 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
VFShape Shape = VFShape::get(
- *CI, ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
+ CI->getFunctionType(),
+ ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
false /*HasGlobalPred*/);
Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
@@ -5488,9 +5571,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
SmallVector<int> ReuseShuffleIndicies;
SmallVector<Value *> UniqueValues;
- auto &&TryToFindDuplicates = [&VL, &ReuseShuffleIndicies, &UniqueValues,
- &UserTreeIdx,
- this](const InstructionsState &S) {
+ SmallVector<Value *> NonUniqueValueVL;
+ auto TryToFindDuplicates = [&](const InstructionsState &S,
+ bool DoNotFail = false) {
// Check that every instruction appears once in this bundle.
DenseMap<Value *, unsigned> UniquePositions(VL.size());
for (Value *V : VL) {
@@ -5517,6 +5600,24 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
!isConstant(V);
})) ||
!llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
+ if (DoNotFail && UniquePositions.size() > 1 &&
+ NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
+ all_of(UniqueValues, [=](Value *V) {
+ return isa<ExtractElementInst>(V) ||
+ areAllUsersVectorized(cast<Instruction>(V),
+ UserIgnoreList);
+ })) {
+ unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
+ if (PWSz == VL.size()) {
+ ReuseShuffleIndicies.clear();
+ } else {
+ NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
+ NonUniqueValueVL.append(PWSz - UniqueValues.size(),
+ UniqueValues.back());
+ VL = NonUniqueValueVL;
+ }
+ return true;
+ }
LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
return false;
@@ -5528,6 +5629,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
InstructionsState S = getSameOpcode(VL, *TLI);
+ // Don't vectorize ephemeral values.
+ if (!EphValues.empty()) {
+ for (Value *V : VL) {
+ if (EphValues.count(V)) {
+ LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
+ << ") is ephemeral.\n");
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
+ }
+ }
+
// Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
// a load), in which case peek through to include it in the tree, without
// ballooning over-budget.
@@ -5633,7 +5746,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
BasicBlock *BB = nullptr;
bool IsScatterVectorizeUserTE =
UserTreeIdx.UserTE &&
- UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
+ (UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize ||
+ UserTreeIdx.UserTE->State == TreeEntry::PossibleStridedVectorize);
bool AreAllSameInsts =
(S.getOpcode() && allSameBlock(VL)) ||
(S.OpValue->getType()->isPointerTy() && IsScatterVectorizeUserTE &&
@@ -5665,39 +5779,44 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// We now know that this is a vector of instructions of the same type from
// the same block.
- // Don't vectorize ephemeral values.
- if (!EphValues.empty()) {
- for (Value *V : VL) {
- if (EphValues.count(V)) {
- LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
- << ") is ephemeral.\n");
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
- return;
- }
- }
- }
-
// Check if this is a duplicate of another entry.
if (TreeEntry *E = getTreeEntry(S.OpValue)) {
LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
if (!E->isSame(VL)) {
- LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
- if (TryToFindDuplicates(S))
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
+ auto It = MultiNodeScalars.find(S.OpValue);
+ if (It != MultiNodeScalars.end()) {
+ auto *TEIt = find_if(It->getSecond(),
+ [&](TreeEntry *ME) { return ME->isSame(VL); });
+ if (TEIt != It->getSecond().end())
+ E = *TEIt;
+ else
+ E = nullptr;
+ } else {
+ E = nullptr;
+ }
+ }
+ if (!E) {
+ if (!doesNotNeedToBeScheduled(S.OpValue)) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
+ if (TryToFindDuplicates(S))
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ return;
+ }
+ } else {
+ // Record the reuse of the tree node. FIXME, currently this is only used
+ // to properly draw the graph rather than for the actual vectorization.
+ E->UserTreeIndices.push_back(UserTreeIdx);
+ LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
+ << ".\n");
return;
}
- // Record the reuse of the tree node. FIXME, currently this is only used to
- // properly draw the graph rather than for the actual vectorization.
- E->UserTreeIndices.push_back(UserTreeIdx);
- LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
- << ".\n");
- return;
}
// Check that none of the instructions in the bundle are already in the tree.
for (Value *V : VL) {
- if (!IsScatterVectorizeUserTE && !isa<Instruction>(V))
+ if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
+ doesNotNeedToBeScheduled(V))
continue;
if (getTreeEntry(V)) {
LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
@@ -5725,7 +5844,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Special processing for sorted pointers for ScatterVectorize node with
// constant indeces only.
if (AreAllSameInsts && UserTreeIdx.UserTE &&
- UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize &&
+ (UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize ||
+ UserTreeIdx.UserTE->State == TreeEntry::PossibleStridedVectorize) &&
!(S.getOpcode() && allSameBlock(VL))) {
assert(S.OpValue->getType()->isPointerTy() &&
count_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); }) >=
@@ -5760,7 +5880,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
// Check that every instruction appears once in this bundle.
- if (!TryToFindDuplicates(S))
+ if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
return;
// Perform specific checks for each particular instruction kind.
@@ -5780,7 +5900,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
BlockScheduling &BS = *BSRef;
- std::optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S);
+ std::optional<ScheduleData *> Bundle =
+ BS.tryScheduleBundle(UniqueValues, this, S);
#ifdef EXPENSIVE_CHECKS
// Make sure we didn't break any internal invariants
BS.verify();
@@ -5905,6 +6026,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// from such a struct, we read/write packed bits disagreeing with the
// unvectorized version.
TreeEntry *TE = nullptr;
+ fixupOrderingIndices(CurrentOrder);
switch (State) {
case TreeEntry::Vectorize:
if (CurrentOrder.empty()) {
@@ -5913,7 +6035,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
} else {
- fixupOrderingIndices(CurrentOrder);
// Need to reorder.
TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies, CurrentOrder);
@@ -5921,6 +6042,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
TE->setOperandsInOrder();
break;
+ case TreeEntry::PossibleStridedVectorize:
+ // Vectorizing non-consecutive loads with `llvm.masked.gather`.
+ if (CurrentOrder.empty()) {
+ TE = newTreeEntry(VL, TreeEntry::PossibleStridedVectorize, Bundle, S,
+ UserTreeIdx, ReuseShuffleIndicies);
+ } else {
+ TE = newTreeEntry(VL, TreeEntry::PossibleStridedVectorize, Bundle, S,
+ UserTreeIdx, ReuseShuffleIndicies, CurrentOrder);
+ }
+ TE->setOperandsInOrder();
+ buildTree_rec(PointerOps, Depth + 1, {TE, 0});
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
+ break;
case TreeEntry::ScatterVectorize:
// Vectorizing non-consecutive loads with `llvm.masked.gather`.
TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
@@ -5951,13 +6085,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
TE->setOperandsInOrder();
- for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+ for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
ValueList Operands;
// Prepare the operand vector.
for (Value *V : VL)
- Operands.push_back(cast<Instruction>(V)->getOperand(i));
+ Operands.push_back(cast<Instruction>(V)->getOperand(I));
- buildTree_rec(Operands, Depth + 1, {TE, i});
+ buildTree_rec(Operands, Depth + 1, {TE, I});
}
return;
}
@@ -6031,13 +6165,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
TE->setOperandsInOrder();
- for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+ for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
ValueList Operands;
// Prepare the operand vector.
for (Value *V : VL)
- Operands.push_back(cast<Instruction>(V)->getOperand(i));
+ Operands.push_back(cast<Instruction>(V)->getOperand(I));
- buildTree_rec(Operands, Depth + 1, {TE, i});
+ buildTree_rec(Operands, Depth + 1, {TE, I});
}
return;
}
@@ -6087,8 +6221,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (!CI)
Operands.back().push_back(Op);
else
- Operands.back().push_back(ConstantExpr::getIntegerCast(
- CI, Ty, CI->getValue().isSignBitSet()));
+ Operands.back().push_back(ConstantFoldIntegerCast(
+ CI, Ty, CI->getValue().isSignBitSet(), *DL));
}
TE->setOperand(IndexIdx, Operands.back());
@@ -6132,18 +6266,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
TE->setOperandsInOrder();
- for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) {
- // For scalar operands no need to to create an entry since no need to
+ for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
+ // For scalar operands no need to create an entry since no need to
// vectorize it.
- if (isVectorIntrinsicWithScalarOpAtArg(ID, i))
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
continue;
ValueList Operands;
// Prepare the operand vector.
for (Value *V : VL) {
auto *CI2 = cast<CallInst>(V);
- Operands.push_back(CI2->getArgOperand(i));
+ Operands.push_back(CI2->getArgOperand(I));
}
- buildTree_rec(Operands, Depth + 1, {TE, i});
+ buildTree_rec(Operands, Depth + 1, {TE, I});
}
return;
}
@@ -6194,13 +6328,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
TE->setOperandsInOrder();
- for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+ for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
ValueList Operands;
// Prepare the operand vector.
for (Value *V : VL)
- Operands.push_back(cast<Instruction>(V)->getOperand(i));
+ Operands.push_back(cast<Instruction>(V)->getOperand(I));
- buildTree_rec(Operands, Depth + 1, {TE, i});
+ buildTree_rec(Operands, Depth + 1, {TE, I});
}
return;
}
@@ -6210,7 +6344,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
llvm_unreachable("Unexpected vectorization of the instructions.");
}
-unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
+unsigned BoUpSLP::canMapToVector(Type *T) const {
unsigned N = 1;
Type *EltTy = T;
@@ -6234,15 +6368,16 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
if (!isValidElementType(EltTy))
return 0;
- uint64_t VTSize = DL.getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N));
+ uint64_t VTSize = DL->getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N));
if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
- VTSize != DL.getTypeStoreSizeInBits(T))
+ VTSize != DL->getTypeStoreSizeInBits(T))
return 0;
return N;
}
bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
- SmallVectorImpl<unsigned> &CurrentOrder) const {
+ SmallVectorImpl<unsigned> &CurrentOrder,
+ bool ResizeAllowed) const {
const auto *It = find_if(VL, [](Value *V) {
return isa<ExtractElementInst, ExtractValueInst>(V);
});
@@ -6263,8 +6398,7 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
// We have to extract from a vector/aggregate with the same number of elements.
unsigned NElts;
if (E0->getOpcode() == Instruction::ExtractValue) {
- const DataLayout &DL = E0->getModule()->getDataLayout();
- NElts = canMapToVector(Vec->getType(), DL);
+ NElts = canMapToVector(Vec->getType());
if (!NElts)
return false;
// Check if load can be rewritten as load of vector.
@@ -6275,46 +6409,55 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
}
- if (NElts != VL.size())
- return false;
-
- // Check that all of the indices extract from the correct offset.
- bool ShouldKeepOrder = true;
unsigned E = VL.size();
- // Assign to all items the initial value E + 1 so we can check if the extract
- // instruction index was used already.
- // Also, later we can check that all the indices are used and we have a
- // consecutive access in the extract instructions, by checking that no
- // element of CurrentOrder still has value E + 1.
- CurrentOrder.assign(E, E);
- unsigned I = 0;
- for (; I < E; ++I) {
- auto *Inst = dyn_cast<Instruction>(VL[I]);
+ if (!ResizeAllowed && NElts != E)
+ return false;
+ SmallVector<int> Indices(E, PoisonMaskElem);
+ unsigned MinIdx = NElts, MaxIdx = 0;
+ for (auto [I, V] : enumerate(VL)) {
+ auto *Inst = dyn_cast<Instruction>(V);
if (!Inst)
continue;
if (Inst->getOperand(0) != Vec)
- break;
+ return false;
if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
if (isa<UndefValue>(EE->getIndexOperand()))
continue;
std::optional<unsigned> Idx = getExtractIndex(Inst);
if (!Idx)
- break;
+ return false;
const unsigned ExtIdx = *Idx;
- if (ExtIdx != I) {
- if (ExtIdx >= E || CurrentOrder[ExtIdx] != E)
- break;
- ShouldKeepOrder = false;
- CurrentOrder[ExtIdx] = I;
- } else {
- if (CurrentOrder[I] != E)
- break;
- CurrentOrder[I] = I;
- }
+ if (ExtIdx >= NElts)
+ continue;
+ Indices[I] = ExtIdx;
+ if (MinIdx > ExtIdx)
+ MinIdx = ExtIdx;
+ if (MaxIdx < ExtIdx)
+ MaxIdx = ExtIdx;
}
- if (I < E) {
- CurrentOrder.clear();
+ if (MaxIdx - MinIdx + 1 > E)
return false;
+ if (MaxIdx + 1 <= E)
+ MinIdx = 0;
+
+ // Check that all of the indices extract from the correct offset.
+ bool ShouldKeepOrder = true;
+ // Assign to all items the initial value E so we can check if the extract
+ // instruction index was used already.
+ // Also, later we can check that all the indices are used and we have a
+ // consecutive access in the extract instructions, by checking that no
+ // element of CurrentOrder still has value E.
+ CurrentOrder.assign(E, E);
+ for (unsigned I = 0; I < E; ++I) {
+ if (Indices[I] == PoisonMaskElem)
+ continue;
+ const unsigned ExtIdx = Indices[I] - MinIdx;
+ if (CurrentOrder[ExtIdx] != E) {
+ CurrentOrder.clear();
+ return false;
+ }
+ ShouldKeepOrder &= ExtIdx == I;
+ CurrentOrder[ExtIdx] = I;
}
if (ShouldKeepOrder)
CurrentOrder.clear();
@@ -6322,9 +6465,9 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
return ShouldKeepOrder;
}
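For reference, a minimal standalone sketch (plain C++, not part of the patch; names and the -1 "poison" convention are illustrative only) of the index rebasing that the rewritten canReuseExtract performs: collect the extract indices, reject a spread wider than the bundle, rebase to the minimum index when needed, and reject duplicate target slots.

// Standalone sketch: rebase extract indices the way canReuseExtract now does.
#include <algorithm>
#include <cstdio>
#include <vector>

// Returns false if the indices cannot be mapped onto a window of E slots;
// on success Order[rebased index] = position in the bundle.
static bool normalizeExtractIndices(const std::vector<int> &Indices,
                                    unsigned NElts,
                                    std::vector<unsigned> &Order) {
  unsigned E = Indices.size();
  unsigned MinIdx = NElts, MaxIdx = 0;
  for (int Idx : Indices) {
    if (Idx < 0 || static_cast<unsigned>(Idx) >= NElts)
      continue;                       // poison / out-of-range lanes are skipped
    MinIdx = std::min(MinIdx, static_cast<unsigned>(Idx));
    MaxIdx = std::max(MaxIdx, static_cast<unsigned>(Idx));
  }
  if (MaxIdx - MinIdx + 1 > E)
    return false;                     // spread wider than the bundle
  if (MaxIdx + 1 <= E)
    MinIdx = 0;                       // already fits at the front, keep offsets
  Order.assign(E, E);                 // E means "slot unused"
  for (unsigned I = 0; I < E; ++I) {
    if (Indices[I] < 0)
      continue;
    unsigned Rebased = Indices[I] - MinIdx;
    if (Order[Rebased] != E)
      return false;                   // duplicate target slot
    Order[Rebased] = I;
  }
  return true;
}

int main() {
  std::vector<unsigned> Order;
  // Extracts from lanes 5,4,7,6 of an 8-wide source rebase to slots 1,0,3,2.
  bool OK = normalizeExtractIndices({5, 4, 7, 6}, 8, Order);
  std::printf("%d: %u %u %u %u\n", OK, Order[0], Order[1], Order[2], Order[3]);
}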
-bool BoUpSLP::areAllUsersVectorized(Instruction *I,
- ArrayRef<Value *> VectorizedVals) const {
- return (I->hasOneUse() && is_contained(VectorizedVals, I)) ||
+bool BoUpSLP::areAllUsersVectorized(
+ Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
+ return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
all_of(I->users(), [this](User *U) {
return ScalarToTreeEntry.count(U) > 0 ||
isVectorLikeInstWithConstOps(U) ||
@@ -6351,8 +6494,8 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
auto IntrinsicCost =
TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
- auto Shape = VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>(
- VecTy->getNumElements())),
+ auto Shape = VFShape::get(CI->getFunctionType(),
+ ElementCount::getFixed(VecTy->getNumElements()),
false /*HasGlobalPred*/);
Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
auto LibCost = IntrinsicCost;
@@ -6365,16 +6508,11 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
return {IntrinsicCost, LibCost};
}
-/// Build shuffle mask for shuffle graph entries and lists of main and alternate
-/// operations operands.
-static void
-buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices,
- ArrayRef<int> ReusesIndices,
- const function_ref<bool(Instruction *)> IsAltOp,
- SmallVectorImpl<int> &Mask,
- SmallVectorImpl<Value *> *OpScalars = nullptr,
- SmallVectorImpl<Value *> *AltScalars = nullptr) {
- unsigned Sz = VL.size();
+void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
+ const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
+ SmallVectorImpl<Value *> *OpScalars,
+ SmallVectorImpl<Value *> *AltScalars) const {
+ unsigned Sz = Scalars.size();
Mask.assign(Sz, PoisonMaskElem);
SmallVector<int> OrderMask;
if (!ReorderIndices.empty())
@@ -6383,7 +6521,7 @@ buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices,
unsigned Idx = I;
if (!ReorderIndices.empty())
Idx = OrderMask[I];
- auto *OpInst = cast<Instruction>(VL[Idx]);
+ auto *OpInst = cast<Instruction>(Scalars[Idx]);
if (IsAltOp(OpInst)) {
Mask[I] = Sz + Idx;
if (AltScalars)
@@ -6394,9 +6532,9 @@ buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices,
OpScalars->push_back(OpInst);
}
}
- if (!ReusesIndices.empty()) {
- SmallVector<int> NewMask(ReusesIndices.size(), PoisonMaskElem);
- transform(ReusesIndices, NewMask.begin(), [&Mask](int Idx) {
+ if (!ReuseShuffleIndices.empty()) {
+ SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
+ transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
});
Mask.swap(NewMask);
@@ -6429,52 +6567,27 @@ static bool isAlternateInstruction(const Instruction *I,
return I->getOpcode() == AltOp->getOpcode();
}
-TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> VL,
- unsigned OpIdx) {
- assert(!VL.empty());
- const auto *I0 = cast<Instruction>(*find_if(VL, Instruction::classof));
- const auto *Op0 = I0->getOperand(OpIdx);
+TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
+ assert(!Ops.empty());
+ const auto *Op0 = Ops.front();
- const bool IsConstant = all_of(VL, [&](Value *V) {
+ const bool IsConstant = all_of(Ops, [](Value *V) {
// TODO: We should allow undef elements here
- const auto *I = dyn_cast<Instruction>(V);
- if (!I)
- return true;
- auto *Op = I->getOperand(OpIdx);
- return isConstant(Op) && !isa<UndefValue>(Op);
+ return isConstant(V) && !isa<UndefValue>(V);
});
- const bool IsUniform = all_of(VL, [&](Value *V) {
+ const bool IsUniform = all_of(Ops, [=](Value *V) {
// TODO: We should allow undef elements here
- const auto *I = dyn_cast<Instruction>(V);
- if (!I)
- return false;
- return I->getOperand(OpIdx) == Op0;
+ return V == Op0;
});
- const bool IsPowerOfTwo = all_of(VL, [&](Value *V) {
+ const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
// TODO: We should allow undef elements here
- const auto *I = dyn_cast<Instruction>(V);
- if (!I) {
- assert((isa<UndefValue>(V) ||
- I0->getOpcode() == Instruction::GetElementPtr) &&
- "Expected undef or GEP.");
- return true;
- }
- auto *Op = I->getOperand(OpIdx);
- if (auto *CI = dyn_cast<ConstantInt>(Op))
+ if (auto *CI = dyn_cast<ConstantInt>(V))
return CI->getValue().isPowerOf2();
return false;
});
- const bool IsNegatedPowerOfTwo = all_of(VL, [&](Value *V) {
+ const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
// TODO: We should allow undef elements here
- const auto *I = dyn_cast<Instruction>(V);
- if (!I) {
- assert((isa<UndefValue>(V) ||
- I0->getOpcode() == Instruction::GetElementPtr) &&
- "Expected undef or GEP.");
- return true;
- }
- const auto *Op = I->getOperand(OpIdx);
- if (auto *CI = dyn_cast<ConstantInt>(Op))
+ if (auto *CI = dyn_cast<ConstantInt>(V))
return CI->getValue().isNegatedPowerOf2();
return false;
});
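The simplified getOperandInfo above classifies the already-extracted operand list directly; a small standalone sketch (plain C++, illustrative only, ignoring the undef handling of the real code) of the same classification over plain integers:

// Standalone sketch: constant / uniform / power-of-two classification.
#include <cstdio>
#include <vector>

struct OperandInfoSketch {
  bool IsUniform, IsPowerOfTwo, IsNegatedPowerOfTwo;
};

static bool isPow2(long long V) { return V > 0 && (V & (V - 1)) == 0; }

static OperandInfoSketch classify(const std::vector<long long> &Ops) {
  OperandInfoSketch R{true, true, true};
  for (long long V : Ops) {
    R.IsUniform &= V == Ops.front();          // all operands identical
    R.IsPowerOfTwo &= isPow2(V);              // e.g. 1, 2, 4, 8, ...
    R.IsNegatedPowerOfTwo &= isPow2(-V);      // e.g. -1, -2, -4, ...
  }
  return R;
}

int main() {
  OperandInfoSketch R = classify({4, 4, 4, 4});
  std::printf("uniform=%d pow2=%d negpow2=%d\n", R.IsUniform, R.IsPowerOfTwo,
              R.IsNegatedPowerOfTwo);
}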
@@ -6505,9 +6618,24 @@ protected:
bool IsStrict) {
int Limit = Mask.size();
int VF = VecTy->getNumElements();
- return (VF == Limit || !IsStrict) &&
- all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) &&
- ShuffleVectorInst::isIdentityMask(Mask);
+ int Index = -1;
+ if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
+ return true;
+ if (!IsStrict) {
+ // Consider extract subvector starting from index 0.
+ if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
+ Index == 0)
+ return true;
+ // All VF-size submasks are identity (e.g.
+ // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
+ if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
+ ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
+ return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
+ ShuffleVectorInst::isIdentityMask(Slice, VF);
+ }))
+ return true;
+ }
+ return false;
}
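A minimal standalone sketch (plain C++, not part of the patch) of the relaxed identity test added above: a mask is accepted when every VF-wide slice is either all poison (-1 here) or an identity selection from a VF-wide source.

// Standalone sketch: per-VF-submask identity check.
#include <cstdio>
#include <vector>

static bool allSubmasksIdentity(const std::vector<int> &Mask, int VF) {
  if (Mask.size() % VF != 0)
    return false;
  for (size_t Base = 0; Base < Mask.size(); Base += VF) {
    bool AllPoison = true, Identity = true;
    for (int I = 0; I < VF; ++I) {
      int Elt = Mask[Base + I];
      AllPoison &= Elt == -1;
      Identity &= Elt == -1 || Elt == I;      // non-poison lanes must be 0..VF-1 in order
    }
    if (!AllPoison && !Identity)
      return false;
  }
  return true;
}

int main() {
  // <poison,poison,poison,poison, 0,1,2,poison, poison,1,2,3> for VF 4.
  std::vector<int> Mask = {-1, -1, -1, -1, 0, 1, 2, -1, -1, 1, 2, 3};
  std::printf("%d\n", allSubmasksIdentity(Mask, 4)); // prints 1
}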
/// Tries to combine 2 different masks into single one.
@@ -6577,7 +6705,8 @@ protected:
if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
if (!IdentityOp || !SinglePermute ||
(isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
- !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask))) {
+ !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
+ IdentityMask.size()))) {
IdentityOp = SV;
 // Store current mask in the IdentityMask so later we do not lose
// this info if IdentityOp is selected as the best candidate for the
@@ -6647,7 +6776,7 @@ protected:
}
if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
!OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
- ShuffleVectorInst::isZeroEltSplatMask(Mask)) {
+ ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
if (IdentityOp) {
V = IdentityOp;
assert(Mask.size() == IdentityMask.size() &&
@@ -6663,7 +6792,7 @@ protected:
/*IsStrict=*/true) ||
(Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
Shuffle->isZeroEltSplat() &&
- ShuffleVectorInst::isZeroEltSplatMask(Mask)));
+ ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())));
}
V = Op;
return false;
@@ -6768,11 +6897,9 @@ protected:
CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
}
}
- const int Limit = CombinedMask1.size() * 2;
- if (Op1 == Op2 && Limit == 2 * VF &&
- all_of(CombinedMask1, [=](int Idx) { return Idx < Limit; }) &&
- (ShuffleVectorInst::isIdentityMask(CombinedMask1) ||
- (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1) &&
+ if (Op1 == Op2 &&
+ (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
+ (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
isa<ShuffleVectorInst>(Op1) &&
cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
ArrayRef(CombinedMask1))))
@@ -6807,10 +6934,29 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
const TargetTransformInfo &TTI;
InstructionCost Cost = 0;
- ArrayRef<Value *> VectorizedVals;
+ SmallDenseSet<Value *> VectorizedVals;
BoUpSLP &R;
SmallPtrSetImpl<Value *> &CheckedExtracts;
constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ /// While set, still trying to estimate the cost for the same nodes and we
+ /// can delay actual cost estimation (virtual shuffle instruction emission).
+ /// May help better estimate the cost if same nodes must be permuted + allows
+ /// to move most of the long shuffles cost estimation to TTI.
+ bool SameNodesEstimated = true;
+
+ static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
+ if (Ty->getScalarType()->isPointerTy()) {
+ Constant *Res = ConstantExpr::getIntToPtr(
+ ConstantInt::getAllOnesValue(
+ IntegerType::get(Ty->getContext(),
+ DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
+ Ty->getScalarType());
+ if (auto *VTy = dyn_cast<VectorType>(Ty))
+ Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
+ return Res;
+ }
+ return Constant::getAllOnesValue(Ty);
+ }
InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
if ((!Root && allConstant(VL)) || all_of(VL, UndefValue::classof))
@@ -6821,20 +6967,35 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
// Improve gather cost for gather of loads, if we can group some of the
// loads into vector loads.
InstructionsState S = getSameOpcode(VL, *R.TLI);
- if (VL.size() > 2 && S.getOpcode() == Instruction::Load &&
- !S.isAltShuffle() &&
+ const unsigned Sz = R.DL->getTypeSizeInBits(VL.front()->getType());
+ unsigned MinVF = R.getMinVF(2 * Sz);
+ if (VL.size() > 2 &&
+ ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
+ (InVectors.empty() &&
+ any_of(seq<unsigned>(0, VL.size() / MinVF),
+ [&](unsigned Idx) {
+ ArrayRef<Value *> SubVL = VL.slice(Idx * MinVF, MinVF);
+ InstructionsState S = getSameOpcode(SubVL, *R.TLI);
+ return S.getOpcode() == Instruction::Load &&
+ !S.isAltShuffle();
+ }))) &&
!all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) &&
!isSplat(Gathers)) {
- BoUpSLP::ValueSet VectorizedLoads;
+ SetVector<Value *> VectorizedLoads;
+ SmallVector<LoadInst *> VectorizedStarts;
+ SmallVector<std::pair<unsigned, unsigned>> ScatterVectorized;
unsigned StartIdx = 0;
unsigned VF = VL.size() / 2;
- unsigned VectorizedCnt = 0;
- unsigned ScatterVectorizeCnt = 0;
- const unsigned Sz = R.DL->getTypeSizeInBits(S.MainOp->getType());
- for (unsigned MinVF = R.getMinVF(2 * Sz); VF >= MinVF; VF /= 2) {
+ for (; VF >= MinVF; VF /= 2) {
for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
Cnt += VF) {
ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
+ if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) {
+ InstructionsState SliceS = getSameOpcode(Slice, *R.TLI);
+ if (SliceS.getOpcode() != Instruction::Load ||
+ SliceS.isAltShuffle())
+ continue;
+ }
if (!VectorizedLoads.count(Slice.front()) &&
!VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
SmallVector<Value *> PointerOps;
@@ -6845,12 +7006,14 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
switch (LS) {
case LoadsState::Vectorize:
case LoadsState::ScatterVectorize:
+ case LoadsState::PossibleStridedVectorize:
// Mark the vectorized loads so that we don't vectorize them
// again.
- if (LS == LoadsState::Vectorize)
- ++VectorizedCnt;
+ // TODO: better handling of loads with reorders.
+ if (LS == LoadsState::Vectorize && CurrentOrder.empty())
+ VectorizedStarts.push_back(cast<LoadInst>(Slice.front()));
else
- ++ScatterVectorizeCnt;
+ ScatterVectorized.emplace_back(Cnt, VF);
VectorizedLoads.insert(Slice.begin(), Slice.end());
// If we vectorized initial block, no need to try to vectorize
// it again.
@@ -6881,8 +7044,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
}
// Exclude potentially vectorized loads from list of gathered
// scalars.
- auto *LI = cast<LoadInst>(S.MainOp);
- Gathers.assign(Gathers.size(), PoisonValue::get(LI->getType()));
+ Gathers.assign(Gathers.size(), PoisonValue::get(VL.front()->getType()));
// The cost for vectorized loads.
InstructionCost ScalarsCost = 0;
for (Value *V : VectorizedLoads) {
@@ -6892,17 +7054,24 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
LI->getAlign(), LI->getPointerAddressSpace(),
CostKind, TTI::OperandValueInfo(), LI);
}
- auto *LoadTy = FixedVectorType::get(LI->getType(), VF);
- Align Alignment = LI->getAlign();
- GatherCost +=
- VectorizedCnt *
- TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
- LI->getPointerAddressSpace(), CostKind,
- TTI::OperandValueInfo(), LI);
- GatherCost += ScatterVectorizeCnt *
- TTI.getGatherScatterOpCost(
- Instruction::Load, LoadTy, LI->getPointerOperand(),
- /*VariableMask=*/false, Alignment, CostKind, LI);
+ auto *LoadTy = FixedVectorType::get(VL.front()->getType(), VF);
+ for (LoadInst *LI : VectorizedStarts) {
+ Align Alignment = LI->getAlign();
+ GatherCost +=
+ TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
+ LI->getPointerAddressSpace(), CostKind,
+ TTI::OperandValueInfo(), LI);
+ }
+ for (std::pair<unsigned, unsigned> P : ScatterVectorized) {
+ auto *LI0 = cast<LoadInst>(VL[P.first]);
+ Align CommonAlignment = LI0->getAlign();
+ for (Value *V : VL.slice(P.first + 1, VF - 1))
+ CommonAlignment =
+ std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
+ GatherCost += TTI.getGatherScatterOpCost(
+ Instruction::Load, LoadTy, LI0->getPointerOperand(),
+ /*VariableMask=*/false, CommonAlignment, CostKind, LI0);
+ }
if (NeedInsertSubvectorAnalysis) {
// Add the cost for the subvectors insert.
for (int I = VF, E = VL.size(); I < E; I += VF)
@@ -6938,77 +7107,137 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
: R.getGatherCost(Gathers, !Root && VL.equals(Gathers)));
};
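The gather-of-loads improvement above scans the bundle with a halving vector factor; a standalone sketch (plain C++, illustrative only; the CanVectorize callback stands in for canVectorizeLoads) of that scan pattern:

// Standalone sketch: try slices of size VF, halving VF, and shrink the window
// once the leading block has been covered.
#include <cstdio>
#include <functional>
#include <vector>

static void scanSlices(unsigned Size, unsigned MinVF,
                       const std::function<bool(unsigned, unsigned)> &CanVectorize) {
  unsigned StartIdx = 0;
  for (unsigned VF = Size / 2; VF >= MinVF; VF /= 2) {
    for (unsigned Cnt = StartIdx; Cnt + VF <= Size; Cnt += VF) {
      if (!CanVectorize(Cnt, VF))
        continue;
      std::printf("vectorize [%u, %u)\n", Cnt, Cnt + VF);
      if (Cnt == StartIdx)
        StartIdx += VF;         // leading block covered, smaller VFs skip it
    }
    if (StartIdx >= Size)
      break;                    // everything covered
  }
}

int main() {
  // Pretend only the first half of an 8-element bundle forms a vector load.
  scanSlices(8, 2, [](unsigned Cnt, unsigned VF) { return Cnt == 0 && VF == 4; });
}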
- /// Compute the cost of creating a vector of type \p VecTy containing the
- /// extracted values from \p VL.
- InstructionCost computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
- TTI::ShuffleKind ShuffleKind) {
- auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
- unsigned NumOfParts = TTI.getNumberOfParts(VecTy);
-
- if (ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc ||
- !NumOfParts || VecTy->getNumElements() < NumOfParts)
- return TTI.getShuffleCost(ShuffleKind, VecTy, Mask);
-
- bool AllConsecutive = true;
- unsigned EltsPerVector = VecTy->getNumElements() / NumOfParts;
- unsigned Idx = -1;
+ /// Compute the cost of creating a vector containing the extracted values from
+ /// \p VL.
+ InstructionCost
+ computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
+ ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
+ unsigned NumParts) {
+ assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
+ unsigned NumElts =
+ std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
+ auto *EE = dyn_cast<ExtractElementInst>(V);
+ if (!EE)
+ return Sz;
+ auto *VecTy = cast<FixedVectorType>(EE->getVectorOperandType());
+ return std::max(Sz, VecTy->getNumElements());
+ });
+ unsigned NumSrcRegs = TTI.getNumberOfParts(
+ FixedVectorType::get(VL.front()->getType(), NumElts));
+ if (NumSrcRegs == 0)
+ NumSrcRegs = 1;
+ // FIXME: this must be moved to TTI for better estimation.
+ unsigned EltsPerVector = PowerOf2Ceil(std::max(
+ divideCeil(VL.size(), NumParts), divideCeil(NumElts, NumSrcRegs)));
+ auto CheckPerRegistersShuffle =
+ [&](MutableArrayRef<int> Mask) -> std::optional<TTI::ShuffleKind> {
+ DenseSet<int> RegIndices;
+ // Check whether we are trying to permute the same single or 2 input vectors.
+ TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
+ int FirstRegId = -1;
+ for (int &I : Mask) {
+ if (I == PoisonMaskElem)
+ continue;
+ int RegId = (I / NumElts) * NumParts + (I % NumElts) / EltsPerVector;
+ if (FirstRegId < 0)
+ FirstRegId = RegId;
+ RegIndices.insert(RegId);
+ if (RegIndices.size() > 2)
+ return std::nullopt;
+ if (RegIndices.size() == 2)
+ ShuffleKind = TTI::SK_PermuteTwoSrc;
+ I = (I % NumElts) % EltsPerVector +
+ (RegId == FirstRegId ? 0 : EltsPerVector);
+ }
+ return ShuffleKind;
+ };
InstructionCost Cost = 0;
// Process extracts in blocks of EltsPerVector to check if the source vector
// operand can be re-used directly. If not, add the cost of creating a
// shuffle to extract the values into a vector register.
- SmallVector<int> RegMask(EltsPerVector, PoisonMaskElem);
- for (auto *V : VL) {
- ++Idx;
-
- // Reached the start of a new vector registers.
- if (Idx % EltsPerVector == 0) {
- RegMask.assign(EltsPerVector, PoisonMaskElem);
- AllConsecutive = true;
+ for (unsigned Part = 0; Part < NumParts; ++Part) {
+ if (!ShuffleKinds[Part])
continue;
- }
-
- // Need to exclude undefs from analysis.
- if (isa<UndefValue>(V) || Mask[Idx] == PoisonMaskElem)
+ ArrayRef<int> MaskSlice =
+ Mask.slice(Part * EltsPerVector,
+ (Part == NumParts - 1 && Mask.size() % EltsPerVector != 0)
+ ? Mask.size() % EltsPerVector
+ : EltsPerVector);
+ SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
+ copy(MaskSlice, SubMask.begin());
+ std::optional<TTI::ShuffleKind> RegShuffleKind =
+ CheckPerRegistersShuffle(SubMask);
+ if (!RegShuffleKind) {
+ Cost += TTI.getShuffleCost(
+ *ShuffleKinds[Part],
+ FixedVectorType::get(VL.front()->getType(), NumElts), MaskSlice);
continue;
-
- // Check all extracts for a vector register on the target directly
- // extract values in order.
- unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V));
- if (!isa<UndefValue>(VL[Idx - 1]) && Mask[Idx - 1] != PoisonMaskElem) {
- unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1]));
- AllConsecutive &= PrevIdx + 1 == CurrentIdx &&
- CurrentIdx % EltsPerVector == Idx % EltsPerVector;
- RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector;
}
-
- if (AllConsecutive)
- continue;
-
- // Skip all indices, except for the last index per vector block.
- if ((Idx + 1) % EltsPerVector != 0 && Idx + 1 != VL.size())
- continue;
-
- // If we have a series of extracts which are not consecutive and hence
- // cannot re-use the source vector register directly, compute the shuffle
- // cost to extract the vector with EltsPerVector elements.
- Cost += TTI.getShuffleCost(
- TargetTransformInfo::SK_PermuteSingleSrc,
- FixedVectorType::get(VecTy->getElementType(), EltsPerVector),
- RegMask);
+ if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
+ !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
+ Cost += TTI.getShuffleCost(
+ *RegShuffleKind,
+ FixedVectorType::get(VL.front()->getType(), EltsPerVector),
+ SubMask);
+ }
}
return Cost;
}
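A standalone sketch (plain C++, not part of the patch; names are illustrative) of the per-register remapping done by CheckPerRegistersShuffle above: each mask element is mapped to the register it lives in, and if at most two registers are touched the element is rewritten as an offset into a single- or two-register shuffle.

// Standalone sketch: remap a mask slice to per-register indices.
#include <cstdio>
#include <optional>
#include <set>
#include <vector>

// Returns 1 for a single-source, 2 for a two-source per-register shuffle,
// std::nullopt if more than two registers are involved.
static std::optional<int>
remapPerRegister(std::vector<int> &Mask, unsigned NumElts, unsigned NumParts,
                 unsigned EltsPerVector) {
  std::set<int> Regs;
  int FirstReg = -1, Kind = 1;
  for (int &I : Mask) {
    if (I == -1)
      continue;                                  // poison lane
    int RegId = (I / NumElts) * NumParts + (I % NumElts) / EltsPerVector;
    if (FirstReg < 0)
      FirstReg = RegId;
    Regs.insert(RegId);
    if (Regs.size() > 2)
      return std::nullopt;                       // needs more than two registers
    if (Regs.size() == 2)
      Kind = 2;
    I = (I % NumElts) % EltsPerVector + (RegId == FirstReg ? 0 : EltsPerVector);
  }
  return Kind;
}

int main() {
  // 16-wide source split into 4 registers of 4 elements; lanes 5 and 13 live
  // in registers 1 and 3, so this becomes a two-source register shuffle.
  std::vector<int> Mask = {5, 13, -1, -1};
  auto Kind = remapPerRegister(Mask, 16, 4, 4);
  std::printf("kind=%d mask=%d,%d\n", Kind.value_or(0), Mask[0], Mask[1]);
}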
+ /// Transforms mask \p CommonMask per given \p Mask to make proper set after
+ /// shuffle emission.
+ static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
+ ArrayRef<int> Mask) {
+ for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
+ if (Mask[Idx] != PoisonMaskElem)
+ CommonMask[Idx] = Idx;
+ }
+ /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
+ /// mask \p Mask, register number \p Part, that includes \p SliceSize
+ /// elements.
+ void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
+ ArrayRef<int> Mask, unsigned Part,
+ unsigned SliceSize) {
+ if (SameNodesEstimated) {
+ // Delay the cost estimation if the same nodes are reshuffling.
+ // If we already requested the cost of reshuffling of E1 and E2 before, no
+ // need to estimate another cost with the sub-Mask, instead include this
+ // sub-Mask into the CommonMask to estimate it later and avoid double cost
+ // estimation.
+ if ((InVectors.size() == 2 &&
+ InVectors.front().get<const TreeEntry *>() == &E1 &&
+ InVectors.back().get<const TreeEntry *>() == E2) ||
+ (!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
+ assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, SliceSize),
+ [](int Idx) { return Idx == PoisonMaskElem; }) &&
+ "Expected all poisoned elements.");
+ ArrayRef<int> SubMask =
+ ArrayRef(Mask).slice(Part * SliceSize, SliceSize);
+ copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
+ return;
+ }
+ // Found non-matching nodes - need to estimate the cost for the matched
+ // nodes and transform the mask.
+ Cost += createShuffle(InVectors.front(),
+ InVectors.size() == 1 ? nullptr : InVectors.back(),
+ CommonMask);
+ transformMaskAfterShuffle(CommonMask, CommonMask);
+ }
+ SameNodesEstimated = false;
+ Cost += createShuffle(&E1, E2, Mask);
+ transformMaskAfterShuffle(CommonMask, Mask);
+ }
class ShuffleCostBuilder {
const TargetTransformInfo &TTI;
static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
- int Limit = 2 * VF;
+ int Index = -1;
return Mask.empty() ||
(VF == Mask.size() &&
- all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) &&
- ShuffleVectorInst::isIdentityMask(Mask));
+ ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
+ (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
+ Index == 0);
}
public:
@@ -7021,21 +7250,17 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
if (isEmptyOrIdentity(Mask, VF))
return TTI::TCC_Free;
- return TTI.getShuffleCost(
- TTI::SK_PermuteTwoSrc,
- FixedVectorType::get(
- cast<VectorType>(V1->getType())->getElementType(), Mask.size()),
- Mask);
+ return TTI.getShuffleCost(TTI::SK_PermuteTwoSrc,
+ cast<VectorType>(V1->getType()), Mask);
}
InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
// Empty mask or identity mask are free.
- if (isEmptyOrIdentity(Mask, Mask.size()))
+ unsigned VF =
+ cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
+ if (isEmptyOrIdentity(Mask, VF))
return TTI::TCC_Free;
- return TTI.getShuffleCost(
- TTI::SK_PermuteSingleSrc,
- FixedVectorType::get(
- cast<VectorType>(V1->getType())->getElementType(), Mask.size()),
- Mask);
+ return TTI.getShuffleCost(TTI::SK_PermuteSingleSrc,
+ cast<VectorType>(V1->getType()), Mask);
}
InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
InstructionCost createPoison(Type *Ty, unsigned VF) const {
@@ -7052,139 +7277,226 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
const PointerUnion<Value *, const TreeEntry *> &P2,
ArrayRef<int> Mask) {
ShuffleCostBuilder Builder(TTI);
+ SmallVector<int> CommonMask(Mask.begin(), Mask.end());
Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
- unsigned CommonVF = 0;
- if (!V1) {
+ unsigned CommonVF = Mask.size();
+ if (!V1 && !V2 && !P2.isNull()) {
+ // Shuffle 2 entry nodes.
const TreeEntry *E = P1.get<const TreeEntry *>();
unsigned VF = E->getVectorFactor();
- if (V2) {
- unsigned V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
- if (V2VF != VF && V2VF == E->Scalars.size())
- VF = E->Scalars.size();
- } else if (!P2.isNull()) {
- const TreeEntry *E2 = P2.get<const TreeEntry *>();
- if (E->Scalars.size() == E2->Scalars.size())
- CommonVF = VF = E->Scalars.size();
- } else {
- // P2 is empty, check that we have same node + reshuffle (if any).
- if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
- VF = E->Scalars.size();
- SmallVector<int> CommonMask(Mask.begin(), Mask.end());
- ::addMask(CommonMask, E->getCommonMask());
- V1 = Constant::getNullValue(
- FixedVectorType::get(E->Scalars.front()->getType(), VF));
- return BaseShuffleAnalysis::createShuffle<InstructionCost>(
- V1, nullptr, CommonMask, Builder);
+ const TreeEntry *E2 = P2.get<const TreeEntry *>();
+ CommonVF = std::max(VF, E2->getVectorFactor());
+ assert(all_of(Mask,
+ [=](int Idx) {
+ return Idx < 2 * static_cast<int>(CommonVF);
+ }) &&
+ "All elements in mask must be less than 2 * CommonVF.");
+ if (E->Scalars.size() == E2->Scalars.size()) {
+ SmallVector<int> EMask = E->getCommonMask();
+ SmallVector<int> E2Mask = E2->getCommonMask();
+ if (!EMask.empty() || !E2Mask.empty()) {
+ for (int &Idx : CommonMask) {
+ if (Idx == PoisonMaskElem)
+ continue;
+ if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
+ Idx = EMask[Idx];
+ else if (Idx >= static_cast<int>(CommonVF))
+ Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
+ E->Scalars.size();
+ }
}
+ CommonVF = E->Scalars.size();
}
V1 = Constant::getNullValue(
- FixedVectorType::get(E->Scalars.front()->getType(), VF));
- }
- if (!V2 && !P2.isNull()) {
- const TreeEntry *E = P2.get<const TreeEntry *>();
+ FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
+ V2 = getAllOnesValue(
+ *R.DL, FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
+ } else if (!V1 && P2.isNull()) {
+ // Shuffle single entry node.
+ const TreeEntry *E = P1.get<const TreeEntry *>();
unsigned VF = E->getVectorFactor();
- unsigned V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
- if (!CommonVF && V1VF == E->Scalars.size())
+ CommonVF = VF;
+ assert(
+ all_of(Mask,
+ [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
+ "All elements in mask must be less than CommonVF.");
+ if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
+ SmallVector<int> EMask = E->getCommonMask();
+ assert(!EMask.empty() && "Expected non-empty common mask.");
+ for (int &Idx : CommonMask) {
+ if (Idx != PoisonMaskElem)
+ Idx = EMask[Idx];
+ }
CommonVF = E->Scalars.size();
- if (CommonVF)
- VF = CommonVF;
- V2 = Constant::getNullValue(
- FixedVectorType::get(E->Scalars.front()->getType(), VF));
- }
- return BaseShuffleAnalysis::createShuffle<InstructionCost>(V1, V2, Mask,
- Builder);
+ }
+ V1 = Constant::getNullValue(
+ FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
+ } else if (V1 && P2.isNull()) {
+ // Shuffle single vector.
+ CommonVF = cast<FixedVectorType>(V1->getType())->getNumElements();
+ assert(
+ all_of(Mask,
+ [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
+ "All elements in mask must be less than CommonVF.");
+ } else if (V1 && !V2) {
+ // Shuffle vector and tree node.
+ unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
+ const TreeEntry *E2 = P2.get<const TreeEntry *>();
+ CommonVF = std::max(VF, E2->getVectorFactor());
+ assert(all_of(Mask,
+ [=](int Idx) {
+ return Idx < 2 * static_cast<int>(CommonVF);
+ }) &&
+ "All elements in mask must be less than 2 * CommonVF.");
+ if (E2->Scalars.size() == VF && VF != CommonVF) {
+ SmallVector<int> E2Mask = E2->getCommonMask();
+ assert(!E2Mask.empty() && "Expected non-empty common mask.");
+ for (int &Idx : CommonMask) {
+ if (Idx == PoisonMaskElem)
+ continue;
+ if (Idx >= static_cast<int>(CommonVF))
+ Idx = E2Mask[Idx - CommonVF] + VF;
+ }
+ CommonVF = VF;
+ }
+ V1 = Constant::getNullValue(
+ FixedVectorType::get(E2->Scalars.front()->getType(), CommonVF));
+ V2 = getAllOnesValue(
+ *R.DL,
+ FixedVectorType::get(E2->Scalars.front()->getType(), CommonVF));
+ } else if (!V1 && V2) {
+ // Shuffle vector and tree node.
+ unsigned VF = cast<FixedVectorType>(V2->getType())->getNumElements();
+ const TreeEntry *E1 = P1.get<const TreeEntry *>();
+ CommonVF = std::max(VF, E1->getVectorFactor());
+ assert(all_of(Mask,
+ [=](int Idx) {
+ return Idx < 2 * static_cast<int>(CommonVF);
+ }) &&
+ "All elements in mask must be less than 2 * CommonVF.");
+ if (E1->Scalars.size() == VF && VF != CommonVF) {
+ SmallVector<int> E1Mask = E1->getCommonMask();
+ assert(!E1Mask.empty() && "Expected non-empty common mask.");
+ for (int &Idx : CommonMask) {
+ if (Idx == PoisonMaskElem)
+ continue;
+ if (Idx >= static_cast<int>(CommonVF))
+ Idx = E1Mask[Idx - CommonVF] + VF;
+ }
+ CommonVF = VF;
+ }
+ V1 = Constant::getNullValue(
+ FixedVectorType::get(E1->Scalars.front()->getType(), CommonVF));
+ V2 = getAllOnesValue(
+ *R.DL,
+ FixedVectorType::get(E1->Scalars.front()->getType(), CommonVF));
+ } else {
+ assert(V1 && V2 && "Expected both vectors.");
+ unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
+ CommonVF =
+ std::max(VF, cast<FixedVectorType>(V2->getType())->getNumElements());
+ assert(all_of(Mask,
+ [=](int Idx) {
+ return Idx < 2 * static_cast<int>(CommonVF);
+ }) &&
+ "All elements in mask must be less than 2 * CommonVF.");
+ if (V1->getType() != V2->getType()) {
+ V1 = Constant::getNullValue(FixedVectorType::get(
+ cast<FixedVectorType>(V1->getType())->getElementType(), CommonVF));
+ V2 = getAllOnesValue(
+ *R.DL, FixedVectorType::get(
+ cast<FixedVectorType>(V1->getType())->getElementType(),
+ CommonVF));
+ }
+ }
+ InVectors.front() = Constant::getNullValue(FixedVectorType::get(
+ cast<FixedVectorType>(V1->getType())->getElementType(),
+ CommonMask.size()));
+ if (InVectors.size() == 2)
+ InVectors.pop_back();
+ return BaseShuffleAnalysis::createShuffle<InstructionCost>(
+ V1, V2, CommonMask, Builder);
}
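The cases above repeatedly compose a node's own reuse mask into the outer mask when a tree entry's vector factor differs from its number of scalars; a standalone sketch (plain C++, illustrative only) of that composition step:

// Standalone sketch: apply per-entry reuse masks to an outer two-source mask.
#include <cstdio>
#include <vector>

static void composeCommonMask(std::vector<int> &Outer, unsigned CommonVF,
                              const std::vector<int> &E1Mask,
                              const std::vector<int> &E2Mask,
                              unsigned E1Scalars) {
  for (int &Idx : Outer) {
    if (Idx == -1)
      continue;                                       // poison lane
    if (Idx < static_cast<int>(CommonVF) && !E1Mask.empty())
      Idx = E1Mask[Idx];                              // first source remap
    else if (Idx >= static_cast<int>(CommonVF))
      Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
            E1Scalars;                                // second source, rebased
  }
}

int main() {
  // Two 4-scalar entries whose vector factor is 8 because of reused lanes; the
  // outer mask <0,9,2,11> picks lanes from both after applying their masks.
  std::vector<int> Outer = {0, 9, 2, 11};
  composeCommonMask(Outer, 8, {0, 0, 1, 1, 2, 2, 3, 3},
                    {0, 0, 1, 1, 2, 2, 3, 3}, 4);
  std::printf("%d %d %d %d\n", Outer[0], Outer[1], Outer[2], Outer[3]);
}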
public:
ShuffleCostEstimator(TargetTransformInfo &TTI,
ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
SmallPtrSetImpl<Value *> &CheckedExtracts)
- : TTI(TTI), VectorizedVals(VectorizedVals), R(R),
- CheckedExtracts(CheckedExtracts) {}
- Value *adjustExtracts(const TreeEntry *E, ArrayRef<int> Mask,
- TTI::ShuffleKind ShuffleKind) {
+ : TTI(TTI), VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()),
+ R(R), CheckedExtracts(CheckedExtracts) {}
+ Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
+ ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
+ unsigned NumParts, bool &UseVecBaseAsInput) {
+ UseVecBaseAsInput = false;
if (Mask.empty())
return nullptr;
Value *VecBase = nullptr;
ArrayRef<Value *> VL = E->Scalars;
- auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
// If the resulting type is scalarized, do not adjust the cost.
- unsigned VecNumParts = TTI.getNumberOfParts(VecTy);
- if (VecNumParts == VecTy->getNumElements())
+ if (NumParts == VL.size())
return nullptr;
- DenseMap<Value *, int> ExtractVectorsTys;
- for (auto [I, V] : enumerate(VL)) {
- // Ignore non-extractelement scalars.
- if (isa<UndefValue>(V) || (!Mask.empty() && Mask[I] == PoisonMaskElem))
- continue;
- // If all users of instruction are going to be vectorized and this
- // instruction itself is not going to be vectorized, consider this
- // instruction as dead and remove its cost from the final cost of the
- // vectorized tree.
- // Also, avoid adjusting the cost for extractelements with multiple uses
- // in different graph entries.
- const TreeEntry *VE = R.getTreeEntry(V);
- if (!CheckedExtracts.insert(V).second ||
- !R.areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) ||
- (VE && VE != E))
- continue;
- auto *EE = cast<ExtractElementInst>(V);
- VecBase = EE->getVectorOperand();
- std::optional<unsigned> EEIdx = getExtractIndex(EE);
- if (!EEIdx)
- continue;
- unsigned Idx = *EEIdx;
- if (VecNumParts != TTI.getNumberOfParts(EE->getVectorOperandType())) {
- auto It =
- ExtractVectorsTys.try_emplace(EE->getVectorOperand(), Idx).first;
- It->getSecond() = std::min<int>(It->second, Idx);
- }
- // Take credit for instruction that will become dead.
- if (EE->hasOneUse()) {
- Instruction *Ext = EE->user_back();
- if (isa<SExtInst, ZExtInst>(Ext) && all_of(Ext->users(), [](User *U) {
- return isa<GetElementPtrInst>(U);
- })) {
- // Use getExtractWithExtendCost() to calculate the cost of
- // extractelement/ext pair.
- Cost -= TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
- EE->getVectorOperandType(), Idx);
- // Add back the cost of s|zext which is subtracted separately.
- Cost += TTI.getCastInstrCost(
- Ext->getOpcode(), Ext->getType(), EE->getType(),
- TTI::getCastContextHint(Ext), CostKind, Ext);
+ // Check if the node can be considered reused if the same extractelements
+ // were already vectorized.
+ bool PrevNodeFound = any_of(
+ ArrayRef(R.VectorizableTree).take_front(E->Idx),
+ [&](const std::unique_ptr<TreeEntry> &TE) {
+ return ((!TE->isAltShuffle() &&
+ TE->getOpcode() == Instruction::ExtractElement) ||
+ TE->State == TreeEntry::NeedToGather) &&
+ all_of(enumerate(TE->Scalars), [&](auto &&Data) {
+ return VL.size() > Data.index() &&
+ (Mask[Data.index()] == PoisonMaskElem ||
+ isa<UndefValue>(VL[Data.index()]) ||
+ Data.value() == VL[Data.index()]);
+ });
+ });
+ SmallPtrSet<Value *, 4> UniqueBases;
+ unsigned SliceSize = VL.size() / NumParts;
+ for (unsigned Part = 0; Part < NumParts; ++Part) {
+ ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
+ for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, SliceSize))) {
+ // Ignore non-extractelement scalars.
+ if (isa<UndefValue>(V) ||
+ (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
continue;
- }
- }
- Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(), CostKind,
- Idx);
- }
- // Add a cost for subvector extracts/inserts if required.
- for (const auto &Data : ExtractVectorsTys) {
- auto *EEVTy = cast<FixedVectorType>(Data.first->getType());
- unsigned NumElts = VecTy->getNumElements();
- if (Data.second % NumElts == 0)
- continue;
- if (TTI.getNumberOfParts(EEVTy) > VecNumParts) {
- unsigned Idx = (Data.second / NumElts) * NumElts;
- unsigned EENumElts = EEVTy->getNumElements();
- if (Idx % NumElts == 0)
+ // If all users of instruction are going to be vectorized and this
+ // instruction itself is not going to be vectorized, consider this
+ // instruction as dead and remove its cost from the final cost of the
+ // vectorized tree.
+ // Also, avoid adjusting the cost for extractelements with multiple uses
+ // in different graph entries.
+ auto *EE = cast<ExtractElementInst>(V);
+ VecBase = EE->getVectorOperand();
+ UniqueBases.insert(VecBase);
+ const TreeEntry *VE = R.getTreeEntry(V);
+ if (!CheckedExtracts.insert(V).second ||
+ !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
+ (VE && VE != E))
continue;
- if (Idx + NumElts <= EENumElts) {
- Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
- EEVTy, std::nullopt, CostKind, Idx, VecTy);
- } else {
- // Need to round up the subvector type vectorization factor to avoid a
- // crash in cost model functions. Make SubVT so that Idx + VF of SubVT
- // <= EENumElts.
- auto *SubVT =
- FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx);
- Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
- EEVTy, std::nullopt, CostKind, Idx, SubVT);
+ std::optional<unsigned> EEIdx = getExtractIndex(EE);
+ if (!EEIdx)
+ continue;
+ unsigned Idx = *EEIdx;
+ // Take credit for instruction that will become dead.
+ if (EE->hasOneUse() || !PrevNodeFound) {
+ Instruction *Ext = EE->user_back();
+ if (isa<SExtInst, ZExtInst>(Ext) && all_of(Ext->users(), [](User *U) {
+ return isa<GetElementPtrInst>(U);
+ })) {
+ // Use getExtractWithExtendCost() to calculate the cost of
+ // extractelement/ext pair.
+ Cost -=
+ TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
+ EE->getVectorOperandType(), Idx);
+ // Add back the cost of s|zext which is subtracted separately.
+ Cost += TTI.getCastInstrCost(
+ Ext->getOpcode(), Ext->getType(), EE->getType(),
+ TTI::getCastContextHint(Ext), CostKind, Ext);
+ continue;
+ }
}
- } else {
- Cost += TTI.getShuffleCost(TargetTransformInfo::SK_InsertSubvector,
- VecTy, std::nullopt, CostKind, 0, EEVTy);
+ Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
+ CostKind, Idx);
}
}
// Check that gather of extractelements can be represented as just a
@@ -7192,31 +7504,152 @@ public:
// Found the bunch of extractelement instructions that must be gathered
// into a vector and can be represented as a permutation elements in a
// single input vector or of 2 input vectors.
- Cost += computeExtractCost(VL, Mask, ShuffleKind);
+ // Done for reused if same extractelements were vectorized already.
+ if (!PrevNodeFound)
+ Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
+ InVectors.assign(1, E);
+ CommonMask.assign(Mask.begin(), Mask.end());
+ transformMaskAfterShuffle(CommonMask, CommonMask);
+ SameNodesEstimated = false;
+ if (NumParts != 1 && UniqueBases.size() != 1) {
+ UseVecBaseAsInput = true;
+ VecBase = Constant::getNullValue(
+ FixedVectorType::get(VL.front()->getType(), CommonMask.size()));
+ }
return VecBase;
}
- void add(const TreeEntry *E1, const TreeEntry *E2, ArrayRef<int> Mask) {
- CommonMask.assign(Mask.begin(), Mask.end());
- InVectors.assign({E1, E2});
+ /// Checks if the specified entry \p E needs to be delayed because of its
+ /// dependency nodes.
+ std::optional<InstructionCost>
+ needToDelay(const TreeEntry *,
+ ArrayRef<SmallVector<const TreeEntry *>>) const {
+ // No need to delay the cost estimation during analysis.
+ return std::nullopt;
}
- void add(const TreeEntry *E1, ArrayRef<int> Mask) {
- CommonMask.assign(Mask.begin(), Mask.end());
- InVectors.assign(1, E1);
+ void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
+ if (&E1 == &E2) {
+ assert(all_of(Mask,
+ [&](int Idx) {
+ return Idx < static_cast<int>(E1.getVectorFactor());
+ }) &&
+ "Expected single vector shuffle mask.");
+ add(E1, Mask);
+ return;
+ }
+ if (InVectors.empty()) {
+ CommonMask.assign(Mask.begin(), Mask.end());
+ InVectors.assign({&E1, &E2});
+ return;
+ }
+ assert(!CommonMask.empty() && "Expected non-empty common mask.");
+ auto *MaskVecTy =
+ FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size());
+ unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
+ if (NumParts == 0 || NumParts >= Mask.size())
+ NumParts = 1;
+ unsigned SliceSize = Mask.size() / NumParts;
+ const auto *It =
+ find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
+ unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
+ estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
+ }
+ void add(const TreeEntry &E1, ArrayRef<int> Mask) {
+ if (InVectors.empty()) {
+ CommonMask.assign(Mask.begin(), Mask.end());
+ InVectors.assign(1, &E1);
+ return;
+ }
+ assert(!CommonMask.empty() && "Expected non-empty common mask.");
+ auto *MaskVecTy =
+ FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size());
+ unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
+ if (NumParts == 0 || NumParts >= Mask.size())
+ NumParts = 1;
+ unsigned SliceSize = Mask.size() / NumParts;
+ const auto *It =
+ find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
+ unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
+ estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
+ if (!SameNodesEstimated && InVectors.size() == 1)
+ InVectors.emplace_back(&E1);
+ }
+ /// Adds 2 input vectors and the mask for their shuffling.
+ void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
+ // May come only for shuffling of 2 vectors with extractelements, already
+ // handled in adjustExtracts.
+ assert(InVectors.size() == 1 &&
+ all_of(enumerate(CommonMask),
+ [&](auto P) {
+ if (P.value() == PoisonMaskElem)
+ return Mask[P.index()] == PoisonMaskElem;
+ auto *EI =
+ cast<ExtractElementInst>(InVectors.front()
+ .get<const TreeEntry *>()
+ ->Scalars[P.index()]);
+ return EI->getVectorOperand() == V1 ||
+ EI->getVectorOperand() == V2;
+ }) &&
+ "Expected extractelement vectors.");
}
/// Adds another one input vector and the mask for the shuffling.
- void add(Value *V1, ArrayRef<int> Mask) {
- assert(CommonMask.empty() && InVectors.empty() &&
- "Expected empty input mask/vectors.");
- CommonMask.assign(Mask.begin(), Mask.end());
- InVectors.assign(1, V1);
+ void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
+ if (InVectors.empty()) {
+ assert(CommonMask.empty() && !ForExtracts &&
+ "Expected empty input mask/vectors.");
+ CommonMask.assign(Mask.begin(), Mask.end());
+ InVectors.assign(1, V1);
+ return;
+ }
+ if (ForExtracts) {
+ // No need to add vectors here, already handled them in adjustExtracts.
+ assert(InVectors.size() == 1 &&
+ InVectors.front().is<const TreeEntry *>() && !CommonMask.empty() &&
+ all_of(enumerate(CommonMask),
+ [&](auto P) {
+ Value *Scalar = InVectors.front()
+ .get<const TreeEntry *>()
+ ->Scalars[P.index()];
+ if (P.value() == PoisonMaskElem)
+ return P.value() == Mask[P.index()] ||
+ isa<UndefValue>(Scalar);
+ if (isa<Constant>(V1))
+ return true;
+ auto *EI = cast<ExtractElementInst>(Scalar);
+ return EI->getVectorOperand() == V1;
+ }) &&
+ "Expected only tree entry for extractelement vectors.");
+ return;
+ }
+ assert(!InVectors.empty() && !CommonMask.empty() &&
+ "Expected only tree entries from extracts/reused buildvectors.");
+ unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
+ if (InVectors.size() == 2) {
+ Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
+ transformMaskAfterShuffle(CommonMask, CommonMask);
+ VF = std::max<unsigned>(VF, CommonMask.size());
+ } else if (const auto *InTE =
+ InVectors.front().dyn_cast<const TreeEntry *>()) {
+ VF = std::max(VF, InTE->getVectorFactor());
+ } else {
+ VF = std::max(
+ VF, cast<FixedVectorType>(InVectors.front().get<Value *>()->getType())
+ ->getNumElements());
+ }
+ InVectors.push_back(V1);
+ for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
+ if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
+ CommonMask[Idx] = Mask[Idx] + VF;
}
- Value *gather(ArrayRef<Value *> VL, Value *Root = nullptr) {
+ Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
+ Value *Root = nullptr) {
Cost += getBuildVectorCost(VL, Root);
if (!Root) {
- assert(InVectors.empty() && "Unexpected input vectors for buildvector.");
// FIXME: Need to find a way to avoid use of getNullValue here.
SmallVector<Constant *> Vals;
- for (Value *V : VL) {
+ unsigned VF = VL.size();
+ if (MaskVF != 0)
+ VF = std::min(VF, MaskVF);
+ for (Value *V : VL.take_front(VF)) {
if (isa<UndefValue>(V)) {
Vals.push_back(cast<Constant>(V));
continue;
@@ -7226,9 +7659,11 @@ public:
return ConstantVector::get(Vals);
}
return ConstantVector::getSplat(
- ElementCount::getFixed(VL.size()),
- Constant::getNullValue(VL.front()->getType()));
+ ElementCount::getFixed(
+ cast<FixedVectorType>(Root->getType())->getNumElements()),
+ getAllOnesValue(*R.DL, VL.front()->getType()));
}
+ InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
/// Finalize emission of the shuffles.
InstructionCost
finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
@@ -7236,31 +7671,24 @@ public:
IsFinalized = true;
if (Action) {
const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
- if (InVectors.size() == 2) {
+ if (InVectors.size() == 2)
Cost += createShuffle(Vec, InVectors.back(), CommonMask);
- InVectors.pop_back();
- } else {
+ else
Cost += createShuffle(Vec, nullptr, CommonMask);
- }
for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
if (CommonMask[Idx] != PoisonMaskElem)
CommonMask[Idx] = Idx;
assert(VF > 0 &&
"Expected vector length for the final value before action.");
- Value *V = Vec.dyn_cast<Value *>();
- if (!Vec.isNull() && !V)
- V = Constant::getNullValue(FixedVectorType::get(
- Vec.get<const TreeEntry *>()->Scalars.front()->getType(),
- CommonMask.size()));
+ Value *V = Vec.get<Value *>();
Action(V, CommonMask);
+ InVectors.front() = V;
}
::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
- if (CommonMask.empty())
- return Cost;
- int Limit = CommonMask.size() * 2;
- if (all_of(CommonMask, [=](int Idx) { return Idx < Limit; }) &&
- ShuffleVectorInst::isIdentityMask(CommonMask))
+ if (CommonMask.empty()) {
+ assert(InVectors.size() == 1 && "Expected only one vector with no mask");
return Cost;
+ }
return Cost +
createShuffle(InVectors.front(),
InVectors.size() == 2 ? InVectors.back() : nullptr,
@@ -7273,28 +7701,63 @@ public:
}
};
+const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
+ unsigned Idx) const {
+ Value *Op = E->getOperand(Idx).front();
+ if (const TreeEntry *TE = getTreeEntry(Op)) {
+ if (find_if(E->UserTreeIndices, [&](const EdgeInfo &EI) {
+ return EI.EdgeIdx == Idx && EI.UserTE == E;
+ }) != TE->UserTreeIndices.end())
+ return TE;
+ auto MIt = MultiNodeScalars.find(Op);
+ if (MIt != MultiNodeScalars.end()) {
+ for (const TreeEntry *TE : MIt->second) {
+ if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
+ return EI.EdgeIdx == Idx && EI.UserTE == E;
+ }) != TE->UserTreeIndices.end())
+ return TE;
+ }
+ }
+ }
+ const auto *It =
+ find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
+ return TE->State == TreeEntry::NeedToGather &&
+ find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
+ return EI.EdgeIdx == Idx && EI.UserTE == E;
+ }) != TE->UserTreeIndices.end();
+ });
+ assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
+ return It->get();
+}
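+
A standalone sketch (plain C++, not part of the patch; the structures are simplified stand-ins for TreeEntry and EdgeInfo) of the edge matching getOperandEntry relies on: the operand entry for slot Idx of node E is the node whose recorded user edges contain the pair (user == E, edge index == Idx).

// Standalone sketch: find the node feeding a given operand slot of a user node.
#include <cstdio>
#include <utility>
#include <vector>

struct NodeSketch {
  int Id;
  // (user node id, operand index on that user) pairs, like UserTreeIndices.
  std::vector<std::pair<int, unsigned>> UserEdges;
};

static const NodeSketch *findOperandNode(const std::vector<NodeSketch> &Nodes,
                                         int UserId, unsigned OpIdx) {
  for (const NodeSketch &N : Nodes)
    for (const auto &[User, Edge] : N.UserEdges)
      if (User == UserId && Edge == OpIdx)
        return &N;
  return nullptr;
}

int main() {
  // Node 1 feeds operand 0 of node 0; node 2 feeds operand 1 of node 0.
  std::vector<NodeSketch> Nodes = {{0, {}}, {1, {{0, 0}}}, {2, {{0, 1}}}};
  const NodeSketch *Op1 = findOperandNode(Nodes, /*UserId=*/0, /*OpIdx=*/1);
  std::printf("operand 1 of node 0 is node %d\n", Op1 ? Op1->Id : -1);
}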
+
InstructionCost
BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
SmallPtrSetImpl<Value *> &CheckedExtracts) {
ArrayRef<Value *> VL = E->Scalars;
Type *ScalarTy = VL[0]->getType();
- if (auto *SI = dyn_cast<StoreInst>(VL[0]))
- ScalarTy = SI->getValueOperand()->getType();
- else if (auto *CI = dyn_cast<CmpInst>(VL[0]))
- ScalarTy = CI->getOperand(0)->getType();
- else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
- ScalarTy = IE->getOperand(1)->getType();
+ if (E->State != TreeEntry::NeedToGather) {
+ if (auto *SI = dyn_cast<StoreInst>(VL[0]))
+ ScalarTy = SI->getValueOperand()->getType();
+ else if (auto *CI = dyn_cast<CmpInst>(VL[0]))
+ ScalarTy = CI->getOperand(0)->getType();
+ else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
+ ScalarTy = IE->getOperand(1)->getType();
+ }
+ if (!FixedVectorType::isValidElementType(ScalarTy))
+ return InstructionCost::getInvalid();
auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
// If we have computed a smaller type for the expression, update VecTy so
// that the costs will be accurate.
- if (MinBWs.count(VL[0]))
- VecTy = FixedVectorType::get(
- IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
+ auto It = MinBWs.find(E);
+ if (It != MinBWs.end()) {
+ ScalarTy = IntegerType::get(F->getContext(), It->second.first);
+ VecTy = FixedVectorType::get(ScalarTy, VL.size());
+ }
unsigned EntryVF = E->getVectorFactor();
- auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF);
+ auto *FinalVecTy = FixedVectorType::get(ScalarTy, EntryVF);
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
if (E->State == TreeEntry::NeedToGather) {
@@ -7302,121 +7765,13 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
return 0;
if (isa<InsertElementInst>(VL[0]))
return InstructionCost::getInvalid();
- ShuffleCostEstimator Estimator(*TTI, VectorizedVals, *this,
- CheckedExtracts);
- unsigned VF = E->getVectorFactor();
- SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
- E->ReuseShuffleIndices.end());
- SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
- // Build a mask out of the reorder indices and reorder scalars per this
- // mask.
- SmallVector<int> ReorderMask;
- inversePermutation(E->ReorderIndices, ReorderMask);
- if (!ReorderMask.empty())
- reorderScalars(GatheredScalars, ReorderMask);
- SmallVector<int> Mask;
- SmallVector<int> ExtractMask;
- std::optional<TargetTransformInfo::ShuffleKind> ExtractShuffle;
- std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle;
- SmallVector<const TreeEntry *> Entries;
- Type *ScalarTy = GatheredScalars.front()->getType();
- // Check for gathered extracts.
- ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask);
- SmallVector<Value *> IgnoredVals;
- if (UserIgnoreList)
- IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end());
-
- bool Resized = false;
- if (Value *VecBase = Estimator.adjustExtracts(
- E, ExtractMask, ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc)))
- if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
- if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) {
- Resized = true;
- GatheredScalars.append(VF - GatheredScalars.size(),
- PoisonValue::get(ScalarTy));
- }
-
- // Do not try to look for reshuffled loads for gathered loads (they will be
- // handled later), for vectorized scalars, and cases, which are definitely
- // not profitable (splats and small gather nodes.)
- if (ExtractShuffle || E->getOpcode() != Instruction::Load ||
- E->isAltShuffle() ||
- all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
- isSplat(E->Scalars) ||
- (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2))
- GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries);
- if (GatherShuffle) {
- assert((Entries.size() == 1 || Entries.size() == 2) &&
- "Expected shuffle of 1 or 2 entries.");
- if (*GatherShuffle == TTI::SK_PermuteSingleSrc &&
- Entries.front()->isSame(E->Scalars)) {
- // Perfect match in the graph, will reuse the previously vectorized
- // node. Cost is 0.
- LLVM_DEBUG(
- dbgs()
- << "SLP: perfect diamond match for gather bundle that starts with "
- << *VL.front() << ".\n");
- // Restore the mask for previous partially matched values.
- for (auto [I, V] : enumerate(E->Scalars)) {
- if (isa<PoisonValue>(V)) {
- Mask[I] = PoisonMaskElem;
- continue;
- }
- if (Mask[I] == PoisonMaskElem)
- Mask[I] = Entries.front()->findLaneForValue(V);
- }
- Estimator.add(Entries.front(), Mask);
- return Estimator.finalize(E->ReuseShuffleIndices);
- }
- if (!Resized) {
- unsigned VF1 = Entries.front()->getVectorFactor();
- unsigned VF2 = Entries.back()->getVectorFactor();
- if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF)
- GatheredScalars.append(VF - GatheredScalars.size(),
- PoisonValue::get(ScalarTy));
- }
- // Remove shuffled elements from list of gathers.
- for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
- if (Mask[I] != PoisonMaskElem)
- GatheredScalars[I] = PoisonValue::get(ScalarTy);
- }
- LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
- << " entries for bundle that starts with "
- << *VL.front() << ".\n";);
- if (Entries.size() == 1)
- Estimator.add(Entries.front(), Mask);
- else
- Estimator.add(Entries.front(), Entries.back(), Mask);
- if (all_of(GatheredScalars, PoisonValue ::classof))
- return Estimator.finalize(E->ReuseShuffleIndices);
- return Estimator.finalize(
- E->ReuseShuffleIndices, E->Scalars.size(),
- [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
- Vec = Estimator.gather(GatheredScalars,
- Constant::getNullValue(FixedVectorType::get(
- GatheredScalars.front()->getType(),
- GatheredScalars.size())));
- });
- }
- if (!all_of(GatheredScalars, PoisonValue::classof)) {
- auto Gathers = ArrayRef(GatheredScalars).take_front(VL.size());
- bool SameGathers = VL.equals(Gathers);
- Value *BV = Estimator.gather(
- Gathers, SameGathers ? nullptr
- : Constant::getNullValue(FixedVectorType::get(
- GatheredScalars.front()->getType(),
- GatheredScalars.size())));
- SmallVector<int> ReuseMask(Gathers.size(), PoisonMaskElem);
- std::iota(ReuseMask.begin(), ReuseMask.end(), 0);
- Estimator.add(BV, ReuseMask);
- }
- if (ExtractShuffle)
- Estimator.add(E, std::nullopt);
- return Estimator.finalize(E->ReuseShuffleIndices);
+ return processBuildVector<ShuffleCostEstimator, InstructionCost>(
+ E, *TTI, VectorizedVals, *this, CheckedExtracts);
}
InstructionCost CommonCost = 0;
SmallVector<int> Mask;
- if (!E->ReorderIndices.empty()) {
+ if (!E->ReorderIndices.empty() &&
+ E->State != TreeEntry::PossibleStridedVectorize) {
SmallVector<int> NewMask;
if (E->getOpcode() == Instruction::Store) {
// For stores the order is actually a mask.
@@ -7429,11 +7784,12 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
}
if (NeedToShuffleReuses)
::addMask(Mask, E->ReuseShuffleIndices);
- if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask))
+ if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
CommonCost =
TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
assert((E->State == TreeEntry::Vectorize ||
- E->State == TreeEntry::ScatterVectorize) &&
+ E->State == TreeEntry::ScatterVectorize ||
+ E->State == TreeEntry::PossibleStridedVectorize) &&
"Unhandled state");
assert(E->getOpcode() &&
((allSameType(VL) && allSameBlock(VL)) ||
@@ -7443,7 +7799,34 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
Instruction *VL0 = E->getMainOp();
unsigned ShuffleOrOp =
E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
- const unsigned Sz = VL.size();
+ SetVector<Value *> UniqueValues(VL.begin(), VL.end());
+ const unsigned Sz = UniqueValues.size();
+ SmallBitVector UsedScalars(Sz, false);
+ for (unsigned I = 0; I < Sz; ++I) {
+ if (getTreeEntry(UniqueValues[I]) == E)
+ continue;
+ UsedScalars.set(I);
+ }
+ auto GetCastContextHint = [&](Value *V) {
+ if (const TreeEntry *OpTE = getTreeEntry(V)) {
+ if (OpTE->State == TreeEntry::ScatterVectorize)
+ return TTI::CastContextHint::GatherScatter;
+ if (OpTE->State == TreeEntry::Vectorize &&
+ OpTE->getOpcode() == Instruction::Load && !OpTE->isAltShuffle()) {
+ if (OpTE->ReorderIndices.empty())
+ return TTI::CastContextHint::Normal;
+ SmallVector<int> Mask;
+ inversePermutation(OpTE->ReorderIndices, Mask);
+ if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
+ return TTI::CastContextHint::Reversed;
+ }
+ } else {
+ InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
+ if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
+ return TTI::CastContextHint::GatherScatter;
+ }
+ return TTI::CastContextHint::None;
+ };
auto GetCostDiff =
[=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
function_ref<InstructionCost(InstructionCost)> VectorCost) {
@@ -7453,13 +7836,49 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
// For some of the instructions no need to calculate cost for each
// particular instruction, we can use the cost of the single
// instruction x total number of scalar instructions.
- ScalarCost = Sz * ScalarEltCost(0);
+ ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
} else {
- for (unsigned I = 0; I < Sz; ++I)
+ for (unsigned I = 0; I < Sz; ++I) {
+ if (UsedScalars.test(I))
+ continue;
ScalarCost += ScalarEltCost(I);
+ }
}
InstructionCost VecCost = VectorCost(CommonCost);
+ // Check if the current node must be resized when the parent node is not
+ // resized.
+ if (!UnaryInstruction::isCast(E->getOpcode()) && E->Idx != 0) {
+ const EdgeInfo &EI = E->UserTreeIndices.front();
+ if ((EI.UserTE->getOpcode() != Instruction::Select ||
+ EI.EdgeIdx != 0) &&
+ It != MinBWs.end()) {
+ auto UserBWIt = MinBWs.find(EI.UserTE);
+ Type *UserScalarTy =
+ EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
+ if (UserBWIt != MinBWs.end())
+ UserScalarTy = IntegerType::get(ScalarTy->getContext(),
+ UserBWIt->second.first);
+ if (ScalarTy != UserScalarTy) {
+ unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
+ unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
+ unsigned VecOpcode;
+ auto *SrcVecTy =
+ FixedVectorType::get(UserScalarTy, E->getVectorFactor());
+ if (BWSz > SrcBWSz)
+ VecOpcode = Instruction::Trunc;
+ else
+ VecOpcode =
+ It->second.second ? Instruction::SExt : Instruction::ZExt;
+ TTI::CastContextHint CCH = GetCastContextHint(VL0);
+ VecCost += TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH,
+ CostKind);
+ ScalarCost +=
+ Sz * TTI->getCastInstrCost(VecOpcode, ScalarTy, UserScalarTy,
+ CCH, CostKind);
+ }
+ }
+ }
LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
ScalarCost, "Calculated costs for Tree"));
return VecCost - ScalarCost;
@@ -7550,7 +7969,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
// Count reused scalars.
InstructionCost ScalarCost = 0;
SmallPtrSet<const TreeEntry *, 4> CountedOps;
- for (Value *V : VL) {
+ for (Value *V : UniqueValues) {
auto *PHI = dyn_cast<PHINode>(V);
if (!PHI)
continue;
@@ -7571,8 +7990,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
}
case Instruction::ExtractValue:
case Instruction::ExtractElement: {
- auto GetScalarCost = [=](unsigned Idx) {
- auto *I = cast<Instruction>(VL[Idx]);
+ auto GetScalarCost = [&](unsigned Idx) {
+ auto *I = cast<Instruction>(UniqueValues[Idx]);
VectorType *SrcVecTy;
if (ShuffleOrOp == Instruction::ExtractElement) {
auto *EE = cast<ExtractElementInst>(I);
@@ -7680,8 +8099,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
// need to shift the vector.
// Do not calculate the cost if the actual size is the register size and
// we can merge this shuffle with the following SK_Select.
- auto *InsertVecTy =
- FixedVectorType::get(SrcVecTy->getElementType(), InsertVecSz);
+ auto *InsertVecTy = FixedVectorType::get(ScalarTy, InsertVecSz);
if (!IsIdentity)
Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
InsertVecTy, Mask);
@@ -7697,8 +8115,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
if (InsertVecSz != VecSz) {
- auto *ActualVecTy =
- FixedVectorType::get(SrcVecTy->getElementType(), VecSz);
+ auto *ActualVecTy = FixedVectorType::get(ScalarTy, VecSz);
Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy,
std::nullopt, CostKind, OffsetBeg - Offset,
InsertVecTy);
@@ -7729,22 +8146,52 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
- auto GetScalarCost = [=](unsigned Idx) {
- auto *VI = cast<Instruction>(VL[Idx]);
- return TTI->getCastInstrCost(E->getOpcode(), ScalarTy,
- VI->getOperand(0)->getType(),
+ auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
+ Type *SrcScalarTy = VL0->getOperand(0)->getType();
+ auto *SrcVecTy = FixedVectorType::get(SrcScalarTy, VL.size());
+ unsigned Opcode = ShuffleOrOp;
+ unsigned VecOpcode = Opcode;
+ if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
+ (SrcIt != MinBWs.end() || It != MinBWs.end())) {
+ // Check if the values are candidates to demote.
+ unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
+ if (SrcIt != MinBWs.end()) {
+ SrcBWSz = SrcIt->second.first;
+ SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
+ SrcVecTy = FixedVectorType::get(SrcScalarTy, VL.size());
+ }
+ unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
+ if (BWSz == SrcBWSz) {
+ VecOpcode = Instruction::BitCast;
+ } else if (BWSz < SrcBWSz) {
+ VecOpcode = Instruction::Trunc;
+ } else if (It != MinBWs.end()) {
+ assert(BWSz > SrcBWSz && "Invalid cast!");
+ VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
+ }
+ }
+ auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
+ // Do not count cost here if minimum bitwidth is in effect and it is just
+ // a bitcast (here it is just a noop).
+ if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
+ return TTI::TCC_Free;
+ auto *VI = VL0->getOpcode() == Opcode
+ ? cast<Instruction>(UniqueValues[Idx])
+ : nullptr;
+ return TTI->getCastInstrCost(Opcode, VL0->getType(),
+ VL0->getOperand(0)->getType(),
TTI::getCastContextHint(VI), CostKind, VI);
};
auto GetVectorCost = [=](InstructionCost CommonCost) {
- Type *SrcTy = VL0->getOperand(0)->getType();
- auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size());
- InstructionCost VecCost = CommonCost;
- // Check if the values are candidates to demote.
- if (!MinBWs.count(VL0) || VecTy != SrcVecTy)
- VecCost +=
- TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy,
- TTI::getCastContextHint(VL0), CostKind, VL0);
- return VecCost;
+ // Do not count cost here if minimum bitwidth is in effect and it is just
+ // a bitcast (here it is just a noop).
+ if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
+ return CommonCost;
+ auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
+ TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
+ return CommonCost +
+ TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
+ VecOpcode == Opcode ? VI : nullptr);
};
return GetCostDiff(GetScalarCost, GetVectorCost);
}
@@ -7761,7 +8208,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
? CmpInst::BAD_FCMP_PREDICATE
: CmpInst::BAD_ICMP_PREDICATE;
auto GetScalarCost = [&](unsigned Idx) {
- auto *VI = cast<Instruction>(VL[Idx]);
+ auto *VI = cast<Instruction>(UniqueValues[Idx]);
CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy()
? CmpInst::BAD_FCMP_PREDICATE
: CmpInst::BAD_ICMP_PREDICATE;
@@ -7821,8 +8268,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
- auto GetScalarCost = [=](unsigned Idx) {
- auto *VI = cast<Instruction>(VL[Idx]);
+ auto GetScalarCost = [&](unsigned Idx) {
+ auto *VI = cast<Instruction>(UniqueValues[Idx]);
unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
TTI::OperandValueInfo Op2Info =
@@ -7833,8 +8280,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
};
auto GetVectorCost = [=](InstructionCost CommonCost) {
unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
- TTI::OperandValueInfo Op1Info = getOperandInfo(VL, 0);
- TTI::OperandValueInfo Op2Info = getOperandInfo(VL, OpIdx);
+ TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
+ TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
Op2Info) +
CommonCost;
@@ -7845,23 +8292,25 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
return CommonCost + GetGEPCostDiff(VL, VL0);
}
case Instruction::Load: {
- auto GetScalarCost = [=](unsigned Idx) {
- auto *VI = cast<LoadInst>(VL[Idx]);
+ auto GetScalarCost = [&](unsigned Idx) {
+ auto *VI = cast<LoadInst>(UniqueValues[Idx]);
return TTI->getMemoryOpCost(Instruction::Load, ScalarTy, VI->getAlign(),
VI->getPointerAddressSpace(), CostKind,
TTI::OperandValueInfo(), VI);
};
auto *LI0 = cast<LoadInst>(VL0);
- auto GetVectorCost = [=](InstructionCost CommonCost) {
+ auto GetVectorCost = [&](InstructionCost CommonCost) {
InstructionCost VecLdCost;
if (E->State == TreeEntry::Vectorize) {
VecLdCost = TTI->getMemoryOpCost(
Instruction::Load, VecTy, LI0->getAlign(),
LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
} else {
- assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
+ assert((E->State == TreeEntry::ScatterVectorize ||
+ E->State == TreeEntry::PossibleStridedVectorize) &&
+ "Unknown EntryState");
Align CommonAlignment = LI0->getAlign();
- for (Value *V : VL)
+ for (Value *V : UniqueValues)
CommonAlignment =
std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
VecLdCost = TTI->getGatherScatterOpCost(
@@ -7874,7 +8323,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
// If this node generates masked gather load then it is not a terminal node.
// Hence address operand cost is estimated separately.
- if (E->State == TreeEntry::ScatterVectorize)
+ if (E->State == TreeEntry::ScatterVectorize ||
+ E->State == TreeEntry::PossibleStridedVectorize)
return Cost;
// Estimate cost of GEPs since this tree node is a terminator.
@@ -7887,7 +8337,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
bool IsReorder = !E->ReorderIndices.empty();
auto GetScalarCost = [=](unsigned Idx) {
auto *VI = cast<StoreInst>(VL[Idx]);
- TTI::OperandValueInfo OpInfo = getOperandInfo(VI, 0);
+ TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
return TTI->getMemoryOpCost(Instruction::Store, ScalarTy, VI->getAlign(),
VI->getPointerAddressSpace(), CostKind,
OpInfo, VI);
@@ -7896,7 +8346,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
auto GetVectorCost = [=](InstructionCost CommonCost) {
// We know that we can merge the stores. Calculate the cost.
- TTI::OperandValueInfo OpInfo = getOperandInfo(VL, 0);
+ TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
return TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
BaseSI->getPointerAddressSpace(), CostKind,
OpInfo) +
@@ -7912,8 +8362,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
}
case Instruction::Call: {
- auto GetScalarCost = [=](unsigned Idx) {
- auto *CI = cast<CallInst>(VL[Idx]);
+ auto GetScalarCost = [&](unsigned Idx) {
+ auto *CI = cast<CallInst>(UniqueValues[Idx]);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
if (ID != Intrinsic::not_intrinsic) {
IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
@@ -7954,8 +8404,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
}
return false;
};
- auto GetScalarCost = [=](unsigned Idx) {
- auto *VI = cast<Instruction>(VL[Idx]);
+ auto GetScalarCost = [&](unsigned Idx) {
+ auto *VI = cast<Instruction>(UniqueValues[Idx]);
assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
(void)E;
return TTI->getInstructionCost(VI, CostKind);
@@ -7995,21 +8445,15 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,
TTI::CastContextHint::None, CostKind);
}
- if (E->ReuseShuffleIndices.empty()) {
- VecCost +=
- TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy);
- } else {
- SmallVector<int> Mask;
- buildShuffleEntryMask(
- E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
- [E](Instruction *I) {
- assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
- return I->getOpcode() == E->getAltOpcode();
- },
- Mask);
- VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
- FinalVecTy, Mask);
- }
+ SmallVector<int> Mask;
+ E->buildAltOpShuffleMask(
+ [E](Instruction *I) {
+ assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
+ return I->getOpcode() == E->getAltOpcode();
+ },
+ Mask);
+ VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
+ FinalVecTy, Mask);
return VecCost;
};
return GetCostDiff(GetScalarCost, GetVectorCost);
@@ -8065,7 +8509,8 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
// Gathering cost would be too much for tiny trees.
if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
(VectorizableTree[1]->State == TreeEntry::NeedToGather &&
- VectorizableTree[0]->State != TreeEntry::ScatterVectorize))
+ VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
+ VectorizableTree[0]->State != TreeEntry::PossibleStridedVectorize))
return false;
return true;
@@ -8144,6 +8589,23 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
allConstant(VectorizableTree[1]->Scalars))))
return true;
+ // If the graph includes only PHI nodes and gathers, it is definitely not
+ // profitable to vectorize and we can skip it, if the cost threshold is the
+ // default. The cost of vectorized PHI nodes is almost always 0 + the cost of
+ // gathers/buildvectors.
+ constexpr int Limit = 4;
+ if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
+ !VectorizableTree.empty() &&
+ all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
+ return (TE->State == TreeEntry::NeedToGather &&
+ TE->getOpcode() != Instruction::ExtractElement &&
+ count_if(TE->Scalars,
+ [](Value *V) { return isa<ExtractElementInst>(V); }) <=
+ Limit) ||
+ TE->getOpcode() == Instruction::PHI;
+ }))
+ return true;
+
// We can vectorize the tree if its size is greater than or equal to the
// minimum size specified by the MinTreeSize command line option.
if (VectorizableTree.size() >= MinTreeSize)
@@ -8435,16 +8897,6 @@ static T *performExtractsShuffleAction(
}
InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
- // Build a map for gathered scalars to the nodes where they are used.
- ValueToGatherNodes.clear();
- for (const std::unique_ptr<TreeEntry> &EntryPtr : VectorizableTree) {
- if (EntryPtr->State != TreeEntry::NeedToGather)
- continue;
- for (Value *V : EntryPtr->Scalars)
- if (!isConstant(V))
- ValueToGatherNodes.try_emplace(V).first->getSecond().insert(
- EntryPtr.get());
- }
InstructionCost Cost = 0;
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
<< VectorizableTree.size() << ".\n");
@@ -8460,8 +8912,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
E->isSame(TE.Scalars)) {
// Some gather nodes might be absolutely the same as some vectorizable
// nodes after reordering, need to handle it.
- LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle that starts with "
- << *TE.Scalars[0] << ".\n"
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
+ << shortBundleName(TE.Scalars) << ".\n"
<< "SLP: Current total cost = " << Cost << "\n");
continue;
}
@@ -8469,9 +8921,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
Cost += C;
- LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
- << " for bundle that starts with " << *TE.Scalars[0]
- << ".\n"
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
+ << shortBundleName(TE.Scalars) << ".\n"
<< "SLP: Current total cost = " << Cost << "\n");
}
@@ -8480,6 +8931,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;
SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
SmallVector<APInt> DemandedElts;
+ SmallDenseSet<Value *, 4> UsedInserts;
+ DenseSet<Value *> VectorCasts;
for (ExternalUser &EU : ExternalUses) {
// We only add extract cost once for the same scalar.
if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
@@ -8500,6 +8953,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
// to detect it as a final shuffled/identity match.
if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
+ if (!UsedInserts.insert(VU).second)
+ continue;
std::optional<unsigned> InsertIdx = getInsertIndex(VU);
if (InsertIdx) {
const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
@@ -8546,6 +9001,28 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
FirstUsers.emplace_back(VU, ScalarTE);
DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
VecId = FirstUsers.size() - 1;
+ auto It = MinBWs.find(ScalarTE);
+ if (It != MinBWs.end() && VectorCasts.insert(EU.Scalar).second) {
+ unsigned BWSz = It->second.second;
+ unsigned SrcBWSz = DL->getTypeSizeInBits(FTy->getElementType());
+ unsigned VecOpcode;
+ if (BWSz < SrcBWSz)
+ VecOpcode = Instruction::Trunc;
+ else
+ VecOpcode =
+ It->second.second ? Instruction::SExt : Instruction::ZExt;
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ InstructionCost C = TTI->getCastInstrCost(
+ VecOpcode, FTy,
+ FixedVectorType::get(
+ IntegerType::get(FTy->getContext(), It->second.first),
+ FTy->getNumElements()),
+ TTI::CastContextHint::None, CostKind);
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
+ << " for extending externally used vector with "
+ "non-equal minimum bitwidth.\n");
+ Cost += C;
+ }
} else {
if (isFirstInsertElement(VU, cast<InsertElementInst>(It->first)))
It->first = VU;
@@ -8567,11 +9044,11 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
// for the extract and the added cost of the sign extend if needed.
auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
- auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
- if (MinBWs.count(ScalarRoot)) {
- auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
- auto Extend =
- MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
+ auto It = MinBWs.find(getTreeEntry(EU.Scalar));
+ if (It != MinBWs.end()) {
+ auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
+ unsigned Extend =
+ It->second.second ? Instruction::SExt : Instruction::ZExt;
VecTy = FixedVectorType::get(MinTy, BundleWidth);
ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
VecTy, EU.Lane);
@@ -8580,6 +9057,21 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
CostKind, EU.Lane);
}
}
+ // Add reduced value cost, if resized.
+ if (!VectorizedVals.empty()) {
+ auto BWIt = MinBWs.find(VectorizableTree.front().get());
+ if (BWIt != MinBWs.end()) {
+ Type *DstTy = VectorizableTree.front()->Scalars.front()->getType();
+ unsigned OriginalSz = DL->getTypeSizeInBits(DstTy);
+ unsigned Opcode = Instruction::Trunc;
+ if (OriginalSz < BWIt->second.first)
+ Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
+ Type *SrcTy = IntegerType::get(DstTy->getContext(), BWIt->second.first);
+ Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
+ TTI::CastContextHint::None,
+ TTI::TCK_RecipThroughput);
+ }
+ }
InstructionCost SpillCost = getSpillCost();
Cost += SpillCost + ExtractCost;
@@ -8590,9 +9082,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
unsigned VecVF = TE->getVectorFactor();
if (VF != VecVF &&
(any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
- (all_of(Mask,
- [VF](int Idx) { return Idx < 2 * static_cast<int>(VF); }) &&
- !ShuffleVectorInst::isIdentityMask(Mask)))) {
+ !ShuffleVectorInst::isIdentityMask(Mask, VF))) {
SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
OrigMask.begin());
@@ -8611,19 +9101,23 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
// Calculate the cost of the reshuffled vectors, if any.
for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0);
- unsigned VF = ShuffleMasks[I].begin()->second.size();
- auto *FTy = FixedVectorType::get(
- cast<VectorType>(FirstUsers[I].first->getType())->getElementType(), VF);
auto Vector = ShuffleMasks[I].takeVector();
- auto &&EstimateShufflesCost = [this, FTy,
- &Cost](ArrayRef<int> Mask,
- ArrayRef<const TreeEntry *> TEs) {
+ unsigned VF = 0;
+ auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
+ ArrayRef<const TreeEntry *> TEs) {
assert((TEs.size() == 1 || TEs.size() == 2) &&
"Expected exactly 1 or 2 tree entries.");
if (TEs.size() == 1) {
- int Limit = 2 * Mask.size();
- if (!all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) ||
- !ShuffleVectorInst::isIdentityMask(Mask)) {
+ if (VF == 0)
+ VF = TEs.front()->getVectorFactor();
+ auto *FTy =
+ FixedVectorType::get(TEs.back()->Scalars.front()->getType(), VF);
+ if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
+ !all_of(enumerate(Mask), [=](const auto &Data) {
+ return Data.value() == PoisonMaskElem ||
+ (Data.index() < VF &&
+ static_cast<int>(Data.index()) == Data.value());
+ })) {
InstructionCost C =
TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FTy, Mask);
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
@@ -8634,6 +9128,15 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
Cost += C;
}
} else {
+ if (VF == 0) {
+ if (TEs.front() &&
+ TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
+ VF = TEs.front()->getVectorFactor();
+ else
+ VF = Mask.size();
+ }
+ auto *FTy =
+ FixedVectorType::get(TEs.back()->Scalars.front()->getType(), VF);
InstructionCost C =
TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, FTy, Mask);
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
@@ -8643,6 +9146,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
dbgs() << "SLP: Current total cost = " << Cost << "\n");
Cost += C;
}
+ VF = Mask.size();
return TEs.back();
};
(void)performExtractsShuffleAction<const TreeEntry>(
@@ -8671,54 +9175,198 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
return Cost;
}
-std::optional<TargetTransformInfo::ShuffleKind>
-BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
- SmallVectorImpl<int> &Mask,
- SmallVectorImpl<const TreeEntry *> &Entries) {
- Entries.clear();
- // No need to check for the topmost gather node.
- if (TE == VectorizableTree.front().get())
+/// Tries to find extractelement instructions with constant indices from a
+/// fixed vector type and gathers such instructions into a bunch, which can
+/// most likely be represented as a shuffle of 1 or 2 input vectors. If this
+/// attempt is successful, the matched scalars are replaced by poison values
+/// in \p VL for future analysis.
+std::optional<TTI::ShuffleKind>
+BoUpSLP::tryToGatherSingleRegisterExtractElements(
+ MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
+ // Scan list of gathered scalars for extractelements that can be represented
+ // as shuffles.
+ MapVector<Value *, SmallVector<int>> VectorOpToIdx;
+ SmallVector<int> UndefVectorExtracts;
+ for (int I = 0, E = VL.size(); I < E; ++I) {
+ auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
+ if (!EI) {
+ if (isa<UndefValue>(VL[I]))
+ UndefVectorExtracts.push_back(I);
+ continue;
+ }
+ auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
+ if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
+ continue;
+ std::optional<unsigned> Idx = getExtractIndex(EI);
+ // Undefined index.
+ if (!Idx) {
+ UndefVectorExtracts.push_back(I);
+ continue;
+ }
+ SmallBitVector ExtractMask(VecTy->getNumElements(), true);
+ ExtractMask.reset(*Idx);
+ if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
+ UndefVectorExtracts.push_back(I);
+ continue;
+ }
+ VectorOpToIdx[EI->getVectorOperand()].push_back(I);
+ }
+ // Sort the vector operands by the maximum number of uses in extractelements.
+ MapVector<unsigned, SmallVector<Value *>> VFToVector;
+ for (const auto &Data : VectorOpToIdx)
+ VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()]
+ .push_back(Data.first);
+ for (auto &Data : VFToVector) {
+ stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) {
+ return VectorOpToIdx.find(V1)->second.size() >
+ VectorOpToIdx.find(V2)->second.size();
+ });
+ }
+ // Find the best pair of the vectors with the same number of elements or a
+ // single vector.
+ const int UndefSz = UndefVectorExtracts.size();
+ unsigned SingleMax = 0;
+ Value *SingleVec = nullptr;
+ unsigned PairMax = 0;
+ std::pair<Value *, Value *> PairVec(nullptr, nullptr);
+ for (auto &Data : VFToVector) {
+ Value *V1 = Data.second.front();
+ if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) {
+ SingleMax = VectorOpToIdx[V1].size() + UndefSz;
+ SingleVec = V1;
+ }
+ Value *V2 = nullptr;
+ if (Data.second.size() > 1)
+ V2 = *std::next(Data.second.begin());
+ if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() +
+ UndefSz) {
+ PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz;
+ PairVec = std::make_pair(V1, V2);
+ }
+ }
+ if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
+ return std::nullopt;
+ // Check if better to perform a shuffle of 2 vectors or just of a single
+ // vector.
+ SmallVector<Value *> SavedVL(VL.begin(), VL.end());
+ SmallVector<Value *> GatheredExtracts(
+ VL.size(), PoisonValue::get(VL.front()->getType()));
+ if (SingleMax >= PairMax && SingleMax) {
+ for (int Idx : VectorOpToIdx[SingleVec])
+ std::swap(GatheredExtracts[Idx], VL[Idx]);
+ } else {
+ for (Value *V : {PairVec.first, PairVec.second})
+ for (int Idx : VectorOpToIdx[V])
+ std::swap(GatheredExtracts[Idx], VL[Idx]);
+ }
+ // Add extracts from undefs too.
+ for (int Idx : UndefVectorExtracts)
+ std::swap(GatheredExtracts[Idx], VL[Idx]);
+ // Check that gather of extractelements can be represented as just a
+ // shuffle of a single/two vectors the scalars are extracted from.
+ std::optional<TTI::ShuffleKind> Res =
+ isFixedVectorShuffle(GatheredExtracts, Mask);
+ if (!Res) {
+ // TODO: try to check other subsets if possible.
+ // Restore the original VL if attempt was not successful.
+ copy(SavedVL, VL.begin());
return std::nullopt;
+ }
+ // Restore unused scalars from mask, if some of the extractelements were not
+ // selected for shuffle.
+ for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
+ if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
+ isa<UndefValue>(GatheredExtracts[I])) {
+ std::swap(VL[I], GatheredExtracts[I]);
+ continue;
+ }
+ auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
+ if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
+ !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
+ is_contained(UndefVectorExtracts, I))
+ continue;
+ }
+ return Res;
+}
+
+/// Tries to find extractelement instructions with constant indices from a
+/// fixed vector type and gathers such instructions into a bunch, which can
+/// most likely be represented as a shuffle of 1 or 2 input vectors. If this
+/// attempt is successful, the matched scalars are replaced by poison values
+/// in \p VL for future analysis. The analysis is performed per vector
+/// register, splitting \p VL into \p NumParts slices.
+SmallVector<std::optional<TTI::ShuffleKind>>
+BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
+ SmallVectorImpl<int> &Mask,
+ unsigned NumParts) const {
+ assert(NumParts > 0 && "NumParts expected to be greater than or equal to 1.");
+ SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
Mask.assign(VL.size(), PoisonMaskElem);
- assert(TE->UserTreeIndices.size() == 1 &&
- "Expected only single user of the gather node.");
+ unsigned SliceSize = VL.size() / NumParts;
+ for (unsigned Part = 0; Part < NumParts; ++Part) {
+ // Scan list of gathered scalars for extractelements that can be represented
+ // as shuffles.
+ MutableArrayRef<Value *> SubVL =
+ MutableArrayRef(VL).slice(Part * SliceSize, SliceSize);
+ SmallVector<int> SubMask;
+ std::optional<TTI::ShuffleKind> Res =
+ tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
+ ShufflesRes[Part] = Res;
+ copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
+ }
+ if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
+ return Res.has_value();
+ }))
+ ShufflesRes.clear();
+ return ShufflesRes;
+}
+
+std::optional<TargetTransformInfo::ShuffleKind>
+BoUpSLP::isGatherShuffledSingleRegisterEntry(
+ const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
+ SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part) {
+ Entries.clear();
// TODO: currently checking only for Scalars in the tree entry, need to count
// reused elements too for better cost estimation.
- Instruction &UserInst =
- getLastInstructionInBundle(TE->UserTreeIndices.front().UserTE);
- BasicBlock *ParentBB = nullptr;
+ const EdgeInfo &TEUseEI = TE->UserTreeIndices.front();
+ const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
+ const BasicBlock *TEInsertBlock = nullptr;
// Main node of PHI entries keeps the correct order of operands/incoming
// blocks.
- if (auto *PHI =
- dyn_cast<PHINode>(TE->UserTreeIndices.front().UserTE->getMainOp())) {
- ParentBB = PHI->getIncomingBlock(TE->UserTreeIndices.front().EdgeIdx);
+ if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
+ TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
+ TEInsertPt = TEInsertBlock->getTerminator();
} else {
- ParentBB = UserInst.getParent();
+ TEInsertBlock = TEInsertPt->getParent();
}
- auto *NodeUI = DT->getNode(ParentBB);
+ auto *NodeUI = DT->getNode(TEInsertBlock);
assert(NodeUI && "Should only process reachable instructions");
SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
- auto CheckOrdering = [&](Instruction *LastEI) {
- // Check if the user node of the TE comes after user node of EntryPtr,
- // otherwise EntryPtr depends on TE.
- // Gather nodes usually are not scheduled and inserted before their first
- // user node. So, instead of checking dependency between the gather nodes
- // themselves, we check the dependency between their user nodes.
- // If one user node comes before the second one, we cannot use the second
- // gather node as the source vector for the first gather node, because in
- // the list of instructions it will be emitted later.
- auto *EntryParent = LastEI->getParent();
- auto *NodeEUI = DT->getNode(EntryParent);
+ auto CheckOrdering = [&](const Instruction *InsertPt) {
+ // Argument InsertPt is an instruction where vector code for some other
+ // tree entry (one that shares one or more scalars with TE) is going to be
+ // generated. This lambda returns true if insertion point of vector code
+ // for the TE dominates that point (otherwise dependency is the other way
+ // around). The other node is not limited to be of a gather kind. Gather
+ // nodes are not scheduled and their vector code is inserted before their
+ // first user. If user is PHI, that is supposed to be at the end of a
+ // predecessor block. Otherwise it is the last instruction among scalars of
+ // the user node. So, instead of checking dependency between instructions
+ // themselves, we check dependency between their insertion points for vector
+ // code (since each scalar instruction ends up as a lane of a vector
+ // instruction).
+ const BasicBlock *InsertBlock = InsertPt->getParent();
+ auto *NodeEUI = DT->getNode(InsertBlock);
if (!NodeEUI)
return false;
assert((NodeUI == NodeEUI) ==
(NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
"Different nodes should have different DFS numbers");
// Check the order of the gather nodes users.
- if (UserInst.getParent() != EntryParent &&
+ if (TEInsertPt->getParent() != InsertBlock &&
(DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
return false;
- if (UserInst.getParent() == EntryParent && UserInst.comesBefore(LastEI))
+ if (TEInsertPt->getParent() == InsertBlock &&
+ TEInsertPt->comesBefore(InsertPt))
return false;
return true;
};
@@ -8743,43 +9391,42 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
[&](Value *V) { return GatheredScalars.contains(V); }) &&
"Must contain at least single gathered value.");
assert(TEPtr->UserTreeIndices.size() == 1 &&
- "Expected only single user of the gather node.");
- PHINode *EntryPHI =
- dyn_cast<PHINode>(TEPtr->UserTreeIndices.front().UserTE->getMainOp());
- Instruction *EntryUserInst =
- EntryPHI ? nullptr
- : &getLastInstructionInBundle(
- TEPtr->UserTreeIndices.front().UserTE);
- if (&UserInst == EntryUserInst) {
- assert(!EntryPHI && "Unexpected phi node entry.");
- // If 2 gathers are operands of the same entry, compare operands
- // indices, use the earlier one as the base.
- if (TE->UserTreeIndices.front().UserTE ==
- TEPtr->UserTreeIndices.front().UserTE &&
- TE->UserTreeIndices.front().EdgeIdx <
- TEPtr->UserTreeIndices.front().EdgeIdx)
+ "Expected only single user of a gather node.");
+ const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
+
+ PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
+ const Instruction *InsertPt =
+ UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
+ : &getLastInstructionInBundle(UseEI.UserTE);
+ if (TEInsertPt == InsertPt) {
+ // If 2 gathers are operands of the same entry (regardless of whether
+ // the user is a PHI or not), compare operand indices and use the earlier
+ // one as the base.
+ if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
+ continue;
+ // If the user instruction is used for some reason in different
+ // vectorized nodes - make it depend on index.
+ if (TEUseEI.UserTE != UseEI.UserTE &&
+ TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
continue;
}
- // Check if the user node of the TE comes after user node of EntryPtr,
- // otherwise EntryPtr depends on TE.
- auto *EntryI =
- EntryPHI
- ? EntryPHI
- ->getIncomingBlock(TEPtr->UserTreeIndices.front().EdgeIdx)
- ->getTerminator()
- : EntryUserInst;
- if ((ParentBB != EntryI->getParent() ||
- TE->UserTreeIndices.front().EdgeIdx <
- TEPtr->UserTreeIndices.front().EdgeIdx ||
- TE->UserTreeIndices.front().UserTE !=
- TEPtr->UserTreeIndices.front().UserTE) &&
- !CheckOrdering(EntryI))
+
+ // Check if the user node of the TE comes after user node of TEPtr,
+ // otherwise TEPtr depends on TE.
+ if ((TEInsertBlock != InsertPt->getParent() ||
+ TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
+ !CheckOrdering(InsertPt))
continue;
VToTEs.insert(TEPtr);
}
if (const TreeEntry *VTE = getTreeEntry(V)) {
- Instruction &EntryUserInst = getLastInstructionInBundle(VTE);
- if (&EntryUserInst == &UserInst || !CheckOrdering(&EntryUserInst))
+ Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
+ if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
+ continue;
+ auto It = MinBWs.find(VTE);
+ // If vectorize node is demoted - do not match.
+ if (It != MinBWs.end() &&
+ It->second.first != DL->getTypeSizeInBits(V->getType()))
continue;
VToTEs.insert(VTE);
}
@@ -8823,8 +9470,10 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
}
}
- if (UsedTEs.empty())
+ if (UsedTEs.empty()) {
+ Entries.clear();
return std::nullopt;
+ }
unsigned VF = 0;
if (UsedTEs.size() == 1) {
@@ -8838,9 +9487,19 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
});
- if (It != FirstEntries.end() && (*It)->getVectorFactor() == VL.size()) {
+ if (It != FirstEntries.end() &&
+ ((*It)->getVectorFactor() == VL.size() ||
+ ((*It)->getVectorFactor() == TE->Scalars.size() &&
+ TE->ReuseShuffleIndices.size() == VL.size() &&
+ (*It)->isSame(TE->Scalars)))) {
Entries.push_back(*It);
- std::iota(Mask.begin(), Mask.end(), 0);
+ if ((*It)->getVectorFactor() == VL.size()) {
+ std::iota(std::next(Mask.begin(), Part * VL.size()),
+ std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
+ } else {
+ SmallVector<int> CommonMask = TE->getCommonMask();
+ copy(CommonMask, Mask.begin());
+ }
// Clear undef scalars.
for (int I = 0, Sz = VL.size(); I < Sz; ++I)
if (isa<PoisonValue>(VL[I]))
@@ -8923,12 +9582,9 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
// by extractelements processing) or may form vector node in future.
auto MightBeIgnored = [=](Value *V) {
auto *I = dyn_cast<Instruction>(V);
- SmallVector<Value *> IgnoredVals;
- if (UserIgnoreList)
- IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end());
return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
!isVectorLikeInstWithConstOps(I) &&
- !areAllUsersVectorized(I, IgnoredVals) && isSimple(I);
+ !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
};
// Check that the neighbor instruction may form a full vector node with the
// current instruction V. It is possible, if they have same/alternate opcode
@@ -8980,7 +9636,10 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
TempEntries.push_back(Entries[I]);
}
Entries.swap(TempEntries);
- if (EntryLanes.size() == Entries.size() && !VL.equals(TE->Scalars)) {
+ if (EntryLanes.size() == Entries.size() &&
+ !VL.equals(ArrayRef(TE->Scalars)
+ .slice(Part * VL.size(),
+ std::min<int>(VL.size(), TE->Scalars.size())))) {
// We may have here 1 or 2 entries only. If the number of scalars is equal
// to the number of entries, no need to do the analysis, it is not very
// profitable. Since VL is not the same as TE->Scalars, it means we already
@@ -8993,9 +9652,10 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
// Pair.first is the offset to the vector, while Pair.second is the index of
// scalar in the list.
for (const std::pair<unsigned, int> &Pair : EntryLanes) {
- Mask[Pair.second] = Pair.first * VF +
- Entries[Pair.first]->findLaneForValue(VL[Pair.second]);
- IsIdentity &= Mask[Pair.second] == Pair.second;
+ unsigned Idx = Part * VL.size() + Pair.second;
+ Mask[Idx] = Pair.first * VF +
+ Entries[Pair.first]->findLaneForValue(VL[Pair.second]);
+ IsIdentity &= Mask[Idx] == Pair.second;
}
switch (Entries.size()) {
case 1:
@@ -9010,9 +9670,64 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
break;
}
Entries.clear();
+ // Clear the corresponding mask elements.
+ std::fill(std::next(Mask.begin(), Part * VL.size()),
+ std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
return std::nullopt;
}
+SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
+BoUpSLP::isGatherShuffledEntry(
+ const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
+ SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
+ unsigned NumParts) {
+ assert(NumParts > 0 && NumParts < VL.size() &&
+ "Expected positive number of registers.");
+ Entries.clear();
+ // No need to check for the topmost gather node.
+ if (TE == VectorizableTree.front().get())
+ return {};
+ Mask.assign(VL.size(), PoisonMaskElem);
+ assert(TE->UserTreeIndices.size() == 1 &&
+ "Expected only single user of the gather node.");
+ assert(VL.size() % NumParts == 0 &&
+ "Number of scalars must be divisible by NumParts.");
+ unsigned SliceSize = VL.size() / NumParts;
+ SmallVector<std::optional<TTI::ShuffleKind>> Res;
+ for (unsigned Part = 0; Part < NumParts; ++Part) {
+ ArrayRef<Value *> SubVL = VL.slice(Part * SliceSize, SliceSize);
+ SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
+ std::optional<TTI::ShuffleKind> SubRes =
+ isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part);
+ if (!SubRes)
+ SubEntries.clear();
+ Res.push_back(SubRes);
+ if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
+ SubEntries.front()->getVectorFactor() == VL.size() &&
+ (SubEntries.front()->isSame(TE->Scalars) ||
+ SubEntries.front()->isSame(VL))) {
+ SmallVector<const TreeEntry *> LocalSubEntries;
+ LocalSubEntries.swap(SubEntries);
+ Entries.clear();
+ Res.clear();
+ std::iota(Mask.begin(), Mask.end(), 0);
+ // Clear undef scalars.
+ for (int I = 0, Sz = VL.size(); I < Sz; ++I)
+ if (isa<PoisonValue>(VL[I]))
+ Mask[I] = PoisonMaskElem;
+ Entries.emplace_back(1, LocalSubEntries.front());
+ Res.push_back(TargetTransformInfo::SK_PermuteSingleSrc);
+ return Res;
+ }
+ }
+ if (all_of(Res,
+ [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
+ Entries.clear();
+ return {};
+ }
+ return Res;
+}
+
InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL,
bool ForPoisonSrc) const {
// Find the type of the operands in VL.
@@ -9224,18 +9939,20 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
auto *Front = E->getMainOp();
Instruction *LastInst = &getLastInstructionInBundle(E);
assert(LastInst && "Failed to find last instruction in bundle");
+ BasicBlock::iterator LastInstIt = LastInst->getIterator();
// If the instruction is PHI, set the insert point after all the PHIs.
bool IsPHI = isa<PHINode>(LastInst);
if (IsPHI)
- LastInst = LastInst->getParent()->getFirstNonPHI();
+ LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
if (IsPHI || (E->State != TreeEntry::NeedToGather &&
doesNotNeedToSchedule(E->Scalars))) {
- Builder.SetInsertPoint(LastInst);
+ Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
} else {
// Set the insertion point after the last instruction in the bundle. Set the
// debug location to Front.
- Builder.SetInsertPoint(LastInst->getParent(),
- std::next(LastInst->getIterator()));
+ Builder.SetInsertPoint(
+ LastInst->getParent(),
+ LastInst->getNextNonDebugInstruction()->getIterator());
}
Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
@@ -9271,10 +9988,12 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) {
GatherShuffleExtractSeq.insert(InsElt);
CSEBlocks.insert(InsElt->getParent());
// Add to our 'need-to-extract' list.
- if (TreeEntry *Entry = getTreeEntry(V)) {
- // Find which lane we need to extract.
- unsigned FoundLane = Entry->findLaneForValue(V);
- ExternalUses.emplace_back(V, InsElt, FoundLane);
+ if (isa<Instruction>(V)) {
+ if (TreeEntry *Entry = getTreeEntry(V)) {
+ // Find which lane we need to extract.
+ unsigned FoundLane = Entry->findLaneForValue(V);
+ ExternalUses.emplace_back(V, InsElt, FoundLane);
+ }
}
return Vec;
};
@@ -9367,12 +10086,12 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
/// Holds all of the instructions that we gathered.
SetVector<Instruction *> &GatherShuffleExtractSeq;
/// A list of blocks that we are going to CSE.
- SetVector<BasicBlock *> &CSEBlocks;
+ DenseSet<BasicBlock *> &CSEBlocks;
public:
ShuffleIRBuilder(IRBuilderBase &Builder,
SetVector<Instruction *> &GatherShuffleExtractSeq,
- SetVector<BasicBlock *> &CSEBlocks)
+ DenseSet<BasicBlock *> &CSEBlocks)
: Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
CSEBlocks(CSEBlocks) {}
~ShuffleIRBuilder() = default;
@@ -9392,7 +10111,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
return V1;
unsigned VF = Mask.size();
unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
- if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask))
+ if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
return V1;
Value *Vec = Builder.CreateShuffleVector(V1, Mask);
if (auto *I = dyn_cast<Instruction>(Vec)) {
@@ -9455,7 +10174,11 @@ public:
: Builder(Builder), R(R) {}
/// Adjusts extractelements after reusing them.
- Value *adjustExtracts(const TreeEntry *E, ArrayRef<int> Mask) {
+ Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
+ ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
+ unsigned NumParts, bool &UseVecBaseAsInput) {
+ UseVecBaseAsInput = false;
+ SmallPtrSet<Value *, 4> UniqueBases;
Value *VecBase = nullptr;
for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
int Idx = Mask[I];
@@ -9463,6 +10186,10 @@ public:
continue;
auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
VecBase = EI->getVectorOperand();
+ if (const TreeEntry *TE = R.getTreeEntry(VecBase))
+ VecBase = TE->VectorizedValue;
+ assert(VecBase && "Expected vectorized value.");
+ UniqueBases.insert(VecBase);
// If the only one use is vectorized - can delete the extractelement
// itself.
if (!EI->hasOneUse() || any_of(EI->users(), [&](User *U) {
@@ -9471,14 +10198,97 @@ public:
continue;
R.eraseInstruction(EI);
}
- return VecBase;
+ if (NumParts == 1 || UniqueBases.size() == 1)
+ return VecBase;
+ UseVecBaseAsInput = true;
+ auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
+ for (auto [I, Idx] : enumerate(Mask))
+ if (Idx != PoisonMaskElem)
+ Idx = I;
+ };
+ // Perform a multi-register vector shuffle, joining the parts into a single
+ // virtual long vector.
+ // Need to shuffle each part independently and then insert all these parts
+ // into a long virtual vector register, forming the original vector.
+ Value *Vec = nullptr;
+ SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
+ unsigned SliceSize = E->Scalars.size() / NumParts;
+ for (unsigned Part = 0; Part < NumParts; ++Part) {
+ ArrayRef<Value *> VL =
+ ArrayRef(E->Scalars).slice(Part * SliceSize, SliceSize);
+ MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
+ constexpr int MaxBases = 2;
+ SmallVector<Value *, MaxBases> Bases(MaxBases);
+#ifndef NDEBUG
+ int PrevSize = 0;
+#endif // NDEBUG
+ for (const auto [I, V]: enumerate(VL)) {
+ if (SubMask[I] == PoisonMaskElem)
+ continue;
+ Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
+ if (const TreeEntry *TE = R.getTreeEntry(VecOp))
+ VecOp = TE->VectorizedValue;
+ assert(VecOp && "Expected vectorized value.");
+ const int Size =
+ cast<FixedVectorType>(VecOp->getType())->getNumElements();
+#ifndef NDEBUG
+ assert((PrevSize == Size || PrevSize == 0) &&
+ "Expected vectors of the same size.");
+ PrevSize = Size;
+#endif // NDEBUG
+ Bases[SubMask[I] < Size ? 0 : 1] = VecOp;
+ }
+ if (!Bases.front())
+ continue;
+ Value *SubVec;
+ if (Bases.back()) {
+ SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
+ TransformToIdentity(SubMask);
+ } else {
+ SubVec = Bases.front();
+ }
+ if (!Vec) {
+ Vec = SubVec;
+ assert((Part == 0 || all_of(seq<unsigned>(0, Part),
+ [&](unsigned P) {
+ ArrayRef<int> SubMask =
+ Mask.slice(P * SliceSize, SliceSize);
+ return all_of(SubMask, [](int Idx) {
+ return Idx == PoisonMaskElem;
+ });
+ })) &&
+ "Expected first part or all previous parts masked.");
+ copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
+ } else {
+ unsigned VF = cast<FixedVectorType>(Vec->getType())->getNumElements();
+ if (Vec->getType() != SubVec->getType()) {
+ unsigned SubVecVF =
+ cast<FixedVectorType>(SubVec->getType())->getNumElements();
+ VF = std::max(VF, SubVecVF);
+ }
+ // Adjust SubMask.
+ for (auto [I, Idx] : enumerate(SubMask))
+ if (Idx != PoisonMaskElem)
+ Idx += VF;
+ copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
+ Vec = createShuffle(Vec, SubVec, VecMask);
+ TransformToIdentity(VecMask);
+ }
+ }
+ copy(VecMask, Mask.begin());
+ return Vec;
}
/// Checks if the specified entry \p E needs to be delayed because of its
/// dependency nodes.
- Value *needToDelay(const TreeEntry *E, ArrayRef<const TreeEntry *> Deps) {
+ std::optional<Value *>
+ needToDelay(const TreeEntry *E,
+ ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
// No need to delay emission if all deps are ready.
- if (all_of(Deps, [](const TreeEntry *TE) { return TE->VectorizedValue; }))
- return nullptr;
+ if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
+ return all_of(
+ TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
+ }))
+ return std::nullopt;
// Postpone gather emission, will be emitted after the end of the
// process to keep correct order.
auto *VecTy = FixedVectorType::get(E->Scalars.front()->getType(),
@@ -9487,6 +10297,16 @@ public:
VecTy, PoisonValue::get(PointerType::getUnqual(VecTy->getContext())),
MaybeAlign());
}
+ /// Adds 2 input vectors (in form of tree entries) and the mask for their
+ /// shuffling.
+ void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
+ add(E1.VectorizedValue, E2.VectorizedValue, Mask);
+ }
+ /// Adds single input vector (in form of tree entry) and the mask for its
+ /// shuffling.
+ void add(const TreeEntry &E1, ArrayRef<int> Mask) {
+ add(E1.VectorizedValue, Mask);
+ }
/// Adds 2 input vectors and the mask for their shuffling.
void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
@@ -9516,7 +10336,7 @@ public:
InVectors.push_back(V1);
}
/// Adds another one input vector and the mask for the shuffling.
- void add(Value *V1, ArrayRef<int> Mask) {
+ void add(Value *V1, ArrayRef<int> Mask, bool = false) {
if (InVectors.empty()) {
if (!isa<FixedVectorType>(V1->getType())) {
V1 = createShuffle(V1, nullptr, CommonMask);
@@ -9578,7 +10398,8 @@ public:
inversePermutation(Order, NewMask);
add(V1, NewMask);
}
- Value *gather(ArrayRef<Value *> VL, Value *Root = nullptr) {
+ Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
+ Value *Root = nullptr) {
return R.gather(VL, Root);
}
Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
@@ -9639,8 +10460,14 @@ public:
}
};
-Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
- ArrayRef<Value *> VL = E->getOperand(NodeIdx);
+Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
+ bool PostponedPHIs) {
+ ValueList &VL = E->getOperand(NodeIdx);
+ if (E->State == TreeEntry::PossibleStridedVectorize &&
+ !E->ReorderIndices.empty()) {
+ SmallVector<int> Mask(E->ReorderIndices.begin(), E->ReorderIndices.end());
+ reorderScalars(VL, Mask);
+ }
const unsigned VF = VL.size();
InstructionsState S = getSameOpcode(VL, *TLI);
// Special processing for GEPs bundle, which may include non-gep values.
@@ -9651,23 +10478,39 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
S = getSameOpcode(*It, *TLI);
}
if (S.getOpcode()) {
- if (TreeEntry *VE = getTreeEntry(S.OpValue);
- VE && VE->isSame(VL) &&
- (any_of(VE->UserTreeIndices,
- [E, NodeIdx](const EdgeInfo &EI) {
- return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
- }) ||
- any_of(VectorizableTree,
- [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
- return TE->isOperandGatherNode({E, NodeIdx}) &&
- VE->isSame(TE->Scalars);
- }))) {
+ auto CheckSameVE = [&](const TreeEntry *VE) {
+ return VE->isSame(VL) &&
+ (any_of(VE->UserTreeIndices,
+ [E, NodeIdx](const EdgeInfo &EI) {
+ return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
+ }) ||
+ any_of(VectorizableTree,
+ [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
+ return TE->isOperandGatherNode({E, NodeIdx}) &&
+ VE->isSame(TE->Scalars);
+ }));
+ };
+ TreeEntry *VE = getTreeEntry(S.OpValue);
+ bool IsSameVE = VE && CheckSameVE(VE);
+ if (!IsSameVE) {
+ auto It = MultiNodeScalars.find(S.OpValue);
+ if (It != MultiNodeScalars.end()) {
+ auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
+ return TE != VE && CheckSameVE(TE);
+ });
+ if (I != It->getSecond().end()) {
+ VE = *I;
+ IsSameVE = true;
+ }
+ }
+ }
+ if (IsSameVE) {
auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
ShuffleBuilder.add(V, Mask);
return ShuffleBuilder.finalize(std::nullopt);
};
- Value *V = vectorizeTree(VE);
+ Value *V = vectorizeTree(VE, PostponedPHIs);
if (VF != cast<FixedVectorType>(V->getType())->getNumElements()) {
if (!VE->ReuseShuffleIndices.empty()) {
// Reshuffle to get only unique values.
@@ -9740,14 +10583,7 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
assert(I->get()->UserTreeIndices.size() == 1 &&
"Expected only single user for the gather node.");
assert(I->get()->isSame(VL) && "Expected same list of scalars.");
- IRBuilder<>::InsertPointGuard Guard(Builder);
- if (E->getOpcode() != Instruction::InsertElement &&
- E->getOpcode() != Instruction::PHI) {
- Instruction *LastInst = &getLastInstructionInBundle(E);
- assert(LastInst && "Failed to find last instruction in bundle");
- Builder.SetInsertPoint(LastInst);
- }
- return vectorizeTree(I->get());
+ return vectorizeTree(I->get(), PostponedPHIs);
}
template <typename BVTy, typename ResTy, typename... Args>
@@ -9765,7 +10601,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
inversePermutation(E->ReorderIndices, ReorderMask);
if (!ReorderMask.empty())
reorderScalars(GatheredScalars, ReorderMask);
- auto FindReusedSplat = [&](SmallVectorImpl<int> &Mask) {
+ auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF) {
if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
return isa<UndefValue>(V) && !isa<PoisonValue>(V);
}))
@@ -9782,70 +10618,102 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
});
if (It == VectorizableTree.end())
return false;
- unsigned I =
- *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
- int Sz = Mask.size();
- if (all_of(Mask, [Sz](int Idx) { return Idx < 2 * Sz; }) &&
- ShuffleVectorInst::isIdentityMask(Mask))
+ int Idx;
+ if ((Mask.size() < InputVF &&
+ ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
+ Idx == 0) ||
+ (Mask.size() == InputVF &&
+ ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
std::iota(Mask.begin(), Mask.end(), 0);
- else
+ } else {
+ unsigned I =
+ *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
std::fill(Mask.begin(), Mask.end(), I);
+ }
return true;
};
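As a rough, self-contained illustration of the splat-reuse mask rewrite performed in the lambda above (the 4-lane mask and its values are made up): a mask that is already an identity, poison lanes aside, is rewritten to a plain iota sequence, while any other mask is collapsed to a splat of the first defined index.

#include <algorithm>
#include <cassert>
#include <numeric>
#include <vector>

int main() {
  constexpr int PoisonMaskElem = -1;                  // same sentinel the pass uses
  std::vector<int> Mask = {2, PoisonMaskElem, 2, 2};  // hypothetical 4-lane mask
  bool IsIdentity = true;                             // identity if every defined
  for (int I = 0, E = (int)Mask.size(); I != E; ++I)  // lane maps to itself
    IsIdentity &= Mask[I] == I || Mask[I] == PoisonMaskElem;
  if (IsIdentity) {
    std::iota(Mask.begin(), Mask.end(), 0);           // keep 0,1,2,3 as-is
  } else {
    int First = *std::find_if(Mask.begin(), Mask.end(),
                              [](int I) { return I != PoisonMaskElem; });
    std::fill(Mask.begin(), Mask.end(), First);       // splat of lane 2
  }
  assert(Mask == std::vector<int>({2, 2, 2, 2}));
}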
BVTy ShuffleBuilder(Params...);
ResTy Res = ResTy();
SmallVector<int> Mask;
- SmallVector<int> ExtractMask;
- std::optional<TargetTransformInfo::ShuffleKind> ExtractShuffle;
- std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle;
- SmallVector<const TreeEntry *> Entries;
+ SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
+ SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
+ Value *ExtractVecBase = nullptr;
+ bool UseVecBaseAsInput = false;
+ SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
+ SmallVector<SmallVector<const TreeEntry *>> Entries;
Type *ScalarTy = GatheredScalars.front()->getType();
+ auto *VecTy = FixedVectorType::get(ScalarTy, GatheredScalars.size());
+ unsigned NumParts = TTI->getNumberOfParts(VecTy);
+ if (NumParts == 0 || NumParts >= GatheredScalars.size())
+ NumParts = 1;
if (!all_of(GatheredScalars, UndefValue::classof)) {
// Check for gathered extracts.
- ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask);
- SmallVector<Value *> IgnoredVals;
- if (UserIgnoreList)
- IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end());
bool Resized = false;
- if (Value *VecBase = ShuffleBuilder.adjustExtracts(E, ExtractMask))
- if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
- if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) {
- Resized = true;
- GatheredScalars.append(VF - GatheredScalars.size(),
- PoisonValue::get(ScalarTy));
- }
+ ExtractShuffles =
+ tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
+ if (!ExtractShuffles.empty()) {
+ SmallVector<const TreeEntry *> ExtractEntries;
+ for (auto [Idx, I] : enumerate(ExtractMask)) {
+ if (I == PoisonMaskElem)
+ continue;
+ if (const auto *TE = getTreeEntry(
+ cast<ExtractElementInst>(E->Scalars[Idx])->getVectorOperand()))
+ ExtractEntries.push_back(TE);
+ }
+ if (std::optional<ResTy> Delayed =
+ ShuffleBuilder.needToDelay(E, ExtractEntries)) {
+ // Delay emission of gathers which are not ready yet.
+ PostponedGathers.insert(E);
+ // Postpone gather emission; it will be emitted after the end of the
+ // process to keep the correct order.
+ return *Delayed;
+ }
+ if (Value *VecBase = ShuffleBuilder.adjustExtracts(
+ E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
+ ExtractVecBase = VecBase;
+ if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
+ if (VF == VecBaseTy->getNumElements() &&
+ GatheredScalars.size() != VF) {
+ Resized = true;
+ GatheredScalars.append(VF - GatheredScalars.size(),
+ PoisonValue::get(ScalarTy));
+ }
+ }
+ }
// Gather extracts after we check for full matched gathers only.
- if (ExtractShuffle || E->getOpcode() != Instruction::Load ||
+ if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
E->isAltShuffle() ||
all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
isSplat(E->Scalars) ||
(E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
- GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries);
+ GatherShuffles =
+ isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
}
- if (GatherShuffle) {
- if (Value *Delayed = ShuffleBuilder.needToDelay(E, Entries)) {
+ if (!GatherShuffles.empty()) {
+ if (std::optional<ResTy> Delayed =
+ ShuffleBuilder.needToDelay(E, Entries)) {
// Delay emission of gathers which are not ready yet.
PostponedGathers.insert(E);
// Postpone gather emission; it will be emitted after the end of the
// process to keep the correct order.
- return Delayed;
+ return *Delayed;
}
- assert((Entries.size() == 1 || Entries.size() == 2) &&
- "Expected shuffle of 1 or 2 entries.");
- if (*GatherShuffle == TTI::SK_PermuteSingleSrc &&
- Entries.front()->isSame(E->Scalars)) {
+ if (GatherShuffles.size() == 1 &&
+ *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
+ Entries.front().front()->isSame(E->Scalars)) {
// Perfect match in the graph, will reuse the previously vectorized
// node. Cost is 0.
LLVM_DEBUG(
dbgs()
- << "SLP: perfect diamond match for gather bundle that starts with "
- << *E->Scalars.front() << ".\n");
+ << "SLP: perfect diamond match for gather bundle "
+ << shortBundleName(E->Scalars) << ".\n");
// Restore the mask for previous partially matched values.
- if (Entries.front()->ReorderIndices.empty() &&
- ((Entries.front()->ReuseShuffleIndices.empty() &&
- E->Scalars.size() == Entries.front()->Scalars.size()) ||
- (E->Scalars.size() ==
- Entries.front()->ReuseShuffleIndices.size()))) {
+ Mask.resize(E->Scalars.size());
+ const TreeEntry *FrontTE = Entries.front().front();
+ if (FrontTE->ReorderIndices.empty() &&
+ ((FrontTE->ReuseShuffleIndices.empty() &&
+ E->Scalars.size() == FrontTE->Scalars.size()) ||
+ (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
std::iota(Mask.begin(), Mask.end(), 0);
} else {
for (auto [I, V] : enumerate(E->Scalars)) {
@@ -9853,17 +10721,20 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
Mask[I] = PoisonMaskElem;
continue;
}
- Mask[I] = Entries.front()->findLaneForValue(V);
+ Mask[I] = FrontTE->findLaneForValue(V);
}
}
- ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask);
+ ShuffleBuilder.add(*FrontTE, Mask);
Res = ShuffleBuilder.finalize(E->getCommonMask());
return Res;
}
if (!Resized) {
- unsigned VF1 = Entries.front()->getVectorFactor();
- unsigned VF2 = Entries.back()->getVectorFactor();
- if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF)
+ if (GatheredScalars.size() != VF &&
+ any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
+ return any_of(TEs, [&](const TreeEntry *TE) {
+ return TE->getVectorFactor() == VF;
+ });
+ }))
GatheredScalars.append(VF - GatheredScalars.size(),
PoisonValue::get(ScalarTy));
}
@@ -9943,78 +10814,108 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
if (It != Scalars.end()) {
// Replace undefs by the non-poisoned scalars and emit broadcast.
int Pos = std::distance(Scalars.begin(), It);
- for_each(UndefPos, [&](int I) {
+ for (int I : UndefPos) {
// Set the undef position to the non-poisoned scalar.
ReuseMask[I] = Pos;
// Replace the undef by the poison, in the mask it is replaced by
// non-poisoned scalar already.
if (I != Pos)
Scalars[I] = PoisonValue::get(ScalarTy);
- });
+ }
} else {
// Replace undefs by the poisons, emit broadcast and then emit
// freeze.
- for_each(UndefPos, [&](int I) {
+ for (int I : UndefPos) {
ReuseMask[I] = PoisonMaskElem;
if (isa<UndefValue>(Scalars[I]))
Scalars[I] = PoisonValue::get(ScalarTy);
- });
+ }
NeedFreeze = true;
}
}
};
- if (ExtractShuffle || GatherShuffle) {
+ if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
bool IsNonPoisoned = true;
- bool IsUsedInExpr = false;
+ bool IsUsedInExpr = true;
Value *Vec1 = nullptr;
- if (ExtractShuffle) {
+ if (!ExtractShuffles.empty()) {
// Gather of extractelements can be represented as just a shuffle of
// a single/two vectors the scalars are extracted from.
// Find input vectors.
Value *Vec2 = nullptr;
for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
- if (ExtractMask[I] == PoisonMaskElem ||
- (!Mask.empty() && Mask[I] != PoisonMaskElem)) {
+ if (!Mask.empty() && Mask[I] != PoisonMaskElem)
ExtractMask[I] = PoisonMaskElem;
- continue;
- }
- if (isa<UndefValue>(E->Scalars[I]))
- continue;
- auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
- if (!Vec1) {
- Vec1 = EI->getVectorOperand();
- } else if (Vec1 != EI->getVectorOperand()) {
- assert((!Vec2 || Vec2 == EI->getVectorOperand()) &&
- "Expected only 1 or 2 vectors shuffle.");
- Vec2 = EI->getVectorOperand();
+ }
+ if (UseVecBaseAsInput) {
+ Vec1 = ExtractVecBase;
+ } else {
+ for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
+ if (ExtractMask[I] == PoisonMaskElem)
+ continue;
+ if (isa<UndefValue>(E->Scalars[I]))
+ continue;
+ auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
+ Value *VecOp = EI->getVectorOperand();
+ if (const auto *TE = getTreeEntry(VecOp))
+ if (TE->VectorizedValue)
+ VecOp = TE->VectorizedValue;
+ if (!Vec1) {
+ Vec1 = VecOp;
+ } else if (Vec1 != EI->getVectorOperand()) {
+ assert((!Vec2 || Vec2 == EI->getVectorOperand()) &&
+ "Expected only 1 or 2 vectors shuffle.");
+ Vec2 = VecOp;
+ }
}
}
if (Vec2) {
+ IsUsedInExpr = false;
IsNonPoisoned &=
isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2);
ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
} else if (Vec1) {
- IsUsedInExpr = FindReusedSplat(ExtractMask);
- ShuffleBuilder.add(Vec1, ExtractMask);
+ IsUsedInExpr &= FindReusedSplat(
+ ExtractMask,
+ cast<FixedVectorType>(Vec1->getType())->getNumElements());
+ ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
} else {
+ IsUsedInExpr = false;
ShuffleBuilder.add(PoisonValue::get(FixedVectorType::get(
ScalarTy, GatheredScalars.size())),
- ExtractMask);
+ ExtractMask, /*ForExtracts=*/true);
}
}
- if (GatherShuffle) {
- if (Entries.size() == 1) {
- IsUsedInExpr = FindReusedSplat(Mask);
- ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask);
- IsNonPoisoned &=
- isGuaranteedNotToBePoison(Entries.front()->VectorizedValue);
- } else {
- ShuffleBuilder.add(Entries.front()->VectorizedValue,
- Entries.back()->VectorizedValue, Mask);
- IsNonPoisoned &=
- isGuaranteedNotToBePoison(Entries.front()->VectorizedValue) &&
- isGuaranteedNotToBePoison(Entries.back()->VectorizedValue);
+ if (!GatherShuffles.empty()) {
+ unsigned SliceSize = E->Scalars.size() / NumParts;
+ SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
+ for (const auto [I, TEs] : enumerate(Entries)) {
+ if (TEs.empty()) {
+ assert(!GatherShuffles[I] &&
+ "No shuffles with empty entries list expected.");
+ continue;
+ }
+ assert((TEs.size() == 1 || TEs.size() == 2) &&
+ "Expected shuffle of 1 or 2 entries.");
+ auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize);
+ VecMask.assign(VecMask.size(), PoisonMaskElem);
+ copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
+ if (TEs.size() == 1) {
+ IsUsedInExpr &=
+ FindReusedSplat(VecMask, TEs.front()->getVectorFactor());
+ ShuffleBuilder.add(*TEs.front(), VecMask);
+ if (TEs.front()->VectorizedValue)
+ IsNonPoisoned &=
+ isGuaranteedNotToBePoison(TEs.front()->VectorizedValue);
+ } else {
+ IsUsedInExpr = false;
+ ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
+ if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
+ IsNonPoisoned &=
+ isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
+ isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
+ }
}
}
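A minimal sketch of how the per-part gather mask is assembled in the loop above (the 8-lane mask, the two parts, and the indices are invented): each part's slice of the common mask is copied into an otherwise-poison VecMask, mirroring how each Entries[Part] shuffle is added separately.

#include <algorithm>
#include <vector>

int main() {
  constexpr int PoisonMaskElem = -1;
  std::vector<int> Mask = {0, 1, 2, 3, 1, 0, 3, 2}; // hypothetical common mask
  unsigned NumParts = 2, SliceSize = Mask.size() / NumParts;
  for (unsigned Part = 0; Part < NumParts; ++Part) {
    std::vector<int> VecMask(Mask.size(), PoisonMaskElem);
    // Only this part's lanes receive real indices; all other lanes stay
    // poison before the per-part shuffle is added.
    std::copy(Mask.begin() + Part * SliceSize,
              Mask.begin() + (Part + 1) * SliceSize,
              VecMask.begin() + Part * SliceSize);
  }
}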
// Try to figure out best way to combine values: build a shuffle and insert
@@ -10025,16 +10926,24 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
int MSz = Mask.size();
// Try to build constant vector and shuffle with it only if currently we
// have a single permutation and more than 1 scalar constants.
- bool IsSingleShuffle = !ExtractShuffle || !GatherShuffle;
+ bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
bool IsIdentityShuffle =
- (ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc) ==
- TTI::SK_PermuteSingleSrc &&
+ ((UseVecBaseAsInput ||
+ all_of(ExtractShuffles,
+ [](const std::optional<TTI::ShuffleKind> &SK) {
+ return SK.value_or(TTI::SK_PermuteTwoSrc) ==
+ TTI::SK_PermuteSingleSrc;
+ })) &&
none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
- ShuffleVectorInst::isIdentityMask(ExtractMask)) ||
- (GatherShuffle.value_or(TTI::SK_PermuteTwoSrc) ==
- TTI::SK_PermuteSingleSrc &&
+ ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
+ (!GatherShuffles.empty() &&
+ all_of(GatherShuffles,
+ [](const std::optional<TTI::ShuffleKind> &SK) {
+ return SK.value_or(TTI::SK_PermuteTwoSrc) ==
+ TTI::SK_PermuteSingleSrc;
+ }) &&
none_of(Mask, [&](int I) { return I >= MSz; }) &&
- ShuffleVectorInst::isIdentityMask(Mask));
+ ShuffleVectorInst::isIdentityMask(Mask, MSz));
bool EnoughConstsForShuffle =
IsSingleShuffle &&
(none_of(GatheredScalars,
@@ -10064,7 +10973,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
if (!all_of(GatheredScalars, PoisonValue::classof)) {
SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
- Value *BV = ShuffleBuilder.gather(GatheredScalars);
+ Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
ShuffleBuilder.add(BV, BVMask);
}
if (all_of(NonConstants, [=](Value *V) {
@@ -10078,13 +10987,13 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
E->ReuseShuffleIndices, E->Scalars.size(),
[&](Value *&Vec, SmallVectorImpl<int> &Mask) {
TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
- Vec = ShuffleBuilder.gather(NonConstants, Vec);
+ Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
});
} else if (!allConstant(GatheredScalars)) {
// Gather unique scalars and all constants.
SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
- Value *BV = ShuffleBuilder.gather(GatheredScalars);
+ Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
ShuffleBuilder.add(BV, ReuseMask);
Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
} else {
@@ -10109,10 +11018,12 @@ Value *BoUpSLP::createBuildVector(const TreeEntry *E) {
*this);
}
-Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
+Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
IRBuilder<>::InsertPointGuard Guard(Builder);
- if (E->VectorizedValue) {
+ if (E->VectorizedValue &&
+ (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
+ E->isAltShuffle())) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
return E->VectorizedValue;
}
@@ -10126,13 +11037,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return Vec;
}
- auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
+ auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy,
+ bool IsSigned) {
+ if (V->getType() != VecTy)
+ V = Builder.CreateIntCast(V, VecTy, IsSigned);
ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
if (E->getOpcode() == Instruction::Store) {
ArrayRef<int> Mask =
ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
E->ReorderIndices.size());
ShuffleBuilder.add(V, Mask);
+ } else if (E->State == TreeEntry::PossibleStridedVectorize) {
+ ShuffleBuilder.addOrdered(V, std::nullopt);
} else {
ShuffleBuilder.addOrdered(V, E->ReorderIndices);
}
@@ -10140,7 +11056,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
};
assert((E->State == TreeEntry::Vectorize ||
- E->State == TreeEntry::ScatterVectorize) &&
+ E->State == TreeEntry::ScatterVectorize ||
+ E->State == TreeEntry::PossibleStridedVectorize) &&
"Unhandled state");
unsigned ShuffleOrOp =
E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
@@ -10150,6 +11067,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
ScalarTy = Store->getValueOperand()->getType();
else if (auto *IE = dyn_cast<InsertElementInst>(VL0))
ScalarTy = IE->getOperand(1)->getType();
+ bool IsSigned = false;
+ auto It = MinBWs.find(E);
+ if (It != MinBWs.end()) {
+ ScalarTy = IntegerType::get(F->getContext(), It->second.first);
+ IsSigned = It->second.second;
+ }
auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
switch (ShuffleOrOp) {
case Instruction::PHI: {
@@ -10157,32 +11080,45 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
E != VectorizableTree.front().get() ||
!E->UserTreeIndices.empty()) &&
"PHI reordering is free.");
+ if (PostponedPHIs && E->VectorizedValue)
+ return E->VectorizedValue;
auto *PH = cast<PHINode>(VL0);
- Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
- Builder.SetCurrentDebugLocation(PH->getDebugLoc());
- PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
- Value *V = NewPhi;
-
- // Adjust insertion point once all PHI's have been generated.
- Builder.SetInsertPoint(&*PH->getParent()->getFirstInsertionPt());
+ Builder.SetInsertPoint(PH->getParent(),
+ PH->getParent()->getFirstNonPHIIt());
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
+ if (PostponedPHIs || !E->VectorizedValue) {
+ PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
+ E->PHI = NewPhi;
+ Value *V = NewPhi;
+
+ // Adjust insertion point once all PHI's have been generated.
+ Builder.SetInsertPoint(PH->getParent(),
+ PH->getParent()->getFirstInsertionPt());
+ Builder.SetCurrentDebugLocation(PH->getDebugLoc());
- V = FinalShuffle(V, E);
+ V = FinalShuffle(V, E, VecTy, IsSigned);
- E->VectorizedValue = V;
+ E->VectorizedValue = V;
+ if (PostponedPHIs)
+ return V;
+ }
+ PHINode *NewPhi = cast<PHINode>(E->PHI);
+ // If phi node is fully emitted - exit.
+ if (NewPhi->getNumIncomingValues() != 0)
+ return NewPhi;
// PHINodes may have multiple entries from the same block. We want to
// visit every block once.
SmallPtrSet<BasicBlock *, 4> VisitedBBs;
- for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
+ for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
ValueList Operands;
- BasicBlock *IBB = PH->getIncomingBlock(i);
+ BasicBlock *IBB = PH->getIncomingBlock(I);
// Stop emission if all incoming values are generated.
if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
- return V;
+ return NewPhi;
}
if (!VisitedBBs.insert(IBB).second) {
@@ -10192,37 +11128,54 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Builder.SetInsertPoint(IBB->getTerminator());
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
- Value *Vec = vectorizeOperand(E, i);
+ Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
+ if (VecTy != Vec->getType()) {
+ assert(MinBWs.contains(getOperandEntry(E, I)) &&
+ "Expected item in MinBWs.");
+ Vec = Builder.CreateIntCast(Vec, VecTy, It->second.second);
+ }
NewPhi->addIncoming(Vec, IBB);
}
assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
"Invalid number of incoming values");
- return V;
+ return NewPhi;
}
case Instruction::ExtractElement: {
Value *V = E->getSingleOperand(0);
setInsertPointAfterBundle(E);
- V = FinalShuffle(V, E);
+ V = FinalShuffle(V, E, VecTy, IsSigned);
E->VectorizedValue = V;
return V;
}
case Instruction::ExtractValue: {
auto *LI = cast<LoadInst>(E->getSingleOperand(0));
Builder.SetInsertPoint(LI);
- auto *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
- Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
+ Value *Ptr = LI->getPointerOperand();
LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
Value *NewV = propagateMetadata(V, E->Scalars);
- NewV = FinalShuffle(NewV, E);
+ NewV = FinalShuffle(NewV, E, VecTy, IsSigned);
E->VectorizedValue = NewV;
return NewV;
}
case Instruction::InsertElement: {
assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
- Value *V = vectorizeOperand(E, 1);
+ Value *V = vectorizeOperand(E, 1, PostponedPHIs);
+ ArrayRef<Value *> Op = E->getOperand(1);
+ Type *ScalarTy = Op.front()->getType();
+ if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
+ assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
+ std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
+ assert(Res.first > 0 && "Expected item in MinBWs.");
+ V = Builder.CreateIntCast(
+ V,
+ FixedVectorType::get(
+ ScalarTy,
+ cast<FixedVectorType>(V->getType())->getNumElements()),
+ Res.second);
+ }
// Create InsertVector shuffle if necessary
auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
@@ -10255,7 +11208,57 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Mask[InsertIdx - Offset] = I;
}
if (!IsIdentity || NumElts != NumScalars) {
- V = Builder.CreateShuffleVector(V, Mask);
+ Value *V2 = nullptr;
+ bool IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V);
+ SmallVector<int> InsertMask(Mask);
+ if (NumElts != NumScalars && Offset == 0) {
+ // Follow all insert element instructions from the current buildvector
+ // sequence.
+ InsertElementInst *Ins = cast<InsertElementInst>(VL0);
+ do {
+ std::optional<unsigned> InsertIdx = getInsertIndex(Ins);
+ if (!InsertIdx)
+ break;
+ if (InsertMask[*InsertIdx] == PoisonMaskElem)
+ InsertMask[*InsertIdx] = *InsertIdx;
+ if (!Ins->hasOneUse())
+ break;
+ Ins = dyn_cast_or_null<InsertElementInst>(
+ Ins->getUniqueUndroppableUser());
+ } while (Ins);
+ SmallBitVector UseMask =
+ buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
+ SmallBitVector IsFirstPoison =
+ isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
+ SmallBitVector IsFirstUndef =
+ isUndefVector(FirstInsert->getOperand(0), UseMask);
+ if (!IsFirstPoison.all()) {
+ unsigned Idx = 0;
+ for (unsigned I = 0; I < NumElts; I++) {
+ if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
+ IsFirstUndef.test(I)) {
+ if (IsVNonPoisonous) {
+ InsertMask[I] = I < NumScalars ? I : 0;
+ continue;
+ }
+ if (!V2)
+ V2 = UndefValue::get(V->getType());
+ if (Idx >= NumScalars)
+ Idx = NumScalars - 1;
+ InsertMask[I] = NumScalars + Idx;
+ ++Idx;
+ } else if (InsertMask[I] != PoisonMaskElem &&
+ Mask[I] == PoisonMaskElem) {
+ InsertMask[I] = PoisonMaskElem;
+ }
+ }
+ } else {
+ InsertMask = Mask;
+ }
+ }
+ if (!V2)
+ V2 = PoisonValue::get(V->getType());
+ V = Builder.CreateShuffleVector(V, V2, InsertMask);
if (auto *I = dyn_cast<Instruction>(V)) {
GatherShuffleExtractSeq.insert(I);
CSEBlocks.insert(I->getParent());
@@ -10274,15 +11277,15 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
NumElts != NumScalars) {
if (IsFirstUndef.all()) {
- if (!ShuffleVectorInst::isIdentityMask(InsertMask)) {
- SmallBitVector IsFirstPoison =
- isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
- if (!IsFirstPoison.all()) {
- for (unsigned I = 0; I < NumElts; I++) {
- if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
- InsertMask[I] = I + NumElts;
+ if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
+ SmallBitVector IsFirstPoison =
+ isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
+ if (!IsFirstPoison.all()) {
+ for (unsigned I = 0; I < NumElts; I++) {
+ if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
+ InsertMask[I] = I + NumElts;
+ }
}
- }
V = Builder.CreateShuffleVector(
V,
IsFirstPoison.all() ? PoisonValue::get(V->getType())
@@ -10330,15 +11333,36 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::BitCast: {
setInsertPointAfterBundle(E);
- Value *InVec = vectorizeOperand(E, 0);
+ Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
}
auto *CI = cast<CastInst>(VL0);
- Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
- V = FinalShuffle(V, E);
+ Instruction::CastOps VecOpcode = CI->getOpcode();
+ Type *SrcScalarTy = VL0->getOperand(0)->getType();
+ auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
+ if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
+ (SrcIt != MinBWs.end() || It != MinBWs.end())) {
+ // Check if the values are candidates to demote.
+ unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
+ if (SrcIt != MinBWs.end())
+ SrcBWSz = SrcIt->second.first;
+ unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
+ if (BWSz == SrcBWSz) {
+ VecOpcode = Instruction::BitCast;
+ } else if (BWSz < SrcBWSz) {
+ VecOpcode = Instruction::Trunc;
+ } else if (It != MinBWs.end()) {
+ assert(BWSz > SrcBWSz && "Invalid cast!");
+ VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
+ }
+ }
+ Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
+ ? InVec
+ : Builder.CreateCast(VecOpcode, InVec, VecTy);
+ V = FinalShuffle(V, E, VecTy, IsSigned);
E->VectorizedValue = V;
++NumVectorInstructions;
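A small worked example of the cast-opcode choice made above (the bit widths are made up): with the source operand demoted to 16 bits and this entry demoted to 8 bits, the original cast degenerates into a truncate; equal widths would leave only a no-op bitcast, and a wider destination would re-extend using the recorded signedness.

#include <cstdio>

int main() {
  unsigned SrcBWSz = 16, BWSz = 8;          // hypothetical demoted widths
  bool IsSigned = true;                     // recorded signedness of the entry
  const char *VecOpcode;
  if (BWSz == SrcBWSz)
    VecOpcode = "bitcast";                  // same width: nothing to convert
  else if (BWSz < SrcBWSz)
    VecOpcode = "trunc";                    // narrowing
  else
    VecOpcode = IsSigned ? "sext" : "zext"; // widening
  std::printf("%s\n", VecOpcode);           // prints "trunc" for 16 -> 8
}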
@@ -10348,21 +11372,30 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::ICmp: {
setInsertPointAfterBundle(E);
- Value *L = vectorizeOperand(E, 0);
+ Value *L = vectorizeOperand(E, 0, PostponedPHIs);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
}
- Value *R = vectorizeOperand(E, 1);
+ Value *R = vectorizeOperand(E, 1, PostponedPHIs);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
}
+ if (L->getType() != R->getType()) {
+ assert((MinBWs.contains(getOperandEntry(E, 0)) ||
+ MinBWs.contains(getOperandEntry(E, 1))) &&
+ "Expected item in MinBWs.");
+ L = Builder.CreateIntCast(L, VecTy, IsSigned);
+ R = Builder.CreateIntCast(R, VecTy, IsSigned);
+ }
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
Value *V = Builder.CreateCmp(P0, L, R);
propagateIRFlags(V, E->Scalars, VL0);
- V = FinalShuffle(V, E);
+ // Do not cast for cmps.
+ VecTy = cast<FixedVectorType>(V->getType());
+ V = FinalShuffle(V, E, VecTy, IsSigned);
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -10371,24 +11404,31 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::Select: {
setInsertPointAfterBundle(E);
- Value *Cond = vectorizeOperand(E, 0);
+ Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
}
- Value *True = vectorizeOperand(E, 1);
+ Value *True = vectorizeOperand(E, 1, PostponedPHIs);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
}
- Value *False = vectorizeOperand(E, 2);
+ Value *False = vectorizeOperand(E, 2, PostponedPHIs);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
}
+ if (True->getType() != False->getType()) {
+ assert((MinBWs.contains(getOperandEntry(E, 1)) ||
+ MinBWs.contains(getOperandEntry(E, 2))) &&
+ "Expected item in MinBWs.");
+ True = Builder.CreateIntCast(True, VecTy, IsSigned);
+ False = Builder.CreateIntCast(False, VecTy, IsSigned);
+ }
Value *V = Builder.CreateSelect(Cond, True, False);
- V = FinalShuffle(V, E);
+ V = FinalShuffle(V, E, VecTy, IsSigned);
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -10397,7 +11437,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::FNeg: {
setInsertPointAfterBundle(E);
- Value *Op = vectorizeOperand(E, 0);
+ Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
@@ -10410,7 +11450,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (auto *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
- V = FinalShuffle(V, E);
+ V = FinalShuffle(V, E, VecTy, IsSigned);
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -10437,16 +11477,23 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::Xor: {
setInsertPointAfterBundle(E);
- Value *LHS = vectorizeOperand(E, 0);
+ Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
}
- Value *RHS = vectorizeOperand(E, 1);
+ Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
}
+ if (LHS->getType() != RHS->getType()) {
+ assert((MinBWs.contains(getOperandEntry(E, 0)) ||
+ MinBWs.contains(getOperandEntry(E, 1))) &&
+ "Expected item in MinBWs.");
+ LHS = Builder.CreateIntCast(LHS, VecTy, IsSigned);
+ RHS = Builder.CreateIntCast(RHS, VecTy, IsSigned);
+ }
Value *V = Builder.CreateBinOp(
static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
@@ -10455,7 +11502,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (auto *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
- V = FinalShuffle(V, E);
+ V = FinalShuffle(V, E, VecTy, IsSigned);
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -10476,14 +11523,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
// The pointer operand uses an in-tree scalar so we add the new
// LoadInst to ExternalUses list to make sure that an extract will
// be generated in the future.
- if (TreeEntry *Entry = getTreeEntry(PO)) {
- // Find which lane we need to extract.
- unsigned FoundLane = Entry->findLaneForValue(PO);
- ExternalUses.emplace_back(PO, NewLI, FoundLane);
+ if (isa<Instruction>(PO)) {
+ if (TreeEntry *Entry = getTreeEntry(PO)) {
+ // Find which lane we need to extract.
+ unsigned FoundLane = Entry->findLaneForValue(PO);
+ ExternalUses.emplace_back(PO, NewLI, FoundLane);
+ }
}
} else {
- assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
- Value *VecPtr = vectorizeOperand(E, 0);
+ assert((E->State == TreeEntry::ScatterVectorize ||
+ E->State == TreeEntry::PossibleStridedVectorize) &&
+ "Unhandled state");
+ Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
@@ -10497,35 +11548,32 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
Value *V = propagateMetadata(NewLI, E->Scalars);
- V = FinalShuffle(V, E);
+ V = FinalShuffle(V, E, VecTy, IsSigned);
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
}
case Instruction::Store: {
auto *SI = cast<StoreInst>(VL0);
- unsigned AS = SI->getPointerAddressSpace();
setInsertPointAfterBundle(E);
- Value *VecValue = vectorizeOperand(E, 0);
- VecValue = FinalShuffle(VecValue, E);
+ Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
+ VecValue = FinalShuffle(VecValue, E, VecTy, IsSigned);
- Value *ScalarPtr = SI->getPointerOperand();
- Value *VecPtr = Builder.CreateBitCast(
- ScalarPtr, VecValue->getType()->getPointerTo(AS));
+ Value *Ptr = SI->getPointerOperand();
StoreInst *ST =
- Builder.CreateAlignedStore(VecValue, VecPtr, SI->getAlign());
+ Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
- // The pointer operand uses an in-tree scalar, so add the new BitCast or
- // StoreInst to ExternalUses to make sure that an extract will be
- // generated in the future.
- if (TreeEntry *Entry = getTreeEntry(ScalarPtr)) {
- // Find which lane we need to extract.
- unsigned FoundLane = Entry->findLaneForValue(ScalarPtr);
- ExternalUses.push_back(ExternalUser(
- ScalarPtr, ScalarPtr != VecPtr ? cast<User>(VecPtr) : ST,
- FoundLane));
+ // The pointer operand uses an in-tree scalar, so add the new StoreInst to
+ // ExternalUses to make sure that an extract will be generated in the
+ // future.
+ if (isa<Instruction>(Ptr)) {
+ if (TreeEntry *Entry = getTreeEntry(Ptr)) {
+ // Find which lane we need to extract.
+ unsigned FoundLane = Entry->findLaneForValue(Ptr);
+ ExternalUses.push_back(ExternalUser(Ptr, ST, FoundLane));
+ }
}
Value *V = propagateMetadata(ST, E->Scalars);
@@ -10538,7 +11586,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
auto *GEP0 = cast<GetElementPtrInst>(VL0);
setInsertPointAfterBundle(E);
- Value *Op0 = vectorizeOperand(E, 0);
+ Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
@@ -10546,7 +11594,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
SmallVector<Value *> OpVecs;
for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
- Value *OpVec = vectorizeOperand(E, J);
+ Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
@@ -10564,7 +11612,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
V = propagateMetadata(I, GEPs);
}
- V = FinalShuffle(V, E);
+ V = FinalShuffle(V, E, VecTy, IsSigned);
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -10586,41 +11634,42 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
VecCallCosts.first <= VecCallCosts.second;
Value *ScalarArg = nullptr;
- std::vector<Value *> OpVecs;
+ SmallVector<Value *> OpVecs;
SmallVector<Type *, 2> TysForDecl;
// Add return type if intrinsic is overloaded on it.
if (isVectorIntrinsicWithOverloadTypeAtArg(IID, -1))
TysForDecl.push_back(
FixedVectorType::get(CI->getType(), E->Scalars.size()));
- for (int j = 0, e = CI->arg_size(); j < e; ++j) {
+ for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
ValueList OpVL;
// Some intrinsics have scalar arguments. This argument should not be
// vectorized.
- if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(IID, j)) {
+ if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(IID, I)) {
CallInst *CEI = cast<CallInst>(VL0);
- ScalarArg = CEI->getArgOperand(j);
- OpVecs.push_back(CEI->getArgOperand(j));
- if (isVectorIntrinsicWithOverloadTypeAtArg(IID, j))
+ ScalarArg = CEI->getArgOperand(I);
+ OpVecs.push_back(CEI->getArgOperand(I));
+ if (isVectorIntrinsicWithOverloadTypeAtArg(IID, I))
TysForDecl.push_back(ScalarArg->getType());
continue;
}
- Value *OpVec = vectorizeOperand(E, j);
+ Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
}
- LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
OpVecs.push_back(OpVec);
- if (isVectorIntrinsicWithOverloadTypeAtArg(IID, j))
+ if (isVectorIntrinsicWithOverloadTypeAtArg(IID, I))
TysForDecl.push_back(OpVec->getType());
}
Function *CF;
if (!UseIntrinsic) {
VFShape Shape =
- VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>(
- VecTy->getNumElements())),
+ VFShape::get(CI->getFunctionType(),
+ ElementCount::getFixed(
+ static_cast<unsigned>(VecTy->getNumElements())),
false /*HasGlobalPred*/);
CF = VFDatabase(*CI).getVectorizedFunction(Shape);
} else {
@@ -10634,7 +11683,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
// The scalar argument uses an in-tree scalar so we add the new vectorized
// call to ExternalUses list to make sure that an extract will be
// generated in the future.
- if (ScalarArg) {
+ if (isa_and_present<Instruction>(ScalarArg)) {
if (TreeEntry *Entry = getTreeEntry(ScalarArg)) {
// Find which lane we need to extract.
unsigned FoundLane = Entry->findLaneForValue(ScalarArg);
@@ -10644,7 +11693,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
propagateIRFlags(V, E->Scalars, VL0);
- V = FinalShuffle(V, E);
+ V = FinalShuffle(V, E, VecTy, IsSigned);
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -10662,20 +11711,27 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *LHS = nullptr, *RHS = nullptr;
if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
setInsertPointAfterBundle(E);
- LHS = vectorizeOperand(E, 0);
+ LHS = vectorizeOperand(E, 0, PostponedPHIs);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
}
- RHS = vectorizeOperand(E, 1);
+ RHS = vectorizeOperand(E, 1, PostponedPHIs);
} else {
setInsertPointAfterBundle(E);
- LHS = vectorizeOperand(E, 0);
+ LHS = vectorizeOperand(E, 0, PostponedPHIs);
}
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
}
+ if (LHS && RHS && LHS->getType() != RHS->getType()) {
+ assert((MinBWs.contains(getOperandEntry(E, 0)) ||
+ MinBWs.contains(getOperandEntry(E, 1))) &&
+ "Expected item in MinBWs.");
+ LHS = Builder.CreateIntCast(LHS, VecTy, IsSigned);
+ RHS = Builder.CreateIntCast(RHS, VecTy, IsSigned);
+ }
Value *V0, *V1;
if (Instruction::isBinaryOp(E->getOpcode())) {
@@ -10708,8 +11764,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
// each vector operation.
ValueList OpScalars, AltScalars;
SmallVector<int> Mask;
- buildShuffleEntryMask(
- E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
+ E->buildAltOpShuffleMask(
[E, this](Instruction *I) {
assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
@@ -10727,6 +11782,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
CSEBlocks.insert(I->getParent());
}
+ if (V->getType() != VecTy && !isa<CmpInst>(VL0))
+ V = Builder.CreateIntCast(
+ V, FixedVectorType::get(ScalarTy, E->getVectorFactor()), IsSigned);
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -10767,9 +11825,19 @@ Value *BoUpSLP::vectorizeTree(
// need to rebuild it.
EntryToLastInstruction.clear();
- Builder.SetInsertPoint(ReductionRoot ? ReductionRoot
- : &F->getEntryBlock().front());
- auto *VectorRoot = vectorizeTree(VectorizableTree[0].get());
+ if (ReductionRoot)
+ Builder.SetInsertPoint(ReductionRoot->getParent(),
+ ReductionRoot->getIterator());
+ else
+ Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
+
+ // Postpone emission of PHI operands to avoid cyclic dependency issues.
+ (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
+ for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
+ if (TE->State == TreeEntry::Vectorize &&
+ TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
+ TE->VectorizedValue)
+ (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
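The cyclic-dependency problem this two-pass emission avoids can be pictured with a toy standalone sketch (the names phi.a/phi.b and the map are invented): placeholders for every phi are created first, and incoming values are wired up only once every placeholder already exists.

#include <map>
#include <string>
#include <vector>

int main() {
  std::map<std::string, std::vector<std::string>> PhiOperands = {
      {"phi.a", {"phi.b"}}, {"phi.b", {"phi.a"}}}; // mutually recursive phis
  std::map<std::string, bool> Emitted;
  for (const auto &Entry : PhiOperands)   // pass 1: PostponedPHIs == true
    Emitted[Entry.first] = true;          // create the placeholder only
  for (const auto &Entry : PhiOperands)   // pass 2: PostponedPHIs == false
    for (const std::string &Op : Entry.second)
      (void)Emitted.at(Op);               // every operand placeholder exists
}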
// Run through the list of postponed gathers and emit them, replacing the temp
// emitted allocas with actual vector instructions.
ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
@@ -10786,9 +11854,32 @@ Value *BoUpSLP::vectorizeTree(
TE->VectorizedValue = nullptr;
auto *UserI =
cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
- Builder.SetInsertPoint(PrevVec);
+ // If the user is a PHI node, its vector code has to be inserted right
+ // before the block terminator. Since the node was delayed, there were some
+ // unresolved dependencies at the moment when the stub instruction was
+ // emitted. If any of these dependencies turn out to be an operand of
+ // another PHI coming from this same block, the position of the stub
+ // instruction becomes invalid, because the source vector that is supposed
+ // to feed this gather node was inserted at the end of the block [after the
+ // stub instruction]. So we need to adjust the insertion point again to the
+ // end of the block.
+ if (isa<PHINode>(UserI)) {
+ // Insert before all users.
+ Instruction *InsertPt = PrevVec->getParent()->getTerminator();
+ for (User *U : PrevVec->users()) {
+ if (U == UserI)
+ continue;
+ auto *UI = dyn_cast<Instruction>(U);
+ if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
+ continue;
+ if (UI->comesBefore(InsertPt))
+ InsertPt = UI;
+ }
+ Builder.SetInsertPoint(InsertPt);
+ } else {
+ Builder.SetInsertPoint(PrevVec);
+ }
Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
- Value *Vec = vectorizeTree(TE);
+ Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
PrevVec->replaceAllUsesWith(Vec);
PostponedValues.try_emplace(Vec).first->second.push_back(TE);
// Replace the stub vector node, if it was used before for one of the
@@ -10801,26 +11892,6 @@ Value *BoUpSLP::vectorizeTree(
eraseInstruction(PrevVec);
}
- // If the vectorized tree can be rewritten in a smaller type, we truncate the
- // vectorized root. InstCombine will then rewrite the entire expression. We
- // sign extend the extracted values below.
- auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
- if (MinBWs.count(ScalarRoot)) {
- if (auto *I = dyn_cast<Instruction>(VectorRoot)) {
- // If current instr is a phi and not the last phi, insert it after the
- // last phi node.
- if (isa<PHINode>(I))
- Builder.SetInsertPoint(&*I->getParent()->getFirstInsertionPt());
- else
- Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
- }
- auto BundleWidth = VectorizableTree[0]->Scalars.size();
- auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
- auto *VecTy = FixedVectorType::get(MinTy, BundleWidth);
- auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
- VectorizableTree[0]->VectorizedValue = Trunc;
- }
-
LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
<< " values .\n");
@@ -10830,6 +11901,8 @@ Value *BoUpSLP::vectorizeTree(
// Maps extract Scalar to the corresponding extractelement instruction in the
// basic block. Only one extractelement per block should be emitted.
DenseMap<Value *, DenseMap<BasicBlock *, Instruction *>> ScalarToEEs;
+ SmallDenseSet<Value *, 4> UsedInserts;
+ DenseMap<Value *, Value *> VectorCasts;
// Extract all of the elements with the external uses.
for (const auto &ExternalUse : ExternalUses) {
Value *Scalar = ExternalUse.Scalar;
@@ -10864,7 +11937,8 @@ Value *BoUpSLP::vectorizeTree(
Instruction *I = EEIt->second;
if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
Builder.GetInsertPoint()->comesBefore(I))
- I->moveBefore(&*Builder.GetInsertPoint());
+ I->moveBefore(*Builder.GetInsertPoint()->getParent(),
+ Builder.GetInsertPoint());
Ex = I;
}
}
@@ -10887,11 +11961,10 @@ Value *BoUpSLP::vectorizeTree(
}
// If necessary, sign-extend or zero-extend ScalarRoot
// to the larger type.
- if (!MinBWs.count(ScalarRoot))
- return Ex;
- if (MinBWs[ScalarRoot].second)
- return Builder.CreateSExt(Ex, Scalar->getType());
- return Builder.CreateZExt(Ex, Scalar->getType());
+ if (Scalar->getType() != Ex->getType())
+ return Builder.CreateIntCast(Ex, Scalar->getType(),
+ MinBWs.find(E)->second.second);
+ return Ex;
}
assert(isa<FixedVectorType>(Scalar->getType()) &&
isa<InsertElementInst>(Scalar) &&
@@ -10909,12 +11982,13 @@ Value *BoUpSLP::vectorizeTree(
"ExternallyUsedValues map");
if (auto *VecI = dyn_cast<Instruction>(Vec)) {
if (auto *PHI = dyn_cast<PHINode>(VecI))
- Builder.SetInsertPoint(PHI->getParent()->getFirstNonPHI());
+ Builder.SetInsertPoint(PHI->getParent(),
+ PHI->getParent()->getFirstNonPHIIt());
else
Builder.SetInsertPoint(VecI->getParent(),
std::next(VecI->getIterator()));
} else {
- Builder.SetInsertPoint(&F->getEntryBlock().front());
+ Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
}
Value *NewInst = ExtractAndExtendIfNeeded(Vec);
// Required to update internally referenced instructions.
@@ -10927,12 +12001,26 @@ Value *BoUpSLP::vectorizeTree(
// Skip if the scalar is another vector op or Vec is not an instruction.
if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
+ if (!UsedInserts.insert(VU).second)
+ continue;
+ // Need to use original vector, if the root is truncated.
+ auto BWIt = MinBWs.find(E);
+ if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
+ auto VecIt = VectorCasts.find(Scalar);
+ if (VecIt == VectorCasts.end()) {
+ IRBuilder<>::InsertPointGuard Guard(Builder);
+ if (auto *IVec = dyn_cast<Instruction>(Vec))
+ Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
+ Vec = Builder.CreateIntCast(Vec, VU->getType(),
+ BWIt->second.second);
+ VectorCasts.try_emplace(Scalar, Vec);
+ } else {
+ Vec = VecIt->second;
+ }
+ }
+
std::optional<unsigned> InsertIdx = getInsertIndex(VU);
if (InsertIdx) {
- // Need to use original vector, if the root is truncated.
- if (MinBWs.count(Scalar) &&
- VectorizableTree[0]->VectorizedValue == Vec)
- Vec = VectorRoot;
auto *It =
find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) {
// Checks if 2 insertelements are from the same buildvector.
@@ -10992,18 +12080,18 @@ Value *BoUpSLP::vectorizeTree(
// Find the insertion point for the extractelement lane.
if (auto *VecI = dyn_cast<Instruction>(Vec)) {
if (PHINode *PH = dyn_cast<PHINode>(User)) {
- for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
- if (PH->getIncomingValue(i) == Scalar) {
+ for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
+ if (PH->getIncomingValue(I) == Scalar) {
Instruction *IncomingTerminator =
- PH->getIncomingBlock(i)->getTerminator();
+ PH->getIncomingBlock(I)->getTerminator();
if (isa<CatchSwitchInst>(IncomingTerminator)) {
Builder.SetInsertPoint(VecI->getParent(),
std::next(VecI->getIterator()));
} else {
- Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
+ Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
}
Value *NewInst = ExtractAndExtendIfNeeded(Vec);
- PH->setOperand(i, NewInst);
+ PH->setOperand(I, NewInst);
}
}
} else {
@@ -11012,7 +12100,7 @@ Value *BoUpSLP::vectorizeTree(
User->replaceUsesOfWith(Scalar, NewInst);
}
} else {
- Builder.SetInsertPoint(&F->getEntryBlock().front());
+ Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
Value *NewInst = ExtractAndExtendIfNeeded(Vec);
User->replaceUsesOfWith(Scalar, NewInst);
}
@@ -11085,7 +12173,7 @@ Value *BoUpSLP::vectorizeTree(
// non-resizing mask.
if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
->getNumElements() ||
- !ShuffleVectorInst::isIdentityMask(Mask))
+ !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
return CreateShuffle(Vals.front(), nullptr, Mask);
return Vals.front();
}
@@ -11676,7 +12764,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
}
}
- auto makeControlDependent = [&](Instruction *I) {
+ auto MakeControlDependent = [&](Instruction *I) {
auto *DepDest = getScheduleData(I);
assert(DepDest && "must be in schedule window");
DepDest->ControlDependencies.push_back(BundleMember);
@@ -11698,7 +12786,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
continue;
// Add the dependency
- makeControlDependent(I);
+ MakeControlDependent(I);
if (!isGuaranteedToTransferExecutionToSuccessor(I))
// Everything past here must be control dependent on I.
@@ -11724,7 +12812,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
continue;
// Add the dependency
- makeControlDependent(I);
+ MakeControlDependent(I);
}
}
@@ -11742,7 +12830,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
continue;
// Add the dependency
- makeControlDependent(I);
+ MakeControlDependent(I);
break;
}
}
@@ -11757,7 +12845,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
"NextLoadStore list for non memory effecting bundle?");
MemoryLocation SrcLoc = getLocation(SrcInst);
bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
- unsigned numAliased = 0;
+ unsigned NumAliased = 0;
unsigned DistToSrc = 1;
for (; DepDest; DepDest = DepDest->NextLoadStore) {
@@ -11772,13 +12860,13 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
// check this limit even between two read-only instructions.
if (DistToSrc >= MaxMemDepDistance ||
((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
- (numAliased >= AliasedCheckLimit ||
+ (NumAliased >= AliasedCheckLimit ||
SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
// We increment the counter only if the locations are aliased
// (instead of counting all alias checks). This gives a better
// balance between reduced runtime and accurate dependencies.
- numAliased++;
+ NumAliased++;
DepDest->MemoryDependencies.push_back(BundleMember);
BundleMember->Dependencies++;
@@ -11880,20 +12968,20 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
// Do the "real" scheduling.
while (!ReadyInsts.empty()) {
- ScheduleData *picked = *ReadyInsts.begin();
+ ScheduleData *Picked = *ReadyInsts.begin();
ReadyInsts.erase(ReadyInsts.begin());
// Move the scheduled instruction(s) to their dedicated places, if not
// there yet.
- for (ScheduleData *BundleMember = picked; BundleMember;
+ for (ScheduleData *BundleMember = Picked; BundleMember;
BundleMember = BundleMember->NextInBundle) {
- Instruction *pickedInst = BundleMember->Inst;
- if (pickedInst->getNextNode() != LastScheduledInst)
- pickedInst->moveBefore(LastScheduledInst);
- LastScheduledInst = pickedInst;
+ Instruction *PickedInst = BundleMember->Inst;
+ if (PickedInst->getNextNode() != LastScheduledInst)
+ PickedInst->moveBefore(LastScheduledInst);
+ LastScheduledInst = PickedInst;
}
- BS->schedule(picked, ReadyInsts);
+ BS->schedule(Picked, ReadyInsts);
}
// Check that we didn't break any of our invariants.
@@ -11994,21 +13082,22 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) {
// Determine if a value V in a vectorizable expression Expr can be demoted to a
// smaller type with a truncation. We collect the values that will be demoted
// in ToDemote and additional roots that require investigating in Roots.
-static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
- SmallVectorImpl<Value *> &ToDemote,
- SmallVectorImpl<Value *> &Roots) {
+bool BoUpSLP::collectValuesToDemote(
+ Value *V, SmallVectorImpl<Value *> &ToDemote,
+ DenseMap<Instruction *, SmallVector<unsigned>> &DemotedConsts,
+ SmallVectorImpl<Value *> &Roots, DenseSet<Value *> &Visited) const {
// We can always demote constants.
- if (isa<Constant>(V)) {
- ToDemote.push_back(V);
+ if (isa<Constant>(V))
return true;
- }
- // If the value is not an instruction in the expression with only one use, it
- // cannot be demoted.
+ // If the value is not a vectorized instruction in the expression with only
+ // one use, it cannot be demoted.
auto *I = dyn_cast<Instruction>(V);
- if (!I || !I->hasOneUse() || !Expr.count(I))
+ if (!I || !I->hasOneUse() || !getTreeEntry(I) || !Visited.insert(I).second)
return false;
+ unsigned Start = 0;
+ unsigned End = I->getNumOperands();
switch (I->getOpcode()) {
// We can always demote truncations and extensions. Since truncations can
@@ -12030,16 +13119,21 @@ static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
- if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) ||
- !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots))
+ if (!collectValuesToDemote(I->getOperand(0), ToDemote, DemotedConsts, Roots,
+ Visited) ||
+ !collectValuesToDemote(I->getOperand(1), ToDemote, DemotedConsts, Roots,
+ Visited))
return false;
break;
// We can demote selects if we can demote their true and false values.
case Instruction::Select: {
+ Start = 1;
SelectInst *SI = cast<SelectInst>(I);
- if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) ||
- !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots))
+ if (!collectValuesToDemote(SI->getTrueValue(), ToDemote, DemotedConsts,
+ Roots, Visited) ||
+ !collectValuesToDemote(SI->getFalseValue(), ToDemote, DemotedConsts,
+ Roots, Visited))
return false;
break;
}
@@ -12049,7 +13143,8 @@ static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
case Instruction::PHI: {
PHINode *PN = cast<PHINode>(I);
for (Value *IncValue : PN->incoming_values())
- if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots))
+ if (!collectValuesToDemote(IncValue, ToDemote, DemotedConsts, Roots,
+ Visited))
return false;
break;
}
@@ -12059,6 +13154,10 @@ static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
return false;
}
+ // Gather demoted constant operands.
+ for (unsigned Idx : seq<unsigned>(Start, End))
+ if (isa<Constant>(I->getOperand(Idx)))
+ DemotedConsts.try_emplace(I).first->getSecond().push_back(Idx);
// Record the value that we can demote.
ToDemote.push_back(V);
return true;
@@ -12076,44 +13175,26 @@ void BoUpSLP::computeMinimumValueSizes() {
if (!TreeRootIT)
return;
- // If the expression is not rooted by a store, these roots should have
- // external uses. We will rely on InstCombine to rewrite the expression in
- // the narrower type. However, InstCombine only rewrites single-use values.
- // This means that if a tree entry other than a root is used externally, it
- // must have multiple uses and InstCombine will not rewrite it. The code
- // below ensures that only the roots are used externally.
- SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end());
- for (auto &EU : ExternalUses)
- if (!Expr.erase(EU.Scalar))
- return;
- if (!Expr.empty())
+ // Ensure the roots of the vectorizable tree don't form a cycle.
+ if (!VectorizableTree.front()->UserTreeIndices.empty())
return;
- // Collect the scalar values of the vectorizable expression. We will use this
- // context to determine which values can be demoted. If we see a truncation,
- // we mark it as seeding another demotion.
- for (auto &EntryPtr : VectorizableTree)
- Expr.insert(EntryPtr->Scalars.begin(), EntryPtr->Scalars.end());
-
- // Ensure the roots of the vectorizable tree don't form a cycle. They must
- // have a single external user that is not in the vectorizable tree.
- for (auto *Root : TreeRoot)
- if (!Root->hasOneUse() || Expr.count(*Root->user_begin()))
- return;
-
// Conservatively determine if we can actually truncate the roots of the
// expression. Collect the values that can be demoted in ToDemote and
// additional roots that require investigating in Roots.
SmallVector<Value *, 32> ToDemote;
+ DenseMap<Instruction *, SmallVector<unsigned>> DemotedConsts;
SmallVector<Value *, 4> Roots;
- for (auto *Root : TreeRoot)
- if (!collectValuesToDemote(Root, Expr, ToDemote, Roots))
+ for (auto *Root : TreeRoot) {
+ DenseSet<Value *> Visited;
+ if (!collectValuesToDemote(Root, ToDemote, DemotedConsts, Roots, Visited))
return;
+ }
// The maximum bit width required to represent all the values that can be
// demoted without loss of precision. It would be safe to truncate the roots
// of the expression to this width.
- auto MaxBitWidth = 8u;
+ auto MaxBitWidth = 1u;
// We first check if all the bits of the roots are demanded. If they're not,
// we can truncate the roots to this narrower type.
@@ -12138,9 +13219,9 @@ void BoUpSLP::computeMinimumValueSizes() {
// maximum bit width required to store the scalar by using ValueTracking to
// compute the number of high-order bits we can truncate.
if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) &&
- llvm::all_of(TreeRoot, [](Value *R) {
- assert(R->hasOneUse() && "Root should have only one use!");
- return isa<GetElementPtrInst>(R->user_back());
+ all_of(TreeRoot, [](Value *V) {
+ return all_of(V->users(),
+ [](User *U) { return isa<GetElementPtrInst>(U); });
})) {
MaxBitWidth = 8u;
@@ -12189,12 +13270,39 @@ void BoUpSLP::computeMinimumValueSizes() {
// If we can truncate the root, we must collect additional values that might
// be demoted as a result. That is, those seeded by truncations we will
// modify.
- while (!Roots.empty())
- collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);
+ while (!Roots.empty()) {
+ DenseSet<Value *> Visited;
+ collectValuesToDemote(Roots.pop_back_val(), ToDemote, DemotedConsts, Roots,
+ Visited);
+ }
// Finally, map the values we can demote to the maximum bit width we computed.
- for (auto *Scalar : ToDemote)
- MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive);
+ for (auto *Scalar : ToDemote) {
+ auto *TE = getTreeEntry(Scalar);
+ assert(TE && "Expected vectorized scalar.");
+ if (MinBWs.contains(TE))
+ continue;
+ bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
+ KnownBits Known = computeKnownBits(R, *DL);
+ return !Known.isNonNegative();
+ });
+ MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
+ const auto *I = cast<Instruction>(Scalar);
+ auto DCIt = DemotedConsts.find(I);
+ if (DCIt != DemotedConsts.end()) {
+ for (unsigned Idx : DCIt->getSecond()) {
+ // Check that the constant operand is demoted in all the instructions.
+ if (all_of(TE->Scalars, [&](Value *V) {
+ auto SIt = DemotedConsts.find(cast<Instruction>(V));
+ return SIt != DemotedConsts.end() &&
+ is_contained(SIt->getSecond(), Idx);
+ })) {
+ const TreeEntry *CTE = getOperandEntry(TE, Idx);
+ MinBWs.try_emplace(CTE, MaxBitWidth, IsSigned);
+ }
+ }
+ }
+ }
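As a rough standalone illustration of how the signedness flag for a demoted bundle is derived in the loop above (the values and the 8-bit width are invented): a bundle is marked signed if any of its scalars may be negative, which is what the !Known.isNonNegative() test approximates.

#include <cstdint>
#include <cstdio>

int main() {
  int32_t Bundle[] = {17, -3, 100};  // hypothetical bundle of i32 scalars
  unsigned MaxBitWidth = 8;          // width all values were demoted to
  bool IsSigned = false;
  for (int32_t V : Bundle)
    IsSigned |= V < 0;               // stand-in for !Known.isNonNegative()
  std::printf("MinBW=%u signed=%d\n", MaxBitWidth, IsSigned); // MinBW=8 signed=1
}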
}
PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
@@ -12348,139 +13456,206 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
BoUpSLP::ValueSet VectorizedStores;
bool Changed = false;
- int E = Stores.size();
- SmallBitVector Tails(E, false);
- int MaxIter = MaxStoreLookup.getValue();
- SmallVector<std::pair<int, int>, 16> ConsecutiveChain(
- E, std::make_pair(E, INT_MAX));
- SmallVector<SmallBitVector, 4> CheckedPairs(E, SmallBitVector(E, false));
- int IterCnt;
- auto &&FindConsecutiveAccess = [this, &Stores, &Tails, &IterCnt, MaxIter,
- &CheckedPairs,
- &ConsecutiveChain](int K, int Idx) {
- if (IterCnt >= MaxIter)
- return true;
- if (CheckedPairs[Idx].test(K))
- return ConsecutiveChain[K].second == 1 &&
- ConsecutiveChain[K].first == Idx;
- ++IterCnt;
- CheckedPairs[Idx].set(K);
- CheckedPairs[K].set(Idx);
- std::optional<int> Diff = getPointersDiff(
- Stores[K]->getValueOperand()->getType(), Stores[K]->getPointerOperand(),
- Stores[Idx]->getValueOperand()->getType(),
- Stores[Idx]->getPointerOperand(), *DL, *SE, /*StrictCheck=*/true);
- if (!Diff || *Diff == 0)
- return false;
- int Val = *Diff;
- if (Val < 0) {
- if (ConsecutiveChain[Idx].second > -Val) {
- Tails.set(K);
- ConsecutiveChain[Idx] = std::make_pair(K, -Val);
- }
- return false;
+ // Stores the pairs of stores (first_store, last_store) for ranges that have
+ // already been tried for vectorization. This allows skipping store ranges
+ // whose previous vectorization attempts were unsuccessful.
+ DenseSet<std::pair<Value *, Value *>> TriedSequences;
+ struct StoreDistCompare {
+ bool operator()(const std::pair<unsigned, int> &Op1,
+ const std::pair<unsigned, int> &Op2) const {
+ return Op1.second < Op2.second;
}
- if (ConsecutiveChain[K].second <= Val)
- return false;
-
- Tails.set(Idx);
- ConsecutiveChain[K] = std::make_pair(Idx, Val);
- return Val == 1;
};
- // Do a quadratic search on all of the given stores in reverse order and find
- // all of the pairs of stores that follow each other.
- for (int Idx = E - 1; Idx >= 0; --Idx) {
- // If a store has multiple consecutive store candidates, search according
- // to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ...
- // This is because usually pairing with immediate succeeding or preceding
- // candidate create the best chance to find slp vectorization opportunity.
- const int MaxLookDepth = std::max(E - Idx, Idx + 1);
- IterCnt = 0;
- for (int Offset = 1, F = MaxLookDepth; Offset < F; ++Offset)
- if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) ||
- (Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx)))
- break;
- }
-
- // Tracks if we tried to vectorize stores starting from the given tail
- // already.
- SmallBitVector TriedTails(E, false);
- // For stores that start but don't end a link in the chain:
- for (int Cnt = E; Cnt > 0; --Cnt) {
- int I = Cnt - 1;
- if (ConsecutiveChain[I].first == E || Tails.test(I))
- continue;
- // We found a store instr that starts a chain. Now follow the chain and try
- // to vectorize it.
+ // A set of pairs (index of store in Stores array ref, Distance of the store
+ // address relative to base store address in units).
+ using StoreIndexToDistSet =
+ std::set<std::pair<unsigned, int>, StoreDistCompare>;
+ auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
+ int PrevDist = -1;
BoUpSLP::ValueList Operands;
// Collect the chain into a list.
- while (I != E && !VectorizedStores.count(Stores[I])) {
- Operands.push_back(Stores[I]);
- Tails.set(I);
- if (ConsecutiveChain[I].second != 1) {
- // Mark the new end in the chain and go back, if required. It might be
- // required if the original stores come in reversed order, for example.
- if (ConsecutiveChain[I].first != E &&
- Tails.test(ConsecutiveChain[I].first) && !TriedTails.test(I) &&
- !VectorizedStores.count(Stores[ConsecutiveChain[I].first])) {
- TriedTails.set(I);
- Tails.reset(ConsecutiveChain[I].first);
- if (Cnt < ConsecutiveChain[I].first + 2)
- Cnt = ConsecutiveChain[I].first + 2;
+ for (auto [Idx, Data] : enumerate(Set)) {
+ if (Operands.empty() || Data.second - PrevDist == 1) {
+ Operands.push_back(Stores[Data.first]);
+ PrevDist = Data.second;
+ if (Idx != Set.size() - 1)
+ continue;
+ }
+ if (Operands.size() <= 1) {
+ Operands.clear();
+ Operands.push_back(Stores[Data.first]);
+ PrevDist = Data.second;
+ continue;
+ }
+
+ unsigned MaxVecRegSize = R.getMaxVecRegSize();
+ unsigned EltSize = R.getVectorElementSize(Operands[0]);
+ unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
+
+ unsigned MaxVF =
+ std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
+ auto *Store = cast<StoreInst>(Operands[0]);
+ Type *StoreTy = Store->getValueOperand()->getType();
+ Type *ValueTy = StoreTy;
+ if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
+ ValueTy = Trunc->getSrcTy();
+ unsigned MinVF = TTI->getStoreMinimumVF(
+ R.getMinVF(DL->getTypeSizeInBits(ValueTy)), StoreTy, ValueTy);
+
+ if (MaxVF <= MinVF) {
+ LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
+ << ") <= "
+ << "MinVF (" << MinVF << ")\n");
+ }
+
+ // FIXME: Is division-by-2 the correct step? Should we assert that the
+ // register size is a power-of-2?
+ unsigned StartIdx = 0;
+ for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) {
+ for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
+ ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
+ assert(
+ all_of(
+ Slice,
+ [&](Value *V) {
+ return cast<StoreInst>(V)->getValueOperand()->getType() ==
+ cast<StoreInst>(Slice.front())
+ ->getValueOperand()
+ ->getType();
+ }) &&
+ "Expected all operands of same type.");
+ if (!VectorizedStores.count(Slice.front()) &&
+ !VectorizedStores.count(Slice.back()) &&
+ TriedSequences.insert(std::make_pair(Slice.front(), Slice.back()))
+ .second &&
+ vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
+ // Mark the vectorized stores so that we don't vectorize them again.
+ VectorizedStores.insert(Slice.begin(), Slice.end());
+ Changed = true;
+ // If we vectorized initial block, no need to try to vectorize it
+ // again.
+ if (Cnt == StartIdx)
+ StartIdx += Size;
+ Cnt += Size;
+ continue;
+ }
+ ++Cnt;
}
- break;
+ // Check if the whole array was vectorized already - exit.
+ if (StartIdx >= Operands.size())
+ break;
}
- // Move to the next value in the chain.
- I = ConsecutiveChain[I].first;
+ Operands.clear();
+ Operands.push_back(Stores[Data.first]);
+ PrevDist = Data.second;
}
- assert(!Operands.empty() && "Expected non-empty list of stores.");
+ };
- unsigned MaxVecRegSize = R.getMaxVecRegSize();
- unsigned EltSize = R.getVectorElementSize(Operands[0]);
- unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
-
- unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store),
- MaxElts);
- auto *Store = cast<StoreInst>(Operands[0]);
- Type *StoreTy = Store->getValueOperand()->getType();
- Type *ValueTy = StoreTy;
- if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
- ValueTy = Trunc->getSrcTy();
- unsigned MinVF = TTI->getStoreMinimumVF(
- R.getMinVF(DL->getTypeSizeInBits(ValueTy)), StoreTy, ValueTy);
-
- if (MaxVF <= MinVF) {
- LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF << ") <= "
- << "MinVF (" << MinVF << ")\n");
- }
-
- // FIXME: Is division-by-2 the correct step? Should we assert that the
- // register size is a power-of-2?
- unsigned StartIdx = 0;
- for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) {
- for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
- ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
- if (!VectorizedStores.count(Slice.front()) &&
- !VectorizedStores.count(Slice.back()) &&
- vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
- // Mark the vectorized stores so that we don't vectorize them again.
- VectorizedStores.insert(Slice.begin(), Slice.end());
- Changed = true;
- // If we vectorized initial block, no need to try to vectorize it
- // again.
- if (Cnt == StartIdx)
- StartIdx += Size;
- Cnt += Size;
- continue;
- }
- ++Cnt;
+ // Each element stores a pair (first: index of the store in the Stores array
+ // ref whose address is taken as the base, second: sorted set of pairs
+ // {index, dist}, which are the indices of stores in the set and their store
+ // location distances relative to the base address).
+
+ // The index of the very first store must be kept separately, since the set
+ // may be reordered after an insertion and the first store may be moved. This
+ // container reduces the number of calls to the getPointersDiff() function.
+ SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
+ // Inserts the specified store SI with the given index Idx into the set of
+ // stores. If a store with the same distance has already been inserted - stop
+ // the insertion and try to vectorize the already found stores. If some stores
+ // from this sequence were not vectorized - try to vectorize them together
+ // with the new store later. But this logic is applied only to the stores that
+ // come before the previous store with the same distance.
+ // Example:
+ // 1. store x, %p
+ // 2. store y, %p+1
+ // 3. store z, %p+2
+ // 4. store a, %p
+ // 5. store b, %p+3
+ // - Scan this from the last to first store. The very first bunch of stores is
+ // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
+ // vector).
+ // - The next store in the list - #1 - has the same distance from store #5 as
+ // the store #4.
+ // - Try to vectorize sequence of stores 4,2,3,5.
+ // - If all these stores are vectorized - just drop them.
+ // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
+ // - Start new stores sequence.
+ // The new bunch of stores is {1, {1, 0}}.
+ // - Add the stores from the previous sequence that were not vectorized.
+ // Here we consider the stores in reverse order, rather than the order in
+ // which they appear in the IR (Stores is already reversed, see the
+ // vectorizeStoreChains() function).
+ // Store #3 can be added - it comes after store #4, which has the same
+ // distance as store #1.
+ // Store #5 cannot be added - it comes before store #4.
+ // This logic improves compile time: we assume that the stores after the
+ // previous store with the same distance most likely have memory dependencies,
+ // so there is no need to waste compile time trying to vectorize them.
+ // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
+ auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
+ for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
+ std::optional<int> Diff = getPointersDiff(
+ Stores[Set.first]->getValueOperand()->getType(),
+ Stores[Set.first]->getPointerOperand(),
+ SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
+ /*StrictCheck=*/true);
+ if (!Diff)
+ continue;
+ auto It = Set.second.find(std::make_pair(Idx, *Diff));
+ if (It == Set.second.end()) {
+ Set.second.emplace(Idx, *Diff);
+ return;
}
- // Check if the whole array was vectorized already - exit.
- if (StartIdx >= Operands.size())
- break;
+ // Try to vectorize the first found set to avoid duplicate analysis.
+ TryToVectorize(Set.second);
+ StoreIndexToDistSet PrevSet;
+ PrevSet.swap(Set.second);
+ Set.first = Idx;
+ Set.second.emplace(Idx, 0);
+ // Insert stores that followed previous match to try to vectorize them
+ // with this store.
+ unsigned StartIdx = It->first + 1;
+ SmallBitVector UsedStores(Idx - StartIdx);
+ // Distances to previously found dup store (or this store, since they
+ // store to the same addresses).
+ SmallVector<int> Dists(Idx - StartIdx, 0);
+ for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
+ // Do not try to vectorize sequences, we already tried.
+ if (Pair.first <= It->first ||
+ VectorizedStores.contains(Stores[Pair.first]))
+ break;
+ unsigned BI = Pair.first - StartIdx;
+ UsedStores.set(BI);
+ Dists[BI] = Pair.second - It->second;
+ }
+ for (unsigned I = StartIdx; I < Idx; ++I) {
+ unsigned BI = I - StartIdx;
+ if (UsedStores.test(BI))
+ Set.second.emplace(I, Dists[BI]);
+ }
+ return;
}
+ auto &Res = SortedStores.emplace_back();
+ Res.first = Idx;
+ Res.second.emplace(Idx, 0);
+ };
+ StoreInst *PrevStore = Stores.front();
+ for (auto [I, SI] : enumerate(Stores)) {
+ // Check that we do not try to vectorize stores of different types.
+ if (PrevStore->getValueOperand()->getType() !=
+ SI->getValueOperand()->getType()) {
+ for (auto &Set : SortedStores)
+ TryToVectorize(Set.second);
+ SortedStores.clear();
+ PrevStore = SI;
+ }
+ FillStoresSet(I, SI);
}
+ // Final vectorization attempt.
+ for (auto &Set : SortedStores)
+ TryToVectorize(Set.second);
+
return Changed;
}
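
The rewritten vectorizeStores() above replaces the old quadratic pairwise search: candidate stores are kept in a std::set ordered by their element distance from a base store, and chains are built from runs where the distance grows by exactly one. The following plain-C++ sketch captures that chain-building idea under those assumptions; all names are hypothetical.

  #include <set>
  #include <utility>
  #include <vector>

  using IndexDist = std::pair<unsigned, int>; // (store index, distance to base)

  struct DistCompare {
    bool operator()(const IndexDist &A, const IndexDist &B) const {
      return A.second < B.second; // order purely by distance to the base store
    }
  };

  // Collect maximal runs of consecutive distances from a distance-ordered set;
  // each run is a candidate store chain for vectorization.
  std::vector<std::vector<unsigned>>
  collectConsecutiveChains(const std::set<IndexDist, DistCompare> &Set) {
    std::vector<std::vector<unsigned>> Chains;
    std::vector<unsigned> Current;
    int PrevDist = 0;
    for (const IndexDist &P : Set) {
      if (Current.empty() || P.second - PrevDist == 1) {
        Current.push_back(P.first); // extends the current consecutive run
      } else {
        if (Current.size() > 1)
          Chains.push_back(Current);
        Current.assign(1, P.first); // start a new run from this store
      }
      PrevDist = P.second;
    }
    if (Current.size() > 1)
      Chains.push_back(Current);
    return Chains;
  }

With the example from the comment block earlier in this function, the set {{4,-3},{2,-2},{3,-1},{5,0}} yields the single chain 4, 2, 3, 5.
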
@@ -12507,8 +13682,10 @@ void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
// constant index, or a pointer operand that doesn't point to a scalar
// type.
else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
- auto Idx = GEP->idx_begin()->get();
- if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
+ if (GEP->getNumIndices() != 1)
+ continue;
+ Value *Idx = GEP->idx_begin()->get();
+ if (isa<Constant>(Idx))
continue;
if (!isValidElementType(Idx->getType()))
continue;
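
The seed collection now only keeps GEPs with a single, non-constant index (the real code additionally validates the index type via the file's isValidElementType helper). A minimal sketch of that filter, with a hypothetical helper name:

  #include "llvm/IR/Constants.h"
  #include "llvm/IR/Instructions.h"

  // Returns the single non-constant index of the GEP, or nullptr if the GEP
  // does not qualify as a vectorization seed (hypothetical helper).
  static llvm::Value *getNonConstantGEPIndex(llvm::GetElementPtrInst *GEP) {
    if (GEP->getNumIndices() != 1)
      return nullptr;                 // multi-index GEPs are skipped
    llvm::Value *Idx = GEP->idx_begin()->get();
    if (llvm::isa<llvm::Constant>(Idx))
      return nullptr;                 // constant indices are not seeds
    return Idx;                       // candidate for the GEP seed list
  }
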
@@ -12542,8 +13719,8 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
// NOTE: the following will give user internal llvm type name, which may
// not be useful.
R.getORE()->emit([&]() {
- std::string type_str;
- llvm::raw_string_ostream rso(type_str);
+ std::string TypeStr;
+ llvm::raw_string_ostream rso(TypeStr);
Ty->print(rso);
return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
<< "Cannot SLP vectorize list: type "
@@ -12878,10 +14055,12 @@ class HorizontalReduction {
static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS,
Value *RHS, const Twine &Name,
const ReductionOpsListType &ReductionOps) {
- bool UseSelect = ReductionOps.size() == 2 ||
- // Logical or/and.
- (ReductionOps.size() == 1 &&
- isa<SelectInst>(ReductionOps.front().front()));
+ bool UseSelect =
+ ReductionOps.size() == 2 ||
+ // Logical or/and.
+ (ReductionOps.size() == 1 && any_of(ReductionOps.front(), [](Value *V) {
+ return isa<SelectInst>(V);
+ }));
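
UseSelect distinguishes the two ways a boolean and/or reduction step can be materialized: as a plain binary operator or as the poison-safe select form (select %a, %b, false). The sketch below shows the two emissions for "and"; it is an illustration under that assumption, not the pass's createOp, and the helper name is hypothetical.

  #include "llvm/IR/IRBuilder.h"

  static llvm::Value *createBoolAnd(llvm::IRBuilder<> &Builder,
                                    llvm::Value *LHS, llvm::Value *RHS,
                                    bool UseSelect) {
    if (UseSelect)
      // Logical form: RHS only matters when LHS is true, so poison in RHS
      // cannot leak when LHS is false.
      return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), "rdx.and");
    return Builder.CreateAnd(LHS, RHS, "rdx.and");
  }
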
assert((!UseSelect || ReductionOps.size() != 2 ||
isa<SelectInst>(ReductionOps[1][0])) &&
"Expected cmp + select pairs for reduction");
@@ -13315,12 +14494,26 @@ public:
// Update the final value in the reduction.
Builder.SetCurrentDebugLocation(
cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
+ if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) ||
+ (isGuaranteedNotToBePoison(Res) &&
+ !isGuaranteedNotToBePoison(VectorizedTree))) {
+ auto It = ReducedValsToOps.find(Res);
+ if (It != ReducedValsToOps.end() &&
+ any_of(It->getSecond(),
+ [](Instruction *I) { return isBoolLogicOp(I); }))
+ std::swap(VectorizedTree, Res);
+ }
+
return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
ReductionOps);
}
// Initialize the final value in the reduction.
return Res;
};
+ bool AnyBoolLogicOp =
+ any_of(ReductionOps.back(), [](Value *V) {
+ return isBoolLogicOp(cast<Instruction>(V));
+ });
// The reduction root is used as the insertion point for new instructions,
// so set it as externally used to prevent it from being deleted.
ExternallyUsedValues[ReductionRoot];
@@ -13364,10 +14557,12 @@ public:
 // Check if the reduction value was not overridden by the extractelement
// instruction because of the vectorization and exclude it, if it is not
// compatible with other values.
- if (auto *Inst = dyn_cast<Instruction>(RdxVal))
- if (isVectorLikeInstWithConstOps(Inst) &&
- (!S.getOpcode() || !S.isOpcodeOrAlt(Inst)))
- continue;
+ // Also check if the instruction was folded to constant/other value.
+ auto *Inst = dyn_cast<Instruction>(RdxVal);
+ if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
+ (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) ||
+ (S.getOpcode() && !Inst))
+ continue;
Candidates.push_back(RdxVal);
TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
}
@@ -13543,11 +14738,9 @@ public:
for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
continue;
- for_each(ReducedVals[Cnt],
- [&LocalExternallyUsedValues, &TrackedVals](Value *V) {
- if (isa<Instruction>(V))
- LocalExternallyUsedValues[TrackedVals[V]];
- });
+ for (Value *V : ReducedVals[Cnt])
+ if (isa<Instruction>(V))
+ LocalExternallyUsedValues[TrackedVals[V]];
}
if (!IsSupportedHorRdxIdentityOp) {
// Number of uses of the candidates in the vector of values.
@@ -13591,7 +14784,7 @@ public:
// Update LocalExternallyUsedValues for the scalar, replaced by
// extractelement instructions.
for (const std::pair<Value *, Value *> &Pair : ReplacedExternals) {
- auto It = ExternallyUsedValues.find(Pair.first);
+ auto *It = ExternallyUsedValues.find(Pair.first);
if (It == ExternallyUsedValues.end())
continue;
LocalExternallyUsedValues[Pair.second].append(It->second);
@@ -13605,7 +14798,8 @@ public:
InstructionCost ReductionCost =
getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
InstructionCost Cost = TreeCost + ReductionCost;
- LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for reduction\n");
+ LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
+ << " for reduction\n");
if (!Cost.isValid())
return nullptr;
if (Cost >= -SLPCostThreshold) {
@@ -13652,7 +14846,9 @@ public:
// To prevent poison from leaking across what used to be sequential,
// safe, scalar boolean logic operations, the reduction operand must be
// frozen.
- if (isBoolLogicOp(RdxRootInst))
+ if ((isBoolLogicOp(RdxRootInst) ||
+ (AnyBoolLogicOp && VL.size() != TrackedVals.size())) &&
+ !isGuaranteedNotToBePoison(VectorizedRoot))
VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
// Emit code to correctly handle reused reduced values, if required.
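
The freeze above exists so that poison cannot leak across what used to be short-circuited scalar boolean logic, and it is now skipped when the operand is provably poison-free. A standalone sketch of that guard, assuming only IRBuilder and ValueTracking; the helper name is hypothetical.

  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/IRBuilder.h"

  static llvm::Value *freezeIfMayBePoison(llvm::IRBuilder<> &Builder,
                                          llvm::Value *Root) {
    // Freezing is only needed when the value is not already known poison-free.
    if (llvm::isGuaranteedNotToBePoison(Root))
      return Root;
    return Builder.CreateFreeze(Root, "rdx.frozen");
  }
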
@@ -13664,6 +14860,16 @@ public:
Value *ReducedSubTree =
emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
+ if (ReducedSubTree->getType() != VL.front()->getType()) {
+ ReducedSubTree = Builder.CreateIntCast(
+ ReducedSubTree, VL.front()->getType(), any_of(VL, [&](Value *R) {
+ KnownBits Known = computeKnownBits(
+ R, cast<Instruction>(ReductionOps.front().front())
+ ->getModule()
+ ->getDataLayout());
+ return !Known.isNonNegative();
+ }));
+ }
// Improved analysis for add/fadd/xor reductions with same scale factor
// for all operands of reductions. We can emit scalar ops for them
@@ -13716,31 +14922,33 @@ public:
// RedOp2 = select i1 ?, i1 RHS, i1 false
// Then, we must freeze LHS in the new op.
- auto &&FixBoolLogicalOps =
- [&Builder, VectorizedTree](Value *&LHS, Value *&RHS,
- Instruction *RedOp1, Instruction *RedOp2) {
- if (!isBoolLogicOp(RedOp1))
- return;
- if (LHS == VectorizedTree || getRdxOperand(RedOp1, 0) == LHS ||
- isGuaranteedNotToBePoison(LHS))
- return;
- if (!isBoolLogicOp(RedOp2))
- return;
- if (RHS == VectorizedTree || getRdxOperand(RedOp2, 0) == RHS ||
- isGuaranteedNotToBePoison(RHS)) {
- std::swap(LHS, RHS);
- return;
- }
- LHS = Builder.CreateFreeze(LHS);
- };
+ auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
+ Instruction *RedOp1,
+ Instruction *RedOp2,
+ bool InitStep) {
+ if (!AnyBoolLogicOp)
+ return;
+ if (isBoolLogicOp(RedOp1) &&
+ ((!InitStep && LHS == VectorizedTree) ||
+ getRdxOperand(RedOp1, 0) == LHS || isGuaranteedNotToBePoison(LHS)))
+ return;
+ if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
+ getRdxOperand(RedOp2, 0) == RHS ||
+ isGuaranteedNotToBePoison(RHS))) {
+ std::swap(LHS, RHS);
+ return;
+ }
+ if (LHS != VectorizedTree)
+ LHS = Builder.CreateFreeze(LHS);
+ };
// Finish the reduction.
 // Need to add the extra arguments and the possible reduction values that
 // were not vectorized.
// Try to avoid dependencies between the scalar remainders after
// reductions.
- auto &&FinalGen =
- [this, &Builder, &TrackedVals, &FixBoolLogicalOps](
- ArrayRef<std::pair<Instruction *, Value *>> InstVals) {
+ auto FinalGen =
+ [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
+ bool InitStep) {
unsigned Sz = InstVals.size();
SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
Sz % 2);
@@ -13761,7 +14969,7 @@ public:
// sequential, safe, scalar boolean logic operations, the
// reduction operand must be frozen.
FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
- RedOp);
+ RedOp, InitStep);
Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
StableRdxVal2, "op.rdx", ReductionOps);
ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
@@ -13791,11 +14999,13 @@ public:
ExtraReductions.emplace_back(I, Pair.first);
}
// Iterate through all not-vectorized reduction values/extra arguments.
+ bool InitStep = true;
while (ExtraReductions.size() > 1) {
VectorizedTree = ExtraReductions.front().second;
SmallVector<std::pair<Instruction *, Value *>> NewReds =
- FinalGen(ExtraReductions);
+ FinalGen(ExtraReductions, InitStep);
ExtraReductions.swap(NewReds);
+ InitStep = false;
}
VectorizedTree = ExtraReductions.front().second;
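
FinalGen folds the leftover scalar reduction values pairwise, halving the worklist each round (hence the Sz / 2 + Sz % 2 sizing) until a single value remains. A generic sketch of that pairwise folding, assuming nothing beyond the standard library; names are hypothetical.

  #include <vector>

  // Repeatedly combine adjacent elements until one value is left; an odd
  // trailing element is carried over unchanged to the next round.
  // Assumes a non-empty input vector.
  template <typename T, typename CombineFn>
  T foldPairwise(std::vector<T> Vals, CombineFn Combine) {
    while (Vals.size() > 1) {
      std::vector<T> Next((Vals.size() + 1) / 2);
      for (size_t I = 0; I + 1 < Vals.size(); I += 2)
        Next[I / 2] = Combine(Vals[I], Vals[I + 1]);
      if (Vals.size() % 2)
        Next.back() = Vals.back();
      Vals.swap(Next);
    }
    return Vals.front();
  }
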
@@ -13842,8 +15052,7 @@ private:
bool IsCmpSelMinMax, unsigned ReduxWidth,
FastMathFlags FMF) {
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
- Value *FirstReducedVal = ReducedVals.front();
- Type *ScalarTy = FirstReducedVal->getType();
+ Type *ScalarTy = ReducedVals.front()->getType();
FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth);
InstructionCost VectorCost = 0, ScalarCost;
// If all of the reduced values are constant, the vector cost is 0, since
@@ -13917,7 +15126,7 @@ private:
}
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
- << " for reduction that starts with " << *FirstReducedVal
+ << " for reduction of " << shortBundleName(ReducedVals)
<< " (It is a splitting reduction)\n");
return VectorCost - ScalarCost;
}
@@ -13932,7 +15141,7 @@ private:
"A call to the llvm.fmuladd intrinsic is not handled yet");
++NumVectorInstructions;
- return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind);
+ return createSimpleTargetReduction(Builder, VectorizedValue, RdxKind);
}
/// Emits optimized code for unique scalar value reused \p Cnt times.
@@ -13979,8 +15188,8 @@ private:
case RecurKind::Mul:
case RecurKind::FMul:
case RecurKind::FMulAdd:
- case RecurKind::SelectICmp:
- case RecurKind::SelectFCmp:
+ case RecurKind::IAnyOf:
+ case RecurKind::FAnyOf:
case RecurKind::None:
llvm_unreachable("Unexpected reduction kind for repeated scalar.");
}
@@ -14068,8 +15277,8 @@ private:
case RecurKind::Mul:
case RecurKind::FMul:
case RecurKind::FMulAdd:
- case RecurKind::SelectICmp:
- case RecurKind::SelectFCmp:
+ case RecurKind::IAnyOf:
+ case RecurKind::FAnyOf:
case RecurKind::None:
llvm_unreachable("Unexpected reduction kind for reused scalars.");
}
@@ -14164,8 +15373,8 @@ static bool findBuildAggregate(Instruction *LastInsertInst,
InsertElts.resize(*AggregateSize);
findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0);
- llvm::erase_value(BuildVectorOpds, nullptr);
- llvm::erase_value(InsertElts, nullptr);
+ llvm::erase(BuildVectorOpds, nullptr);
+ llvm::erase(InsertElts, nullptr);
if (BuildVectorOpds.size() >= 2)
return true;
@@ -14401,8 +15610,7 @@ bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
BasicBlock *BB, BoUpSLP &R) {
- const DataLayout &DL = BB->getModule()->getDataLayout();
- if (!R.canMapToVector(IVI->getType(), DL))
+ if (!R.canMapToVector(IVI->getType()))
return false;
SmallVector<Value *, 16> BuildVectorOpds;
@@ -14541,11 +15749,11 @@ static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
if (BasePred1 > BasePred2)
return false;
// Compare operands.
- bool LEPreds = Pred1 <= Pred2;
- bool GEPreds = Pred1 >= Pred2;
+ bool CI1Preds = Pred1 == BasePred1;
+ bool CI2Preds = Pred2 == BasePred1;
for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
- auto *Op1 = CI1->getOperand(LEPreds ? I : E - I - 1);
- auto *Op2 = CI2->getOperand(GEPreds ? I : E - I - 1);
+ auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
+ auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
if (Op1->getValueID() < Op2->getValueID())
return !IsCompatibility;
if (Op1->getValueID() > Op2->getValueID())
@@ -14691,14 +15899,20 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
return true;
if (Opcodes1.size() > Opcodes2.size())
return false;
- std::optional<bool> ConstOrder;
for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
// Undefs are compatible with any other value.
if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I])) {
- if (!ConstOrder)
- ConstOrder =
- !isa<UndefValue>(Opcodes1[I]) && isa<UndefValue>(Opcodes2[I]);
- continue;
+ if (isa<Instruction>(Opcodes1[I]))
+ return true;
+ if (isa<Instruction>(Opcodes2[I]))
+ return false;
+ if (isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]))
+ return true;
+ if (isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]))
+ return false;
+ if (isa<UndefValue>(Opcodes1[I]) && isa<UndefValue>(Opcodes2[I]))
+ continue;
+ return isa<UndefValue>(Opcodes2[I]);
}
if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
@@ -14714,21 +15928,26 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
if (NodeI1 != NodeI2)
return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
InstructionsState S = getSameOpcode({I1, I2}, *TLI);
- if (S.getOpcode())
+ if (S.getOpcode() && !S.isAltShuffle())
continue;
return I1->getOpcode() < I2->getOpcode();
}
- if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I])) {
- if (!ConstOrder)
- ConstOrder = Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID();
- continue;
- }
+ if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
+ return Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID();
+ if (isa<Instruction>(Opcodes1[I]))
+ return true;
+ if (isa<Instruction>(Opcodes2[I]))
+ return false;
+ if (isa<Constant>(Opcodes1[I]))
+ return true;
+ if (isa<Constant>(Opcodes2[I]))
+ return false;
if (Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID())
return true;
if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID())
return false;
}
- return ConstOrder && *ConstOrder;
+ return false;
};
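
When the per-operand comparisons above are inconclusive, the comparator falls back to a fixed ranking: defined instructions sort first, then non-undef constants, then undefs. A compact sketch of that ranking (approximate: values that are neither instructions nor constants also land in the last bucket); helper names are hypothetical.

  #include "llvm/IR/Constants.h"
  #include "llvm/IR/Instruction.h"

  // Rank used for tie-breaking between two operands: a real instruction sorts
  // first, then a non-undef constant, then undef (and anything else).
  static int operandRank(const llvm::Value *V) {
    if (llvm::isa<llvm::Instruction>(V))
      return 0;
    if (llvm::isa<llvm::Constant>(V) && !llvm::isa<llvm::UndefValue>(V))
      return 1;
    return 2;
  }

  // "Less than" in the sort: the operand with the smaller rank comes first.
  static bool ranksBefore(const llvm::Value *A, const llvm::Value *B) {
    return operandRank(A) < operandRank(B);
  }
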
auto AreCompatiblePHIs = [&PHIToOpcodes, this](Value *V1, Value *V2) {
if (V1 == V2)
@@ -14776,6 +15995,9 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
Incoming.push_back(P);
}
+ if (Incoming.size() <= 1)
+ break;
+
// Find the corresponding non-phi nodes for better matching when trying to
// build the tree.
for (Value *V : Incoming) {
@@ -14838,41 +16060,41 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
return I->use_empty() &&
(I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
};
- for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+ for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
 // Skip instructions with scalable type. The number of elements is unknown at
 // compile time for scalable types.
- if (isa<ScalableVectorType>(it->getType()))
+ if (isa<ScalableVectorType>(It->getType()))
continue;
 // Skip instructions marked for deletion.
- if (R.isDeleted(&*it))
+ if (R.isDeleted(&*It))
continue;
 // We may go through BB multiple times so skip the ones we have already checked.
- if (!VisitedInstrs.insert(&*it).second) {
- if (HasNoUsers(&*it) &&
- VectorizeInsertsAndCmps(/*VectorizeCmps=*/it->isTerminator())) {
+ if (!VisitedInstrs.insert(&*It).second) {
+ if (HasNoUsers(&*It) &&
+ VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
 // We would like to start over since some instructions are deleted
 // and the iterator may become invalid.
Changed = true;
- it = BB->begin();
- e = BB->end();
+ It = BB->begin();
+ E = BB->end();
}
continue;
}
- if (isa<DbgInfoIntrinsic>(it))
+ if (isa<DbgInfoIntrinsic>(It))
continue;
// Try to vectorize reductions that use PHINodes.
- if (PHINode *P = dyn_cast<PHINode>(it)) {
+ if (PHINode *P = dyn_cast<PHINode>(It)) {
// Check that the PHI is a reduction PHI.
if (P->getNumIncomingValues() == 2) {
// Try to match and vectorize a horizontal reduction.
Instruction *Root = getReductionInstr(DT, P, BB, LI);
if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) {
Changed = true;
- it = BB->begin();
- e = BB->end();
+ It = BB->begin();
+ E = BB->end();
continue;
}
}
@@ -14897,23 +16119,23 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
continue;
}
- if (HasNoUsers(&*it)) {
+ if (HasNoUsers(&*It)) {
bool OpsChanged = false;
- auto *SI = dyn_cast<StoreInst>(it);
+ auto *SI = dyn_cast<StoreInst>(It);
bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
if (SI) {
- auto I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
+ auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
// Try to vectorize chain in store, if this is the only store to the
// address in the block.
 // TODO: This is just a temporary solution to save compile time. Need
// to investigate if we can safely turn on slp-vectorize-hor-store
// instead to allow lookup for reduction chains in all non-vectorized
// stores (need to check side effects and compile time).
- TryToVectorizeRoot = (I == Stores.end() || I->second.size() == 1) &&
- SI->getValueOperand()->hasOneUse();
+ TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
+ SI->getValueOperand()->hasOneUse();
}
if (TryToVectorizeRoot) {
- for (auto *V : it->operand_values()) {
+ for (auto *V : It->operand_values()) {
// Postponed instructions should not be vectorized here, delay their
// vectorization.
if (auto *VI = dyn_cast<Instruction>(V);
@@ -14926,21 +16148,21 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// top-tree instructions to try to vectorize as many instructions as
// possible.
OpsChanged |=
- VectorizeInsertsAndCmps(/*VectorizeCmps=*/it->isTerminator());
+ VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
if (OpsChanged) {
 // We would like to start over since some instructions are deleted
 // and the iterator may become invalid.
Changed = true;
- it = BB->begin();
- e = BB->end();
+ It = BB->begin();
+ E = BB->end();
continue;
}
}
- if (isa<InsertElementInst, InsertValueInst>(it))
- PostProcessInserts.insert(&*it);
- else if (isa<CmpInst>(it))
- PostProcessCmps.insert(cast<CmpInst>(&*it));
+ if (isa<InsertElementInst, InsertValueInst>(It))
+ PostProcessInserts.insert(&*It);
+ else if (isa<CmpInst>(It))
+ PostProcessCmps.insert(cast<CmpInst>(&*It));
}
return Changed;
@@ -15044,6 +16266,12 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
// compatible (have the same opcode, same parent), otherwise it is
// definitely not profitable to try to vectorize them.
auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
+ if (V->getValueOperand()->getType()->getTypeID() <
+ V2->getValueOperand()->getType()->getTypeID())
+ return true;
+ if (V->getValueOperand()->getType()->getTypeID() >
+ V2->getValueOperand()->getType()->getTypeID())
+ return false;
if (V->getPointerOperandType()->getTypeID() <
V2->getPointerOperandType()->getTypeID())
return true;
@@ -15082,6 +16310,8 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
if (V1 == V2)
return true;
+ if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
+ return false;
if (V1->getPointerOperandType() != V2->getPointerOperandType())
return false;
// Undefs are compatible with any other value.
@@ -15113,8 +16343,13 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
continue;
+ // Reverse the stores to do a bottom-to-top analysis. This is important if
+ // there are several stores to the same address; in this case we need to
+ // follow the store order (reversed to respect the memory dependencies).
+ SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
+ Pair.second.rend());
Changed |= tryToVectorizeSequence<StoreInst>(
- Pair.second, StoreSorter, AreCompatibleStores,
+ ReversedStores, StoreSorter, AreCompatibleStores,
[this, &R](ArrayRef<StoreInst *> Candidates, bool) {
return vectorizeStores(Candidates, R);
},