author     Dimitry Andric <dim@FreeBSD.org>  2023-07-26 19:03:47 +0000
committer  Dimitry Andric <dim@FreeBSD.org>  2023-07-26 19:04:23 +0000
commit     7fa27ce4a07f19b07799a767fc29416f3b625afb (patch)
tree       27825c83636c4de341eb09a74f49f5d38a15d165 /llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
parent     e3b557809604d036af6e00c60f012c2025b59a5e (diff)
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r--  llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 4239
1 file changed, 2695 insertions, 1544 deletions
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e3eb6b1804e7..821a3fa22a85 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -87,7 +87,6 @@
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -126,6 +125,13 @@ static cl::opt<bool> ShouldStartVectorizeHorAtStore(
cl::desc(
"Attempt to vectorize horizontal reductions feeding into a store"));
+// NOTE: If AllowHorRdxIdenityOptimization is true, the optimization will run
+// even if we match a reduction but do not vectorize in the end.
+static cl::opt<bool> AllowHorRdxIdenityOptimization(
+ "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden,
+ cl::desc("Allow optimization of original scalar identity operations on "
+ "matched horizontal reductions."));
+
static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
cl::desc("Attempt to vectorize for this register size in bits"));
@@ -287,7 +293,7 @@ static bool isCommutative(Instruction *I) {
/// \returns inserting index of InsertElement or InsertValue instruction,
/// using Offset as base offset for index.
static std::optional<unsigned> getInsertIndex(const Value *InsertInst,
- unsigned Offset = 0) {
+ unsigned Offset = 0) {
int Index = Offset;
if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
@@ -342,16 +348,16 @@ enum class UseMask {
static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
UseMask MaskArg) {
SmallBitVector UseMask(VF, true);
- for (auto P : enumerate(Mask)) {
- if (P.value() == UndefMaskElem) {
+ for (auto [Idx, Value] : enumerate(Mask)) {
+ if (Value == PoisonMaskElem) {
if (MaskArg == UseMask::UndefsAsMask)
- UseMask.reset(P.index());
+ UseMask.reset(Idx);
continue;
}
- if (MaskArg == UseMask::FirstArg && P.value() < VF)
- UseMask.reset(P.value());
- else if (MaskArg == UseMask::SecondArg && P.value() >= VF)
- UseMask.reset(P.value() - VF);
+ if (MaskArg == UseMask::FirstArg && Value < VF)
+ UseMask.reset(Value);
+ else if (MaskArg == UseMask::SecondArg && Value >= VF)
+ UseMask.reset(Value - VF);
}
return UseMask;
}
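
buildUseMask, rewritten above with structured bindings and PoisonMaskElem, records which lanes of one shuffle input a mask actually reads. Below is a simplified, self-contained sketch of the UseMask::FirstArg case, using std::vector in place of SmallBitVector and -1 in place of PoisonMaskElem; it illustrates the idea and is not the LLVM implementation:

#include <cstdio>
#include <vector>

// Simplified stand-in for PoisonMaskElem.
constexpr int kPoisonMaskElem = -1;

// Returns one bit per lane of the *first* shuffle input (of width VF):
// true means the lane is still untouched by Mask, false means some mask
// element reads it (mirrors the UseMask::FirstArg case above).
static std::vector<bool> buildFirstArgUseMask(int VF, const std::vector<int> &Mask) {
  std::vector<bool> Unused(VF, true);
  for (int M : Mask) {
    if (M == kPoisonMaskElem)
      continue;          // poison lanes read nothing
    if (M < VF)
      Unused[M] = false; // lane M of the first input is consumed
    // M >= VF would index the second input and is ignored here.
  }
  return Unused;
}

int main() {
  // Shuffle of two 4-wide inputs: lanes 0 and 2 of input #1, one poison lane,
  // and lane 5 (i.e. lane 1 of input #2).
  std::vector<int> Mask = {0, 2, kPoisonMaskElem, 5};
  std::vector<bool> Unused = buildFirstArgUseMask(4, Mask);
  for (int I = 0; I < 4; ++I)
    std::printf("lane %d of input #1 %s\n", I, Unused[I] ? "unused" : "used");
  return 0;
}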
@@ -374,9 +380,9 @@ static SmallBitVector isUndefVector(const Value *V,
if (!UseMask.empty()) {
const Value *Base = V;
while (auto *II = dyn_cast<InsertElementInst>(Base)) {
+ Base = II->getOperand(0);
if (isa<T>(II->getOperand(1)))
continue;
- Base = II->getOperand(0);
std::optional<unsigned> Idx = getInsertIndex(II);
if (!Idx)
continue;
@@ -461,7 +467,7 @@ isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
Value *Vec2 = nullptr;
enum ShuffleMode { Unknown, Select, Permute };
ShuffleMode CommonShuffleMode = Unknown;
- Mask.assign(VL.size(), UndefMaskElem);
+ Mask.assign(VL.size(), PoisonMaskElem);
for (unsigned I = 0, E = VL.size(); I < E; ++I) {
// Undef can be represented as an undef element in a vector.
if (isa<UndefValue>(VL[I]))
@@ -533,6 +539,117 @@ static std::optional<unsigned> getExtractIndex(Instruction *E) {
return *EI->idx_begin();
}
+/// Tries to find extractelement instructions with constant indices from fixed
+/// vector type and gather such instructions into a bunch, which highly likely
+/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
+/// successful, the matched scalars are replaced by poison values in \p VL for
+/// future analysis.
+static std::optional<TTI::ShuffleKind>
+tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
+ SmallVectorImpl<int> &Mask) {
+ // Scan list of gathered scalars for extractelements that can be represented
+ // as shuffles.
+ MapVector<Value *, SmallVector<int>> VectorOpToIdx;
+ SmallVector<int> UndefVectorExtracts;
+ for (int I = 0, E = VL.size(); I < E; ++I) {
+ auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
+ if (!EI) {
+ if (isa<UndefValue>(VL[I]))
+ UndefVectorExtracts.push_back(I);
+ continue;
+ }
+ auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
+ if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
+ continue;
+ std::optional<unsigned> Idx = getExtractIndex(EI);
+ // Undefined index.
+ if (!Idx) {
+ UndefVectorExtracts.push_back(I);
+ continue;
+ }
+ SmallBitVector ExtractMask(VecTy->getNumElements(), true);
+ ExtractMask.reset(*Idx);
+ if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
+ UndefVectorExtracts.push_back(I);
+ continue;
+ }
+ VectorOpToIdx[EI->getVectorOperand()].push_back(I);
+ }
+ // Sort the vector operands by the maximum number of uses in extractelements.
+ MapVector<unsigned, SmallVector<Value *>> VFToVector;
+ for (const auto &Data : VectorOpToIdx)
+ VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()]
+ .push_back(Data.first);
+ for (auto &Data : VFToVector) {
+ stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) {
+ return VectorOpToIdx.find(V1)->second.size() >
+ VectorOpToIdx.find(V2)->second.size();
+ });
+ }
+ // Find the best pair of the vectors with the same number of elements or a
+ // single vector.
+ const int UndefSz = UndefVectorExtracts.size();
+ unsigned SingleMax = 0;
+ Value *SingleVec = nullptr;
+ unsigned PairMax = 0;
+ std::pair<Value *, Value *> PairVec(nullptr, nullptr);
+ for (auto &Data : VFToVector) {
+ Value *V1 = Data.second.front();
+ if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) {
+ SingleMax = VectorOpToIdx[V1].size() + UndefSz;
+ SingleVec = V1;
+ }
+ Value *V2 = nullptr;
+ if (Data.second.size() > 1)
+ V2 = *std::next(Data.second.begin());
+ if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() +
+ UndefSz) {
+ PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz;
+ PairVec = std::make_pair(V1, V2);
+ }
+ }
+ if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
+ return std::nullopt;
+ // Check if better to perform a shuffle of 2 vectors or just of a single
+ // vector.
+ SmallVector<Value *> SavedVL(VL.begin(), VL.end());
+ SmallVector<Value *> GatheredExtracts(
+ VL.size(), PoisonValue::get(VL.front()->getType()));
+ if (SingleMax >= PairMax && SingleMax) {
+ for (int Idx : VectorOpToIdx[SingleVec])
+ std::swap(GatheredExtracts[Idx], VL[Idx]);
+ } else {
+ for (Value *V : {PairVec.first, PairVec.second})
+ for (int Idx : VectorOpToIdx[V])
+ std::swap(GatheredExtracts[Idx], VL[Idx]);
+ }
+ // Add extracts from undefs too.
+ for (int Idx : UndefVectorExtracts)
+ std::swap(GatheredExtracts[Idx], VL[Idx]);
+ // Check that gather of extractelements can be represented as just a
+ // shuffle of a single/two vectors the scalars are extracted from.
+ std::optional<TTI::ShuffleKind> Res =
+ isFixedVectorShuffle(GatheredExtracts, Mask);
+ if (!Res) {
+ // TODO: try to check other subsets if possible.
+ // Restore the original VL if attempt was not successful.
+ VL.swap(SavedVL);
+ return std::nullopt;
+ }
+ // Restore unused scalars from mask, if some of the extractelements were not
+ // selected for shuffle.
+ for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
+ auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
+ if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
+ !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
+ is_contained(UndefVectorExtracts, I))
+ continue;
+ if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]))
+ std::swap(VL[I], GatheredExtracts[I]);
+ }
+ return Res;
+}
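
The newly added tryToGatherExtractElements groups gathered extractelement scalars by their source vector, then checks whether a single source (or a pair of equally sized sources), together with the undef lanes, covers enough of the node to be rebuilt as a shuffle. The sketch below reduces that selection step to integer ids for the source vectors and considers every pair rather than only the two most-used vectors per width, so it is a simplification of the logic above, not a drop-in equivalent:

#include <cstdio>
#include <iterator>
#include <map>
#include <utility>
#include <vector>

int main() {
  // One entry per gathered scalar: the id of the vector it is extracted from,
  // or -1 if the scalar is undef/poison (those lanes are free to reuse).
  std::vector<int> SourceVecOfLane = {7, 7, -1, 9, 7, 9};

  std::map<int, std::vector<int>> VectorOpToIdx; // source id -> lanes
  int UndefSz = 0;
  for (int Lane = 0; Lane < (int)SourceVecOfLane.size(); ++Lane) {
    if (SourceVecOfLane[Lane] == -1)
      ++UndefSz;
    else
      VectorOpToIdx[SourceVecOfLane[Lane]].push_back(Lane);
  }

  // Best single source and best pair of sources, counting undef lanes as free.
  int SingleMax = 0, SingleVec = -1;
  int PairMax = 0;
  std::pair<int, int> PairVec(-1, -1);
  for (auto It1 = VectorOpToIdx.begin(); It1 != VectorOpToIdx.end(); ++It1) {
    int Covered1 = (int)It1->second.size() + UndefSz;
    if (Covered1 > SingleMax) {
      SingleMax = Covered1;
      SingleVec = It1->first;
    }
    for (auto It2 = std::next(It1); It2 != VectorOpToIdx.end(); ++It2) {
      int Covered2 = (int)(It1->second.size() + It2->second.size()) + UndefSz;
      if (Covered2 > PairMax) {
        PairMax = Covered2;
        PairVec = {It1->first, It2->first};
      }
    }
  }

  std::printf("best single source: v%d covering %d lanes\n", SingleVec, SingleMax);
  std::printf("best pair: (v%d, v%d) covering %d lanes\n", PairVec.first,
              PairVec.second, PairMax);
  return 0;
}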
+
namespace {
/// Main data required for vectorization of instructions.
@@ -829,18 +946,29 @@ static bool isSimple(Instruction *I) {
}
/// Shuffles \p Mask in accordance with the given \p SubMask.
-static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) {
+/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
+/// one but two input vectors.
+static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
+ bool ExtendingManyInputs = false) {
if (SubMask.empty())
return;
+ assert(
+ (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
+ // Check if input scalars were extended to match the size of other node.
+ (SubMask.size() == Mask.size() &&
+ std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
+ [](int Idx) { return Idx == PoisonMaskElem; }))) &&
+ "SubMask with many inputs support must be larger than the mask.");
if (Mask.empty()) {
Mask.append(SubMask.begin(), SubMask.end());
return;
}
- SmallVector<int> NewMask(SubMask.size(), UndefMaskElem);
+ SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
int TermValue = std::min(Mask.size(), SubMask.size());
for (int I = 0, E = SubMask.size(); I < E; ++I) {
- if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem ||
- Mask[SubMask[I]] >= TermValue)
+ if (SubMask[I] == PoisonMaskElem ||
+ (!ExtendingManyInputs &&
+ (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
continue;
NewMask[I] = Mask[SubMask[I]];
}
@@ -887,7 +1015,7 @@ static void inversePermutation(ArrayRef<unsigned> Indices,
SmallVectorImpl<int> &Mask) {
Mask.clear();
const unsigned E = Indices.size();
- Mask.resize(E, UndefMaskElem);
+ Mask.resize(E, PoisonMaskElem);
for (unsigned I = 0; I < E; ++I)
Mask[Indices[I]] = I;
}
@@ -900,7 +1028,7 @@ static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
UndefValue::get(Scalars.front()->getType()));
Prev.swap(Scalars);
for (unsigned I = 0, E = Prev.size(); I < E; ++I)
- if (Mask[I] != UndefMaskElem)
+ if (Mask[I] != PoisonMaskElem)
Scalars[Mask[I]] = Prev[I];
}
@@ -962,6 +1090,7 @@ namespace slpvectorizer {
class BoUpSLP {
struct TreeEntry;
struct ScheduleData;
+ class ShuffleCostEstimator;
class ShuffleInstructionBuilder;
public:
@@ -1006,8 +1135,12 @@ public:
/// Vectorize the tree but with the list of externally used values \p
/// ExternallyUsedValues. Values in this MapVector can be replaced but the
/// generated extractvalue instructions.
- Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues,
- Instruction *ReductionRoot = nullptr);
+ /// \param ReplacedExternals contains the list of replaced external values
+ /// {scalar, replace} after emitting extractelement for external uses.
+ Value *
+ vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
+ SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
+ Instruction *ReductionRoot = nullptr);
/// \returns the cost incurred by unwanted spills and fills, caused by
/// holding live values over call sites.
@@ -1025,24 +1158,18 @@ public:
/// Construct a vectorizable tree that starts at \p Roots.
void buildTree(ArrayRef<Value *> Roots);
- /// Checks if the very first tree node is going to be vectorized.
- bool isVectorizedFirstNode() const {
- return !VectorizableTree.empty() &&
- VectorizableTree.front()->State == TreeEntry::Vectorize;
- }
-
- /// Returns the main instruction for the very first node.
- Instruction *getFirstNodeMainOp() const {
- assert(!VectorizableTree.empty() && "No tree to get the first node from");
- return VectorizableTree.front()->getMainOp();
- }
-
/// Returns whether the root node has in-tree uses.
bool doesRootHaveInTreeUses() const {
return !VectorizableTree.empty() &&
!VectorizableTree.front()->UserTreeIndices.empty();
}
+ /// Return the scalars of the root node.
+ ArrayRef<Value *> getRootNodeScalars() const {
+ assert(!VectorizableTree.empty() && "No graph to get the first node from");
+ return VectorizableTree.front()->Scalars;
+ }
+
/// Builds external uses of the vectorized scalars, i.e. the list of
/// vectorized scalars to be extracted, their lanes and their scalar users. \p
/// ExternallyUsedValues contains additional list of external uses to handle
@@ -1064,6 +1191,8 @@ public:
MinBWs.clear();
InstrElementSize.clear();
UserIgnoreList = nullptr;
+ PostponedGathers.clear();
+ ValueToGatherNodes.clear();
}
unsigned getTreeSize() const { return VectorizableTree.size(); }
@@ -1083,9 +1212,12 @@ public:
/// Gets reordering data for the given tree entry. If the entry is vectorized
/// - just return ReorderIndices, otherwise check if the scalars can be
/// reordered and return the most optimal order.
+ /// \return std::nullopt if ordering is not important, empty order, if
+ /// identity order is important, or the actual order.
/// \param TopToBottom If true, include the order of vectorized stores and
/// insertelement nodes, otherwise skip them.
- std::optional<OrdersType> getReorderingData(const TreeEntry &TE, bool TopToBottom);
+ std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
+ bool TopToBottom);
/// Reorders the current graph to the most profitable order starting from the
/// root node to the leaf nodes. The best order is chosen only from the nodes
@@ -1328,8 +1460,14 @@ public:
ConstantInt *Ex1Idx;
if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
// Undefs are always profitable for extractelements.
+ // Compiler can easily combine poison and extractelement <non-poison> or
+ // undef and extractelement <poison>. But combining undef +
+ // extractelement <non-poison-but-may-produce-poison> requires some
+ // extra operations.
if (isa<UndefValue>(V2))
- return LookAheadHeuristics::ScoreConsecutiveExtracts;
+ return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
+ ? LookAheadHeuristics::ScoreConsecutiveExtracts
+ : LookAheadHeuristics::ScoreSameOpcode;
Value *EV2 = nullptr;
ConstantInt *Ex2Idx = nullptr;
if (match(V2,
@@ -1683,9 +1821,10 @@ public:
// Search all operands in Ops[*][Lane] for the one that matches best
// Ops[OpIdx][LastLane] and return its operand index.
// If no good match can be found, return std::nullopt.
- std::optional<unsigned> getBestOperand(unsigned OpIdx, int Lane, int LastLane,
- ArrayRef<ReorderingMode> ReorderingModes,
- ArrayRef<Value *> MainAltOps) {
+ std::optional<unsigned>
+ getBestOperand(unsigned OpIdx, int Lane, int LastLane,
+ ArrayRef<ReorderingMode> ReorderingModes,
+ ArrayRef<Value *> MainAltOps) {
unsigned NumOperands = getNumOperands();
// The operand of the previous lane at OpIdx.
@@ -2299,7 +2438,8 @@ private:
/// \returns the cost of the vectorizable entry.
InstructionCost getEntryCost(const TreeEntry *E,
- ArrayRef<Value *> VectorizedVals);
+ ArrayRef<Value *> VectorizedVals,
+ SmallPtrSetImpl<Value *> &CheckedExtracts);
/// This is the recursive part of buildTree.
void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
@@ -2323,15 +2463,13 @@ private:
/// Create a new vector from a list of scalar values. Produces a sequence
/// which exploits values reused across lanes, and arranges the inserts
/// for ease of later optimization.
- Value *createBuildVector(const TreeEntry *E);
+ template <typename BVTy, typename ResTy, typename... Args>
+ ResTy processBuildVector(const TreeEntry *E, Args &...Params);
- /// \returns the scalarization cost for this type. Scalarization in this
- /// context means the creation of vectors from a group of scalars. If \p
- /// NeedToShuffle is true, need to add a cost of reshuffling some of the
- /// vector elements.
- InstructionCost getGatherCost(FixedVectorType *Ty,
- const APInt &ShuffledIndices,
- bool NeedToShuffle) const;
+ /// Create a new vector from a list of scalar values. Produces a sequence
+ /// which exploits values reused across lanes, and arranges the inserts
+ /// for ease of later optimization.
+ Value *createBuildVector(const TreeEntry *E);
/// Returns the instruction in the bundle, which can be used as a base point
/// for scheduling. Usually it is the last instruction in the bundle, except
@@ -2354,14 +2492,16 @@ private:
/// \returns the scalarization cost for this list of values. Assuming that
/// this subtree gets vectorized, we may need to extract the values from the
/// roots. This method calculates the cost of extracting the values.
- InstructionCost getGatherCost(ArrayRef<Value *> VL) const;
+ /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
+ InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc) const;
/// Set the Builder insert point to one after the last instruction in
/// the bundle
void setInsertPointAfterBundle(const TreeEntry *E);
- /// \returns a vector from a collection of scalars in \p VL.
- Value *gather(ArrayRef<Value *> VL);
+ /// \returns a vector from a collection of scalars in \p VL. If \p Root is not
+ /// specified, the starting vector value is poison.
+ Value *gather(ArrayRef<Value *> VL, Value *Root);
/// \returns whether the VectorizableTree is fully vectorizable and will
/// be beneficial even the tree height is tiny.
@@ -2400,6 +2540,14 @@ private:
using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
TreeEntry(VecTreeTy &Container) : Container(Container) {}
+ /// \returns Common mask for reorder indices and reused scalars.
+ SmallVector<int> getCommonMask() const {
+ SmallVector<int> Mask;
+ inversePermutation(ReorderIndices, Mask);
+ ::addMask(Mask, ReuseShuffleIndices);
+ return Mask;
+ }
+
/// \returns true if the scalars in VL are equal to this entry.
bool isSame(ArrayRef<Value *> VL) const {
auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
@@ -2409,8 +2557,8 @@ private:
std::equal(VL.begin(), VL.end(), Mask.begin(),
[Scalars](Value *V, int Idx) {
return (isa<UndefValue>(V) &&
- Idx == UndefMaskElem) ||
- (Idx != UndefMaskElem && V == Scalars[Idx]);
+ Idx == PoisonMaskElem) ||
+ (Idx != PoisonMaskElem && V == Scalars[Idx]);
});
};
if (!ReorderIndices.empty()) {
@@ -2471,7 +2619,7 @@ private:
ValueList Scalars;
/// The Scalars are vectorized into this value. It is initialized to Null.
- Value *VectorizedValue = nullptr;
+ WeakTrackingVH VectorizedValue = nullptr;
/// Do we need to gather this sequence or vectorize it
/// (either with vector instruction or with scatter/gather
@@ -2684,20 +2832,22 @@ private:
#ifndef NDEBUG
void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
- InstructionCost VecCost,
- InstructionCost ScalarCost) const {
- dbgs() << "SLP: Calculated costs for Tree:\n"; E->dump();
+ InstructionCost VecCost, InstructionCost ScalarCost,
+ StringRef Banner) const {
+ dbgs() << "SLP: " << Banner << ":\n";
+ E->dump();
dbgs() << "SLP: Costs:\n";
dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
dbgs() << "SLP: VectorCost = " << VecCost << "\n";
dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
- dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = " <<
- ReuseShuffleCost + VecCost - ScalarCost << "\n";
+ dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
+ << ReuseShuffleCost + VecCost - ScalarCost << "\n";
}
#endif
/// Create a new VectorizableTree entry.
- TreeEntry *newTreeEntry(ArrayRef<Value *> VL, std::optional<ScheduleData *> Bundle,
+ TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
+ std::optional<ScheduleData *> Bundle,
const InstructionsState &S,
const EdgeInfo &UserTreeIdx,
ArrayRef<int> ReuseShuffleIndices = std::nullopt,
@@ -2791,8 +2941,14 @@ private:
return ScalarToTreeEntry.lookup(V);
}
+ /// Checks if the specified list of the instructions/values can be vectorized
+ /// and fills required data before actual scheduling of the instructions.
+ TreeEntry::EntryState getScalarsVectorizationState(
+ InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
+ OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const;
+
/// Maps a specific scalar to its tree entry.
- SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry;
+ SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
/// Maps a value to the proposed vectorizable size.
SmallDenseMap<Value *, unsigned> InstrElementSize;
@@ -2808,6 +2964,15 @@ private:
/// pre-gather them before.
DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
+ /// List of gather nodes, depending on other gather/vector nodes, which should
+ /// be emitted after the vector instruction emission process to correctly
+ /// handle order of the vector instructions and shuffles.
+ SetVector<const TreeEntry *> PostponedGathers;
+
+ using ValueToGatherNodesMap =
+ DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
+ ValueToGatherNodesMap ValueToGatherNodes;
+
/// This POD struct describes one external user in the vectorized tree.
struct ExternalUser {
ExternalUser(Value *S, llvm::User *U, int L)
@@ -3235,7 +3400,6 @@ private:
<< "SLP: gets ready (ctl): " << *DepBundle << "\n");
}
}
-
}
}
@@ -3579,7 +3743,7 @@ static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
SmallVector<int> Prev(Reuses.begin(), Reuses.end());
Prev.swap(Reuses);
for (unsigned I = 0, E = Prev.size(); I < E; ++I)
- if (Mask[I] != UndefMaskElem)
+ if (Mask[I] != PoisonMaskElem)
Reuses[Mask[I]] = Prev[I];
}
@@ -3603,7 +3767,7 @@ static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask) {
}
Order.assign(Mask.size(), Mask.size());
for (unsigned I = 0, E = Mask.size(); I < E; ++I)
- if (MaskOrder[I] != UndefMaskElem)
+ if (MaskOrder[I] != PoisonMaskElem)
Order[MaskOrder[I]] = I;
fixupOrderingIndices(Order);
}
@@ -3653,10 +3817,8 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
return false;
return true;
};
- if (IsIdentityOrder(CurrentOrder)) {
- CurrentOrder.clear();
- return CurrentOrder;
- }
+ if (IsIdentityOrder(CurrentOrder))
+ return OrdersType();
auto *It = CurrentOrder.begin();
for (unsigned I = 0; I < NumScalars;) {
if (UsedPositions.test(I)) {
@@ -3669,7 +3831,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
}
++It;
}
- return CurrentOrder;
+ return std::move(CurrentOrder);
}
return std::nullopt;
}
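
findReusedOrderedScalars now returns an empty OrdersType for an identity order instead of clearing CurrentOrder in place, matching the convention documented on getReorderingData above: std::nullopt means ordering is unimportant, an empty order means identity, and anything else is the order to apply. A small, purely illustrative sketch of that three-way encoding:

#include <cstdio>
#include <optional>
#include <vector>

using OrdersType = std::vector<unsigned>;

// std::nullopt = ordering does not matter, empty = identity order,
// non-empty = the actual order to apply (the convention described above).
static std::optional<OrdersType> classifyOrder(OrdersType CurrentOrder) {
  if (CurrentOrder.empty())
    return std::nullopt;
  bool IsIdentity = true;
  for (unsigned I = 0; I < CurrentOrder.size(); ++I)
    IsIdentity &= (CurrentOrder[I] == I);
  if (IsIdentity)
    return OrdersType(); // identity: keep the node as-is, no shuffle needed
  return CurrentOrder;   // real reordering
}

int main() {
  auto Res = classifyOrder({0, 1, 2, 3});
  std::printf("identity? %s\n", (Res && Res->empty()) ? "yes" : "no");
  return 0;
}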
@@ -3779,9 +3941,9 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
return LoadsState::Gather;
}
-bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
- const DataLayout &DL, ScalarEvolution &SE,
- SmallVectorImpl<unsigned> &SortedIndices) {
+static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
+ const DataLayout &DL, ScalarEvolution &SE,
+ SmallVectorImpl<unsigned> &SortedIndices) {
assert(llvm::all_of(
VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
"Expected list of pointer operands.");
@@ -3825,7 +3987,7 @@ bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
return std::get<1>(X) < std::get<1>(Y);
});
int InitialOffset = std::get<1>(Vec[0]);
- AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](auto &P) {
+ AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) {
return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
});
}
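
clusterSortPtrAccesses (now static) buckets pointers by base object, sorts each bucket by its distance from the first pointer, and accepts the bucket only when the sorted distances are consecutive. A standalone sketch of that check on plain (index, element-offset) pairs, with the SCEV/DataLayout machinery stripped out:

#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // (original index, offset in elements from the common base) for one bucket.
  std::vector<std::pair<unsigned, int>> Vec = {{0, 3}, {1, 1}, {2, 0}, {3, 2}};

  // Sort by offset, as the lambda with std::get<1> does above.
  std::stable_sort(Vec.begin(), Vec.end(), [](const auto &X, const auto &Y) {
    return X.second < Y.second;
  });

  // Accept only if the offsets form InitialOffset, InitialOffset + 1, ...
  int InitialOffset = Vec.front().second;
  bool AnyConsecutive = true;
  for (size_t I = 0; I < Vec.size(); ++I)
    AnyConsecutive &= (Vec[I].second == InitialOffset + (int)I);

  std::printf("consecutive: %s, sorted original indices:",
              AnyConsecutive ? "yes" : "no");
  for (const auto &P : Vec)
    std::printf(" %u", P.first);
  std::printf("\n"); // prints: consecutive: yes, sorted original indices: 2 1 3 0
  return 0;
}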
@@ -3862,7 +4024,7 @@ BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
BoUpSLP::OrdersType Order;
if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
- return Order;
+ return std::move(Order);
return std::nullopt;
}
@@ -3888,31 +4050,35 @@ static bool areTwoInsertFromSameBuildVector(
// Go through the vector operand of insertelement instructions trying to find
// either VU as the original vector for IE2 or V as the original vector for
// IE1.
+ SmallSet<int, 8> ReusedIdx;
+ bool IsReusedIdx = false;
do {
- if (IE2 == VU)
+ if (IE2 == VU && !IE1)
return VU->hasOneUse();
- if (IE1 == V)
+ if (IE1 == V && !IE2)
return V->hasOneUse();
- if (IE1) {
- if ((IE1 != VU && !IE1->hasOneUse()) ||
- getInsertIndex(IE1).value_or(*Idx2) == *Idx2)
+ if (IE1 && IE1 != V) {
+ IsReusedIdx |=
+ !ReusedIdx.insert(getInsertIndex(IE1).value_or(*Idx2)).second;
+ if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
IE1 = nullptr;
else
IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
}
- if (IE2) {
- if ((IE2 != V && !IE2->hasOneUse()) ||
- getInsertIndex(IE2).value_or(*Idx1) == *Idx1)
+ if (IE2 && IE2 != VU) {
+ IsReusedIdx |=
+ !ReusedIdx.insert(getInsertIndex(IE2).value_or(*Idx1)).second;
+ if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
IE2 = nullptr;
else
IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
}
- } while (IE1 || IE2);
+ } while (!IsReusedIdx && (IE1 || IE2));
return false;
}
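
The rewritten loop above tracks every insert position it has seen in ReusedIdx and bails out as soon as a position repeats, because a repeated position means a later insertelement overwrites an earlier one and the two chains cannot be treated as a single buildvector. Below is a rough, self-contained sketch of that duplicate-position check, with the IE1/IE2 walk deliberately simplified to plain lists of insert indices:

#include <algorithm>
#include <cstdio>
#include <set>
#include <vector>

// Walk the insert positions of two buildvector-like chains in lock-step and
// report whether any position is used more than once across both chains.
static bool hasReusedInsertIndex(const std::vector<unsigned> &Chain1,
                                 const std::vector<unsigned> &Chain2) {
  std::set<unsigned> ReusedIdx;
  size_t N = std::max(Chain1.size(), Chain2.size());
  for (size_t I = 0; I < N; ++I) {
    if (I < Chain1.size() && !ReusedIdx.insert(Chain1[I]).second)
      return true; // position already written: later insert overwrites it
    if (I < Chain2.size() && !ReusedIdx.insert(Chain2[I]).second)
      return true;
  }
  return false;
}

int main() {
  std::printf("%d\n", hasReusedInsertIndex({0, 1}, {2, 3})); // 0: all distinct
  std::printf("%d\n", hasReusedInsertIndex({0, 1}, {1, 3})); // 1: index 1 reused
  return 0;
}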
-std::optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE,
- bool TopToBottom) {
+std::optional<BoUpSLP::OrdersType>
+BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
// No need to reorder if need to shuffle reuses, still need to shuffle the
// node.
if (!TE.ReuseShuffleIndices.empty()) {
@@ -3936,14 +4102,14 @@ std::optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &T
std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
return Idx && *Idx < Sz;
})) {
- SmallVector<int> ReorderMask(Sz, UndefMaskElem);
+ SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
if (TE.ReorderIndices.empty())
std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
else
inversePermutation(TE.ReorderIndices, ReorderMask);
for (unsigned I = 0; I < VF; ++I) {
int &Idx = ReusedMask[I];
- if (Idx == UndefMaskElem)
+ if (Idx == PoisonMaskElem)
continue;
Value *V = TE.Scalars[ReorderMask[Idx]];
std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
@@ -3958,7 +4124,7 @@ std::optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &T
for (unsigned K = 0; K < VF; K += Sz) {
OrdersType CurrentOrder(TE.ReorderIndices);
SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
- if (SubMask.front() == UndefMaskElem)
+ if (SubMask.front() == PoisonMaskElem)
std::iota(SubMask.begin(), SubMask.end(), 0);
reorderOrder(CurrentOrder, SubMask);
transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
@@ -3966,8 +4132,8 @@ std::optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &T
}
if (all_of(enumerate(ResOrder),
[](const auto &Data) { return Data.index() == Data.value(); }))
- return {}; // Use identity order.
- return ResOrder;
+ return std::nullopt; // No need to reorder.
+ return std::move(ResOrder);
}
if (TE.State == TreeEntry::Vectorize &&
(isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
@@ -3976,6 +4142,8 @@ std::optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &T
return TE.ReorderIndices;
if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
auto PHICompare = [](llvm::Value *V1, llvm::Value *V2) {
+ if (V1 == V2)
+ return false;
if (!V1->hasOneUse() || !V2->hasOneUse())
return false;
auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
@@ -4023,8 +4191,8 @@ std::optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &T
for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
ResOrder[Id] = PhiToId[Phis[Id]];
if (IsIdentityOrder(ResOrder))
- return {};
- return ResOrder;
+ return std::nullopt; // No need to reorder.
+ return std::move(ResOrder);
}
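
The new V1 == V2 early-out in PHICompare above makes the comparator explicitly irreflexive; std::sort-family algorithms require a strict weak ordering, and a comparator that can report true for two equal (or identical) arguments is undefined behavior. The toy comparator below only shows the contract: for plain integers the guard is redundant, but for PHICompare's use-based comparison it is what guarantees comp(V, V) == false.

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> V = {3, 1, 3, 2};
  // A valid strict weak ordering must return false when both arguments are
  // equal, which is exactly what the V1 == V2 early-out above guarantees.
  std::stable_sort(V.begin(), V.end(), [](int A, int B) {
    if (A == B)
      return false; // irreflexive: never "A < A"
    return A < B;
  });
  for (int X : V)
    std::printf("%d ", X);
  std::printf("\n");
  return 0;
}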
if (TE.State == TreeEntry::NeedToGather) {
// TODO: add analysis of other gather nodes with extractelement
@@ -4050,7 +4218,42 @@ std::optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &T
if (Reuse || !CurrentOrder.empty()) {
if (!CurrentOrder.empty())
fixupOrderingIndices(CurrentOrder);
- return CurrentOrder;
+ return std::move(CurrentOrder);
+ }
+ }
+ // If the gather node is <undef, v, .., poison> and
+ // insertelement poison, v, 0 [+ permute]
+ // is cheaper than
+ // insertelement poison, v, n - try to reorder.
+ // If rotating the whole graph, exclude the permute cost, the whole graph
+ // might be transformed.
+ int Sz = TE.Scalars.size();
+ if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
+ count_if(TE.Scalars, UndefValue::classof) == Sz - 1) {
+ const auto *It =
+ find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
+ if (It == TE.Scalars.begin())
+ return OrdersType();
+ auto *Ty = FixedVectorType::get(TE.Scalars.front()->getType(), Sz);
+ if (It != TE.Scalars.end()) {
+ OrdersType Order(Sz, Sz);
+ unsigned Idx = std::distance(TE.Scalars.begin(), It);
+ Order[Idx] = 0;
+ fixupOrderingIndices(Order);
+ SmallVector<int> Mask;
+ inversePermutation(Order, Mask);
+ InstructionCost PermuteCost =
+ TopToBottom
+ ? 0
+ : TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, Mask);
+ InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
+ Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
+ PoisonValue::get(Ty), *It);
+ InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
+ Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
+ PoisonValue::get(Ty), *It);
+ if (InsertFirstCost + PermuteCost < InsertIdxCost)
+ return std::move(Order);
}
}
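
The block above handles a gather node that is a splat of one non-constant value among undefs: it compares, via TTI, the cost of inserting the value at lane 0 (plus a single-source permute when only this node is being rotated) against inserting it directly at its original lane, and reorders when the former is cheaper. The numbers below are invented stand-ins for TTI costs and only illustrate the decision:

#include <cstdio>

int main() {
  // Hypothetical TTI-style costs for a <4 x float> node whose only defined
  // scalar sits at lane Idx = 3.
  int InsertFirstCost = 1; // insertelement poison, v, 0
  int InsertIdxCost   = 3; // insertelement poison, v, 3 (pricier on this target)
  int PermuteCost     = 1; // single-source shuffle to move lane 0 into lane 3

  bool TopToBottom = false; // rotating just this node, so pay for the permute
  int ReorderedCost = InsertFirstCost + (TopToBottom ? 0 : PermuteCost);

  if (ReorderedCost < InsertIdxCost)
    std::printf("reorder: insert at lane 0%s\n",
                TopToBottom ? "" : " and permute");
  else
    std::printf("keep original lane\n");
  return 0;
}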
if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
@@ -4260,7 +4463,7 @@ void BoUpSLP::reorderTopToBottom() {
unsigned E = Order.size();
OrdersType CurrentOrder(E, E);
transform(Mask, CurrentOrder.begin(), [E](int Idx) {
- return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx);
+ return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
});
fixupOrderingIndices(CurrentOrder);
++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
@@ -4285,10 +4488,10 @@ void BoUpSLP::reorderTopToBottom() {
continue;
SmallVector<int> Mask;
inversePermutation(BestOrder, Mask);
- SmallVector<int> MaskOrder(BestOrder.size(), UndefMaskElem);
+ SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
unsigned E = BestOrder.size();
transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
- return I < E ? static_cast<int>(I) : UndefMaskElem;
+ return I < E ? static_cast<int>(I) : PoisonMaskElem;
});
// Do an actual reordering, if profitable.
for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
@@ -4384,7 +4587,7 @@ bool BoUpSLP::canReorderOperands(
}
return false;
}) > 1 &&
- !all_of(UserTE->getOperand(I), isConstant))
+ !allConstant(UserTE->getOperand(I)))
return false;
if (Gather)
GatherOps.push_back(Gather);
@@ -4499,7 +4702,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
unsigned E = Order.size();
OrdersType CurrentOrder(E, E);
transform(Mask, CurrentOrder.begin(), [E](int Idx) {
- return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx);
+ return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
});
fixupOrderingIndices(CurrentOrder);
OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
@@ -4578,10 +4781,10 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
VisitedOps.clear();
SmallVector<int> Mask;
inversePermutation(BestOrder, Mask);
- SmallVector<int> MaskOrder(BestOrder.size(), UndefMaskElem);
+ SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
unsigned E = BestOrder.size();
transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
- return I < E ? static_cast<int>(I) : UndefMaskElem;
+ return I < E ? static_cast<int>(I) : PoisonMaskElem;
});
for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
TreeEntry *TE = Op.second;
@@ -4779,7 +4982,7 @@ bool BoUpSLP::canFormVector(const SmallVector<StoreInst *, 4> &StoresVec,
// Check if the stores are consecutive by checking if their difference is 1.
for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
- if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx-1].second + 1)
+ if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1)
return false;
// Calculate the shuffle indices according to their offset against the sorted
@@ -4976,6 +5179,309 @@ static bool isAlternateInstruction(const Instruction *I,
const Instruction *AltOp,
const TargetLibraryInfo &TLI);
+BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
+ InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
+ OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const {
+ assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");
+
+ unsigned ShuffleOrOp =
+ S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
+ auto *VL0 = cast<Instruction>(S.OpValue);
+ switch (ShuffleOrOp) {
+ case Instruction::PHI: {
+ // Check for terminator values (e.g. invoke).
+ for (Value *V : VL)
+ for (Value *Incoming : cast<PHINode>(V)->incoming_values()) {
+ Instruction *Term = dyn_cast<Instruction>(Incoming);
+ if (Term && Term->isTerminator()) {
+ LLVM_DEBUG(dbgs()
+ << "SLP: Need to swizzle PHINodes (terminator use).\n");
+ return TreeEntry::NeedToGather;
+ }
+ }
+
+ return TreeEntry::Vectorize;
+ }
+ case Instruction::ExtractValue:
+ case Instruction::ExtractElement: {
+ bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
+ if (Reuse || !CurrentOrder.empty())
+ return TreeEntry::Vectorize;
+ LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
+ return TreeEntry::NeedToGather;
+ }
+ case Instruction::InsertElement: {
+ // Check that we have a buildvector and not a shuffle of 2 or more
+ // different vectors.
+ ValueSet SourceVectors;
+ for (Value *V : VL) {
+ SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
+ assert(getInsertIndex(V) != std::nullopt &&
+ "Non-constant or undef index?");
+ }
+
+ if (count_if(VL, [&SourceVectors](Value *V) {
+ return !SourceVectors.contains(V);
+ }) >= 2) {
+ // Found 2nd source vector - cancel.
+ LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
+ "different source vectors.\n");
+ return TreeEntry::NeedToGather;
+ }
+
+ return TreeEntry::Vectorize;
+ }
+ case Instruction::Load: {
+ // Check that a vectorized load would load the same memory as a scalar
+ // load. For example, we don't want to vectorize loads that are smaller
+ // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
+ // treats loading/storing it as an i8 struct. If we vectorize loads/stores
+ // from such a struct, we read/write packed bits disagreeing with the
+ // unvectorized version.
+ switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, *LI, *TLI, CurrentOrder,
+ PointerOps)) {
+ case LoadsState::Vectorize:
+ return TreeEntry::Vectorize;
+ case LoadsState::ScatterVectorize:
+ return TreeEntry::ScatterVectorize;
+ case LoadsState::Gather:
+#ifndef NDEBUG
+ Type *ScalarTy = VL0->getType();
+ if (DL->getTypeSizeInBits(ScalarTy) !=
+ DL->getTypeAllocSizeInBits(ScalarTy))
+ LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
+ else if (any_of(VL,
+ [](Value *V) { return !cast<LoadInst>(V)->isSimple(); }))
+ LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
+ else
+ LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
+#endif // NDEBUG
+ return TreeEntry::NeedToGather;
+ }
+ llvm_unreachable("Unexpected state of loads");
+ }
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ Type *SrcTy = VL0->getOperand(0)->getType();
+ for (Value *V : VL) {
+ Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
+ if (Ty != SrcTy || !isValidElementType(Ty)) {
+ LLVM_DEBUG(
+ dbgs() << "SLP: Gathering casts with different src types.\n");
+ return TreeEntry::NeedToGather;
+ }
+ }
+ return TreeEntry::Vectorize;
+ }
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ // Check that all of the compares have the same predicate.
+ CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
+ CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
+ Type *ComparedTy = VL0->getOperand(0)->getType();
+ for (Value *V : VL) {
+ CmpInst *Cmp = cast<CmpInst>(V);
+ if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
+ Cmp->getOperand(0)->getType() != ComparedTy) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
+ return TreeEntry::NeedToGather;
+ }
+ }
+ return TreeEntry::Vectorize;
+ }
+ case Instruction::Select:
+ case Instruction::FNeg:
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ return TreeEntry::Vectorize;
+ case Instruction::GetElementPtr: {
+ // We don't combine GEPs with complicated (nested) indexing.
+ for (Value *V : VL) {
+ auto *I = dyn_cast<GetElementPtrInst>(V);
+ if (!I)
+ continue;
+ if (I->getNumOperands() != 2) {
+ LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
+ return TreeEntry::NeedToGather;
+ }
+ }
+
+ // We can't combine several GEPs into one vector if they operate on
+ // different types.
+ Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
+ for (Value *V : VL) {
+ auto *GEP = dyn_cast<GEPOperator>(V);
+ if (!GEP)
+ continue;
+ Type *CurTy = GEP->getSourceElementType();
+ if (Ty0 != CurTy) {
+ LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
+ return TreeEntry::NeedToGather;
+ }
+ }
+
+ // We don't combine GEPs with non-constant indexes.
+ Type *Ty1 = VL0->getOperand(1)->getType();
+ for (Value *V : VL) {
+ auto *I = dyn_cast<GetElementPtrInst>(V);
+ if (!I)
+ continue;
+ auto *Op = I->getOperand(1);
+ if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
+ (Op->getType() != Ty1 &&
+ ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
+ Op->getType()->getScalarSizeInBits() >
+ DL->getIndexSizeInBits(
+ V->getType()->getPointerAddressSpace())))) {
+ LLVM_DEBUG(
+ dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
+ return TreeEntry::NeedToGather;
+ }
+ }
+
+ return TreeEntry::Vectorize;
+ }
+ case Instruction::Store: {
+ // Check if the stores are consecutive or if we need to swizzle them.
+ llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
+ // Avoid types that are padded when being allocated as scalars, while
+ // being packed together in a vector (such as i1).
+ if (DL->getTypeSizeInBits(ScalarTy) !=
+ DL->getTypeAllocSizeInBits(ScalarTy)) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
+ return TreeEntry::NeedToGather;
+ }
+ // Make sure all stores in the bundle are simple - we can't vectorize
+ // atomic or volatile stores.
+ for (Value *V : VL) {
+ auto *SI = cast<StoreInst>(V);
+ if (!SI->isSimple()) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
+ return TreeEntry::NeedToGather;
+ }
+ PointerOps.push_back(SI->getPointerOperand());
+ }
+
+ // Check the order of pointer operands.
+ if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
+ Value *Ptr0;
+ Value *PtrN;
+ if (CurrentOrder.empty()) {
+ Ptr0 = PointerOps.front();
+ PtrN = PointerOps.back();
+ } else {
+ Ptr0 = PointerOps[CurrentOrder.front()];
+ PtrN = PointerOps[CurrentOrder.back()];
+ }
+ std::optional<int> Dist =
+ getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
+ // Check that the sorted pointer operands are consecutive.
+ if (static_cast<unsigned>(*Dist) == VL.size() - 1)
+ return TreeEntry::Vectorize;
+ }
+
+ LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
+ return TreeEntry::NeedToGather;
+ }
+ case Instruction::Call: {
+ // Check if the calls are all to the same vectorizable intrinsic or
+ // library function.
+ CallInst *CI = cast<CallInst>(VL0);
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+ VFShape Shape = VFShape::get(
+ *CI, ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
+ false /*HasGlobalPred*/);
+ Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+
+ if (!VecFunc && !isTriviallyVectorizable(ID)) {
+ LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
+ return TreeEntry::NeedToGather;
+ }
+ Function *F = CI->getCalledFunction();
+ unsigned NumArgs = CI->arg_size();
+ SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
+ for (unsigned J = 0; J != NumArgs; ++J)
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
+ ScalarArgs[J] = CI->getArgOperand(J);
+ for (Value *V : VL) {
+ CallInst *CI2 = dyn_cast<CallInst>(V);
+ if (!CI2 || CI2->getCalledFunction() != F ||
+ getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
+ (VecFunc &&
+ VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
+ !CI->hasIdenticalOperandBundleSchema(*CI2)) {
+ LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
+ << "\n");
+ return TreeEntry::NeedToGather;
+ }
+ // Some intrinsics have scalar arguments and should be same in order for
+ // them to be vectorized.
+ for (unsigned J = 0; J != NumArgs; ++J) {
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) {
+ Value *A1J = CI2->getArgOperand(J);
+ if (ScalarArgs[J] != A1J) {
+ LLVM_DEBUG(dbgs()
+ << "SLP: mismatched arguments in call:" << *CI
+ << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
+ return TreeEntry::NeedToGather;
+ }
+ }
+ }
+ // Verify that the bundle operands are identical between the two calls.
+ if (CI->hasOperandBundles() &&
+ !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
+ CI->op_begin() + CI->getBundleOperandsEndIndex(),
+ CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
+ LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
+ << "!=" << *V << '\n');
+ return TreeEntry::NeedToGather;
+ }
+ }
+
+ return TreeEntry::Vectorize;
+ }
+ case Instruction::ShuffleVector: {
+ // If this is not an alternate sequence of opcode like add-sub
+ // then do not vectorize this instruction.
+ if (!S.isAltShuffle()) {
+ LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
+ return TreeEntry::NeedToGather;
+ }
+ return TreeEntry::Vectorize;
+ }
+ default:
+ LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
+ return TreeEntry::NeedToGather;
+ }
+}
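
getScalarsVectorizationState, added above, pulls the per-opcode legality checks out of buildTree_rec and answers Vectorize / ScatterVectorize / NeedToGather before any bundle scheduling is attempted; a later hunk in buildTree_rec then emits a gather entry up front whenever the answer is NeedToGather. As a standalone sketch of one of those checks, here is the consecutive-store test reduced to element offsets (a simplification of the sortPtrAccesses/getPointersDiff logic in the Store case):

#include <algorithm>
#include <cstdio>
#include <vector>

enum class EntryState { Vectorize, NeedToGather };

// Stores are vectorizable as one wide store only if their (element-sized)
// offsets from a common base are a permutation of K, K+1, ..., K+N-1, i.e.
// the sorted offsets are consecutive with no gaps or duplicates.
static EntryState classifyStores(std::vector<int> Offsets) {
  std::sort(Offsets.begin(), Offsets.end());
  for (size_t I = 1; I < Offsets.size(); ++I)
    if (Offsets[I] != Offsets[I - 1] + 1)
      return EntryState::NeedToGather; // non-consecutive (or duplicate) slot
  return EntryState::Vectorize;
}

int main() {
  std::printf("%s\n", classifyStores({2, 0, 1, 3}) == EntryState::Vectorize
                          ? "Vectorize"
                          : "NeedToGather"); // jumbled but consecutive
  std::printf("%s\n", classifyStores({0, 1, 4, 5}) == EntryState::Vectorize
                          ? "Vectorize"
                          : "NeedToGather"); // gap -> gather
  return 0;
}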
+
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
const EdgeInfo &UserTreeIdx) {
assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
@@ -4990,7 +5496,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (Value *V : VL) {
if (isConstant(V)) {
ReuseShuffleIndicies.emplace_back(
- isa<UndefValue>(V) ? UndefMaskElem : UniqueValues.size());
+ isa<UndefValue>(V) ? PoisonMaskElem : UniqueValues.size());
UniqueValues.emplace_back(V);
continue;
}
@@ -5010,7 +5516,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return isa<UndefValue>(V) ||
!isConstant(V);
})) ||
- !llvm::isPowerOf2_32(NumUniqueScalarValues)) {
+ !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
return false;
@@ -5257,6 +5763,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (!TryToFindDuplicates(S))
return;
+ // Perform specific checks for each particular instruction kind.
+ OrdersType CurrentOrder;
+ SmallVector<Value *> PointerOps;
+ TreeEntry::EntryState State = getScalarsVectorizationState(
+ S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
+ if (State == TreeEntry::NeedToGather) {
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ return;
+ }
+
auto &BSRef = BlocksSchedules[BB];
if (!BSRef)
BSRef = std::make_unique<BlockScheduling>(BB);
@@ -5285,20 +5802,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::PHI: {
auto *PH = cast<PHINode>(VL0);
- // Check for terminator values (e.g. invoke).
- for (Value *V : VL)
- for (Value *Incoming : cast<PHINode>(V)->incoming_values()) {
- Instruction *Term = dyn_cast<Instruction>(Incoming);
- if (Term && Term->isTerminator()) {
- LLVM_DEBUG(dbgs()
- << "SLP: Need to swizzle PHINodes (terminator use).\n");
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- return;
- }
- }
-
TreeEntry *TE =
newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
@@ -5326,9 +5829,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
case Instruction::ExtractValue:
case Instruction::ExtractElement: {
- OrdersType CurrentOrder;
- bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
- if (Reuse) {
+ if (CurrentOrder.empty()) {
LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
@@ -5339,55 +5840,28 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
VectorizableTree.back()->setOperand(0, Op0);
return;
}
- if (!CurrentOrder.empty()) {
- LLVM_DEBUG({
- dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
- "with order";
- for (unsigned Idx : CurrentOrder)
- dbgs() << " " << Idx;
- dbgs() << "\n";
- });
- fixupOrderingIndices(CurrentOrder);
- // Insert new order with initial value 0, if it does not exist,
- // otherwise return the iterator to the existing one.
- newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies, CurrentOrder);
- // This is a special case, as it does not gather, but at the same time
- // we are not extending buildTree_rec() towards the operands.
- ValueList Op0;
- Op0.assign(VL.size(), VL0->getOperand(0));
- VectorizableTree.back()->setOperand(0, Op0);
- return;
- }
- LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- BS.cancelScheduling(VL, VL0);
+ LLVM_DEBUG({
+ dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
+ "with order";
+ for (unsigned Idx : CurrentOrder)
+ dbgs() << " " << Idx;
+ dbgs() << "\n";
+ });
+ fixupOrderingIndices(CurrentOrder);
+ // Insert new order with initial value 0, if it does not exist,
+ // otherwise return the iterator to the existing one.
+ newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies, CurrentOrder);
+ // This is a special case, as it does not gather, but at the same time
+ // we are not extending buildTree_rec() towards the operands.
+ ValueList Op0;
+ Op0.assign(VL.size(), VL0->getOperand(0));
+ VectorizableTree.back()->setOperand(0, Op0);
return;
}
case Instruction::InsertElement: {
assert(ReuseShuffleIndicies.empty() && "All inserts should be unique");
- // Check that we have a buildvector and not a shuffle of 2 or more
- // different vectors.
- ValueSet SourceVectors;
- for (Value *V : VL) {
- SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
- assert(getInsertIndex(V) != std::nullopt &&
- "Non-constant or undef index?");
- }
-
- if (count_if(VL, [&SourceVectors](Value *V) {
- return !SourceVectors.contains(V);
- }) >= 2) {
- // Found 2nd source vector - cancel.
- LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
- "different source vectors.\n");
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
- BS.cancelScheduling(VL, VL0);
- return;
- }
-
auto OrdCompare = [](const std::pair<int, int> &P1,
const std::pair<int, int> &P2) {
return P1.first > P2.first;
@@ -5430,12 +5904,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// treats loading/storing it as an i8 struct. If we vectorize loads/stores
// from such a struct, we read/write packed bits disagreeing with the
// unvectorized version.
- SmallVector<Value *> PointerOps;
- OrdersType CurrentOrder;
TreeEntry *TE = nullptr;
- switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, *LI, *TLI,
- CurrentOrder, PointerOps)) {
- case LoadsState::Vectorize:
+ switch (State) {
+ case TreeEntry::Vectorize:
if (CurrentOrder.empty()) {
// Original loads are consecutive and do not require reordering.
TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
@@ -5450,7 +5921,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
TE->setOperandsInOrder();
break;
- case LoadsState::ScatterVectorize:
+ case TreeEntry::ScatterVectorize:
// Vectorizing non-consecutive loads with `llvm.masked.gather`.
TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
UserTreeIdx, ReuseShuffleIndicies);
@@ -5458,23 +5929,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
buildTree_rec(PointerOps, Depth + 1, {TE, 0});
LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
break;
- case LoadsState::Gather:
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
-#ifndef NDEBUG
- Type *ScalarTy = VL0->getType();
- if (DL->getTypeSizeInBits(ScalarTy) !=
- DL->getTypeAllocSizeInBits(ScalarTy))
- LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
- else if (any_of(VL, [](Value *V) {
- return !cast<LoadInst>(V)->isSimple();
- }))
- LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
- else
- LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
-#endif // NDEBUG
- break;
+ case TreeEntry::NeedToGather:
+ llvm_unreachable("Unexpected loads state.");
}
return;
}
@@ -5490,18 +5946,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
- Type *SrcTy = VL0->getOperand(0)->getType();
- for (Value *V : VL) {
- Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
- if (Ty != SrcTy || !isValidElementType(Ty)) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs()
- << "SLP: Gathering casts with different src types.\n");
- return;
- }
- }
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
@@ -5521,21 +5965,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::FCmp: {
// Check that all of the compares have the same predicate.
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
- CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
- Type *ComparedTy = VL0->getOperand(0)->getType();
- for (Value *V : VL) {
- CmpInst *Cmp = cast<CmpInst>(V);
- if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
- Cmp->getOperand(0)->getType() != ComparedTy) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs()
- << "SLP: Gathering cmp with different predicate.\n");
- return;
- }
- }
-
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
@@ -5544,7 +5973,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (cast<CmpInst>(VL0)->isCommutative()) {
// Commutative predicate - collect + sort operands of the instructions
// so that each side is more likely to have the same opcode.
- assert(P0 == SwapP0 && "Commutative Predicate mismatch");
+ assert(P0 == CmpInst::getSwappedPredicate(P0) &&
+ "Commutative Predicate mismatch");
reorderInputsAccordingToOpcode(VL, Left, Right, *TLI, *DL, *SE, *this);
} else {
// Collect operands - commute if it uses the swapped predicate.
@@ -5612,60 +6042,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return;
}
case Instruction::GetElementPtr: {
- // We don't combine GEPs with complicated (nested) indexing.
- for (Value *V : VL) {
- auto *I = dyn_cast<GetElementPtrInst>(V);
- if (!I)
- continue;
- if (I->getNumOperands() != 2) {
- LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- return;
- }
- }
-
- // We can't combine several GEPs into one vector if they operate on
- // different types.
- Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
- for (Value *V : VL) {
- auto *GEP = dyn_cast<GEPOperator>(V);
- if (!GEP)
- continue;
- Type *CurTy = GEP->getSourceElementType();
- if (Ty0 != CurTy) {
- LLVM_DEBUG(dbgs()
- << "SLP: not-vectorizable GEP (different types).\n");
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- return;
- }
- }
-
- // We don't combine GEPs with non-constant indexes.
- Type *Ty1 = VL0->getOperand(1)->getType();
- for (Value *V : VL) {
- auto *I = dyn_cast<GetElementPtrInst>(V);
- if (!I)
- continue;
- auto *Op = I->getOperand(1);
- if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
- (Op->getType() != Ty1 &&
- ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
- Op->getType()->getScalarSizeInBits() >
- DL->getIndexSizeInBits(
- V->getType()->getPointerAddressSpace())))) {
- LLVM_DEBUG(dbgs()
- << "SLP: not-vectorizable GEP (non-constant indexes).\n");
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- return;
- }
- }
-
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
@@ -5722,78 +6098,29 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
case Instruction::Store: {
// Check if the stores are consecutive or if we need to swizzle them.
- llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
- // Avoid types that are padded when being allocated as scalars, while
- // being packed together in a vector (such as i1).
- if (DL->getTypeSizeInBits(ScalarTy) !=
- DL->getTypeAllocSizeInBits(ScalarTy)) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
- return;
- }
- // Make sure all stores in the bundle are simple - we can't vectorize
- // atomic or volatile stores.
- SmallVector<Value *, 4> PointerOps(VL.size());
ValueList Operands(VL.size());
- auto POIter = PointerOps.begin();
- auto OIter = Operands.begin();
+ auto *OIter = Operands.begin();
for (Value *V : VL) {
auto *SI = cast<StoreInst>(V);
- if (!SI->isSimple()) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
- return;
- }
- *POIter = SI->getPointerOperand();
*OIter = SI->getValueOperand();
- ++POIter;
++OIter;
}
-
- OrdersType CurrentOrder;
- // Check the order of pointer operands.
- if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
- Value *Ptr0;
- Value *PtrN;
- if (CurrentOrder.empty()) {
- Ptr0 = PointerOps.front();
- PtrN = PointerOps.back();
- } else {
- Ptr0 = PointerOps[CurrentOrder.front()];
- PtrN = PointerOps[CurrentOrder.back()];
- }
- std::optional<int> Dist =
- getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
- // Check that the sorted pointer operands are consecutive.
- if (static_cast<unsigned>(*Dist) == VL.size() - 1) {
- if (CurrentOrder.empty()) {
- // Original stores are consecutive and does not require reordering.
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
- UserTreeIdx, ReuseShuffleIndicies);
- TE->setOperandsInOrder();
- buildTree_rec(Operands, Depth + 1, {TE, 0});
- LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
- } else {
- fixupOrderingIndices(CurrentOrder);
- TreeEntry *TE =
- newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies, CurrentOrder);
- TE->setOperandsInOrder();
- buildTree_rec(Operands, Depth + 1, {TE, 0});
- LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
- }
- return;
- }
+ // Check that the sorted pointer operands are consecutive.
+ if (CurrentOrder.empty()) {
+ // Original stores are consecutive and do not require reordering.
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ TE->setOperandsInOrder();
+ buildTree_rec(Operands, Depth + 1, {TE, 0});
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
+ } else {
+ fixupOrderingIndices(CurrentOrder);
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies, CurrentOrder);
+ TE->setOperandsInOrder();
+ buildTree_rec(Operands, Depth + 1, {TE, 0});
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
}
-
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
return;
}
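The consecutive-store check that used to live in this case (now done earlier, before the bundle reaches this point) boils down to: sort the store pointers by offset and confirm that the sorted offsets form a run with stride one, in which case CurrentOrder is either empty (already consecutive) or the permutation to apply. A minimal standalone sketch of that idea, working on precomputed element offsets instead of LLVM pointers and SCEV (all names here are illustrative, not LLVM's):

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <optional>
#include <vector>

// Returns the sorting permutation if the offsets form a consecutive run
// (stride of exactly one element), std::nullopt otherwise. An identity
// permutation means the stores need no reordering.
std::optional<std::vector<unsigned>>
getConsecutiveOrder(const std::vector<int64_t> &ElemOffsets) {
  std::vector<unsigned> Order(ElemOffsets.size());
  std::iota(Order.begin(), Order.end(), 0u);
  std::sort(Order.begin(), Order.end(), [&](unsigned A, unsigned B) {
    return ElemOffsets[A] < ElemOffsets[B];
  });
  for (unsigned I = 1, E = Order.size(); I < E; ++I)
    if (ElemOffsets[Order[I]] != ElemOffsets[Order[I - 1]] + 1)
      return std::nullopt;
  return Order;
}

With offsets {0, 2, 1, 3} this returns {0, 2, 1, 3}, which plays the role of CurrentOrder in the jumbled-store path above; with {0, 1, 3, 4} it returns std::nullopt and the bundle would have to be gathered instead.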
case Instruction::Call: {
@@ -5802,68 +6129,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
CallInst *CI = cast<CallInst>(VL0);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- VFShape Shape = VFShape::get(
- *CI, ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
- false /*HasGlobalPred*/);
- Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
-
- if (!VecFunc && !isTriviallyVectorizable(ID)) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
- return;
- }
- Function *F = CI->getCalledFunction();
- unsigned NumArgs = CI->arg_size();
- SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr);
- for (unsigned j = 0; j != NumArgs; ++j)
- if (isVectorIntrinsicWithScalarOpAtArg(ID, j))
- ScalarArgs[j] = CI->getArgOperand(j);
- for (Value *V : VL) {
- CallInst *CI2 = dyn_cast<CallInst>(V);
- if (!CI2 || CI2->getCalledFunction() != F ||
- getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
- (VecFunc &&
- VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
- !CI->hasIdenticalOperandBundleSchema(*CI2)) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
- << "\n");
- return;
- }
- // Some intrinsics have scalar arguments and should be same in order for
- // them to be vectorized.
- for (unsigned j = 0; j != NumArgs; ++j) {
- if (isVectorIntrinsicWithScalarOpAtArg(ID, j)) {
- Value *A1J = CI2->getArgOperand(j);
- if (ScalarArgs[j] != A1J) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
- << " argument " << ScalarArgs[j] << "!=" << A1J
- << "\n");
- return;
- }
- }
- }
- // Verify that the bundle operands are identical between the two calls.
- if (CI->hasOperandBundles() &&
- !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
- CI->op_begin() + CI->getBundleOperandsEndIndex(),
- CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:"
- << *CI << "!=" << *V << '\n');
- return;
- }
- }
-
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
TE->setOperandsInOrder();
@@ -5883,15 +6148,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return;
}
case Instruction::ShuffleVector: {
- // If this is not an alternate sequence of opcode like add-sub
- // then do not vectorize this instruction.
- if (!S.isAltShuffle()) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
- return;
- }
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
@@ -5949,19 +6205,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return;
}
default:
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
- return;
+ break;
}
+ llvm_unreachable("Unexpected vectorization of the instructions.");
}
unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
unsigned N = 1;
Type *EltTy = T;
- while (isa<StructType, ArrayType, VectorType>(EltTy)) {
+ while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
if (auto *ST = dyn_cast<StructType>(EltTy)) {
// Check that struct is homogeneous.
for (const auto *Ty : ST->elements())
@@ -5982,7 +6235,8 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
if (!isValidElementType(EltTy))
return 0;
uint64_t VTSize = DL.getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N));
- if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T))
+ if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
+ VTSize != DL.getTypeStoreSizeInBits(T))
return 0;
return N;
}
@@ -6111,68 +6365,6 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
return {IntrinsicCost, LibCost};
}
-/// Compute the cost of creating a vector of type \p VecTy containing the
-/// extracted values from \p VL.
-static InstructionCost
-computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
- TargetTransformInfo::ShuffleKind ShuffleKind,
- ArrayRef<int> Mask, TargetTransformInfo &TTI) {
- unsigned NumOfParts = TTI.getNumberOfParts(VecTy);
-
- if (ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc || !NumOfParts ||
- VecTy->getNumElements() < NumOfParts)
- return TTI.getShuffleCost(ShuffleKind, VecTy, Mask);
-
- bool AllConsecutive = true;
- unsigned EltsPerVector = VecTy->getNumElements() / NumOfParts;
- unsigned Idx = -1;
- InstructionCost Cost = 0;
-
- // Process extracts in blocks of EltsPerVector to check if the source vector
- // operand can be re-used directly. If not, add the cost of creating a shuffle
- // to extract the values into a vector register.
- SmallVector<int> RegMask(EltsPerVector, UndefMaskElem);
- for (auto *V : VL) {
- ++Idx;
-
- // Reached the start of a new vector registers.
- if (Idx % EltsPerVector == 0) {
- RegMask.assign(EltsPerVector, UndefMaskElem);
- AllConsecutive = true;
- continue;
- }
-
- // Need to exclude undefs from analysis.
- if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem)
- continue;
-
- // Check all extracts for a vector register on the target directly
- // extract values in order.
- unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V));
- if (!isa<UndefValue>(VL[Idx - 1]) && Mask[Idx - 1] != UndefMaskElem) {
- unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1]));
- AllConsecutive &= PrevIdx + 1 == CurrentIdx &&
- CurrentIdx % EltsPerVector == Idx % EltsPerVector;
- RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector;
- }
-
- if (AllConsecutive)
- continue;
-
- // Skip all indices, except for the last index per vector block.
- if ((Idx + 1) % EltsPerVector != 0 && Idx + 1 != VL.size())
- continue;
-
- // If we have a series of extracts which are not consecutive and hence
- // cannot re-use the source vector register directly, compute the shuffle
- // cost to extract the vector with EltsPerVector elements.
- Cost += TTI.getShuffleCost(
- TargetTransformInfo::SK_PermuteSingleSrc,
- FixedVectorType::get(VecTy->getElementType(), EltsPerVector), RegMask);
- }
- return Cost;
-}
-
/// Build shuffle mask for shuffle graph entries and lists of main and alternate
/// operations operands.
static void
@@ -6183,7 +6375,7 @@ buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices,
SmallVectorImpl<Value *> *OpScalars = nullptr,
SmallVectorImpl<Value *> *AltScalars = nullptr) {
unsigned Sz = VL.size();
- Mask.assign(Sz, UndefMaskElem);
+ Mask.assign(Sz, PoisonMaskElem);
SmallVector<int> OrderMask;
if (!ReorderIndices.empty())
inversePermutation(ReorderIndices, OrderMask);
@@ -6203,9 +6395,9 @@ buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices,
}
}
if (!ReusesIndices.empty()) {
- SmallVector<int> NewMask(ReusesIndices.size(), UndefMaskElem);
+ SmallVector<int> NewMask(ReusesIndices.size(), PoisonMaskElem);
transform(ReusesIndices, NewMask.begin(), [&Mask](int Idx) {
- return Idx != UndefMaskElem ? Mask[Idx] : UndefMaskElem;
+ return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
});
Mask.swap(NewMask);
}
@@ -6325,13 +6517,13 @@ protected:
static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
ArrayRef<int> ExtMask) {
unsigned VF = Mask.size();
- SmallVector<int> NewMask(ExtMask.size(), UndefMaskElem);
+ SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
- if (ExtMask[I] == UndefMaskElem)
+ if (ExtMask[I] == PoisonMaskElem)
continue;
int MaskedIdx = Mask[ExtMask[I] % VF];
NewMask[I] =
- MaskedIdx == UndefMaskElem ? UndefMaskElem : MaskedIdx % LocalVF;
+ MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
}
Mask.swap(NewMask);
}
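combineMasks composes two shuffle masks: every surviving lane of the outer mask (ExtMask) is redirected to whichever source lane the inner mask already selected, and poison propagates through the composition. A self-contained sketch of the same operation, with -1 standing in for PoisonMaskElem (purely illustrative, not the LLVM helper itself):

#include <vector>

constexpr int Poison = -1;

// Result[I] is the source lane obtained by applying Mask first and then
// ExtMask on top of it; poison in either mask yields poison.
std::vector<int> composeMasks(unsigned LocalVF, const std::vector<int> &Mask,
                              const std::vector<int> &ExtMask) {
  unsigned VF = Mask.size();
  std::vector<int> NewMask(ExtMask.size(), Poison);
  for (unsigned I = 0, E = ExtMask.size(); I < E; ++I) {
    if (ExtMask[I] == Poison)
      continue;
    int Inner = Mask[ExtMask[I] % VF];
    NewMask[I] = Inner == Poison ? Poison : Inner % LocalVF;
  }
  return NewMask;
}

For example, Mask = {2, 3, -1, 0} followed by ExtMask = {1, 1, 3, -1} with LocalVF = 4 composes to {3, 3, 0, -1}.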
@@ -6418,11 +6610,12 @@ protected:
if (auto *SVOpTy =
dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
LocalVF = SVOpTy->getNumElements();
- SmallVector<int> ExtMask(Mask.size(), UndefMaskElem);
+ SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
for (auto [Idx, I] : enumerate(Mask)) {
- if (I == UndefMaskElem)
- continue;
- ExtMask[Idx] = SV->getMaskValue(I);
+ if (I == PoisonMaskElem ||
+ static_cast<unsigned>(I) >= SV->getShuffleMask().size())
+ continue;
+ ExtMask[Idx] = SV->getMaskValue(I);
}
bool IsOp1Undef =
isUndefVector(SV->getOperand(0),
@@ -6435,11 +6628,11 @@ protected:
if (!IsOp1Undef && !IsOp2Undef) {
// Update mask and mark undef elems.
for (int &I : Mask) {
- if (I == UndefMaskElem)
+ if (I == PoisonMaskElem)
continue;
if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
- UndefMaskElem)
- I = UndefMaskElem;
+ PoisonMaskElem)
+ I = PoisonMaskElem;
}
break;
}
@@ -6453,15 +6646,16 @@ protected:
Op = SV->getOperand(1);
}
if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
- !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute)) {
+ !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
+ ShuffleVectorInst::isZeroEltSplatMask(Mask)) {
if (IdentityOp) {
V = IdentityOp;
assert(Mask.size() == IdentityMask.size() &&
"Expected masks of same sizes.");
// Clear known poison elements.
for (auto [I, Idx] : enumerate(Mask))
- if (Idx == UndefMaskElem)
- IdentityMask[I] = UndefMaskElem;
+ if (Idx == PoisonMaskElem)
+ IdentityMask[I] = PoisonMaskElem;
Mask.swap(IdentityMask);
auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
return SinglePermute &&
@@ -6481,10 +6675,12 @@ protected:
/// Smart shuffle instruction emission, walks through shuffle trees and
/// tries to find the best matching vector for the actual shuffle
/// instruction.
- template <typename ShuffleBuilderTy>
- static Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
- ShuffleBuilderTy &Builder) {
+ template <typename T, typename ShuffleBuilderTy>
+ static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
+ ShuffleBuilderTy &Builder) {
assert(V1 && "Expected at least one vector value.");
+ if (V2)
+ Builder.resizeToMatch(V1, V2);
int VF = Mask.size();
if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
VF = FTy->getNumElements();
@@ -6495,8 +6691,8 @@ protected:
Value *Op2 = V2;
int VF =
cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
- SmallVector<int> CombinedMask1(Mask.size(), UndefMaskElem);
- SmallVector<int> CombinedMask2(Mask.size(), UndefMaskElem);
+ SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
+ SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
for (int I = 0, E = Mask.size(); I < E; ++I) {
if (Mask[I] < VF)
CombinedMask1[I] = Mask[I];
@@ -6514,9 +6710,9 @@ protected:
// again.
if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
- SmallVector<int> ExtMask1(Mask.size(), UndefMaskElem);
+ SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
for (auto [Idx, I] : enumerate(CombinedMask1)) {
- if (I == UndefMaskElem)
+ if (I == PoisonMaskElem)
continue;
ExtMask1[Idx] = SV1->getMaskValue(I);
}
@@ -6524,9 +6720,9 @@ protected:
cast<FixedVectorType>(SV1->getOperand(1)->getType())
->getNumElements(),
ExtMask1, UseMask::SecondArg);
- SmallVector<int> ExtMask2(CombinedMask2.size(), UndefMaskElem);
+ SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
for (auto [Idx, I] : enumerate(CombinedMask2)) {
- if (I == UndefMaskElem)
+ if (I == PoisonMaskElem)
continue;
ExtMask2[Idx] = SV2->getMaskValue(I);
}
@@ -6566,64 +6762,360 @@ protected:
->getElementCount()
.getKnownMinValue());
for (int I = 0, E = Mask.size(); I < E; ++I) {
- if (CombinedMask2[I] != UndefMaskElem) {
- assert(CombinedMask1[I] == UndefMaskElem &&
+ if (CombinedMask2[I] != PoisonMaskElem) {
+ assert(CombinedMask1[I] == PoisonMaskElem &&
"Expected undefined mask element");
CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
}
}
+ const int Limit = CombinedMask1.size() * 2;
+ if (Op1 == Op2 && Limit == 2 * VF &&
+ all_of(CombinedMask1, [=](int Idx) { return Idx < Limit; }) &&
+ (ShuffleVectorInst::isIdentityMask(CombinedMask1) ||
+ (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1) &&
+ isa<ShuffleVectorInst>(Op1) &&
+ cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
+ ArrayRef(CombinedMask1))))
+ return Builder.createIdentity(Op1);
return Builder.createShuffleVector(
Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
CombinedMask1);
}
if (isa<PoisonValue>(V1))
- return PoisonValue::get(FixedVectorType::get(
- cast<VectorType>(V1->getType())->getElementType(), Mask.size()));
+ return Builder.createPoison(
+ cast<VectorType>(V1->getType())->getElementType(), Mask.size());
SmallVector<int> NewMask(Mask.begin(), Mask.end());
bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
assert(V1 && "Expected non-null value after looking through shuffles.");
if (!IsIdentity)
return Builder.createShuffleVector(V1, NewMask);
- return V1;
+ return Builder.createIdentity(V1);
}
};
} // namespace
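The reworked createShuffle above is now parameterized both by the result type and by a builder object, so the same shuffle-tree walk can either emit IR or merely price it; the cost path below plugs in a builder whose createIdentity and createPoison are free. A toy, non-LLVM sketch of that design, with simplified "builders" and a trivial identity check standing in for the real analysis:

#include <cstdio>
#include <string>
#include <vector>

// Builder that "emits": returns a description of the instruction it would
// create (a stand-in for IRBuilder-based emission).
struct EmitBuilder {
  std::string createShuffleVector(const std::string &V,
                                  const std::vector<int> &) {
    return "shufflevector(" + V + ")";
  }
  std::string createIdentity(const std::string &V) { return V; }
};

// Builder that prices the same decisions instead of emitting anything.
struct CostBuilder {
  int createShuffleVector(const std::string &, const std::vector<int> &) {
    return 1; // a real shuffle has a cost
  }
  int createIdentity(const std::string &) { return 0; } // identity is free
};

// One driver, two result types: mirrors the split between
// BaseShuffleAnalysis::createShuffle<Value *> and <InstructionCost>.
template <typename T, typename BuilderTy>
T createShuffle(const std::string &V, const std::vector<int> &Mask,
                BuilderTy &Builder) {
  bool IsIdentity = true;
  for (int I = 0, E = Mask.size(); I < E; ++I)
    IsIdentity &= Mask[I] < 0 || Mask[I] == I;
  return IsIdentity ? Builder.createIdentity(V)
                    : Builder.createShuffleVector(V, Mask);
}

int main() {
  std::vector<int> Mask = {1, 0, 2, 3};
  EmitBuilder EB;
  CostBuilder CB;
  std::printf("%s\n", createShuffle<std::string>("%v", Mask, EB).c_str());
  std::printf("cost = %d\n", createShuffle<int>("%v", Mask, CB));
}

Swapping the builder is the whole trick: the ShuffleCostEstimator introduced below reuses the generator's combining logic while producing only an InstructionCost.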
-InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
- ArrayRef<Value *> VectorizedVals) {
- ArrayRef<Value *> VL = E->Scalars;
+/// Merges shuffle masks and emits the final shuffle instruction, if required.
+/// It supports shuffling of 2 input vectors. It implements lazy shuffle
+/// emission: the actual shuffle instruction is generated only if it is really
+/// required. Otherwise, shuffle instruction emission is delayed until the end
+/// of the process, to reduce the number of emitted instructions and to enable
+/// further analysis/transformations.
+class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
+ bool IsFinalized = false;
+ SmallVector<int> CommonMask;
+ SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
+ const TargetTransformInfo &TTI;
+ InstructionCost Cost = 0;
+ ArrayRef<Value *> VectorizedVals;
+ BoUpSLP &R;
+ SmallPtrSetImpl<Value *> &CheckedExtracts;
+ constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+
+ InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
+ if ((!Root && allConstant(VL)) || all_of(VL, UndefValue::classof))
+ return TTI::TCC_Free;
+ auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
+ InstructionCost GatherCost = 0;
+ SmallVector<Value *> Gathers(VL.begin(), VL.end());
+ // Improve gather cost for gather of loads, if we can group some of the
+ // loads into vector loads.
+ InstructionsState S = getSameOpcode(VL, *R.TLI);
+ if (VL.size() > 2 && S.getOpcode() == Instruction::Load &&
+ !S.isAltShuffle() &&
+ !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) &&
+ !isSplat(Gathers)) {
+ BoUpSLP::ValueSet VectorizedLoads;
+ unsigned StartIdx = 0;
+ unsigned VF = VL.size() / 2;
+ unsigned VectorizedCnt = 0;
+ unsigned ScatterVectorizeCnt = 0;
+ const unsigned Sz = R.DL->getTypeSizeInBits(S.MainOp->getType());
+ for (unsigned MinVF = R.getMinVF(2 * Sz); VF >= MinVF; VF /= 2) {
+ for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
+ Cnt += VF) {
+ ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
+ if (!VectorizedLoads.count(Slice.front()) &&
+ !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
+ SmallVector<Value *> PointerOps;
+ OrdersType CurrentOrder;
+ LoadsState LS =
+ canVectorizeLoads(Slice, Slice.front(), TTI, *R.DL, *R.SE,
+ *R.LI, *R.TLI, CurrentOrder, PointerOps);
+ switch (LS) {
+ case LoadsState::Vectorize:
+ case LoadsState::ScatterVectorize:
+ // Mark the vectorized loads so that we don't vectorize them
+ // again.
+ if (LS == LoadsState::Vectorize)
+ ++VectorizedCnt;
+ else
+ ++ScatterVectorizeCnt;
+ VectorizedLoads.insert(Slice.begin(), Slice.end());
+ // If we vectorized the initial block, no need to try to vectorize
+ // it again.
+ if (Cnt == StartIdx)
+ StartIdx += VF;
+ break;
+ case LoadsState::Gather:
+ break;
+ }
+ }
+ }
+ // Check if the whole array was vectorized already - exit.
+ if (StartIdx >= VL.size())
+ break;
+ // Found vectorizable parts - exit.
+ if (!VectorizedLoads.empty())
+ break;
+ }
+ if (!VectorizedLoads.empty()) {
+ unsigned NumParts = TTI.getNumberOfParts(VecTy);
+ bool NeedInsertSubvectorAnalysis =
+ !NumParts || (VL.size() / VF) > NumParts;
+ // Get the cost for gathered loads.
+ for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
+ if (VectorizedLoads.contains(VL[I]))
+ continue;
+ GatherCost += getBuildVectorCost(VL.slice(I, VF), Root);
+ }
+ // Exclude potentially vectorized loads from list of gathered
+ // scalars.
+ auto *LI = cast<LoadInst>(S.MainOp);
+ Gathers.assign(Gathers.size(), PoisonValue::get(LI->getType()));
+ // The cost for vectorized loads.
+ InstructionCost ScalarsCost = 0;
+ for (Value *V : VectorizedLoads) {
+ auto *LI = cast<LoadInst>(V);
+ ScalarsCost +=
+ TTI.getMemoryOpCost(Instruction::Load, LI->getType(),
+ LI->getAlign(), LI->getPointerAddressSpace(),
+ CostKind, TTI::OperandValueInfo(), LI);
+ }
+ auto *LoadTy = FixedVectorType::get(LI->getType(), VF);
+ Align Alignment = LI->getAlign();
+ GatherCost +=
+ VectorizedCnt *
+ TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
+ LI->getPointerAddressSpace(), CostKind,
+ TTI::OperandValueInfo(), LI);
+ GatherCost += ScatterVectorizeCnt *
+ TTI.getGatherScatterOpCost(
+ Instruction::Load, LoadTy, LI->getPointerOperand(),
+ /*VariableMask=*/false, Alignment, CostKind, LI);
+ if (NeedInsertSubvectorAnalysis) {
+ // Add the cost for the subvectors insert.
+ for (int I = VF, E = VL.size(); I < E; I += VF)
+ GatherCost += TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy,
+ std::nullopt, CostKind, I, LoadTy);
+ }
+ GatherCost -= ScalarsCost;
+ }
+ } else if (!Root && isSplat(VL)) {
+ // Found the broadcast of a single scalar, so calculate the cost as
+ // a broadcast.
+ const auto *It =
+ find_if(VL, [](Value *V) { return !isa<UndefValue>(V); });
+ assert(It != VL.end() && "Expected at least one non-undef value.");
+ // Add broadcast for non-identity shuffle only.
+ bool NeedShuffle =
+ count(VL, *It) > 1 &&
+ (VL.front() != *It || !all_of(VL.drop_front(), UndefValue::classof));
+ InstructionCost InsertCost = TTI.getVectorInstrCost(
+ Instruction::InsertElement, VecTy, CostKind,
+ NeedShuffle ? 0 : std::distance(VL.begin(), It),
+ PoisonValue::get(VecTy), *It);
+ return InsertCost +
+ (NeedShuffle ? TTI.getShuffleCost(
+ TargetTransformInfo::SK_Broadcast, VecTy,
+ /*Mask=*/std::nullopt, CostKind, /*Index=*/0,
+ /*SubTp=*/nullptr, /*Args=*/*It)
+ : TTI::TCC_Free);
+ }
+ return GatherCost +
+ (all_of(Gathers, UndefValue::classof)
+ ? TTI::TCC_Free
+ : R.getGatherCost(Gathers, !Root && VL.equals(Gathers)));
+ };
- Type *ScalarTy = VL[0]->getType();
- if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
- ScalarTy = SI->getValueOperand()->getType();
- else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
- ScalarTy = CI->getOperand(0)->getType();
- else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
- ScalarTy = IE->getOperand(1)->getType();
- auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ /// Compute the cost of creating a vector of type \p VecTy containing the
+ /// extracted values from \p VL.
+ InstructionCost computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
+ TTI::ShuffleKind ShuffleKind) {
+ auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
+ unsigned NumOfParts = TTI.getNumberOfParts(VecTy);
- // If we have computed a smaller type for the expression, update VecTy so
- // that the costs will be accurate.
- if (MinBWs.count(VL[0]))
- VecTy = FixedVectorType::get(
- IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
- unsigned EntryVF = E->getVectorFactor();
- auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF);
+ if (ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc ||
+ !NumOfParts || VecTy->getNumElements() < NumOfParts)
+ return TTI.getShuffleCost(ShuffleKind, VecTy, Mask);
- bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
- // FIXME: it tries to fix a problem with MSVC buildbots.
- TargetTransformInfo *TTI = this->TTI;
- auto AdjustExtractsCost = [=](InstructionCost &Cost) {
+ bool AllConsecutive = true;
+ unsigned EltsPerVector = VecTy->getNumElements() / NumOfParts;
+ unsigned Idx = -1;
+ InstructionCost Cost = 0;
+
+ // Process extracts in blocks of EltsPerVector to check if the source vector
+ // operand can be re-used directly. If not, add the cost of creating a
+ // shuffle to extract the values into a vector register.
+ SmallVector<int> RegMask(EltsPerVector, PoisonMaskElem);
+ for (auto *V : VL) {
+ ++Idx;
+
+ // Reached the start of a new vector register.
+ if (Idx % EltsPerVector == 0) {
+ RegMask.assign(EltsPerVector, PoisonMaskElem);
+ AllConsecutive = true;
+ continue;
+ }
+
+ // Need to exclude undefs from analysis.
+ if (isa<UndefValue>(V) || Mask[Idx] == PoisonMaskElem)
+ continue;
+
+ // Check that all extracts for a vector register on the target directly
+ // extract values in order.
+ unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V));
+ if (!isa<UndefValue>(VL[Idx - 1]) && Mask[Idx - 1] != PoisonMaskElem) {
+ unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1]));
+ AllConsecutive &= PrevIdx + 1 == CurrentIdx &&
+ CurrentIdx % EltsPerVector == Idx % EltsPerVector;
+ RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector;
+ }
+
+ if (AllConsecutive)
+ continue;
+
+ // Skip all indices, except for the last index per vector block.
+ if ((Idx + 1) % EltsPerVector != 0 && Idx + 1 != VL.size())
+ continue;
+
+ // If we have a series of extracts which are not consecutive and hence
+ // cannot re-use the source vector register directly, compute the shuffle
+ // cost to extract the vector with EltsPerVector elements.
+ Cost += TTI.getShuffleCost(
+ TargetTransformInfo::SK_PermuteSingleSrc,
+ FixedVectorType::get(VecTy->getElementType(), EltsPerVector),
+ RegMask);
+ }
+ return Cost;
+ }
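The loop above walks the extracts in EltsPerVector-sized blocks and only charges SK_PermuteSingleSrc for blocks whose source indices are not already consecutive and block-aligned. A stripped-down sketch of that per-block predicate, operating on raw extract indices with -1 for undef lanes (illustrative only):

#include <algorithm>
#include <vector>

constexpr int Undef = -1;

// Count the EltsPerVector-wide blocks that would need an extra
// single-source shuffle because their extract indices are not laid out
// consecutively (and aligned) in the source register.
unsigned countShuffledBlocks(const std::vector<int> &ExtractIdx,
                             unsigned EltsPerVector) {
  unsigned Shuffled = 0;
  for (unsigned Begin = 0; Begin < ExtractIdx.size(); Begin += EltsPerVector) {
    unsigned End =
        std::min<unsigned>(Begin + EltsPerVector, ExtractIdx.size());
    bool Consecutive = true;
    for (unsigned I = Begin; I < End; ++I) {
      if (ExtractIdx[I] == Undef)
        continue; // undef lanes do not constrain the layout
      bool Aligned = ExtractIdx[I] % EltsPerVector == I % EltsPerVector;
      bool FollowsPrev = I == Begin || ExtractIdx[I - 1] == Undef ||
                         ExtractIdx[I] == ExtractIdx[I - 1] + 1;
      Consecutive &= Aligned && FollowsPrev;
    }
    Shuffled += !Consecutive;
  }
  return Shuffled;
}

With EltsPerVector = 4, indices {4, 5, 6, 7} cost nothing extra, while {4, 6, 5, 7} makes that block pay for one permute.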
+
+ class ShuffleCostBuilder {
+ const TargetTransformInfo &TTI;
+
+ static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
+ int Limit = 2 * VF;
+ return Mask.empty() ||
+ (VF == Mask.size() &&
+ all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) &&
+ ShuffleVectorInst::isIdentityMask(Mask));
+ }
+
+ public:
+ ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
+ ~ShuffleCostBuilder() = default;
+ InstructionCost createShuffleVector(Value *V1, Value *,
+ ArrayRef<int> Mask) const {
+ // An empty mask or an identity mask is free.
+ unsigned VF =
+ cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
+ if (isEmptyOrIdentity(Mask, VF))
+ return TTI::TCC_Free;
+ return TTI.getShuffleCost(
+ TTI::SK_PermuteTwoSrc,
+ FixedVectorType::get(
+ cast<VectorType>(V1->getType())->getElementType(), Mask.size()),
+ Mask);
+ }
+ InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
+ // An empty mask or an identity mask is free.
+ if (isEmptyOrIdentity(Mask, Mask.size()))
+ return TTI::TCC_Free;
+ return TTI.getShuffleCost(
+ TTI::SK_PermuteSingleSrc,
+ FixedVectorType::get(
+ cast<VectorType>(V1->getType())->getElementType(), Mask.size()),
+ Mask);
+ }
+ InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
+ InstructionCost createPoison(Type *Ty, unsigned VF) const {
+ return TTI::TCC_Free;
+ }
+ void resizeToMatch(Value *&, Value *&) const {}
+ };
+
+ /// Smart shuffle instruction emission, walks through shuffle trees and
+ /// tries to find the best matching vector for the actual shuffle
+ /// instruction.
+ InstructionCost
+ createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
+ const PointerUnion<Value *, const TreeEntry *> &P2,
+ ArrayRef<int> Mask) {
+ ShuffleCostBuilder Builder(TTI);
+ Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
+ unsigned CommonVF = 0;
+ if (!V1) {
+ const TreeEntry *E = P1.get<const TreeEntry *>();
+ unsigned VF = E->getVectorFactor();
+ if (V2) {
+ unsigned V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
+ if (V2VF != VF && V2VF == E->Scalars.size())
+ VF = E->Scalars.size();
+ } else if (!P2.isNull()) {
+ const TreeEntry *E2 = P2.get<const TreeEntry *>();
+ if (E->Scalars.size() == E2->Scalars.size())
+ CommonVF = VF = E->Scalars.size();
+ } else {
+ // P2 is empty, check that we have the same node + reshuffle (if any).
+ if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
+ VF = E->Scalars.size();
+ SmallVector<int> CommonMask(Mask.begin(), Mask.end());
+ ::addMask(CommonMask, E->getCommonMask());
+ V1 = Constant::getNullValue(
+ FixedVectorType::get(E->Scalars.front()->getType(), VF));
+ return BaseShuffleAnalysis::createShuffle<InstructionCost>(
+ V1, nullptr, CommonMask, Builder);
+ }
+ }
+ V1 = Constant::getNullValue(
+ FixedVectorType::get(E->Scalars.front()->getType(), VF));
+ }
+ if (!V2 && !P2.isNull()) {
+ const TreeEntry *E = P2.get<const TreeEntry *>();
+ unsigned VF = E->getVectorFactor();
+ unsigned V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
+ if (!CommonVF && V1VF == E->Scalars.size())
+ CommonVF = E->Scalars.size();
+ if (CommonVF)
+ VF = CommonVF;
+ V2 = Constant::getNullValue(
+ FixedVectorType::get(E->Scalars.front()->getType(), VF));
+ }
+ return BaseShuffleAnalysis::createShuffle<InstructionCost>(V1, V2, Mask,
+ Builder);
+ }
+
+public:
+ ShuffleCostEstimator(TargetTransformInfo &TTI,
+ ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
+ SmallPtrSetImpl<Value *> &CheckedExtracts)
+ : TTI(TTI), VectorizedVals(VectorizedVals), R(R),
+ CheckedExtracts(CheckedExtracts) {}
+ Value *adjustExtracts(const TreeEntry *E, ArrayRef<int> Mask,
+ TTI::ShuffleKind ShuffleKind) {
+ if (Mask.empty())
+ return nullptr;
+ Value *VecBase = nullptr;
+ ArrayRef<Value *> VL = E->Scalars;
+ auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
// If the resulting type is scalarized, do not adjust the cost.
- unsigned VecNumParts = TTI->getNumberOfParts(VecTy);
+ unsigned VecNumParts = TTI.getNumberOfParts(VecTy);
if (VecNumParts == VecTy->getNumElements())
- return;
+ return nullptr;
DenseMap<Value *, int> ExtractVectorsTys;
- SmallPtrSet<Value *, 4> CheckedExtracts;
- for (auto *V : VL) {
- if (isa<UndefValue>(V))
+ for (auto [I, V] : enumerate(VL)) {
+ // Ignore non-extractelement scalars.
+ if (isa<UndefValue>(V) || (!Mask.empty() && Mask[I] == PoisonMaskElem))
continue;
// If all users of the instruction are going to be vectorized and this
// instruction itself is not going to be vectorized, consider this
@@ -6631,17 +7123,18 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
// vectorized tree.
// Also, avoid adjusting the cost for extractelements with multiple uses
// in different graph entries.
- const TreeEntry *VE = getTreeEntry(V);
+ const TreeEntry *VE = R.getTreeEntry(V);
if (!CheckedExtracts.insert(V).second ||
- !areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) ||
+ !R.areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) ||
(VE && VE != E))
continue;
auto *EE = cast<ExtractElementInst>(V);
+ VecBase = EE->getVectorOperand();
std::optional<unsigned> EEIdx = getExtractIndex(EE);
if (!EEIdx)
continue;
unsigned Idx = *EEIdx;
- if (VecNumParts != TTI->getNumberOfParts(EE->getVectorOperandType())) {
+ if (VecNumParts != TTI.getNumberOfParts(EE->getVectorOperandType())) {
auto It =
ExtractVectorsTys.try_emplace(EE->getVectorOperand(), Idx).first;
It->getSecond() = std::min<int>(It->second, Idx);
@@ -6654,18 +7147,17 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
})) {
// Use getExtractWithExtendCost() to calculate the cost of
// extractelement/ext pair.
- Cost -=
- TTI->getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
- EE->getVectorOperandType(), Idx);
+ Cost -= TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
+ EE->getVectorOperandType(), Idx);
// Add back the cost of s|zext which is subtracted separately.
- Cost += TTI->getCastInstrCost(
+ Cost += TTI.getCastInstrCost(
Ext->getOpcode(), Ext->getType(), EE->getType(),
TTI::getCastContextHint(Ext), CostKind, Ext);
continue;
}
}
- Cost -= TTI->getVectorInstrCost(*EE, EE->getVectorOperandType(), CostKind,
- Idx);
+ Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(), CostKind,
+ Idx);
}
// Add a cost for subvector extracts/inserts if required.
for (const auto &Data : ExtractVectorsTys) {
@@ -6673,34 +7165,148 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
unsigned NumElts = VecTy->getNumElements();
if (Data.second % NumElts == 0)
continue;
- if (TTI->getNumberOfParts(EEVTy) > VecNumParts) {
+ if (TTI.getNumberOfParts(EEVTy) > VecNumParts) {
unsigned Idx = (Data.second / NumElts) * NumElts;
unsigned EENumElts = EEVTy->getNumElements();
+ if (Idx % NumElts == 0)
+ continue;
if (Idx + NumElts <= EENumElts) {
- Cost +=
- TTI->getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
- EEVTy, std::nullopt, CostKind, Idx, VecTy);
+ Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
+ EEVTy, std::nullopt, CostKind, Idx, VecTy);
} else {
// Need to round up the subvector type vectorization factor to avoid a
// crash in cost model functions. Make SubVT so that Idx + VF of SubVT
// <= EENumElts.
auto *SubVT =
FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx);
- Cost +=
- TTI->getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
- EEVTy, std::nullopt, CostKind, Idx, SubVT);
+ Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
+ EEVTy, std::nullopt, CostKind, Idx, SubVT);
}
} else {
- Cost += TTI->getShuffleCost(TargetTransformInfo::SK_InsertSubvector,
- VecTy, std::nullopt, CostKind, 0, EEVTy);
+ Cost += TTI.getShuffleCost(TargetTransformInfo::SK_InsertSubvector,
+ VecTy, std::nullopt, CostKind, 0, EEVTy);
}
}
- };
+ // Check that the gather of extractelements can be represented as just a
+ // shuffle of one or two vectors that the scalars are extracted from.
+ // Found the bunch of extractelement instructions that must be gathered
+ // into a vector and can be represented as a permutation of elements in a
+ // single input vector or of two input vectors.
+ Cost += computeExtractCost(VL, Mask, ShuffleKind);
+ return VecBase;
+ }
+ void add(const TreeEntry *E1, const TreeEntry *E2, ArrayRef<int> Mask) {
+ CommonMask.assign(Mask.begin(), Mask.end());
+ InVectors.assign({E1, E2});
+ }
+ void add(const TreeEntry *E1, ArrayRef<int> Mask) {
+ CommonMask.assign(Mask.begin(), Mask.end());
+ InVectors.assign(1, E1);
+ }
+ /// Adds another one input vector and the mask for the shuffling.
+ void add(Value *V1, ArrayRef<int> Mask) {
+ assert(CommonMask.empty() && InVectors.empty() &&
+ "Expected empty input mask/vectors.");
+ CommonMask.assign(Mask.begin(), Mask.end());
+ InVectors.assign(1, V1);
+ }
+ Value *gather(ArrayRef<Value *> VL, Value *Root = nullptr) {
+ Cost += getBuildVectorCost(VL, Root);
+ if (!Root) {
+ assert(InVectors.empty() && "Unexpected input vectors for buildvector.");
+ // FIXME: Need to find a way to avoid use of getNullValue here.
+ SmallVector<Constant *> Vals;
+ for (Value *V : VL) {
+ if (isa<UndefValue>(V)) {
+ Vals.push_back(cast<Constant>(V));
+ continue;
+ }
+ Vals.push_back(Constant::getNullValue(V->getType()));
+ }
+ return ConstantVector::get(Vals);
+ }
+ return ConstantVector::getSplat(
+ ElementCount::getFixed(VL.size()),
+ Constant::getNullValue(VL.front()->getType()));
+ }
+ /// Finalize emission of the shuffles.
+ InstructionCost
+ finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
+ function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
+ IsFinalized = true;
+ if (Action) {
+ const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
+ if (InVectors.size() == 2) {
+ Cost += createShuffle(Vec, InVectors.back(), CommonMask);
+ InVectors.pop_back();
+ } else {
+ Cost += createShuffle(Vec, nullptr, CommonMask);
+ }
+ for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
+ if (CommonMask[Idx] != PoisonMaskElem)
+ CommonMask[Idx] = Idx;
+ assert(VF > 0 &&
+ "Expected vector length for the final value before action.");
+ Value *V = Vec.dyn_cast<Value *>();
+ if (!Vec.isNull() && !V)
+ V = Constant::getNullValue(FixedVectorType::get(
+ Vec.get<const TreeEntry *>()->Scalars.front()->getType(),
+ CommonMask.size()));
+ Action(V, CommonMask);
+ }
+ ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
+ if (CommonMask.empty())
+ return Cost;
+ int Limit = CommonMask.size() * 2;
+ if (all_of(CommonMask, [=](int Idx) { return Idx < Limit; }) &&
+ ShuffleVectorInst::isIdentityMask(CommonMask))
+ return Cost;
+ return Cost +
+ createShuffle(InVectors.front(),
+ InVectors.size() == 2 ? InVectors.back() : nullptr,
+ CommonMask);
+ }
+
+ ~ShuffleCostEstimator() {
+ assert((IsFinalized || CommonMask.empty()) &&
+ "Shuffle construction must be finalized.");
+ }
+};
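One branch of getBuildVectorCost above handles splats: if the repeated value already sits in lane 0 and every other lane is undef, a single insertelement is enough, otherwise an insert plus a broadcast shuffle is priced. A small standalone sketch of just that classification, using std::optional to play the role of UndefValue (toy types, not LLVM's):

#include <algorithm>
#include <optional>
#include <vector>

enum class SplatLowering { AllUndef, SingleInsert, InsertPlusBroadcast };

SplatLowering classifySplat(const std::vector<std::optional<int>> &VL) {
  auto It = std::find_if(
      VL.begin(), VL.end(),
      [](const std::optional<int> &V) { return V.has_value(); });
  if (It == VL.end())
    return SplatLowering::AllUndef; // nothing to materialize
  // A broadcast shuffle is needed only if the value is used in more than one
  // lane and the bundle is not already "value in lane 0, undef elsewhere".
  bool NeedShuffle =
      std::count(VL.begin(), VL.end(), *It) > 1 &&
      (VL.front() != *It ||
       !std::all_of(std::next(VL.begin()), VL.end(),
                    [](const std::optional<int> &V) { return !V.has_value(); }));
  return NeedShuffle ? SplatLowering::InsertPlusBroadcast
                     : SplatLowering::SingleInsert;
}

classifySplat({7, 7, 7, 7}) reports InsertPlusBroadcast, while classifySplat({7, nullopt, nullopt, nullopt}) reports SingleInsert, matching the NeedShuffle logic in the estimator.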
+
+InstructionCost
+BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
+ SmallPtrSetImpl<Value *> &CheckedExtracts) {
+ ArrayRef<Value *> VL = E->Scalars;
+
+ Type *ScalarTy = VL[0]->getType();
+ if (auto *SI = dyn_cast<StoreInst>(VL[0]))
+ ScalarTy = SI->getValueOperand()->getType();
+ else if (auto *CI = dyn_cast<CmpInst>(VL[0]))
+ ScalarTy = CI->getOperand(0)->getType();
+ else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
+ ScalarTy = IE->getOperand(1)->getType();
+ auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+
+ // If we have computed a smaller type for the expression, update VecTy so
+ // that the costs will be accurate.
+ if (MinBWs.count(VL[0]))
+ VecTy = FixedVectorType::get(
+ IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
+ unsigned EntryVF = E->getVectorFactor();
+ auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF);
+
+ bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
if (E->State == TreeEntry::NeedToGather) {
if (allConstant(VL))
return 0;
if (isa<InsertElementInst>(VL[0]))
return InstructionCost::getInvalid();
+ ShuffleCostEstimator Estimator(*TTI, VectorizedVals, *this,
+ CheckedExtracts);
+ unsigned VF = E->getVectorFactor();
+ SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
+ E->ReuseShuffleIndices.end());
SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
// Build a mask out of the reorder indices and reorder scalars per this
// mask.
@@ -6709,195 +7315,104 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
if (!ReorderMask.empty())
reorderScalars(GatheredScalars, ReorderMask);
SmallVector<int> Mask;
+ SmallVector<int> ExtractMask;
+ std::optional<TargetTransformInfo::ShuffleKind> ExtractShuffle;
std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle;
SmallVector<const TreeEntry *> Entries;
+ Type *ScalarTy = GatheredScalars.front()->getType();
+ // Check for gathered extracts.
+ ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask);
+ SmallVector<Value *> IgnoredVals;
+ if (UserIgnoreList)
+ IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end());
+
+ bool Resized = false;
+ if (Value *VecBase = Estimator.adjustExtracts(
+ E, ExtractMask, ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc)))
+ if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
+ if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) {
+ Resized = true;
+ GatheredScalars.append(VF - GatheredScalars.size(),
+ PoisonValue::get(ScalarTy));
+ }
+
// Do not try to look for reshuffled loads for gathered loads (they will be
// handled later), for vectorized scalars, and cases, which are definitely
- // not profitable (splats and small gather nodes.)
- if (E->getOpcode() != Instruction::Load || E->isAltShuffle() ||
+ if (ExtractShuffle || E->getOpcode() != Instruction::Load ||
+ E->isAltShuffle() ||
all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
isSplat(E->Scalars) ||
(E->Scalars != GatheredScalars && GatheredScalars.size() <= 2))
GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries);
if (GatherShuffle) {
- // Remove shuffled elements from list of gathers.
- for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
- if (Mask[I] != UndefMaskElem)
- GatheredScalars[I] = PoisonValue::get(ScalarTy);
- }
assert((Entries.size() == 1 || Entries.size() == 2) &&
"Expected shuffle of 1 or 2 entries.");
- InstructionCost GatherCost = 0;
- int Limit = Mask.size() * 2;
- if (all_of(Mask, [=](int Idx) { return Idx < Limit; }) &&
- ShuffleVectorInst::isIdentityMask(Mask)) {
+ if (*GatherShuffle == TTI::SK_PermuteSingleSrc &&
+ Entries.front()->isSame(E->Scalars)) {
// Perfect match in the graph, will reuse the previously vectorized
// node. Cost is 0.
LLVM_DEBUG(
dbgs()
<< "SLP: perfect diamond match for gather bundle that starts with "
<< *VL.front() << ".\n");
- if (NeedToShuffleReuses)
- GatherCost =
- TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
- FinalVecTy, E->ReuseShuffleIndices);
- } else {
- LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
- << " entries for bundle that starts with "
- << *VL.front() << ".\n");
- // Detected that instead of gather we can emit a shuffle of single/two
- // previously vectorized nodes. Add the cost of the permutation rather
- // than gather.
- ::addMask(Mask, E->ReuseShuffleIndices);
- GatherCost = TTI->getShuffleCost(*GatherShuffle, FinalVecTy, Mask);
- }
- if (!all_of(GatheredScalars, UndefValue::classof))
- GatherCost += getGatherCost(GatheredScalars);
- return GatherCost;
- }
- if ((E->getOpcode() == Instruction::ExtractElement ||
- all_of(E->Scalars,
- [](Value *V) {
- return isa<ExtractElementInst, UndefValue>(V);
- })) &&
- allSameType(VL)) {
- // Check that gather of extractelements can be represented as just a
- // shuffle of a single/two vectors the scalars are extracted from.
- SmallVector<int> Mask;
- std::optional<TargetTransformInfo::ShuffleKind> ShuffleKind =
- isFixedVectorShuffle(VL, Mask);
- if (ShuffleKind) {
- // Found the bunch of extractelement instructions that must be gathered
- // into a vector and can be represented as a permutation elements in a
- // single input vector or of 2 input vectors.
- InstructionCost Cost =
- computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI);
- AdjustExtractsCost(Cost);
- if (NeedToShuffleReuses)
- Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
- FinalVecTy, E->ReuseShuffleIndices);
- return Cost;
- }
- }
- if (isSplat(VL)) {
- // Found the broadcasting of the single scalar, calculate the cost as the
- // broadcast.
- assert(VecTy == FinalVecTy &&
- "No reused scalars expected for broadcast.");
- const auto *It =
- find_if(VL, [](Value *V) { return !isa<UndefValue>(V); });
- // If all values are undefs - consider cost free.
- if (It == VL.end())
- return TTI::TCC_Free;
- // Add broadcast for non-identity shuffle only.
- bool NeedShuffle =
- VL.front() != *It || !all_of(VL.drop_front(), UndefValue::classof);
- InstructionCost InsertCost =
- TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
- /*Index=*/0, PoisonValue::get(VecTy), *It);
- return InsertCost + (NeedShuffle
- ? TTI->getShuffleCost(
- TargetTransformInfo::SK_Broadcast, VecTy,
- /*Mask=*/std::nullopt, CostKind,
- /*Index=*/0,
- /*SubTp=*/nullptr, /*Args=*/VL[0])
- : TTI::TCC_Free);
- }
- InstructionCost ReuseShuffleCost = 0;
- if (NeedToShuffleReuses)
- ReuseShuffleCost = TTI->getShuffleCost(
- TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices);
- // Improve gather cost for gather of loads, if we can group some of the
- // loads into vector loads.
- if (VL.size() > 2 && E->getOpcode() == Instruction::Load &&
- !E->isAltShuffle()) {
- BoUpSLP::ValueSet VectorizedLoads;
- unsigned StartIdx = 0;
- unsigned VF = VL.size() / 2;
- unsigned VectorizedCnt = 0;
- unsigned ScatterVectorizeCnt = 0;
- const unsigned Sz = DL->getTypeSizeInBits(E->getMainOp()->getType());
- for (unsigned MinVF = getMinVF(2 * Sz); VF >= MinVF; VF /= 2) {
- for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
- Cnt += VF) {
- ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
- if (!VectorizedLoads.count(Slice.front()) &&
- !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
- SmallVector<Value *> PointerOps;
- OrdersType CurrentOrder;
- LoadsState LS =
- canVectorizeLoads(Slice, Slice.front(), *TTI, *DL, *SE, *LI,
- *TLI, CurrentOrder, PointerOps);
- switch (LS) {
- case LoadsState::Vectorize:
- case LoadsState::ScatterVectorize:
- // Mark the vectorized loads so that we don't vectorize them
- // again.
- if (LS == LoadsState::Vectorize)
- ++VectorizedCnt;
- else
- ++ScatterVectorizeCnt;
- VectorizedLoads.insert(Slice.begin(), Slice.end());
- // If we vectorized initial block, no need to try to vectorize it
- // again.
- if (Cnt == StartIdx)
- StartIdx += VF;
- break;
- case LoadsState::Gather:
- break;
- }
+ // Restore the mask for previous partially matched values.
+ for (auto [I, V] : enumerate(E->Scalars)) {
+ if (isa<PoisonValue>(V)) {
+ Mask[I] = PoisonMaskElem;
+ continue;
}
+ if (Mask[I] == PoisonMaskElem)
+ Mask[I] = Entries.front()->findLaneForValue(V);
}
- // Check if the whole array was vectorized already - exit.
- if (StartIdx >= VL.size())
- break;
- // Found vectorizable parts - exit.
- if (!VectorizedLoads.empty())
- break;
+ Estimator.add(Entries.front(), Mask);
+ return Estimator.finalize(E->ReuseShuffleIndices);
}
- if (!VectorizedLoads.empty()) {
- InstructionCost GatherCost = 0;
- unsigned NumParts = TTI->getNumberOfParts(VecTy);
- bool NeedInsertSubvectorAnalysis =
- !NumParts || (VL.size() / VF) > NumParts;
- // Get the cost for gathered loads.
- for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
- if (VectorizedLoads.contains(VL[I]))
- continue;
- GatherCost += getGatherCost(VL.slice(I, VF));
- }
- // The cost for vectorized loads.
- InstructionCost ScalarsCost = 0;
- for (Value *V : VectorizedLoads) {
- auto *LI = cast<LoadInst>(V);
- ScalarsCost +=
- TTI->getMemoryOpCost(Instruction::Load, LI->getType(),
- LI->getAlign(), LI->getPointerAddressSpace(),
- CostKind, TTI::OperandValueInfo(), LI);
- }
- auto *LI = cast<LoadInst>(E->getMainOp());
- auto *LoadTy = FixedVectorType::get(LI->getType(), VF);
- Align Alignment = LI->getAlign();
- GatherCost +=
- VectorizedCnt *
- TTI->getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
- LI->getPointerAddressSpace(), CostKind,
- TTI::OperandValueInfo(), LI);
- GatherCost += ScatterVectorizeCnt *
- TTI->getGatherScatterOpCost(
- Instruction::Load, LoadTy, LI->getPointerOperand(),
- /*VariableMask=*/false, Alignment, CostKind, LI);
- if (NeedInsertSubvectorAnalysis) {
- // Add the cost for the subvectors insert.
- for (int I = VF, E = VL.size(); I < E; I += VF)
- GatherCost +=
- TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy,
- std::nullopt, CostKind, I, LoadTy);
- }
- return ReuseShuffleCost + GatherCost - ScalarsCost;
+ if (!Resized) {
+ unsigned VF1 = Entries.front()->getVectorFactor();
+ unsigned VF2 = Entries.back()->getVectorFactor();
+ if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF)
+ GatheredScalars.append(VF - GatheredScalars.size(),
+ PoisonValue::get(ScalarTy));
}
+ // Remove shuffled elements from list of gathers.
+ for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
+ if (Mask[I] != PoisonMaskElem)
+ GatheredScalars[I] = PoisonValue::get(ScalarTy);
+ }
+ LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
+ << " entries for bundle that starts with "
+ << *VL.front() << ".\n";);
+ if (Entries.size() == 1)
+ Estimator.add(Entries.front(), Mask);
+ else
+ Estimator.add(Entries.front(), Entries.back(), Mask);
+ if (all_of(GatheredScalars, PoisonValue::classof))
+ return Estimator.finalize(E->ReuseShuffleIndices);
+ return Estimator.finalize(
+ E->ReuseShuffleIndices, E->Scalars.size(),
+ [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
+ Vec = Estimator.gather(GatheredScalars,
+ Constant::getNullValue(FixedVectorType::get(
+ GatheredScalars.front()->getType(),
+ GatheredScalars.size())));
+ });
}
- return ReuseShuffleCost + getGatherCost(VL);
+ if (!all_of(GatheredScalars, PoisonValue::classof)) {
+ auto Gathers = ArrayRef(GatheredScalars).take_front(VL.size());
+ bool SameGathers = VL.equals(Gathers);
+ Value *BV = Estimator.gather(
+ Gathers, SameGathers ? nullptr
+ : Constant::getNullValue(FixedVectorType::get(
+ GatheredScalars.front()->getType(),
+ GatheredScalars.size())));
+ SmallVector<int> ReuseMask(Gathers.size(), PoisonMaskElem);
+ std::iota(ReuseMask.begin(), ReuseMask.end(), 0);
+ Estimator.add(BV, ReuseMask);
+ }
+ if (ExtractShuffle)
+ Estimator.add(E, std::nullopt);
+ return Estimator.finalize(E->ReuseShuffleIndices);
}
InstructionCost CommonCost = 0;
SmallVector<int> Mask;
@@ -6945,48 +7460,89 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
}
InstructionCost VecCost = VectorCost(CommonCost);
- LLVM_DEBUG(
- dumpTreeCosts(E, CommonCost, VecCost - CommonCost, ScalarCost));
- // Disable warnings for `this` and `E` are unused. Required for
- // `dumpTreeCosts`.
- (void)this;
- (void)E;
+ LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
+ ScalarCost, "Calculated costs for Tree"));
return VecCost - ScalarCost;
};
// Calculate cost difference from vectorizing set of GEPs.
// Negative value means vectorizing is profitable.
auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
- InstructionCost CostSavings = 0;
- for (Value *V : Ptrs) {
- if (V == BasePtr)
- continue;
- auto *Ptr = dyn_cast<GetElementPtrInst>(V);
- // GEPs may contain just addresses without instructions, considered free.
- // GEPs with all constant indices also considered to have zero cost.
- if (!Ptr || Ptr->hasAllConstantIndices())
- continue;
-
- // Here we differentiate two cases: when GEPs represent a regular
- // vectorization tree node (and hence vectorized) and when the set is
- // arguments of a set of loads or stores being vectorized. In the former
- // case all the scalar GEPs will be removed as a result of vectorization.
+ InstructionCost ScalarCost = 0;
+ InstructionCost VecCost = 0;
+ // Here we differentiate two cases: (1) when Ptrs represent a regular
+ // vectorization tree node (as they are pointer arguments of scattered
+ // loads) or (2) when Ptrs are the arguments of loads or stores being
+ // vectorized as a plain wide unit-stride load/store since all the
+ // loads/stores are known to be from/to adjacent locations.
+ assert(E->State == TreeEntry::Vectorize &&
+ "Entry state expected to be Vectorize here.");
+ if (isa<LoadInst, StoreInst>(VL0)) {
+ // Case 2: estimate costs for pointer related costs when vectorizing to
+ // a wide load/store.
+ // Scalar cost is estimated as a set of pointers with known relationship
+ // between them.
+ // For vector code we will use BasePtr as argument for the wide load/store
+ // but we also need to account all the instructions which are going to
+ // stay in vectorized code due to uses outside of these scalar
+ // loads/stores.
+ ScalarCost = TTI->getPointersChainCost(
+ Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
+ CostKind);
+
+ SmallVector<const Value *> PtrsRetainedInVecCode;
+ for (Value *V : Ptrs) {
+ if (V == BasePtr) {
+ PtrsRetainedInVecCode.push_back(V);
+ continue;
+ }
+ auto *Ptr = dyn_cast<GetElementPtrInst>(V);
+ // For simplicity, assume Ptr stays in vectorized code if it is not a
+ // GEP instruction. We don't care, since its cost is considered free.
+ // TODO: We should check for any uses outside of vectorizable tree
+ // rather than just single use.
+ if (!Ptr || !Ptr->hasOneUse())
+ PtrsRetainedInVecCode.push_back(V);
+ }
+
+ if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
+ // If all pointers stay in vectorized code then we don't have
+ // any savings on that.
+ LLVM_DEBUG(dumpTreeCosts(E, 0, ScalarCost, ScalarCost,
+ "Calculated GEPs cost for Tree"));
+ return InstructionCost{TTI::TCC_Free};
+ }
+ VecCost = TTI->getPointersChainCost(
+ PtrsRetainedInVecCode, BasePtr,
+ TTI::PointersChainInfo::getKnownStride(), VecTy, CostKind);
+ } else {
+ // Case 1: Ptrs are the arguments of loads that we are going to transform
+ // into a masked gather load intrinsic.
+ // All the scalar GEPs will be removed as a result of vectorization.
// For any external uses of some lanes, extract element instructions will
- // be generated (which cost is estimated separately). For the latter case
- // since the set of GEPs itself is not vectorized those used more than
- // once will remain staying in vectorized code as well. So we should not
- // count them as savings.
- if (!Ptr->hasOneUse() && isa<LoadInst, StoreInst>(VL0))
- continue;
-
- // TODO: it is target dependent, so need to implement and then use a TTI
- // interface.
- CostSavings += TTI->getArithmeticInstrCost(Instruction::Add,
- Ptr->getType(), CostKind);
- }
- LLVM_DEBUG(dbgs() << "SLP: Calculated GEPs cost savings or Tree:\n";
- E->dump());
- LLVM_DEBUG(dbgs() << "SLP: GEP cost saving = " << CostSavings << "\n");
- return InstructionCost() - CostSavings;
+ // be generated (whose cost is estimated separately).
+ TTI::PointersChainInfo PtrsInfo =
+ all_of(Ptrs,
+ [](const Value *V) {
+ auto *Ptr = dyn_cast<GetElementPtrInst>(V);
+ return Ptr && !Ptr->hasAllConstantIndices();
+ })
+ ? TTI::PointersChainInfo::getUnknownStride()
+ : TTI::PointersChainInfo::getKnownStride();
+
+ ScalarCost = TTI->getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy,
+ CostKind);
+ if (auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr)) {
+ SmallVector<const Value *> Indices(BaseGEP->indices());
+ VecCost = TTI->getGEPCost(BaseGEP->getSourceElementType(),
+ BaseGEP->getPointerOperand(), Indices, VecTy,
+ CostKind);
+ }
+ }
+
+ LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
+ "Calculated GEPs cost for Tree"));
+
+ return VecCost - ScalarCost;
};
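For the wide load/store case the new lambda only credits savings for addresses that actually disappear: the base pointer, non-GEP addresses, and GEPs with additional uses are assumed to survive vectorization, and if everything survives the whole GEP term is treated as free. A toy sketch of that filtering decision (the Ptr record is a stand-in for the real Value/GetElementPtrInst queries, not an LLVM type):

#include <vector>

struct Ptr {
  bool IsBase;    // the pointer chosen as BasePtr for the wide access
  bool IsGEP;     // models isa<GetElementPtrInst>
  bool HasOneUse; // models Value::hasOneUse
};

// Addresses that still have to be materialized even after the scalar
// loads/stores are folded into a single wide access.
unsigned countRetainedInVectorCode(const std::vector<Ptr> &Ptrs) {
  unsigned Retained = 0;
  for (const Ptr &P : Ptrs)
    if (P.IsBase || !P.IsGEP || !P.HasOneUse)
      ++Retained;
  return Retained;
}

// If countRetainedInVectorCode(Ptrs) == Ptrs.size(), vectorization saves
// nothing on address computation and the cost difference is reported as
// TCC_Free, exactly as in the branch above.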
switch (ShuffleOrOp) {
@@ -7062,7 +7618,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
- SmallVector<int> InsertMask(NumElts, UndefMaskElem);
+ SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
unsigned OffsetBeg = *getInsertIndex(VL.front());
unsigned OffsetEnd = OffsetBeg;
InsertMask[OffsetBeg] = 0;
@@ -7099,13 +7655,13 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
SmallVector<int> Mask;
if (!E->ReorderIndices.empty()) {
inversePermutation(E->ReorderIndices, Mask);
- Mask.append(InsertVecSz - Mask.size(), UndefMaskElem);
+ Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
} else {
- Mask.assign(VecSz, UndefMaskElem);
+ Mask.assign(VecSz, PoisonMaskElem);
std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
}
bool IsIdentity = true;
- SmallVector<int> PrevMask(InsertVecSz, UndefMaskElem);
+ SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
Mask.swap(PrevMask);
for (unsigned I = 0; I < NumScalars; ++I) {
unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]);
@@ -7148,14 +7704,14 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
InsertVecTy);
} else {
for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
- Mask[I] = InMask.test(I) ? UndefMaskElem : I;
+ Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
I <= End; ++I)
- if (Mask[I] != UndefMaskElem)
+ if (Mask[I] != PoisonMaskElem)
Mask[I] = I + VecSz;
for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
Mask[I] =
- ((I >= InMask.size()) || InMask.test(I)) ? UndefMaskElem : I;
+ ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
Cost += TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
}
}
@@ -7422,11 +7978,11 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
VecCost +=
TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
} else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
- VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
- Builder.getInt1Ty(),
+ auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
+ VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
CI0->getPredicate(), CostKind, VL0);
VecCost += TTI->getCmpSelInstrCost(
- E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
+ E->getOpcode(), VecTy, MaskTy,
cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
E->getAltOp());
} else {
@@ -7615,7 +8171,7 @@ InstructionCost BoUpSLP::getSpillCost() const {
unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
InstructionCost Cost = 0;
- SmallPtrSet<Instruction*, 4> LiveValues;
+ SmallPtrSet<Instruction *, 4> LiveValues;
Instruction *PrevInst = nullptr;
// The entries in VectorizableTree are not necessarily ordered by their
@@ -7626,6 +8182,8 @@ InstructionCost BoUpSLP::getSpillCost() const {
// are grouped together. Using dominance ensures a deterministic order.
SmallVector<Instruction *, 16> OrderedScalars;
for (const auto &TEPtr : VectorizableTree) {
+ if (TEPtr->State != TreeEntry::Vectorize)
+ continue;
Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
if (!Inst)
continue;
@@ -7639,7 +8197,7 @@ InstructionCost BoUpSLP::getSpillCost() const {
assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
"Different nodes should have different DFS numbers");
if (NodeA != NodeB)
- return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
+ return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
return B->comesBefore(A);
});
@@ -7698,7 +8256,7 @@ InstructionCost BoUpSLP::getSpillCost() const {
};
// Debug information does not impact spill cost.
- if (isa<CallInst>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
+ if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
&*PrevInstIt != PrevInst)
NumCalls++;
@@ -7706,7 +8264,7 @@ InstructionCost BoUpSLP::getSpillCost() const {
}
if (NumCalls) {
- SmallVector<Type*, 4> V;
+ SmallVector<Type *, 4> V;
for (auto *II : LiveValues) {
auto *ScalarTy = II->getType();
if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
@@ -7797,8 +8355,8 @@ static T *performExtractsShuffleAction(
ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
- if (Mask[Idx] == UndefMaskElem)
- Mask[Idx] = IsBasePoison.test(Idx) ? UndefMaskElem : Idx;
+ if (Mask[Idx] == PoisonMaskElem)
+ Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
else
Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
}
@@ -7827,8 +8385,8 @@ static T *performExtractsShuffleAction(
// can shuffle them directly.
ArrayRef<int> SecMask = VMIt->second;
for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
- if (SecMask[I] != UndefMaskElem) {
- assert(Mask[I] == UndefMaskElem && "Multiple uses of scalars.");
+ if (SecMask[I] != PoisonMaskElem) {
+ assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
Mask[I] = SecMask[I] + Vec1VF;
}
}
@@ -7841,12 +8399,12 @@ static T *performExtractsShuffleAction(
ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
ArrayRef<int> SecMask = VMIt->second;
for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
- if (Mask[I] != UndefMaskElem) {
- assert(SecMask[I] == UndefMaskElem && "Multiple uses of scalars.");
+ if (Mask[I] != PoisonMaskElem) {
+ assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
if (Res1.second)
Mask[I] = I;
- } else if (SecMask[I] != UndefMaskElem) {
- assert(Mask[I] == UndefMaskElem && "Multiple uses of scalars.");
+ } else if (SecMask[I] != PoisonMaskElem) {
+ assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
}
}
@@ -7863,11 +8421,11 @@ static T *performExtractsShuffleAction(
ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
ArrayRef<int> SecMask = VMIt->second;
for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
- if (SecMask[I] != UndefMaskElem) {
- assert((Mask[I] == UndefMaskElem || IsBaseNotUndef) &&
+ if (SecMask[I] != PoisonMaskElem) {
+ assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
"Multiple uses of scalars.");
Mask[I] = (Res.second ? I : SecMask[I]) + VF;
- } else if (Mask[I] != UndefMaskElem) {
+ } else if (Mask[I] != PoisonMaskElem) {
Mask[I] = I;
}
}
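These hunks rename UndefMaskElem to PoisonMaskElem; both denote the -1 sentinel in a two-source shuffle mask, where entries below VF select from the first source, entries in [VF, 2*VF) select from the second, and -1 leaves the lane poison. A minimal standalone sketch of that convention (applyShuffleMask and PoisonLane are illustrative names, not LLVM API):

#include <optional>
#include <vector>

constexpr int PoisonLane = -1;

// e.g. Mask {0, 5, -1, 2} over two 4-wide sources yields
// {Src1[0], Src2[1], <poison>, Src1[2]}.
std::vector<std::optional<int>>
applyShuffleMask(const std::vector<int> &Src1, const std::vector<int> &Src2,
                 const std::vector<int> &Mask) {
  const int VF = static_cast<int>(Src1.size());
  std::vector<std::optional<int>> Result;
  Result.reserve(Mask.size());
  for (int Idx : Mask) {
    if (Idx == PoisonLane)
      Result.push_back(std::nullopt);   // poison lane, no defined value
    else if (Idx < VF)
      Result.push_back(Src1[Idx]);      // lane taken from the first source
    else
      Result.push_back(Src2[Idx - VF]); // lane taken from the second source
  }
  return Result;
}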
@@ -7877,12 +8435,23 @@ static T *performExtractsShuffleAction(
}
InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
+ // Build a map for gathered scalars to the nodes where they are used.
+ ValueToGatherNodes.clear();
+ for (const std::unique_ptr<TreeEntry> &EntryPtr : VectorizableTree) {
+ if (EntryPtr->State != TreeEntry::NeedToGather)
+ continue;
+ for (Value *V : EntryPtr->Scalars)
+ if (!isConstant(V))
+ ValueToGatherNodes.try_emplace(V).first->getSecond().insert(
+ EntryPtr.get());
+ }
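getTreeCost now pre-populates a map from every non-constant gathered scalar to the gather nodes that contain it, replacing the per-query scan that a later hunk removes from isGatherShuffledEntry. A standalone sketch of that grouping step with ordinary standard containers (GatherNode and isConstantLike are placeholders, not LLVM types):

#include <set>
#include <string>
#include <unordered_map>
#include <vector>

struct GatherNode {
  int Id;
  std::vector<std::string> Scalars; // scalar values, identified by name
};

// Placeholder: treat names starting with '#' as constants.
bool isConstantLike(const std::string &V) { return !V.empty() && V[0] == '#'; }

std::unordered_map<std::string, std::set<const GatherNode *>>
buildValueToGatherNodes(const std::vector<GatherNode> &Nodes) {
  std::unordered_map<std::string, std::set<const GatherNode *>> Map;
  for (const GatherNode &N : Nodes)
    for (const std::string &V : N.Scalars)
      if (!isConstantLike(V))
        Map[V].insert(&N); // try_emplace + insert in the original
  return Map;
}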
InstructionCost Cost = 0;
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
<< VectorizableTree.size() << ".\n");
unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
+ SmallPtrSet<Value *, 4> CheckedExtracts;
for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
TreeEntry &TE = *VectorizableTree[I];
if (TE.State == TreeEntry::NeedToGather) {
@@ -7898,7 +8467,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
}
}
- InstructionCost C = getEntryCost(&TE, VectorizedVals);
+ InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
Cost += C;
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
<< " for bundle that starts with " << *TE.Scalars[0]
@@ -7951,7 +8520,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
(void)ShuffleMasks.emplace_back();
SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE];
if (Mask.empty())
- Mask.assign(FTy->getNumElements(), UndefMaskElem);
+ Mask.assign(FTy->getNumElements(), PoisonMaskElem);
// Find the insertvector, vectorized in tree, if any.
Value *Base = VU;
while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
@@ -7965,7 +8534,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
do {
IEBase = cast<InsertElementInst>(Base);
int Idx = *getInsertIndex(IEBase);
- assert(Mask[Idx] == UndefMaskElem &&
+ assert(Mask[Idx] == PoisonMaskElem &&
"InsertElementInstruction used already.");
Mask[Idx] = Idx;
Base = IEBase->getOperand(0);
@@ -7985,7 +8554,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
int InIdx = *InsertIdx;
SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE];
if (Mask.empty())
- Mask.assign(FTy->getNumElements(), UndefMaskElem);
+ Mask.assign(FTy->getNumElements(), PoisonMaskElem);
Mask[InIdx] = EU.Lane;
DemandedElts[VecId].setBit(InIdx);
continue;
@@ -8024,7 +8593,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
(all_of(Mask,
[VF](int Idx) { return Idx < 2 * static_cast<int>(VF); }) &&
!ShuffleVectorInst::isIdentityMask(Mask)))) {
- SmallVector<int> OrigMask(VecVF, UndefMaskElem);
+ SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
OrigMask.begin());
C = TTI->getShuffleCost(
@@ -8110,17 +8679,23 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
// No need to check for the topmost gather node.
if (TE == VectorizableTree.front().get())
return std::nullopt;
- Mask.assign(VL.size(), UndefMaskElem);
+ Mask.assign(VL.size(), PoisonMaskElem);
assert(TE->UserTreeIndices.size() == 1 &&
"Expected only single user of the gather node.");
// TODO: currently checking only for Scalars in the tree entry, need to count
// reused elements too for better cost estimation.
Instruction &UserInst =
getLastInstructionInBundle(TE->UserTreeIndices.front().UserTE);
- auto *PHI = dyn_cast<PHINode>(&UserInst);
- auto *NodeUI = DT->getNode(
- PHI ? PHI->getIncomingBlock(TE->UserTreeIndices.front().EdgeIdx)
- : UserInst.getParent());
+ BasicBlock *ParentBB = nullptr;
+ // Main node of PHI entries keeps the correct order of operands/incoming
+ // blocks.
+ if (auto *PHI =
+ dyn_cast<PHINode>(TE->UserTreeIndices.front().UserTE->getMainOp())) {
+ ParentBB = PHI->getIncomingBlock(TE->UserTreeIndices.front().EdgeIdx);
+ } else {
+ ParentBB = UserInst.getParent();
+ }
+ auto *NodeUI = DT->getNode(ParentBB);
assert(NodeUI && "Should only process reachable instructions");
SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
auto CheckOrdering = [&](Instruction *LastEI) {
@@ -8147,45 +8722,6 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
return false;
return true;
};
- // Build a lists of values to tree entries.
- DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>> ValueToTEs;
- for (const std::unique_ptr<TreeEntry> &EntryPtr : VectorizableTree) {
- if (EntryPtr.get() == TE)
- continue;
- if (EntryPtr->State != TreeEntry::NeedToGather)
- continue;
- if (!any_of(EntryPtr->Scalars, [&GatheredScalars](Value *V) {
- return GatheredScalars.contains(V);
- }))
- continue;
- assert(EntryPtr->UserTreeIndices.size() == 1 &&
- "Expected only single user of the gather node.");
- Instruction &EntryUserInst =
- getLastInstructionInBundle(EntryPtr->UserTreeIndices.front().UserTE);
- if (&UserInst == &EntryUserInst) {
- // If 2 gathers are operands of the same entry, compare operands indices,
- // use the earlier one as the base.
- if (TE->UserTreeIndices.front().UserTE ==
- EntryPtr->UserTreeIndices.front().UserTE &&
- TE->UserTreeIndices.front().EdgeIdx <
- EntryPtr->UserTreeIndices.front().EdgeIdx)
- continue;
- }
- // Check if the user node of the TE comes after user node of EntryPtr,
- // otherwise EntryPtr depends on TE.
- auto *EntryPHI = dyn_cast<PHINode>(&EntryUserInst);
- auto *EntryI =
- EntryPHI
- ? EntryPHI
- ->getIncomingBlock(EntryPtr->UserTreeIndices.front().EdgeIdx)
- ->getTerminator()
- : &EntryUserInst;
- if (!CheckOrdering(EntryI))
- continue;
- for (Value *V : EntryPtr->Scalars)
- if (!isConstant(V))
- ValueToTEs.try_emplace(V).first->getSecond().insert(EntryPtr.get());
- }
// Find all tree entries used by the gathered values. If no common entries
// found - not a shuffle.
// Here we build a set of tree nodes for each gathered value and trying to
@@ -8195,16 +8731,58 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
// have a permutation of 2 input vectors.
SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
DenseMap<Value *, int> UsedValuesEntry;
- for (Value *V : TE->Scalars) {
+ for (Value *V : VL) {
if (isConstant(V))
continue;
// Build a list of tree entries where V is used.
SmallPtrSet<const TreeEntry *, 4> VToTEs;
- auto It = ValueToTEs.find(V);
- if (It != ValueToTEs.end())
- VToTEs = It->second;
- if (const TreeEntry *VTE = getTreeEntry(V))
+ for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
+ if (TEPtr == TE)
+ continue;
+ assert(any_of(TEPtr->Scalars,
+ [&](Value *V) { return GatheredScalars.contains(V); }) &&
+ "Must contain at least single gathered value.");
+ assert(TEPtr->UserTreeIndices.size() == 1 &&
+ "Expected only single user of the gather node.");
+ PHINode *EntryPHI =
+ dyn_cast<PHINode>(TEPtr->UserTreeIndices.front().UserTE->getMainOp());
+ Instruction *EntryUserInst =
+ EntryPHI ? nullptr
+ : &getLastInstructionInBundle(
+ TEPtr->UserTreeIndices.front().UserTE);
+ if (&UserInst == EntryUserInst) {
+ assert(!EntryPHI && "Unexpected phi node entry.");
+ // If 2 gathers are operands of the same entry, compare operands
+ // indices, use the earlier one as the base.
+ if (TE->UserTreeIndices.front().UserTE ==
+ TEPtr->UserTreeIndices.front().UserTE &&
+ TE->UserTreeIndices.front().EdgeIdx <
+ TEPtr->UserTreeIndices.front().EdgeIdx)
+ continue;
+ }
+ // Check if the user node of the TE comes after user node of EntryPtr,
+ // otherwise EntryPtr depends on TE.
+ auto *EntryI =
+ EntryPHI
+ ? EntryPHI
+ ->getIncomingBlock(TEPtr->UserTreeIndices.front().EdgeIdx)
+ ->getTerminator()
+ : EntryUserInst;
+ if ((ParentBB != EntryI->getParent() ||
+ TE->UserTreeIndices.front().EdgeIdx <
+ TEPtr->UserTreeIndices.front().EdgeIdx ||
+ TE->UserTreeIndices.front().UserTE !=
+ TEPtr->UserTreeIndices.front().UserTE) &&
+ !CheckOrdering(EntryI))
+ continue;
+ VToTEs.insert(TEPtr);
+ }
+ if (const TreeEntry *VTE = getTreeEntry(V)) {
+ Instruction &EntryUserInst = getLastInstructionInBundle(VTE);
+ if (&EntryUserInst == &UserInst || !CheckOrdering(&EntryUserInst))
+ continue;
VToTEs.insert(VTE);
+ }
if (VToTEs.empty())
continue;
if (UsedTEs.empty()) {
@@ -8260,13 +8838,13 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
});
- if (It != FirstEntries.end()) {
+ if (It != FirstEntries.end() && (*It)->getVectorFactor() == VL.size()) {
Entries.push_back(*It);
std::iota(Mask.begin(), Mask.end(), 0);
// Clear undef scalars.
for (int I = 0, Sz = VL.size(); I < Sz; ++I)
- if (isa<PoisonValue>(TE->Scalars[I]))
- Mask[I] = UndefMaskElem;
+ if (isa<PoisonValue>(VL[I]))
+ Mask[I] = PoisonMaskElem;
return TargetTransformInfo::SK_PermuteSingleSrc;
}
// No perfect match, just shuffle, so choose the first tree node from the
@@ -8302,10 +8880,18 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
break;
}
}
- // No 2 source vectors with the same vector factor - give up and do regular
- // gather.
- if (Entries.empty())
- return std::nullopt;
+ // No 2 source vectors with the same vector factor - just choose 2 with max
+ // index.
+ if (Entries.empty()) {
+ Entries.push_back(
+ *std::max_element(UsedTEs.front().begin(), UsedTEs.front().end(),
+ [](const TreeEntry *TE1, const TreeEntry *TE2) {
+ return TE1->Idx < TE2->Idx;
+ }));
+ Entries.push_back(SecondEntries.front());
+ VF = std::max(Entries.front()->getVectorFactor(),
+ Entries.back()->getVectorFactor());
+ }
}
bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, UndefValue::classof);
@@ -8427,19 +9013,8 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
return std::nullopt;
}
-InstructionCost BoUpSLP::getGatherCost(FixedVectorType *Ty,
- const APInt &ShuffledIndices,
- bool NeedToShuffle) const {
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
- InstructionCost Cost =
- TTI->getScalarizationOverhead(Ty, ~ShuffledIndices, /*Insert*/ true,
- /*Extract*/ false, CostKind);
- if (NeedToShuffle)
- Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
- return Cost;
-}
-
-InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
+InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL,
+ bool ForPoisonSrc) const {
// Find the type of the operands in VL.
Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
@@ -8451,20 +9026,36 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
// shuffle candidates.
APInt ShuffledElements = APInt::getZero(VL.size());
DenseSet<Value *> UniqueElements;
- // Iterate in reverse order to consider insert elements with the high cost.
- for (unsigned I = VL.size(); I > 0; --I) {
- unsigned Idx = I - 1;
+ constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ InstructionCost Cost;
+ auto EstimateInsertCost = [&](unsigned I, Value *V) {
+ if (!ForPoisonSrc)
+ Cost +=
+ TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
+ I, Constant::getNullValue(VecTy), V);
+ };
+ for (unsigned I = 0, E = VL.size(); I < E; ++I) {
+ Value *V = VL[I];
// No need to shuffle duplicates for constants.
- if (isConstant(VL[Idx])) {
- ShuffledElements.setBit(Idx);
+ if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
+ ShuffledElements.setBit(I);
continue;
}
- if (!UniqueElements.insert(VL[Idx]).second) {
+ if (!UniqueElements.insert(V).second) {
DuplicateNonConst = true;
- ShuffledElements.setBit(Idx);
+ ShuffledElements.setBit(I);
+ continue;
}
+ EstimateInsertCost(I, V);
}
- return getGatherCost(VecTy, ShuffledElements, DuplicateNonConst);
+ if (ForPoisonSrc)
+ Cost =
+ TTI->getScalarizationOverhead(VecTy, ~ShuffledElements, /*Insert*/ true,
+ /*Extract*/ false, CostKind);
+ if (DuplicateNonConst)
+ Cost +=
+ TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+ return Cost;
}
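The reworked getGatherCost makes a single forward pass over the scalars: constants are folded into the base vector, repeated scalars are marked for a later permute, and each remaining unique scalar pays an insert cost. A standalone sketch of the shape of that walk with placeholder unit costs; the patch itself distinguishes a poison base from a pre-existing base and queries TargetTransformInfo for the real numbers:

#include <string>
#include <unordered_set>
#include <vector>

struct GatherCostModel {
  int InsertCost = 1;  // placeholder per-lane insertelement cost
  int ShuffleCost = 2; // placeholder single-source permute cost
};

int estimateGatherCost(const std::vector<std::string> &Scalars,
                       const GatherCostModel &M) {
  std::unordered_set<std::string> Unique;
  int Cost = 0;
  bool HasDuplicates = false;
  for (const std::string &V : Scalars) {
    bool IsConstant = !V.empty() && V[0] == '#'; // placeholder constant test
    if (IsConstant)
      continue;                 // constants land in the initial vector
    if (!Unique.insert(V).second) {
      HasDuplicates = true;     // repeated scalar, materialized via shuffle
      continue;
    }
    Cost += M.InsertCost;       // one insert per unique non-constant lane
  }
  if (HasDuplicates)
    Cost += M.ShuffleCost;      // one permute to replicate the duplicates
  return Cost;
}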
// Perform operand reordering on the instructions in VL and return the reordered
@@ -8483,6 +9074,9 @@ void BoUpSLP::reorderInputsAccordingToOpcode(
}
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
+ auto &Res = EntryToLastInstruction.FindAndConstruct(E);
+ if (Res.second)
+ return *Res.second;
// Get the basic block this bundle is in. All instructions in the bundle
// should be in this block (except for extractelement-like instructions with
// constant indeces).
@@ -8497,7 +9091,7 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
isVectorLikeInstWithConstOps(I);
}));
- auto &&FindLastInst = [E, Front, this, &BB]() {
+ auto FindLastInst = [&]() {
Instruction *LastInst = Front;
for (Value *V : E->Scalars) {
auto *I = dyn_cast<Instruction>(V);
@@ -8508,9 +9102,11 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
LastInst = I;
continue;
}
- assert(isVectorLikeInstWithConstOps(LastInst) &&
- isVectorLikeInstWithConstOps(I) &&
- "Expected vector-like insts only.");
+ assert(((E->getOpcode() == Instruction::GetElementPtr &&
+ !isa<GetElementPtrInst>(I)) ||
+ (isVectorLikeInstWithConstOps(LastInst) &&
+ isVectorLikeInstWithConstOps(I))) &&
+ "Expected vector-like or non-GEP in GEP node insts only.");
if (!DT->isReachableFromEntry(LastInst->getParent())) {
LastInst = I;
continue;
@@ -8531,7 +9127,7 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
return LastInst;
};
- auto &&FindFirstInst = [E, Front, this]() {
+ auto FindFirstInst = [&]() {
Instruction *FirstInst = Front;
for (Value *V : E->Scalars) {
auto *I = dyn_cast<Instruction>(V);
@@ -8542,9 +9138,11 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
FirstInst = I;
continue;
}
- assert(isVectorLikeInstWithConstOps(FirstInst) &&
- isVectorLikeInstWithConstOps(I) &&
- "Expected vector-like insts only.");
+ assert(((E->getOpcode() == Instruction::GetElementPtr &&
+ !isa<GetElementPtrInst>(I)) ||
+ (isVectorLikeInstWithConstOps(FirstInst) &&
+ isVectorLikeInstWithConstOps(I))) &&
+ "Expected vector-like or non-GEP in GEP node insts only.");
if (!DT->isReachableFromEntry(FirstInst->getParent())) {
FirstInst = I;
continue;
@@ -8566,22 +9164,23 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
// Set the insert point to the beginning of the basic block if the entry
// should not be scheduled.
- if (E->State != TreeEntry::NeedToGather &&
- (doesNotNeedToSchedule(E->Scalars) ||
+ if (doesNotNeedToSchedule(E->Scalars) ||
+ (E->State != TreeEntry::NeedToGather &&
all_of(E->Scalars, isVectorLikeInstWithConstOps))) {
- Instruction *InsertInst;
- if (all_of(E->Scalars, [](Value *V) {
+ if ((E->getOpcode() == Instruction::GetElementPtr &&
+ any_of(E->Scalars,
+ [](Value *V) {
+ return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
+ })) ||
+ all_of(E->Scalars, [](Value *V) {
return !isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V);
}))
- InsertInst = FindLastInst();
+ Res.second = FindLastInst();
else
- InsertInst = FindFirstInst();
- return *InsertInst;
+ Res.second = FindFirstInst();
+ return *Res.second;
}
- // The last instruction in the bundle in program order.
- Instruction *LastInst = nullptr;
-
// Find the last instruction. The common case should be that BB has been
// scheduled, and the last instruction is VL.back(). So we start with
// VL.back() and iterate over schedule data until we reach the end of the
@@ -8594,7 +9193,7 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
if (Bundle && Bundle->isPartOfBundle())
for (; Bundle; Bundle = Bundle->NextInBundle)
if (Bundle->OpValue == Bundle->Inst)
- LastInst = Bundle->Inst;
+ Res.second = Bundle->Inst;
}
// LastInst can still be null at this point if there's either not an entry
@@ -8615,15 +9214,15 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
// not ideal. However, this should be exceedingly rare since it requires that
// we both exit early from buildTree_rec and that the bundle be out-of-order
// (causing us to iterate all the way to the end of the block).
- if (!LastInst)
- LastInst = FindLastInst();
- assert(LastInst && "Failed to find last instruction in bundle");
- return *LastInst;
+ if (!Res.second)
+ Res.second = FindLastInst();
+ assert(Res.second && "Failed to find last instruction in bundle");
+ return *Res.second;
}
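getLastInstructionInBundle is now memoized per tree entry: the cache slot is created (or found) up front and repeat queries return early. A standalone sketch of that pattern with an unordered_map standing in for the EntryToLastInstruction DenseMap (Entry and Instr are placeholder types):

#include <unordered_map>
#include <vector>

struct Instr { int Pos; };                      // stands in for Instruction
struct Entry { std::vector<Instr *> Scalars; }; // stands in for TreeEntry

Instr *computeLastInstruction(const Entry *E) { // the expensive scan
  Instr *Last = nullptr;
  for (Instr *I : E->Scalars)
    if (!Last || I->Pos > Last->Pos)
      Last = I;
  return Last;
}

Instr *getLastInstruction(const Entry *E,
                          std::unordered_map<const Entry *, Instr *> &Cache) {
  Instr *&Slot = Cache[E]; // insert-or-find, like FindAndConstruct above
  if (Slot)
    return Slot;           // already computed for this entry
  Slot = computeLastInstruction(E);
  return Slot;
}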
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
auto *Front = E->getMainOp();
- Instruction *LastInst = EntryToLastInstruction.lookup(E);
+ Instruction *LastInst = &getLastInstructionInBundle(E);
assert(LastInst && "Failed to find last instruction in bundle");
// If the instruction is PHI, set the insert point after all the PHIs.
bool IsPHI = isa<PHINode>(LastInst);
@@ -8641,7 +9240,7 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
-Value *BoUpSLP::gather(ArrayRef<Value *> VL) {
+Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) {
// List of instructions/lanes from current block and/or the blocks which are
// part of the current loop. These instructions will be inserted at the end to
// make it possible to optimize loops and hoist invariant instructions out of
@@ -8658,7 +9257,8 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL) {
for (int I = 0, E = VL.size(); I < E; ++I) {
if (auto *Inst = dyn_cast<Instruction>(VL[I]))
if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
- getTreeEntry(Inst) || (L && (L->contains(Inst)))) &&
+ getTreeEntry(Inst) ||
+ (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
PostponedIndices.insert(I).second)
PostponedInsts.emplace_back(Inst, I);
}
@@ -8681,7 +9281,7 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL) {
Value *Val0 =
isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0];
FixedVectorType *VecTy = FixedVectorType::get(Val0->getType(), VL.size());
- Value *Vec = PoisonValue::get(VecTy);
+ Value *Vec = Root ? Root : PoisonValue::get(VecTy);
SmallVector<int> NonConsts;
// Insert constant values at first.
for (int I = 0, E = VL.size(); I < E; ++I) {
@@ -8691,6 +9291,18 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL) {
NonConsts.push_back(I);
continue;
}
+ if (Root) {
+ if (!isa<UndefValue>(VL[I])) {
+ NonConsts.push_back(I);
+ continue;
+ }
+ if (isa<PoisonValue>(VL[I]))
+ continue;
+ if (auto *SV = dyn_cast<ShuffleVectorInst>(Root)) {
+ if (SV->getMaskValue(I) == PoisonMaskElem)
+ continue;
+ }
+ }
Vec = CreateInsertElement(Vec, VL[I], I);
}
// Insert non-constant values.
@@ -8789,6 +9401,10 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
}
return Vec;
}
+ Value *createIdentity(Value *V) { return V; }
+ Value *createPoison(Type *Ty, unsigned VF) {
+ return PoisonValue::get(FixedVectorType::get(Ty, VF));
+ }
/// Resizes 2 input vector to match the sizes, if the they are not equal
/// yet. The smallest vector is resized to the size of the larger vector.
void resizeToMatch(Value *&V1, Value *&V2) {
@@ -8798,7 +9414,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
int VF = std::max(V1VF, V2VF);
int MinVF = std::min(V1VF, V2VF);
- SmallVector<int> IdentityMask(VF, UndefMaskElem);
+ SmallVector<int> IdentityMask(VF, PoisonMaskElem);
std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
0);
Value *&Op = MinVF == V1VF ? V1 : V2;
@@ -8821,7 +9437,8 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
assert(V1 && "Expected at least one vector value.");
ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
R.CSEBlocks);
- return BaseShuffleAnalysis::createShuffle(V1, V2, Mask, ShuffleBuilder);
+ return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
+ ShuffleBuilder);
}
/// Transforms mask \p CommonMask per given \p Mask to make proper set after
@@ -8829,7 +9446,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
ArrayRef<int> Mask) {
for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
- if (Mask[Idx] != UndefMaskElem)
+ if (Mask[Idx] != PoisonMaskElem)
CommonMask[Idx] = Idx;
}
@@ -8837,6 +9454,39 @@ public:
ShuffleInstructionBuilder(IRBuilderBase &Builder, BoUpSLP &R)
: Builder(Builder), R(R) {}
+ /// Adjusts extractelements after reusing them.
+ Value *adjustExtracts(const TreeEntry *E, ArrayRef<int> Mask) {
+ Value *VecBase = nullptr;
+ for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
+ int Idx = Mask[I];
+ if (Idx == PoisonMaskElem)
+ continue;
+ auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
+ VecBase = EI->getVectorOperand();
+ // If the only one use is vectorized - can delete the extractelement
+ // itself.
+ if (!EI->hasOneUse() || any_of(EI->users(), [&](User *U) {
+ return !R.ScalarToTreeEntry.count(U);
+ }))
+ continue;
+ R.eraseInstruction(EI);
+ }
+ return VecBase;
+ }
+ /// Checks if the specified entry \p E needs to be delayed because of its
+ /// dependency nodes.
+ Value *needToDelay(const TreeEntry *E, ArrayRef<const TreeEntry *> Deps) {
+ // No need to delay emission if all deps are ready.
+ if (all_of(Deps, [](const TreeEntry *TE) { return TE->VectorizedValue; }))
+ return nullptr;
+ // Postpone gather emission, will be emitted after the end of the
+ // process to keep correct order.
+ auto *VecTy = FixedVectorType::get(E->Scalars.front()->getType(),
+ E->getVectorFactor());
+ return Builder.CreateAlignedLoad(
+ VecTy, PoisonValue::get(PointerType::getUnqual(VecTy->getContext())),
+ MaybeAlign());
+ }
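needToDelay lets the builder postpone a gather whose dependency nodes have no vectorized value yet: a stub is returned now and the real value is emitted once the dependencies are ready. A standalone sketch of that defer-and-revisit flow (Node, Emit, and the placeholder string are illustrative, not the actual stub load the patch emits):

#include <functional>
#include <string>
#include <vector>

struct Node {
  std::vector<Node *> Deps;
  std::string Value; // empty until "vectorized"
};

void processAll(std::vector<Node *> &Nodes,
                const std::function<std::string(Node *)> &Emit) {
  std::vector<Node *> Postponed;
  auto Ready = [](const Node *N) {
    for (const Node *D : N->Deps)
      if (D->Value.empty())
        return false;
    return true;
  };
  for (Node *N : Nodes) {
    if (!Ready(N)) {
      N->Value = "<placeholder>"; // stub result, patched later
      Postponed.push_back(N);
      continue;
    }
    N->Value = Emit(N);
  }
  for (Node *N : Postponed)
    N->Value = Emit(N);           // real emission once deps are available
}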
/// Adds 2 input vectors and the mask for their shuffling.
void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
@@ -8849,15 +9499,15 @@ public:
Value *Vec = InVectors.front();
if (InVectors.size() == 2) {
Vec = createShuffle(Vec, InVectors.back(), CommonMask);
- transformMaskAfterShuffle(CommonMask, Mask);
+ transformMaskAfterShuffle(CommonMask, CommonMask);
} else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
Mask.size()) {
Vec = createShuffle(Vec, nullptr, CommonMask);
- transformMaskAfterShuffle(CommonMask, Mask);
+ transformMaskAfterShuffle(CommonMask, CommonMask);
}
V1 = createShuffle(V1, V2, Mask);
for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
- if (Mask[Idx] != UndefMaskElem)
+ if (Mask[Idx] != PoisonMaskElem)
CommonMask[Idx] = Idx + Sz;
InVectors.front() = Vec;
if (InVectors.size() == 2)
@@ -8870,7 +9520,7 @@ public:
if (InVectors.empty()) {
if (!isa<FixedVectorType>(V1->getType())) {
V1 = createShuffle(V1, nullptr, CommonMask);
- CommonMask.assign(Mask.size(), UndefMaskElem);
+ CommonMask.assign(Mask.size(), PoisonMaskElem);
transformMaskAfterShuffle(CommonMask, Mask);
}
InVectors.push_back(V1);
@@ -8892,7 +9542,7 @@ public:
transformMaskAfterShuffle(CommonMask, CommonMask);
}
for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
- if (CommonMask[Idx] == UndefMaskElem && Mask[Idx] != UndefMaskElem)
+ if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
CommonMask[Idx] =
V->getType() != V1->getType()
? Idx + Sz
@@ -8910,7 +9560,7 @@ public:
// Check if second vector is required if the used elements are already
// used from the first one.
for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
- if (Mask[Idx] != UndefMaskElem && CommonMask[Idx] == UndefMaskElem) {
+ if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
InVectors.push_back(V1);
break;
}
@@ -8919,7 +9569,7 @@ public:
if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
VF = FTy->getNumElements();
for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
- if (Mask[Idx] != UndefMaskElem && CommonMask[Idx] == UndefMaskElem)
+ if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
}
/// Adds another one input vector and the mask for the shuffling.
@@ -8928,17 +9578,46 @@ public:
inversePermutation(Order, NewMask);
add(V1, NewMask);
}
+ Value *gather(ArrayRef<Value *> VL, Value *Root = nullptr) {
+ return R.gather(VL, Root);
+ }
+ Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
/// Finalize emission of the shuffles.
+ /// \param Action the action (if any) to be performed before final applying of
+ /// the \p ExtMask mask.
Value *
- finalize(ArrayRef<int> ExtMask = std::nullopt) {
+ finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
+ function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
IsFinalized = true;
+ if (Action) {
+ Value *Vec = InVectors.front();
+ if (InVectors.size() == 2) {
+ Vec = createShuffle(Vec, InVectors.back(), CommonMask);
+ InVectors.pop_back();
+ } else {
+ Vec = createShuffle(Vec, nullptr, CommonMask);
+ }
+ for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
+ if (CommonMask[Idx] != PoisonMaskElem)
+ CommonMask[Idx] = Idx;
+ assert(VF > 0 &&
+ "Expected vector length for the final value before action.");
+ unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
+ if (VecVF < VF) {
+ SmallVector<int> ResizeMask(VF, PoisonMaskElem);
+ std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
+ Vec = createShuffle(Vec, nullptr, ResizeMask);
+ }
+ Action(Vec, CommonMask);
+ InVectors.front() = Vec;
+ }
if (!ExtMask.empty()) {
if (CommonMask.empty()) {
CommonMask.assign(ExtMask.begin(), ExtMask.end());
} else {
- SmallVector<int> NewMask(ExtMask.size(), UndefMaskElem);
+ SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
- if (ExtMask[I] == UndefMaskElem)
+ if (ExtMask[I] == PoisonMaskElem)
continue;
NewMask[I] = CommonMask[ExtMask[I]];
}
@@ -9009,18 +9688,18 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
// ... (use %2)
// %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
// br %block
- SmallVector<int> UniqueIdxs(VF, UndefMaskElem);
+ SmallVector<int> UniqueIdxs(VF, PoisonMaskElem);
SmallSet<int, 4> UsedIdxs;
int Pos = 0;
for (int Idx : VE->ReuseShuffleIndices) {
- if (Idx != static_cast<int>(VF) && Idx != UndefMaskElem &&
+ if (Idx != static_cast<int>(VF) && Idx != PoisonMaskElem &&
UsedIdxs.insert(Idx).second)
UniqueIdxs[Idx] = Pos;
++Pos;
}
assert(VF >= UsedIdxs.size() && "Expected vectorization factor "
"less than original vector size.");
- UniqueIdxs.append(VF - UsedIdxs.size(), UndefMaskElem);
+ UniqueIdxs.append(VF - UsedIdxs.size(), PoisonMaskElem);
V = FinalShuffle(V, UniqueIdxs);
} else {
assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
@@ -9031,6 +9710,21 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
V = FinalShuffle(V, UniformMask);
}
}
+ // Need to update the operand gather node, if actually the operand is not a
+ // vectorized node, but the buildvector/gather node, which matches one of
+ // the vectorized nodes.
+ if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
+ return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
+ }) == VE->UserTreeIndices.end()) {
+ auto *It = find_if(
+ VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
+ return TE->State == TreeEntry::NeedToGather &&
+ TE->UserTreeIndices.front().UserTE == E &&
+ TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
+ });
+ assert(It != VectorizableTree.end() && "Expected gather node operand.");
+ (*It)->VectorizedValue = V;
+ }
return V;
}
}
@@ -9049,108 +9743,370 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
IRBuilder<>::InsertPointGuard Guard(Builder);
if (E->getOpcode() != Instruction::InsertElement &&
E->getOpcode() != Instruction::PHI) {
- Instruction *LastInst = EntryToLastInstruction.lookup(E);
+ Instruction *LastInst = &getLastInstructionInBundle(E);
assert(LastInst && "Failed to find last instruction in bundle");
Builder.SetInsertPoint(LastInst);
}
return vectorizeTree(I->get());
}
-Value *BoUpSLP::createBuildVector(const TreeEntry *E) {
+template <typename BVTy, typename ResTy, typename... Args>
+ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
assert(E->State == TreeEntry::NeedToGather && "Expected gather node.");
unsigned VF = E->getVectorFactor();
- ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
- SmallVector<Value *> Gathered(
- VF, PoisonValue::get(E->Scalars.front()->getType()));
bool NeedFreeze = false;
- SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
- // Build a mask out of the redorder indices and reorder scalars per this mask.
+ SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
+ E->ReuseShuffleIndices.end());
+ SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
+ // Build a mask out of the reorder indices and reorder scalars per this
+ // mask.
SmallVector<int> ReorderMask;
inversePermutation(E->ReorderIndices, ReorderMask);
if (!ReorderMask.empty())
- reorderScalars(VL, ReorderMask);
- SmallVector<int> ReuseMask(VF, UndefMaskElem);
- if (!allConstant(VL)) {
+ reorderScalars(GatheredScalars, ReorderMask);
+ auto FindReusedSplat = [&](SmallVectorImpl<int> &Mask) {
+ if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
+ return isa<UndefValue>(V) && !isa<PoisonValue>(V);
+ }))
+ return false;
+ TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
+ unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
+ if (UserTE->getNumOperands() != 2)
+ return false;
+ auto *It =
+ find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
+ return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
+ return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
+ }) != TE->UserTreeIndices.end();
+ });
+ if (It == VectorizableTree.end())
+ return false;
+ unsigned I =
+ *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
+ int Sz = Mask.size();
+ if (all_of(Mask, [Sz](int Idx) { return Idx < 2 * Sz; }) &&
+ ShuffleVectorInst::isIdentityMask(Mask))
+ std::iota(Mask.begin(), Mask.end(), 0);
+ else
+ std::fill(Mask.begin(), Mask.end(), I);
+ return true;
+ };
+ BVTy ShuffleBuilder(Params...);
+ ResTy Res = ResTy();
+ SmallVector<int> Mask;
+ SmallVector<int> ExtractMask;
+ std::optional<TargetTransformInfo::ShuffleKind> ExtractShuffle;
+ std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle;
+ SmallVector<const TreeEntry *> Entries;
+ Type *ScalarTy = GatheredScalars.front()->getType();
+ if (!all_of(GatheredScalars, UndefValue::classof)) {
+ // Check for gathered extracts.
+ ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask);
+ SmallVector<Value *> IgnoredVals;
+ if (UserIgnoreList)
+ IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end());
+ bool Resized = false;
+ if (Value *VecBase = ShuffleBuilder.adjustExtracts(E, ExtractMask))
+ if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
+ if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) {
+ Resized = true;
+ GatheredScalars.append(VF - GatheredScalars.size(),
+ PoisonValue::get(ScalarTy));
+ }
+ // Gather extracts after we check for full matched gathers only.
+ if (ExtractShuffle || E->getOpcode() != Instruction::Load ||
+ E->isAltShuffle() ||
+ all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
+ isSplat(E->Scalars) ||
+ (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
+ GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries);
+ }
+ if (GatherShuffle) {
+ if (Value *Delayed = ShuffleBuilder.needToDelay(E, Entries)) {
+ // Delay emission of gathers which are not ready yet.
+ PostponedGathers.insert(E);
+ // Postpone gather emission, will be emitted after the end of the
+ // process to keep correct order.
+ return Delayed;
+ }
+ assert((Entries.size() == 1 || Entries.size() == 2) &&
+ "Expected shuffle of 1 or 2 entries.");
+ if (*GatherShuffle == TTI::SK_PermuteSingleSrc &&
+ Entries.front()->isSame(E->Scalars)) {
+ // Perfect match in the graph, will reuse the previously vectorized
+ // node. Cost is 0.
+ LLVM_DEBUG(
+ dbgs()
+ << "SLP: perfect diamond match for gather bundle that starts with "
+ << *E->Scalars.front() << ".\n");
+ // Restore the mask for previous partially matched values.
+ if (Entries.front()->ReorderIndices.empty() &&
+ ((Entries.front()->ReuseShuffleIndices.empty() &&
+ E->Scalars.size() == Entries.front()->Scalars.size()) ||
+ (E->Scalars.size() ==
+ Entries.front()->ReuseShuffleIndices.size()))) {
+ std::iota(Mask.begin(), Mask.end(), 0);
+ } else {
+ for (auto [I, V] : enumerate(E->Scalars)) {
+ if (isa<PoisonValue>(V)) {
+ Mask[I] = PoisonMaskElem;
+ continue;
+ }
+ Mask[I] = Entries.front()->findLaneForValue(V);
+ }
+ }
+ ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask);
+ Res = ShuffleBuilder.finalize(E->getCommonMask());
+ return Res;
+ }
+ if (!Resized) {
+ unsigned VF1 = Entries.front()->getVectorFactor();
+ unsigned VF2 = Entries.back()->getVectorFactor();
+ if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF)
+ GatheredScalars.append(VF - GatheredScalars.size(),
+ PoisonValue::get(ScalarTy));
+ }
+ // Remove shuffled elements from list of gathers.
+ for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
+ if (Mask[I] != PoisonMaskElem)
+ GatheredScalars[I] = PoisonValue::get(ScalarTy);
+ }
+ }
+ }
+ auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
+ SmallVectorImpl<int> &ReuseMask,
+ bool IsRootPoison) {
// For splats with can emit broadcasts instead of gathers, so try to find
// such sequences.
- bool IsSplat = isSplat(VL) && (VL.size() > 2 || VL.front() == VL.back());
+ bool IsSplat = IsRootPoison && isSplat(Scalars) &&
+ (Scalars.size() > 2 || Scalars.front() == Scalars.back());
+ Scalars.append(VF - Scalars.size(), PoisonValue::get(ScalarTy));
SmallVector<int> UndefPos;
DenseMap<Value *, unsigned> UniquePositions;
// Gather unique non-const values and all constant values.
// For repeated values, just shuffle them.
- for (auto [I, V] : enumerate(VL)) {
+ int NumNonConsts = 0;
+ int SinglePos = 0;
+ for (auto [I, V] : enumerate(Scalars)) {
if (isa<UndefValue>(V)) {
if (!isa<PoisonValue>(V)) {
- Gathered[I] = V;
ReuseMask[I] = I;
UndefPos.push_back(I);
}
continue;
}
if (isConstant(V)) {
- Gathered[I] = V;
ReuseMask[I] = I;
continue;
}
+ ++NumNonConsts;
+ SinglePos = I;
+ Value *OrigV = V;
+ Scalars[I] = PoisonValue::get(ScalarTy);
if (IsSplat) {
- Gathered.front() = V;
+ Scalars.front() = OrigV;
ReuseMask[I] = 0;
} else {
- const auto Res = UniquePositions.try_emplace(V, I);
- Gathered[Res.first->second] = V;
+ const auto Res = UniquePositions.try_emplace(OrigV, I);
+ Scalars[Res.first->second] = OrigV;
ReuseMask[I] = Res.first->second;
}
}
- if (!UndefPos.empty() && IsSplat) {
+ if (NumNonConsts == 1) {
+ // Restore single insert element.
+ if (IsSplat) {
+ ReuseMask.assign(VF, PoisonMaskElem);
+ std::swap(Scalars.front(), Scalars[SinglePos]);
+ if (!UndefPos.empty() && UndefPos.front() == 0)
+ Scalars.front() = UndefValue::get(ScalarTy);
+ }
+ ReuseMask[SinglePos] = SinglePos;
+ } else if (!UndefPos.empty() && IsSplat) {
// For undef values, try to replace them with the simple broadcast.
// We can do it if the broadcasted value is guaranteed to be
// non-poisonous, or by freezing the incoming scalar value first.
- auto *It = find_if(Gathered, [this, E](Value *V) {
+ auto *It = find_if(Scalars, [this, E](Value *V) {
return !isa<UndefValue>(V) &&
(getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
- any_of(V->uses(), [E](const Use &U) {
- // Check if the value already used in the same operation in
- // one of the nodes already.
- return E->UserTreeIndices.size() == 1 &&
- is_contained(
- E->UserTreeIndices.front().UserTE->Scalars,
- U.getUser()) &&
- E->UserTreeIndices.front().EdgeIdx != U.getOperandNo();
- }));
+ (E->UserTreeIndices.size() == 1 &&
+ any_of(V->uses(), [E](const Use &U) {
+ // Check if the value already used in the same operation in
+ // one of the nodes already.
+ return E->UserTreeIndices.front().EdgeIdx !=
+ U.getOperandNo() &&
+ is_contained(
+ E->UserTreeIndices.front().UserTE->Scalars,
+ U.getUser());
+ })));
});
- if (It != Gathered.end()) {
+ if (It != Scalars.end()) {
// Replace undefs by the non-poisoned scalars and emit broadcast.
- int Pos = std::distance(Gathered.begin(), It);
+ int Pos = std::distance(Scalars.begin(), It);
for_each(UndefPos, [&](int I) {
// Set the undef position to the non-poisoned scalar.
ReuseMask[I] = Pos;
- // Replace the undef by the poison, in the mask it is replaced by non-poisoned scalar already.
+ // Replace the undef by the poison, in the mask it is replaced by
+ // non-poisoned scalar already.
if (I != Pos)
- Gathered[I] = PoisonValue::get(Gathered[I]->getType());
+ Scalars[I] = PoisonValue::get(ScalarTy);
});
} else {
// Replace undefs by the poisons, emit broadcast and then emit
// freeze.
for_each(UndefPos, [&](int I) {
- ReuseMask[I] = UndefMaskElem;
- if (isa<UndefValue>(Gathered[I]))
- Gathered[I] = PoisonValue::get(Gathered[I]->getType());
+ ReuseMask[I] = PoisonMaskElem;
+ if (isa<UndefValue>(Scalars[I]))
+ Scalars[I] = PoisonValue::get(ScalarTy);
});
NeedFreeze = true;
}
}
+ };
+ if (ExtractShuffle || GatherShuffle) {
+ bool IsNonPoisoned = true;
+ bool IsUsedInExpr = false;
+ Value *Vec1 = nullptr;
+ if (ExtractShuffle) {
+ // Gather of extractelements can be represented as just a shuffle of
+ // a single/two vectors the scalars are extracted from.
+ // Find input vectors.
+ Value *Vec2 = nullptr;
+ for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
+ if (ExtractMask[I] == PoisonMaskElem ||
+ (!Mask.empty() && Mask[I] != PoisonMaskElem)) {
+ ExtractMask[I] = PoisonMaskElem;
+ continue;
+ }
+ if (isa<UndefValue>(E->Scalars[I]))
+ continue;
+ auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
+ if (!Vec1) {
+ Vec1 = EI->getVectorOperand();
+ } else if (Vec1 != EI->getVectorOperand()) {
+ assert((!Vec2 || Vec2 == EI->getVectorOperand()) &&
+ "Expected only 1 or 2 vectors shuffle.");
+ Vec2 = EI->getVectorOperand();
+ }
+ }
+ if (Vec2) {
+ IsNonPoisoned &=
+ isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2);
+ ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
+ } else if (Vec1) {
+ IsUsedInExpr = FindReusedSplat(ExtractMask);
+ ShuffleBuilder.add(Vec1, ExtractMask);
+ IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
+ } else {
+ ShuffleBuilder.add(PoisonValue::get(FixedVectorType::get(
+ ScalarTy, GatheredScalars.size())),
+ ExtractMask);
+ }
+ }
+ if (GatherShuffle) {
+ if (Entries.size() == 1) {
+ IsUsedInExpr = FindReusedSplat(Mask);
+ ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask);
+ IsNonPoisoned &=
+ isGuaranteedNotToBePoison(Entries.front()->VectorizedValue);
+ } else {
+ ShuffleBuilder.add(Entries.front()->VectorizedValue,
+ Entries.back()->VectorizedValue, Mask);
+ IsNonPoisoned &=
+ isGuaranteedNotToBePoison(Entries.front()->VectorizedValue) &&
+ isGuaranteedNotToBePoison(Entries.back()->VectorizedValue);
+ }
+ }
+ // Try to figure out best way to combine values: build a shuffle and insert
+ // elements or just build several shuffles.
+ // Insert non-constant scalars.
+ SmallVector<Value *> NonConstants(GatheredScalars);
+ int EMSz = ExtractMask.size();
+ int MSz = Mask.size();
+ // Try to build constant vector and shuffle with it only if currently we
+ // have a single permutation and more than 1 scalar constants.
+ bool IsSingleShuffle = !ExtractShuffle || !GatherShuffle;
+ bool IsIdentityShuffle =
+ (ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc) ==
+ TTI::SK_PermuteSingleSrc &&
+ none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
+ ShuffleVectorInst::isIdentityMask(ExtractMask)) ||
+ (GatherShuffle.value_or(TTI::SK_PermuteTwoSrc) ==
+ TTI::SK_PermuteSingleSrc &&
+ none_of(Mask, [&](int I) { return I >= MSz; }) &&
+ ShuffleVectorInst::isIdentityMask(Mask));
+ bool EnoughConstsForShuffle =
+ IsSingleShuffle &&
+ (none_of(GatheredScalars,
+ [](Value *V) {
+ return isa<UndefValue>(V) && !isa<PoisonValue>(V);
+ }) ||
+ any_of(GatheredScalars,
+ [](Value *V) {
+ return isa<Constant>(V) && !isa<UndefValue>(V);
+ })) &&
+ (!IsIdentityShuffle ||
+ (GatheredScalars.size() == 2 &&
+ any_of(GatheredScalars,
+ [](Value *V) { return !isa<UndefValue>(V); })) ||
+ count_if(GatheredScalars, [](Value *V) {
+ return isa<Constant>(V) && !isa<PoisonValue>(V);
+ }) > 1);
+ // NonConstants array contains just non-constant values, GatheredScalars
+ // contains only constant to build final vector and then shuffle.
+ for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
+ if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
+ NonConstants[I] = PoisonValue::get(ScalarTy);
+ else
+ GatheredScalars[I] = PoisonValue::get(ScalarTy);
+ }
+ // Generate constants for final shuffle and build a mask for them.
+ if (!all_of(GatheredScalars, PoisonValue::classof)) {
+ SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
+ TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
+ Value *BV = ShuffleBuilder.gather(GatheredScalars);
+ ShuffleBuilder.add(BV, BVMask);
+ }
+ if (all_of(NonConstants, [=](Value *V) {
+ return isa<PoisonValue>(V) ||
+ (IsSingleShuffle && ((IsIdentityShuffle &&
+ IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
+ }))
+ Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
+ else
+ Res = ShuffleBuilder.finalize(
+ E->ReuseShuffleIndices, E->Scalars.size(),
+ [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
+ TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
+ Vec = ShuffleBuilder.gather(NonConstants, Vec);
+ });
+ } else if (!allConstant(GatheredScalars)) {
+ // Gather unique scalars and all constants.
+ SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
+ TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
+ Value *BV = ShuffleBuilder.gather(GatheredScalars);
+ ShuffleBuilder.add(BV, ReuseMask);
+ Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
} else {
- ReuseMask.clear();
- copy(VL, Gathered.begin());
+ // Gather all constants.
+ SmallVector<int> Mask(E->Scalars.size(), PoisonMaskElem);
+ for (auto [I, V] : enumerate(E->Scalars)) {
+ if (!isa<PoisonValue>(V))
+ Mask[I] = I;
+ }
+ Value *BV = ShuffleBuilder.gather(E->Scalars);
+ ShuffleBuilder.add(BV, Mask);
+ Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
}
- // Gather unique scalars and all constants.
- Value *Vec = gather(Gathered);
- ShuffleBuilder.add(Vec, ReuseMask);
- Vec = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
+
if (NeedFreeze)
- Vec = Builder.CreateFreeze(Vec);
- return Vec;
+ Res = ShuffleBuilder.createFreeze(Res);
+ return Res;
+}
+
+Value *BoUpSLP::createBuildVector(const TreeEntry *E) {
+ return processBuildVector<ShuffleInstructionBuilder, Value *>(E, Builder,
+ *this);
}
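createBuildVector is now a thin wrapper over the processBuildVector template, so the same buildvector analysis can drive different builder policies (for example the IR-emitting ShuffleInstructionBuilder here and a cost-estimating counterpart). A standalone sketch of that policy-template shape; the builder classes, their members, and the driver body are illustrative only:

#include <iostream>
#include <string>
#include <vector>

struct CostBuilder {
  int PerLane;
  explicit CostBuilder(int PerLaneCost) : PerLane(PerLaneCost) {}
  int add(const std::vector<int> &Vals) {
    return PerLane * static_cast<int>(Vals.size());
  }
  int finalize(int Acc) { return Acc; }
};

struct CodeGenBuilder {
  std::string Prefix;
  explicit CodeGenBuilder(const std::string &P) : Prefix(P) {}
  std::string add(const std::vector<int> &Vals) {
    std::string S = Prefix + "(";
    for (size_t I = 0; I < Vals.size(); ++I)
      S += (I ? ", " : "") + std::to_string(Vals[I]);
    return S + ")";
  }
  std::string finalize(std::string S) { return S; }
};

template <typename BuilderTy, typename ResTy, typename... Args>
ResTy processBuild(const std::vector<int> &Scalars, Args &...Params) {
  BuilderTy Builder(Params...);
  ResTy Res = Builder.add(Scalars); // the shared analysis would live here
  return Builder.finalize(Res);
}

int main() {
  std::vector<int> Scalars = {1, 2, 3};
  int PerLane = 2;
  std::string Prefix = "buildvector";
  std::cout << processBuild<CostBuilder, int>(Scalars, PerLane) << "\n";
  std::cout << processBuild<CodeGenBuilder, std::string>(Scalars, Prefix) << "\n";
}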
Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
@@ -9161,10 +10117,17 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return E->VectorizedValue;
}
+ if (E->State == TreeEntry::NeedToGather) {
+ if (E->getMainOp() && E->Idx == 0)
+ setInsertPointAfterBundle(E);
+ Value *Vec = createBuildVector(E);
+ E->VectorizedValue = Vec;
+ return Vec;
+ }
+
auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
- if (E->State != TreeEntry::NeedToGather &&
- E->getOpcode() == Instruction::Store) {
+ if (E->getOpcode() == Instruction::Store) {
ArrayRef<int> Mask =
ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
E->ReorderIndices.size());
@@ -9175,45 +10138,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
};
- if (E->State == TreeEntry::NeedToGather) {
- if (E->Idx > 0) {
- // We are in the middle of a vectorizable chain. We need to gather the
- // scalars from the users.
- Value *Vec = createBuildVector(E);
- E->VectorizedValue = Vec;
- return Vec;
- }
- if (E->getMainOp())
- setInsertPointAfterBundle(E);
- SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
- // Build a mask out of the reorder indices and reorder scalars per this
- // mask.
- SmallVector<int> ReorderMask;
- inversePermutation(E->ReorderIndices, ReorderMask);
- if (!ReorderMask.empty())
- reorderScalars(GatheredScalars, ReorderMask);
- Value *Vec;
- SmallVector<int> Mask;
- SmallVector<const TreeEntry *> Entries;
- std::optional<TargetTransformInfo::ShuffleKind> Shuffle =
- isGatherShuffledEntry(E, GatheredScalars, Mask, Entries);
- if (Shuffle) {
- assert((Entries.size() == 1 || Entries.size() == 2) &&
- "Expected shuffle of 1 or 2 entries.");
- Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue,
- Entries.back()->VectorizedValue, Mask);
- if (auto *I = dyn_cast<Instruction>(Vec)) {
- GatherShuffleExtractSeq.insert(I);
- CSEBlocks.insert(I->getParent());
- }
- } else {
- Vec = gather(E->Scalars);
- }
- Vec = FinalShuffle(Vec, E);
- E->VectorizedValue = Vec;
- return Vec;
- }
-
assert((E->State == TreeEntry::Vectorize ||
E->State == TreeEntry::ScatterVectorize) &&
"Unhandled state");
@@ -9248,7 +10172,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
// PHINodes may have multiple entries from the same block. We want to
// visit every block once.
- SmallPtrSet<BasicBlock*, 4> VisitedBBs;
+ SmallPtrSet<BasicBlock *, 4> VisitedBBs;
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
ValueList Operands;
@@ -9314,14 +10238,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
SmallVector<int> Mask;
if (!E->ReorderIndices.empty()) {
inversePermutation(E->ReorderIndices, Mask);
- Mask.append(NumElts - NumScalars, UndefMaskElem);
+ Mask.append(NumElts - NumScalars, PoisonMaskElem);
} else {
- Mask.assign(NumElts, UndefMaskElem);
+ Mask.assign(NumElts, PoisonMaskElem);
std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
}
// Create InsertVector shuffle if necessary
bool IsIdentity = true;
- SmallVector<int> PrevMask(NumElts, UndefMaskElem);
+ SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
Mask.swap(PrevMask);
for (unsigned I = 0; I < NumScalars; ++I) {
Value *Scalar = E->Scalars[PrevMask[I]];
@@ -9337,9 +10261,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
}
- SmallVector<int> InsertMask(NumElts, UndefMaskElem);
+ SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
for (unsigned I = 0; I < NumElts; I++) {
- if (Mask[I] != UndefMaskElem)
+ if (Mask[I] != PoisonMaskElem)
InsertMask[Offset + I] = I;
}
SmallBitVector UseMask =
@@ -9354,10 +10278,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
if (!IsFirstPoison.all()) {
for (unsigned I = 0; I < NumElts; I++) {
- if (InsertMask[I] == UndefMaskElem && !IsFirstPoison.test(I))
+ if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
InsertMask[I] = I + NumElts;
}
- }
+ }
V = Builder.CreateShuffleVector(
V,
IsFirstPoison.all() ? PoisonValue::get(V->getType())
@@ -9372,8 +10296,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
SmallBitVector IsFirstPoison =
isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
for (unsigned I = 0; I < NumElts; I++) {
- if (InsertMask[I] == UndefMaskElem)
- InsertMask[I] = IsFirstPoison.test(I) ? UndefMaskElem : I;
+ if (InsertMask[I] == PoisonMaskElem)
+ InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
else
InsertMask[I] += NumElts;
}
@@ -9544,20 +10468,17 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
LoadInst *LI = cast<LoadInst>(VL0);
Instruction *NewLI;
- unsigned AS = LI->getPointerAddressSpace();
Value *PO = LI->getPointerOperand();
if (E->State == TreeEntry::Vectorize) {
- Value *VecPtr = Builder.CreateBitCast(PO, VecTy->getPointerTo(AS));
- NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign());
+ NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
- // The pointer operand uses an in-tree scalar so we add the new BitCast
- // or LoadInst to ExternalUses list to make sure that an extract will
+ // The pointer operand uses an in-tree scalar so we add the new
+ // LoadInst to ExternalUses list to make sure that an extract will
// be generated in the future.
if (TreeEntry *Entry = getTreeEntry(PO)) {
// Find which lane we need to extract.
unsigned FoundLane = Entry->findLaneForValue(PO);
- ExternalUses.emplace_back(
- PO, PO != VecPtr ? cast<User>(VecPtr) : NewLI, FoundLane);
+ ExternalUses.emplace_back(PO, NewLI, FoundLane);
}
} else {
assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
@@ -9653,7 +10574,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
CallInst *CI = cast<CallInst>(VL0);
setInsertPointAfterBundle(E);
- Intrinsic::ID IID = Intrinsic::not_intrinsic;
+ Intrinsic::ID IID = Intrinsic::not_intrinsic;
if (Function *FI = CI->getCalledFunction())
IID = FI->getIntrinsicID();
@@ -9665,8 +10586,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *ScalarArg = nullptr;
std::vector<Value *> OpVecs;
- SmallVector<Type *, 2> TysForDecl =
- {FixedVectorType::get(CI->getType(), E->Scalars.size())};
+ SmallVector<Type *, 2> TysForDecl;
+ // Add return type if intrinsic is overloaded on it.
+ if (isVectorIntrinsicWithOverloadTypeAtArg(IID, -1))
+ TysForDecl.push_back(
+ FixedVectorType::get(CI->getType(), E->Scalars.size()));
for (int j = 0, e = CI->arg_size(); j < e; ++j) {
ValueList OpVL;
// Some intrinsics have scalar arguments. This argument should not be
@@ -9808,14 +10732,15 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return V;
}
default:
- llvm_unreachable("unknown inst");
+ llvm_unreachable("unknown inst");
}
return nullptr;
}
Value *BoUpSLP::vectorizeTree() {
ExtraValueToDebugLocsMap ExternallyUsedValues;
- return vectorizeTree(ExternallyUsedValues);
+ SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
+ return vectorizeTree(ExternallyUsedValues, ReplacedExternals);
}
namespace {
@@ -9829,28 +10754,51 @@ struct ShuffledInsertData {
};
} // namespace
-Value *BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues,
- Instruction *ReductionRoot) {
+Value *BoUpSLP::vectorizeTree(
+ const ExtraValueToDebugLocsMap &ExternallyUsedValues,
+ SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
+ Instruction *ReductionRoot) {
// All blocks must be scheduled before any instructions are inserted.
for (auto &BSIter : BlocksSchedules) {
scheduleBlock(BSIter.second.get());
}
-
- // Pre-gather last instructions.
- for (const std::unique_ptr<TreeEntry> &E : VectorizableTree) {
- if ((E->State == TreeEntry::NeedToGather &&
- (!E->getMainOp() || E->Idx > 0)) ||
- (E->State != TreeEntry::NeedToGather &&
- E->getOpcode() == Instruction::ExtractValue) ||
- E->getOpcode() == Instruction::InsertElement)
- continue;
- Instruction *LastInst = &getLastInstructionInBundle(E.get());
- EntryToLastInstruction.try_emplace(E.get(), LastInst);
- }
+ // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
+ // need to rebuild it.
+ EntryToLastInstruction.clear();
Builder.SetInsertPoint(ReductionRoot ? ReductionRoot
: &F->getEntryBlock().front());
auto *VectorRoot = vectorizeTree(VectorizableTree[0].get());
+ // Run through the list of postponed gathers and emit them, replacing the temp
+ // emitted allocas with actual vector instructions.
+ ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
+ DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
+ for (const TreeEntry *E : PostponedNodes) {
+ auto *TE = const_cast<TreeEntry *>(E);
+ if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
+ if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
+ TE->UserTreeIndices.front().EdgeIdx)))
+ // Found gather node which is absolutely the same as one of the
+ // vectorized nodes. It may happen after reordering.
+ continue;
+ auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
+ TE->VectorizedValue = nullptr;
+ auto *UserI =
+ cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
+ Builder.SetInsertPoint(PrevVec);
+ Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
+ Value *Vec = vectorizeTree(TE);
+ PrevVec->replaceAllUsesWith(Vec);
+ PostponedValues.try_emplace(Vec).first->second.push_back(TE);
+ // Replace the stub vector node, if it was used before for one of the
+ // buildvector nodes already.
+ auto It = PostponedValues.find(PrevVec);
+ if (It != PostponedValues.end()) {
+ for (TreeEntry *VTE : It->getSecond())
+ VTE->VectorizedValue = Vec;
+ }
+ eraseInstruction(PrevVec);
+ }
// If the vectorized tree can be rewritten in a smaller type, we truncate the
// vectorized root. InstCombine will then rewrite the entire expression. We
@@ -9968,14 +10916,9 @@ Value *BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues,
Builder.SetInsertPoint(&F->getEntryBlock().front());
}
Value *NewInst = ExtractAndExtendIfNeeded(Vec);
- auto &NewInstLocs = ExternallyUsedValues[NewInst];
- auto It = ExternallyUsedValues.find(Scalar);
- assert(It != ExternallyUsedValues.end() &&
- "Externally used scalar is not found in ExternallyUsedValues");
- NewInstLocs.append(It->second);
- ExternallyUsedValues.erase(Scalar);
// Required to update internally referenced instructions.
Scalar->replaceAllUsesWith(NewInst);
+ ReplacedExternals.emplace_back(Scalar, NewInst);
continue;
}
@@ -10004,7 +10947,7 @@ Value *BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues,
ShuffledInserts.size() - 1);
SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
if (Mask.empty())
- Mask.assign(FTy->getNumElements(), UndefMaskElem);
+ Mask.assign(FTy->getNumElements(), PoisonMaskElem);
// Find the insertvector, vectorized in tree, if any.
Value *Base = VU;
while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
@@ -10017,7 +10960,7 @@ Value *BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues,
do {
IEBase = cast<InsertElementInst>(Base);
int IEIdx = *getInsertIndex(IEBase);
- assert(Mask[Idx] == UndefMaskElem &&
+ assert(Mask[Idx] == PoisonMaskElem &&
"InsertElementInstruction used already.");
Mask[IEIdx] = IEIdx;
Base = IEBase->getOperand(0);
@@ -10035,7 +10978,7 @@ Value *BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues,
}
SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
if (Mask.empty())
- Mask.assign(FTy->getNumElements(), UndefMaskElem);
+ Mask.assign(FTy->getNumElements(), PoisonMaskElem);
Mask[Idx] = ExternalUse.Lane;
It->InsertElements.push_back(cast<InsertElementInst>(User));
continue;
@@ -10077,8 +11020,8 @@ Value *BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues,
}
auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
- SmallVector<int> CombinedMask1(Mask.size(), UndefMaskElem);
- SmallVector<int> CombinedMask2(Mask.size(), UndefMaskElem);
+ SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
+ SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
for (int I = 0, E = Mask.size(); I < E; ++I) {
if (Mask[I] < VF)
@@ -10103,9 +11046,9 @@ Value *BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues,
return std::make_pair(Vec, true);
}
if (!ForSingleMask) {
- SmallVector<int> ResizeMask(VF, UndefMaskElem);
+ SmallVector<int> ResizeMask(VF, PoisonMaskElem);
for (unsigned I = 0; I < VF; ++I) {
- if (Mask[I] != UndefMaskElem)
+ if (Mask[I] != PoisonMaskElem)
ResizeMask[Mask[I]] = Mask[I];
}
Vec = CreateShuffle(Vec, nullptr, ResizeMask);
@@ -10308,14 +11251,14 @@ void BoUpSLP::optimizeGatherSequence() {
// registers.
unsigned LastUndefsCnt = 0;
for (int I = 0, E = NewMask.size(); I < E; ++I) {
- if (SM1[I] == UndefMaskElem)
+ if (SM1[I] == PoisonMaskElem)
++LastUndefsCnt;
else
LastUndefsCnt = 0;
- if (NewMask[I] != UndefMaskElem && SM1[I] != UndefMaskElem &&
+ if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
NewMask[I] != SM1[I])
return false;
- if (NewMask[I] == UndefMaskElem)
+ if (NewMask[I] == PoisonMaskElem)
NewMask[I] = SM1[I];
}
// Check if the last undefs actually change the final number of used vector
@@ -10590,11 +11533,20 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
}
// Search up and down at the same time, because we don't know if the new
// instruction is above or below the existing scheduling region.
+ // Ignore debug info (and other "AssumeLike" intrinsics) so that's not counted
+ // against the budget. Otherwise debug info could affect codegen.
BasicBlock::reverse_iterator UpIter =
++ScheduleStart->getIterator().getReverse();
BasicBlock::reverse_iterator UpperEnd = BB->rend();
BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
BasicBlock::iterator LowerEnd = BB->end();
+ auto IsAssumeLikeIntr = [](const Instruction &I) {
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ return II->isAssumeLikeIntrinsic();
+ return false;
+ };
+ UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
+ DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
&*DownIter != I) {
if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
@@ -10604,6 +11556,9 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
++UpIter;
++DownIter;
+
+ UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
+ DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
}
if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
assert(I->getParent() == ScheduleStart->getParent() &&
@@ -10804,7 +11759,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
unsigned numAliased = 0;
unsigned DistToSrc = 1;
- for ( ; DepDest; DepDest = DepDest->NextLoadStore) {
+ for (; DepDest; DepDest = DepDest->NextLoadStore) {
assert(isInSchedulingRegion(DepDest));
// We have two limits to reduce the complexity:
@@ -11163,8 +12118,8 @@ void BoUpSLP::computeMinimumValueSizes() {
// we can truncate the roots to this narrower type.
for (auto *Root : TreeRoot) {
auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
- MaxBitWidth = std::max<unsigned>(
- Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
+ MaxBitWidth = std::max<unsigned>(Mask.getBitWidth() - Mask.countl_zero(),
+ MaxBitWidth);
}
// True if the roots can be zero-extended back to their original type, rather
@@ -11223,8 +12178,7 @@ void BoUpSLP::computeMinimumValueSizes() {
}
// Round MaxBitWidth up to the next power-of-two.
- if (!isPowerOf2_64(MaxBitWidth))
- MaxBitWidth = NextPowerOf2(MaxBitWidth);
+ MaxBitWidth = llvm::bit_ceil(MaxBitWidth);
// If the maximum bit width we compute is less than the width of the roots'
// type, we can proceed with the narrowing. Otherwise, do nothing.
@@ -11242,60 +12196,6 @@ void BoUpSLP::computeMinimumValueSizes() {
MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive);
}
-namespace {
-
-/// The SLPVectorizer Pass.
-struct SLPVectorizer : public FunctionPass {
- SLPVectorizerPass Impl;
-
- /// Pass identification, replacement for typeid
- static char ID;
-
- explicit SLPVectorizer() : FunctionPass(ID) {
- initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
- }
-
- bool doInitialization(Module &M) override { return false; }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
- auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
- auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
- auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
-
- return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- FunctionPass::getAnalysisUsage(AU);
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<DemandedBitsWrapperPass>();
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
- AU.addRequired<InjectTLIMappingsLegacy>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.setPreservesCFG();
- }
-};
-
-} // end anonymous namespace
-
PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
@@ -11536,7 +12436,7 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
unsigned MaxVecRegSize = R.getMaxVecRegSize();
unsigned EltSize = R.getVectorElementSize(Operands[0]);
- unsigned MaxElts = llvm::PowerOf2Floor(MaxVecRegSize / EltSize);
+ unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store),
MaxElts);
@@ -11618,17 +12518,8 @@ void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
}
}
-bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
- if (!A || !B)
- return false;
- if (isa<InsertElementInst>(A) || isa<InsertElementInst>(B))
- return false;
- Value *VL[] = {A, B};
- return tryToVectorizeList(VL, R);
-}
-
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
- bool LimitForRegisterSize) {
+ bool MaxVFOnly) {
if (VL.size() < 2)
return false;
@@ -11663,7 +12554,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
unsigned Sz = R.getVectorElementSize(I0);
unsigned MinVF = R.getMinVF(Sz);
- unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
+ unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
if (MaxVF < 2) {
R.getORE()->emit([&]() {
@@ -11690,21 +12581,17 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
if (TTI->getNumberOfParts(VecTy) == VF)
continue;
for (unsigned I = NextInst; I < MaxInst; ++I) {
- unsigned OpsWidth = 0;
+ unsigned ActualVF = std::min(MaxInst - I, VF);
- if (I + VF > MaxInst)
- OpsWidth = MaxInst - I;
- else
- OpsWidth = VF;
-
- if (!isPowerOf2_32(OpsWidth))
+ if (!isPowerOf2_32(ActualVF))
continue;
- if ((LimitForRegisterSize && OpsWidth < MaxVF) ||
- (VF > MinVF && OpsWidth <= VF / 2) || (VF == MinVF && OpsWidth < 2))
+ if (MaxVFOnly && ActualVF < MaxVF)
+ break;
+ if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
break;
- ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
+ ArrayRef<Value *> Ops = VL.slice(I, ActualVF);
// Check that a previous iteration of this loop did not delete the Value.
if (llvm::any_of(Ops, [&R](Value *V) {
auto *I = dyn_cast<Instruction>(V);
@@ -11712,7 +12599,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
}))
continue;
- LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
<< "\n");
R.buildTree(Ops);
@@ -11730,7 +12617,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
MinCost = std::min(MinCost, Cost);
LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
- << " for VF=" << OpsWidth << "\n");
+ << " for VF=" << ActualVF << "\n");
if (Cost < -SLPCostThreshold) {
LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
@@ -11806,14 +12693,14 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
}
if (Candidates.size() == 1)
- return tryToVectorizePair(Op0, Op1, R);
+ return tryToVectorizeList({Op0, Op1}, R);
// We have multiple options. Try to pick the single best.
std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
if (!BestCandidate)
return false;
- return tryToVectorizePair(Candidates[*BestCandidate].first,
- Candidates[*BestCandidate].second, R);
+ return tryToVectorizeList(
+ {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
}
namespace {
@@ -11857,6 +12744,9 @@ class HorizontalReduction {
WeakTrackingVH ReductionRoot;
/// The type of reduction operation.
RecurKind RdxKind;
+ /// Checks if the optimization of original scalar identity operations on
+ /// matched horizontal reductions is enabled and allowed.
+ bool IsSupportedHorRdxIdentityOp = false;
static bool isCmpSelMinMax(Instruction *I) {
return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
@@ -11888,6 +12778,9 @@ class HorizontalReduction {
return I->getFastMathFlags().noNaNs();
}
+ if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
+ return true;
+
return I->isAssociative();
}
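
The FMaximum/FMinimum case added above can return true unconditionally: llvm.maximum/llvm.minimum define NaN and signed-zero behaviour completely, so the reduction result does not depend on association order, unlike FMax/FMin (maxnum/minnum), which the check just above only accepts under noNaNs. A rough scalar model of the difference, illustration only and not code from this patch (helper names are invented):

#include <cmath>
// maxnum-like (FMax/FMin): a NaN operand is dropped, so reassociating the
// reduction is only safe when the noNaNs flag checked above is present.
double maxnumLike(double A, double B) {
  if (std::isnan(A)) return B;
  if (std::isnan(B)) return A;
  return A > B ? A : B;
}
// maximum-like (FMaximum/FMinimum): a NaN operand propagates to the result
// (and -0.0 orders below +0.0, omitted here), so any grouping of the
// reduction gives the same answer and no fast-math check is needed.
double maximumLike(double A, double B) {
  if (std::isnan(A) || std::isnan(B)) return std::nan("");
  return A > B ? A : B;
}
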
@@ -11905,6 +12798,7 @@ class HorizontalReduction {
static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS,
Value *RHS, const Twine &Name, bool UseSelect) {
unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
+ bool IsConstant = isConstant(LHS) && isConstant(RHS);
switch (Kind) {
case RecurKind::Or:
if (UseSelect &&
@@ -11926,29 +12820,49 @@ class HorizontalReduction {
return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
Name);
case RecurKind::FMax:
+ if (IsConstant)
+ return ConstantFP::get(LHS->getType(),
+ maxnum(cast<ConstantFP>(LHS)->getValueAPF(),
+ cast<ConstantFP>(RHS)->getValueAPF()));
return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
case RecurKind::FMin:
+ if (IsConstant)
+ return ConstantFP::get(LHS->getType(),
+ minnum(cast<ConstantFP>(LHS)->getValueAPF(),
+ cast<ConstantFP>(RHS)->getValueAPF()));
return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
+ case RecurKind::FMaximum:
+ if (IsConstant)
+ return ConstantFP::get(LHS->getType(),
+ maximum(cast<ConstantFP>(LHS)->getValueAPF(),
+ cast<ConstantFP>(RHS)->getValueAPF()));
+ return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS);
+ case RecurKind::FMinimum:
+ if (IsConstant)
+ return ConstantFP::get(LHS->getType(),
+ minimum(cast<ConstantFP>(LHS)->getValueAPF(),
+ cast<ConstantFP>(RHS)->getValueAPF()));
+ return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS);
case RecurKind::SMax:
- if (UseSelect) {
+ if (IsConstant || UseSelect) {
Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
return Builder.CreateSelect(Cmp, LHS, RHS, Name);
}
return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);
case RecurKind::SMin:
- if (UseSelect) {
+ if (IsConstant || UseSelect) {
Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
return Builder.CreateSelect(Cmp, LHS, RHS, Name);
}
return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS);
case RecurKind::UMax:
- if (UseSelect) {
+ if (IsConstant || UseSelect) {
Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
return Builder.CreateSelect(Cmp, LHS, RHS, Name);
}
return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS);
case RecurKind::UMin:
- if (UseSelect) {
+ if (IsConstant || UseSelect) {
Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
return Builder.CreateSelect(Cmp, LHS, RHS, Name);
}
@@ -11984,6 +12898,7 @@ class HorizontalReduction {
return Op;
}
+public:
static RecurKind getRdxKind(Value *V) {
auto *I = dyn_cast<Instruction>(V);
if (!I)
@@ -12010,6 +12925,10 @@ class HorizontalReduction {
if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
return RecurKind::FMin;
+ if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
+ return RecurKind::FMaximum;
+ if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
+ return RecurKind::FMinimum;
// This matches either cmp+select or intrinsics. SLP is expected to handle
// either form.
// TODO: If we are canonicalizing to intrinsics, we can remove several
@@ -12086,6 +13005,7 @@ class HorizontalReduction {
return isCmpSelMinMax(I) ? 1 : 0;
}
+private:
/// Total number of operands in the reduction operation.
static unsigned getNumberOfOperands(Instruction *I) {
return isCmpSelMinMax(I) ? 3 : 2;
@@ -12134,17 +13054,6 @@ class HorizontalReduction {
}
}
- static Value *getLHS(RecurKind Kind, Instruction *I) {
- if (Kind == RecurKind::None)
- return nullptr;
- return I->getOperand(getFirstOperandIndex(I));
- }
- static Value *getRHS(RecurKind Kind, Instruction *I) {
- if (Kind == RecurKind::None)
- return nullptr;
- return I->getOperand(getFirstOperandIndex(I) + 1);
- }
-
static bool isGoodForReduction(ArrayRef<Value *> Data) {
int Sz = Data.size();
auto *I = dyn_cast<Instruction>(Data.front());
@@ -12156,65 +13065,39 @@ public:
HorizontalReduction() = default;
/// Try to find a reduction tree.
- bool matchAssociativeReduction(PHINode *Phi, Instruction *Inst,
+ bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
ScalarEvolution &SE, const DataLayout &DL,
const TargetLibraryInfo &TLI) {
- assert((!Phi || is_contained(Phi->operands(), Inst)) &&
- "Phi needs to use the binary operator");
- assert((isa<BinaryOperator>(Inst) || isa<SelectInst>(Inst) ||
- isa<IntrinsicInst>(Inst)) &&
- "Expected binop, select, or intrinsic for reduction matching");
- RdxKind = getRdxKind(Inst);
-
- // We could have a initial reductions that is not an add.
- // r *= v1 + v2 + v3 + v4
- // In such a case start looking for a tree rooted in the first '+'.
- if (Phi) {
- if (getLHS(RdxKind, Inst) == Phi) {
- Phi = nullptr;
- Inst = dyn_cast<Instruction>(getRHS(RdxKind, Inst));
- if (!Inst)
- return false;
- RdxKind = getRdxKind(Inst);
- } else if (getRHS(RdxKind, Inst) == Phi) {
- Phi = nullptr;
- Inst = dyn_cast<Instruction>(getLHS(RdxKind, Inst));
- if (!Inst)
- return false;
- RdxKind = getRdxKind(Inst);
- }
- }
-
- if (!isVectorizable(RdxKind, Inst))
+ RdxKind = HorizontalReduction::getRdxKind(Root);
+ if (!isVectorizable(RdxKind, Root))
return false;
// Analyze "regular" integer/FP types for reductions - no target-specific
// types or pointers.
- Type *Ty = Inst->getType();
+ Type *Ty = Root->getType();
if (!isValidElementType(Ty) || Ty->isPointerTy())
return false;
// Though the ultimate reduction may have multiple uses, its condition must
// have only single use.
- if (auto *Sel = dyn_cast<SelectInst>(Inst))
+ if (auto *Sel = dyn_cast<SelectInst>(Root))
if (!Sel->getCondition()->hasOneUse())
return false;
- ReductionRoot = Inst;
+ ReductionRoot = Root;
// Iterate through all the operands of the possible reduction tree and
// gather all the reduced values, sorting them by their value id.
- BasicBlock *BB = Inst->getParent();
- bool IsCmpSelMinMax = isCmpSelMinMax(Inst);
- SmallVector<Instruction *> Worklist(1, Inst);
+ BasicBlock *BB = Root->getParent();
+ bool IsCmpSelMinMax = isCmpSelMinMax(Root);
+ SmallVector<Instruction *> Worklist(1, Root);
// Checks if the operands of the \p TreeN instruction are also reduction
// operations or should be treated as reduced values or an extra argument,
// which is not part of the reduction.
- auto &&CheckOperands = [this, IsCmpSelMinMax,
- BB](Instruction *TreeN,
- SmallVectorImpl<Value *> &ExtraArgs,
- SmallVectorImpl<Value *> &PossibleReducedVals,
- SmallVectorImpl<Instruction *> &ReductionOps) {
+ auto CheckOperands = [&](Instruction *TreeN,
+ SmallVectorImpl<Value *> &ExtraArgs,
+ SmallVectorImpl<Value *> &PossibleReducedVals,
+ SmallVectorImpl<Instruction *> &ReductionOps) {
for (int I = getFirstOperandIndex(TreeN),
End = getNumberOfOperands(TreeN);
I < End; ++I) {
@@ -12229,10 +13112,14 @@ public:
}
// If the edge is not an instruction, or it is different from the main
// reduction opcode or has too many uses - possible reduced value.
+ // Also, do not try to reduce constant values if the operation is not
+ // foldable.
if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||
IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
!hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
- !isVectorizable(getRdxKind(EdgeInst), EdgeInst)) {
+ !isVectorizable(RdxKind, EdgeInst) ||
+ (R.isAnalyzedReductionRoot(EdgeInst) &&
+ all_of(EdgeInst->operands(), Constant::classof))) {
PossibleReducedVals.push_back(EdgeVal);
continue;
}
@@ -12246,10 +13133,43 @@ public:
// instructions (grouping them by the predicate).
MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
PossibleReducedVals;
- initReductionOps(Inst);
+ initReductionOps(Root);
DenseMap<Value *, SmallVector<LoadInst *>> LoadsMap;
SmallSet<size_t, 2> LoadKeyUsed;
SmallPtrSet<Value *, 4> DoNotReverseVals;
+
+ auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
+ Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
+ if (LoadKeyUsed.contains(Key)) {
+ auto LIt = LoadsMap.find(Ptr);
+ if (LIt != LoadsMap.end()) {
+ for (LoadInst *RLI : LIt->second) {
+ if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
+ LI->getType(), LI->getPointerOperand(), DL, SE,
+ /*StrictCheck=*/true))
+ return hash_value(RLI->getPointerOperand());
+ }
+ for (LoadInst *RLI : LIt->second) {
+ if (arePointersCompatible(RLI->getPointerOperand(),
+ LI->getPointerOperand(), TLI)) {
+ hash_code SubKey = hash_value(RLI->getPointerOperand());
+ DoNotReverseVals.insert(RLI);
+ return SubKey;
+ }
+ }
+ if (LIt->second.size() > 2) {
+ hash_code SubKey =
+ hash_value(LIt->second.back()->getPointerOperand());
+ DoNotReverseVals.insert(LIt->second.back());
+ return SubKey;
+ }
+ }
+ }
+ LoadKeyUsed.insert(Key);
+ LoadsMap.try_emplace(Ptr).first->second.push_back(LI);
+ return hash_value(LI->getPointerOperand());
+ };
+
while (!Worklist.empty()) {
Instruction *TreeN = Worklist.pop_back_val();
SmallVector<Value *> Args;
@@ -12269,41 +13189,8 @@ public:
// results.
for (Value *V : PossibleRedVals) {
size_t Key, Idx;
- std::tie(Key, Idx) = generateKeySubkey(
- V, &TLI,
- [&](size_t Key, LoadInst *LI) {
- Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
- if (LoadKeyUsed.contains(Key)) {
- auto LIt = LoadsMap.find(Ptr);
- if (LIt != LoadsMap.end()) {
- for (LoadInst *RLI: LIt->second) {
- if (getPointersDiff(
- RLI->getType(), RLI->getPointerOperand(),
- LI->getType(), LI->getPointerOperand(), DL, SE,
- /*StrictCheck=*/true))
- return hash_value(RLI->getPointerOperand());
- }
- for (LoadInst *RLI : LIt->second) {
- if (arePointersCompatible(RLI->getPointerOperand(),
- LI->getPointerOperand(), TLI)) {
- hash_code SubKey = hash_value(RLI->getPointerOperand());
- DoNotReverseVals.insert(RLI);
- return SubKey;
- }
- }
- if (LIt->second.size() > 2) {
- hash_code SubKey =
- hash_value(LIt->second.back()->getPointerOperand());
- DoNotReverseVals.insert(LIt->second.back());
- return SubKey;
- }
- }
- }
- LoadKeyUsed.insert(Key);
- LoadsMap.try_emplace(Ptr).first->second.push_back(LI);
- return hash_value(LI->getPointerOperand());
- },
- /*AllowAlternate=*/false);
+ std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
+ /*AllowAlternate=*/false);
++PossibleReducedVals[Key][Idx]
.insert(std::make_pair(V, 0))
.first->second;
@@ -12312,40 +13199,8 @@ public:
PossibleReductionOps.rend());
} else {
size_t Key, Idx;
- std::tie(Key, Idx) = generateKeySubkey(
- TreeN, &TLI,
- [&](size_t Key, LoadInst *LI) {
- Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
- if (LoadKeyUsed.contains(Key)) {
- auto LIt = LoadsMap.find(Ptr);
- if (LIt != LoadsMap.end()) {
- for (LoadInst *RLI: LIt->second) {
- if (getPointersDiff(RLI->getType(),
- RLI->getPointerOperand(), LI->getType(),
- LI->getPointerOperand(), DL, SE,
- /*StrictCheck=*/true))
- return hash_value(RLI->getPointerOperand());
- }
- for (LoadInst *RLI : LIt->second) {
- if (arePointersCompatible(RLI->getPointerOperand(),
- LI->getPointerOperand(), TLI)) {
- hash_code SubKey = hash_value(RLI->getPointerOperand());
- DoNotReverseVals.insert(RLI);
- return SubKey;
- }
- }
- if (LIt->second.size() > 2) {
- hash_code SubKey = hash_value(LIt->second.back()->getPointerOperand());
- DoNotReverseVals.insert(LIt->second.back());
- return SubKey;
- }
- }
- }
- LoadKeyUsed.insert(Key);
- LoadsMap.try_emplace(Ptr).first->second.push_back(LI);
- return hash_value(LI->getPointerOperand());
- },
- /*AllowAlternate=*/false);
+ std::tie(Key, Idx) = generateKeySubkey(TreeN, &TLI, GenerateLoadsSubkey,
+ /*AllowAlternate=*/false);
++PossibleReducedVals[Key][Idx]
.insert(std::make_pair(TreeN, 0))
.first->second;
@@ -12407,14 +13262,18 @@ public:
// If there are a sufficient number of reduction values, reduce
// to a nearby power-of-2. We can safely generate oversized
// vectors and rely on the backend to split them to legal sizes.
- size_t NumReducedVals =
+ unsigned NumReducedVals =
std::accumulate(ReducedVals.begin(), ReducedVals.end(), 0,
- [](size_t Num, ArrayRef<Value *> Vals) {
+ [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
if (!isGoodForReduction(Vals))
return Num;
return Num + Vals.size();
});
- if (NumReducedVals < ReductionLimit) {
+ if (NumReducedVals < ReductionLimit &&
+ (!AllowHorRdxIdenityOptimization ||
+ all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
+ return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
+ }))) {
for (ReductionOpsType &RdxOps : ReductionOps)
for (Value *RdxOp : RdxOps)
V.analyzedReductionRoot(cast<Instruction>(RdxOp));
@@ -12428,6 +13287,7 @@ public:
DenseMap<Value *, WeakTrackingVH> TrackedVals(
ReducedVals.size() * ReducedVals.front().size() + ExtraArgs.size());
BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
+ SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
ExternallyUsedValues.reserve(ExtraArgs.size() + 1);
// The same extra argument may be used several times, so log each attempt
// to use it.
@@ -12448,6 +13308,18 @@ public:
return cast<Instruction>(ScalarCond);
};
+ // Return new VectorizedTree, based on previous value.
+ auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
+ if (VectorizedTree) {
+ // Update the final value in the reduction.
+ Builder.SetCurrentDebugLocation(
+ cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
+ return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
+ ReductionOps);
+ }
+ // Initialize the final value in the reduction.
+ return Res;
+ };
// The reduction root is used as the insertion point for new instructions,
// so set it as externally used to prevent it from being deleted.
ExternallyUsedValues[ReductionRoot];
@@ -12459,6 +13331,12 @@ public:
continue;
IgnoreList.insert(RdxOp);
}
+ // Intersect the fast-math-flags from all reduction operations.
+ FastMathFlags RdxFMF;
+ RdxFMF.set();
+ for (Value *U : IgnoreList)
+ if (auto *FPMO = dyn_cast<FPMathOperator>(U))
+ RdxFMF &= FPMO->getFastMathFlags();
bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
// Need to track reduced vals, they may be changed during vectorization of
@@ -12519,16 +13397,82 @@ public:
}
}
}
+
+ // Emit code for constant values.
+ if (AllowHorRdxIdenityOptimization && Candidates.size() > 1 &&
+ allConstant(Candidates)) {
+ Value *Res = Candidates.front();
+ ++VectorizedVals.try_emplace(Candidates.front(), 0).first->getSecond();
+ for (Value *VC : ArrayRef(Candidates).drop_front()) {
+ Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
+ ++VectorizedVals.try_emplace(VC, 0).first->getSecond();
+ if (auto *ResI = dyn_cast<Instruction>(Res))
+ V.analyzedReductionRoot(ResI);
+ }
+ VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
+ continue;
+ }
+
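
As a purely illustrative view of the constant path above: when every candidate is a constant, the chain built through createOp folds at compile time and no vector code is needed. A sketch with invented names:

// Illustration only: what the folded chain computes for an fmax-style
// reduction over known constants {2.0, 8.0, 1.0, 8.0}; the result (8.0)
// is known at compile time, so only the scalar constant survives.
constexpr double foldedFMaxExample() {
  const double Vals[] = {2.0, 8.0, 1.0, 8.0};
  double Res = Vals[0];
  for (double V : Vals)
    Res = V > Res ? V : Res;
  return Res; // 8.0
}
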
unsigned NumReducedVals = Candidates.size();
- if (NumReducedVals < ReductionLimit)
+ if (NumReducedVals < ReductionLimit &&
+ (NumReducedVals < 2 || !AllowHorRdxIdenityOptimization ||
+ !isSplat(Candidates)))
continue;
+ // Check if we support processing of repeated scalar values (optimization of
+ // original scalar identity operations on matched horizontal reductions).
+ IsSupportedHorRdxIdentityOp =
+ AllowHorRdxIdenityOptimization && RdxKind != RecurKind::Mul &&
+ RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
+ // Gather same values.
+ MapVector<Value *, unsigned> SameValuesCounter;
+ if (IsSupportedHorRdxIdentityOp)
+ for (Value *V : Candidates)
+ ++SameValuesCounter.insert(std::make_pair(V, 0)).first->second;
+ // Used to check if the reduced values are used the same number of times. In
+ // this case the compiler may produce better code. E.g. if the reduced values
+ // are aabbccdd (8 x values), then the first node of the tree will have a
+ // node for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
+ // Plus, the final reduction will be performed on <8 x aabbccdd>.
+ // Instead, the compiler may build a <4 x abcd> tree immediately and compute
+ // reduction(4 x abcd) * 2.
+ // Currently this only handles add/fadd/xor; and/or/min/max do not require
+ // this analysis, and other operations may require an extra estimation of
+ // the profitability.
+ bool SameScaleFactor = false;
+ bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
+ SameValuesCounter.size() != Candidates.size();
+ if (OptReusedScalars) {
+ SameScaleFactor =
+ (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
+ RdxKind == RecurKind::Xor) &&
+ all_of(drop_begin(SameValuesCounter),
+ [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
+ return P.second == SameValuesCounter.front().second;
+ });
+ Candidates.resize(SameValuesCounter.size());
+ transform(SameValuesCounter, Candidates.begin(),
+ [](const auto &P) { return P.first; });
+ NumReducedVals = Candidates.size();
+ // Have a reduction of the same element.
+ if (NumReducedVals == 1) {
+ Value *OrigV = TrackedToOrig.find(Candidates.front())->second;
+ unsigned Cnt = SameValuesCounter.lookup(OrigV);
+ Value *RedVal =
+ emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
+ VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
+ VectorizedVals.try_emplace(OrigV, Cnt);
+ continue;
+ }
+ }
+
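
A scalar illustration of the reuse analysis described in the comment above (not taken from the patch; names are invented): when every reduced value repeats the same number of times, reducing the unique values once and scaling the result afterwards is equivalent.

// reduce_add(a,a,b,b,c,c,d,d) == 2 * reduce_add(a,b,c,d)
int reduceAdd(const int *Vals, unsigned N) {
  int Sum = 0;
  for (unsigned I = 0; I < N; ++I)
    Sum += Vals[I];
  return Sum;
}
// So for the aabbccdd example SLP can build a 4-wide tree over {a,b,c,d} and
// multiply the reduced result by the common repeat count (the SameScaleFactor
// path above); xor uses the parity of the count, fadd an fmul, instead.
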
unsigned MaxVecRegSize = V.getMaxVecRegSize();
unsigned EltSize = V.getVectorElementSize(Candidates[0]);
- unsigned MaxElts = RegMaxNumber * PowerOf2Floor(MaxVecRegSize / EltSize);
+ unsigned MaxElts =
+ RegMaxNumber * llvm::bit_floor(MaxVecRegSize / EltSize);
unsigned ReduxWidth = std::min<unsigned>(
- PowerOf2Floor(NumReducedVals), std::max(RedValsMaxNumber, MaxElts));
+ llvm::bit_floor(NumReducedVals), std::max(RedValsMaxNumber, MaxElts));
unsigned Start = 0;
unsigned Pos = Start;
// Restarts vectorization attempt with lower vector factor.
@@ -12551,6 +13495,7 @@ public:
ReduxWidth /= 2;
return IsAnyRedOpGathered;
};
+ bool AnyVectorized = false;
while (Pos < NumReducedVals - ReduxWidth + 1 &&
ReduxWidth >= ReductionLimit) {
// Dependency in tree of the reduction ops - drop this attempt, try
@@ -12603,34 +13548,24 @@ public:
LocalExternallyUsedValues[TrackedVals[V]];
});
}
- // Number of uses of the candidates in the vector of values.
- SmallDenseMap<Value *, unsigned> NumUses(Candidates.size());
- for (unsigned Cnt = 0; Cnt < Pos; ++Cnt) {
- Value *V = Candidates[Cnt];
- ++NumUses.try_emplace(V, 0).first->getSecond();
- }
- for (unsigned Cnt = Pos + ReduxWidth; Cnt < NumReducedVals; ++Cnt) {
- Value *V = Candidates[Cnt];
- ++NumUses.try_emplace(V, 0).first->getSecond();
+ if (!IsSupportedHorRdxIdentityOp) {
+ // Number of uses of the candidates in the vector of values.
+ assert(SameValuesCounter.empty() &&
+ "Reused values counter map is not empty");
+ for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
+ if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
+ continue;
+ Value *V = Candidates[Cnt];
+ Value *OrigV = TrackedToOrig.find(V)->second;
+ ++SameValuesCounter[OrigV];
+ }
}
SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
// Gather externally used values.
SmallPtrSet<Value *, 4> Visited;
- for (unsigned Cnt = 0; Cnt < Pos; ++Cnt) {
- Value *RdxVal = Candidates[Cnt];
- if (!Visited.insert(RdxVal).second)
+ for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
+ if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
continue;
- // Check if the scalar was vectorized as part of the vectorization
- // tree but not the top node.
- if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
- LocalExternallyUsedValues[RdxVal];
- continue;
- }
- unsigned NumOps = VectorizedVals.lookup(RdxVal) + NumUses[RdxVal];
- if (NumOps != ReducedValsToOps.find(RdxVal)->second.size())
- LocalExternallyUsedValues[RdxVal];
- }
- for (unsigned Cnt = Pos + ReduxWidth; Cnt < NumReducedVals; ++Cnt) {
Value *RdxVal = Candidates[Cnt];
if (!Visited.insert(RdxVal).second)
continue;
@@ -12640,42 +13575,34 @@ public:
LocalExternallyUsedValues[RdxVal];
continue;
}
- unsigned NumOps = VectorizedVals.lookup(RdxVal) + NumUses[RdxVal];
- if (NumOps != ReducedValsToOps.find(RdxVal)->second.size())
+ Value *OrigV = TrackedToOrig.find(RdxVal)->second;
+ unsigned NumOps =
+ VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV];
+ if (NumOps != ReducedValsToOps.find(OrigV)->second.size())
LocalExternallyUsedValues[RdxVal];
}
+ // Do not need the list of reused scalars in regular mode anymore.
+ if (!IsSupportedHorRdxIdentityOp)
+ SameValuesCounter.clear();
for (Value *RdxVal : VL)
if (RequiredExtract.contains(RdxVal))
LocalExternallyUsedValues[RdxVal];
+ // Update LocalExternallyUsedValues for the scalars replaced by
+ // extractelement instructions.
+ for (const std::pair<Value *, Value *> &Pair : ReplacedExternals) {
+ auto It = ExternallyUsedValues.find(Pair.first);
+ if (It == ExternallyUsedValues.end())
+ continue;
+ LocalExternallyUsedValues[Pair.second].append(It->second);
+ }
V.buildExternalUses(LocalExternallyUsedValues);
V.computeMinimumValueSizes();
- // Intersect the fast-math-flags from all reduction operations.
- FastMathFlags RdxFMF;
- RdxFMF.set();
- for (Value *U : IgnoreList)
- if (auto *FPMO = dyn_cast<FPMathOperator>(U))
- RdxFMF &= FPMO->getFastMathFlags();
// Estimate cost.
InstructionCost TreeCost = V.getTreeCost(VL);
InstructionCost ReductionCost =
- getReductionCost(TTI, VL, ReduxWidth, RdxFMF);
- if (V.isVectorizedFirstNode() && isa<LoadInst>(VL.front())) {
- Instruction *MainOp = V.getFirstNodeMainOp();
- for (Value *V : VL) {
- auto *VI = dyn_cast<LoadInst>(V);
- // Add the costs of scalar GEP pointers, to be removed from the
- // code.
- if (!VI || VI == MainOp)
- continue;
- auto *Ptr = dyn_cast<GetElementPtrInst>(VI->getPointerOperand());
- if (!Ptr || !Ptr->hasOneUse() || Ptr->hasAllConstantIndices())
- continue;
- TreeCost -= TTI->getArithmeticInstrCost(
- Instruction::Add, Ptr->getType(), TTI::TCK_RecipThroughput);
- }
- }
+ getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
InstructionCost Cost = TreeCost + ReductionCost;
LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for reduction\n");
if (!Cost.isValid())
@@ -12716,8 +13643,8 @@ public:
InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
// Vectorize a tree.
- Value *VectorizedRoot =
- V.vectorizeTree(LocalExternallyUsedValues, InsertPt);
+ Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues,
+ ReplacedExternals, InsertPt);
Builder.SetInsertPoint(InsertPt);
@@ -12727,29 +13654,48 @@ public:
if (isBoolLogicOp(RdxRootInst))
VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
+ // Emit code to correctly handle reused reduced values, if required.
+ if (OptReusedScalars && !SameScaleFactor) {
+ VectorizedRoot =
+ emitReusedOps(VectorizedRoot, Builder, V.getRootNodeScalars(),
+ SameValuesCounter, TrackedToOrig);
+ }
+
Value *ReducedSubTree =
emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
- if (!VectorizedTree) {
- // Initialize the final value in the reduction.
- VectorizedTree = ReducedSubTree;
- } else {
- // Update the final value in the reduction.
- Builder.SetCurrentDebugLocation(
- cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
- VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
- ReducedSubTree, "op.rdx", ReductionOps);
- }
+ // Improved analysis for add/fadd/xor reductions with the same scale factor
+ // for all operands of the reduction. We can emit scalar ops for them
+ // instead.
+ if (OptReusedScalars && SameScaleFactor)
+ ReducedSubTree = emitScaleForReusedOps(
+ ReducedSubTree, Builder, SameValuesCounter.front().second);
+
+ VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
// Count vectorized reduced values to exclude them from final reduction.
for (Value *RdxVal : VL) {
- ++VectorizedVals.try_emplace(TrackedToOrig.find(RdxVal)->second, 0)
- .first->getSecond();
+ Value *OrigV = TrackedToOrig.find(RdxVal)->second;
+ if (IsSupportedHorRdxIdentityOp) {
+ VectorizedVals.try_emplace(OrigV, SameValuesCounter[RdxVal]);
+ continue;
+ }
+ ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond();
if (!V.isVectorized(RdxVal))
RequiredExtract.insert(RdxVal);
}
Pos += ReduxWidth;
Start = Pos;
- ReduxWidth = PowerOf2Floor(NumReducedVals - Pos);
+ ReduxWidth = llvm::bit_floor(NumReducedVals - Pos);
+ AnyVectorized = true;
+ }
+ if (OptReusedScalars && !AnyVectorized) {
+ for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
+ Value *RedVal = emitScaleForReusedOps(P.first, Builder, P.second);
+ VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
+ Value *OrigV = TrackedToOrig.find(P.first)->second;
+ VectorizedVals.try_emplace(OrigV, P.second);
+ }
+ continue;
}
}
if (VectorizedTree) {
@@ -12757,7 +13703,7 @@ public:
// possible problem with poison propagation. If not possible to reorder
// (both operands are originally RHS), emit an extra freeze instruction
// for the LHS operand.
- //I.e., if we have original code like this:
+ // I.e., if we have original code like this:
// RedOp1 = select i1 ?, i1 LHS, i1 false
// RedOp2 = select i1 RHS, i1 ?, i1 false
@@ -12892,7 +13838,8 @@ private:
/// Calculate the cost of a reduction.
InstructionCost getReductionCost(TargetTransformInfo *TTI,
ArrayRef<Value *> ReducedVals,
- unsigned ReduxWidth, FastMathFlags FMF) {
+ bool IsCmpSelMinMax, unsigned ReduxWidth,
+ FastMathFlags FMF) {
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
Value *FirstReducedVal = ReducedVals.front();
Type *ScalarTy = FirstReducedVal->getType();
@@ -12900,7 +13847,36 @@ private:
InstructionCost VectorCost = 0, ScalarCost;
// If all of the reduced values are constant, the vector cost is 0, since
// the reduction value can be calculated at the compile time.
- bool AllConsts = all_of(ReducedVals, isConstant);
+ bool AllConsts = allConstant(ReducedVals);
+ auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
+ InstructionCost Cost = 0;
+ // Scalar cost is repeated for N-1 elements.
+ int Cnt = ReducedVals.size();
+ for (Value *RdxVal : ReducedVals) {
+ if (Cnt == 1)
+ break;
+ --Cnt;
+ if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
+ Cost += GenCostFn();
+ continue;
+ }
+ InstructionCost ScalarCost = 0;
+ for (User *U : RdxVal->users()) {
+ auto *RdxOp = cast<Instruction>(U);
+ if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
+ ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
+ continue;
+ }
+ ScalarCost = InstructionCost::getInvalid();
+ break;
+ }
+ if (ScalarCost.isValid())
+ Cost += ScalarCost;
+ else
+ Cost += GenCostFn();
+ }
+ return Cost;
+ };
switch (RdxKind) {
case RecurKind::Add:
case RecurKind::Mul:
@@ -12913,52 +13889,32 @@ private:
if (!AllConsts)
VectorCost =
TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
- ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
+ ScalarCost = EvaluateScalarCost([&]() {
+ return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
+ });
break;
}
case RecurKind::FMax:
- case RecurKind::FMin: {
- auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy);
- if (!AllConsts) {
- auto *VecCondTy =
- cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
- VectorCost =
- TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
- /*IsUnsigned=*/false, CostKind);
- }
- CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
- ScalarCost = TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy,
- SclCondTy, RdxPred, CostKind) +
- TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
- SclCondTy, RdxPred, CostKind);
- break;
- }
+ case RecurKind::FMin:
+ case RecurKind::FMaximum:
+ case RecurKind::FMinimum:
case RecurKind::SMax:
case RecurKind::SMin:
case RecurKind::UMax:
case RecurKind::UMin: {
- auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy);
- if (!AllConsts) {
- auto *VecCondTy =
- cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
- bool IsUnsigned =
- RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin;
- VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
- IsUnsigned, CostKind);
- }
- CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
- ScalarCost = TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
- SclCondTy, RdxPred, CostKind) +
- TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
- SclCondTy, RdxPred, CostKind);
+ Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
+ if (!AllConsts)
+ VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
+ ScalarCost = EvaluateScalarCost([&]() {
+ IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
+ return TTI->getIntrinsicInstrCost(ICA, CostKind);
+ });
break;
}
default:
llvm_unreachable("Expected arithmetic or min/max reduction operation");
}
- // Scalar cost is repeated for N-1 elements.
- ScalarCost *= (ReduxWidth - 1);
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
<< " for reduction that starts with " << *FirstReducedVal
<< " (It is a splitting reduction)\n");
@@ -12977,8 +13933,148 @@ private:
++NumVectorInstructions;
return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind);
}
-};
+ /// Emits optimized code for a unique scalar value reused \p Cnt times.
+ Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
+ unsigned Cnt) {
+ assert(IsSupportedHorRdxIdentityOp &&
+ "The optimization of matched scalar identity horizontal reductions "
+ "must be supported.");
+ switch (RdxKind) {
+ case RecurKind::Add: {
+ // res = mul vv, n
+ Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
+ LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
+ << VectorizedValue << ". (HorRdx)\n");
+ return Builder.CreateMul(VectorizedValue, Scale);
+ }
+ case RecurKind::Xor: {
+ // res = n % 2 ? vv : 0
+ LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
+ << ". (HorRdx)\n");
+ if (Cnt % 2 == 0)
+ return Constant::getNullValue(VectorizedValue->getType());
+ return VectorizedValue;
+ }
+ case RecurKind::FAdd: {
+ // res = fmul v, n
+ Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
+ LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
+ << VectorizedValue << ". (HorRdx)\n");
+ return Builder.CreateFMul(VectorizedValue, Scale);
+ }
+ case RecurKind::And:
+ case RecurKind::Or:
+ case RecurKind::SMax:
+ case RecurKind::SMin:
+ case RecurKind::UMax:
+ case RecurKind::UMin:
+ case RecurKind::FMax:
+ case RecurKind::FMin:
+ case RecurKind::FMaximum:
+ case RecurKind::FMinimum:
+ // res = vv
+ return VectorizedValue;
+ case RecurKind::Mul:
+ case RecurKind::FMul:
+ case RecurKind::FMulAdd:
+ case RecurKind::SelectICmp:
+ case RecurKind::SelectFCmp:
+ case RecurKind::None:
+ llvm_unreachable("Unexpected reduction kind for repeated scalar.");
+ }
+ return nullptr;
+ }
+
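
The per-kind cases in emitScaleForReusedOps above correspond to simple scalar identities. A minimal sketch, illustration only (helper names invented; the fadd case assumes the reassociation fast-math flags that reduction matching already requires):

// x + x + ... + x   (Cnt times) == x * Cnt
int scaleAdd(int X, unsigned Cnt) { return X * static_cast<int>(Cnt); }
// x ^ x ^ ... ^ x   (Cnt times) == x if Cnt is odd, 0 if Cnt is even
int scaleXor(int X, unsigned Cnt) { return (Cnt % 2) ? X : 0; }
// x + x + ... + x   (fadd, Cnt times) ~= x * Cnt under reassociation
double scaleFAdd(double X, unsigned Cnt) { return X * static_cast<double>(Cnt); }
// and/or/min/max of a value with itself is the value itself, so those kinds
// simply return the vectorized value unchanged.
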
+ /// Emits the actual operation for the scalar identity values, found during
+ /// horizontal reduction analysis.
+ Value *emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
+ ArrayRef<Value *> VL,
+ const MapVector<Value *, unsigned> &SameValuesCounter,
+ const DenseMap<Value *, Value *> &TrackedToOrig) {
+ assert(IsSupportedHorRdxIdentityOp &&
+ "The optimization of matched scalar identity horizontal reductions "
+ "must be supported.");
+ switch (RdxKind) {
+ case RecurKind::Add: {
+ // root = mul prev_root, <1, 1, n, 1>
+ SmallVector<Constant *> Vals;
+ for (Value *V : VL) {
+ unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
+ Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
+ }
+ auto *Scale = ConstantVector::get(Vals);
+ LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
+ << VectorizedValue << ". (HorRdx)\n");
+ return Builder.CreateMul(VectorizedValue, Scale);
+ }
+ case RecurKind::And:
+ case RecurKind::Or:
+ // No need for multiple or/and(s).
+ LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
+ << ". (HorRdx)\n");
+ return VectorizedValue;
+ case RecurKind::SMax:
+ case RecurKind::SMin:
+ case RecurKind::UMax:
+ case RecurKind::UMin:
+ case RecurKind::FMax:
+ case RecurKind::FMin:
+ case RecurKind::FMaximum:
+ case RecurKind::FMinimum:
+ // No need for multiple min/max(s) of the same value.
+ LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
+ << ". (HorRdx)\n");
+ return VectorizedValue;
+ case RecurKind::Xor: {
+ // Replace values with an even number of repeats with 0, since
+ // x xor x = 0.
+ // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 6, 7>,
+ // if the 4th and 6th elements have an even number of repeats.
+ SmallVector<int> Mask(
+ cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
+ PoisonMaskElem);
+ std::iota(Mask.begin(), Mask.end(), 0);
+ bool NeedShuffle = false;
+ for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
+ Value *V = VL[I];
+ unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
+ if (Cnt % 2 == 0) {
+ Mask[I] = VF;
+ NeedShuffle = true;
+ }
+ }
+ LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
+ : Mask) dbgs()
+ << I << " ";
+ dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
+ if (NeedShuffle)
+ VectorizedValue = Builder.CreateShuffleVector(
+ VectorizedValue,
+ ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
+ return VectorizedValue;
+ }
+ case RecurKind::FAdd: {
+ // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
+ SmallVector<Constant *> Vals;
+ for (Value *V : VL) {
+ unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
+ Vals.push_back(ConstantFP::get(V->getType(), Cnt));
+ }
+ auto *Scale = ConstantVector::get(Vals);
+ return Builder.CreateFMul(VectorizedValue, Scale);
+ }
+ case RecurKind::Mul:
+ case RecurKind::FMul:
+ case RecurKind::FMulAdd:
+ case RecurKind::SelectICmp:
+ case RecurKind::SelectFCmp:
+ case RecurKind::None:
+ llvm_unreachable("Unexpected reduction kind for reused scalars.");
+ }
+ return nullptr;
+ }
+};
} // end anonymous namespace
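
For the general case handled by emitReusedOps above, where repeat counts differ between values, a rough scalar model (illustration only; not code from the patch):

#include <cstddef>
// Add-reduce a list such as {a, b, b, b, c, c} by reducing the unique values
// once and weighting each lane by its repeat count, the scalar analogue of
// "root = mul prev_root, <1, 3, 2>" followed by the usual vector reduce.
int reduceWithRepeats(const int *Unique, const unsigned *RepeatCount,
                      std::size_t NumUnique) {
  int Sum = 0;
  for (std::size_t I = 0; I < NumUnique; ++I)
    Sum += Unique[I] * static_cast<int>(RepeatCount[I]);
  return Sum;
}
// For xor the per-lane weight degenerates to a parity mask: lanes whose value
// repeats an even number of times are zeroed via the shuffle shown above.
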
static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
@@ -13075,15 +14171,15 @@ static bool findBuildAggregate(Instruction *LastInsertInst,
return false;
}
-/// Try and get a reduction value from a phi node.
+/// Try and get a reduction instruction from a phi node.
///
/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
/// if they come from either \p ParentBB or a containing loop latch.
///
/// \returns A candidate reduction value if possible, or \code nullptr \endcode
/// if not possible.
-static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
- BasicBlock *ParentBB, LoopInfo *LI) {
+static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
+ BasicBlock *ParentBB, LoopInfo *LI) {
// There are situations where the reduction value is not dominated by the
// reduction phi. Vectorizing such cases has been reported to cause
// miscompiles. See PR25787.
@@ -13092,13 +14188,13 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
};
- Value *Rdx = nullptr;
+ Instruction *Rdx = nullptr;
// Return the incoming value if it comes from the same BB as the phi node.
if (P->getIncomingBlock(0) == ParentBB) {
- Rdx = P->getIncomingValue(0);
+ Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
} else if (P->getIncomingBlock(1) == ParentBB) {
- Rdx = P->getIncomingValue(1);
+ Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
}
if (Rdx && DominatedReduxValue(Rdx))
@@ -13115,9 +14211,9 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
// There is a loop latch, return the incoming value if it comes from
// that. This reduction pattern occasionally turns up.
if (P->getIncomingBlock(0) == BBLatch) {
- Rdx = P->getIncomingValue(0);
+ Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
} else if (P->getIncomingBlock(1) == BBLatch) {
- Rdx = P->getIncomingValue(1);
+ Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
}
if (Rdx && DominatedReduxValue(Rdx))
@@ -13133,6 +14229,10 @@ static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
return true;
if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
return true;
+ if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V0), m_Value(V1))))
+ return true;
+ if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0), m_Value(V1))))
+ return true;
if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
return true;
if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
@@ -13144,21 +14244,63 @@ static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
return false;
}
+/// We could have an initial reduction that is not an add.
+/// r *= v1 + v2 + v3 + v4
+/// In such a case start looking for a tree rooted in the first '+'.
+/// \returns the new root if found, which may be nullptr if not an instruction.
+static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
+ Instruction *Root) {
+ assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
+ isa<IntrinsicInst>(Root)) &&
+ "Expected binop, select, or intrinsic for reduction matching");
+ Value *LHS =
+ Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
+ Value *RHS =
+ Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
+ if (LHS == Phi)
+ return dyn_cast<Instruction>(RHS);
+ if (RHS == Phi)
+ return dyn_cast<Instruction>(LHS);
+ return nullptr;
+}
+
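
A scalar shape of the pattern tryGetSecondaryReductionRoot handles (illustrative source only, not from the patch):

// The value merged into the loop-carried phi is a multiply, but the
// vectorizable reduction tree is rooted at the chain of adds feeding it.
float secondaryRootExample(float R, float V1, float V2, float V3, float V4) {
  R *= V1 + V2 + V3 + V4; // tree rooted at the first '+', not at '*'
  return R;
}
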
+/// \returns the first operand of \p I that does not match \p Phi. If the
+/// operand is not an instruction it returns nullptr.
+static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
+ Value *Op0 = nullptr;
+ Value *Op1 = nullptr;
+ if (!matchRdxBop(I, Op0, Op1))
+ return nullptr;
+ return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
+}
+
+/// \returns true if \p I is a candidate instruction for reduction vectorization.
+static bool isReductionCandidate(Instruction *I) {
+ bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
+ Value *B0 = nullptr, *B1 = nullptr;
+ bool IsBinop = matchRdxBop(I, B0, B1);
+ return IsBinop || IsSelect;
+}
+
bool SLPVectorizerPass::vectorizeHorReduction(
- PHINode *P, Value *V, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI,
+ PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI,
SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
if (!ShouldVectorizeHor)
return false;
+ bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
- auto *Root = dyn_cast_or_null<Instruction>(V);
- if (!Root)
+ if (Root->getParent() != BB || isa<PHINode>(Root))
return false;
- if (!isa<BinaryOperator>(Root))
- P = nullptr;
+ // If we can find a secondary reduction root, use that instead.
+ auto SelectRoot = [&]() {
+ if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
+ HorizontalReduction::getRdxKind(Root) != RecurKind::None)
+ if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
+ return NewRoot;
+ return Root;
+ };
- if (Root->getParent() != BB || isa<PHINode>(Root))
- return false;
// Start analysis starting from Root instruction. If horizontal reduction is
// found, try to vectorize it. If it is not a horizontal reduction or
// vectorization is not possible or not effective, and currently analyzed
@@ -13171,22 +14313,32 @@ bool SLPVectorizerPass::vectorizeHorReduction(
// If a horizontal reduction was not matched or vectorized we collect
// instructions for possible later attempts for vectorization.
std::queue<std::pair<Instruction *, unsigned>> Stack;
- Stack.emplace(Root, 0);
+ Stack.emplace(SelectRoot(), 0);
SmallPtrSet<Value *, 8> VisitedInstrs;
bool Res = false;
- auto &&TryToReduce = [this, TTI, &P, &R](Instruction *Inst, Value *&B0,
- Value *&B1) -> Value * {
+ auto &&TryToReduce = [this, TTI, &R](Instruction *Inst) -> Value * {
if (R.isAnalyzedReductionRoot(Inst))
return nullptr;
- bool IsBinop = matchRdxBop(Inst, B0, B1);
- bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value()));
- if (IsBinop || IsSelect) {
- HorizontalReduction HorRdx;
- if (HorRdx.matchAssociativeReduction(P, Inst, *SE, *DL, *TLI))
- return HorRdx.tryToReduce(R, TTI, *TLI);
+ if (!isReductionCandidate(Inst))
+ return nullptr;
+ HorizontalReduction HorRdx;
+ if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
+ return nullptr;
+ return HorRdx.tryToReduce(R, TTI, *TLI);
+ };
+ auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
+ if (TryOperandsAsNewSeeds && FutureSeed == Root) {
+ FutureSeed = getNonPhiOperand(Root, P);
+ if (!FutureSeed)
+ return false;
}
- return nullptr;
+ // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
+ // analysis is done separately.
+ if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
+ PostponedInsts.push_back(FutureSeed);
+ return true;
};
+
while (!Stack.empty()) {
Instruction *Inst;
unsigned Level;
@@ -13197,37 +14349,19 @@ bool SLPVectorizerPass::vectorizeHorReduction(
// iteration while stack was populated before that happened.
if (R.isDeleted(Inst))
continue;
- Value *B0 = nullptr, *B1 = nullptr;
- if (Value *V = TryToReduce(Inst, B0, B1)) {
+ if (Value *VectorizedV = TryToReduce(Inst)) {
Res = true;
- // Set P to nullptr to avoid re-analysis of phi node in
- // matchAssociativeReduction function unless this is the root node.
- P = nullptr;
- if (auto *I = dyn_cast<Instruction>(V)) {
+ if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
// Try to find another reduction.
Stack.emplace(I, Level);
continue;
}
} else {
- bool IsBinop = B0 && B1;
- if (P && IsBinop) {
- Inst = dyn_cast<Instruction>(B0);
- if (Inst == P)
- Inst = dyn_cast<Instruction>(B1);
- if (!Inst) {
- // Set P to nullptr to avoid re-analysis of phi node in
- // matchAssociativeReduction function unless this is the root node.
- P = nullptr;
- continue;
- }
+ // We could not vectorize `Inst` so try to use it as a future seed.
+ if (!TryAppendToPostponedInsts(Inst)) {
+ assert(Stack.empty() && "Expected empty stack");
+ break;
}
- // Set P to nullptr to avoid re-analysis of phi node in
- // matchAssociativeReduction function unless this is the root node.
- P = nullptr;
- // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
- // analysis is done separately.
- if (!isa<CmpInst, InsertElementInst, InsertValueInst>(Inst))
- PostponedInsts.push_back(Inst);
}
// Try to vectorize operands.
@@ -13246,11 +14380,11 @@ bool SLPVectorizerPass::vectorizeHorReduction(
return Res;
}
-bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
+bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
BasicBlock *BB, BoUpSLP &R,
TargetTransformInfo *TTI) {
SmallVector<WeakTrackingVH> PostponedInsts;
- bool Res = vectorizeHorReduction(P, V, BB, R, TTI, PostponedInsts);
+ bool Res = vectorizeHorReduction(P, Root, BB, R, TTI, PostponedInsts);
Res |= tryToVectorize(PostponedInsts, R);
return Res;
}
@@ -13297,13 +14431,11 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
}
template <typename T>
-static bool
-tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming,
- function_ref<unsigned(T *)> Limit,
- function_ref<bool(T *, T *)> Comparator,
- function_ref<bool(T *, T *)> AreCompatible,
- function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
- bool LimitForRegisterSize) {
+static bool tryToVectorizeSequence(
+ SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
+ function_ref<bool(T *, T *)> AreCompatible,
+ function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
+ bool MaxVFOnly, BoUpSLP &R) {
bool Changed = false;
// Sort by type, parent, operands.
stable_sort(Incoming, Comparator);
@@ -13331,21 +14463,29 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming,
// same/alternate ops only, this may result in some extra final
// vectorization.
if (NumElts > 1 &&
- TryToVectorizeHelper(ArrayRef(IncIt, NumElts), LimitForRegisterSize)) {
+ TryToVectorizeHelper(ArrayRef(IncIt, NumElts), MaxVFOnly)) {
// Success start over because instructions might have been changed.
Changed = true;
- } else if (NumElts < Limit(*IncIt) &&
- (Candidates.empty() ||
- Candidates.front()->getType() == (*IncIt)->getType())) {
- Candidates.append(IncIt, std::next(IncIt, NumElts));
+ } else {
+ /// \returns the minimum number of elements that we will attempt to
+ /// vectorize.
+ auto GetMinNumElements = [&R](Value *V) {
+ unsigned EltSize = R.getVectorElementSize(V);
+ return std::max(2U, R.getMaxVecRegSize() / EltSize);
+ };
+ if (NumElts < GetMinNumElements(*IncIt) &&
+ (Candidates.empty() ||
+ Candidates.front()->getType() == (*IncIt)->getType())) {
+ Candidates.append(IncIt, std::next(IncIt, NumElts));
+ }
}
// Final attempt to vectorize instructions with the same types.
if (Candidates.size() > 1 &&
(SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
- if (TryToVectorizeHelper(Candidates, /*LimitForRegisterSize=*/false)) {
+ if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
// Success start over because instructions might have been changed.
Changed = true;
- } else if (LimitForRegisterSize) {
+ } else if (MaxVFOnly) {
// Try to vectorize using small vectors.
for (auto *It = Candidates.begin(), *End = Candidates.end();
It != End;) {
@@ -13353,9 +14493,8 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming,
while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It))
++SameTypeIt;
unsigned NumElts = (SameTypeIt - It);
- if (NumElts > 1 &&
- TryToVectorizeHelper(ArrayRef(It, NumElts),
- /*LimitForRegisterSize=*/false))
+ if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(It, NumElts),
+ /*MaxVFOnly=*/false))
Changed = true;
It = SameTypeIt;
}
@@ -13378,11 +14517,12 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming,
/// of the second cmp instruction.
template <bool IsCompatibility>
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
- function_ref<bool(Instruction *)> IsDeleted) {
+ const DominatorTree &DT) {
+ assert(isValidElementType(V->getType()) &&
+ isValidElementType(V2->getType()) &&
+ "Expected valid element types only.");
auto *CI1 = cast<CmpInst>(V);
auto *CI2 = cast<CmpInst>(V2);
- if (IsDeleted(CI2) || !isValidElementType(CI2->getType()))
- return false;
if (CI1->getOperand(0)->getType()->getTypeID() <
CI2->getOperand(0)->getType()->getTypeID())
return !IsCompatibility;
@@ -13411,31 +14551,102 @@ static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
return false;
if (auto *I1 = dyn_cast<Instruction>(Op1))
if (auto *I2 = dyn_cast<Instruction>(Op2)) {
- if (I1->getParent() != I2->getParent())
- return false;
+ if (IsCompatibility) {
+ if (I1->getParent() != I2->getParent())
+ return false;
+ } else {
+ // Try to compare nodes with same parent.
+ DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
+ DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
+ if (!NodeI1)
+ return NodeI2 != nullptr;
+ if (!NodeI2)
+ return false;
+ assert((NodeI1 == NodeI2) ==
+ (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
+ "Different nodes should have different DFS numbers");
+ if (NodeI1 != NodeI2)
+ return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
+ }
InstructionsState S = getSameOpcode({I1, I2}, TLI);
- if (S.getOpcode())
+ if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
continue;
- return false;
+ return !IsCompatibility && I1->getOpcode() < I2->getOpcode();
}
}
return IsCompatibility;
}
-bool SLPVectorizerPass::vectorizeSimpleInstructions(InstSetVector &Instructions,
- BasicBlock *BB, BoUpSLP &R,
- bool AtTerminator) {
+template <typename ItT>
+bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
+ BasicBlock *BB, BoUpSLP &R) {
+ bool Changed = false;
+ // Try to find reductions first.
+ for (CmpInst *I : CmpInsts) {
+ if (R.isDeleted(I))
+ continue;
+ for (Value *Op : I->operands())
+ if (auto *RootOp = dyn_cast<Instruction>(Op))
+ Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R, TTI);
+ }
+ // Try to vectorize operands as vector bundles.
+ for (CmpInst *I : CmpInsts) {
+ if (R.isDeleted(I))
+ continue;
+ Changed |= tryToVectorize(I, R);
+ }
+ // Try to vectorize list of compares.
+ // Sort by type, compare predicate, etc.
+ auto CompareSorter = [&](Value *V, Value *V2) {
+ if (V == V2)
+ return false;
+ return compareCmp<false>(V, V2, *TLI, *DT);
+ };
+
+ auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
+ if (V1 == V2)
+ return true;
+ return compareCmp<true>(V1, V2, *TLI, *DT);
+ };
+
+ SmallVector<Value *> Vals;
+ for (Instruction *V : CmpInsts)
+ if (!R.isDeleted(V) && isValidElementType(V->getType()))
+ Vals.push_back(V);
+ if (Vals.size() <= 1)
+ return Changed;
+ Changed |= tryToVectorizeSequence<Value>(
+ Vals, CompareSorter, AreCompatibleCompares,
+ [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
+ // Exclude possible reductions from other blocks.
+ bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
+ return any_of(V->users(), [V](User *U) {
+ auto *Select = dyn_cast<SelectInst>(U);
+ return Select &&
+ Select->getParent() != cast<Instruction>(V)->getParent();
+ });
+ });
+ if (ArePossiblyReducedInOtherBlock)
+ return false;
+ return tryToVectorizeList(Candidates, R, MaxVFOnly);
+ },
+ /*MaxVFOnly=*/true, R);
+ return Changed;
+}
+
+bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
+ BasicBlock *BB, BoUpSLP &R) {
+ assert(all_of(Instructions,
+ [](auto *I) {
+ return isa<InsertElementInst, InsertValueInst>(I);
+ }) &&
+ "This function only accepts Insert instructions");
bool OpsChanged = false;
- SmallVector<Instruction *, 4> PostponedCmps;
SmallVector<WeakTrackingVH> PostponedInsts;
// pass1 - try to vectorize reductions only
for (auto *I : reverse(Instructions)) {
if (R.isDeleted(I))
continue;
- if (isa<CmpInst>(I)) {
- PostponedCmps.push_back(I);
- continue;
- }
OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, TTI, PostponedInsts);
}
// pass2 - try to match and vectorize a buildvector sequence.
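
The new vectorizeCmpInsts above feeds CompareSorter and AreCompatibleCompares into tryToVectorizeSequence. The self-contained model below sketches that sort-then-group-adjacent driver using plain ints and hypothetical names (processSequence, Less, Compatible, TryVectorize); the real helper also handles retries and the MaxVFOnly hint, which this rough sketch omits.

#include <algorithm>
#include <cstddef>
#include <functional>
#include <iostream>
#include <vector>

// Sort candidates with a strict weak ordering, then walk the sorted list and
// hand each run of mutually "compatible" adjacent elements to a callback.
static bool processSequence(
    std::vector<int> &Vals, const std::function<bool(int, int)> &Less,
    const std::function<bool(int, int)> &Compatible,
    const std::function<bool(const std::vector<int> &)> &TryVectorize) {
  bool Changed = false;
  std::stable_sort(Vals.begin(), Vals.end(), Less);
  for (std::size_t I = 0, E = Vals.size(); I < E;) {
    std::size_t GroupEnd = I + 1;
    while (GroupEnd < E && Compatible(Vals[I], Vals[GroupEnd]))
      ++GroupEnd;
    if (GroupEnd - I > 1)
      Changed |= TryVectorize(
          std::vector<int>(Vals.begin() + I, Vals.begin() + GroupEnd));
    I = GroupEnd;
  }
  return Changed;
}

int main() {
  // "Compatible" here just means equal parity; the pass compares operand
  // types, predicates and opcodes instead.
  std::vector<int> Vals = {7, 2, 9, 4, 3, 8};
  bool Changed = processSequence(
      Vals,
      [](int A, int B) { return A % 2 != B % 2 ? A % 2 < B % 2 : A < B; },
      [](int A, int B) { return A % 2 == B % 2; },
      [](const std::vector<int> &Group) {
        std::cout << "candidate group of size " << Group.size() << '\n';
        return true;
      });
  std::cout << (Changed ? "changed\n" : "unchanged\n");
  return 0;
}
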
@@ -13451,63 +14662,7 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions(InstSetVector &Instructions,
// Now try to vectorize postponed instructions.
OpsChanged |= tryToVectorize(PostponedInsts, R);
- if (AtTerminator) {
- // Try to find reductions first.
- for (Instruction *I : PostponedCmps) {
- if (R.isDeleted(I))
- continue;
- for (Value *Op : I->operands())
- OpsChanged |= vectorizeRootInstruction(nullptr, Op, BB, R, TTI);
- }
- // Try to vectorize operands as vector bundles.
- for (Instruction *I : PostponedCmps) {
- if (R.isDeleted(I))
- continue;
- OpsChanged |= tryToVectorize(I, R);
- }
- // Try to vectorize list of compares.
- // Sort by type, compare predicate, etc.
- auto CompareSorter = [&](Value *V, Value *V2) {
- return compareCmp<false>(V, V2, *TLI,
- [&R](Instruction *I) { return R.isDeleted(I); });
- };
-
- auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
- if (V1 == V2)
- return true;
- return compareCmp<true>(V1, V2, *TLI,
- [&R](Instruction *I) { return R.isDeleted(I); });
- };
- auto Limit = [&R](Value *V) {
- unsigned EltSize = R.getVectorElementSize(V);
- return std::max(2U, R.getMaxVecRegSize() / EltSize);
- };
-
- SmallVector<Value *> Vals(PostponedCmps.begin(), PostponedCmps.end());
- OpsChanged |= tryToVectorizeSequence<Value>(
- Vals, Limit, CompareSorter, AreCompatibleCompares,
- [this, &R](ArrayRef<Value *> Candidates, bool LimitForRegisterSize) {
- // Exclude possible reductions from other blocks.
- bool ArePossiblyReducedInOtherBlock =
- any_of(Candidates, [](Value *V) {
- return any_of(V->users(), [V](User *U) {
- return isa<SelectInst>(U) &&
- cast<SelectInst>(U)->getParent() !=
- cast<Instruction>(V)->getParent();
- });
- });
- if (ArePossiblyReducedInOtherBlock)
- return false;
- return tryToVectorizeList(Candidates, R, LimitForRegisterSize);
- },
- /*LimitForRegisterSize=*/true);
- Instructions.clear();
- } else {
- Instructions.clear();
- // Insert in reverse order since the PostponedCmps vector was filled in
- // reverse order.
- Instructions.insert(PostponedCmps.rbegin(), PostponedCmps.rend());
- }
+ Instructions.clear();
return OpsChanged;
}
@@ -13603,10 +14758,6 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
}
return true;
};
- auto Limit = [&R](Value *V) {
- unsigned EltSize = R.getVectorElementSize(V);
- return std::max(2U, R.getMaxVecRegSize() / EltSize);
- };
bool HaveVectorizedPhiNodes = false;
do {
@@ -13648,19 +14799,44 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
}
HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
- Incoming, Limit, PHICompare, AreCompatiblePHIs,
- [this, &R](ArrayRef<Value *> Candidates, bool LimitForRegisterSize) {
- return tryToVectorizeList(Candidates, R, LimitForRegisterSize);
+ Incoming, PHICompare, AreCompatiblePHIs,
+ [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
+ return tryToVectorizeList(Candidates, R, MaxVFOnly);
},
- /*LimitForRegisterSize=*/true);
+ /*MaxVFOnly=*/true, R);
Changed |= HaveVectorizedPhiNodes;
VisitedInstrs.insert(Incoming.begin(), Incoming.end());
} while (HaveVectorizedPhiNodes);
VisitedInstrs.clear();
- InstSetVector PostProcessInstructions;
- SmallDenseSet<Instruction *, 4> KeyNodes;
+ InstSetVector PostProcessInserts;
+ SmallSetVector<CmpInst *, 8> PostProcessCmps;
+ // Vectorizes Inserts in `PostProcessInserts` and, if `VectorizeCmps` is
+ // true, also vectorizes `PostProcessCmps`.
+ auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
+ bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
+ if (VectorizeCmps) {
+ Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
+ PostProcessCmps.clear();
+ }
+ PostProcessInserts.clear();
+ return Changed;
+ };
+ // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
+ auto IsInPostProcessInstrs = [&](Instruction *I) {
+ if (auto *Cmp = dyn_cast<CmpInst>(I))
+ return PostProcessCmps.contains(Cmp);
+ return isa<InsertElementInst, InsertValueInst>(I) &&
+ PostProcessInserts.contains(I);
+ };
+ // Returns true if `I` is an instruction without users, e.g. a terminator,
+ // a store, or a call whose result is ignored. Only void-typed instructions
+ // qualify, except for CallInst and InvokeInst, which qualify even when
+ // their non-void result is unused.
+ auto HasNoUsers = [](Instruction *I) {
+ return I->use_empty() &&
+ (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
+ };
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
// Skip instructions with scalable type. The num of elements is unknown at
// compile-time for scalable type.
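
The PostProcessInserts / PostProcessCmps split introduced above implements a simple scheduling policy: postponed insert candidates are flushed at every user-less "key" instruction, while postponed compares are flushed only once the block terminator is reached. The toy model below, with invented names (FakeInst, Flush) and no LLVM types, is only a rough illustration of that policy, not of the pass itself.

#include <iostream>
#include <string>
#include <vector>

struct FakeInst {
  std::string Name;
  bool IsCmp = false;
  bool IsInsert = false;
  bool IsTerminator = false;
};

int main() {
  std::vector<FakeInst> Block = {
      {"insert1", false, true}, {"cmp1", true}, {"store1"},
      {"insert2", false, true}, {"cmp2", true}, {"ret", false, false, true}};

  std::vector<std::string> PostponedInserts, PostponedCmps;
  auto Flush = [&](bool VectorizeCmps) {
    for (const std::string &N : PostponedInserts)
      std::cout << "try insert bundle around " << N << '\n';
    PostponedInserts.clear();
    if (VectorizeCmps) {
      for (const std::string &N : PostponedCmps)
        std::cout << "try cmp bundle around " << N << '\n';
      PostponedCmps.clear();
    }
  };

  for (const FakeInst &I : Block) {
    if (I.IsInsert) {
      PostponedInserts.push_back(I.Name);
      continue;
    }
    if (I.IsCmp) {
      PostponedCmps.push_back(I.Name);
      continue;
    }
    // A user-less "key" instruction triggers a flush; compares are only
    // flushed when that key instruction is the terminator.
    Flush(/*VectorizeCmps=*/I.IsTerminator);
  }
  return 0;
}
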
@@ -13672,9 +14848,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
continue;
// We may go through BB multiple times so skip the one we have checked.
if (!VisitedInstrs.insert(&*it).second) {
- if (it->use_empty() && KeyNodes.contains(&*it) &&
- vectorizeSimpleInstructions(PostProcessInstructions, BB, R,
- it->isTerminator())) {
+ if (HasNoUsers(&*it) &&
+ VectorizeInsertsAndCmps(/*VectorizeCmps=*/it->isTerminator())) {
// We would like to start over since some instructions are deleted
// and the iterator may become invalid value.
Changed = true;
@@ -13692,8 +14867,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// Check that the PHI is a reduction PHI.
if (P->getNumIncomingValues() == 2) {
// Try to match and vectorize a horizontal reduction.
- if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
- TTI)) {
+ Instruction *Root = getReductionInstr(DT, P, BB, LI);
+ if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) {
Changed = true;
it = BB->begin();
e = BB->end();
@@ -13714,19 +14889,14 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// Postponed instructions should not be vectorized here, delay their
// vectorization.
if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
- PI && !PostProcessInstructions.contains(PI))
- Changed |= vectorizeRootInstruction(nullptr, P->getIncomingValue(I),
+ PI && !IsInPostProcessInstrs(PI))
+ Changed |= vectorizeRootInstruction(nullptr, PI,
P->getIncomingBlock(I), R, TTI);
}
continue;
}
- // Ran into an instruction without users, like terminator, or function call
- // with ignored return value, store. Ignore unused instructions (basing on
- // instruction type, except for CallInst and InvokeInst).
- if (it->use_empty() &&
- (it->getType()->isVoidTy() || isa<CallInst, InvokeInst>(it))) {
- KeyNodes.insert(&*it);
+ if (HasNoUsers(&*it)) {
bool OpsChanged = false;
auto *SI = dyn_cast<StoreInst>(it);
bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
@@ -13746,16 +14916,16 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// Postponed instructions should not be vectorized here, delay their
// vectorization.
if (auto *VI = dyn_cast<Instruction>(V);
- VI && !PostProcessInstructions.contains(VI))
+ VI && !IsInPostProcessInstrs(VI))
// Try to match and vectorize a horizontal reduction.
- OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI);
+ OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R, TTI);
}
}
// Start vectorization of post-process list of instructions from the
// top-tree instructions to try to vectorize as many instructions as
// possible.
- OpsChanged |= vectorizeSimpleInstructions(PostProcessInstructions, BB, R,
- it->isTerminator());
+ OpsChanged |=
+ VectorizeInsertsAndCmps(/*VectorizeCmps=*/it->isTerminator());
if (OpsChanged) {
// We would like to start over since some instructions are deleted
// and the iterator may become invalid value.
@@ -13766,8 +14936,10 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
}
}
- if (isa<CmpInst, InsertElementInst, InsertValueInst>(it))
- PostProcessInstructions.insert(&*it);
+ if (isa<InsertElementInst, InsertValueInst>(it))
+ PostProcessInserts.insert(&*it);
+ else if (isa<CmpInst>(it))
+ PostProcessCmps.insert(cast<CmpInst>(&*it));
}
return Changed;
@@ -13928,10 +15100,6 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
return V1->getValueOperand()->getValueID() ==
V2->getValueOperand()->getValueID();
};
- auto Limit = [&R, this](StoreInst *SI) {
- unsigned EltSize = DL->getTypeSizeInBits(SI->getValueOperand()->getType());
- return R.getMinVF(EltSize);
- };
// Attempt to sort and vectorize each of the store-groups.
for (auto &Pair : Stores) {
@@ -13945,28 +15113,11 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
continue;
Changed |= tryToVectorizeSequence<StoreInst>(
- Pair.second, Limit, StoreSorter, AreCompatibleStores,
+ Pair.second, StoreSorter, AreCompatibleStores,
[this, &R](ArrayRef<StoreInst *> Candidates, bool) {
return vectorizeStores(Candidates, R);
},
- /*LimitForRegisterSize=*/false);
+ /*MaxVFOnly=*/false, R);
}
return Changed;
}
-
-char SLPVectorizer::ID = 0;
-
-static const char lv_name[] = "SLP Vectorizer";
-
-INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
-INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
-INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
-
-Pass *llvm::createSLPVectorizerPass() { return new SLPVectorizer(); }
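
A small C++ detail from the vectorizeStoreChains hunk above: the callback handed to tryToVectorizeSequence receives the MaxVFOnly flag but leaves the bool parameter unnamed, because vectorizeStores() does not consult it. A minimal, hypothetical standalone example of the same idiom, using plain std::function in place of the pass's callback type:

#include <functional>
#include <iostream>
#include <vector>

int main() {
  // The callback signature requires (candidates, flag); this callee does not
  // need the flag, so the parameter is left unnamed, which also silences
  // -Wunused-parameter.
  std::function<bool(const std::vector<int> &, bool)> TryVectorize =
      [](const std::vector<int> &Candidates, bool /*MaxVFOnly*/) {
        std::cout << "trying a chain of " << Candidates.size() << " stores\n";
        return Candidates.size() > 1;
      };

  std::vector<int> Chain = {1, 2, 3, 4};
  bool Changed = TryVectorize(Chain, /*MaxVFOnly=*/false);
  std::cout << (Changed ? "changed\n" : "unchanged\n");
  return 0;
}
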