author    Dimitry Andric <dim@FreeBSD.org> 2022-03-20 11:40:34 +0000
committer Dimitry Andric <dim@FreeBSD.org> 2022-05-14 11:43:05 +0000
commit    349cc55c9796c4596a5b9904cd3281af295f878f (patch)
tree      410c5a785075730a35f1272ca6a7adf72222ad03 /contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
parent    cb2ae6163174b90e999326ecec3699ee093a5d43 (diff)
parent    c0981da47d5696fe36474fcf86b4ce03ae3ff818 (diff)
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2381
1 file changed, 1594 insertions(+), 787 deletions(-)
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 1d06bc7d79a7..e3ef0b794f68 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -21,6 +21,7 @@
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/PriorityQueue.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
@@ -200,12 +201,39 @@ static bool isValidElementType(Type *Ty) {
!Ty->isPPC_FP128Ty();
}
+/// \returns True if the value is a constant (but not globals/constant
+/// expressions).
+static bool isConstant(Value *V) {
+ return isa<Constant>(V) && !isa<ConstantExpr>(V) && !isa<GlobalValue>(V);
+}
+
+/// Checks if \p V is one of the vector-like values, i.e. undef, an
+/// insertelement/extractelement with constant indices on a fixed vector type,
+/// or an extractvalue instruction.
+static bool isVectorLikeInstWithConstOps(Value *V) {
+ if (!isa<InsertElementInst, ExtractElementInst>(V) &&
+ !isa<ExtractValueInst, UndefValue>(V))
+ return false;
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I || isa<ExtractValueInst>(I))
+ return true;
+ if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
+ return false;
+ if (isa<ExtractElementInst>(I))
+ return isConstant(I->getOperand(1));
+ assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
+ return isConstant(I->getOperand(2));
+}
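// Illustration (not part of the patch): what the predicate above returns for
// a few hypothetical IR values %v, %x, %i:
//   undef                                        -> true  (not an instruction)
//   extractvalue {i32, i32} %agg, 0              -> true  (always accepted)
//   insertelement <4 x i32> %v, i32 %x, i32 1    -> true  (constant index)
//   insertelement <4 x i32> %v, i32 %x, i32 %i   -> false (variable index)
//   extractelement <vscale x 4 x i32> %v, i32 0  -> false (scalable vector)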
+
/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
Instruction *I0 = dyn_cast<Instruction>(VL[0]);
if (!I0)
return false;
+ if (all_of(VL, isVectorLikeInstWithConstOps))
+ return true;
+
BasicBlock *BB = I0->getParent();
for (int I = 1, E = VL.size(); I < E; I++) {
auto *II = dyn_cast<Instruction>(VL[I]);
@@ -218,12 +246,6 @@ static bool allSameBlock(ArrayRef<Value *> VL) {
return true;
}
-/// \returns True if the value is a constant (but not globals/constant
-/// expressions).
-static bool isConstant(Value *V) {
- return isa<Constant>(V) && !isa<ConstantExpr>(V) && !isa<GlobalValue>(V);
-}
-
/// \returns True if all of the values in \p VL are constants (but not
/// globals/constant expressions).
static bool allConstant(ArrayRef<Value *> VL) {
@@ -232,12 +254,21 @@ static bool allConstant(ArrayRef<Value *> VL) {
return all_of(VL, isConstant);
}
-/// \returns True if all of the values in \p VL are identical.
+/// \returns True if all of the values in \p VL are identical or some of them
+/// are UndefValue.
static bool isSplat(ArrayRef<Value *> VL) {
- for (unsigned i = 1, e = VL.size(); i < e; ++i)
- if (VL[i] != VL[0])
+ Value *FirstNonUndef = nullptr;
+ for (Value *V : VL) {
+ if (isa<UndefValue>(V))
+ continue;
+ if (!FirstNonUndef) {
+ FirstNonUndef = V;
+ continue;
+ }
+ if (V != FirstNonUndef)
return false;
- return true;
+ }
+ return FirstNonUndef != nullptr;
}
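// Illustration (not part of the patch): a few cases for the relaxed splat
// check, with a and b standing for distinct non-undef values:
//   {a, a, a, a}          -> true
//   {a, undef, a, undef}  -> true   (undefs are skipped)
//   {a, b, a, a}          -> false  (two distinct non-undef values)
//   {undef, undef}        -> false  (no non-undef element at all)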
/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
@@ -295,8 +326,10 @@ static bool isCommutative(Instruction *I) {
/// TODO: Can we split off and reuse the shuffle mask detection from
/// TargetTransformInfo::getInstructionThroughput?
static Optional<TargetTransformInfo::ShuffleKind>
-isShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
+isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
auto *EI0 = cast<ExtractElementInst>(VL[0]);
+ if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
+ return None;
unsigned Size =
cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
Value *Vec1 = nullptr;
@@ -504,7 +537,7 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
case Instruction::Call: {
CallInst *CI = cast<CallInst>(UserInst);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
+ for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) {
if (hasVectorInstrinsicScalarOpd(ID, i))
return (CI->getArgOperand(i) == Scalar);
}
@@ -535,13 +568,67 @@ static bool isSimple(Instruction *I) {
return true;
}
+/// Shuffles \p Mask in accordance with the given \p SubMask.
+static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) {
+ if (SubMask.empty())
+ return;
+ if (Mask.empty()) {
+ Mask.append(SubMask.begin(), SubMask.end());
+ return;
+ }
+ SmallVector<int> NewMask(SubMask.size(), UndefMaskElem);
+ int TermValue = std::min(Mask.size(), SubMask.size());
+ for (int I = 0, E = SubMask.size(); I < E; ++I) {
+ if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem ||
+ Mask[SubMask[I]] >= TermValue)
+ continue;
+ NewMask[I] = Mask[SubMask[I]];
+ }
+ Mask.swap(NewMask);
+}
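// A minimal standalone sketch of the same composition (not part of the
// patch), using std::vector and a literal -1 in place of UndefMaskElem; the
// name composeMasks is hypothetical:
#include <algorithm>
#include <vector>

// Returns the result of shuffling Mask by SubMask, as addMask does in place.
static std::vector<int> composeMasks(const std::vector<int> &Mask,
                                     const std::vector<int> &SubMask) {
  if (SubMask.empty())
    return Mask;
  if (Mask.empty())
    return SubMask;
  std::vector<int> NewMask(SubMask.size(), /*UndefMaskElem*/ -1);
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
    // Keep the undef marker where SubMask is undef or points past a mask.
    if (SubMask[I] == -1 || SubMask[I] >= TermValue ||
        Mask[SubMask[I]] >= TermValue)
      continue;
    NewMask[I] = Mask[SubMask[I]];
  }
  return NewMask;
}

// composeMasks({2, 0, 1}, {1, 2, -1}) == {0, 1, -1}.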
+
+/// Order may have elements assigned the special value (size), which is out of
+/// bounds. Such indices appear only at positions that correspond to undef
+/// values (see canReuseExtract for details) and are used to keep undef values
+/// from affecting the ordering of the operands.
+/// The first loop below finds all unused indices, and the following loop nest
+/// assigns those indices to the positions of the undef values.
+/// In the example below, Order has two undef positions, which are assigned
+/// the values 3 and 7 respectively:
+/// before: 6 9 5 4 9 2 1 0
+/// after: 6 3 5 4 7 2 1 0
+static void fixupOrderingIndices(SmallVectorImpl<unsigned> &Order) {
+ const unsigned Sz = Order.size();
+ SmallBitVector UsedIndices(Sz);
+ SmallVector<int> MaskedIndices;
+ for (unsigned I = 0; I < Sz; ++I) {
+ if (Order[I] < Sz)
+ UsedIndices.set(Order[I]);
+ else
+ MaskedIndices.push_back(I);
+ }
+ if (MaskedIndices.empty())
+ return;
+ SmallVector<int> AvailableIndices(MaskedIndices.size());
+ unsigned Cnt = 0;
+ int Idx = UsedIndices.find_first_unset();
+ do {
+ AvailableIndices[Cnt] = Idx;
+ Idx = UsedIndices.find_next_unset(Idx);
+ ++Cnt;
+ } while (Idx > 0);
+ assert(Cnt == MaskedIndices.size() && "Non-synced masked/available indices.");
+ for (int I = 0, E = MaskedIndices.size(); I < E; ++I)
+ Order[MaskedIndices[I]] = AvailableIndices[I];
+}
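// A standalone sketch of the same fix-up (not part of the patch), using
// std::vector in place of the LLVM containers; like the assertion above, it
// assumes the in-range entries of Order are distinct:
#include <vector>

static void fixupOrderSketch(std::vector<unsigned> &Order) {
  const unsigned Sz = Order.size();
  std::vector<bool> Used(Sz, false);
  std::vector<unsigned> MaskedPositions;
  for (unsigned I = 0; I < Sz; ++I) {
    if (Order[I] < Sz)
      Used[Order[I]] = true;
    else
      MaskedPositions.push_back(I); // held the out-of-bounds marker
  }
  // Hand the still-unused indices, in increasing order, to those positions.
  unsigned NextIdx = 0;
  for (unsigned Pos : MaskedPositions) {
    while (Used[NextIdx])
      ++NextIdx;
    Order[Pos] = NextIdx++;
  }
}

// {6, 9, 5, 4, 9, 2, 1, 0} becomes {6, 3, 5, 4, 7, 2, 1, 0}, matching the
// example in the comment above.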
+
namespace llvm {
static void inversePermutation(ArrayRef<unsigned> Indices,
SmallVectorImpl<int> &Mask) {
Mask.clear();
const unsigned E = Indices.size();
- Mask.resize(E, E + 1);
+ Mask.resize(E, UndefMaskElem);
for (unsigned I = 0; I < E; ++I)
Mask[Indices[I]] = I;
}
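// A one-line example of the inversion (illustrative values): with
// Indices == {2, 0, 1}, Mask becomes {1, 2, 0}. Indices says "lane 0 of the
// ordered vector comes from position 2", so the mask sends position 2 to
// lane 0, position 0 to lane 1, and position 1 to lane 2.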
@@ -581,6 +668,22 @@ static Optional<int> getInsertIndex(Value *InsertInst, unsigned Offset) {
return Index;
}
+/// Reorders the list of scalars in accordance with the given \p Mask: the
+/// element at position I of the input ends up at position Mask[I] of the
+/// output, and positions whose mask entry is UndefMaskElem are left as undef.
+static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
+ ArrayRef<int> Mask) {
+ assert(!Mask.empty() && "Expected non-empty mask.");
+ SmallVector<Value *> Prev(Scalars.size(),
+ UndefValue::get(Scalars.front()->getType()));
+ Prev.swap(Scalars);
+ for (unsigned I = 0, E = Prev.size(); I < E; ++I)
+ if (Mask[I] != UndefMaskElem)
+ Scalars[Mask[I]] = Prev[I];
+}
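// A concrete example of the convention (illustrative): with
// Scalars == {a, b, c} and Mask == {1, 2, 0}, element I moves to lane
// Mask[I], so the result is {c, a, b}; lanes never written keep the undef
// placeholder created above.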
+
namespace slpvectorizer {
/// Bottom Up SLP Vectorizer.
@@ -645,13 +748,12 @@ public:
void buildTree(ArrayRef<Value *> Roots,
ArrayRef<Value *> UserIgnoreLst = None);
- /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
- /// the purpose of scheduling and extraction in the \p UserIgnoreLst taking
- /// into account (and updating it, if required) list of externally used
- /// values stored in \p ExternallyUsedValues.
- void buildTree(ArrayRef<Value *> Roots,
- ExtraValueToDebugLocsMap &ExternallyUsedValues,
- ArrayRef<Value *> UserIgnoreLst = None);
+ /// Builds the external uses of the vectorized scalars, i.e. the list of
+ /// vectorized scalars to be extracted, their lanes and their scalar users.
+ /// \p ExternallyUsedValues contains an additional list of external uses to
+ /// handle the vectorization of reductions.
+ void
+ buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
/// Clear the internal data structures that are created by 'buildTree'.
void deleteTree() {
@@ -659,8 +761,6 @@ public:
ScalarToTreeEntry.clear();
MustGather.clear();
ExternalUses.clear();
- NumOpsWantToKeepOrder.clear();
- NumOpsWantToKeepOriginalOrder = 0;
for (auto &Iter : BlocksSchedules) {
BlockScheduling *BS = Iter.second.get();
BS->clear();
@@ -674,103 +774,28 @@ public:
/// Perform LICM and CSE on the newly generated gather sequences.
void optimizeGatherSequence();
- /// \returns The best order of instructions for vectorization.
- Optional<ArrayRef<unsigned>> bestOrder() const {
- assert(llvm::all_of(
- NumOpsWantToKeepOrder,
- [this](const decltype(NumOpsWantToKeepOrder)::value_type &D) {
- return D.getFirst().size() ==
- VectorizableTree[0]->Scalars.size();
- }) &&
- "All orders must have the same size as number of instructions in "
- "tree node.");
- auto I = std::max_element(
- NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(),
- [](const decltype(NumOpsWantToKeepOrder)::value_type &D1,
- const decltype(NumOpsWantToKeepOrder)::value_type &D2) {
- return D1.second < D2.second;
- });
- if (I == NumOpsWantToKeepOrder.end() ||
- I->getSecond() <= NumOpsWantToKeepOriginalOrder)
- return None;
-
- return makeArrayRef(I->getFirst());
- }
-
- /// Builds the correct order for root instructions.
- /// If some leaves have the same instructions to be vectorized, we may
- /// incorrectly evaluate the best order for the root node (it is built for the
- /// vector of instructions without repeated instructions and, thus, has less
- /// elements than the root node). This function builds the correct order for
- /// the root node.
- /// For example, if the root node is \<a+b, a+c, a+d, f+e\>, then the leaves
- /// are \<a, a, a, f\> and \<b, c, d, e\>. When we try to vectorize the first
- /// leaf, it will be shrink to \<a, b\>. If instructions in this leaf should
- /// be reordered, the best order will be \<1, 0\>. We need to extend this
- /// order for the root node. For the root node this order should look like
- /// \<3, 0, 1, 2\>. This function extends the order for the reused
- /// instructions.
- void findRootOrder(OrdersType &Order) {
- // If the leaf has the same number of instructions to vectorize as the root
- // - order must be set already.
- unsigned RootSize = VectorizableTree[0]->Scalars.size();
- if (Order.size() == RootSize)
- return;
- SmallVector<unsigned, 4> RealOrder(Order.size());
- std::swap(Order, RealOrder);
- SmallVector<int, 4> Mask;
- inversePermutation(RealOrder, Mask);
- Order.assign(Mask.begin(), Mask.end());
- // The leaf has less number of instructions - need to find the true order of
- // the root.
- // Scan the nodes starting from the leaf back to the root.
- const TreeEntry *PNode = VectorizableTree.back().get();
- SmallVector<const TreeEntry *, 4> Nodes(1, PNode);
- SmallPtrSet<const TreeEntry *, 4> Visited;
- while (!Nodes.empty() && Order.size() != RootSize) {
- const TreeEntry *PNode = Nodes.pop_back_val();
- if (!Visited.insert(PNode).second)
- continue;
- const TreeEntry &Node = *PNode;
- for (const EdgeInfo &EI : Node.UserTreeIndices)
- if (EI.UserTE)
- Nodes.push_back(EI.UserTE);
- if (Node.ReuseShuffleIndices.empty())
- continue;
- // Build the order for the parent node.
- OrdersType NewOrder(Node.ReuseShuffleIndices.size(), RootSize);
- SmallVector<unsigned, 4> OrderCounter(Order.size(), 0);
- // The algorithm of the order extension is:
- // 1. Calculate the number of the same instructions for the order.
- // 2. Calculate the index of the new order: total number of instructions
- // with order less than the order of the current instruction + reuse
- // number of the current instruction.
- // 3. The new order is just the index of the instruction in the original
- // vector of the instructions.
- for (unsigned I : Node.ReuseShuffleIndices)
- ++OrderCounter[Order[I]];
- SmallVector<unsigned, 4> CurrentCounter(Order.size(), 0);
- for (unsigned I = 0, E = Node.ReuseShuffleIndices.size(); I < E; ++I) {
- unsigned ReusedIdx = Node.ReuseShuffleIndices[I];
- unsigned OrderIdx = Order[ReusedIdx];
- unsigned NewIdx = 0;
- for (unsigned J = 0; J < OrderIdx; ++J)
- NewIdx += OrderCounter[J];
- NewIdx += CurrentCounter[OrderIdx];
- ++CurrentCounter[OrderIdx];
- assert(NewOrder[NewIdx] == RootSize &&
- "The order index should not be written already.");
- NewOrder[NewIdx] = I;
- }
- std::swap(Order, NewOrder);
- }
- assert(Order.size() == RootSize &&
- "Root node is expected or the size of the order must be the same as "
- "the number of elements in the root node.");
- assert(llvm::all_of(Order,
- [RootSize](unsigned Val) { return Val != RootSize; }) &&
- "All indices must be initialized");
- }
+ /// Checks if the specified gather tree entry \p TE can be represented as a
+ /// shuffled vector entry + (possibly) permutation with other gathers. It
+ /// implements the checks only for possibly ordered scalars (Loads,
+ /// ExtractElement, ExtractValue), which can be part of the graph.
+ Optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
+
+ /// Reorders the current graph to the most profitable order starting from the
+ /// root node to the leaf nodes. The best order is chosen only from the nodes
+ /// of the same size (vectorization factor). Smaller nodes are considered
+ /// parts of a subgraph with a smaller VF and are reordered independently. We
+ /// can do this because we still need to extend smaller nodes to the wider VF
+ /// anyway, and we can merge the reordering shuffles with the widening
+ /// shuffles.
+ void reorderTopToBottom();
+
+ /// Reorders the current graph to the most profitable order starting from the
+ /// leaves to the root. It allows rotating small subgraphs and reduces the
+ /// number of reshuffles if the leaf nodes use the same order. In this case we
+ /// can merge the orders and just shuffle the user node instead of shuffling
+ /// its operands. Plus, even if the leaf nodes have different orders, this
+ /// allows the reordering to sink closer to the root node of the graph, where
+ /// it can be merged later during analysis.
+ void reorderBottomToTop(bool IgnoreReorder = false);
/// \return The vector element size in bits to use when vectorizing the
/// expression tree ending at \p V. If V is a store, the size is the width of
@@ -793,6 +818,10 @@ public:
return MinVecRegSize;
}
+ unsigned getMinVF(unsigned Sz) const {
+ return std::max(2U, getMinVecRegSize() / Sz);
+ }
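// Here Sz is the scalar width in bits, so with the default 128-bit minimum
// vector register size (an assumption; the value is target/option dependent):
//   getMinVF(32) == max(2, 128 / 32) == 4 lanes for i32/float scalars;
//   getMinVF(64) == max(2, 128 / 64) == 2 lanes for i64/double scalars.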
+
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
@@ -809,7 +838,7 @@ public:
/// \returns True if the VectorizableTree is both tiny and not fully
/// vectorizable. We do not vectorize such trees.
- bool isTreeTinyAndNotFullyVectorizable() const;
+ bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
/// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
/// can be load combined in the backend. Load combining may not be allowed in
@@ -1578,10 +1607,12 @@ private:
Value *vectorizeTree(ArrayRef<Value *> VL);
/// \returns the scalarization cost for this type. Scalarization in this
- /// context means the creation of vectors from a group of scalars.
- InstructionCost
- getGatherCost(FixedVectorType *Ty,
- const DenseSet<unsigned> &ShuffledIndices) const;
+ /// context means the creation of vectors from a group of scalars. If \p
+ /// NeedToShuffle is true, the cost of reshuffling some of the vector
+ /// elements is added as well.
+ InstructionCost getGatherCost(FixedVectorType *Ty,
+ const DenseSet<unsigned> &ShuffledIndices,
+ bool NeedToShuffle) const;
/// Checks if the gathered \p VL can be represented as shuffle(s) of previous
/// tree entries.
@@ -1605,7 +1636,7 @@ private:
/// \returns whether the VectorizableTree is fully vectorizable and will
/// be beneficial even the tree height is tiny.
- bool isFullyVectorizableTinyTree() const;
+ bool isFullyVectorizableTinyTree(bool ForReduction) const;
/// Reorder commutative or alt operands to get better probability of
/// generating vectorized code.
@@ -1621,14 +1652,43 @@ private:
/// \returns true if the scalars in VL are equal to this entry.
bool isSame(ArrayRef<Value *> VL) const {
- if (VL.size() == Scalars.size())
- return std::equal(VL.begin(), VL.end(), Scalars.begin());
- return VL.size() == ReuseShuffleIndices.size() &&
- std::equal(
- VL.begin(), VL.end(), ReuseShuffleIndices.begin(),
- [this](Value *V, int Idx) { return V == Scalars[Idx]; });
+ auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
+ if (Mask.size() != VL.size() && VL.size() == Scalars.size())
+ return std::equal(VL.begin(), VL.end(), Scalars.begin());
+ return VL.size() == Mask.size() &&
+ std::equal(VL.begin(), VL.end(), Mask.begin(),
+ [Scalars](Value *V, int Idx) {
+ return (isa<UndefValue>(V) &&
+ Idx == UndefMaskElem) ||
+ (Idx != UndefMaskElem && V == Scalars[Idx]);
+ });
+ };
+ if (!ReorderIndices.empty()) {
+ // TODO: implement matching if the nodes are just reordered, still can
+ // treat the vector as the same if the list of scalars matches VL
+ // directly, without reordering.
+ SmallVector<int> Mask;
+ inversePermutation(ReorderIndices, Mask);
+ if (VL.size() == Scalars.size())
+ return IsSame(Scalars, Mask);
+ if (VL.size() == ReuseShuffleIndices.size()) {
+ ::addMask(Mask, ReuseShuffleIndices);
+ return IsSame(Scalars, Mask);
+ }
+ return false;
+ }
+ return IsSame(Scalars, ReuseShuffleIndices);
}
+ /// \return The final vectorization factor for the node, defined by the total
+ /// number of vectorized scalars, including those used several times in the
+ /// entry and counted in \a ReuseShuffleIndices, if any.
+ unsigned getVectorFactor() const {
+ if (!ReuseShuffleIndices.empty())
+ return ReuseShuffleIndices.size();
+ return Scalars.size();
+ };
+
/// A vector of scalars.
ValueList Scalars;
@@ -1701,6 +1761,12 @@ private:
}
}
+ /// Reorders operands of the node to the given mask \p Mask.
+ void reorderOperands(ArrayRef<int> Mask) {
+ for (ValueList &Operand : Operands)
+ reorderScalars(Operand, Mask);
+ }
+
/// \returns the \p OpIdx operand of this TreeEntry.
ValueList &getOperand(unsigned OpIdx) {
assert(OpIdx < Operands.size() && "Off bounds");
@@ -1760,19 +1826,14 @@ private:
return AltOp ? AltOp->getOpcode() : 0;
}
- /// Update operations state of this entry if reorder occurred.
- bool updateStateIfReorder() {
- if (ReorderIndices.empty())
- return false;
- InstructionsState S = getSameOpcode(Scalars, ReorderIndices.front());
- setOperations(S);
- return true;
- }
- /// When ReuseShuffleIndices is empty it just returns position of \p V
- /// within vector of Scalars. Otherwise, try to remap on its reuse index.
+ /// When ReuseReorderShuffleIndices is empty it just returns position of \p
+ /// V within vector of Scalars. Otherwise, try to remap on its reuse index.
int findLaneForValue(Value *V) const {
unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
+ if (!ReorderIndices.empty())
+ FoundLane = ReorderIndices[FoundLane];
+ assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
if (!ReuseShuffleIndices.empty()) {
FoundLane = std::distance(ReuseShuffleIndices.begin(),
find(ReuseShuffleIndices, FoundLane));
@@ -1856,7 +1917,7 @@ private:
TreeEntry *newTreeEntry(ArrayRef<Value *> VL, Optional<ScheduleData *> Bundle,
const InstructionsState &S,
const EdgeInfo &UserTreeIdx,
- ArrayRef<unsigned> ReuseShuffleIndices = None,
+ ArrayRef<int> ReuseShuffleIndices = None,
ArrayRef<unsigned> ReorderIndices = None) {
TreeEntry::EntryState EntryState =
Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
@@ -1869,7 +1930,7 @@ private:
Optional<ScheduleData *> Bundle,
const InstructionsState &S,
const EdgeInfo &UserTreeIdx,
- ArrayRef<unsigned> ReuseShuffleIndices = None,
+ ArrayRef<int> ReuseShuffleIndices = None,
ArrayRef<unsigned> ReorderIndices = None) {
assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
(Bundle && EntryState != TreeEntry::NeedToGather)) &&
@@ -1877,12 +1938,25 @@ private:
VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
TreeEntry *Last = VectorizableTree.back().get();
Last->Idx = VectorizableTree.size() - 1;
- Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
Last->State = EntryState;
Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
ReuseShuffleIndices.end());
- Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
- Last->setOperations(S);
+ if (ReorderIndices.empty()) {
+ Last->Scalars.assign(VL.begin(), VL.end());
+ Last->setOperations(S);
+ } else {
+ // Reorder scalars and build final mask.
+ Last->Scalars.assign(VL.size(), nullptr);
+ transform(ReorderIndices, Last->Scalars.begin(),
+ [VL](unsigned Idx) -> Value * {
+ if (Idx >= VL.size())
+ return UndefValue::get(VL.front()->getType());
+ return VL[Idx];
+ });
+ InstructionsState S = getSameOpcode(Last->Scalars);
+ Last->setOperations(S);
+ Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
+ }
if (Last->State != TreeEntry::NeedToGather) {
for (Value *V : VL) {
assert(!getTreeEntry(V) && "Scalar already in tree!");
@@ -1965,12 +2039,9 @@ private:
if (result.hasValue()) {
return result.getValue();
}
- MemoryLocation Loc2 = getLocation(Inst2, AA);
bool aliased = true;
- if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
- // Do the alias check.
- aliased = !AA->isNoAlias(Loc1, Loc2);
- }
+ if (Loc1.Ptr && isSimple(Inst1))
+ aliased = isModOrRefSet(AA->getModRefInfo(Inst2, Loc1));
// Store the result in the cache.
result = aliased;
return aliased;
@@ -2434,14 +2505,6 @@ private:
}
};
- /// Contains orders of operations along with the number of bundles that have
- /// operations in this order. It stores only those orders that require
- /// reordering, if reordering is not required it is counted using \a
- /// NumOpsWantToKeepOriginalOrder.
- DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> NumOpsWantToKeepOrder;
- /// Number of bundles that do not require reordering.
- unsigned NumOpsWantToKeepOriginalOrder = 0;
-
// Analysis and block reference.
Function *F;
ScalarEvolution *SE;
@@ -2540,10 +2603,8 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
std::string Str;
raw_string_ostream OS(Str);
- if (isSplat(Entry->Scalars)) {
- OS << "<splat> " << *Entry->Scalars[0];
- return Str;
- }
+ if (isSplat(Entry->Scalars))
+ OS << "<splat> ";
for (auto V : Entry->Scalars) {
OS << *V;
if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
@@ -2594,21 +2655,539 @@ void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) {
};
}
-void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
- ArrayRef<Value *> UserIgnoreLst) {
- ExtraValueToDebugLocsMap ExternallyUsedValues;
- buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
+/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
+/// contains the original mask for the scalars reused in the node. The
+/// procedure transforms this mask in accordance with the given \p Mask.
+static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
+ assert(!Mask.empty() && Reuses.size() == Mask.size() &&
+ "Expected non-empty mask.");
+ SmallVector<int> Prev(Reuses.begin(), Reuses.end());
+ Prev.swap(Reuses);
+ for (unsigned I = 0, E = Prev.size(); I < E; ++I)
+ if (Mask[I] != UndefMaskElem)
+ Reuses[Mask[I]] = Prev[I];
}
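// A small worked example (illustrative values): with Reuses == {0, 1, 0, 1}
// and Mask == {1, 0, 3, 2}, entry I moves to position Mask[I], giving
// {1, 0, 1, 0}.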
-void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
- ExtraValueToDebugLocsMap &ExternallyUsedValues,
- ArrayRef<Value *> UserIgnoreLst) {
- deleteTree();
- UserIgnoreList = UserIgnoreLst;
- if (!allSameType(Roots))
+/// Reorders the given \p Order according to the given \p Mask. \p Order is
+/// the original order of the scalars. The procedure transforms the provided
+/// order in accordance with the given \p Mask. If the resulting \p Order is
+/// just an identity order, \p Order is cleared.
+static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask) {
+ assert(!Mask.empty() && "Expected non-empty mask.");
+ SmallVector<int> MaskOrder;
+ if (Order.empty()) {
+ MaskOrder.resize(Mask.size());
+ std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
+ } else {
+ inversePermutation(Order, MaskOrder);
+ }
+ reorderReuses(MaskOrder, Mask);
+ if (ShuffleVectorInst::isIdentityMask(MaskOrder)) {
+ Order.clear();
return;
- buildTree_rec(Roots, 0, EdgeInfo());
+ }
+ Order.assign(Mask.size(), Mask.size());
+ for (unsigned I = 0, E = Mask.size(); I < E; ++I)
+ if (MaskOrder[I] != UndefMaskElem)
+ Order[MaskOrder[I]] = I;
+ fixupOrderingIndices(Order);
+}
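// A worked case (illustrative): starting from an empty (identity) Order of
// size 4 and Mask == {1, 0, 3, 2}, MaskOrder starts as {0, 1, 2, 3} and
// becomes {1, 0, 3, 2} after reorderReuses. That is not an identity mask, so
// Order is rebuilt via Order[MaskOrder[I]] = I, yielding {1, 0, 3, 2}.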
+Optional<BoUpSLP::OrdersType>
+BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
+ assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
+ unsigned NumScalars = TE.Scalars.size();
+ OrdersType CurrentOrder(NumScalars, NumScalars);
+ SmallVector<int> Positions;
+ SmallBitVector UsedPositions(NumScalars);
+ const TreeEntry *STE = nullptr;
+ // Try to find all gathered scalars that get vectorized in another
+ // vectorized node. Here we can have only a single tree vector node to
+ // correctly identify the order of the gathered scalars.
+ for (unsigned I = 0; I < NumScalars; ++I) {
+ Value *V = TE.Scalars[I];
+ if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(V))
+ continue;
+ if (const auto *LocalSTE = getTreeEntry(V)) {
+ if (!STE)
+ STE = LocalSTE;
+ else if (STE != LocalSTE)
+ // Take the order only from the single vector node.
+ return None;
+ unsigned Lane =
+ std::distance(STE->Scalars.begin(), find(STE->Scalars, V));
+ if (Lane >= NumScalars)
+ return None;
+ if (CurrentOrder[Lane] != NumScalars) {
+ if (Lane != I)
+ continue;
+ UsedPositions.reset(CurrentOrder[Lane]);
+ }
+ // The partial identity (where only some elements of the gather node are
+ // in the identity order) is good.
+ CurrentOrder[Lane] = I;
+ UsedPositions.set(I);
+ }
+ }
+ // Need to keep the order if we have a vector entry and at least 2 scalars or
+ // the vectorized entry has just 2 scalars.
+ if (STE && (UsedPositions.count() > 1 || STE->Scalars.size() == 2)) {
+ auto &&IsIdentityOrder = [NumScalars](ArrayRef<unsigned> CurrentOrder) {
+ for (unsigned I = 0; I < NumScalars; ++I)
+ if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars)
+ return false;
+ return true;
+ };
+ if (IsIdentityOrder(CurrentOrder)) {
+ CurrentOrder.clear();
+ return CurrentOrder;
+ }
+ auto *It = CurrentOrder.begin();
+ for (unsigned I = 0; I < NumScalars;) {
+ if (UsedPositions.test(I)) {
+ ++I;
+ continue;
+ }
+ if (*It == NumScalars) {
+ *It = I;
+ ++I;
+ }
+ ++It;
+ }
+ return CurrentOrder;
+ }
+ return None;
+}
+
+void BoUpSLP::reorderTopToBottom() {
+ // Maps VF to the graph nodes.
+ DenseMap<unsigned, SmallPtrSet<TreeEntry *, 4>> VFToOrderedEntries;
+ // ExtractElement gather nodes which can be vectorized and need to handle
+ // their ordering.
+ DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
+ // Find all reorderable nodes with the given VF.
+ // Currently these are vectorized loads, extracts + some gathering of
+ // extracts.
+ for_each(VectorizableTree, [this, &VFToOrderedEntries, &GathersToOrders](
+ const std::unique_ptr<TreeEntry> &TE) {
+ // No need to reorder if the reuses need shuffling; the node still has to
+ // be shuffled anyway.
+ if (!TE->ReuseShuffleIndices.empty())
+ return;
+ if (TE->State == TreeEntry::Vectorize &&
+ isa<LoadInst, ExtractElementInst, ExtractValueInst, StoreInst,
+ InsertElementInst>(TE->getMainOp()) &&
+ !TE->isAltShuffle()) {
+ VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
+ return;
+ }
+ if (TE->State == TreeEntry::NeedToGather) {
+ if (TE->getOpcode() == Instruction::ExtractElement &&
+ !TE->isAltShuffle() &&
+ isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp())
+ ->getVectorOperandType()) &&
+ allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) {
+ // Check that gather of extractelements can be represented as
+ // just a shuffle of a single vector.
+ OrdersType CurrentOrder;
+ bool Reuse =
+ canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder);
+ if (Reuse || !CurrentOrder.empty()) {
+ VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
+ GathersToOrders.try_emplace(TE.get(), CurrentOrder);
+ return;
+ }
+ }
+ if (Optional<OrdersType> CurrentOrder =
+ findReusedOrderedScalars(*TE.get())) {
+ VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
+ GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
+ }
+ }
+ });
+
+ // Reorder the graph nodes according to their vectorization factor.
+ for (unsigned VF = VectorizableTree.front()->Scalars.size(); VF > 1;
+ VF /= 2) {
+ auto It = VFToOrderedEntries.find(VF);
+ if (It == VFToOrderedEntries.end())
+ continue;
+ // Try to find the most profitable order. We are just looking for the most
+ // used order and reorder the scalar elements in the nodes according to it.
+ const SmallPtrSetImpl<TreeEntry *> &OrderedEntries = It->getSecond();
+ // All operands are reordered and used only in this node - propagate the
+ // most used order to the user node.
+ MapVector<OrdersType, unsigned,
+ DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
+ OrdersUses;
+ SmallPtrSet<const TreeEntry *, 4> VisitedOps;
+ for (const TreeEntry *OpTE : OrderedEntries) {
+ // No need to reorder these nodes; they still need to be extended and
+ // shuffled, we just merge the reordering shuffle and the reuse shuffle.
+ if (!OpTE->ReuseShuffleIndices.empty())
+ continue;
+ // Count the number of uses of each order.
+ const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & {
+ if (OpTE->State == TreeEntry::NeedToGather)
+ return GathersToOrders.find(OpTE)->second;
+ return OpTE->ReorderIndices;
+ }();
+ // Stores actually store the mask, not the order, need to invert.
+ if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
+ OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
+ SmallVector<int> Mask;
+ inversePermutation(Order, Mask);
+ unsigned E = Order.size();
+ OrdersType CurrentOrder(E, E);
+ transform(Mask, CurrentOrder.begin(), [E](int Idx) {
+ return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx);
+ });
+ fixupOrderingIndices(CurrentOrder);
+ ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
+ } else {
+ ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
+ }
+ }
+ // Set order of the user node.
+ if (OrdersUses.empty())
+ continue;
+ // Choose the most used order.
+ ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
+ unsigned Cnt = OrdersUses.front().second;
+ for (const auto &Pair : drop_begin(OrdersUses)) {
+ if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
+ BestOrder = Pair.first;
+ Cnt = Pair.second;
+ }
+ }
+ // Set order of the user node.
+ if (BestOrder.empty())
+ continue;
+ SmallVector<int> Mask;
+ inversePermutation(BestOrder, Mask);
+ SmallVector<int> MaskOrder(BestOrder.size(), UndefMaskElem);
+ unsigned E = BestOrder.size();
+ transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
+ return I < E ? static_cast<int>(I) : UndefMaskElem;
+ });
+ // Do an actual reordering, if profitable.
+ for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+ // Just do the reordering for the nodes with the given VF.
+ if (TE->Scalars.size() != VF) {
+ if (TE->ReuseShuffleIndices.size() == VF) {
+ // Need to reorder the reuses masks of the operands with smaller VF to
+ // be able to find the match between the graph nodes and scalar
+ // operands of the given node during vectorization/cost estimation.
+ assert(all_of(TE->UserTreeIndices,
+ [VF, &TE](const EdgeInfo &EI) {
+ return EI.UserTE->Scalars.size() == VF ||
+ EI.UserTE->Scalars.size() ==
+ TE->Scalars.size();
+ }) &&
+ "All users must be of VF size.");
+ // Update the ordering of the operands that have a smaller VF than the
+ // given one.
+ reorderReuses(TE->ReuseShuffleIndices, Mask);
+ }
+ continue;
+ }
+ if (TE->State == TreeEntry::Vectorize &&
+ isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
+ InsertElementInst>(TE->getMainOp()) &&
+ !TE->isAltShuffle()) {
+ // Build correct orders for extract{element,value}, loads and
+ // stores.
+ reorderOrder(TE->ReorderIndices, Mask);
+ if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
+ TE->reorderOperands(Mask);
+ } else {
+ // Reorder the node and its operands.
+ TE->reorderOperands(Mask);
+ assert(TE->ReorderIndices.empty() &&
+ "Expected empty reorder sequence.");
+ reorderScalars(TE->Scalars, Mask);
+ }
+ if (!TE->ReuseShuffleIndices.empty()) {
+ // Apply the reversed order to keep the original ordering of the reused
+ // elements and avoid extra shuffling of the reorder indices.
+ OrdersType CurrentOrder;
+ reorderOrder(CurrentOrder, MaskOrder);
+ SmallVector<int> NewReuses;
+ inversePermutation(CurrentOrder, NewReuses);
+ addMask(NewReuses, TE->ReuseShuffleIndices);
+ TE->ReuseShuffleIndices.swap(NewReuses);
+ }
+ }
+ }
+}
+
+void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
+ SetVector<TreeEntry *> OrderedEntries;
+ DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
+ // Find all reorderable leaf nodes with the given VF.
+ // Currently these are vectorized loads, extracts without alternate
+ // operands + some gathering of extracts.
+ SmallVector<TreeEntry *> NonVectorized;
+ for_each(VectorizableTree, [this, &OrderedEntries, &GathersToOrders,
+ &NonVectorized](
+ const std::unique_ptr<TreeEntry> &TE) {
+ if (TE->State != TreeEntry::Vectorize)
+ NonVectorized.push_back(TE.get());
+ // No need to reorder if the reuses need shuffling; the node still has to
+ // be shuffled anyway.
+ if (!TE->ReuseShuffleIndices.empty())
+ return;
+ if (TE->State == TreeEntry::Vectorize &&
+ isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE->getMainOp()) &&
+ !TE->isAltShuffle()) {
+ OrderedEntries.insert(TE.get());
+ return;
+ }
+ if (TE->State == TreeEntry::NeedToGather) {
+ if (TE->getOpcode() == Instruction::ExtractElement &&
+ !TE->isAltShuffle() &&
+ isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp())
+ ->getVectorOperandType()) &&
+ allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) {
+ // Check that gather of extractelements can be represented as
+ // just a shuffle of a single vector with a single user only.
+ OrdersType CurrentOrder;
+ bool Reuse =
+ canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder);
+ if ((Reuse || !CurrentOrder.empty()) &&
+ !any_of(VectorizableTree,
+ [&TE](const std::unique_ptr<TreeEntry> &Entry) {
+ return Entry->State == TreeEntry::NeedToGather &&
+ Entry.get() != TE.get() &&
+ Entry->isSame(TE->Scalars);
+ })) {
+ OrderedEntries.insert(TE.get());
+ GathersToOrders.try_emplace(TE.get(), CurrentOrder);
+ return;
+ }
+ }
+ if (Optional<OrdersType> CurrentOrder =
+ findReusedOrderedScalars(*TE.get())) {
+ OrderedEntries.insert(TE.get());
+ GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
+ }
+ }
+ });
+
+ // Checks if the operands of the users are reorderable and have only a
+ // single use.
+ auto &&CheckOperands =
+ [this, &NonVectorized](const auto &Data,
+ SmallVectorImpl<TreeEntry *> &GatherOps) {
+ for (unsigned I = 0, E = Data.first->getNumOperands(); I < E; ++I) {
+ if (any_of(Data.second,
+ [I](const std::pair<unsigned, TreeEntry *> &OpData) {
+ return OpData.first == I &&
+ OpData.second->State == TreeEntry::Vectorize;
+ }))
+ continue;
+ ArrayRef<Value *> VL = Data.first->getOperand(I);
+ const TreeEntry *TE = nullptr;
+ const auto *It = find_if(VL, [this, &TE](Value *V) {
+ TE = getTreeEntry(V);
+ return TE;
+ });
+ if (It != VL.end() && TE->isSame(VL))
+ return false;
+ TreeEntry *Gather = nullptr;
+ if (count_if(NonVectorized, [VL, &Gather](TreeEntry *TE) {
+ assert(TE->State != TreeEntry::Vectorize &&
+ "Only non-vectorized nodes are expected.");
+ if (TE->isSame(VL)) {
+ Gather = TE;
+ return true;
+ }
+ return false;
+ }) > 1)
+ return false;
+ if (Gather)
+ GatherOps.push_back(Gather);
+ }
+ return true;
+ };
+ // 1. Propagate the order to the graph nodes that use only reordered nodes.
+ // I.e., if a node has operands that are reordered, try to bring at least
+ // one operand into the natural order and reorder the others + reorder the
+ // user node itself.
+ SmallPtrSet<const TreeEntry *, 4> Visited;
+ while (!OrderedEntries.empty()) {
+ // 1. Filter out the non-reordered nodes.
+ // 2. If the entry has multiple uses, skip it and jump to the next node.
+ MapVector<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
+ SmallVector<TreeEntry *> Filtered;
+ for (TreeEntry *TE : OrderedEntries) {
+ if (!(TE->State == TreeEntry::Vectorize ||
+ (TE->State == TreeEntry::NeedToGather &&
+ GathersToOrders.count(TE))) ||
+ TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
+ !all_of(drop_begin(TE->UserTreeIndices),
+ [TE](const EdgeInfo &EI) {
+ return EI.UserTE == TE->UserTreeIndices.front().UserTE;
+ }) ||
+ !Visited.insert(TE).second) {
+ Filtered.push_back(TE);
+ continue;
+ }
+ // Build a map between user nodes and their operand order to speed up the
+ // search. The graph currently does not provide this dependency directly.
+ for (EdgeInfo &EI : TE->UserTreeIndices) {
+ TreeEntry *UserTE = EI.UserTE;
+ auto It = Users.find(UserTE);
+ if (It == Users.end())
+ It = Users.insert({UserTE, {}}).first;
+ It->second.emplace_back(EI.EdgeIdx, TE);
+ }
+ }
+ // Erase filtered entries.
+ for_each(Filtered,
+ [&OrderedEntries](TreeEntry *TE) { OrderedEntries.remove(TE); });
+ for (const auto &Data : Users) {
+ // Check that operands are used only in the User node.
+ SmallVector<TreeEntry *> GatherOps;
+ if (!CheckOperands(Data, GatherOps)) {
+ for_each(Data.second,
+ [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
+ OrderedEntries.remove(Op.second);
+ });
+ continue;
+ }
+ // All operands are reordered and used only in this node - propagate the
+ // most used order to the user node.
+ MapVector<OrdersType, unsigned,
+ DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
+ OrdersUses;
+ SmallPtrSet<const TreeEntry *, 4> VisitedOps;
+ for (const auto &Op : Data.second) {
+ TreeEntry *OpTE = Op.second;
+ if (!OpTE->ReuseShuffleIndices.empty() ||
+ (IgnoreReorder && OpTE == VectorizableTree.front().get()))
+ continue;
+ const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & {
+ if (OpTE->State == TreeEntry::NeedToGather)
+ return GathersToOrders.find(OpTE)->second;
+ return OpTE->ReorderIndices;
+ }();
+ // Stores actually store the mask, not the order, need to invert.
+ if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
+ OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
+ SmallVector<int> Mask;
+ inversePermutation(Order, Mask);
+ unsigned E = Order.size();
+ OrdersType CurrentOrder(E, E);
+ transform(Mask, CurrentOrder.begin(), [E](int Idx) {
+ return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx);
+ });
+ fixupOrderingIndices(CurrentOrder);
+ ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
+ } else {
+ ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
+ }
+ if (VisitedOps.insert(OpTE).second)
+ OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
+ OpTE->UserTreeIndices.size();
+ assert(OrdersUses[{}] > 0 && "Counter cannot be less than 0.");
+ --OrdersUses[{}];
+ }
+ // If there are no orders, skip the current nodes and jump to the next
+ // ones, if any.
+ if (OrdersUses.empty()) {
+ for_each(Data.second,
+ [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
+ OrderedEntries.remove(Op.second);
+ });
+ continue;
+ }
+ // Choose the best order.
+ ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
+ unsigned Cnt = OrdersUses.front().second;
+ for (const auto &Pair : drop_begin(OrdersUses)) {
+ if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
+ BestOrder = Pair.first;
+ Cnt = Pair.second;
+ }
+ }
+ // Set order of the user node (reordering of operands and user nodes).
+ if (BestOrder.empty()) {
+ for_each(Data.second,
+ [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
+ OrderedEntries.remove(Op.second);
+ });
+ continue;
+ }
+ // Erase operands from OrderedEntries list and adjust their orders.
+ VisitedOps.clear();
+ SmallVector<int> Mask;
+ inversePermutation(BestOrder, Mask);
+ SmallVector<int> MaskOrder(BestOrder.size(), UndefMaskElem);
+ unsigned E = BestOrder.size();
+ transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
+ return I < E ? static_cast<int>(I) : UndefMaskElem;
+ });
+ for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
+ TreeEntry *TE = Op.second;
+ OrderedEntries.remove(TE);
+ if (!VisitedOps.insert(TE).second)
+ continue;
+ if (!TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty()) {
+ // Just reorder reuses indices.
+ reorderReuses(TE->ReuseShuffleIndices, Mask);
+ continue;
+ }
+ // Gathers are processed separately.
+ if (TE->State != TreeEntry::Vectorize)
+ continue;
+ assert((BestOrder.size() == TE->ReorderIndices.size() ||
+ TE->ReorderIndices.empty()) &&
+ "Non-matching sizes of user/operand entries.");
+ reorderOrder(TE->ReorderIndices, Mask);
+ }
+ // For gathers, we just need to reorder their scalars.
+ for (TreeEntry *Gather : GatherOps) {
+ assert(Gather->ReorderIndices.empty() &&
+ "Unexpected reordering of gathers.");
+ if (!Gather->ReuseShuffleIndices.empty()) {
+ // Just reorder reuses indices.
+ reorderReuses(Gather->ReuseShuffleIndices, Mask);
+ continue;
+ }
+ reorderScalars(Gather->Scalars, Mask);
+ OrderedEntries.remove(Gather);
+ }
+ // Reorder operands of the user node and set the ordering for the user
+ // node itself.
+ if (Data.first->State != TreeEntry::Vectorize ||
+ !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
+ Data.first->getMainOp()) ||
+ Data.first->isAltShuffle())
+ Data.first->reorderOperands(Mask);
+ if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
+ Data.first->isAltShuffle()) {
+ reorderScalars(Data.first->Scalars, Mask);
+ reorderOrder(Data.first->ReorderIndices, MaskOrder);
+ if (Data.first->ReuseShuffleIndices.empty() &&
+ !Data.first->ReorderIndices.empty() &&
+ !Data.first->isAltShuffle()) {
+ // Insert the user node into the list to try to sink the reordering
+ // deeper into the graph.
+ OrderedEntries.insert(Data.first);
+ }
+ } else {
+ reorderOrder(Data.first->ReorderIndices, Mask);
+ }
+ }
+ }
+ // If the reordering is unnecessary, just remove the reorder.
+ if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
+ VectorizableTree.front()->ReuseShuffleIndices.empty())
+ VectorizableTree.front()->ReorderIndices.clear();
+}
+
+void BoUpSLP::buildExternalUses(
+ const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
// Collect the values that we need to extract from the tree.
for (auto &TEPtr : VectorizableTree) {
TreeEntry *Entry = TEPtr.get();
@@ -2636,6 +3215,9 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
if (!UserInst)
continue;
+ if (isDeleted(UserInst))
+ continue;
+
// Skip in-tree scalars that become vectors
if (TreeEntry *UseEntry = getTreeEntry(U)) {
Value *UseScalar = UseEntry->Scalars[0];
@@ -2664,14 +3246,120 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
}
}
+void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
+ ArrayRef<Value *> UserIgnoreLst) {
+ deleteTree();
+ UserIgnoreList = UserIgnoreLst;
+ if (!allSameType(Roots))
+ return;
+ buildTree_rec(Roots, 0, EdgeInfo());
+}
+
+namespace {
+/// Tracks how the loads in the given sequence can be represented: as a plain
+/// gather, as consecutive vectorized loads, or as a scatter-vectorized
+/// (masked gather) load.
+enum class LoadsState { Gather, Vectorize, ScatterVectorize };
+} // anonymous namespace
+
+/// Checks if the given array of loads can be represented as consecutive
+/// vectorized loads, a scatter vectorization, or just a simple gather.
+static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
+ const TargetTransformInfo &TTI,
+ const DataLayout &DL, ScalarEvolution &SE,
+ SmallVectorImpl<unsigned> &Order,
+ SmallVectorImpl<Value *> &PointerOps) {
+ // Check that a vectorized load would load the same memory as a scalar
+ // load. For example, we don't want to vectorize loads that are smaller
+ // than 8 bits. Even though we have a packed struct {<i2, i2, i2, i2>},
+ // LLVM treats loading/storing it as an i8 struct. If we vectorize
+ // loads/stores from such a struct, we read/write packed bits disagreeing
+ // with the unvectorized version.
+ Type *ScalarTy = VL0->getType();
+
+ if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy))
+ return LoadsState::Gather;
+
+ // Make sure all loads in the bundle are simple - we can't vectorize
+ // atomic or volatile loads.
+ PointerOps.clear();
+ PointerOps.resize(VL.size());
+ auto *POIter = PointerOps.begin();
+ for (Value *V : VL) {
+ auto *L = cast<LoadInst>(V);
+ if (!L->isSimple())
+ return LoadsState::Gather;
+ *POIter = L->getPointerOperand();
+ ++POIter;
+ }
+
+ Order.clear();
+ // Check the order of pointer operands.
+ if (llvm::sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order)) {
+ Value *Ptr0;
+ Value *PtrN;
+ if (Order.empty()) {
+ Ptr0 = PointerOps.front();
+ PtrN = PointerOps.back();
+ } else {
+ Ptr0 = PointerOps[Order.front()];
+ PtrN = PointerOps[Order.back()];
+ }
+ Optional<int> Diff =
+ getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
+ // Check that the sorted loads are consecutive.
+ if (static_cast<unsigned>(*Diff) == VL.size() - 1)
+ return LoadsState::Vectorize;
+ Align CommonAlignment = cast<LoadInst>(VL0)->getAlign();
+ for (Value *V : VL)
+ CommonAlignment =
+ commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign());
+ if (TTI.isLegalMaskedGather(FixedVectorType::get(ScalarTy, VL.size()),
+ CommonAlignment))
+ return LoadsState::ScatterVectorize;
+ }
+
+ return LoadsState::Gather;
+}
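// Roughly, for a bundle of loads from a base pointer p (an illustration of
// the three outcomes, assuming all loads are simple):
//   load p[0], p[1], p[2], p[3]  -> LoadsState::Vectorize (consecutive)
//   load p[2], p[0], p[3], p[1]  -> Vectorize, with Order giving the sorted
//                                   positions of the pointers
//   load p[0], p[4], p[8], p[12] -> ScatterVectorize, if the target reports
//                                   a legal masked gather for <4 x Ty>
//   volatile/atomic loads, or pointers that cannot be sorted -> Gather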
+
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
const EdgeInfo &UserTreeIdx) {
assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
+ SmallVector<int> ReuseShuffleIndicies;
+ SmallVector<Value *> UniqueValues;
+ auto &&TryToFindDuplicates = [&VL, &ReuseShuffleIndicies, &UniqueValues,
+ &UserTreeIdx,
+ this](const InstructionsState &S) {
+ // Check that every instruction appears once in this bundle.
+ DenseMap<Value *, unsigned> UniquePositions;
+ for (Value *V : VL) {
+ auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
+ ReuseShuffleIndicies.emplace_back(isa<UndefValue>(V) ? -1
+ : Res.first->second);
+ if (Res.second)
+ UniqueValues.emplace_back(V);
+ }
+ size_t NumUniqueScalarValues = UniqueValues.size();
+ if (NumUniqueScalarValues == VL.size()) {
+ ReuseShuffleIndicies.clear();
+ } else {
+ LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
+ if (NumUniqueScalarValues <= 1 ||
+ !llvm::isPowerOf2_32(NumUniqueScalarValues)) {
+ LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return false;
+ }
+ VL = UniqueValues;
+ }
+ return true;
+ };
+
InstructionsState S = getSameOpcode(VL);
if (Depth == RecursionMaxDepth) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ if (TryToFindDuplicates(S))
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
return;
}
@@ -2680,7 +3368,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
isa<ScalableVectorType>(
cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ if (TryToFindDuplicates(S))
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
return;
}
@@ -2700,9 +3390,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
// If all of the operands are identical or constant we have a simple solution.
- if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) {
+ // If we deal with insert/extract instructions, they all must have constant
+ // indices; otherwise we should gather them, not try to vectorize them.
+ if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode() ||
+ (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(S.MainOp) &&
+ !all_of(VL, isVectorLikeInstWithConstOps))) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ if (TryToFindDuplicates(S))
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
return;
}
@@ -2724,7 +3420,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
if (!E->isSame(VL)) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ if (TryToFindDuplicates(S))
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
return;
}
// Record the reuse of the tree node. FIXME, currently this is only used to
@@ -2743,7 +3441,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (getTreeEntry(I)) {
LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
<< ") is already in tree.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ if (TryToFindDuplicates(S))
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
return;
}
}
@@ -2754,7 +3454,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (Value *V : VL) {
if (MustGather.count(V) || is_contained(UserIgnoreList, V)) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ if (TryToFindDuplicates(S))
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
return;
}
}
@@ -2773,28 +3475,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
// Check that every instruction appears once in this bundle.
- SmallVector<unsigned, 4> ReuseShuffleIndicies;
- SmallVector<Value *, 4> UniqueValues;
- DenseMap<Value *, unsigned> UniquePositions;
- for (Value *V : VL) {
- auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
- ReuseShuffleIndicies.emplace_back(Res.first->second);
- if (Res.second)
- UniqueValues.emplace_back(V);
- }
- size_t NumUniqueScalarValues = UniqueValues.size();
- if (NumUniqueScalarValues == VL.size()) {
- ReuseShuffleIndicies.clear();
- } else {
- LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
- if (NumUniqueScalarValues <= 1 ||
- !llvm::isPowerOf2_32(NumUniqueScalarValues)) {
- LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
- return;
- }
- VL = UniqueValues;
- }
+ if (!TryToFindDuplicates(S))
+ return;
auto &BSRef = BlocksSchedules[BB];
if (!BSRef)
@@ -2867,7 +3549,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
if (Reuse) {
LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
- ++NumOpsWantToKeepOriginalOrder;
newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
// This is a special case, as it does not gather, but at the same time
@@ -2885,12 +3566,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
dbgs() << " " << Idx;
dbgs() << "\n";
});
+ fixupOrderingIndices(CurrentOrder);
// Insert new order with initial value 0, if it does not exist,
// otherwise return the iterator to the existing one.
newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies, CurrentOrder);
- findRootOrder(CurrentOrder);
- ++NumOpsWantToKeepOrder[CurrentOrder];
// This is a special case, as it does not gather, but at the same time
// we are not extending buildTree_rec() towards the operands.
ValueList Op0;
@@ -2910,8 +3590,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Check that we have a buildvector and not a shuffle of 2 or more
// different vectors.
ValueSet SourceVectors;
- for (Value *V : VL)
+ int MinIdx = std::numeric_limits<int>::max();
+ for (Value *V : VL) {
SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
+ Optional<int> Idx = getInsertIndex(V, 0);
+ if (!Idx || *Idx == UndefMaskElem)
+ continue;
+ MinIdx = std::min(MinIdx, *Idx);
+ }
if (count_if(VL, [&SourceVectors](Value *V) {
return !SourceVectors.contains(V);
@@ -2919,13 +3605,35 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Found 2nd source vector - cancel.
LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
"different source vectors.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
BS.cancelScheduling(VL, VL0);
return;
}
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx);
+ auto OrdCompare = [](const std::pair<int, int> &P1,
+ const std::pair<int, int> &P2) {
+ return P1.first > P2.first;
+ };
+ PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
+ decltype(OrdCompare)>
+ Indices(OrdCompare);
+ for (int I = 0, E = VL.size(); I < E; ++I) {
+ Optional<int> Idx = getInsertIndex(VL[I], 0);
+ if (!Idx || *Idx == UndefMaskElem)
+ continue;
+ Indices.emplace(*Idx, I);
+ }
+ OrdersType CurrentOrder(VL.size(), VL.size());
+ bool IsIdentity = true;
+ for (int I = 0, E = VL.size(); I < E; ++I) {
+ CurrentOrder[Indices.top().second] = I;
+ IsIdentity &= Indices.top().second == I;
+ Indices.pop();
+ }
+ if (IsIdentity)
+ CurrentOrder.clear();
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ None, CurrentOrder);
LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
constexpr int NumOps = 2;
@@ -2936,7 +3644,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
TE->setOperand(I, VectorOperands[I]);
}
- buildTree_rec(VectorOperands[NumOps - 1], Depth + 1, {TE, 0});
+ buildTree_rec(VectorOperands[NumOps - 1], Depth + 1, {TE, NumOps - 1});
return;
}
case Instruction::Load: {
@@ -2946,90 +3654,52 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// treats loading/storing it as an i8 struct. If we vectorize loads/stores
// from such a struct, we read/write packed bits disagreeing with the
// unvectorized version.
- Type *ScalarTy = VL0->getType();
-
- if (DL->getTypeSizeInBits(ScalarTy) !=
- DL->getTypeAllocSizeInBits(ScalarTy)) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
- return;
- }
-
- // Make sure all loads in the bundle are simple - we can't vectorize
- // atomic or volatile loads.
- SmallVector<Value *, 4> PointerOps(VL.size());
- auto POIter = PointerOps.begin();
- for (Value *V : VL) {
- auto *L = cast<LoadInst>(V);
- if (!L->isSimple()) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
- return;
- }
- *POIter = L->getPointerOperand();
- ++POIter;
- }
-
+ SmallVector<Value *> PointerOps;
OrdersType CurrentOrder;
- // Check the order of pointer operands.
- if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
- Value *Ptr0;
- Value *PtrN;
+ TreeEntry *TE = nullptr;
+ switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, CurrentOrder,
+ PointerOps)) {
+ case LoadsState::Vectorize:
if (CurrentOrder.empty()) {
- Ptr0 = PointerOps.front();
- PtrN = PointerOps.back();
+ // Original loads are consecutive and do not require reordering.
+ TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
} else {
- Ptr0 = PointerOps[CurrentOrder.front()];
- PtrN = PointerOps[CurrentOrder.back()];
- }
- Optional<int> Diff = getPointersDiff(
- ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
- // Check that the sorted loads are consecutive.
- if (static_cast<unsigned>(*Diff) == VL.size() - 1) {
- if (CurrentOrder.empty()) {
- // Original loads are consecutive and does not require reordering.
- ++NumOpsWantToKeepOriginalOrder;
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
- UserTreeIdx, ReuseShuffleIndicies);
- TE->setOperandsInOrder();
- LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
- } else {
- // Need to reorder.
- TreeEntry *TE =
- newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies, CurrentOrder);
- TE->setOperandsInOrder();
- LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
- findRootOrder(CurrentOrder);
- ++NumOpsWantToKeepOrder[CurrentOrder];
- }
- return;
- }
- Align CommonAlignment = cast<LoadInst>(VL0)->getAlign();
- for (Value *V : VL)
- CommonAlignment =
- commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign());
- if (TTI->isLegalMaskedGather(FixedVectorType::get(ScalarTy, VL.size()),
- CommonAlignment)) {
- // Vectorizing non-consecutive loads with `llvm.masked.gather`.
- TreeEntry *TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle,
- S, UserTreeIdx, ReuseShuffleIndicies);
- TE->setOperandsInOrder();
- buildTree_rec(PointerOps, Depth + 1, {TE, 0});
- LLVM_DEBUG(dbgs()
- << "SLP: added a vector of non-consecutive loads.\n");
- return;
+ fixupOrderingIndices(CurrentOrder);
+ // Need to reorder.
+ TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies, CurrentOrder);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
}
+ TE->setOperandsInOrder();
+ break;
+ case LoadsState::ScatterVectorize:
+ // Vectorizing non-consecutive loads with `llvm.masked.gather`.
+ TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
+ UserTreeIdx, ReuseShuffleIndicies);
+ TE->setOperandsInOrder();
+ buildTree_rec(PointerOps, Depth + 1, {TE, 0});
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
+ break;
+ case LoadsState::Gather:
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+#ifndef NDEBUG
+ Type *ScalarTy = VL0->getType();
+ if (DL->getTypeSizeInBits(ScalarTy) !=
+ DL->getTypeAllocSizeInBits(ScalarTy))
+ LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
+ else if (any_of(VL, [](Value *V) {
+ return !cast<LoadInst>(V)->isSimple();
+ }))
+ LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
+ else
+ LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
+#endif // NDEBUG
+ break;
}
-
- LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
return;
}
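The switch above replaces what used to be three separate bail-out paths with one classification. A sketch of the assumed decision order, flattened into booleans (the real canVectorizeLoads inspects the bundle, TTI, DataLayout and scalar evolution):

  enum class LoadsState { Gather, Vectorize, ScatterVectorize };

  // Non-packed or non-simple (atomic/volatile) loads can only be gathered;
  // consecutive sorted pointers become one wide load; otherwise a masked
  // gather is used when the target makes it legal.
  LoadsState classifyLoads(bool PackedType, bool AllSimple, bool Consecutive,
                           bool MaskedGatherLegal) {
    if (!PackedType || !AllSimple)
      return LoadsState::Gather;
    if (Consecutive)
      return LoadsState::Vectorize;
    if (MaskedGatherLegal)
      return LoadsState::ScatterVectorize;
    return LoadsState::Gather;
  }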
case Instruction::ZExt:
@@ -3213,15 +3883,40 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
- TE->setOperandsInOrder();
- for (unsigned i = 0, e = 2; i < e; ++i) {
- ValueList Operands;
- // Prepare the operand vector.
- for (Value *V : VL)
- Operands.push_back(cast<Instruction>(V)->getOperand(i));
-
- buildTree_rec(Operands, Depth + 1, {TE, i});
+ SmallVector<ValueList, 2> Operands(2);
+ // Prepare the operand vector for pointer operands.
+ for (Value *V : VL)
+ Operands.front().push_back(
+ cast<GetElementPtrInst>(V)->getPointerOperand());
+ TE->setOperand(0, Operands.front());
+ // Need to cast all indices to the same type before vectorization to
+ // avoid a crash.
+ // Required to be able to find correct matches between different gather
+ // nodes and reuse the vectorized values rather than trying to gather them
+ // again.
+ int IndexIdx = 1;
+ Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
+ Type *Ty = all_of(VL,
+ [VL0Ty, IndexIdx](Value *V) {
+ return VL0Ty == cast<GetElementPtrInst>(V)
+ ->getOperand(IndexIdx)
+ ->getType();
+ })
+ ? VL0Ty
+ : DL->getIndexType(cast<GetElementPtrInst>(VL0)
+ ->getPointerOperandType()
+ ->getScalarType());
+ // Prepare the operand vector.
+ for (Value *V : VL) {
+ auto *Op = cast<Instruction>(V)->getOperand(IndexIdx);
+ auto *CI = cast<ConstantInt>(Op);
+ Operands.back().push_back(ConstantExpr::getIntegerCast(
+ CI, Ty, CI->getValue().isSignBitSet()));
}
+ TE->setOperand(IndexIdx, Operands.back());
+
+ for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
+ buildTree_rec(Operands[I], Depth + 1, {TE, I});
return;
}
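The common-index-type rule above is easy to state on its own: keep VL0's index type if every GEP in the bundle agrees on it, otherwise fall back to the DataLayout's index type for the pointer. A sketch with types reduced to integer bit widths (names hypothetical):

  #include <algorithm>
  #include <vector>

  unsigned pickCommonIndexWidth(const std::vector<unsigned> &IndexWidths,
                                unsigned DLIndexWidth) {
    // All GEPs agree on the first bundle member's index type -> keep it;
    // otherwise every constant index is cast to the DataLayout index type,
    // sign-extending when its sign bit is set (as in the loop above).
    bool AllSame =
        std::all_of(IndexWidths.begin(), IndexWidths.end(),
                    [&](unsigned W) { return W == IndexWidths.front(); });
    return AllSame ? IndexWidths.front() : DLIndexWidth;
  }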
case Instruction::Store: {
@@ -3276,21 +3971,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (static_cast<unsigned>(*Dist) == VL.size() - 1) {
if (CurrentOrder.empty()) {
// Original stores are consecutive and do not require reordering.
- ++NumOpsWantToKeepOriginalOrder;
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
UserTreeIdx, ReuseShuffleIndicies);
TE->setOperandsInOrder();
buildTree_rec(Operands, Depth + 1, {TE, 0});
LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
} else {
+ fixupOrderingIndices(CurrentOrder);
TreeEntry *TE =
newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies, CurrentOrder);
TE->setOperandsInOrder();
buildTree_rec(Operands, Depth + 1, {TE, 0});
LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
- findRootOrder(CurrentOrder);
- ++NumOpsWantToKeepOrder[CurrentOrder];
}
return;
}
@@ -3321,7 +4014,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return;
}
Function *F = CI->getCalledFunction();
- unsigned NumArgs = CI->getNumArgOperands();
+ unsigned NumArgs = CI->arg_size();
SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr);
for (unsigned j = 0; j != NumArgs; ++j)
if (hasVectorInstrinsicScalarOpd(ID, j))
@@ -3373,7 +4066,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
TE->setOperandsInOrder();
- for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
+ for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) {
+ // For scalar operands there is no need to create an entry since they
+ // do not need to be vectorized.
+ if (hasVectorInstrinsicScalarOpd(ID, i))
+ continue;
ValueList Operands;
// Prepare the operand vector.
for (Value *V : VL) {
@@ -3548,7 +4245,7 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
FastMathFlags FMF;
if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
FMF = FPCI->getFastMathFlags();
- SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end());
+ SmallVector<const Value *> Arguments(CI->args());
IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, VecTys, FMF,
dyn_cast<IntrinsicInst>(CI));
auto IntrinsicCost =
@@ -3621,25 +4318,42 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
return Cost;
}
-/// Shuffles \p Mask in accordance with the given \p SubMask.
-static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) {
- if (SubMask.empty())
- return;
- if (Mask.empty()) {
- Mask.append(SubMask.begin(), SubMask.end());
- return;
- }
- SmallVector<int, 4> NewMask(SubMask.size(), SubMask.size());
- int TermValue = std::min(Mask.size(), SubMask.size());
- for (int I = 0, E = SubMask.size(); I < E; ++I) {
- if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem ||
- Mask[SubMask[I]] >= TermValue) {
- NewMask[I] = UndefMaskElem;
- continue;
+/// Builds the shuffle mask for a shuffle graph entry and collects the lists
+/// of the main and alternate operations' operands.
+static void
+buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices,
+ ArrayRef<int> ReusesIndices,
+ const function_ref<bool(Instruction *)> IsAltOp,
+ SmallVectorImpl<int> &Mask,
+ SmallVectorImpl<Value *> *OpScalars = nullptr,
+ SmallVectorImpl<Value *> *AltScalars = nullptr) {
+ unsigned Sz = VL.size();
+ Mask.assign(Sz, UndefMaskElem);
+ SmallVector<int> OrderMask;
+ if (!ReorderIndices.empty())
+ inversePermutation(ReorderIndices, OrderMask);
+ for (unsigned I = 0; I < Sz; ++I) {
+ unsigned Idx = I;
+ if (!ReorderIndices.empty())
+ Idx = OrderMask[I];
+ auto *OpInst = cast<Instruction>(VL[Idx]);
+ if (IsAltOp(OpInst)) {
+ Mask[I] = Sz + Idx;
+ if (AltScalars)
+ AltScalars->push_back(OpInst);
+ } else {
+ Mask[I] = Idx;
+ if (OpScalars)
+ OpScalars->push_back(OpInst);
}
- NewMask[I] = Mask[SubMask[I]];
}
- Mask.swap(NewMask);
+ if (!ReusesIndices.empty()) {
+ SmallVector<int> NewMask(ReusesIndices.size(), UndefMaskElem);
+ transform(ReusesIndices, NewMask.begin(), [&Mask](int Idx) {
+ return Idx != UndefMaskElem ? Mask[Idx] : UndefMaskElem;
+ });
+ Mask.swap(NewMask);
+ }
}
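A worked example, assuming empty reorder and reuse indices: for VL = {add, sub, add, sub} and Sz = 4 the function produces Mask = {0, 5, 2, 7}, a blend of the "all main ops" vector with the "all alternate ops" vector. The inner loop then reduces to:

  #include <functional>
  #include <vector>

  // Core of buildShuffleEntryMask when no reordering/reuses are involved:
  // main-op lanes select from the first source (I), alternate-op lanes from
  // the second (Sz + I).
  std::vector<int> buildAltMask(unsigned Sz,
                                const std::function<bool(unsigned)> &IsAltLane) {
    std::vector<int> Mask(Sz);
    for (unsigned I = 0; I < Sz; ++I)
      Mask[I] = IsAltLane(I) ? static_cast<int>(Sz + I) : static_cast<int>(I);
    return Mask;
  }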
InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
@@ -3661,13 +4375,10 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
if (MinBWs.count(VL[0]))
VecTy = FixedVectorType::get(
IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
- auto *FinalVecTy = VecTy;
+ unsigned EntryVF = E->getVectorFactor();
+ auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF);
- unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
- if (NeedToShuffleReuses)
- FinalVecTy =
- FixedVectorType::get(VecTy->getElementType(), ReuseShuffleNumbers);
// FIXME: it tries to fix a problem with MSVC buildbots.
TargetTransformInfo &TTIRef = *TTI;
auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL, VecTy,
@@ -3785,7 +4496,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
// shuffle of a single/two vectors the scalars are extracted from.
SmallVector<int> Mask;
Optional<TargetTransformInfo::ShuffleKind> ShuffleKind =
- isShuffle(VL, Mask);
+ isFixedVectorShuffle(VL, Mask);
if (ShuffleKind.hasValue()) {
// Found the bunch of extractelement instructions that must be gathered
// into a vector and can be represented as a permutation elements in a
@@ -3803,6 +4514,92 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
if (NeedToShuffleReuses)
ReuseShuffleCost = TTI->getShuffleCost(
TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices);
+ // Improve gather cost for gather of loads, if we can group some of the
+ // loads into vector loads.
+ if (VL.size() > 2 && E->getOpcode() == Instruction::Load &&
+ !E->isAltShuffle()) {
+ BoUpSLP::ValueSet VectorizedLoads;
+ unsigned StartIdx = 0;
+ unsigned VF = VL.size() / 2;
+ unsigned VectorizedCnt = 0;
+ unsigned ScatterVectorizeCnt = 0;
+ const unsigned Sz = DL->getTypeSizeInBits(E->getMainOp()->getType());
+ for (unsigned MinVF = getMinVF(2 * Sz); VF >= MinVF; VF /= 2) {
+ for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
+ Cnt += VF) {
+ ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
+ if (!VectorizedLoads.count(Slice.front()) &&
+ !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
+ SmallVector<Value *> PointerOps;
+ OrdersType CurrentOrder;
+ LoadsState LS = canVectorizeLoads(Slice, Slice.front(), *TTI, *DL,
+ *SE, CurrentOrder, PointerOps);
+ switch (LS) {
+ case LoadsState::Vectorize:
+ case LoadsState::ScatterVectorize:
+ // Mark the vectorized loads so that we don't vectorize them
+ // again.
+ if (LS == LoadsState::Vectorize)
+ ++VectorizedCnt;
+ else
+ ++ScatterVectorizeCnt;
+ VectorizedLoads.insert(Slice.begin(), Slice.end());
+ // If we vectorized the initial block, no need to try to vectorize it
+ // again.
+ if (Cnt == StartIdx)
+ StartIdx += VF;
+ break;
+ case LoadsState::Gather:
+ break;
+ }
+ }
+ }
+ // Check if the whole array was vectorized already - exit.
+ if (StartIdx >= VL.size())
+ break;
+ // Found vectorizable parts - exit.
+ if (!VectorizedLoads.empty())
+ break;
+ }
+ if (!VectorizedLoads.empty()) {
+ InstructionCost GatherCost = 0;
+ unsigned NumParts = TTI->getNumberOfParts(VecTy);
+ bool NeedInsertSubvectorAnalysis =
+ !NumParts || (VL.size() / VF) > NumParts;
+ // Get the cost for gathered loads.
+ for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
+ if (VectorizedLoads.contains(VL[I]))
+ continue;
+ GatherCost += getGatherCost(VL.slice(I, VF));
+ }
+ // The cost for vectorized loads.
+ InstructionCost ScalarsCost = 0;
+ for (Value *V : VectorizedLoads) {
+ auto *LI = cast<LoadInst>(V);
+ ScalarsCost += TTI->getMemoryOpCost(
+ Instruction::Load, LI->getType(), LI->getAlign(),
+ LI->getPointerAddressSpace(), CostKind, LI);
+ }
+ auto *LI = cast<LoadInst>(E->getMainOp());
+ auto *LoadTy = FixedVectorType::get(LI->getType(), VF);
+ Align Alignment = LI->getAlign();
+ GatherCost +=
+ VectorizedCnt *
+ TTI->getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
+ LI->getPointerAddressSpace(), CostKind, LI);
+ GatherCost += ScatterVectorizeCnt *
+ TTI->getGatherScatterOpCost(
+ Instruction::Load, LoadTy, LI->getPointerOperand(),
+ /*VariableMask=*/false, Alignment, CostKind, LI);
+ if (NeedInsertSubvectorAnalysis) {
+ // Add the cost for the subvectors insert.
+ for (int I = VF, E = VL.size(); I < E; I += VF)
+ GatherCost += TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy,
+ None, I, LoadTy);
+ }
+ return ReuseShuffleCost + GatherCost - ScalarsCost;
+ }
+ }
return ReuseShuffleCost + getGatherCost(VL);
}
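The adjustment above amounts to a simple trade. A back-of-envelope model with hypothetical unit costs, assuming every VF-wide slice vectorizes as a plain wide load (the real code mixes plain loads, masked gathers and leftover gathered slices via TTI queries):

  // Grouping replaces NumLoads scalar loads with NumLoads / VF vector loads
  // plus the shuffles that stitch the groups back into one vector.
  int groupedGatherCost(int NumLoads, int VF, int VecLoadCost,
                        int ScalarLoadCost, int InsertSubvecCost) {
    int NumGroups = NumLoads / VF;
    int Cost = NumGroups * VecLoadCost - NumLoads * ScalarLoadCost;
    Cost += (NumGroups - 1) * InsertSubvecCost; // one insert per tail group
    return Cost;
  }

With NumLoads = 8, VF = 4, VecLoadCost = 2, ScalarLoadCost = 1 and InsertSubvecCost = 1 this yields 2 * 2 - 8 + 1 = -3, i.e. a net win over scalarized gathering.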
InstructionCost CommonCost = 0;
@@ -3852,7 +4649,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
++Idx;
}
}
- Idx = ReuseShuffleNumbers;
+ Idx = EntryVF;
for (Value *V : VL) {
if (ShuffleOrOp == Instruction::ExtractElement) {
auto *EE = cast<ExtractElementInst>(V);
@@ -3895,29 +4692,33 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
return CommonCost;
}
case Instruction::InsertElement: {
+ assert(E->ReuseShuffleIndices.empty() &&
+ "Unique insertelements only are expected.");
auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
unsigned const NumElts = SrcVecTy->getNumElements();
unsigned const NumScalars = VL.size();
- APInt DemandedElts = APInt::getNullValue(NumElts);
+ APInt DemandedElts = APInt::getZero(NumElts);
// TODO: Add support for Instruction::InsertValue.
- unsigned Offset = UINT_MAX;
+ SmallVector<int> Mask;
+ if (!E->ReorderIndices.empty()) {
+ inversePermutation(E->ReorderIndices, Mask);
+ Mask.append(NumElts - NumScalars, UndefMaskElem);
+ } else {
+ Mask.assign(NumElts, UndefMaskElem);
+ std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
+ }
+ unsigned Offset = *getInsertIndex(VL0, 0);
bool IsIdentity = true;
- SmallVector<int> ShuffleMask(NumElts, UndefMaskElem);
+ SmallVector<int> PrevMask(NumElts, UndefMaskElem);
+ Mask.swap(PrevMask);
for (unsigned I = 0; I < NumScalars; ++I) {
- Optional<int> InsertIdx = getInsertIndex(VL[I], 0);
+ Optional<int> InsertIdx = getInsertIndex(VL[PrevMask[I]], 0);
if (!InsertIdx || *InsertIdx == UndefMaskElem)
continue;
- unsigned Idx = *InsertIdx;
- DemandedElts.setBit(Idx);
- if (Idx < Offset) {
- Offset = Idx;
- IsIdentity &= I == 0;
- } else {
- assert(Idx >= Offset && "Failed to find vector index offset");
- IsIdentity &= Idx - Offset == I;
- }
- ShuffleMask[Idx] = I;
+ DemandedElts.setBit(*InsertIdx);
+ IsIdentity &= *InsertIdx - Offset == I;
+ Mask[*InsertIdx - Offset] = I;
}
assert(Offset < NumElts && "Failed to find vector index offset");
@@ -3932,8 +4733,23 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
TargetTransformInfo::SK_PermuteSingleSrc,
FixedVectorType::get(SrcVecTy->getElementType(), Sz));
} else if (!IsIdentity) {
- Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy,
- ShuffleMask);
+ auto *FirstInsert =
+ cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
+ return !is_contained(E->Scalars,
+ cast<Instruction>(V)->getOperand(0));
+ }));
+ if (isa<UndefValue>(FirstInsert->getOperand(0))) {
+ Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, Mask);
+ } else {
+ SmallVector<int> InsertMask(NumElts);
+ std::iota(InsertMask.begin(), InsertMask.end(), 0);
+ for (unsigned I = 0; I < NumElts; I++) {
+ if (Mask[I] != UndefMaskElem)
+ InsertMask[Offset + I] = NumElts + I;
+ }
+ Cost +=
+ TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVecTy, InsertMask);
+ }
}
return Cost;
@@ -3955,7 +4771,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy,
TTI::getCastContextHint(VL0), CostKind, VL0);
if (NeedToShuffleReuses) {
- CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
}
// Calculate the cost of this instruction.
@@ -3980,7 +4796,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0);
if (NeedToShuffleReuses) {
- CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
}
auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
@@ -4085,7 +4901,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind, Op1VK,
Op2VK, Op1VP, Op2VP, Operands, VL0);
if (NeedToShuffleReuses) {
- CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
}
InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
InstructionCost VecCost =
@@ -4103,7 +4919,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost(
Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK);
if (NeedToShuffleReuses) {
- CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
}
InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
InstructionCost VecCost = TTI->getArithmeticInstrCost(
@@ -4117,7 +4933,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
Instruction::Load, ScalarTy, Alignment, 0, CostKind, VL0);
if (NeedToShuffleReuses) {
- CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
}
InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
InstructionCost VecLdCost;
@@ -4160,7 +4976,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
InstructionCost ScalarEltCost =
TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
if (NeedToShuffleReuses) {
- CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
}
InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
@@ -4215,14 +5031,16 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
TTI::CastContextHint::None, CostKind);
}
- SmallVector<int> Mask(E->Scalars.size());
- for (unsigned I = 0, End = E->Scalars.size(); I < End; ++I) {
- auto *OpInst = cast<Instruction>(E->Scalars[I]);
- assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode");
- Mask[I] = I + (OpInst->getOpcode() == E->getAltOpcode() ? End : 0);
- }
- VecCost +=
- TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask, 0);
+ SmallVector<int> Mask;
+ buildShuffleEntryMask(
+ E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
+ [E](Instruction *I) {
+ assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
+ return I->getOpcode() == E->getAltOpcode();
+ },
+ Mask);
+ CommonCost =
+ TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy, Mask);
LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
return CommonCost + VecCost - ScalarCost;
}
@@ -4231,13 +5049,30 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
}
}
-bool BoUpSLP::isFullyVectorizableTinyTree() const {
+bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
<< VectorizableTree.size() << " is fully vectorizable.\n");
+ auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
+ SmallVector<int> Mask;
+ return TE->State == TreeEntry::NeedToGather &&
+ !any_of(TE->Scalars,
+ [this](Value *V) { return EphValues.contains(V); }) &&
+ (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
+ TE->Scalars.size() < Limit ||
+ (TE->getOpcode() == Instruction::ExtractElement &&
+ isFixedVectorShuffle(TE->Scalars, Mask)) ||
+ (TE->State == TreeEntry::NeedToGather &&
+ TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()));
+ };
+
// We only handle trees of heights 1 and 2.
if (VectorizableTree.size() == 1 &&
- VectorizableTree[0]->State == TreeEntry::Vectorize)
+ (VectorizableTree[0]->State == TreeEntry::Vectorize ||
+ (ForReduction &&
+ AreVectorizableGathers(VectorizableTree[0].get(),
+ VectorizableTree[0]->Scalars.size()) &&
+ VectorizableTree[0]->getVectorFactor() > 2)))
return true;
if (VectorizableTree.size() != 2)
@@ -4249,19 +5084,14 @@ bool BoUpSLP::isFullyVectorizableTinyTree() const {
// or they are extractelements, which form shuffle.
SmallVector<int> Mask;
if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
- (allConstant(VectorizableTree[1]->Scalars) ||
- isSplat(VectorizableTree[1]->Scalars) ||
- (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
- VectorizableTree[1]->Scalars.size() <
- VectorizableTree[0]->Scalars.size()) ||
- (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
- VectorizableTree[1]->getOpcode() == Instruction::ExtractElement &&
- isShuffle(VectorizableTree[1]->Scalars, Mask))))
+ AreVectorizableGathers(VectorizableTree[1].get(),
+ VectorizableTree[0]->Scalars.size()))
return true;
// Gathering cost would be too much for tiny trees.
if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
- VectorizableTree[1]->State == TreeEntry::NeedToGather)
+ (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
+ VectorizableTree[0]->State != TreeEntry::ScatterVectorize))
return false;
return true;
@@ -4330,7 +5160,7 @@ bool BoUpSLP::isLoadCombineCandidate() const {
return true;
}
-bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const {
+bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
// No need to vectorize inserts of gathered values.
if (VectorizableTree.size() == 2 &&
isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
@@ -4344,7 +5174,7 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const {
// If we have a tiny tree (a tree whose size is less than MinTreeSize), we
// can vectorize it if we can prove it fully vectorizable.
- if (isFullyVectorizableTinyTree())
+ if (isFullyVectorizableTinyTree(ForReduction))
return false;
assert(VectorizableTree.empty()
@@ -4496,7 +5326,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
// If found user is an insertelement, do not calculate extract cost but try
// to detect it as a final shuffled/identity match.
- if (EU.User && isa<InsertElementInst>(EU.User)) {
+ if (isa_and_nonnull<InsertElementInst>(EU.User)) {
if (auto *FTy = dyn_cast<FixedVectorType>(EU.User->getType())) {
Optional<int> InsertIdx = getInsertIndex(EU.User, 0);
if (!InsertIdx || *InsertIdx == UndefMaskElem)
@@ -4508,8 +5338,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
return false;
auto *IE1 = cast<InsertElementInst>(VU);
auto *IE2 = cast<InsertElementInst>(V);
- // Go though of insertelement instructions trying to find either VU as
- // the original vector for IE2 or V as the original vector for IE1.
+ // Go through the insertelement instructions trying to find either VU
+ // as the original vector for IE2 or V as the original vector for IE1.
do {
if (IE1 == VU || IE2 == V)
return true;
@@ -4525,7 +5355,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
VF.push_back(FTy->getNumElements());
ShuffleMask.emplace_back(VF.back(), UndefMaskElem);
FirstUsers.push_back(EU.User);
- DemandedElts.push_back(APInt::getNullValue(VF.back()));
+ DemandedElts.push_back(APInt::getZero(VF.back()));
VecId = FirstUsers.size() - 1;
} else {
VecId = std::distance(FirstUsers.begin(), It);
@@ -4705,18 +5535,11 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
} else {
// Try to find nodes with the same vector factor.
assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
- // FIXME: Shall be replaced by GetVF function once non-power-2 patch is
- // landed.
- auto &&GetVF = [](const TreeEntry *TE) {
- if (!TE->ReuseShuffleIndices.empty())
- return TE->ReuseShuffleIndices.size();
- return TE->Scalars.size();
- };
DenseMap<int, const TreeEntry *> VFToTE;
for (const TreeEntry *TE : UsedTEs.front())
- VFToTE.try_emplace(GetVF(TE), TE);
+ VFToTE.try_emplace(TE->getVectorFactor(), TE);
for (const TreeEntry *TE : UsedTEs.back()) {
- auto It = VFToTE.find(GetVF(TE));
+ auto It = VFToTE.find(TE->getVectorFactor());
if (It != VFToTE.end()) {
VF = It->first;
Entries.push_back(It->second);
@@ -4757,16 +5580,17 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
InstructionCost
BoUpSLP::getGatherCost(FixedVectorType *Ty,
- const DenseSet<unsigned> &ShuffledIndices) const {
+ const DenseSet<unsigned> &ShuffledIndices,
+ bool NeedToShuffle) const {
unsigned NumElts = Ty->getNumElements();
- APInt DemandedElts = APInt::getNullValue(NumElts);
+ APInt DemandedElts = APInt::getZero(NumElts);
for (unsigned I = 0; I < NumElts; ++I)
if (!ShuffledIndices.count(I))
DemandedElts.setBit(I);
InstructionCost Cost =
TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true,
/*Extract*/ false);
- if (!ShuffledIndices.empty())
+ if (NeedToShuffle)
Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
return Cost;
}
@@ -4777,6 +5601,7 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
ScalarTy = SI->getValueOperand()->getType();
auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
+ bool DuplicateNonConst = false;
// Find the cost of inserting/extracting values from the vector.
// Check if the same elements are inserted several times and count them as
// shuffle candidates.
@@ -4785,12 +5610,17 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
// Iterate in reverse order to consider insert elements with the highest cost.
for (unsigned I = VL.size(); I > 0; --I) {
unsigned Idx = I - 1;
- if (isConstant(VL[Idx]))
+ // No need to shuffle duplicates for constants.
+ if (isConstant(VL[Idx])) {
+ ShuffledElements.insert(Idx);
continue;
- if (!UniqueElements.insert(VL[Idx]).second)
+ }
+ if (!UniqueElements.insert(VL[Idx]).second) {
+ DuplicateNonConst = true;
ShuffledElements.insert(Idx);
+ }
}
- return getGatherCost(VecTy, ShuffledElements);
+ return getGatherCost(VecTy, ShuffledElements, DuplicateNonConst);
}
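The DuplicateNonConst flag matters because constants are folded into the vector constant and never force a permute. A sketch of the per-lane classification with values reduced to integers (hypothetical helper):

  #include <set>
  #include <vector>

  // Mirrors the reverse walk above: the last occurrence of each non-constant
  // pays for an insertelement; earlier duplicates ride on the final shuffle;
  // constant lanes are skipped entirely.
  bool classifyGatherLanes(const std::vector<int> &VL,
                           const std::set<int> &ConstantVals,
                           std::vector<bool> &NeedsInsert) {
    NeedsInsert.assign(VL.size(), false);
    std::set<int> Unique;
    bool DuplicateNonConst = false;
    for (unsigned I = VL.size(); I > 0; --I) {
      unsigned Idx = I - 1;
      if (ConstantVals.count(VL[Idx]))
        continue;
      if (Unique.insert(VL[Idx]).second)
        NeedsInsert[Idx] = true;
      else
        DuplicateNonConst = true; // request one single-source permute
    }
    return DuplicateNonConst;
  }

For VL = {a, 5, a, b} with 5 constant, lanes 2 and 3 need inserts, lane 1 is free, and the repeat of a at lane 0 sets the shuffle flag.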
// Perform operand reordering on the instructions in VL and return the reordered
@@ -5006,17 +5836,18 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
// block:
// %phi = phi <2 x > { .., %entry} {%shuffle, %block}
- // %2 = shuffle <2 x > %phi, %poison, <4 x > <0, 0, 1, 1>
+ // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
// ... (use %2)
- // %shuffle = shuffle <2 x> %2, poison, <2 x> {0, 2}
+ // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
// br %block
- SmallVector<int> UniqueIdxs;
+ SmallVector<int> UniqueIdxs(VF, UndefMaskElem);
SmallSet<int, 4> UsedIdxs;
int Pos = 0;
int Sz = VL.size();
for (int Idx : E->ReuseShuffleIndices) {
- if (Idx != Sz && UsedIdxs.insert(Idx).second)
- UniqueIdxs.emplace_back(Pos);
+ if (Idx != Sz && Idx != UndefMaskElem &&
+ UsedIdxs.insert(Idx).second)
+ UniqueIdxs[Idx] = Pos;
++Pos;
}
assert(VF >= UsedIdxs.size() && "Expected vectorization factor "
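The fix above turns UniqueIdxs from an append-only list into a proper inverse mapping: for ReuseShuffleIndices = {0, 0, 1, 1} and VF = 2 it now yields {0, 2}, pointing each unique lane at its first occurrence. A standalone rendering (the out-of-bounds sentinel check, Idx != Sz, is elided):

  #include <set>
  #include <vector>

  constexpr int UndefMaskElem = -1;

  // UniqueIdxs[SourceLane] = first position where that lane occurs, so
  // shuffling the widened vector by UniqueIdxs recovers the VF unique lanes.
  std::vector<int> invertReuseMask(const std::vector<int> &ReuseIndices,
                                   unsigned VF) {
    std::vector<int> UniqueIdxs(VF, UndefMaskElem);
    std::set<int> UsedIdxs;
    int Pos = 0;
    for (int Idx : ReuseIndices) {
      if (Idx != UndefMaskElem && UsedIdxs.insert(Idx).second)
        UniqueIdxs[Idx] = Pos;
      ++Pos;
    }
    return UniqueIdxs;
  }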
@@ -5047,11 +5878,9 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
}).base());
VF = std::max<unsigned>(VF, PowerOf2Ceil(NumValues));
int UniqueVals = 0;
- bool HasUndefs = false;
for (Value *V : VL.drop_back(VL.size() - VF)) {
if (isa<UndefValue>(V)) {
ReuseShuffleIndicies.emplace_back(UndefMaskElem);
- HasUndefs = true;
continue;
}
if (isConstant(V)) {
@@ -5066,15 +5895,10 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
++UniqueVals;
}
}
- if (HasUndefs && UniqueVals == 1 && UniqueValues.size() == 1) {
+ if (UniqueVals == 1 && UniqueValues.size() == 1) {
// Emit pure splat vector.
- // FIXME: why it is not identified as an identity.
- unsigned NumUndefs = count(ReuseShuffleIndicies, UndefMaskElem);
- if (NumUndefs == ReuseShuffleIndicies.size() - 1)
- ReuseShuffleIndicies.append(VF - ReuseShuffleIndicies.size(),
- UndefMaskElem);
- else
- ReuseShuffleIndicies.assign(VF, 0);
+ ReuseShuffleIndicies.append(VF - ReuseShuffleIndicies.size(),
+ UndefMaskElem);
} else if (UniqueValues.size() >= VF - 1 || UniqueValues.size() <= 1) {
ReuseShuffleIndicies.clear();
UniqueValues.clear();
@@ -5107,12 +5931,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
- unsigned VF = E->Scalars.size();
- if (NeedToShuffleReuses)
- VF = E->ReuseShuffleIndices.size();
+ unsigned VF = E->getVectorFactor();
ShuffleInstructionBuilder ShuffleBuilder(Builder, VF);
if (E->State == TreeEntry::NeedToGather) {
- setInsertPointAfterBundle(E);
+ if (E->getMainOp())
+ setInsertPointAfterBundle(E);
Value *Vec;
SmallVector<int> Mask;
SmallVector<const TreeEntry *> Entries;
@@ -5152,13 +5975,17 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
switch (ShuffleOrOp) {
case Instruction::PHI: {
+ assert(
+ (E->ReorderIndices.empty() || E != VectorizableTree.front().get()) &&
+ "PHI reordering is free.");
auto *PH = cast<PHINode>(VL0);
Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
Value *V = NewPhi;
- if (NeedToShuffleReuses)
- V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+ ShuffleBuilder.addInversedMask(E->ReorderIndices);
+ ShuffleBuilder.addMask(E->ReuseShuffleIndices);
+ V = ShuffleBuilder.finalize(V);
E->VectorizedValue = V;
@@ -5209,53 +6036,48 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return NewV;
}
case Instruction::InsertElement: {
- Builder.SetInsertPoint(VL0);
+ assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
+ Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
Value *V = vectorizeTree(E->getOperand(1));
+ // Create InsertVector shuffle if necessary
+ auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
+ return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
+ }));
const unsigned NumElts =
- cast<FixedVectorType>(VL0->getType())->getNumElements();
+ cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
const unsigned NumScalars = E->Scalars.size();
+ unsigned Offset = *getInsertIndex(VL0, 0);
+ assert(Offset < NumElts && "Failed to find vector index offset");
+
+ // Create shuffle to resize vector
+ SmallVector<int> Mask;
+ if (!E->ReorderIndices.empty()) {
+ inversePermutation(E->ReorderIndices, Mask);
+ Mask.append(NumElts - NumScalars, UndefMaskElem);
+ } else {
+ Mask.assign(NumElts, UndefMaskElem);
+ std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
+ }
// Create InsertVector shuffle if necessary
- Instruction *FirstInsert = nullptr;
bool IsIdentity = true;
- unsigned Offset = UINT_MAX;
+ SmallVector<int> PrevMask(NumElts, UndefMaskElem);
+ Mask.swap(PrevMask);
for (unsigned I = 0; I < NumScalars; ++I) {
- Value *Scalar = E->Scalars[I];
- if (!FirstInsert &&
- !is_contained(E->Scalars, cast<Instruction>(Scalar)->getOperand(0)))
- FirstInsert = cast<Instruction>(Scalar);
+ Value *Scalar = E->Scalars[PrevMask[I]];
Optional<int> InsertIdx = getInsertIndex(Scalar, 0);
if (!InsertIdx || *InsertIdx == UndefMaskElem)
continue;
- unsigned Idx = *InsertIdx;
- if (Idx < Offset) {
- Offset = Idx;
- IsIdentity &= I == 0;
- } else {
- assert(Idx >= Offset && "Failed to find vector index offset");
- IsIdentity &= Idx - Offset == I;
- }
- }
- assert(Offset < NumElts && "Failed to find vector index offset");
-
- // Create shuffle to resize vector
- SmallVector<int> Mask(NumElts, UndefMaskElem);
- if (!IsIdentity) {
- for (unsigned I = 0; I < NumScalars; ++I) {
- Value *Scalar = E->Scalars[I];
- Optional<int> InsertIdx = getInsertIndex(Scalar, 0);
- if (!InsertIdx || *InsertIdx == UndefMaskElem)
- continue;
- Mask[*InsertIdx - Offset] = I;
- }
- } else {
- std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
+ IsIdentity &= *InsertIdx - Offset == I;
+ Mask[*InsertIdx - Offset] = I;
}
if (!IsIdentity || NumElts != NumScalars)
V = Builder.CreateShuffleVector(V, Mask);
- if (NumElts != NumScalars) {
+ if ((!IsIdentity || Offset != 0 ||
+ !isa<UndefValue>(FirstInsert->getOperand(0))) &&
+ NumElts != NumScalars) {
SmallVector<int> InsertMask(NumElts);
std::iota(InsertMask.begin(), InsertMask.end(), 0);
for (unsigned I = 0; I < NumElts; I++) {
@@ -5295,6 +6117,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
auto *CI = cast<CastInst>(VL0);
Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
+ ShuffleBuilder.addInversedMask(E->ReorderIndices);
ShuffleBuilder.addMask(E->ReuseShuffleIndices);
V = ShuffleBuilder.finalize(V);
@@ -5317,6 +6140,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
Value *V = Builder.CreateCmp(P0, L, R);
propagateIRFlags(V, E->Scalars, VL0);
+ ShuffleBuilder.addInversedMask(E->ReorderIndices);
ShuffleBuilder.addMask(E->ReuseShuffleIndices);
V = ShuffleBuilder.finalize(V);
@@ -5337,6 +6161,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
Value *V = Builder.CreateSelect(Cond, True, False);
+ ShuffleBuilder.addInversedMask(E->ReorderIndices);
ShuffleBuilder.addMask(E->ReuseShuffleIndices);
V = ShuffleBuilder.finalize(V);
@@ -5360,6 +6185,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (auto *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
+ ShuffleBuilder.addInversedMask(E->ReorderIndices);
ShuffleBuilder.addMask(E->ReuseShuffleIndices);
V = ShuffleBuilder.finalize(V);
@@ -5403,6 +6229,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (auto *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
+ ShuffleBuilder.addInversedMask(E->ReorderIndices);
ShuffleBuilder.addMask(E->ReuseShuffleIndices);
V = ShuffleBuilder.finalize(V);
@@ -5414,9 +6241,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::Load: {
// Loads are inserted at the head of the tree because we don't want to
// sink them all the way down past store instructions.
- bool IsReorder = E->updateStateIfReorder();
- if (IsReorder)
- VL0 = E->getMainOp();
setInsertPointAfterBundle(E);
LoadInst *LI = cast<LoadInst>(VL0);
@@ -5457,9 +6281,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return V;
}
case Instruction::Store: {
- bool IsReorder = !E->ReorderIndices.empty();
- auto *SI = cast<StoreInst>(
- IsReorder ? E->Scalars[E->ReorderIndices.front()] : VL0);
+ auto *SI = cast<StoreInst>(VL0);
unsigned AS = SI->getPointerAddressSpace();
setInsertPointAfterBundle(E);
@@ -5491,37 +6313,22 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return V;
}
case Instruction::GetElementPtr: {
+ auto *GEP0 = cast<GetElementPtrInst>(VL0);
setInsertPointAfterBundle(E);
Value *Op0 = vectorizeTree(E->getOperand(0));
- std::vector<Value *> OpVecs;
- for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
- ++j) {
- ValueList &VL = E->getOperand(j);
- // Need to cast all elements to the same type before vectorization to
- // avoid crash.
- Type *VL0Ty = VL0->getOperand(j)->getType();
- Type *Ty = llvm::all_of(
- VL, [VL0Ty](Value *V) { return VL0Ty == V->getType(); })
- ? VL0Ty
- : DL->getIndexType(cast<GetElementPtrInst>(VL0)
- ->getPointerOperandType()
- ->getScalarType());
- for (Value *&V : VL) {
- auto *CI = cast<ConstantInt>(V);
- V = ConstantExpr::getIntegerCast(CI, Ty,
- CI->getValue().isSignBitSet());
- }
- Value *OpVec = vectorizeTree(VL);
+ SmallVector<Value *> OpVecs;
+ for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
+ Value *OpVec = vectorizeTree(E->getOperand(J));
OpVecs.push_back(OpVec);
}
- Value *V = Builder.CreateGEP(
- cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
+ Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
if (Instruction *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
+ ShuffleBuilder.addInversedMask(E->ReorderIndices);
ShuffleBuilder.addMask(E->ReuseShuffleIndices);
V = ShuffleBuilder.finalize(V);
@@ -5548,7 +6355,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
std::vector<Value *> OpVecs;
SmallVector<Type *, 2> TysForDecl =
{FixedVectorType::get(CI->getType(), E->Scalars.size())};
- for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
+ for (int j = 0, e = CI->arg_size(); j < e; ++j) {
ValueList OpVL;
// Some intrinsics have scalar arguments. This argument should not be
// vectorized.
@@ -5594,6 +6401,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
propagateIRFlags(V, E->Scalars, VL0);
+ ShuffleBuilder.addInversedMask(E->ReorderIndices);
ShuffleBuilder.addMask(E->ReuseShuffleIndices);
V = ShuffleBuilder.finalize(V);
@@ -5641,19 +6449,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
// Also, gather up main and alt scalar ops to propagate IR flags to
// each vector operation.
ValueList OpScalars, AltScalars;
- unsigned Sz = E->Scalars.size();
- SmallVector<int> Mask(Sz);
- for (unsigned I = 0; I < Sz; ++I) {
- auto *OpInst = cast<Instruction>(E->Scalars[I]);
- assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode");
- if (OpInst->getOpcode() == E->getAltOpcode()) {
- Mask[I] = Sz + I;
- AltScalars.push_back(E->Scalars[I]);
- } else {
- Mask[I] = I;
- OpScalars.push_back(E->Scalars[I]);
- }
- }
+ SmallVector<int> Mask;
+ buildShuffleEntryMask(
+ E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
+ [E](Instruction *I) {
+ assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
+ return I->getOpcode() == E->getAltOpcode();
+ },
+ Mask, &OpScalars, &AltScalars);
propagateIRFlags(V0, OpScalars);
propagateIRFlags(V1, AltScalars);
@@ -5661,7 +6464,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
if (Instruction *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
- ShuffleBuilder.addMask(E->ReuseShuffleIndices);
V = ShuffleBuilder.finalize(V);
E->VectorizedValue = V;
@@ -5836,7 +6638,9 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
// It is legal to delete users in the ignorelist.
- assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&
+ assert((getTreeEntry(U) || is_contained(UserIgnoreList, U) ||
+ (isa_and_nonnull<Instruction>(U) &&
+ isDeleted(cast<Instruction>(U)))) &&
"Deleting out-of-tree value");
}
}
@@ -5911,27 +6715,28 @@ void BoUpSLP::optimizeGatherSequence() {
"Worklist not sorted properly!");
BasicBlock *BB = (*I)->getBlock();
// For all instructions in blocks containing gather sequences:
- for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
- Instruction *In = &*it++;
- if (isDeleted(In))
+ for (Instruction &In : llvm::make_early_inc_range(*BB)) {
+ if (isDeleted(&In))
continue;
- if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
+ if (!isa<InsertElementInst>(&In) && !isa<ExtractElementInst>(&In) &&
+ !isa<ShuffleVectorInst>(&In))
continue;
// Check if we can replace this instruction with any of the
// visited instructions.
+ bool Replaced = false;
for (Instruction *v : Visited) {
- if (In->isIdenticalTo(v) &&
- DT->dominates(v->getParent(), In->getParent())) {
- In->replaceAllUsesWith(v);
- eraseInstruction(In);
- In = nullptr;
+ if (In.isIdenticalTo(v) &&
+ DT->dominates(v->getParent(), In.getParent())) {
+ In.replaceAllUsesWith(v);
+ eraseInstruction(&In);
+ Replaced = true;
break;
}
}
- if (In) {
- assert(!is_contained(Visited, In));
- Visited.push_back(In);
+ if (!Replaced) {
+ assert(!is_contained(Visited, &In));
+ Visited.push_back(&In);
}
}
}
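The rewritten loop is a dominance-based CSE pass over gather sequences. Its shape, abstracted away from LLVM types (the predicates stand in for Instruction::isIdenticalTo and DominatorTree::dominates):

  #include <functional>
  #include <vector>

  template <typename Inst>
  bool cseGatherSequences(
      const std::vector<Inst *> &Candidates,
      const std::function<bool(Inst *, Inst *)> &IsIdentical,
      const std::function<bool(Inst *, Inst *)> &Dominates,
      const std::function<void(Inst *, Inst *)> &ReplaceAndErase) {
    std::vector<Inst *> Visited;
    bool Changed = false;
    for (Inst *In : Candidates) {
      bool Replaced = false;
      for (Inst *V : Visited) {
        // Reuse an identical, dominating instruction instead of keeping both.
        if (IsIdentical(In, V) && Dominates(V, In)) {
          ReplaceAndErase(In, V);
          Replaced = Changed = true;
          break;
        }
      }
      if (!Replaced)
        Visited.push_back(In);
    }
    return Changed;
  }

The scan stays quadratic, as in the original loop, but Visited only ever holds the insert/extract/shuffle instructions seen so far.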
@@ -5944,7 +6749,9 @@ void BoUpSLP::optimizeGatherSequence() {
Optional<BoUpSLP::ScheduleData *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
const InstructionsState &S) {
- if (isa<PHINode>(S.OpValue) || isa<InsertElementInst>(S.OpValue))
+ // No need to schedule PHIs, insertelement, extractelement and extractvalue
+ // instructions.
+ if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue))
return nullptr;
// Initialize the instruction bundle.
@@ -6040,7 +6847,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
Value *OpValue) {
- if (isa<PHINode>(OpValue) || isa<InsertElementInst>(OpValue))
+ if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue))
return;
ScheduleData *Bundle = getScheduleData(OpValue);
@@ -6080,8 +6887,9 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
return true;
Instruction *I = dyn_cast<Instruction>(V);
assert(I && "bundle member must be an instruction");
- assert(!isa<PHINode>(I) && !isa<InsertElementInst>(I) &&
- "phi nodes/insertelements don't need to be scheduled");
+ assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
+ "phi nodes/insertelements/extractelements/extractvalues don't need to "
+ "be scheduled");
auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool {
ScheduleData *ISD = getScheduleData(I);
if (!ISD)
@@ -6351,7 +7159,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
I = I->getNextNode()) {
BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
- assert((isa<InsertElementInst>(SD->Inst) ||
+ assert((isVectorLikeInstWithConstOps(SD->Inst) ||
SD->isPartOfBundle() == (getTreeEntry(SD->Inst) != nullptr)) &&
"scheduler and vectorizer bundle mismatch");
SD->FirstInBundle->SchedulingPriority = Idx++;
@@ -6694,9 +7502,7 @@ struct SLPVectorizer : public FunctionPass {
initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
}
- bool doInitialization(Module &M) override {
- return false;
- }
+ bool doInitialization(Module &M) override { return false; }
bool runOnFunction(Function &F) override {
if (skipFunction(F))
@@ -6831,44 +7637,6 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
return Changed;
}
-/// Order may have elements assigned special value (size) which is out of
-/// bounds. Such indices only appear on places which correspond to undef values
-/// (see canReuseExtract for details) and used in order to avoid undef values
-/// have effect on operands ordering.
-/// The first loop below simply finds all unused indices and then the next loop
-/// nest assigns these indices for undef values positions.
-/// As an example below Order has two undef positions and they have assigned
-/// values 3 and 7 respectively:
-/// before: 6 9 5 4 9 2 1 0
-/// after: 6 3 5 4 7 2 1 0
-/// \returns Fixed ordering.
-static BoUpSLP::OrdersType fixupOrderingIndices(ArrayRef<unsigned> Order) {
- BoUpSLP::OrdersType NewOrder(Order.begin(), Order.end());
- const unsigned Sz = NewOrder.size();
- SmallBitVector UsedIndices(Sz);
- SmallVector<int> MaskedIndices;
- for (int I = 0, E = NewOrder.size(); I < E; ++I) {
- if (NewOrder[I] < Sz)
- UsedIndices.set(NewOrder[I]);
- else
- MaskedIndices.push_back(I);
- }
- if (MaskedIndices.empty())
- return NewOrder;
- SmallVector<int> AvailableIndices(MaskedIndices.size());
- unsigned Cnt = 0;
- int Idx = UsedIndices.find_first_unset();
- do {
- AvailableIndices[Cnt] = Idx;
- Idx = UsedIndices.find_next_unset(Idx);
- ++Cnt;
- } while (Idx > 0);
- assert(Cnt == MaskedIndices.size() && "Non-synced masked/available indices.");
- for (int I = 0, E = MaskedIndices.size(); I < E; ++I)
- NewOrder[MaskedIndices[I]] = AvailableIndices[I];
- return NewOrder;
-}
-
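The helper is only removed from this spot; the new fixupOrderingIndices calls in the load and store cases above imply it now lives earlier in the file. For reference, a standalone sketch of what it computes, reproducing the example from the removed comment (and assuming a well-formed order, i.e. as many unused indices as masked positions):

  #include <vector>

  // Out-of-bounds entries mark undef positions; rewrite them to the indices
  // no in-bounds entry uses, in ascending order:
  //   {6, 9, 5, 4, 9, 2, 1, 0}  ->  {6, 3, 5, 4, 7, 2, 1, 0}   (Sz = 8)
  std::vector<unsigned> fixupOrder(std::vector<unsigned> Order) {
    const unsigned Sz = Order.size();
    std::vector<bool> Used(Sz, false);
    std::vector<unsigned> MaskedPositions;
    for (unsigned I = 0; I < Sz; ++I) {
      if (Order[I] < Sz)
        Used[Order[I]] = true;
      else
        MaskedPositions.push_back(I);
    }
    unsigned Avail = 0;
    for (unsigned Pos : MaskedPositions) {
      while (Used[Avail])
        ++Avail; // next index not taken by a real entry
      Order[Pos] = Avail++;
    }
    return Order;
  }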
bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
unsigned Idx) {
LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
@@ -6884,19 +7652,13 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
<< "\n");
R.buildTree(Chain);
- Optional<ArrayRef<unsigned>> Order = R.bestOrder();
- // TODO: Handle orders of size less than number of elements in the vector.
- if (Order && Order->size() == Chain.size()) {
- // TODO: reorder tree nodes without tree rebuilding.
- SmallVector<Value *, 4> ReorderedOps(Chain.size());
- transform(fixupOrderingIndices(*Order), ReorderedOps.begin(),
- [Chain](const unsigned Idx) { return Chain[Idx]; });
- R.buildTree(ReorderedOps);
- }
if (R.isTreeTinyAndNotFullyVectorizable())
return false;
if (R.isLoadCombineCandidate())
return false;
+ R.reorderTopToBottom();
+ R.reorderBottomToTop();
+ R.buildExternalUses();
R.computeMinimumValueSizes();
@@ -7019,7 +7781,7 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
unsigned EltSize = R.getVectorElementSize(Operands[0]);
unsigned MaxElts = llvm::PowerOf2Floor(MaxVecRegSize / EltSize);
- unsigned MinVF = std::max(2U, R.getMinVecRegSize() / EltSize);
+ unsigned MinVF = R.getMinVF(EltSize);
unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store),
MaxElts);
@@ -7092,11 +7854,11 @@ bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
if (!A || !B)
return false;
Value *VL[] = {A, B};
- return tryToVectorizeList(VL, R, /*AllowReorder=*/true);
+ return tryToVectorizeList(VL, R);
}
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
- bool AllowReorder) {
+ bool LimitForRegisterSize) {
if (VL.size() < 2)
return false;
@@ -7130,7 +7892,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
}
unsigned Sz = R.getVectorElementSize(I0);
- unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
+ unsigned MinVF = R.getMinVF(Sz);
unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
if (MaxVF < 2) {
@@ -7168,7 +7930,8 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
if (!isPowerOf2_32(OpsWidth))
continue;
- if ((VF > MinVF && OpsWidth <= VF / 2) || (VF == MinVF && OpsWidth < 2))
+ if ((LimitForRegisterSize && OpsWidth < MaxVF) ||
+ (VF > MinVF && OpsWidth <= VF / 2) || (VF == MinVF && OpsWidth < 2))
break;
ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
@@ -7183,18 +7946,11 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
<< "\n");
R.buildTree(Ops);
- if (AllowReorder) {
- Optional<ArrayRef<unsigned>> Order = R.bestOrder();
- if (Order) {
- // TODO: reorder tree nodes without tree rebuilding.
- SmallVector<Value *, 4> ReorderedOps(Ops.size());
- transform(fixupOrderingIndices(*Order), ReorderedOps.begin(),
- [Ops](const unsigned Idx) { return Ops[Idx]; });
- R.buildTree(ReorderedOps);
- }
- }
if (R.isTreeTinyAndNotFullyVectorizable())
continue;
+ R.reorderTopToBottom();
+ R.reorderBottomToTop();
+ R.buildExternalUses();
R.computeMinimumValueSizes();
InstructionCost Cost = R.getTreeCost();
@@ -7387,10 +8143,20 @@ class HorizontalReduction {
Value *RHS, const Twine &Name, bool UseSelect) {
unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
switch (Kind) {
- case RecurKind::Add:
- case RecurKind::Mul:
case RecurKind::Or:
+ if (UseSelect &&
+ LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
+ return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
+ return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
+ Name);
case RecurKind::And:
+ if (UseSelect &&
+ LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
+ return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
+ return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
+ Name);
+ case RecurKind::Add:
+ case RecurKind::Mul:
case RecurKind::Xor:
case RecurKind::FAdd:
case RecurKind::FMul:
@@ -7434,8 +8200,12 @@ class HorizontalReduction {
static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS,
Value *RHS, const Twine &Name,
const ReductionOpsListType &ReductionOps) {
- bool UseSelect = ReductionOps.size() == 2;
- assert((!UseSelect || isa<SelectInst>(ReductionOps[1][0])) &&
+ bool UseSelect = ReductionOps.size() == 2 ||
+ // Logical or/and.
+ (ReductionOps.size() == 1 &&
+ isa<SelectInst>(ReductionOps.front().front()));
+ assert((!UseSelect || ReductionOps.size() != 2 ||
+ isa<SelectInst>(ReductionOps[1][0])) &&
"Expected cmp + select pairs for reduction");
Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
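The Or/And cases above emit select(lhs, true, rhs) and select(lhs, rhs, false) rather than plain binops. A minimal sketch of just that choice for scalar i1 values, using the same IRBuilder calls as the hunk (the helper name is hypothetical):

  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;

  // Unlike `or`/`and` instructions, the select forms keep the short-circuit
  // semantics of the original scalar selects: poison in RHS stays blocked
  // whenever LHS alone decides the result.
  static Value *createBoolLogicOp(IRBuilder<> &Builder, bool IsOr, Value *LHS,
                                  Value *RHS, const Twine &Name) {
    if (IsOr) // or  -> select(lhs, true, rhs)
      return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
    // and -> select(lhs, rhs, false)
    return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
  }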
@@ -7573,10 +8343,10 @@ class HorizontalReduction {
/// Checks if the instruction is in basic block \p BB.
/// For a cmp+sel min/max reduction check that both ops are in \p BB.
static bool hasSameParent(Instruction *I, BasicBlock *BB) {
- if (isCmpSelMinMax(I)) {
+ if (isCmpSelMinMax(I) || (isBoolLogicOp(I) && isa<SelectInst>(I))) {
auto *Sel = cast<SelectInst>(I);
- auto *Cmp = cast<Instruction>(Sel->getCondition());
- return Sel->getParent() == BB && Cmp->getParent() == BB;
+ auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
+ return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
}
return I->getParent() == BB;
}
@@ -7758,13 +8528,13 @@ public:
}
/// Attempt to vectorize the tree found by matchAssociativeReduction.
- bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
+ Value *tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
// If there are a sufficient number of reduction values, reduce
// to a nearby power-of-2. We can safely generate oversized
// vectors and rely on the backend to split them to legal sizes.
unsigned NumReducedVals = ReducedVals.size();
if (NumReducedVals < 4)
- return false;
+ return nullptr;
// Intersect the fast-math-flags from all reduction operations.
FastMathFlags RdxFMF;
@@ -7838,22 +8608,14 @@ public:
unsigned i = 0;
while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
ArrayRef<Value *> VL(&ReducedVals[i], ReduxWidth);
- V.buildTree(VL, ExternallyUsedValues, IgnoreList);
- Optional<ArrayRef<unsigned>> Order = V.bestOrder();
- if (Order) {
- assert(Order->size() == VL.size() &&
- "Order size must be the same as number of vectorized "
- "instructions.");
- // TODO: reorder tree nodes without tree rebuilding.
- SmallVector<Value *, 4> ReorderedOps(VL.size());
- transform(fixupOrderingIndices(*Order), ReorderedOps.begin(),
- [VL](const unsigned Idx) { return VL[Idx]; });
- V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList);
- }
- if (V.isTreeTinyAndNotFullyVectorizable())
+ V.buildTree(VL, IgnoreList);
+ if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true))
break;
if (V.isLoadCombineReductionCandidate(RdxKind))
break;
+ V.reorderTopToBottom();
+ V.reorderBottomToTop(/*IgnoreReorder=*/true);
+ V.buildExternalUses(ExternallyUsedValues);
// For a poison-safe boolean logic reduction, do not replace select
// instructions with logic ops. All reduced values will be frozen (see
@@ -7873,7 +8635,7 @@ public:
InstructionCost Cost = TreeCost + ReductionCost;
if (!Cost.isValid()) {
LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n");
- return false;
+ return nullptr;
}
if (Cost >= -SLPCostThreshold) {
V.getORE()->emit([&]() {
@@ -7953,7 +8715,7 @@ public:
// vector reductions.
V.eraseInstructions(IgnoreList);
}
- return VectorizedTree != nullptr;
+ return VectorizedTree;
}
unsigned numReductionValues() const { return ReducedVals.size(); }
@@ -7963,6 +8725,7 @@ private:
InstructionCost getReductionCost(TargetTransformInfo *TTI,
Value *FirstReducedVal, unsigned ReduxWidth,
FastMathFlags FMF) {
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
Type *ScalarTy = FirstReducedVal->getType();
FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth);
InstructionCost VectorCost, ScalarCost;
@@ -7975,33 +8738,39 @@ private:
case RecurKind::FAdd:
case RecurKind::FMul: {
unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
- VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF);
- ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy);
+ VectorCost =
+ TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
+ ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
break;
}
case RecurKind::FMax:
case RecurKind::FMin: {
+ auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy);
auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
- /*unsigned=*/false);
- ScalarCost =
- TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy) +
- TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
- CmpInst::makeCmpResultType(ScalarTy));
+ /*unsigned=*/false, CostKind);
+ CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
+ ScalarCost = TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy,
+ SclCondTy, RdxPred, CostKind) +
+ TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
+ SclCondTy, RdxPred, CostKind);
break;
}
case RecurKind::SMax:
case RecurKind::SMin:
case RecurKind::UMax:
case RecurKind::UMin: {
+ auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy);
auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
bool IsUnsigned =
RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin;
- VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, IsUnsigned);
- ScalarCost =
- TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy) +
- TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
- CmpInst::makeCmpResultType(ScalarTy));
+ VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, IsUnsigned,
+ CostKind);
+ CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
+ ScalarCost = TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
+ SclCondTy, RdxPred, CostKind) +
+ TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
+ SclCondTy, RdxPred, CostKind);
break;
}
default:
@@ -8023,6 +8792,7 @@ private:
assert(isPowerOf2_32(ReduxWidth) &&
"We only handle power-of-two reductions for now");
+ ++NumVectorInstructions;
return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind,
ReductionOps.back());
}
@@ -8232,32 +9002,45 @@ static bool tryToVectorizeHorReductionOrInstOperands(
// Skip the analysis of CmpInsts. The compiler implements postanalysis of
// the CmpInsts so we can skip extra attempts in
// tryToVectorizeHorReductionOrInstOperands and save compile time.
- SmallVector<std::pair<Instruction *, unsigned>, 8> Stack(1, {Root, 0});
+ std::queue<std::pair<Instruction *, unsigned>> Stack;
+ Stack.emplace(Root, 0);
SmallPtrSet<Value *, 8> VisitedInstrs;
+ SmallVector<WeakTrackingVH> PostponedInsts;
bool Res = false;
+ auto &&TryToReduce = [TTI, &P, &R](Instruction *Inst, Value *&B0,
+ Value *&B1) -> Value * {
+ bool IsBinop = matchRdxBop(Inst, B0, B1);
+ bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value()));
+ if (IsBinop || IsSelect) {
+ HorizontalReduction HorRdx;
+ if (HorRdx.matchAssociativeReduction(P, Inst))
+ return HorRdx.tryToReduce(R, TTI);
+ }
+ return nullptr;
+ };
while (!Stack.empty()) {
Instruction *Inst;
unsigned Level;
- std::tie(Inst, Level) = Stack.pop_back_val();
+ std::tie(Inst, Level) = Stack.front();
+ Stack.pop();
// Do not try to analyze instruction that has already been vectorized.
// This may happen when we vectorize instruction operands on a previous
// iteration while stack was populated before that happened.
if (R.isDeleted(Inst))
continue;
- Value *B0, *B1;
- bool IsBinop = matchRdxBop(Inst, B0, B1);
- bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value()));
- if (IsBinop || IsSelect) {
- HorizontalReduction HorRdx;
- if (HorRdx.matchAssociativeReduction(P, Inst)) {
- if (HorRdx.tryToReduce(R, TTI)) {
- Res = true;
- // Set P to nullptr to avoid re-analysis of phi node in
- // matchAssociativeReduction function unless this is the root node.
- P = nullptr;
- continue;
- }
+ Value *B0 = nullptr, *B1 = nullptr;
+ if (Value *V = TryToReduce(Inst, B0, B1)) {
+ Res = true;
+ // Set P to nullptr to avoid re-analysis of phi node in
+ // matchAssociativeReduction function unless this is the root node.
+ P = nullptr;
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ // Try to find another reduction.
+ Stack.emplace(I, Level);
+ continue;
}
+ } else {
+ bool IsBinop = B0 && B1;
if (P && IsBinop) {
Inst = dyn_cast<Instruction>(B0);
if (Inst == P)
@@ -8269,14 +9052,14 @@ static bool tryToVectorizeHorReductionOrInstOperands(
continue;
}
}
- }
- // Set P to nullptr to avoid re-analysis of phi node in
- // matchAssociativeReduction function unless this is the root node.
- P = nullptr;
- // Do not try to vectorize CmpInst operands, this is done separately.
- if (!isa<CmpInst>(Inst) && Vectorize(Inst, R)) {
- Res = true;
- continue;
+ // Set P to nullptr to avoid re-analysis of phi node in
+ // matchAssociativeReduction function unless this is the root node.
+ P = nullptr;
+ // Do not try to vectorize CmpInst operands; this is done separately.
+ // The final attempt at binop-argument vectorization happens after the
+ // loop, so that reductions are found first.
+ if (!isa<CmpInst>(Inst))
+ PostponedInsts.push_back(Inst);
}
// Try to vectorize operands.
@@ -8290,8 +9073,13 @@ static bool tryToVectorizeHorReductionOrInstOperands(
// separately.
if (!isa<PHINode>(I) && !isa<CmpInst>(I) && !R.isDeleted(I) &&
I->getParent() == BB)
- Stack.emplace_back(I, Level);
+ Stack.emplace(I, Level);
}
+ // Try to vectorize binops where reductions were not found.
+ for (Value *V : PostponedInsts)
+ if (auto *Inst = dyn_cast<Instruction>(V))
+ if (!R.isDeleted(Inst))
+ Res |= Vectorize(Inst, R);
return Res;
}
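The reworked loop above defers plain binop vectorization until every reduction attempt has run. A compilable sketch of that "try the preferred transform first, postpone the fallback" worklist shape, with toy stand-ins (Node, tryReduce, vectorizeBinop are illustrative, not LLVM API; the real code also guards against revisits and deleted values):

    #include <cstdio>
    #include <queue>
    #include <utility>
    #include <vector>

    // Node plays the role of llvm::Instruction; tryReduce and vectorizeBinop
    // stand in for the reduction matcher and the generic binop attempt.
    struct Node {
      int Id;
      std::vector<Node *> Ops;
    };

    static bool tryReduce(Node *N) { return N->Id % 2 == 0; }
    static bool vectorizeBinop(Node *N) { return N->Id > 10; }

    static bool processTree(Node *Root) {
      // FIFO worklist, mirroring the switch from SmallVector to std::queue.
      std::queue<std::pair<Node *, unsigned>> Worklist;
      Worklist.emplace(Root, 0u);
      std::vector<Node *> Postponed;
      bool Changed = false;
      while (!Worklist.empty()) {
        auto [N, Level] = Worklist.front();
        Worklist.pop();
        if (tryReduce(N))
          Changed = true; // A reduction was formed; keep scanning.
        else
          Postponed.push_back(N); // Fallback runs only after the loop.
        for (Node *Op : N->Ops)
          Worklist.emplace(Op, Level + 1);
      }
      // Generic binop vectorization runs last so it cannot consume
      // instructions that a later-discovered reduction could have used.
      for (Node *N : Postponed)
        Changed |= vectorizeBinop(N);
      return Changed;
    }

    int main() {
      Node C{7, {}}, B{12, {&C}}, A{3, {&B}};
      std::printf("changed=%d\n", processTree(&A));
    }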
@@ -8326,7 +9114,7 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
// An aggregate value is unlikely to be processed in a vector register; we
// need to extract the scalars into scalar registers.
- return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false);
+ return tryToVectorizeList(BuildVectorOpds, R);
}
bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
@@ -8337,11 +9125,11 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
(llvm::all_of(BuildVectorOpds,
[](Value *V) { return isa<ExtractElementInst>(V); }) &&
- isShuffle(BuildVectorOpds, Mask)))
+ isFixedVectorShuffle(BuildVectorOpds, Mask)))
return false;
LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
- return tryToVectorizeList(BuildVectorInsts, R, /*AllowReorder=*/true);
+ return tryToVectorizeList(BuildVectorInsts, R);
}
bool SLPVectorizerPass::vectorizeSimpleInstructions(
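The isFixedVectorShuffle guard in the hunk above bails out when a buildvector sequence is merely a shuffle of existing vectors, which can be formed as a single shufflevector instead of being SLP-vectorized. A rough sketch of the underlying idea, assuming a simplified Extract record in place of real extractelement instructions (illustrative only):

    #include <cstdio>
    #include <optional>
    #include <vector>

    // Stand-in for a buildvector operand: which source vector it reads and
    // at what constant lane (extractelement source/index in the real code).
    struct Extract {
      int SrcVec;
      int Lane;
    };

    // Returns a shuffle mask if every element is extracted from at most two
    // source vectors, mirroring what one shufflevector can express.
    static std::optional<std::vector<int>>
    asShuffleMask(const std::vector<Extract> &Ops, int VecWidth) {
      int V0 = -1, V1 = -1;
      std::vector<int> Mask;
      for (const Extract &E : Ops) {
        int Base;
        if (V0 == -1 || E.SrcVec == V0) {
          V0 = E.SrcVec;
          Base = 0;
        } else if (V1 == -1 || E.SrcVec == V1) {
          V1 = E.SrcVec;
          Base = VecWidth; // second source occupies lanes [W, 2W)
        } else {
          return std::nullopt; // three distinct sources: not one shuffle
        }
        Mask.push_back(Base + E.Lane);
      }
      return Mask;
    }

    int main() {
      std::vector<Extract> Ops = {{0, 3}, {0, 1}, {1, 2}, {1, 0}};
      if (auto M = asShuffleMask(Ops, 4))
        std::printf("shuffle mask: %d %d %d %d\n", (*M)[0], (*M)[1],
                    (*M)[2], (*M)[3]);
    }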
@@ -8382,6 +9170,78 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions(
return OpsChanged;
}
+template <typename T>
+static bool
+tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming,
+ function_ref<unsigned(T *)> Limit,
+ function_ref<bool(T *, T *)> Comparator,
+ function_ref<bool(T *, T *)> AreCompatible,
+ function_ref<bool(ArrayRef<T *>, bool)> TryToVectorize,
+ bool LimitForRegisterSize) {
+ bool Changed = false;
+ // Sort by type, parent, operands.
+ stable_sort(Incoming, Comparator);
+
+ // Try to vectorize elements based on their type.
+ SmallVector<T *> Candidates;
+ for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;) {
+ // Look for the next elements with the same type, parent and operand
+ // kinds.
+ auto *SameTypeIt = IncIt;
+ while (SameTypeIt != E && AreCompatible(*SameTypeIt, *IncIt))
+ ++SameTypeIt;
+
+ // Try to vectorize them.
+ unsigned NumElts = (SameTypeIt - IncIt);
+ LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
+ << NumElts << ")\n");
+ // The vectorization is a 3-state attempt:
+ // 1. First, try to vectorize instructions with the same/alternate opcodes,
+ // limited to the size of the maximal register.
+ // 2. Then try to vectorize the remaining instructions with the same type,
+ // if possible. This may yield better results than vectorizing only
+ // instructions with the same/alternate opcodes.
+ // 3. Finally, attempt to vectorize all instructions with the
+ // same/alternate opcodes only; this may enable some extra final
+ // vectorization.
+ if (NumElts > 1 &&
+ TryToVectorize(makeArrayRef(IncIt, NumElts), LimitForRegisterSize)) {
+ // Success, start over because instructions might have been changed.
+ Changed = true;
+ } else if (NumElts < Limit(*IncIt) &&
+ (Candidates.empty() ||
+ Candidates.front()->getType() == (*IncIt)->getType())) {
+ Candidates.append(IncIt, std::next(IncIt, NumElts));
+ }
+ // Final attempt to vectorize instructions with the same types.
+ if (Candidates.size() > 1 &&
+ (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
+ if (TryToVectorize(Candidates, /*LimitForRegisterSize=*/false)) {
+ // Success, start over because instructions might have been changed.
+ Changed = true;
+ } else if (LimitForRegisterSize) {
+ // Try to vectorize using small vectors.
+ for (auto *It = Candidates.begin(), *End = Candidates.end();
+ It != End;) {
+ auto *SameTypeIt = It;
+ while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It))
+ ++SameTypeIt;
+ unsigned NumElts = (SameTypeIt - It);
+ if (NumElts > 1 && TryToVectorize(makeArrayRef(It, NumElts),
+ /*LimitForRegisterSize=*/false))
+ Changed = true;
+ It = SameTypeIt;
+ }
+ }
+ Candidates.clear();
+ }
+
+ // Start over at the next instruction of a different type (or the end).
+ IncIt = SameTypeIt;
+ }
+ return Changed;
+}
+
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
bool Changed = false;
SmallVector<Value *, 4> Incoming;
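tryToVectorizeSequence, added above, factors out the sort-group-retry loop that was previously duplicated for PHIs and stores: sort the list, sweep out maximal runs of compatible elements, try each run, and collect short runs into a final less-constrained attempt. A minimal instantiation over plain ints showing the expected callback shapes (it simplifies away the per-type Candidates boundary handling; all names here are illustrative, not LLVM API):

    #include <algorithm>
    #include <cstdio>
    #include <functional>
    #include <vector>

    template <typename T>
    static bool vectorizeSequence(
        std::vector<T *> &Incoming, std::function<bool(T *, T *)> Less,
        std::function<bool(T *, T *)> Compatible,
        std::function<bool(const std::vector<T *> &)> Try, unsigned MinRun) {
      bool Changed = false;
      // Sort so that compatible elements become adjacent.
      std::stable_sort(Incoming.begin(), Incoming.end(), Less);
      std::vector<T *> Leftovers;
      for (auto It = Incoming.begin(); It != Incoming.end();) {
        // Extend the run while elements stay compatible with its head.
        auto RunEnd = It;
        while (RunEnd != Incoming.end() && Compatible(*RunEnd, *It))
          ++RunEnd;
        std::vector<T *> Run(It, RunEnd);
        if (Run.size() >= MinRun && Try(Run))
          Changed = true;
        else
          Leftovers.insert(Leftovers.end(), Run.begin(), Run.end());
        It = RunEnd;
      }
      // Final, less constrained attempt over everything that failed.
      if (Leftovers.size() >= MinRun)
        Changed |= Try(Leftovers);
      return Changed;
    }

    int main() {
      int A = 1, B = 1, C = 2;
      std::vector<int *> In = {&A, &C, &B};
      bool Changed = vectorizeSequence<int>(
          In, [](int *X, int *Y) { return *X < *Y; },
          [](int *X, int *Y) { return *X == *Y; },
          [](const std::vector<int *> &Run) { return Run.size() > 1; }, 2);
      std::printf("changed=%d\n", Changed);
    }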
@@ -8390,11 +9250,89 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// node. This allows us to better identify the chains that can be
// vectorized most effectively.
DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
+ auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
+ assert(isValidElementType(V1->getType()) &&
+ isValidElementType(V2->getType()) &&
+ "Expected vectorizable types only.");
+ // It is fine to compare type IDs here, since we expect only vectorizable
+ // types, like ints, floats and pointers; we don't care about other types.
+ if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
+ return true;
+ if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
+ return false;
+ ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
+ ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
+ if (Opcodes1.size() < Opcodes2.size())
+ return true;
+ if (Opcodes1.size() > Opcodes2.size())
+ return false;
+ for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
+ // Undefs are compatible with any other value.
+ if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
+ continue;
+ if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
+ if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
+ DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
+ DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
+ if (!NodeI1)
+ return NodeI2 != nullptr;
+ if (!NodeI2)
+ return false;
+ assert((NodeI1 == NodeI2) ==
+ (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
+ "Different nodes should have different DFS numbers");
+ if (NodeI1 != NodeI2)
+ return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
+ InstructionsState S = getSameOpcode({I1, I2});
+ if (S.getOpcode())
+ continue;
+ return I1->getOpcode() < I2->getOpcode();
+ }
+ if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
+ continue;
+ if (Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID())
+ return true;
+ if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID())
+ return false;
+ }
+ return false;
+ };
+ auto AreCompatiblePHIs = [&PHIToOpcodes](Value *V1, Value *V2) {
+ if (V1 == V2)
+ return true;
+ if (V1->getType() != V2->getType())
+ return false;
+ ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
+ ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
+ if (Opcodes1.size() != Opcodes2.size())
+ return false;
+ for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
+ // Undefs are compatible with any other value.
+ if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
+ continue;
+ if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
+ if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
+ if (I1->getParent() != I2->getParent())
+ return false;
+ InstructionsState S = getSameOpcode({I1, I2});
+ if (S.getOpcode())
+ continue;
+ return false;
+ }
+ if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
+ continue;
+ if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
+ return false;
+ }
+ return true;
+ };
+ auto Limit = [&R](Value *V) {
+ unsigned EltSize = R.getVectorElementSize(V);
+ return std::max(2U, R.getMaxVecRegSize() / EltSize);
+ };
- bool HaveVectorizedPhiNodes = true;
- while (HaveVectorizedPhiNodes) {
- HaveVectorizedPhiNodes = false;
-
+ bool HaveVectorizedPhiNodes = false;
+ do {
// Collect the incoming values from the PHIs.
Incoming.clear();
for (Instruction &I : *BB) {
@@ -8432,132 +9370,15 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
}
}
- // Sort by type, parent, operands.
- stable_sort(Incoming, [this, &PHIToOpcodes](Value *V1, Value *V2) {
- assert(isValidElementType(V1->getType()) &&
- isValidElementType(V2->getType()) &&
- "Expected vectorizable types only.");
- // It is fine to compare type IDs here, since we expect only vectorizable
- // types, like ints, floats and pointers, we don't care about other type.
- if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
- return true;
- if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
- return false;
- ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
- ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
- if (Opcodes1.size() < Opcodes2.size())
- return true;
- if (Opcodes1.size() > Opcodes2.size())
- return false;
- for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
- // Undefs are compatible with any other value.
- if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
- continue;
- if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
- if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
- DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
- DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
- if (!NodeI1)
- return NodeI2 != nullptr;
- if (!NodeI2)
- return false;
- assert((NodeI1 == NodeI2) ==
- (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
- "Different nodes should have different DFS numbers");
- if (NodeI1 != NodeI2)
- return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
- InstructionsState S = getSameOpcode({I1, I2});
- if (S.getOpcode())
- continue;
- return I1->getOpcode() < I2->getOpcode();
- }
- if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
- continue;
- if (Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID())
- return true;
- if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID())
- return false;
- }
- return false;
- });
-
- auto &&AreCompatiblePHIs = [&PHIToOpcodes](Value *V1, Value *V2) {
- if (V1 == V2)
- return true;
- if (V1->getType() != V2->getType())
- return false;
- ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
- ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
- if (Opcodes1.size() != Opcodes2.size())
- return false;
- for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
- // Undefs are compatible with any other value.
- if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
- continue;
- if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
- if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
- if (I1->getParent() != I2->getParent())
- return false;
- InstructionsState S = getSameOpcode({I1, I2});
- if (S.getOpcode())
- continue;
- return false;
- }
- if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
- continue;
- if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
- return false;
- }
- return true;
- };
-
- // Try to vectorize elements base on their type.
- SmallVector<Value *, 4> Candidates;
- for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
- E = Incoming.end();
- IncIt != E;) {
-
- // Look for the next elements with the same type, parent and operand
- // kinds.
- SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
- while (SameTypeIt != E && AreCompatiblePHIs(*SameTypeIt, *IncIt)) {
- VisitedInstrs.insert(*SameTypeIt);
- ++SameTypeIt;
- }
-
- // Try to vectorize them.
- unsigned NumElts = (SameTypeIt - IncIt);
- LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at PHIs ("
- << NumElts << ")\n");
- // The order in which the phi nodes appear in the program does not matter.
- // So allow tryToVectorizeList to reorder them if it is beneficial. This
- // is done when there are exactly two elements since tryToVectorizeList
- // asserts that there are only two values when AllowReorder is true.
- if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R,
- /*AllowReorder=*/true)) {
- // Success start over because instructions might have been changed.
- HaveVectorizedPhiNodes = true;
- Changed = true;
- } else if (NumElts < 4 &&
- (Candidates.empty() ||
- Candidates.front()->getType() == (*IncIt)->getType())) {
- Candidates.append(IncIt, std::next(IncIt, NumElts));
- }
- // Final attempt to vectorize phis with the same types.
- if (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType()) {
- if (Candidates.size() > 1 &&
- tryToVectorizeList(Candidates, R, /*AllowReorder=*/true)) {
- // Success start over because instructions might have been changed.
- HaveVectorizedPhiNodes = true;
- Changed = true;
- }
- Candidates.clear();
- }
-
- // Start over at the next instruction of a different type (or the end).
- IncIt = SameTypeIt;
- }
- }
+ HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
+ Incoming, Limit, PHICompare, AreCompatiblePHIs,
+ [this, &R](ArrayRef<Value *> Candidates, bool LimitForRegisterSize) {
+ return tryToVectorizeList(Candidates, R, LimitForRegisterSize);
+ },
+ /*LimitForRegisterSize=*/true);
+ Changed |= HaveVectorizedPhiNodes;
+ VisitedInstrs.insert(Incoming.begin(), Incoming.end());
+ } while (HaveVectorizedPhiNodes);
VisitedInstrs.clear();
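Note that stable_sort requires PHICompare to be a strict weak ordering, which is why equivalent values (same type ID, compatible opcode lists) compare as not-less in both directions rather than defaulting to true. A small self-check of that property on a simplified two-field key (illustrative, not the real comparator):

    #include <algorithm>
    #include <cassert>
    #include <vector>

    // Simplified stand-in for the (type ID, opcode-list size) sort key used
    // by PHICompare; equivalent keys must be "not less" in both directions.
    struct Key {
      unsigned TypeID;
      unsigned NumOps;
    };

    static bool less(const Key &A, const Key &B) {
      if (A.TypeID != B.TypeID)
        return A.TypeID < B.TypeID;
      return A.NumOps < B.NumOps; // never true for equivalent keys
    }

    int main() {
      std::vector<Key> Keys = {{2, 1}, {1, 3}, {1, 3}, {1, 2}};
      std::stable_sort(Keys.begin(), Keys.end(), less);
      // Antisymmetry spot-check: required for stable_sort to be valid.
      for (const Key &A : Keys)
        for (const Key &B : Keys)
          assert(!(less(A, B) && less(B, A)));
      assert(Keys.front().TypeID == 1 && Keys.back().TypeID == 2);
    }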
@@ -8810,6 +9631,10 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
return V1->getValueOperand()->getValueID() ==
V2->getValueOperand()->getValueID();
};
+ auto Limit = [&R, this](StoreInst *SI) {
+ unsigned EltSize = DL->getTypeSizeInBits(SI->getValueOperand()->getType());
+ return R.getMinVF(EltSize);
+ };
// Attempt to sort and vectorize each of the store-groups.
for (auto &Pair : Stores) {
@@ -8819,33 +9644,15 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
<< Pair.second.size() << ".\n");
- stable_sort(Pair.second, StoreSorter);
-
- // Try to vectorize elements based on their compatibility.
- for (ArrayRef<StoreInst *>::iterator IncIt = Pair.second.begin(),
- E = Pair.second.end();
- IncIt != E;) {
-
- // Look for the next elements with the same type.
- ArrayRef<StoreInst *>::iterator SameTypeIt = IncIt;
- Type *EltTy = (*IncIt)->getPointerOperand()->getType();
-
- while (SameTypeIt != E && AreCompatibleStores(*SameTypeIt, *IncIt))
- ++SameTypeIt;
-
- // Try to vectorize them.
- unsigned NumElts = (SameTypeIt - IncIt);
- LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at stores ("
- << NumElts << ")\n");
- if (NumElts > 1 && !EltTy->getPointerElementType()->isVectorTy() &&
- vectorizeStores(makeArrayRef(IncIt, NumElts), R)) {
- // Success start over because instructions might have been changed.
- Changed = true;
- }
+ if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
+ continue;
- // Start over at the next instruction of a different type (or the end).
- IncIt = SameTypeIt;
- }
+ Changed |= tryToVectorizeSequence<StoreInst>(
+ Pair.second, Limit, StoreSorter, AreCompatibleStores,
+ [this, &R](ArrayRef<StoreInst *> Candidates, bool) {
+ return vectorizeStores(Candidates, R);
+ },
+ /*LimitForRegisterSize=*/false);
}
return Changed;
}
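The new Limit callback for stores derives the minimum worthwhile group size from the stored element's width, so narrow stores must come in larger groups before a vectorization attempt is made. A rough numeric sketch of that relationship, assuming a 128-bit minimum vector register (the real values come from BoUpSLP::getMinVF and the target):

    #include <algorithm>
    #include <cstdio>

    // Hypothetical mirror of the Limit lambda for stores: given a minimum
    // vector register width in bits and the stored element's size, compute
    // the smallest lane count worth forming.
    static unsigned minVFForStore(unsigned MinVecRegBits, unsigned EltSizeBits) {
      return std::max(2u, MinVecRegBits / EltSizeBits);
    }

    int main() {
      std::printf("i8 stores : VF >= %u\n", minVFForStore(128, 8));  // 16
      std::printf("i32 stores: VF >= %u\n", minVFForStore(128, 32)); // 4
      std::printf("i64 stores: VF >= %u\n", minVFForStore(128, 64)); // 2
    }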