| | | |
|---|---|---|
| author | Dimitry Andric <dim@FreeBSD.org> | 2022-03-20 11:40:34 +0000 |
| committer | Dimitry Andric <dim@FreeBSD.org> | 2022-05-14 11:43:05 +0000 |
| commit | 349cc55c9796c4596a5b9904cd3281af295f878f (patch) | |
| tree | 410c5a785075730a35f1272ca6a7adf72222ad03 /contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | |
| parent | cb2ae6163174b90e999326ecec3699ee093a5d43 (diff) | |
| parent | c0981da47d5696fe36474fcf86b4ce03ae3ff818 (diff) |
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2381 |
1 file changed, 1594 insertions, 787 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 1d06bc7d79a7..e3ef0b794f68 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -21,6 +21,7 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/PriorityQueue.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SetVector.h" @@ -200,12 +201,39 @@ static bool isValidElementType(Type *Ty) { !Ty->isPPC_FP128Ty(); } +/// \returns True if the value is a constant (but not globals/constant +/// expressions). +static bool isConstant(Value *V) { + return isa<Constant>(V) && !isa<ConstantExpr>(V) && !isa<GlobalValue>(V); +} + +/// Checks if \p V is one of vector-like instructions, i.e. undef, +/// insertelement/extractelement with constant indices for fixed vector type or +/// extractvalue instruction. +static bool isVectorLikeInstWithConstOps(Value *V) { + if (!isa<InsertElementInst, ExtractElementInst>(V) && + !isa<ExtractValueInst, UndefValue>(V)) + return false; + auto *I = dyn_cast<Instruction>(V); + if (!I || isa<ExtractValueInst>(I)) + return true; + if (!isa<FixedVectorType>(I->getOperand(0)->getType())) + return false; + if (isa<ExtractElementInst>(I)) + return isConstant(I->getOperand(1)); + assert(isa<InsertElementInst>(V) && "Expected only insertelement."); + return isConstant(I->getOperand(2)); +} + /// \returns true if all of the instructions in \p VL are in the same block or /// false otherwise. static bool allSameBlock(ArrayRef<Value *> VL) { Instruction *I0 = dyn_cast<Instruction>(VL[0]); if (!I0) return false; + if (all_of(VL, isVectorLikeInstWithConstOps)) + return true; + BasicBlock *BB = I0->getParent(); for (int I = 1, E = VL.size(); I < E; I++) { auto *II = dyn_cast<Instruction>(VL[I]); @@ -218,12 +246,6 @@ static bool allSameBlock(ArrayRef<Value *> VL) { return true; } -/// \returns True if the value is a constant (but not globals/constant -/// expressions). -static bool isConstant(Value *V) { - return isa<Constant>(V) && !isa<ConstantExpr>(V) && !isa<GlobalValue>(V); -} - /// \returns True if all of the values in \p VL are constants (but not /// globals/constant expressions). static bool allConstant(ArrayRef<Value *> VL) { @@ -232,12 +254,21 @@ static bool allConstant(ArrayRef<Value *> VL) { return all_of(VL, isConstant); } -/// \returns True if all of the values in \p VL are identical. +/// \returns True if all of the values in \p VL are identical or some of them +/// are UndefValue. static bool isSplat(ArrayRef<Value *> VL) { - for (unsigned i = 1, e = VL.size(); i < e; ++i) - if (VL[i] != VL[0]) + Value *FirstNonUndef = nullptr; + for (Value *V : VL) { + if (isa<UndefValue>(V)) + continue; + if (!FirstNonUndef) { + FirstNonUndef = V; + continue; + } + if (V != FirstNonUndef) return false; - return true; + } + return FirstNonUndef != nullptr; } /// \returns True if \p I is commutative, handles CmpInst and BinaryOperator. @@ -295,8 +326,10 @@ static bool isCommutative(Instruction *I) { /// TODO: Can we split off and reuse the shuffle mask detection from /// TargetTransformInfo::getInstructionThroughput? 
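The rewritten isSplat above relaxes the old "all elements identical" rule: UndefValue lanes are now ignored, and the list counts as a splat when every defined lane is the same value and at least one lane is defined. A minimal standalone sketch of that rule, modelling an undef lane as an empty std::optional instead of LLVM's UndefValue (the name isSplatLike is illustrative, not an LLVM API):

```cpp
#include <cassert>
#include <optional>
#include <vector>

// Splat check mirroring the relaxed rule: undefined lanes (nullopt here,
// UndefValue in LLVM) are skipped, and all remaining lanes must be equal.
// At least one lane has to be defined for the list to count as a splat.
static bool isSplatLike(const std::vector<std::optional<int>> &VL) {
  const std::optional<int> *FirstNonUndef = nullptr;
  for (const auto &V : VL) {
    if (!V)
      continue;               // skip "undef" lanes
    if (!FirstNonUndef) {
      FirstNonUndef = &V;     // remember the first defined lane
      continue;
    }
    if (*V != **FirstNonUndef)
      return false;           // two different defined values -> not a splat
  }
  return FirstNonUndef != nullptr; // all-undef is not treated as a splat
}

int main() {
  assert(isSplatLike({7, std::nullopt, 7, 7}));       // undef lanes tolerated
  assert(!isSplatLike({7, 3, 7, 7}));                 // mixed defined values
  assert(!isSplatLike({std::nullopt, std::nullopt})); // all undef
  return 0;
}
```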
static Optional<TargetTransformInfo::ShuffleKind> -isShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) { +isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) { auto *EI0 = cast<ExtractElementInst>(VL[0]); + if (isa<ScalableVectorType>(EI0->getVectorOperandType())) + return None; unsigned Size = cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements(); Value *Vec1 = nullptr; @@ -504,7 +537,7 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, case Instruction::Call: { CallInst *CI = cast<CallInst>(UserInst); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { + for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) { if (hasVectorInstrinsicScalarOpd(ID, i)) return (CI->getArgOperand(i) == Scalar); } @@ -535,13 +568,67 @@ static bool isSimple(Instruction *I) { return true; } +/// Shuffles \p Mask in accordance with the given \p SubMask. +static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) { + if (SubMask.empty()) + return; + if (Mask.empty()) { + Mask.append(SubMask.begin(), SubMask.end()); + return; + } + SmallVector<int> NewMask(SubMask.size(), UndefMaskElem); + int TermValue = std::min(Mask.size(), SubMask.size()); + for (int I = 0, E = SubMask.size(); I < E; ++I) { + if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem || + Mask[SubMask[I]] >= TermValue) + continue; + NewMask[I] = Mask[SubMask[I]]; + } + Mask.swap(NewMask); +} + +/// Order may have elements assigned special value (size) which is out of +/// bounds. Such indices only appear on places which correspond to undef values +/// (see canReuseExtract for details) and used in order to avoid undef values +/// have effect on operands ordering. +/// The first loop below simply finds all unused indices and then the next loop +/// nest assigns these indices for undef values positions. +/// As an example below Order has two undef positions and they have assigned +/// values 3 and 7 respectively: +/// before: 6 9 5 4 9 2 1 0 +/// after: 6 3 5 4 7 2 1 0 +static void fixupOrderingIndices(SmallVectorImpl<unsigned> &Order) { + const unsigned Sz = Order.size(); + SmallBitVector UsedIndices(Sz); + SmallVector<int> MaskedIndices; + for (unsigned I = 0; I < Sz; ++I) { + if (Order[I] < Sz) + UsedIndices.set(Order[I]); + else + MaskedIndices.push_back(I); + } + if (MaskedIndices.empty()) + return; + SmallVector<int> AvailableIndices(MaskedIndices.size()); + unsigned Cnt = 0; + int Idx = UsedIndices.find_first(); + do { + AvailableIndices[Cnt] = Idx; + Idx = UsedIndices.find_next(Idx); + ++Cnt; + } while (Idx > 0); + assert(Cnt == MaskedIndices.size() && "Non-synced masked/available indices."); + for (int I = 0, E = MaskedIndices.size(); I < E; ++I) + Order[MaskedIndices[I]] = AvailableIndices[I]; +} + namespace llvm { static void inversePermutation(ArrayRef<unsigned> Indices, SmallVectorImpl<int> &Mask) { Mask.clear(); const unsigned E = Indices.size(); - Mask.resize(E, E + 1); + Mask.resize(E, UndefMaskElem); for (unsigned I = 0; I < E; ++I) Mask[Indices[I]] = I; } @@ -581,6 +668,22 @@ static Optional<int> getInsertIndex(Value *InsertInst, unsigned Offset) { return Index; } +/// Reorders the list of scalars in accordance with the given \p Order and then +/// the \p Mask. \p Order - is the original order of the scalars, need to +/// reorder scalars into an unordered state at first according to the given +/// order. 
Then the ordered scalars are shuffled once again in accordance with +/// the provided mask. +static void reorderScalars(SmallVectorImpl<Value *> &Scalars, + ArrayRef<int> Mask) { + assert(!Mask.empty() && "Expected non-empty mask."); + SmallVector<Value *> Prev(Scalars.size(), + UndefValue::get(Scalars.front()->getType())); + Prev.swap(Scalars); + for (unsigned I = 0, E = Prev.size(); I < E; ++I) + if (Mask[I] != UndefMaskElem) + Scalars[Mask[I]] = Prev[I]; +} + namespace slpvectorizer { /// Bottom Up SLP Vectorizer. @@ -645,13 +748,12 @@ public: void buildTree(ArrayRef<Value *> Roots, ArrayRef<Value *> UserIgnoreLst = None); - /// Construct a vectorizable tree that starts at \p Roots, ignoring users for - /// the purpose of scheduling and extraction in the \p UserIgnoreLst taking - /// into account (and updating it, if required) list of externally used - /// values stored in \p ExternallyUsedValues. - void buildTree(ArrayRef<Value *> Roots, - ExtraValueToDebugLocsMap &ExternallyUsedValues, - ArrayRef<Value *> UserIgnoreLst = None); + /// Builds external uses of the vectorized scalars, i.e. the list of + /// vectorized scalars to be extracted, their lanes and their scalar users. \p + /// ExternallyUsedValues contains additional list of external uses to handle + /// vectorization of reductions. + void + buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {}); /// Clear the internal data structures that are created by 'buildTree'. void deleteTree() { @@ -659,8 +761,6 @@ public: ScalarToTreeEntry.clear(); MustGather.clear(); ExternalUses.clear(); - NumOpsWantToKeepOrder.clear(); - NumOpsWantToKeepOriginalOrder = 0; for (auto &Iter : BlocksSchedules) { BlockScheduling *BS = Iter.second.get(); BS->clear(); @@ -674,103 +774,28 @@ public: /// Perform LICM and CSE on the newly generated gather sequences. void optimizeGatherSequence(); - /// \returns The best order of instructions for vectorization. - Optional<ArrayRef<unsigned>> bestOrder() const { - assert(llvm::all_of( - NumOpsWantToKeepOrder, - [this](const decltype(NumOpsWantToKeepOrder)::value_type &D) { - return D.getFirst().size() == - VectorizableTree[0]->Scalars.size(); - }) && - "All orders must have the same size as number of instructions in " - "tree node."); - auto I = std::max_element( - NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(), - [](const decltype(NumOpsWantToKeepOrder)::value_type &D1, - const decltype(NumOpsWantToKeepOrder)::value_type &D2) { - return D1.second < D2.second; - }); - if (I == NumOpsWantToKeepOrder.end() || - I->getSecond() <= NumOpsWantToKeepOriginalOrder) - return None; - - return makeArrayRef(I->getFirst()); - } - - /// Builds the correct order for root instructions. - /// If some leaves have the same instructions to be vectorized, we may - /// incorrectly evaluate the best order for the root node (it is built for the - /// vector of instructions without repeated instructions and, thus, has less - /// elements than the root node). This function builds the correct order for - /// the root node. - /// For example, if the root node is \<a+b, a+c, a+d, f+e\>, then the leaves - /// are \<a, a, a, f\> and \<b, c, d, e\>. When we try to vectorize the first - /// leaf, it will be shrink to \<a, b\>. If instructions in this leaf should - /// be reordered, the best order will be \<1, 0\>. We need to extend this - /// order for the root node. For the root node this order should look like - /// \<3, 0, 1, 2\>. This function extends the order for the reused - /// instructions. 
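The fixupOrderingIndices helper introduced in this hunk replaces the out-of-bounds placeholders in an ordering (which mark lanes that come from undef values) with whatever indices are not otherwise used. A self-contained sketch of that behaviour, using std::vector in place of SmallVector/SmallBitVector and reproducing the "6 9 5 4 9 2 1 0 → 6 3 5 4 7 2 1 0" example from the comment above:

```cpp
#include <cassert>
#include <vector>

// Replace every out-of-range entry (>= Order.size(), i.e. an "undef" slot)
// with one of the indices that no in-range entry uses, in ascending order.
static void fixupOrderingIndicesSketch(std::vector<unsigned> &Order) {
  const unsigned Sz = Order.size();
  std::vector<bool> Used(Sz, false);
  std::vector<unsigned> MaskedPositions; // positions holding placeholders
  for (unsigned I = 0; I < Sz; ++I) {
    if (Order[I] < Sz)
      Used[Order[I]] = true;
    else
      MaskedPositions.push_back(I);
  }
  if (MaskedPositions.empty())
    return;
  // Hand out the unused indices to the placeholder positions, left to right.
  unsigned NextUnused = 0;
  for (unsigned Pos : MaskedPositions) {
    while (Used[NextUnused])
      ++NextUnused;
    Order[Pos] = NextUnused++;
  }
}

int main() {
  // Example from the comment: the placeholders (9 >= size 8) become 3 and 7.
  std::vector<unsigned> Order = {6, 9, 5, 4, 9, 2, 1, 0};
  fixupOrderingIndicesSketch(Order);
  assert((Order == std::vector<unsigned>{6, 3, 5, 4, 7, 2, 1, 0}));
  return 0;
}
```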
- void findRootOrder(OrdersType &Order) { - // If the leaf has the same number of instructions to vectorize as the root - // - order must be set already. - unsigned RootSize = VectorizableTree[0]->Scalars.size(); - if (Order.size() == RootSize) - return; - SmallVector<unsigned, 4> RealOrder(Order.size()); - std::swap(Order, RealOrder); - SmallVector<int, 4> Mask; - inversePermutation(RealOrder, Mask); - Order.assign(Mask.begin(), Mask.end()); - // The leaf has less number of instructions - need to find the true order of - // the root. - // Scan the nodes starting from the leaf back to the root. - const TreeEntry *PNode = VectorizableTree.back().get(); - SmallVector<const TreeEntry *, 4> Nodes(1, PNode); - SmallPtrSet<const TreeEntry *, 4> Visited; - while (!Nodes.empty() && Order.size() != RootSize) { - const TreeEntry *PNode = Nodes.pop_back_val(); - if (!Visited.insert(PNode).second) - continue; - const TreeEntry &Node = *PNode; - for (const EdgeInfo &EI : Node.UserTreeIndices) - if (EI.UserTE) - Nodes.push_back(EI.UserTE); - if (Node.ReuseShuffleIndices.empty()) - continue; - // Build the order for the parent node. - OrdersType NewOrder(Node.ReuseShuffleIndices.size(), RootSize); - SmallVector<unsigned, 4> OrderCounter(Order.size(), 0); - // The algorithm of the order extension is: - // 1. Calculate the number of the same instructions for the order. - // 2. Calculate the index of the new order: total number of instructions - // with order less than the order of the current instruction + reuse - // number of the current instruction. - // 3. The new order is just the index of the instruction in the original - // vector of the instructions. - for (unsigned I : Node.ReuseShuffleIndices) - ++OrderCounter[Order[I]]; - SmallVector<unsigned, 4> CurrentCounter(Order.size(), 0); - for (unsigned I = 0, E = Node.ReuseShuffleIndices.size(); I < E; ++I) { - unsigned ReusedIdx = Node.ReuseShuffleIndices[I]; - unsigned OrderIdx = Order[ReusedIdx]; - unsigned NewIdx = 0; - for (unsigned J = 0; J < OrderIdx; ++J) - NewIdx += OrderCounter[J]; - NewIdx += CurrentCounter[OrderIdx]; - ++CurrentCounter[OrderIdx]; - assert(NewOrder[NewIdx] == RootSize && - "The order index should not be written already."); - NewOrder[NewIdx] = I; - } - std::swap(Order, NewOrder); - } - assert(Order.size() == RootSize && - "Root node is expected or the size of the order must be the same as " - "the number of elements in the root node."); - assert(llvm::all_of(Order, - [RootSize](unsigned Val) { return Val != RootSize; }) && - "All indices must be initialized"); - } + /// Checks if the specified gather tree entry \p TE can be represented as a + /// shuffled vector entry + (possibly) permutation with other gathers. It + /// implements the checks only for possibly ordered scalars (Loads, + /// ExtractElement, ExtractValue), which can be part of the graph. + Optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE); + + /// Reorders the current graph to the most profitable order starting from the + /// root node to the leaf nodes. The best order is chosen only from the nodes + /// of the same size (vectorization factor). Smaller nodes are considered + /// parts of subgraph with smaller VF and they are reordered independently. We + /// can make it because we still need to extend smaller nodes to the wider VF + /// and we can merge reordering shuffles with the widening shuffles. + void reorderTopToBottom(); + + /// Reorders the current graph to the most profitable order starting from + /// leaves to the root. 
It allows to rotate small subgraphs and reduce the + /// number of reshuffles if the leaf nodes use the same order. In this case we + /// can merge the orders and just shuffle user node instead of shuffling its + /// operands. Plus, even the leaf nodes have different orders, it allows to + /// sink reordering in the graph closer to the root node and merge it later + /// during analysis. + void reorderBottomToTop(bool IgnoreReorder = false); /// \return The vector element size in bits to use when vectorizing the /// expression tree ending at \p V. If V is a store, the size is the width of @@ -793,6 +818,10 @@ public: return MinVecRegSize; } + unsigned getMinVF(unsigned Sz) const { + return std::max(2U, getMinVecRegSize() / Sz); + } + unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { unsigned MaxVF = MaxVFOption.getNumOccurrences() ? MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode); @@ -809,7 +838,7 @@ public: /// \returns True if the VectorizableTree is both tiny and not fully /// vectorizable. We do not vectorize such trees. - bool isTreeTinyAndNotFullyVectorizable() const; + bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const; /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values /// can be load combined in the backend. Load combining may not be allowed in @@ -1578,10 +1607,12 @@ private: Value *vectorizeTree(ArrayRef<Value *> VL); /// \returns the scalarization cost for this type. Scalarization in this - /// context means the creation of vectors from a group of scalars. - InstructionCost - getGatherCost(FixedVectorType *Ty, - const DenseSet<unsigned> &ShuffledIndices) const; + /// context means the creation of vectors from a group of scalars. If \p + /// NeedToShuffle is true, need to add a cost of reshuffling some of the + /// vector elements. + InstructionCost getGatherCost(FixedVectorType *Ty, + const DenseSet<unsigned> &ShuffledIndices, + bool NeedToShuffle) const; /// Checks if the gathered \p VL can be represented as shuffle(s) of previous /// tree entries. @@ -1605,7 +1636,7 @@ private: /// \returns whether the VectorizableTree is fully vectorizable and will /// be beneficial even the tree height is tiny. - bool isFullyVectorizableTinyTree() const; + bool isFullyVectorizableTinyTree(bool ForReduction) const; /// Reorder commutative or alt operands to get better probability of /// generating vectorized code. @@ -1621,14 +1652,43 @@ private: /// \returns true if the scalars in VL are equal to this entry. bool isSame(ArrayRef<Value *> VL) const { - if (VL.size() == Scalars.size()) - return std::equal(VL.begin(), VL.end(), Scalars.begin()); - return VL.size() == ReuseShuffleIndices.size() && - std::equal( - VL.begin(), VL.end(), ReuseShuffleIndices.begin(), - [this](Value *V, int Idx) { return V == Scalars[Idx]; }); + auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) { + if (Mask.size() != VL.size() && VL.size() == Scalars.size()) + return std::equal(VL.begin(), VL.end(), Scalars.begin()); + return VL.size() == Mask.size() && + std::equal(VL.begin(), VL.end(), Mask.begin(), + [Scalars](Value *V, int Idx) { + return (isa<UndefValue>(V) && + Idx == UndefMaskElem) || + (Idx != UndefMaskElem && V == Scalars[Idx]); + }); + }; + if (!ReorderIndices.empty()) { + // TODO: implement matching if the nodes are just reordered, still can + // treat the vector as the same if the list of scalars matches VL + // directly, without reordering. 
+ SmallVector<int> Mask; + inversePermutation(ReorderIndices, Mask); + if (VL.size() == Scalars.size()) + return IsSame(Scalars, Mask); + if (VL.size() == ReuseShuffleIndices.size()) { + ::addMask(Mask, ReuseShuffleIndices); + return IsSame(Scalars, Mask); + } + return false; + } + return IsSame(Scalars, ReuseShuffleIndices); } + /// \return Final vectorization factor for the node. Defined by the total + /// number of vectorized scalars, including those, used several times in the + /// entry and counted in the \a ReuseShuffleIndices, if any. + unsigned getVectorFactor() const { + if (!ReuseShuffleIndices.empty()) + return ReuseShuffleIndices.size(); + return Scalars.size(); + }; + /// A vector of scalars. ValueList Scalars; @@ -1701,6 +1761,12 @@ private: } } + /// Reorders operands of the node to the given mask \p Mask. + void reorderOperands(ArrayRef<int> Mask) { + for (ValueList &Operand : Operands) + reorderScalars(Operand, Mask); + } + /// \returns the \p OpIdx operand of this TreeEntry. ValueList &getOperand(unsigned OpIdx) { assert(OpIdx < Operands.size() && "Off bounds"); @@ -1760,19 +1826,14 @@ private: return AltOp ? AltOp->getOpcode() : 0; } - /// Update operations state of this entry if reorder occurred. - bool updateStateIfReorder() { - if (ReorderIndices.empty()) - return false; - InstructionsState S = getSameOpcode(Scalars, ReorderIndices.front()); - setOperations(S); - return true; - } - /// When ReuseShuffleIndices is empty it just returns position of \p V - /// within vector of Scalars. Otherwise, try to remap on its reuse index. + /// When ReuseReorderShuffleIndices is empty it just returns position of \p + /// V within vector of Scalars. Otherwise, try to remap on its reuse index. int findLaneForValue(Value *V) const { unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V)); assert(FoundLane < Scalars.size() && "Couldn't find extract lane"); + if (!ReorderIndices.empty()) + FoundLane = ReorderIndices[FoundLane]; + assert(FoundLane < Scalars.size() && "Couldn't find extract lane"); if (!ReuseShuffleIndices.empty()) { FoundLane = std::distance(ReuseShuffleIndices.begin(), find(ReuseShuffleIndices, FoundLane)); @@ -1856,7 +1917,7 @@ private: TreeEntry *newTreeEntry(ArrayRef<Value *> VL, Optional<ScheduleData *> Bundle, const InstructionsState &S, const EdgeInfo &UserTreeIdx, - ArrayRef<unsigned> ReuseShuffleIndices = None, + ArrayRef<int> ReuseShuffleIndices = None, ArrayRef<unsigned> ReorderIndices = None) { TreeEntry::EntryState EntryState = Bundle ? 
TreeEntry::Vectorize : TreeEntry::NeedToGather; @@ -1869,7 +1930,7 @@ private: Optional<ScheduleData *> Bundle, const InstructionsState &S, const EdgeInfo &UserTreeIdx, - ArrayRef<unsigned> ReuseShuffleIndices = None, + ArrayRef<int> ReuseShuffleIndices = None, ArrayRef<unsigned> ReorderIndices = None) { assert(((!Bundle && EntryState == TreeEntry::NeedToGather) || (Bundle && EntryState != TreeEntry::NeedToGather)) && @@ -1877,12 +1938,25 @@ private: VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree)); TreeEntry *Last = VectorizableTree.back().get(); Last->Idx = VectorizableTree.size() - 1; - Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); Last->State = EntryState; Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end()); - Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end()); - Last->setOperations(S); + if (ReorderIndices.empty()) { + Last->Scalars.assign(VL.begin(), VL.end()); + Last->setOperations(S); + } else { + // Reorder scalars and build final mask. + Last->Scalars.assign(VL.size(), nullptr); + transform(ReorderIndices, Last->Scalars.begin(), + [VL](unsigned Idx) -> Value * { + if (Idx >= VL.size()) + return UndefValue::get(VL.front()->getType()); + return VL[Idx]; + }); + InstructionsState S = getSameOpcode(Last->Scalars); + Last->setOperations(S); + Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end()); + } if (Last->State != TreeEntry::NeedToGather) { for (Value *V : VL) { assert(!getTreeEntry(V) && "Scalar already in tree!"); @@ -1965,12 +2039,9 @@ private: if (result.hasValue()) { return result.getValue(); } - MemoryLocation Loc2 = getLocation(Inst2, AA); bool aliased = true; - if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) { - // Do the alias check. - aliased = !AA->isNoAlias(Loc1, Loc2); - } + if (Loc1.Ptr && isSimple(Inst1)) + aliased = isModOrRefSet(AA->getModRefInfo(Inst2, Loc1)); // Store the result in the cache. result = aliased; return aliased; @@ -2434,14 +2505,6 @@ private: } }; - /// Contains orders of operations along with the number of bundles that have - /// operations in this order. It stores only those orders that require - /// reordering, if reordering is not required it is counted using \a - /// NumOpsWantToKeepOriginalOrder. - DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> NumOpsWantToKeepOrder; - /// Number of bundles that do not require reordering. - unsigned NumOpsWantToKeepOriginalOrder = 0; - // Analysis and block reference. Function *F; ScalarEvolution *SE; @@ -2540,10 +2603,8 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) { std::string Str; raw_string_ostream OS(Str); - if (isSplat(Entry->Scalars)) { - OS << "<splat> " << *Entry->Scalars[0]; - return Str; - } + if (isSplat(Entry->Scalars)) + OS << "<splat> "; for (auto V : Entry->Scalars) { OS << *V; if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) { @@ -2594,21 +2655,539 @@ void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) { }; } -void BoUpSLP::buildTree(ArrayRef<Value *> Roots, - ArrayRef<Value *> UserIgnoreLst) { - ExtraValueToDebugLocsMap ExternallyUsedValues; - buildTree(Roots, ExternallyUsedValues, UserIgnoreLst); +/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses +/// contains original mask for the scalars reused in the node. Procedure +/// transform this mask in accordance with the given \p Mask. 
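reorderReuses, whose body follows, scatters the existing reuse mask through the reordering mask: the value at position I moves to position Mask[I], and positions addressed only by undef mask elements keep their previous contents. A small standalone model of that scatter, with -1 standing in for UndefMaskElem and std::vector for SmallVector (the name reorderReusesSketch is illustrative):

```cpp
#include <cassert>
#include <vector>

constexpr int UndefElem = -1; // stand-in for LLVM's UndefMaskElem

// Scatter-style permutation used by reorderReuses: element I of the old
// mask lands at position Mask[I]; positions not named by Mask keep their
// previous contents.
static void reorderReusesSketch(std::vector<int> &Reuses,
                                const std::vector<int> &Mask) {
  assert(!Mask.empty() && Reuses.size() == Mask.size() &&
         "Expected same-sized, non-empty masks.");
  std::vector<int> Prev = Reuses; // keep the original values around
  for (size_t I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != UndefElem)
      Reuses[Mask[I]] = Prev[I];
}

int main() {
  // Reuse mask <0, 0, 1, 1> reordered by mask <2, 3, 0, 1>:
  // old element 0 goes to slot 2, old element 2 goes to slot 0, and so on.
  std::vector<int> Reuses = {0, 0, 1, 1};
  reorderReusesSketch(Reuses, {2, 3, 0, 1});
  assert((Reuses == std::vector<int>{1, 1, 0, 0}));
  return 0;
}
```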
+static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) { + assert(!Mask.empty() && Reuses.size() == Mask.size() && + "Expected non-empty mask."); + SmallVector<int> Prev(Reuses.begin(), Reuses.end()); + Prev.swap(Reuses); + for (unsigned I = 0, E = Prev.size(); I < E; ++I) + if (Mask[I] != UndefMaskElem) + Reuses[Mask[I]] = Prev[I]; } -void BoUpSLP::buildTree(ArrayRef<Value *> Roots, - ExtraValueToDebugLocsMap &ExternallyUsedValues, - ArrayRef<Value *> UserIgnoreLst) { - deleteTree(); - UserIgnoreList = UserIgnoreLst; - if (!allSameType(Roots)) +/// Reorders the given \p Order according to the given \p Mask. \p Order - is +/// the original order of the scalars. Procedure transforms the provided order +/// in accordance with the given \p Mask. If the resulting \p Order is just an +/// identity order, \p Order is cleared. +static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask) { + assert(!Mask.empty() && "Expected non-empty mask."); + SmallVector<int> MaskOrder; + if (Order.empty()) { + MaskOrder.resize(Mask.size()); + std::iota(MaskOrder.begin(), MaskOrder.end(), 0); + } else { + inversePermutation(Order, MaskOrder); + } + reorderReuses(MaskOrder, Mask); + if (ShuffleVectorInst::isIdentityMask(MaskOrder)) { + Order.clear(); return; - buildTree_rec(Roots, 0, EdgeInfo()); + } + Order.assign(Mask.size(), Mask.size()); + for (unsigned I = 0, E = Mask.size(); I < E; ++I) + if (MaskOrder[I] != UndefMaskElem) + Order[MaskOrder[I]] = I; + fixupOrderingIndices(Order); +} +Optional<BoUpSLP::OrdersType> +BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { + assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only."); + unsigned NumScalars = TE.Scalars.size(); + OrdersType CurrentOrder(NumScalars, NumScalars); + SmallVector<int> Positions; + SmallBitVector UsedPositions(NumScalars); + const TreeEntry *STE = nullptr; + // Try to find all gathered scalars that are gets vectorized in other + // vectorize node. Here we can have only one single tree vector node to + // correctly identify order of the gathered scalars. + for (unsigned I = 0; I < NumScalars; ++I) { + Value *V = TE.Scalars[I]; + if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(V)) + continue; + if (const auto *LocalSTE = getTreeEntry(V)) { + if (!STE) + STE = LocalSTE; + else if (STE != LocalSTE) + // Take the order only from the single vector node. + return None; + unsigned Lane = + std::distance(STE->Scalars.begin(), find(STE->Scalars, V)); + if (Lane >= NumScalars) + return None; + if (CurrentOrder[Lane] != NumScalars) { + if (Lane != I) + continue; + UsedPositions.reset(CurrentOrder[Lane]); + } + // The partial identity (where only some elements of the gather node are + // in the identity order) is good. + CurrentOrder[Lane] = I; + UsedPositions.set(I); + } + } + // Need to keep the order if we have a vector entry and at least 2 scalars or + // the vectorized entry has just 2 scalars. 
+ if (STE && (UsedPositions.count() > 1 || STE->Scalars.size() == 2)) { + auto &&IsIdentityOrder = [NumScalars](ArrayRef<unsigned> CurrentOrder) { + for (unsigned I = 0; I < NumScalars; ++I) + if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars) + return false; + return true; + }; + if (IsIdentityOrder(CurrentOrder)) { + CurrentOrder.clear(); + return CurrentOrder; + } + auto *It = CurrentOrder.begin(); + for (unsigned I = 0; I < NumScalars;) { + if (UsedPositions.test(I)) { + ++I; + continue; + } + if (*It == NumScalars) { + *It = I; + ++I; + } + ++It; + } + return CurrentOrder; + } + return None; +} + +void BoUpSLP::reorderTopToBottom() { + // Maps VF to the graph nodes. + DenseMap<unsigned, SmallPtrSet<TreeEntry *, 4>> VFToOrderedEntries; + // ExtractElement gather nodes which can be vectorized and need to handle + // their ordering. + DenseMap<const TreeEntry *, OrdersType> GathersToOrders; + // Find all reorderable nodes with the given VF. + // Currently the are vectorized loads,extracts + some gathering of extracts. + for_each(VectorizableTree, [this, &VFToOrderedEntries, &GathersToOrders]( + const std::unique_ptr<TreeEntry> &TE) { + // No need to reorder if need to shuffle reuses, still need to shuffle the + // node. + if (!TE->ReuseShuffleIndices.empty()) + return; + if (TE->State == TreeEntry::Vectorize && + isa<LoadInst, ExtractElementInst, ExtractValueInst, StoreInst, + InsertElementInst>(TE->getMainOp()) && + !TE->isAltShuffle()) { + VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); + return; + } + if (TE->State == TreeEntry::NeedToGather) { + if (TE->getOpcode() == Instruction::ExtractElement && + !TE->isAltShuffle() && + isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp()) + ->getVectorOperandType()) && + allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) { + // Check that gather of extractelements can be represented as + // just a shuffle of a single vector. + OrdersType CurrentOrder; + bool Reuse = + canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder); + if (Reuse || !CurrentOrder.empty()) { + VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); + GathersToOrders.try_emplace(TE.get(), CurrentOrder); + return; + } + } + if (Optional<OrdersType> CurrentOrder = + findReusedOrderedScalars(*TE.get())) { + VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); + GathersToOrders.try_emplace(TE.get(), *CurrentOrder); + } + } + }); + + // Reorder the graph nodes according to their vectorization factor. + for (unsigned VF = VectorizableTree.front()->Scalars.size(); VF > 1; + VF /= 2) { + auto It = VFToOrderedEntries.find(VF); + if (It == VFToOrderedEntries.end()) + continue; + // Try to find the most profitable order. We just are looking for the most + // used order and reorder scalar elements in the nodes according to this + // mostly used order. + const SmallPtrSetImpl<TreeEntry *> &OrderedEntries = It->getSecond(); + // All operands are reordered and used only in this node - propagate the + // most used order to the user node. + MapVector<OrdersType, unsigned, + DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>> + OrdersUses; + SmallPtrSet<const TreeEntry *, 4> VisitedOps; + for (const TreeEntry *OpTE : OrderedEntries) { + // No need to reorder this nodes, still need to extend and to use shuffle, + // just need to merge reordering shuffle and the reuse shuffle. + if (!OpTE->ReuseShuffleIndices.empty()) + continue; + // Count number of orders uses. 
+ const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & { + if (OpTE->State == TreeEntry::NeedToGather) + return GathersToOrders.find(OpTE)->second; + return OpTE->ReorderIndices; + }(); + // Stores actually store the mask, not the order, need to invert. + if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() && + OpTE->getOpcode() == Instruction::Store && !Order.empty()) { + SmallVector<int> Mask; + inversePermutation(Order, Mask); + unsigned E = Order.size(); + OrdersType CurrentOrder(E, E); + transform(Mask, CurrentOrder.begin(), [E](int Idx) { + return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx); + }); + fixupOrderingIndices(CurrentOrder); + ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second; + } else { + ++OrdersUses.insert(std::make_pair(Order, 0)).first->second; + } + } + // Set order of the user node. + if (OrdersUses.empty()) + continue; + // Choose the most used order. + ArrayRef<unsigned> BestOrder = OrdersUses.front().first; + unsigned Cnt = OrdersUses.front().second; + for (const auto &Pair : drop_begin(OrdersUses)) { + if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) { + BestOrder = Pair.first; + Cnt = Pair.second; + } + } + // Set order of the user node. + if (BestOrder.empty()) + continue; + SmallVector<int> Mask; + inversePermutation(BestOrder, Mask); + SmallVector<int> MaskOrder(BestOrder.size(), UndefMaskElem); + unsigned E = BestOrder.size(); + transform(BestOrder, MaskOrder.begin(), [E](unsigned I) { + return I < E ? static_cast<int>(I) : UndefMaskElem; + }); + // Do an actual reordering, if profitable. + for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) { + // Just do the reordering for the nodes with the given VF. + if (TE->Scalars.size() != VF) { + if (TE->ReuseShuffleIndices.size() == VF) { + // Need to reorder the reuses masks of the operands with smaller VF to + // be able to find the match between the graph nodes and scalar + // operands of the given node during vectorization/cost estimation. + assert(all_of(TE->UserTreeIndices, + [VF, &TE](const EdgeInfo &EI) { + return EI.UserTE->Scalars.size() == VF || + EI.UserTE->Scalars.size() == + TE->Scalars.size(); + }) && + "All users must be of VF size."); + // Update ordering of the operands with the smaller VF than the given + // one. + reorderReuses(TE->ReuseShuffleIndices, Mask); + } + continue; + } + if (TE->State == TreeEntry::Vectorize && + isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst, + InsertElementInst>(TE->getMainOp()) && + !TE->isAltShuffle()) { + // Build correct orders for extract{element,value}, loads and + // stores. + reorderOrder(TE->ReorderIndices, Mask); + if (isa<InsertElementInst, StoreInst>(TE->getMainOp())) + TE->reorderOperands(Mask); + } else { + // Reorder the node and its operands. + TE->reorderOperands(Mask); + assert(TE->ReorderIndices.empty() && + "Expected empty reorder sequence."); + reorderScalars(TE->Scalars, Mask); + } + if (!TE->ReuseShuffleIndices.empty()) { + // Apply reversed order to keep the original ordering of the reused + // elements to avoid extra reorder indices shuffling. 
+ OrdersType CurrentOrder; + reorderOrder(CurrentOrder, MaskOrder); + SmallVector<int> NewReuses; + inversePermutation(CurrentOrder, NewReuses); + addMask(NewReuses, TE->ReuseShuffleIndices); + TE->ReuseShuffleIndices.swap(NewReuses); + } + } + } +} + +void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { + SetVector<TreeEntry *> OrderedEntries; + DenseMap<const TreeEntry *, OrdersType> GathersToOrders; + // Find all reorderable leaf nodes with the given VF. + // Currently the are vectorized loads,extracts without alternate operands + + // some gathering of extracts. + SmallVector<TreeEntry *> NonVectorized; + for_each(VectorizableTree, [this, &OrderedEntries, &GathersToOrders, + &NonVectorized]( + const std::unique_ptr<TreeEntry> &TE) { + if (TE->State != TreeEntry::Vectorize) + NonVectorized.push_back(TE.get()); + // No need to reorder if need to shuffle reuses, still need to shuffle the + // node. + if (!TE->ReuseShuffleIndices.empty()) + return; + if (TE->State == TreeEntry::Vectorize && + isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE->getMainOp()) && + !TE->isAltShuffle()) { + OrderedEntries.insert(TE.get()); + return; + } + if (TE->State == TreeEntry::NeedToGather) { + if (TE->getOpcode() == Instruction::ExtractElement && + !TE->isAltShuffle() && + isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp()) + ->getVectorOperandType()) && + allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) { + // Check that gather of extractelements can be represented as + // just a shuffle of a single vector with a single user only. + OrdersType CurrentOrder; + bool Reuse = + canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder); + if ((Reuse || !CurrentOrder.empty()) && + !any_of(VectorizableTree, + [&TE](const std::unique_ptr<TreeEntry> &Entry) { + return Entry->State == TreeEntry::NeedToGather && + Entry.get() != TE.get() && + Entry->isSame(TE->Scalars); + })) { + OrderedEntries.insert(TE.get()); + GathersToOrders.try_emplace(TE.get(), CurrentOrder); + return; + } + } + if (Optional<OrdersType> CurrentOrder = + findReusedOrderedScalars(*TE.get())) { + OrderedEntries.insert(TE.get()); + GathersToOrders.try_emplace(TE.get(), *CurrentOrder); + } + } + }); + + // Checks if the operands of the users are reordarable and have only single + // use. + auto &&CheckOperands = + [this, &NonVectorized](const auto &Data, + SmallVectorImpl<TreeEntry *> &GatherOps) { + for (unsigned I = 0, E = Data.first->getNumOperands(); I < E; ++I) { + if (any_of(Data.second, + [I](const std::pair<unsigned, TreeEntry *> &OpData) { + return OpData.first == I && + OpData.second->State == TreeEntry::Vectorize; + })) + continue; + ArrayRef<Value *> VL = Data.first->getOperand(I); + const TreeEntry *TE = nullptr; + const auto *It = find_if(VL, [this, &TE](Value *V) { + TE = getTreeEntry(V); + return TE; + }); + if (It != VL.end() && TE->isSame(VL)) + return false; + TreeEntry *Gather = nullptr; + if (count_if(NonVectorized, [VL, &Gather](TreeEntry *TE) { + assert(TE->State != TreeEntry::Vectorize && + "Only non-vectorized nodes are expected."); + if (TE->isSame(VL)) { + Gather = TE; + return true; + } + return false; + }) > 1) + return false; + if (Gather) + GatherOps.push_back(Gather); + } + return true; + }; + // 1. Propagate order to the graph nodes, which use only reordered nodes. + // I.e., if the node has operands, that are reordered, try to make at least + // one operand order in the natural order and reorder others + reorder the + // user node itself. 
+ SmallPtrSet<const TreeEntry *, 4> Visited; + while (!OrderedEntries.empty()) { + // 1. Filter out only reordered nodes. + // 2. If the entry has multiple uses - skip it and jump to the next node. + MapVector<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users; + SmallVector<TreeEntry *> Filtered; + for (TreeEntry *TE : OrderedEntries) { + if (!(TE->State == TreeEntry::Vectorize || + (TE->State == TreeEntry::NeedToGather && + GathersToOrders.count(TE))) || + TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() || + !all_of(drop_begin(TE->UserTreeIndices), + [TE](const EdgeInfo &EI) { + return EI.UserTE == TE->UserTreeIndices.front().UserTE; + }) || + !Visited.insert(TE).second) { + Filtered.push_back(TE); + continue; + } + // Build a map between user nodes and their operands order to speedup + // search. The graph currently does not provide this dependency directly. + for (EdgeInfo &EI : TE->UserTreeIndices) { + TreeEntry *UserTE = EI.UserTE; + auto It = Users.find(UserTE); + if (It == Users.end()) + It = Users.insert({UserTE, {}}).first; + It->second.emplace_back(EI.EdgeIdx, TE); + } + } + // Erase filtered entries. + for_each(Filtered, + [&OrderedEntries](TreeEntry *TE) { OrderedEntries.remove(TE); }); + for (const auto &Data : Users) { + // Check that operands are used only in the User node. + SmallVector<TreeEntry *> GatherOps; + if (!CheckOperands(Data, GatherOps)) { + for_each(Data.second, + [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) { + OrderedEntries.remove(Op.second); + }); + continue; + } + // All operands are reordered and used only in this node - propagate the + // most used order to the user node. + MapVector<OrdersType, unsigned, + DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>> + OrdersUses; + SmallPtrSet<const TreeEntry *, 4> VisitedOps; + for (const auto &Op : Data.second) { + TreeEntry *OpTE = Op.second; + if (!OpTE->ReuseShuffleIndices.empty() || + (IgnoreReorder && OpTE == VectorizableTree.front().get())) + continue; + const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & { + if (OpTE->State == TreeEntry::NeedToGather) + return GathersToOrders.find(OpTE)->second; + return OpTE->ReorderIndices; + }(); + // Stores actually store the mask, not the order, need to invert. + if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() && + OpTE->getOpcode() == Instruction::Store && !Order.empty()) { + SmallVector<int> Mask; + inversePermutation(Order, Mask); + unsigned E = Order.size(); + OrdersType CurrentOrder(E, E); + transform(Mask, CurrentOrder.begin(), [E](int Idx) { + return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx); + }); + fixupOrderingIndices(CurrentOrder); + ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second; + } else { + ++OrdersUses.insert(std::make_pair(Order, 0)).first->second; + } + if (VisitedOps.insert(OpTE).second) + OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second += + OpTE->UserTreeIndices.size(); + assert(OrdersUses[{}] > 0 && "Counter cannot be less than 0."); + --OrdersUses[{}]; + } + // If no orders - skip current nodes and jump to the next one, if any. + if (OrdersUses.empty()) { + for_each(Data.second, + [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) { + OrderedEntries.remove(Op.second); + }); + continue; + } + // Choose the best order. 
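Both reorder passes pick the winning order by simple voting: every candidate order accumulates a use count, and on a tie the identity order (represented as an empty vector) wins so that no shuffle has to be emitted at all. A condensed standalone model of that selection, assuming the counts have already been gathered into a map (pickBestOrder is an illustrative name, not the LLVM code):

```cpp
#include <cassert>
#include <map>
#include <vector>

using OrderTy = std::vector<unsigned>; // empty vector == identity order

// Pick the order with the highest use count; an empty (identity) order wins
// ties so that no extra shuffle is generated for the user node.
static OrderTy pickBestOrder(const std::map<OrderTy, unsigned> &OrdersUses) {
  OrderTy Best;
  unsigned BestCnt = 0;
  bool First = true;
  for (const auto &[Order, Cnt] : OrdersUses) {
    if (First || Cnt > BestCnt || (Cnt == BestCnt && Order.empty())) {
      Best = Order;
      BestCnt = Cnt;
      First = false;
    }
  }
  return Best;
}

int main() {
  // Two operands want <1, 0>, two are already in identity order:
  // the tie goes to the identity order, so nothing is reordered.
  std::map<OrderTy, unsigned> Uses = {{{1, 0}, 2}, {{}, 2}};
  assert(pickBestOrder(Uses).empty());

  // A strict majority for <1, 0> wins.
  Uses[{1, 0}] = 3;
  assert((pickBestOrder(Uses) == OrderTy{1, 0}));
  return 0;
}
```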
+ ArrayRef<unsigned> BestOrder = OrdersUses.front().first; + unsigned Cnt = OrdersUses.front().second; + for (const auto &Pair : drop_begin(OrdersUses)) { + if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) { + BestOrder = Pair.first; + Cnt = Pair.second; + } + } + // Set order of the user node (reordering of operands and user nodes). + if (BestOrder.empty()) { + for_each(Data.second, + [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) { + OrderedEntries.remove(Op.second); + }); + continue; + } + // Erase operands from OrderedEntries list and adjust their orders. + VisitedOps.clear(); + SmallVector<int> Mask; + inversePermutation(BestOrder, Mask); + SmallVector<int> MaskOrder(BestOrder.size(), UndefMaskElem); + unsigned E = BestOrder.size(); + transform(BestOrder, MaskOrder.begin(), [E](unsigned I) { + return I < E ? static_cast<int>(I) : UndefMaskElem; + }); + for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) { + TreeEntry *TE = Op.second; + OrderedEntries.remove(TE); + if (!VisitedOps.insert(TE).second) + continue; + if (!TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty()) { + // Just reorder reuses indices. + reorderReuses(TE->ReuseShuffleIndices, Mask); + continue; + } + // Gathers are processed separately. + if (TE->State != TreeEntry::Vectorize) + continue; + assert((BestOrder.size() == TE->ReorderIndices.size() || + TE->ReorderIndices.empty()) && + "Non-matching sizes of user/operand entries."); + reorderOrder(TE->ReorderIndices, Mask); + } + // For gathers just need to reorder its scalars. + for (TreeEntry *Gather : GatherOps) { + assert(Gather->ReorderIndices.empty() && + "Unexpected reordering of gathers."); + if (!Gather->ReuseShuffleIndices.empty()) { + // Just reorder reuses indices. + reorderReuses(Gather->ReuseShuffleIndices, Mask); + continue; + } + reorderScalars(Gather->Scalars, Mask); + OrderedEntries.remove(Gather); + } + // Reorder operands of the user node and set the ordering for the user + // node itself. + if (Data.first->State != TreeEntry::Vectorize || + !isa<ExtractElementInst, ExtractValueInst, LoadInst>( + Data.first->getMainOp()) || + Data.first->isAltShuffle()) + Data.first->reorderOperands(Mask); + if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) || + Data.first->isAltShuffle()) { + reorderScalars(Data.first->Scalars, Mask); + reorderOrder(Data.first->ReorderIndices, MaskOrder); + if (Data.first->ReuseShuffleIndices.empty() && + !Data.first->ReorderIndices.empty() && + !Data.first->isAltShuffle()) { + // Insert user node to the list to try to sink reordering deeper in + // the graph. + OrderedEntries.insert(Data.first); + } + } else { + reorderOrder(Data.first->ReorderIndices, Mask); + } + } + } + // If the reordering is unnecessary, just remove the reorder. + if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() && + VectorizableTree.front()->ReuseShuffleIndices.empty()) + VectorizableTree.front()->ReorderIndices.clear(); +} + +void BoUpSLP::buildExternalUses( + const ExtraValueToDebugLocsMap &ExternallyUsedValues) { // Collect the values that we need to extract from the tree. 
for (auto &TEPtr : VectorizableTree) { TreeEntry *Entry = TEPtr.get(); @@ -2636,6 +3215,9 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, if (!UserInst) continue; + if (isDeleted(UserInst)) + continue; + // Skip in-tree scalars that become vectors if (TreeEntry *UseEntry = getTreeEntry(U)) { Value *UseScalar = UseEntry->Scalars[0]; @@ -2664,14 +3246,120 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, } } +void BoUpSLP::buildTree(ArrayRef<Value *> Roots, + ArrayRef<Value *> UserIgnoreLst) { + deleteTree(); + UserIgnoreList = UserIgnoreLst; + if (!allSameType(Roots)) + return; + buildTree_rec(Roots, 0, EdgeInfo()); +} + +namespace { +/// Tracks the state we can represent the loads in the given sequence. +enum class LoadsState { Gather, Vectorize, ScatterVectorize }; +} // anonymous namespace + +/// Checks if the given array of loads can be represented as a vectorized, +/// scatter or just simple gather. +static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0, + const TargetTransformInfo &TTI, + const DataLayout &DL, ScalarEvolution &SE, + SmallVectorImpl<unsigned> &Order, + SmallVectorImpl<Value *> &PointerOps) { + // Check that a vectorized load would load the same memory as a scalar + // load. For example, we don't want to vectorize loads that are smaller + // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM + // treats loading/storing it as an i8 struct. If we vectorize loads/stores + // from such a struct, we read/write packed bits disagreeing with the + // unvectorized version. + Type *ScalarTy = VL0->getType(); + + if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy)) + return LoadsState::Gather; + + // Make sure all loads in the bundle are simple - we can't vectorize + // atomic or volatile loads. + PointerOps.clear(); + PointerOps.resize(VL.size()); + auto *POIter = PointerOps.begin(); + for (Value *V : VL) { + auto *L = cast<LoadInst>(V); + if (!L->isSimple()) + return LoadsState::Gather; + *POIter = L->getPointerOperand(); + ++POIter; + } + + Order.clear(); + // Check the order of pointer operands. + if (llvm::sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order)) { + Value *Ptr0; + Value *PtrN; + if (Order.empty()) { + Ptr0 = PointerOps.front(); + PtrN = PointerOps.back(); + } else { + Ptr0 = PointerOps[Order.front()]; + PtrN = PointerOps[Order.back()]; + } + Optional<int> Diff = + getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE); + // Check that the sorted loads are consecutive. + if (static_cast<unsigned>(*Diff) == VL.size() - 1) + return LoadsState::Vectorize; + Align CommonAlignment = cast<LoadInst>(VL0)->getAlign(); + for (Value *V : VL) + CommonAlignment = + commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign()); + if (TTI.isLegalMaskedGather(FixedVectorType::get(ScalarTy, VL.size()), + CommonAlignment)) + return LoadsState::ScatterVectorize; + } + + return LoadsState::Gather; +} + void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); + SmallVector<int> ReuseShuffleIndicies; + SmallVector<Value *> UniqueValues; + auto &&TryToFindDuplicates = [&VL, &ReuseShuffleIndicies, &UniqueValues, + &UserTreeIdx, + this](const InstructionsState &S) { + // Check that every instruction appears once in this bundle. 
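The TryToFindDuplicates lambda that continues below deduplicates the bundle: each scalar is mapped to the lane of its first occurrence, undefs map to the undef mask element, and the unique values are only kept for vectorization when their count is a power of two greater than one. A standalone sketch of that bookkeeping, with plain strings standing in for Value* and "undef" marking an undef scalar (the function name is hypothetical):

```cpp
#include <cassert>
#include <string>
#include <unordered_map>
#include <vector>

constexpr int UndefElem = -1; // stand-in for UndefMaskElem

// Deduplicate a bundle: UniqueValues keeps the first occurrence of each
// scalar, ReuseMask maps every original lane to its unique lane (or to the
// undef element for "undef" scalars). Returns false when the unique values
// cannot form a legal bundle (fewer than two, or not a power of two).
static bool tryToFindDuplicates(const std::vector<std::string> &VL,
                                std::vector<std::string> &UniqueValues,
                                std::vector<int> &ReuseMask) {
  std::unordered_map<std::string, unsigned> UniquePositions;
  for (const std::string &V : VL) {
    auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
    ReuseMask.push_back(V == "undef" ? UndefElem
                                     : static_cast<int>(Res.first->second));
    if (Res.second)
      UniqueValues.push_back(V);
  }
  size_t NumUnique = UniqueValues.size();
  if (NumUnique == VL.size()) {
    ReuseMask.clear(); // nothing was reused, no shuffle needed
    return true;
  }
  // Reject bundles whose unique part is too small or not a power of two.
  return NumUnique > 1 && (NumUnique & (NumUnique - 1)) == 0;
}

int main() {
  std::vector<std::string> Unique;
  std::vector<int> Mask;
  // %a and %b each appear twice: vectorize <%a, %b> and reuse lanes 0/1.
  bool OK = tryToFindDuplicates({"%a", "%b", "%a", "%b"}, Unique, Mask);
  assert(OK);
  assert((Unique == std::vector<std::string>{"%a", "%b"}));
  assert((Mask == std::vector<int>{0, 1, 0, 1}));
  return 0;
}
```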
+ DenseMap<Value *, unsigned> UniquePositions; + for (Value *V : VL) { + auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); + ReuseShuffleIndicies.emplace_back(isa<UndefValue>(V) ? -1 + : Res.first->second); + if (Res.second) + UniqueValues.emplace_back(V); + } + size_t NumUniqueScalarValues = UniqueValues.size(); + if (NumUniqueScalarValues == VL.size()) { + ReuseShuffleIndicies.clear(); + } else { + LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); + if (NumUniqueScalarValues <= 1 || + !llvm::isPowerOf2_32(NumUniqueScalarValues)) { + LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + return false; + } + VL = UniqueValues; + } + return true; + }; + InstructionsState S = getSameOpcode(VL); if (Depth == RecursionMaxDepth) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + if (TryToFindDuplicates(S)) + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } @@ -2680,7 +3368,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, isa<ScalableVectorType>( cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + if (TryToFindDuplicates(S)) + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } @@ -2700,9 +3390,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } // If all of the operands are identical or constant we have a simple solution. - if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) { + // If we deal with insert/extract instructions, they all must have constant + // indices, otherwise we should gather them, not try to vectorize. + if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode() || + (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(S.MainOp) && + !all_of(VL, isVectorLikeInstWithConstOps))) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + if (TryToFindDuplicates(S)) + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } @@ -2724,7 +3420,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); if (!E->isSame(VL)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + if (TryToFindDuplicates(S)) + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } // Record the reuse of the tree node. 
FIXME, currently this is only used to @@ -2743,7 +3441,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (getTreeEntry(I)) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V << ") is already in tree.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + if (TryToFindDuplicates(S)) + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } } @@ -2754,7 +3454,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (Value *V : VL) { if (MustGather.count(V) || is_contained(UserIgnoreList, V)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + if (TryToFindDuplicates(S)) + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } } @@ -2773,28 +3475,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } // Check that every instruction appears once in this bundle. - SmallVector<unsigned, 4> ReuseShuffleIndicies; - SmallVector<Value *, 4> UniqueValues; - DenseMap<Value *, unsigned> UniquePositions; - for (Value *V : VL) { - auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); - ReuseShuffleIndicies.emplace_back(Res.first->second); - if (Res.second) - UniqueValues.emplace_back(V); - } - size_t NumUniqueScalarValues = UniqueValues.size(); - if (NumUniqueScalarValues == VL.size()) { - ReuseShuffleIndicies.clear(); - } else { - LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); - if (NumUniqueScalarValues <= 1 || - !llvm::isPowerOf2_32(NumUniqueScalarValues)) { - LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); - return; - } - VL = UniqueValues; - } + if (!TryToFindDuplicates(S)) + return; auto &BSRef = BlocksSchedules[BB]; if (!BSRef) @@ -2867,7 +3549,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, bool Reuse = canReuseExtract(VL, VL0, CurrentOrder); if (Reuse) { LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); - ++NumOpsWantToKeepOriginalOrder; newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); // This is a special case, as it does not gather, but at the same time @@ -2885,12 +3566,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, dbgs() << " " << Idx; dbgs() << "\n"; }); + fixupOrderingIndices(CurrentOrder); // Insert new order with initial value 0, if it does not exist, // otherwise return the iterator to the existing one. newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies, CurrentOrder); - findRootOrder(CurrentOrder); - ++NumOpsWantToKeepOrder[CurrentOrder]; // This is a special case, as it does not gather, but at the same time // we are not extending buildTree_rec() towards the operands. ValueList Op0; @@ -2910,8 +3590,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Check that we have a buildvector and not a shuffle of 2 or more // different vectors. 
ValueSet SourceVectors; - for (Value *V : VL) + int MinIdx = std::numeric_limits<int>::max(); + for (Value *V : VL) { SourceVectors.insert(cast<Instruction>(V)->getOperand(0)); + Optional<int> Idx = *getInsertIndex(V, 0); + if (!Idx || *Idx == UndefMaskElem) + continue; + MinIdx = std::min(MinIdx, *Idx); + } if (count_if(VL, [&SourceVectors](Value *V) { return !SourceVectors.contains(V); @@ -2919,13 +3605,35 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Found 2nd source vector - cancel. LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with " "different source vectors.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); BS.cancelScheduling(VL, VL0); return; } - TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx); + auto OrdCompare = [](const std::pair<int, int> &P1, + const std::pair<int, int> &P2) { + return P1.first > P2.first; + }; + PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>, + decltype(OrdCompare)> + Indices(OrdCompare); + for (int I = 0, E = VL.size(); I < E; ++I) { + Optional<int> Idx = *getInsertIndex(VL[I], 0); + if (!Idx || *Idx == UndefMaskElem) + continue; + Indices.emplace(*Idx, I); + } + OrdersType CurrentOrder(VL.size(), VL.size()); + bool IsIdentity = true; + for (int I = 0, E = VL.size(); I < E; ++I) { + CurrentOrder[Indices.top().second] = I; + IsIdentity &= Indices.top().second == I; + Indices.pop(); + } + if (IsIdentity) + CurrentOrder.clear(); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + None, CurrentOrder); LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n"); constexpr int NumOps = 2; @@ -2936,7 +3644,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, TE->setOperand(I, VectorOperands[I]); } - buildTree_rec(VectorOperands[NumOps - 1], Depth + 1, {TE, 0}); + buildTree_rec(VectorOperands[NumOps - 1], Depth + 1, {TE, NumOps - 1}); return; } case Instruction::Load: { @@ -2946,90 +3654,52 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // treats loading/storing it as an i8 struct. If we vectorize loads/stores // from such a struct, we read/write packed bits disagreeing with the // unvectorized version. - Type *ScalarTy = VL0->getType(); - - if (DL->getTypeSizeInBits(ScalarTy) != - DL->getTypeAllocSizeInBits(ScalarTy)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); - return; - } - - // Make sure all loads in the bundle are simple - we can't vectorize - // atomic or volatile loads. - SmallVector<Value *, 4> PointerOps(VL.size()); - auto POIter = PointerOps.begin(); - for (Value *V : VL) { - auto *L = cast<LoadInst>(V); - if (!L->isSimple()) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); - return; - } - *POIter = L->getPointerOperand(); - ++POIter; - } - + SmallVector<Value *> PointerOps; OrdersType CurrentOrder; - // Check the order of pointer operands. 
- if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) { - Value *Ptr0; - Value *PtrN; + TreeEntry *TE = nullptr; + switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, CurrentOrder, + PointerOps)) { + case LoadsState::Vectorize: if (CurrentOrder.empty()) { - Ptr0 = PointerOps.front(); - PtrN = PointerOps.back(); + // Original loads are consecutive and does not require reordering. + TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { - Ptr0 = PointerOps[CurrentOrder.front()]; - PtrN = PointerOps[CurrentOrder.back()]; - } - Optional<int> Diff = getPointersDiff( - ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE); - // Check that the sorted loads are consecutive. - if (static_cast<unsigned>(*Diff) == VL.size() - 1) { - if (CurrentOrder.empty()) { - // Original loads are consecutive and does not require reordering. - ++NumOpsWantToKeepOriginalOrder; - TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, - UserTreeIdx, ReuseShuffleIndicies); - TE->setOperandsInOrder(); - LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); - } else { - // Need to reorder. - TreeEntry *TE = - newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies, CurrentOrder); - TE->setOperandsInOrder(); - LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); - findRootOrder(CurrentOrder); - ++NumOpsWantToKeepOrder[CurrentOrder]; - } - return; - } - Align CommonAlignment = cast<LoadInst>(VL0)->getAlign(); - for (Value *V : VL) - CommonAlignment = - commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign()); - if (TTI->isLegalMaskedGather(FixedVectorType::get(ScalarTy, VL.size()), - CommonAlignment)) { - // Vectorizing non-consecutive loads with `llvm.masked.gather`. - TreeEntry *TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, - S, UserTreeIdx, ReuseShuffleIndicies); - TE->setOperandsInOrder(); - buildTree_rec(PointerOps, Depth + 1, {TE, 0}); - LLVM_DEBUG(dbgs() - << "SLP: added a vector of non-consecutive loads.\n"); - return; + fixupOrderingIndices(CurrentOrder); + // Need to reorder. + TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies, CurrentOrder); + LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); } + TE->setOperandsInOrder(); + break; + case LoadsState::ScatterVectorize: + // Vectorizing non-consecutive loads with `llvm.masked.gather`. 
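The new canVectorizeLoads helper folds the old inline logic into three outcomes: pointers that form a consecutive run become a plain vector load (possibly with a reordering shuffle), non-consecutive but still addressable pointers can be vectorized through llvm.masked.gather, and everything else stays a gather of scalars. A simplified standalone model of that decision, working on already-computed element offsets and a boolean for masked-gather legality rather than on real pointers and TTI:

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

enum class LoadsState { Gather, Vectorize, ScatterVectorize };

// Decide how a bundle of loads can be vectorized, given the element offset of
// each pointer operand from a common base. Consecutive offsets (after
// sorting) give a plain vector load; otherwise fall back to a masked gather
// if the target supports it, and to scalar gathering if not.
static LoadsState classifyLoads(std::vector<int> Offsets,
                                bool MaskedGatherIsLegal) {
  if (Offsets.empty())
    return LoadsState::Gather;
  std::sort(Offsets.begin(), Offsets.end());
  // Consecutive means the distance between the first and last sorted pointer
  // is exactly the number of loads minus one (in elements).
  if (Offsets.back() - Offsets.front() ==
      static_cast<int>(Offsets.size()) - 1)
    return LoadsState::Vectorize;
  return MaskedGatherIsLegal ? LoadsState::ScatterVectorize
                             : LoadsState::Gather;
}

int main() {
  // Jumbled but consecutive loads: vectorizable with a reordering shuffle.
  assert(classifyLoads({2, 0, 3, 1}, false) == LoadsState::Vectorize);
  // Strided loads: only a masked gather can handle them.
  assert(classifyLoads({0, 2, 4, 6}, true) == LoadsState::ScatterVectorize);
  assert(classifyLoads({0, 2, 4, 6}, false) == LoadsState::Gather);
  return 0;
}
```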
+ TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S, + UserTreeIdx, ReuseShuffleIndicies); + TE->setOperandsInOrder(); + buildTree_rec(PointerOps, Depth + 1, {TE, 0}); + LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n"); + break; + case LoadsState::Gather: + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); +#ifndef NDEBUG + Type *ScalarTy = VL0->getType(); + if (DL->getTypeSizeInBits(ScalarTy) != + DL->getTypeAllocSizeInBits(ScalarTy)) + LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); + else if (any_of(VL, [](Value *V) { + return !cast<LoadInst>(V)->isSimple(); + })) + LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); + else + LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); +#endif // NDEBUG + break; } - - LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); return; } case Instruction::ZExt: @@ -3213,15 +3883,40 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); - TE->setOperandsInOrder(); - for (unsigned i = 0, e = 2; i < e; ++i) { - ValueList Operands; - // Prepare the operand vector. - for (Value *V : VL) - Operands.push_back(cast<Instruction>(V)->getOperand(i)); - - buildTree_rec(Operands, Depth + 1, {TE, i}); + SmallVector<ValueList, 2> Operands(2); + // Prepare the operand vector for pointer operands. + for (Value *V : VL) + Operands.front().push_back( + cast<GetElementPtrInst>(V)->getPointerOperand()); + TE->setOperand(0, Operands.front()); + // Need to cast all indices to the same type before vectorization to + // avoid crash. + // Required to be able to find correct matches between different gather + // nodes and reuse the vectorized values rather than trying to gather them + // again. + int IndexIdx = 1; + Type *VL0Ty = VL0->getOperand(IndexIdx)->getType(); + Type *Ty = all_of(VL, + [VL0Ty, IndexIdx](Value *V) { + return VL0Ty == cast<GetElementPtrInst>(V) + ->getOperand(IndexIdx) + ->getType(); + }) + ? VL0Ty + : DL->getIndexType(cast<GetElementPtrInst>(VL0) + ->getPointerOperandType() + ->getScalarType()); + // Prepare the operand vector. + for (Value *V : VL) { + auto *Op = cast<Instruction>(V)->getOperand(IndexIdx); + auto *CI = cast<ConstantInt>(Op); + Operands.back().push_back(ConstantExpr::getIntegerCast( + CI, Ty, CI->getValue().isSignBitSet())); } + TE->setOperand(IndexIdx, Operands.back()); + + for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I) + buildTree_rec(Operands[I], Depth + 1, {TE, I}); return; } case Instruction::Store: { @@ -3276,21 +3971,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (static_cast<unsigned>(*Dist) == VL.size() - 1) { if (CurrentOrder.empty()) { // Original stores are consecutive and does not require reordering. 
- ++NumOpsWantToKeepOriginalOrder; TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); TE->setOperandsInOrder(); buildTree_rec(Operands, Depth + 1, {TE, 0}); LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); } else { + fixupOrderingIndices(CurrentOrder); TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies, CurrentOrder); TE->setOperandsInOrder(); buildTree_rec(Operands, Depth + 1, {TE, 0}); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n"); - findRootOrder(CurrentOrder); - ++NumOpsWantToKeepOrder[CurrentOrder]; } return; } @@ -3321,7 +4014,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, return; } Function *F = CI->getCalledFunction(); - unsigned NumArgs = CI->getNumArgOperands(); + unsigned NumArgs = CI->arg_size(); SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr); for (unsigned j = 0; j != NumArgs; ++j) if (hasVectorInstrinsicScalarOpd(ID, j)) @@ -3373,7 +4066,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); TE->setOperandsInOrder(); - for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { + for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) { + // For scalar operands no need to to create an entry since no need to + // vectorize it. + if (hasVectorInstrinsicScalarOpd(ID, i)) + continue; ValueList Operands; // Prepare the operand vector. for (Value *V : VL) { @@ -3548,7 +4245,7 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, FastMathFlags FMF; if (auto *FPCI = dyn_cast<FPMathOperator>(CI)) FMF = FPCI->getFastMathFlags(); - SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end()); + SmallVector<const Value *> Arguments(CI->args()); IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, VecTys, FMF, dyn_cast<IntrinsicInst>(CI)); auto IntrinsicCost = @@ -3621,25 +4318,42 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy, return Cost; } -/// Shuffles \p Mask in accordance with the given \p SubMask. -static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) { - if (SubMask.empty()) - return; - if (Mask.empty()) { - Mask.append(SubMask.begin(), SubMask.end()); - return; - } - SmallVector<int, 4> NewMask(SubMask.size(), SubMask.size()); - int TermValue = std::min(Mask.size(), SubMask.size()); - for (int I = 0, E = SubMask.size(); I < E; ++I) { - if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem || - Mask[SubMask[I]] >= TermValue) { - NewMask[I] = UndefMaskElem; - continue; +/// Build shuffle mask for shuffle graph entries and lists of main and alternate +/// operations operands. 
+static void +buildSuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices, + ArrayRef<int> ReusesIndices, + const function_ref<bool(Instruction *)> IsAltOp, + SmallVectorImpl<int> &Mask, + SmallVectorImpl<Value *> *OpScalars = nullptr, + SmallVectorImpl<Value *> *AltScalars = nullptr) { + unsigned Sz = VL.size(); + Mask.assign(Sz, UndefMaskElem); + SmallVector<int> OrderMask; + if (!ReorderIndices.empty()) + inversePermutation(ReorderIndices, OrderMask); + for (unsigned I = 0; I < Sz; ++I) { + unsigned Idx = I; + if (!ReorderIndices.empty()) + Idx = OrderMask[I]; + auto *OpInst = cast<Instruction>(VL[Idx]); + if (IsAltOp(OpInst)) { + Mask[I] = Sz + Idx; + if (AltScalars) + AltScalars->push_back(OpInst); + } else { + Mask[I] = Idx; + if (OpScalars) + OpScalars->push_back(OpInst); } - NewMask[I] = Mask[SubMask[I]]; } - Mask.swap(NewMask); + if (!ReusesIndices.empty()) { + SmallVector<int> NewMask(ReusesIndices.size(), UndefMaskElem); + transform(ReusesIndices, NewMask.begin(), [&Mask](int Idx) { + return Idx != UndefMaskElem ? Mask[Idx] : UndefMaskElem; + }); + Mask.swap(NewMask); + } } InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, @@ -3661,13 +4375,10 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, if (MinBWs.count(VL[0])) VecTy = FixedVectorType::get( IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); - auto *FinalVecTy = VecTy; + unsigned EntryVF = E->getVectorFactor(); + auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF); - unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size(); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); - if (NeedToShuffleReuses) - FinalVecTy = - FixedVectorType::get(VecTy->getElementType(), ReuseShuffleNumbers); // FIXME: it tries to fix a problem with MSVC buildbots. TargetTransformInfo &TTIRef = *TTI; auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL, VecTy, @@ -3785,7 +4496,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // shuffle of a single/two vectors the scalars are extracted from. SmallVector<int> Mask; Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = - isShuffle(VL, Mask); + isFixedVectorShuffle(VL, Mask); if (ShuffleKind.hasValue()) { // Found the bunch of extractelement instructions that must be gathered // into a vector and can be represented as a permutation elements in a @@ -3803,6 +4514,92 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, if (NeedToShuffleReuses) ReuseShuffleCost = TTI->getShuffleCost( TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices); + // Improve gather cost for gather of loads, if we can group some of the + // loads into vector loads. 
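// A standalone model of the mask built above: lane I selects element Idx from
// the main-opcode vector or Sz + Idx from the alternate-opcode vector, after
// undoing ReorderIndices and then remapping through ReuseShuffleIndices. The
// per-scalar opcode test is reduced to an IsAlt flag, and buildAltOpMask is a
// hypothetical name for this simplified version.
#include <cstdio>
#include <vector>

static const int UndefMaskElem = -1;

static std::vector<int> buildAltOpMask(const std::vector<bool> &IsAlt,
                                       const std::vector<unsigned> &ReorderIndices,
                                       const std::vector<int> &ReusesIndices) {
  unsigned Sz = IsAlt.size();
  std::vector<int> Mask(Sz, UndefMaskElem);
  std::vector<int> OrderMask;
  if (!ReorderIndices.empty()) { // same job as inversePermutation()
    OrderMask.assign(Sz, 0);
    for (unsigned I = 0; I < Sz; ++I)
      OrderMask[ReorderIndices[I]] = I;
  }
  for (unsigned I = 0; I < Sz; ++I) {
    unsigned Idx = ReorderIndices.empty() ? I : OrderMask[I];
    Mask[I] = IsAlt[Idx] ? (int)(Sz + Idx) : (int)Idx;
  }
  if (!ReusesIndices.empty()) { // apply the reuse indices on top
    std::vector<int> NewMask(ReusesIndices.size(), UndefMaskElem);
    for (unsigned I = 0, E = (unsigned)ReusesIndices.size(); I < E; ++I)
      if (ReusesIndices[I] != UndefMaskElem)
        NewMask[I] = Mask[ReusesIndices[I]];
    Mask.swap(NewMask);
  }
  return Mask;
}

int main() {
  // Scalars {a+b, c-d, e+f, g-h}: odd lanes use the alternate opcode (sub),
  // so the select mask picks lanes 1 and 3 from the second (sub) vector.
  for (int V : buildAltOpMask({false, true, false, true}, {}, {}))
    std::printf("%d ", V); // prints: 0 5 2 7
  std::printf("\n");
}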
+ if (VL.size() > 2 && E->getOpcode() == Instruction::Load && + !E->isAltShuffle()) { + BoUpSLP::ValueSet VectorizedLoads; + unsigned StartIdx = 0; + unsigned VF = VL.size() / 2; + unsigned VectorizedCnt = 0; + unsigned ScatterVectorizeCnt = 0; + const unsigned Sz = DL->getTypeSizeInBits(E->getMainOp()->getType()); + for (unsigned MinVF = getMinVF(2 * Sz); VF >= MinVF; VF /= 2) { + for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End; + Cnt += VF) { + ArrayRef<Value *> Slice = VL.slice(Cnt, VF); + if (!VectorizedLoads.count(Slice.front()) && + !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) { + SmallVector<Value *> PointerOps; + OrdersType CurrentOrder; + LoadsState LS = canVectorizeLoads(Slice, Slice.front(), *TTI, *DL, + *SE, CurrentOrder, PointerOps); + switch (LS) { + case LoadsState::Vectorize: + case LoadsState::ScatterVectorize: + // Mark the vectorized loads so that we don't vectorize them + // again. + if (LS == LoadsState::Vectorize) + ++VectorizedCnt; + else + ++ScatterVectorizeCnt; + VectorizedLoads.insert(Slice.begin(), Slice.end()); + // If we vectorized initial block, no need to try to vectorize it + // again. + if (Cnt == StartIdx) + StartIdx += VF; + break; + case LoadsState::Gather: + break; + } + } + } + // Check if the whole array was vectorized already - exit. + if (StartIdx >= VL.size()) + break; + // Found vectorizable parts - exit. + if (!VectorizedLoads.empty()) + break; + } + if (!VectorizedLoads.empty()) { + InstructionCost GatherCost = 0; + unsigned NumParts = TTI->getNumberOfParts(VecTy); + bool NeedInsertSubvectorAnalysis = + !NumParts || (VL.size() / VF) > NumParts; + // Get the cost for gathered loads. + for (unsigned I = 0, End = VL.size(); I < End; I += VF) { + if (VectorizedLoads.contains(VL[I])) + continue; + GatherCost += getGatherCost(VL.slice(I, VF)); + } + // The cost for vectorized loads. + InstructionCost ScalarsCost = 0; + for (Value *V : VectorizedLoads) { + auto *LI = cast<LoadInst>(V); + ScalarsCost += TTI->getMemoryOpCost( + Instruction::Load, LI->getType(), LI->getAlign(), + LI->getPointerAddressSpace(), CostKind, LI); + } + auto *LI = cast<LoadInst>(E->getMainOp()); + auto *LoadTy = FixedVectorType::get(LI->getType(), VF); + Align Alignment = LI->getAlign(); + GatherCost += + VectorizedCnt * + TTI->getMemoryOpCost(Instruction::Load, LoadTy, Alignment, + LI->getPointerAddressSpace(), CostKind, LI); + GatherCost += ScatterVectorizeCnt * + TTI->getGatherScatterOpCost( + Instruction::Load, LoadTy, LI->getPointerOperand(), + /*VariableMask=*/false, Alignment, CostKind, LI); + if (NeedInsertSubvectorAnalysis) { + // Add the cost for the subvectors insert. 
+ for (int I = VF, E = VL.size(); I < E; I += VF) + GatherCost += TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, + None, I, LoadTy); + } + return ReuseShuffleCost + GatherCost - ScalarsCost; + } + } return ReuseShuffleCost + getGatherCost(VL); } InstructionCost CommonCost = 0; @@ -3852,7 +4649,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, ++Idx; } } - Idx = ReuseShuffleNumbers; + Idx = EntryVF; for (Value *V : VL) { if (ShuffleOrOp == Instruction::ExtractElement) { auto *EE = cast<ExtractElementInst>(V); @@ -3895,29 +4692,33 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, return CommonCost; } case Instruction::InsertElement: { + assert(E->ReuseShuffleIndices.empty() && + "Unique insertelements only are expected."); auto *SrcVecTy = cast<FixedVectorType>(VL0->getType()); unsigned const NumElts = SrcVecTy->getNumElements(); unsigned const NumScalars = VL.size(); - APInt DemandedElts = APInt::getNullValue(NumElts); + APInt DemandedElts = APInt::getZero(NumElts); // TODO: Add support for Instruction::InsertValue. - unsigned Offset = UINT_MAX; + SmallVector<int> Mask; + if (!E->ReorderIndices.empty()) { + inversePermutation(E->ReorderIndices, Mask); + Mask.append(NumElts - NumScalars, UndefMaskElem); + } else { + Mask.assign(NumElts, UndefMaskElem); + std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0); + } + unsigned Offset = *getInsertIndex(VL0, 0); bool IsIdentity = true; - SmallVector<int> ShuffleMask(NumElts, UndefMaskElem); + SmallVector<int> PrevMask(NumElts, UndefMaskElem); + Mask.swap(PrevMask); for (unsigned I = 0; I < NumScalars; ++I) { - Optional<int> InsertIdx = getInsertIndex(VL[I], 0); + Optional<int> InsertIdx = getInsertIndex(VL[PrevMask[I]], 0); if (!InsertIdx || *InsertIdx == UndefMaskElem) continue; - unsigned Idx = *InsertIdx; - DemandedElts.setBit(Idx); - if (Idx < Offset) { - Offset = Idx; - IsIdentity &= I == 0; - } else { - assert(Idx >= Offset && "Failed to find vector index offset"); - IsIdentity &= Idx - Offset == I; - } - ShuffleMask[Idx] = I; + DemandedElts.setBit(*InsertIdx); + IsIdentity &= *InsertIdx - Offset == I; + Mask[*InsertIdx - Offset] = I; } assert(Offset < NumElts && "Failed to find vector index offset"); @@ -3932,8 +4733,23 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, TargetTransformInfo::SK_PermuteSingleSrc, FixedVectorType::get(SrcVecTy->getElementType(), Sz)); } else if (!IsIdentity) { - Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, - ShuffleMask); + auto *FirstInsert = + cast<Instruction>(*find_if(E->Scalars, [E](Value *V) { + return !is_contained(E->Scalars, + cast<Instruction>(V)->getOperand(0)); + })); + if (isa<UndefValue>(FirstInsert->getOperand(0))) { + Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, Mask); + } else { + SmallVector<int> InsertMask(NumElts); + std::iota(InsertMask.begin(), InsertMask.end(), 0); + for (unsigned I = 0; I < NumElts; I++) { + if (Mask[I] != UndefMaskElem) + InsertMask[Offset + I] = NumElts + I; + } + Cost += + TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVecTy, InsertMask); + } } return Cost; @@ -3955,7 +4771,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, TTI::getCastContextHint(VL0), CostKind, VL0); if (NeedToShuffleReuses) { - CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; } // Calculate the cost of this instruction. 
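// A small model of the bookkeeping in the InsertElement cost case above:
// given each scalar's destination lane (in bundle order) and the lane of the
// first scalar as Offset, it records which lanes are written (the demanded
// elements priced via getScalarizationOverhead) and whether the group is a
// contiguous identity insert, in which case no extra permute is costed. Lane
// numbers are assumed valid, Offset is assumed to be the lowest lane, and
// analyzeInserts is a hypothetical helper.
#include <cstdio>
#include <vector>

struct InsertGroupInfo {
  bool IsIdentity = true;
  std::vector<bool> Demanded; // per destination lane
  std::vector<int> Mask;      // permutation applied before the insert
};

static InsertGroupInfo analyzeInserts(const std::vector<unsigned> &Lanes,
                                      unsigned NumElts, unsigned Offset) {
  InsertGroupInfo Info;
  Info.Demanded.assign(NumElts, false);
  Info.Mask.assign(NumElts, -1);
  for (unsigned I = 0; I < Lanes.size(); ++I) {
    Info.Demanded[Lanes[I]] = true;
    Info.IsIdentity &= Lanes[I] - Offset == I;
    Info.Mask[Lanes[I] - Offset] = (int)I;
  }
  return Info;
}

int main() {
  // Four scalars written to lanes 4..7 of an 8-wide vector, in order: an
  // identity insert, so only the scalarization overhead is charged.
  InsertGroupInfo A = analyzeInserts({4, 5, 6, 7}, 8, 4);
  // Same lanes written out of order: a single-source permute is also costed.
  InsertGroupInfo B = analyzeInserts({5, 4, 7, 6}, 8, 4);
  std::printf("A identity=%d  B identity=%d\n", A.IsIdentity, B.IsIdentity);
}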
@@ -3980,7 +4796,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(), CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0); if (NeedToShuffleReuses) { - CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; } auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size()); InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; @@ -4085,7 +4901,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0); if (NeedToShuffleReuses) { - CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; } InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; InstructionCost VecCost = @@ -4103,7 +4919,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost( Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK); if (NeedToShuffleReuses) { - CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; } InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; InstructionCost VecCost = TTI->getArithmeticInstrCost( @@ -4117,7 +4933,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, InstructionCost ScalarEltCost = TTI->getMemoryOpCost( Instruction::Load, ScalarTy, Alignment, 0, CostKind, VL0); if (NeedToShuffleReuses) { - CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; } InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost; InstructionCost VecLdCost; @@ -4160,7 +4976,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, InstructionCost ScalarEltCost = TTI->getIntrinsicInstrCost(CostAttrs, CostKind); if (NeedToShuffleReuses) { - CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; } InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost; @@ -4215,14 +5031,16 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, TTI::CastContextHint::None, CostKind); } - SmallVector<int> Mask(E->Scalars.size()); - for (unsigned I = 0, End = E->Scalars.size(); I < End; ++I) { - auto *OpInst = cast<Instruction>(E->Scalars[I]); - assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); - Mask[I] = I + (OpInst->getOpcode() == E->getAltOpcode() ? 
End : 0); - } - VecCost += - TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask, 0); + SmallVector<int> Mask; + buildSuffleEntryMask( + E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, + [E](Instruction *I) { + assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); + return I->getOpcode() == E->getAltOpcode(); + }, + Mask); + CommonCost = + TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy, Mask); LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); return CommonCost + VecCost - ScalarCost; } @@ -4231,13 +5049,30 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, } } -bool BoUpSLP::isFullyVectorizableTinyTree() const { +bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const { LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height " << VectorizableTree.size() << " is fully vectorizable .\n"); + auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) { + SmallVector<int> Mask; + return TE->State == TreeEntry::NeedToGather && + !any_of(TE->Scalars, + [this](Value *V) { return EphValues.contains(V); }) && + (allConstant(TE->Scalars) || isSplat(TE->Scalars) || + TE->Scalars.size() < Limit || + (TE->getOpcode() == Instruction::ExtractElement && + isFixedVectorShuffle(TE->Scalars, Mask)) || + (TE->State == TreeEntry::NeedToGather && + TE->getOpcode() == Instruction::Load && !TE->isAltShuffle())); + }; + // We only handle trees of heights 1 and 2. if (VectorizableTree.size() == 1 && - VectorizableTree[0]->State == TreeEntry::Vectorize) + (VectorizableTree[0]->State == TreeEntry::Vectorize || + (ForReduction && + AreVectorizableGathers(VectorizableTree[0].get(), + VectorizableTree[0]->Scalars.size()) && + VectorizableTree[0]->getVectorFactor() > 2))) return true; if (VectorizableTree.size() != 2) @@ -4249,19 +5084,14 @@ bool BoUpSLP::isFullyVectorizableTinyTree() const { // or they are extractelements, which form shuffle. SmallVector<int> Mask; if (VectorizableTree[0]->State == TreeEntry::Vectorize && - (allConstant(VectorizableTree[1]->Scalars) || - isSplat(VectorizableTree[1]->Scalars) || - (VectorizableTree[1]->State == TreeEntry::NeedToGather && - VectorizableTree[1]->Scalars.size() < - VectorizableTree[0]->Scalars.size()) || - (VectorizableTree[1]->State == TreeEntry::NeedToGather && - VectorizableTree[1]->getOpcode() == Instruction::ExtractElement && - isShuffle(VectorizableTree[1]->Scalars, Mask)))) + AreVectorizableGathers(VectorizableTree[1].get(), + VectorizableTree[0]->Scalars.size())) return true; // Gathering cost would be too much for tiny trees. if (VectorizableTree[0]->State == TreeEntry::NeedToGather || - VectorizableTree[1]->State == TreeEntry::NeedToGather) + (VectorizableTree[1]->State == TreeEntry::NeedToGather && + VectorizableTree[0]->State != TreeEntry::ScatterVectorize)) return false; return true; @@ -4330,7 +5160,7 @@ bool BoUpSLP::isLoadCombineCandidate() const { return true; } -bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const { +bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { // No need to vectorize inserts of gathered values. if (VectorizableTree.size() == 2 && isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) && @@ -4344,7 +5174,7 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const { // If we have a tiny tree (a tree whose size is less than MinTreeSize), we // can vectorize it if we can prove it fully vectorizable. 
- if (isFullyVectorizableTinyTree()) + if (isFullyVectorizableTinyTree(ForReduction)) return false; assert(VectorizableTree.empty() @@ -4496,7 +5326,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { // If found user is an insertelement, do not calculate extract cost but try // to detect it as a final shuffled/identity match. - if (EU.User && isa<InsertElementInst>(EU.User)) { + if (isa_and_nonnull<InsertElementInst>(EU.User)) { if (auto *FTy = dyn_cast<FixedVectorType>(EU.User->getType())) { Optional<int> InsertIdx = getInsertIndex(EU.User, 0); if (!InsertIdx || *InsertIdx == UndefMaskElem) @@ -4508,8 +5338,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { return false; auto *IE1 = cast<InsertElementInst>(VU); auto *IE2 = cast<InsertElementInst>(V); - // Go though of insertelement instructions trying to find either VU as - // the original vector for IE2 or V as the original vector for IE1. + // Go through of insertelement instructions trying to find either VU + // as the original vector for IE2 or V as the original vector for IE1. do { if (IE1 == VU || IE2 == V) return true; @@ -4525,7 +5355,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { VF.push_back(FTy->getNumElements()); ShuffleMask.emplace_back(VF.back(), UndefMaskElem); FirstUsers.push_back(EU.User); - DemandedElts.push_back(APInt::getNullValue(VF.back())); + DemandedElts.push_back(APInt::getZero(VF.back())); VecId = FirstUsers.size() - 1; } else { VecId = std::distance(FirstUsers.begin(), It); @@ -4705,18 +5535,11 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask, } else { // Try to find nodes with the same vector factor. assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries."); - // FIXME: Shall be replaced by GetVF function once non-power-2 patch is - // landed. - auto &&GetVF = [](const TreeEntry *TE) { - if (!TE->ReuseShuffleIndices.empty()) - return TE->ReuseShuffleIndices.size(); - return TE->Scalars.size(); - }; DenseMap<int, const TreeEntry *> VFToTE; for (const TreeEntry *TE : UsedTEs.front()) - VFToTE.try_emplace(GetVF(TE), TE); + VFToTE.try_emplace(TE->getVectorFactor(), TE); for (const TreeEntry *TE : UsedTEs.back()) { - auto It = VFToTE.find(GetVF(TE)); + auto It = VFToTE.find(TE->getVectorFactor()); if (It != VFToTE.end()) { VF = It->first; Entries.push_back(It->second); @@ -4757,16 +5580,17 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask, InstructionCost BoUpSLP::getGatherCost(FixedVectorType *Ty, - const DenseSet<unsigned> &ShuffledIndices) const { + const DenseSet<unsigned> &ShuffledIndices, + bool NeedToShuffle) const { unsigned NumElts = Ty->getNumElements(); - APInt DemandedElts = APInt::getNullValue(NumElts); + APInt DemandedElts = APInt::getZero(NumElts); for (unsigned I = 0; I < NumElts; ++I) if (!ShuffledIndices.count(I)) DemandedElts.setBit(I); InstructionCost Cost = TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true, /*Extract*/ false); - if (!ShuffledIndices.empty()) + if (NeedToShuffle) Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty); return Cost; } @@ -4777,6 +5601,7 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const { if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) ScalarTy = SI->getValueOperand()->getType(); auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); + bool DuplicateNonConst = false; // Find the cost of inserting/extracting values from the vector. 
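// A sketch of the lane classification the gather cost below performs:
// constant scalars are folded straight into the build vector (no
// insertelement charged), and a repeated non-constant scalar is produced by
// shuffling the lane that was already inserted, which is what sets the
// DuplicateNonConst flag and adds one SK_PermuteSingleSrc. Scalars are
// modelled as strings and classifyGatherLanes is a hypothetical name.
#include <cstdio>
#include <string>
#include <unordered_set>
#include <vector>

struct GatherCostInputs {
  std::unordered_set<unsigned> SkippedLanes; // lanes with no insert cost
  bool NeedPermute = false;                  // DuplicateNonConst in the patch
};

static GatherCostInputs
classifyGatherLanes(const std::vector<std::string> &VL,
                    const std::unordered_set<std::string> &Constants) {
  GatherCostInputs R;
  std::unordered_set<std::string> Seen;
  // Walk from the last lane, as the patch does, so the higher-cost insert of
  // a duplicated value is the one whose cost is kept.
  for (unsigned I = VL.size(); I > 0; --I) {
    unsigned Lane = I - 1;
    if (Constants.count(VL[Lane])) {
      R.SkippedLanes.insert(Lane); // constants need no shuffle either
      continue;
    }
    if (!Seen.insert(VL[Lane]).second) {
      R.NeedPermute = true; // duplicate scalar: reuse it through a shuffle
      R.SkippedLanes.insert(Lane);
    }
  }
  return R;
}

int main() {
  GatherCostInputs R = classifyGatherLanes({"a", "1", "a", "b"}, {"1"});
  std::printf("skipped=%zu permute=%d\n", R.SkippedLanes.size(), R.NeedPermute);
}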
// Check if the same elements are inserted several times and count them as // shuffle candidates. @@ -4785,12 +5610,17 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const { // Iterate in reverse order to consider insert elements with the high cost. for (unsigned I = VL.size(); I > 0; --I) { unsigned Idx = I - 1; - if (isConstant(VL[Idx])) + // No need to shuffle duplicates for constants. + if (isConstant(VL[Idx])) { + ShuffledElements.insert(Idx); continue; - if (!UniqueElements.insert(VL[Idx]).second) + } + if (!UniqueElements.insert(VL[Idx]).second) { + DuplicateNonConst = true; ShuffledElements.insert(Idx); + } } - return getGatherCost(VecTy, ShuffledElements); + return getGatherCost(VecTy, ShuffledElements, DuplicateNonConst); } // Perform operand reordering on the instructions in VL and return the reordered @@ -5006,17 +5836,18 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { // block: // %phi = phi <2 x > { .., %entry} {%shuffle, %block} - // %2 = shuffle <2 x > %phi, %poison, <4 x > <0, 0, 1, 1> + // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0> // ... (use %2) - // %shuffle = shuffle <2 x> %2, poison, <2 x> {0, 2} + // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0} // br %block - SmallVector<int> UniqueIdxs; + SmallVector<int> UniqueIdxs(VF, UndefMaskElem); SmallSet<int, 4> UsedIdxs; int Pos = 0; int Sz = VL.size(); for (int Idx : E->ReuseShuffleIndices) { - if (Idx != Sz && UsedIdxs.insert(Idx).second) - UniqueIdxs.emplace_back(Pos); + if (Idx != Sz && Idx != UndefMaskElem && + UsedIdxs.insert(Idx).second) + UniqueIdxs[Idx] = Pos; ++Pos; } assert(VF >= UsedIdxs.size() && "Expected vectorization factor " @@ -5047,11 +5878,9 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { }).base()); VF = std::max<unsigned>(VF, PowerOf2Ceil(NumValues)); int UniqueVals = 0; - bool HasUndefs = false; for (Value *V : VL.drop_back(VL.size() - VF)) { if (isa<UndefValue>(V)) { ReuseShuffleIndicies.emplace_back(UndefMaskElem); - HasUndefs = true; continue; } if (isConstant(V)) { @@ -5066,15 +5895,10 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { ++UniqueVals; } } - if (HasUndefs && UniqueVals == 1 && UniqueValues.size() == 1) { + if (UniqueVals == 1 && UniqueValues.size() == 1) { // Emit pure splat vector. - // FIXME: why it is not identified as an identity. 
- unsigned NumUndefs = count(ReuseShuffleIndicies, UndefMaskElem); - if (NumUndefs == ReuseShuffleIndicies.size() - 1) - ReuseShuffleIndicies.append(VF - ReuseShuffleIndicies.size(), - UndefMaskElem); - else - ReuseShuffleIndicies.assign(VF, 0); + ReuseShuffleIndicies.append(VF - ReuseShuffleIndicies.size(), + UndefMaskElem); } else if (UniqueValues.size() >= VF - 1 || UniqueValues.size() <= 1) { ReuseShuffleIndicies.clear(); UniqueValues.clear(); @@ -5107,12 +5931,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); - unsigned VF = E->Scalars.size(); - if (NeedToShuffleReuses) - VF = E->ReuseShuffleIndices.size(); + unsigned VF = E->getVectorFactor(); ShuffleInstructionBuilder ShuffleBuilder(Builder, VF); if (E->State == TreeEntry::NeedToGather) { - setInsertPointAfterBundle(E); + if (E->getMainOp()) + setInsertPointAfterBundle(E); Value *Vec; SmallVector<int> Mask; SmallVector<const TreeEntry *> Entries; @@ -5152,13 +5975,17 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size()); switch (ShuffleOrOp) { case Instruction::PHI: { + assert( + (E->ReorderIndices.empty() || E != VectorizableTree.front().get()) && + "PHI reordering is free."); auto *PH = cast<PHINode>(VL0); Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); Value *V = NewPhi; - if (NeedToShuffleReuses) - V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + ShuffleBuilder.addInversedMask(E->ReorderIndices); + ShuffleBuilder.addMask(E->ReuseShuffleIndices); + V = ShuffleBuilder.finalize(V); E->VectorizedValue = V; @@ -5209,53 +6036,48 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return NewV; } case Instruction::InsertElement: { - Builder.SetInsertPoint(VL0); + assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique"); + Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back())); Value *V = vectorizeTree(E->getOperand(1)); + // Create InsertVector shuffle if necessary + auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) { + return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0)); + })); const unsigned NumElts = - cast<FixedVectorType>(VL0->getType())->getNumElements(); + cast<FixedVectorType>(FirstInsert->getType())->getNumElements(); const unsigned NumScalars = E->Scalars.size(); + unsigned Offset = *getInsertIndex(VL0, 0); + assert(Offset < NumElts && "Failed to find vector index offset"); + + // Create shuffle to resize vector + SmallVector<int> Mask; + if (!E->ReorderIndices.empty()) { + inversePermutation(E->ReorderIndices, Mask); + Mask.append(NumElts - NumScalars, UndefMaskElem); + } else { + Mask.assign(NumElts, UndefMaskElem); + std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0); + } // Create InsertVector shuffle if necessary - Instruction *FirstInsert = nullptr; bool IsIdentity = true; - unsigned Offset = UINT_MAX; + SmallVector<int> PrevMask(NumElts, UndefMaskElem); + Mask.swap(PrevMask); for (unsigned I = 0; I < NumScalars; ++I) { - Value *Scalar = E->Scalars[I]; - if (!FirstInsert && - !is_contained(E->Scalars, cast<Instruction>(Scalar)->getOperand(0))) - FirstInsert = cast<Instruction>(Scalar); + Value *Scalar = E->Scalars[PrevMask[I]]; Optional<int> InsertIdx = getInsertIndex(Scalar, 0); if (!InsertIdx || *InsertIdx == UndefMaskElem) continue; - unsigned Idx = *InsertIdx; - if 
(Idx < Offset) { - Offset = Idx; - IsIdentity &= I == 0; - } else { - assert(Idx >= Offset && "Failed to find vector index offset"); - IsIdentity &= Idx - Offset == I; - } - } - assert(Offset < NumElts && "Failed to find vector index offset"); - - // Create shuffle to resize vector - SmallVector<int> Mask(NumElts, UndefMaskElem); - if (!IsIdentity) { - for (unsigned I = 0; I < NumScalars; ++I) { - Value *Scalar = E->Scalars[I]; - Optional<int> InsertIdx = getInsertIndex(Scalar, 0); - if (!InsertIdx || *InsertIdx == UndefMaskElem) - continue; - Mask[*InsertIdx - Offset] = I; - } - } else { - std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0); + IsIdentity &= *InsertIdx - Offset == I; + Mask[*InsertIdx - Offset] = I; } if (!IsIdentity || NumElts != NumScalars) V = Builder.CreateShuffleVector(V, Mask); - if (NumElts != NumScalars) { + if ((!IsIdentity || Offset != 0 || + !isa<UndefValue>(FirstInsert->getOperand(0))) && + NumElts != NumScalars) { SmallVector<int> InsertMask(NumElts); std::iota(InsertMask.begin(), InsertMask.end(), 0); for (unsigned I = 0; I < NumElts; I++) { @@ -5295,6 +6117,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { auto *CI = cast<CastInst>(VL0); Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); + ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -5317,6 +6140,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); Value *V = Builder.CreateCmp(P0, L, R); propagateIRFlags(V, E->Scalars, VL0); + ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -5337,6 +6161,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *V = Builder.CreateSelect(Cond, True, False); + ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -5360,6 +6185,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (auto *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); + ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -5403,6 +6229,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (auto *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); + ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -5414,9 +6241,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::Load: { // Loads are inserted at the head of the tree because we don't want to // sink them all the way down past store instructions. - bool IsReorder = E->updateStateIfReorder(); - if (IsReorder) - VL0 = E->getMainOp(); setInsertPointAfterBundle(E); LoadInst *LI = cast<LoadInst>(VL0); @@ -5457,9 +6281,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::Store: { - bool IsReorder = !E->ReorderIndices.empty(); - auto *SI = cast<StoreInst>( - IsReorder ? 
E->Scalars[E->ReorderIndices.front()] : VL0); + auto *SI = cast<StoreInst>(VL0); unsigned AS = SI->getPointerAddressSpace(); setInsertPointAfterBundle(E); @@ -5491,37 +6313,22 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::GetElementPtr: { + auto *GEP0 = cast<GetElementPtrInst>(VL0); setInsertPointAfterBundle(E); Value *Op0 = vectorizeTree(E->getOperand(0)); - std::vector<Value *> OpVecs; - for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e; - ++j) { - ValueList &VL = E->getOperand(j); - // Need to cast all elements to the same type before vectorization to - // avoid crash. - Type *VL0Ty = VL0->getOperand(j)->getType(); - Type *Ty = llvm::all_of( - VL, [VL0Ty](Value *V) { return VL0Ty == V->getType(); }) - ? VL0Ty - : DL->getIndexType(cast<GetElementPtrInst>(VL0) - ->getPointerOperandType() - ->getScalarType()); - for (Value *&V : VL) { - auto *CI = cast<ConstantInt>(V); - V = ConstantExpr::getIntegerCast(CI, Ty, - CI->getValue().isSignBitSet()); - } - Value *OpVec = vectorizeTree(VL); + SmallVector<Value *> OpVecs; + for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) { + Value *OpVec = vectorizeTree(E->getOperand(J)); OpVecs.push_back(OpVec); } - Value *V = Builder.CreateGEP( - cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs); + Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs); if (Instruction *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); + ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -5548,7 +6355,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { std::vector<Value *> OpVecs; SmallVector<Type *, 2> TysForDecl = {FixedVectorType::get(CI->getType(), E->Scalars.size())}; - for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) { + for (int j = 0, e = CI->arg_size(); j < e; ++j) { ValueList OpVL; // Some intrinsics have scalar arguments. This argument should not be // vectorized. @@ -5594,6 +6401,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } propagateIRFlags(V, E->Scalars, VL0); + ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -5641,19 +6449,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { // Also, gather up main and alt scalar ops to propagate IR flags to // each vector operation. 
ValueList OpScalars, AltScalars; - unsigned Sz = E->Scalars.size(); - SmallVector<int> Mask(Sz); - for (unsigned I = 0; I < Sz; ++I) { - auto *OpInst = cast<Instruction>(E->Scalars[I]); - assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); - if (OpInst->getOpcode() == E->getAltOpcode()) { - Mask[I] = Sz + I; - AltScalars.push_back(E->Scalars[I]); - } else { - Mask[I] = I; - OpScalars.push_back(E->Scalars[I]); - } - } + SmallVector<int> Mask; + buildSuffleEntryMask( + E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, + [E](Instruction *I) { + assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); + return I->getOpcode() == E->getAltOpcode(); + }, + Mask, &OpScalars, &AltScalars); propagateIRFlags(V0, OpScalars); propagateIRFlags(V1, AltScalars); @@ -5661,7 +6464,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *V = Builder.CreateShuffleVector(V0, V1, Mask); if (Instruction *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); - ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); E->VectorizedValue = V; @@ -5836,7 +6638,9 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); // It is legal to delete users in the ignorelist. - assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) && + assert((getTreeEntry(U) || is_contained(UserIgnoreList, U) || + (isa_and_nonnull<Instruction>(U) && + isDeleted(cast<Instruction>(U)))) && "Deleting out-of-tree value"); } } @@ -5911,27 +6715,28 @@ void BoUpSLP::optimizeGatherSequence() { "Worklist not sorted properly!"); BasicBlock *BB = (*I)->getBlock(); // For all instructions in blocks containing gather sequences: - for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) { - Instruction *In = &*it++; - if (isDeleted(In)) + for (Instruction &In : llvm::make_early_inc_range(*BB)) { + if (isDeleted(&In)) continue; - if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In)) + if (!isa<InsertElementInst>(&In) && !isa<ExtractElementInst>(&In) && + !isa<ShuffleVectorInst>(&In)) continue; // Check if we can replace this instruction with any of the // visited instructions. + bool Replaced = false; for (Instruction *v : Visited) { - if (In->isIdenticalTo(v) && - DT->dominates(v->getParent(), In->getParent())) { - In->replaceAllUsesWith(v); - eraseInstruction(In); - In = nullptr; + if (In.isIdenticalTo(v) && + DT->dominates(v->getParent(), In.getParent())) { + In.replaceAllUsesWith(v); + eraseInstruction(&In); + Replaced = true; break; } } - if (In) { - assert(!is_contained(Visited, In)); - Visited.push_back(In); + if (!Replaced) { + assert(!is_contained(Visited, &In)); + Visited.push_back(&In); } } } @@ -5944,7 +6749,9 @@ void BoUpSLP::optimizeGatherSequence() { Optional<BoUpSLP::ScheduleData *> BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, const InstructionsState &S) { - if (isa<PHINode>(S.OpValue) || isa<InsertElementInst>(S.OpValue)) + // No need to schedule PHIs, insertelement, extractelement and extractvalue + // instructions. + if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue)) return nullptr; // Initialize the instruction bundle. 
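// The rewritten gather-sequence cleanup above walks each block with
// llvm::make_early_inc_range so the instruction being inspected can be erased
// (replaced by an identical, dominating one) without invalidating the loop.
// The same idiom in portable C++ on a std::list, with even values standing in
// for the redundant instructions; only the iteration pattern is being shown.
#include <cstdio>
#include <list>

int main() {
  std::list<int> Insts = {1, 2, 3, 4, 5, 6};
  for (auto It = Insts.begin(); It != Insts.end();) {
    auto Cur = It++;    // early increment: step past Cur before using it
    if (*Cur % 2 == 0)
      Insts.erase(Cur); // safe: It already points at the next element
  }
  for (int V : Insts)
    std::printf("%d ", V); // prints: 1 3 5
  std::printf("\n");
}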
@@ -6040,7 +6847,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL, Value *OpValue) { - if (isa<PHINode>(OpValue) || isa<InsertElementInst>(OpValue)) + if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue)) return; ScheduleData *Bundle = getScheduleData(OpValue); @@ -6080,8 +6887,9 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, return true; Instruction *I = dyn_cast<Instruction>(V); assert(I && "bundle member must be an instruction"); - assert(!isa<PHINode>(I) && !isa<InsertElementInst>(I) && - "phi nodes/insertelements don't need to be scheduled"); + assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) && + "phi nodes/insertelements/extractelements/extractvalues don't need to " + "be scheduled"); auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool { ScheduleData *ISD = getScheduleData(I); if (!ISD) @@ -6351,7 +7159,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) { BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) { - assert((isa<InsertElementInst>(SD->Inst) || + assert((isVectorLikeInstWithConstOps(SD->Inst) || SD->isPartOfBundle() == (getTreeEntry(SD->Inst) != nullptr)) && "scheduler and vectorizer bundle mismatch"); SD->FirstInBundle->SchedulingPriority = Idx++; @@ -6694,9 +7502,7 @@ struct SLPVectorizer : public FunctionPass { initializeSLPVectorizerPass(*PassRegistry::getPassRegistry()); } - bool doInitialization(Module &M) override { - return false; - } + bool doInitialization(Module &M) override { return false; } bool runOnFunction(Function &F) override { if (skipFunction(F)) @@ -6831,44 +7637,6 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, return Changed; } -/// Order may have elements assigned special value (size) which is out of -/// bounds. Such indices only appear on places which correspond to undef values -/// (see canReuseExtract for details) and used in order to avoid undef values -/// have effect on operands ordering. -/// The first loop below simply finds all unused indices and then the next loop -/// nest assigns these indices for undef values positions. -/// As an example below Order has two undef positions and they have assigned -/// values 3 and 7 respectively: -/// before: 6 9 5 4 9 2 1 0 -/// after: 6 3 5 4 7 2 1 0 -/// \returns Fixed ordering. 
-static BoUpSLP::OrdersType fixupOrderingIndices(ArrayRef<unsigned> Order) { - BoUpSLP::OrdersType NewOrder(Order.begin(), Order.end()); - const unsigned Sz = NewOrder.size(); - SmallBitVector UsedIndices(Sz); - SmallVector<int> MaskedIndices; - for (int I = 0, E = NewOrder.size(); I < E; ++I) { - if (NewOrder[I] < Sz) - UsedIndices.set(NewOrder[I]); - else - MaskedIndices.push_back(I); - } - if (MaskedIndices.empty()) - return NewOrder; - SmallVector<int> AvailableIndices(MaskedIndices.size()); - unsigned Cnt = 0; - int Idx = UsedIndices.find_first(); - do { - AvailableIndices[Cnt] = Idx; - Idx = UsedIndices.find_next(Idx); - ++Cnt; - } while (Idx > 0); - assert(Cnt == MaskedIndices.size() && "Non-synced masked/available indices."); - for (int I = 0, E = MaskedIndices.size(); I < E; ++I) - NewOrder[MaskedIndices[I]] = AvailableIndices[I]; - return NewOrder; -} - bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, unsigned Idx) { LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size() @@ -6884,19 +7652,13 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, << "\n"); R.buildTree(Chain); - Optional<ArrayRef<unsigned>> Order = R.bestOrder(); - // TODO: Handle orders of size less than number of elements in the vector. - if (Order && Order->size() == Chain.size()) { - // TODO: reorder tree nodes without tree rebuilding. - SmallVector<Value *, 4> ReorderedOps(Chain.size()); - transform(fixupOrderingIndices(*Order), ReorderedOps.begin(), - [Chain](const unsigned Idx) { return Chain[Idx]; }); - R.buildTree(ReorderedOps); - } if (R.isTreeTinyAndNotFullyVectorizable()) return false; if (R.isLoadCombineCandidate()) return false; + R.reorderTopToBottom(); + R.reorderBottomToTop(); + R.buildExternalUses(); R.computeMinimumValueSizes(); @@ -7019,7 +7781,7 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, unsigned EltSize = R.getVectorElementSize(Operands[0]); unsigned MaxElts = llvm::PowerOf2Floor(MaxVecRegSize / EltSize); - unsigned MinVF = std::max(2U, R.getMinVecRegSize() / EltSize); + unsigned MinVF = R.getMinVF(EltSize); unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts); @@ -7092,11 +7854,11 @@ bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { if (!A || !B) return false; Value *VL[] = {A, B}; - return tryToVectorizeList(VL, R, /*AllowReorder=*/true); + return tryToVectorizeList(VL, R); } bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, - bool AllowReorder) { + bool LimitForRegisterSize) { if (VL.size() < 2) return false; @@ -7130,7 +7892,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, } unsigned Sz = R.getVectorElementSize(I0); - unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz); + unsigned MinVF = R.getMinVF(Sz); unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF); MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF); if (MaxVF < 2) { @@ -7168,7 +7930,8 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, if (!isPowerOf2_32(OpsWidth)) continue; - if ((VF > MinVF && OpsWidth <= VF / 2) || (VF == MinVF && OpsWidth < 2)) + if ((LimitForRegisterSize && OpsWidth < MaxVF) || + (VF > MinVF && OpsWidth <= VF / 2) || (VF == MinVF && OpsWidth < 2)) break; ArrayRef<Value *> Ops = VL.slice(I, OpsWidth); @@ -7183,18 +7946,11 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, << "\n"); R.buildTree(Ops); - 
if (AllowReorder) { - Optional<ArrayRef<unsigned>> Order = R.bestOrder(); - if (Order) { - // TODO: reorder tree nodes without tree rebuilding. - SmallVector<Value *, 4> ReorderedOps(Ops.size()); - transform(fixupOrderingIndices(*Order), ReorderedOps.begin(), - [Ops](const unsigned Idx) { return Ops[Idx]; }); - R.buildTree(ReorderedOps); - } - } if (R.isTreeTinyAndNotFullyVectorizable()) continue; + R.reorderTopToBottom(); + R.reorderBottomToTop(); + R.buildExternalUses(); R.computeMinimumValueSizes(); InstructionCost Cost = R.getTreeCost(); @@ -7387,10 +8143,20 @@ class HorizontalReduction { Value *RHS, const Twine &Name, bool UseSelect) { unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind); switch (Kind) { - case RecurKind::Add: - case RecurKind::Mul: case RecurKind::Or: + if (UseSelect && + LHS->getType() == CmpInst::makeCmpResultType(LHS->getType())) + return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name); + return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS, + Name); case RecurKind::And: + if (UseSelect && + LHS->getType() == CmpInst::makeCmpResultType(LHS->getType())) + return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name); + return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS, + Name); + case RecurKind::Add: + case RecurKind::Mul: case RecurKind::Xor: case RecurKind::FAdd: case RecurKind::FMul: @@ -7434,8 +8200,12 @@ class HorizontalReduction { static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS, Value *RHS, const Twine &Name, const ReductionOpsListType &ReductionOps) { - bool UseSelect = ReductionOps.size() == 2; - assert((!UseSelect || isa<SelectInst>(ReductionOps[1][0])) && + bool UseSelect = ReductionOps.size() == 2 || + // Logical or/and. + (ReductionOps.size() == 1 && + isa<SelectInst>(ReductionOps.front().front())); + assert((!UseSelect || ReductionOps.size() != 2 || + isa<SelectInst>(ReductionOps[1][0])) && "Expected cmp + select pairs for reduction"); Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect); if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) { @@ -7573,10 +8343,10 @@ class HorizontalReduction { /// Checks if the instruction is in basic block \p BB. /// For a cmp+sel min/max reduction check that both ops are in \p BB. static bool hasSameParent(Instruction *I, BasicBlock *BB) { - if (isCmpSelMinMax(I)) { + if (isCmpSelMinMax(I) || (isBoolLogicOp(I) && isa<SelectInst>(I))) { auto *Sel = cast<SelectInst>(I); - auto *Cmp = cast<Instruction>(Sel->getCondition()); - return Sel->getParent() == BB && Cmp->getParent() == BB; + auto *Cmp = dyn_cast<Instruction>(Sel->getCondition()); + return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB; } return I->getParent() == BB; } @@ -7758,13 +8528,13 @@ public: } /// Attempt to vectorize the tree found by matchAssociativeReduction. - bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) { + Value *tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) { // If there are a sufficient number of reduction values, reduce // to a nearby power-of-2. We can safely generate oversized // vectors and rely on the backend to split them to legal sizes. unsigned NumReducedVals = ReducedVals.size(); if (NumReducedVals < 4) - return false; + return nullptr; // Intersect the fast-math-flags from all reduction operations. 
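// The reduction-op builder above now emits i1 and/or as selects:
// "a | b" becomes select(a, true, b) and "a & b" becomes select(a, b, false).
// Unlike the plain binary op, a select does not propagate poison from the
// operand it does not choose, which is why this form is used for the
// poison-safe boolean-logic reductions. A truth-table check of the mapping;
// selectOr/selectAnd are illustrative helpers, not patch code.
#include <cassert>
#include <cstdio>

static bool selectOr(bool L, bool R) { return L ? true : R; }   // or
static bool selectAnd(bool L, bool R) { return L ? R : false; } // and

int main() {
  for (int L = 0; L < 2; ++L)
    for (int R = 0; R < 2; ++R) {
      assert(selectOr(L != 0, R != 0) == ((L != 0) || (R != 0)));
      assert(selectAnd(L != 0, R != 0) == ((L != 0) && (R != 0)));
    }
  std::printf("select forms match | and &\n");
}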
FastMathFlags RdxFMF; @@ -7838,22 +8608,14 @@ public: unsigned i = 0; while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) { ArrayRef<Value *> VL(&ReducedVals[i], ReduxWidth); - V.buildTree(VL, ExternallyUsedValues, IgnoreList); - Optional<ArrayRef<unsigned>> Order = V.bestOrder(); - if (Order) { - assert(Order->size() == VL.size() && - "Order size must be the same as number of vectorized " - "instructions."); - // TODO: reorder tree nodes without tree rebuilding. - SmallVector<Value *, 4> ReorderedOps(VL.size()); - transform(fixupOrderingIndices(*Order), ReorderedOps.begin(), - [VL](const unsigned Idx) { return VL[Idx]; }); - V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList); - } - if (V.isTreeTinyAndNotFullyVectorizable()) + V.buildTree(VL, IgnoreList); + if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) break; if (V.isLoadCombineReductionCandidate(RdxKind)) break; + V.reorderTopToBottom(); + V.reorderBottomToTop(/*IgnoreReorder=*/true); + V.buildExternalUses(ExternallyUsedValues); // For a poison-safe boolean logic reduction, do not replace select // instructions with logic ops. All reduced values will be frozen (see @@ -7873,7 +8635,7 @@ public: InstructionCost Cost = TreeCost + ReductionCost; if (!Cost.isValid()) { LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n"); - return false; + return nullptr; } if (Cost >= -SLPCostThreshold) { V.getORE()->emit([&]() { @@ -7953,7 +8715,7 @@ public: // vector reductions. V.eraseInstructions(IgnoreList); } - return VectorizedTree != nullptr; + return VectorizedTree; } unsigned numReductionValues() const { return ReducedVals.size(); } @@ -7963,6 +8725,7 @@ private: InstructionCost getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal, unsigned ReduxWidth, FastMathFlags FMF) { + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; Type *ScalarTy = FirstReducedVal->getType(); FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth); InstructionCost VectorCost, ScalarCost; @@ -7975,33 +8738,39 @@ private: case RecurKind::FAdd: case RecurKind::FMul: { unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind); - VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF); - ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy); + VectorCost = + TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind); + ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind); break; } case RecurKind::FMax: case RecurKind::FMin: { + auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy); auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy)); VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, - /*unsigned=*/false); - ScalarCost = - TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy) + - TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, - CmpInst::makeCmpResultType(ScalarTy)); + /*unsigned=*/false, CostKind); + CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind); + ScalarCost = TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy, + SclCondTy, RdxPred, CostKind) + + TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, + SclCondTy, RdxPred, CostKind); break; } case RecurKind::SMax: case RecurKind::SMin: case RecurKind::UMax: case RecurKind::UMin: { + auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy); auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy)); bool IsUnsigned = RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin; - VectorCost = 
TTI->getMinMaxReductionCost(VectorTy, VecCondTy, IsUnsigned); - ScalarCost = - TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy) + - TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, - CmpInst::makeCmpResultType(ScalarTy)); + VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, IsUnsigned, + CostKind); + CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind); + ScalarCost = TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy, + SclCondTy, RdxPred, CostKind) + + TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, + SclCondTy, RdxPred, CostKind); break; } default: @@ -8023,6 +8792,7 @@ private: assert(isPowerOf2_32(ReduxWidth) && "We only handle power-of-two reductions for now"); + ++NumVectorInstructions; return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind, ReductionOps.back()); } @@ -8232,32 +9002,45 @@ static bool tryToVectorizeHorReductionOrInstOperands( // Skip the analysis of CmpInsts.Compiler implements postanalysis of the // CmpInsts so we can skip extra attempts in // tryToVectorizeHorReductionOrInstOperands and save compile time. - SmallVector<std::pair<Instruction *, unsigned>, 8> Stack(1, {Root, 0}); + std::queue<std::pair<Instruction *, unsigned>> Stack; + Stack.emplace(Root, 0); SmallPtrSet<Value *, 8> VisitedInstrs; + SmallVector<WeakTrackingVH> PostponedInsts; bool Res = false; + auto &&TryToReduce = [TTI, &P, &R](Instruction *Inst, Value *&B0, + Value *&B1) -> Value * { + bool IsBinop = matchRdxBop(Inst, B0, B1); + bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value())); + if (IsBinop || IsSelect) { + HorizontalReduction HorRdx; + if (HorRdx.matchAssociativeReduction(P, Inst)) + return HorRdx.tryToReduce(R, TTI); + } + return nullptr; + }; while (!Stack.empty()) { Instruction *Inst; unsigned Level; - std::tie(Inst, Level) = Stack.pop_back_val(); + std::tie(Inst, Level) = Stack.front(); + Stack.pop(); // Do not try to analyze instruction that has already been vectorized. // This may happen when we vectorize instruction operands on a previous // iteration while stack was populated before that happened. if (R.isDeleted(Inst)) continue; - Value *B0, *B1; - bool IsBinop = matchRdxBop(Inst, B0, B1); - bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value())); - if (IsBinop || IsSelect) { - HorizontalReduction HorRdx; - if (HorRdx.matchAssociativeReduction(P, Inst)) { - if (HorRdx.tryToReduce(R, TTI)) { - Res = true; - // Set P to nullptr to avoid re-analysis of phi node in - // matchAssociativeReduction function unless this is the root node. - P = nullptr; - continue; - } + Value *B0 = nullptr, *B1 = nullptr; + if (Value *V = TryToReduce(Inst, B0, B1)) { + Res = true; + // Set P to nullptr to avoid re-analysis of phi node in + // matchAssociativeReduction function unless this is the root node. + P = nullptr; + if (auto *I = dyn_cast<Instruction>(V)) { + // Try to find another reduction. + Stack.emplace(I, Level); + continue; } + } else { + bool IsBinop = B0 && B1; if (P && IsBinop) { Inst = dyn_cast<Instruction>(B0); if (Inst == P) @@ -8269,14 +9052,14 @@ static bool tryToVectorizeHorReductionOrInstOperands( continue; } } - } - // Set P to nullptr to avoid re-analysis of phi node in - // matchAssociativeReduction function unless this is the root node. - P = nullptr; - // Do not try to vectorize CmpInst operands, this is done separately. 
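// Rough shape of the rewritten traversal above: the worklist is now a FIFO
// queue seeded with the root, a reduction is attempted first for each
// instruction, and binops that did not form a reduction are only recorded and
// re-tried as plain vectorization after the whole walk, so an early pairwise
// match no longer shadows a later reduction. The Node graph, FormsReduction
// flag and walk() are illustrative stand-ins; the real code additionally
// filters PHIs/compares and re-queues the instruction a reduction produced.
#include <cstdio>
#include <queue>
#include <string>
#include <utility>
#include <vector>

struct Node {
  std::string Name;
  bool FormsReduction;
  std::vector<int> Operands; // indices into the graph
};

static void walk(const std::vector<Node> &Graph, int Root, unsigned MaxDepth) {
  std::queue<std::pair<int, unsigned>> Worklist;
  Worklist.emplace(Root, 0u);
  std::vector<int> Postponed;
  while (!Worklist.empty()) {
    auto [Idx, Level] = Worklist.front();
    Worklist.pop();
    const Node &N = Graph[Idx];
    if (N.FormsReduction)
      std::printf("reduced %s\n", N.Name.c_str());
    else
      Postponed.push_back(Idx); // try plain vectorization after the walk
    if (Level < MaxDepth)
      for (int Op : N.Operands)
        Worklist.emplace(Op, Level + 1);
  }
  for (int Idx : Postponed)
    std::printf("late vectorization attempt: %s\n", Graph[Idx].Name.c_str());
}

int main() {
  std::vector<Node> G = {{"root add", false, {1, 2}},
                         {"fadd chain", true, {}},
                         {"mul", false, {}}};
  walk(G, 0, /*MaxDepth=*/2);
}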
- if (!isa<CmpInst>(Inst) && Vectorize(Inst, R)) { - Res = true; - continue; + // Set P to nullptr to avoid re-analysis of phi node in + // matchAssociativeReduction function unless this is the root node. + P = nullptr; + // Do not try to vectorize CmpInst operands, this is done separately. + // Final attempt for binop args vectorization should happen after the loop + // to try to find reductions. + if (!isa<CmpInst>(Inst)) + PostponedInsts.push_back(Inst); } // Try to vectorize operands. @@ -8290,8 +9073,13 @@ static bool tryToVectorizeHorReductionOrInstOperands( // separately. if (!isa<PHINode>(I) && !isa<CmpInst>(I) && !R.isDeleted(I) && I->getParent() == BB) - Stack.emplace_back(I, Level); + Stack.emplace(I, Level); } + // Try to vectorized binops where reductions were not found. + for (Value *V : PostponedInsts) + if (auto *Inst = dyn_cast<Instruction>(V)) + if (!R.isDeleted(Inst)) + Res |= Vectorize(Inst, R); return Res; } @@ -8326,7 +9114,7 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n"); // Aggregate value is unlikely to be processed in vector register, we need to // extract scalars into scalar registers, so NeedExtraction is set true. - return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false); + return tryToVectorizeList(BuildVectorOpds, R); } bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, @@ -8337,11 +9125,11 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) || (llvm::all_of(BuildVectorOpds, [](Value *V) { return isa<ExtractElementInst>(V); }) && - isShuffle(BuildVectorOpds, Mask))) + isFixedVectorShuffle(BuildVectorOpds, Mask))) return false; LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n"); - return tryToVectorizeList(BuildVectorInsts, R, /*AllowReorder=*/true); + return tryToVectorizeList(BuildVectorInsts, R); } bool SLPVectorizerPass::vectorizeSimpleInstructions( @@ -8382,6 +9170,78 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions( return OpsChanged; } +template <typename T> +static bool +tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, + function_ref<unsigned(T *)> Limit, + function_ref<bool(T *, T *)> Comparator, + function_ref<bool(T *, T *)> AreCompatible, + function_ref<bool(ArrayRef<T *>, bool)> TryToVectorize, + bool LimitForRegisterSize) { + bool Changed = false; + // Sort by type, parent, operands. + stable_sort(Incoming, Comparator); + + // Try to vectorize elements base on their type. + SmallVector<T *> Candidates; + for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;) { + // Look for the next elements with the same type, parent and operand + // kinds. + auto *SameTypeIt = IncIt; + while (SameTypeIt != E && AreCompatible(*SameTypeIt, *IncIt)) + ++SameTypeIt; + + // Try to vectorize them. + unsigned NumElts = (SameTypeIt - IncIt); + LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes (" + << NumElts << ")\n"); + // The vectorization is a 3-state attempt: + // 1. Try to vectorize instructions with the same/alternate opcodes with the + // size of maximal register at first. + // 2. Try to vectorize remaining instructions with the same type, if + // possible. This may result in the better vectorization results rather than + // if we try just to vectorize instructions with the same/alternate opcodes. + // 3. 
+    // same/alternate ops only, this may result in some extra final
+    // vectorization.
+    if (NumElts > 1 &&
+        TryToVectorize(makeArrayRef(IncIt, NumElts), LimitForRegisterSize)) {
+      // Success start over because instructions might have been changed.
+      Changed = true;
+    } else if (NumElts < Limit(*IncIt) &&
+               (Candidates.empty() ||
+                Candidates.front()->getType() == (*IncIt)->getType())) {
+      Candidates.append(IncIt, std::next(IncIt, NumElts));
+    }
+    // Final attempt to vectorize instructions with the same types.
+    if (Candidates.size() > 1 &&
+        (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
+      if (TryToVectorize(Candidates, /*LimitForRegisterSize=*/false)) {
+        // Success start over because instructions might have been changed.
+        Changed = true;
+      } else if (LimitForRegisterSize) {
+        // Try to vectorize using small vectors.
+        for (auto *It = Candidates.begin(), *End = Candidates.end();
+             It != End;) {
+          auto *SameTypeIt = It;
+          while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It))
+            ++SameTypeIt;
+          unsigned NumElts = (SameTypeIt - It);
+          if (NumElts > 1 && TryToVectorize(makeArrayRef(It, NumElts),
+                                            /*LimitForRegisterSize=*/false))
+            Changed = true;
+          It = SameTypeIt;
+        }
+      }
+      Candidates.clear();
+    }
+
+    // Start over at the next instruction of a different type (or the end).
+    IncIt = SameTypeIt;
+  }
+  return Changed;
+}
+
 bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
   bool Changed = false;
   SmallVector<Value *, 4> Incoming;
@@ -8390,11 +9250,89 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
   // node. Allows better to identify the chains that can be vectorized in the
   // better way.
   DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
+  auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
+    assert(isValidElementType(V1->getType()) &&
+           isValidElementType(V2->getType()) &&
+           "Expected vectorizable types only.");
+    // It is fine to compare type IDs here, since we expect only vectorizable
+    // types, like ints, floats and pointers, we don't care about other type.
+    if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
+      return true;
+    if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
+      return false;
+    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
+    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
+    if (Opcodes1.size() < Opcodes2.size())
+      return true;
+    if (Opcodes1.size() > Opcodes2.size())
+      return false;
+    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
+      // Undefs are compatible with any other value.
+      if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
+        continue;
+      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
+        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
+          DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
+          DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
+          if (!NodeI1)
+            return NodeI2 != nullptr;
+          if (!NodeI2)
+            return false;
+          assert((NodeI1 == NodeI2) ==
+                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
+                 "Different nodes should have different DFS numbers");
+          if (NodeI1 != NodeI2)
+            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
+          InstructionsState S = getSameOpcode({I1, I2});
+          if (S.getOpcode())
+            continue;
+          return I1->getOpcode() < I2->getOpcode();
+        }
+      if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
+        continue;
+      if (Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID())
+        return true;
+      if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID())
+        return false;
+    }
+    return false;
+  };
+  auto AreCompatiblePHIs = [&PHIToOpcodes](Value *V1, Value *V2) {
+    if (V1 == V2)
+      return true;
+    if (V1->getType() != V2->getType())
+      return false;
+    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
+    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
+    if (Opcodes1.size() != Opcodes2.size())
+      return false;
+    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
+      // Undefs are compatible with any other value.
+      if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
+        continue;
+      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
+        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
+          if (I1->getParent() != I2->getParent())
+            return false;
+          InstructionsState S = getSameOpcode({I1, I2});
+          if (S.getOpcode())
+            continue;
+          return false;
+        }
+      if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
+        continue;
+      if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
+        return false;
+    }
+    return true;
+  };
+  auto Limit = [&R](Value *V) {
+    unsigned EltSize = R.getVectorElementSize(V);
+    return std::max(2U, R.getMaxVecRegSize() / EltSize);
+  };
 
-  bool HaveVectorizedPhiNodes = true;
-  while (HaveVectorizedPhiNodes) {
-    HaveVectorizedPhiNodes = false;
-
+  bool HaveVectorizedPhiNodes = false;
+  do {
     // Collect the incoming values from the PHIs.
     Incoming.clear();
     for (Instruction &I : *BB) {
@@ -8432,132 +9370,15 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
       }
     }
 
-    // Sort by type, parent, operands.
-    stable_sort(Incoming, [this, &PHIToOpcodes](Value *V1, Value *V2) {
-      assert(isValidElementType(V1->getType()) &&
-             isValidElementType(V2->getType()) &&
-             "Expected vectorizable types only.");
-      // It is fine to compare type IDs here, since we expect only vectorizable
-      // types, like ints, floats and pointers, we don't care about other type.
-      if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
-        return true;
-      if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
-        return false;
-      ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
-      ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
-      if (Opcodes1.size() < Opcodes2.size())
-        return true;
-      if (Opcodes1.size() > Opcodes2.size())
-        return false;
-      for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
-        // Undefs are compatible with any other value.
-        if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
-          continue;
-        if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
-          if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
-            DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
-            DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
-            if (!NodeI1)
-              return NodeI2 != nullptr;
-            if (!NodeI2)
-              return false;
-            assert((NodeI1 == NodeI2) ==
-                       (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
-                   "Different nodes should have different DFS numbers");
-            if (NodeI1 != NodeI2)
-              return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
-            InstructionsState S = getSameOpcode({I1, I2});
-            if (S.getOpcode())
-              continue;
-            return I1->getOpcode() < I2->getOpcode();
-          }
-        if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
-          continue;
-        if (Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID())
-          return true;
-        if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID())
-          return false;
-      }
-      return false;
-    });
-
-    auto &&AreCompatiblePHIs = [&PHIToOpcodes](Value *V1, Value *V2) {
-      if (V1 == V2)
-        return true;
-      if (V1->getType() != V2->getType())
-        return false;
-      ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
-      ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
-      if (Opcodes1.size() != Opcodes2.size())
-        return false;
-      for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
-        // Undefs are compatible with any other value.
-        if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
-          continue;
-        if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
-          if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
-            if (I1->getParent() != I2->getParent())
-              return false;
-            InstructionsState S = getSameOpcode({I1, I2});
-            if (S.getOpcode())
-              continue;
-            return false;
-          }
-        if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
-          continue;
-        if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
-          return false;
-      }
-      return true;
-    };
-
-    // Try to vectorize elements base on their type.
-    SmallVector<Value *, 4> Candidates;
-    for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
-                                           E = Incoming.end();
-         IncIt != E;) {
-
-      // Look for the next elements with the same type, parent and operand
-      // kinds.
-      SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
-      while (SameTypeIt != E && AreCompatiblePHIs(*SameTypeIt, *IncIt)) {
-        VisitedInstrs.insert(*SameTypeIt);
-        ++SameTypeIt;
-      }
-
-      // Try to vectorize them.
-      unsigned NumElts = (SameTypeIt - IncIt);
-      LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at PHIs ("
-                        << NumElts << ")\n");
-      // The order in which the phi nodes appear in the program does not matter.
-      // So allow tryToVectorizeList to reorder them if it is beneficial. This
-      // is done when there are exactly two elements since tryToVectorizeList
-      // asserts that there are only two values when AllowReorder is true.
-      if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R,
-                                            /*AllowReorder=*/true)) {
-        // Success start over because instructions might have been changed.
-        HaveVectorizedPhiNodes = true;
-        Changed = true;
-      } else if (NumElts < 4 &&
-                 (Candidates.empty() ||
-                  Candidates.front()->getType() == (*IncIt)->getType())) {
-        Candidates.append(IncIt, std::next(IncIt, NumElts));
-      }
-      // Final attempt to vectorize phis with the same types.
-      if (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType()) {
-        if (Candidates.size() > 1 &&
-            tryToVectorizeList(Candidates, R, /*AllowReorder=*/true)) {
-          // Success start over because instructions might have been changed.
-          HaveVectorizedPhiNodes = true;
-          Changed = true;
-        }
-        Candidates.clear();
-      }
-
-      // Start over at the next instruction of a different type (or the end).
-      IncIt = SameTypeIt;
-    }
-  }
+    HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
+        Incoming, Limit, PHICompare, AreCompatiblePHIs,
+        [this, &R](ArrayRef<Value *> Candidates, bool LimitForRegisterSize) {
+          return tryToVectorizeList(Candidates, R, LimitForRegisterSize);
+        },
+        /*LimitForRegisterSize=*/true);
+    Changed |= HaveVectorizedPhiNodes;
+    VisitedInstrs.insert(Incoming.begin(), Incoming.end());
+  } while (HaveVectorizedPhiNodes);
 
   VisitedInstrs.clear();
 
@@ -8810,6 +9631,10 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
     return V1->getValueOperand()->getValueID() ==
            V2->getValueOperand()->getValueID();
   };
+  auto Limit = [&R, this](StoreInst *SI) {
+    unsigned EltSize = DL->getTypeSizeInBits(SI->getValueOperand()->getType());
+    return R.getMinVF(EltSize);
+  };
 
   // Attempt to sort and vectorize each of the store-groups.
   for (auto &Pair : Stores) {
@@ -8819,33 +9644,15 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
     LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                       << Pair.second.size() << ".\n");
 
-    stable_sort(Pair.second, StoreSorter);
-
-    // Try to vectorize elements based on their compatibility.
-    for (ArrayRef<StoreInst *>::iterator IncIt = Pair.second.begin(),
-                                         E = Pair.second.end();
-         IncIt != E;) {
-
-      // Look for the next elements with the same type.
-      ArrayRef<StoreInst *>::iterator SameTypeIt = IncIt;
-      Type *EltTy = (*IncIt)->getPointerOperand()->getType();
-
-      while (SameTypeIt != E && AreCompatibleStores(*SameTypeIt, *IncIt))
-        ++SameTypeIt;
-
-      // Try to vectorize them.
-      unsigned NumElts = (SameTypeIt - IncIt);
-      LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at stores ("
-                        << NumElts << ")\n");
-      if (NumElts > 1 && !EltTy->getPointerElementType()->isVectorTy() &&
-          vectorizeStores(makeArrayRef(IncIt, NumElts), R)) {
-        // Success start over because instructions might have been changed.
-        Changed = true;
-      }
+    if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
+      continue;
 
-      // Start over at the next instruction of a different type (or the end).
-      IncIt = SameTypeIt;
-    }
+    Changed |= tryToVectorizeSequence<StoreInst>(
+        Pair.second, Limit, StoreSorter, AreCompatibleStores,
+        [this, &R](ArrayRef<StoreInst *> Candidates, bool) {
+          return vectorizeStores(Candidates, R);
+        },
+        /*LimitForRegisterSize=*/false);
   }
   return Changed;
 }