| | | |
|---|---|---|
| author | Dimitry Andric <dim@FreeBSD.org> | 2022-03-20 11:40:34 +0000 |
| committer | Dimitry Andric <dim@FreeBSD.org> | 2022-05-14 11:43:05 +0000 |
| commit | 349cc55c9796c4596a5b9904cd3281af295f878f (patch) | |
| tree | 410c5a785075730a35f1272ca6a7adf72222ad03 /contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | |
| parent | cb2ae6163174b90e999326ecec3699ee093a5d43 (diff) | |
| parent | c0981da47d5696fe36474fcf86b4ce03ae3ff818 (diff) |
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2381 |
1 file changed, 1594 insertions, 787 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 1d06bc7d79a7..e3ef0b794f68 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -21,6 +21,7 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/PriorityQueue.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SetVector.h" @@ -200,12 +201,39 @@ static bool isValidElementType(Type *Ty) { !Ty->isPPC_FP128Ty(); } +/// \returns True if the value is a constant (but not globals/constant +/// expressions). +static bool isConstant(Value *V) { + return isa<Constant>(V) && !isa<ConstantExpr>(V) && !isa<GlobalValue>(V); +} + +/// Checks if \p V is one of vector-like instructions, i.e. undef, +/// insertelement/extractelement with constant indices for fixed vector type or +/// extractvalue instruction. +static bool isVectorLikeInstWithConstOps(Value *V) { + if (!isa<InsertElementInst, ExtractElementInst>(V) && + !isa<ExtractValueInst, UndefValue>(V)) + return false; + auto *I = dyn_cast<Instruction>(V); + if (!I || isa<ExtractValueInst>(I)) + return true; + if (!isa<FixedVectorType>(I->getOperand(0)->getType())) + return false; + if (isa<ExtractElementInst>(I)) + return isConstant(I->getOperand(1)); + assert(isa<InsertElementInst>(V) && "Expected only insertelement."); + return isConstant(I->getOperand(2)); +} + /// \returns true if all of the instructions in \p VL are in the same block or /// false otherwise. static bool allSameBlock(ArrayRef<Value *> VL) { Instruction *I0 = dyn_cast<Instruction>(VL[0]); if (!I0) return false; + if (all_of(VL, isVectorLikeInstWithConstOps)) + return true; + BasicBlock *BB = I0->getParent(); for (int I = 1, E = VL.size(); I < E; I++) { auto *II = dyn_cast<Instruction>(VL[I]); @@ -218,12 +246,6 @@ static bool allSameBlock(ArrayRef<Value *> VL) { return true; } -/// \returns True if the value is a constant (but not globals/constant -/// expressions). -static bool isConstant(Value *V) { - return isa<Constant>(V) && !isa<ConstantExpr>(V) && !isa<GlobalValue>(V); -} - /// \returns True if all of the values in \p VL are constants (but not /// globals/constant expressions). static bool allConstant(ArrayRef<Value *> VL) { @@ -232,12 +254,21 @@ static bool allConstant(ArrayRef<Value *> VL) { return all_of(VL, isConstant); } -/// \returns True if all of the values in \p VL are identical. +/// \returns True if all of the values in \p VL are identical or some of them +/// are UndefValue. static bool isSplat(ArrayRef<Value *> VL) { - for (unsigned i = 1, e = VL.size(); i < e; ++i) - if (VL[i] != VL[0]) + Value *FirstNonUndef = nullptr; + for (Value *V : VL) { + if (isa<UndefValue>(V)) + continue; + if (!FirstNonUndef) { + FirstNonUndef = V; + continue; + } + if (V != FirstNonUndef) return false; - return true; + } + return FirstNonUndef != nullptr; } /// \returns True if \p I is commutative, handles CmpInst and BinaryOperator. @@ -295,8 +326,10 @@ static bool isCommutative(Instruction *I) { /// TODO: Can we split off and reuse the shuffle mask detection from /// TargetTransformInfo::getInstructionThroughput? 
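The rewritten isSplat above relaxes the old "all elements identical" rule: UndefValue lanes are now ignored, and the list counts as a splat when every defined lane is the same value and at least one lane is defined. A minimal standalone sketch of that rule, modelling an undef lane as an empty std::optional instead of LLVM's UndefValue (the name isSplatLike is illustrative, not an LLVM API):

```cpp
#include <cassert>
#include <optional>
#include <vector>

// Splat check mirroring the relaxed rule: undefined lanes (nullopt here,
// UndefValue in LLVM) are skipped, and all remaining lanes must be equal.
// At least one lane has to be defined for the list to count as a splat.
static bool isSplatLike(const std::vector<std::optional<int>> &VL) {
  const std::optional<int> *FirstNonUndef = nullptr;
  for (const auto &V : VL) {
    if (!V)
      continue;               // skip "undef" lanes
    if (!FirstNonUndef) {
      FirstNonUndef = &V;     // remember the first defined lane
      continue;
    }
    if (*V != **FirstNonUndef)
      return false;           // two different defined values -> not a splat
  }
  return FirstNonUndef != nullptr; // all-undef is not treated as a splat
}

int main() {
  assert(isSplatLike({7, std::nullopt, 7, 7}));       // undef lanes tolerated
  assert(!isSplatLike({7, 3, 7, 7}));                 // mixed defined values
  assert(!isSplatLike({std::nullopt, std::nullopt})); // all undef
  return 0;
}
```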
static Optional<TargetTransformInfo::ShuffleKind> -isShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) { +isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) { auto *EI0 = cast<ExtractElementInst>(VL[0]); + if (isa<ScalableVectorType>(EI0->getVectorOperandType())) + return None; unsigned Size = cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements(); Value *Vec1 = nullptr; @@ -504,7 +537,7 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, case Instruction::Call: { CallInst *CI = cast<CallInst>(UserInst); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { + for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) { if (hasVectorInstrinsicScalarOpd(ID, i)) return (CI->getArgOperand(i) == Scalar); } @@ -535,13 +568,67 @@ static bool isSimple(Instruction *I) { return true; } +/// Shuffles \p Mask in accordance with the given \p SubMask. +static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) { + if (SubMask.empty()) + return; + if (Mask.empty()) { + Mask.append(SubMask.begin(), SubMask.end()); + return; + } + SmallVector<int> NewMask(SubMask.size(), UndefMaskElem); + int TermValue = std::min(Mask.size(), SubMask.size()); + for (int I = 0, E = SubMask.size(); I < E; ++I) { + if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem || + Mask[SubMask[I]] >= TermValue) + continue; + NewMask[I] = Mask[SubMask[I]]; + } + Mask.swap(NewMask); +} + +/// Order may have elements assigned special value (size) which is out of +/// bounds. Such indices only appear on places which correspond to undef values +/// (see canReuseExtract for details) and used in order to avoid undef values +/// have effect on operands ordering. +/// The first loop below simply finds all unused indices and then the next loop +/// nest assigns these indices for undef values positions. +/// As an example below Order has two undef positions and they have assigned +/// values 3 and 7 respectively: +/// before: 6 9 5 4 9 2 1 0 +/// after: 6 3 5 4 7 2 1 0 +static void fixupOrderingIndices(SmallVectorImpl<unsigned> &Order) { + const unsigned Sz = Order.size(); + SmallBitVector UsedIndices(Sz); + SmallVector<int> MaskedIndices; + for (unsigned I = 0; I < Sz; ++I) { + if (Order[I] < Sz) + UsedIndices.set(Order[I]); + else + MaskedIndices.push_back(I); + } + if (MaskedIndices.empty()) + return; + SmallVector<int> AvailableIndices(MaskedIndices.size()); + unsigned Cnt = 0; + int Idx = UsedIndices.find_first(); + do { + AvailableIndices[Cnt] = Idx; + Idx = UsedIndices.find_next(Idx); + ++Cnt; + } while (Idx > 0); + assert(Cnt == MaskedIndices.size() && "Non-synced masked/available indices."); + for (int I = 0, E = MaskedIndices.size(); I < E; ++I) + Order[MaskedIndices[I]] = AvailableIndices[I]; +} + namespace llvm { static void inversePermutation(ArrayRef<unsigned> Indices, SmallVectorImpl<int> &Mask) { Mask.clear(); const unsigned E = Indices.size(); - Mask.resize(E, E + 1); + Mask.resize(E, UndefMaskElem); for (unsigned I = 0; I < E; ++I) Mask[Indices[I]] = I; } @@ -581,6 +668,22 @@ static Optional<int> getInsertIndex(Value *InsertInst, unsigned Offset) { return Index; } +/// Reorders the list of scalars in accordance with the given \p Order and then +/// the \p Mask. \p Order - is the original order of the scalars, need to +/// reorder scalars into an unordered state at first according to the given +/// order. 
Then the ordered scalars are shuffled once again in accordance with +/// the provided mask. +static void reorderScalars(SmallVectorImpl<Value *> &Scalars, + ArrayRef<int> Mask) { + assert(!Mask.empty() && "Expected non-empty mask."); + SmallVector<Value *> Prev(Scalars.size(), + UndefValue::get(Scalars.front()->getType())); + Prev.swap(Scalars); + for (unsigned I = 0, E = Prev.size(); I < E; ++I) + if (Mask[I] != UndefMaskElem) + Scalars[Mask[I]] = Prev[I]; +} + namespace slpvectorizer { /// Bottom Up SLP Vectorizer. @@ -645,13 +748,12 @@ public: void buildTree(ArrayRef<Value *> Roots, ArrayRef<Value *> UserIgnoreLst = None); - /// Construct a vectorizable tree that starts at \p Roots, ignoring users for - /// the purpose of scheduling and extraction in the \p UserIgnoreLst taking - /// into account (and updating it, if required) list of externally used - /// values stored in \p ExternallyUsedValues. - void buildTree(ArrayRef<Value *> Roots, - ExtraValueToDebugLocsMap &ExternallyUsedValues, - ArrayRef<Value *> UserIgnoreLst = None); + /// Builds external uses of the vectorized scalars, i.e. the list of + /// vectorized scalars to be extracted, their lanes and their scalar users. \p + /// ExternallyUsedValues contains additional list of external uses to handle + /// vectorization of reductions. + void + buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {}); /// Clear the internal data structures that are created by 'buildTree'. void deleteTree() { @@ -659,8 +761,6 @@ public: ScalarToTreeEntry.clear(); MustGather.clear(); ExternalUses.clear(); - NumOpsWantToKeepOrder.clear(); - NumOpsWantToKeepOriginalOrder = 0; for (auto &Iter : BlocksSchedules) { BlockScheduling *BS = Iter.second.get(); BS->clear(); @@ -674,103 +774,28 @@ public: /// Perform LICM and CSE on the newly generated gather sequences. void optimizeGatherSequence(); - /// \returns The best order of instructions for vectorization. - Optional<ArrayRef<unsigned>> bestOrder() const { - assert(llvm::all_of( - NumOpsWantToKeepOrder, - [this](const decltype(NumOpsWantToKeepOrder)::value_type &D) { - return D.getFirst().size() == - VectorizableTree[0]->Scalars.size(); - }) && - "All orders must have the same size as number of instructions in " - "tree node."); - auto I = std::max_element( - NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(), - [](const decltype(NumOpsWantToKeepOrder)::value_type &D1, - const decltype(NumOpsWantToKeepOrder)::value_type &D2) { - return D1.second < D2.second; - }); - if (I == NumOpsWantToKeepOrder.end() || - I->getSecond() <= NumOpsWantToKeepOriginalOrder) - return None; - - return makeArrayRef(I->getFirst()); - } - - /// Builds the correct order for root instructions. - /// If some leaves have the same instructions to be vectorized, we may - /// incorrectly evaluate the best order for the root node (it is built for the - /// vector of instructions without repeated instructions and, thus, has less - /// elements than the root node). This function builds the correct order for - /// the root node. - /// For example, if the root node is \<a+b, a+c, a+d, f+e\>, then the leaves - /// are \<a, a, a, f\> and \<b, c, d, e\>. When we try to vectorize the first - /// leaf, it will be shrink to \<a, b\>. If instructions in this leaf should - /// be reordered, the best order will be \<1, 0\>. We need to extend this - /// order for the root node. For the root node this order should look like - /// \<3, 0, 1, 2\>. This function extends the order for the reused - /// instructions. 
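The fixupOrderingIndices helper introduced in this hunk replaces the out-of-bounds placeholders in an ordering (which mark lanes that come from undef values) with whatever indices are not otherwise used. A self-contained sketch of that behaviour, using std::vector in place of SmallVector/SmallBitVector and reproducing the "6 9 5 4 9 2 1 0 → 6 3 5 4 7 2 1 0" example from the comment above:

```cpp
#include <cassert>
#include <vector>

// Replace every out-of-range entry (>= Order.size(), i.e. an "undef" slot)
// with one of the indices that no in-range entry uses, in ascending order.
static void fixupOrderingIndicesSketch(std::vector<unsigned> &Order) {
  const unsigned Sz = Order.size();
  std::vector<bool> Used(Sz, false);
  std::vector<unsigned> MaskedPositions; // positions holding placeholders
  for (unsigned I = 0; I < Sz; ++I) {
    if (Order[I] < Sz)
      Used[Order[I]] = true;
    else
      MaskedPositions.push_back(I);
  }
  if (MaskedPositions.empty())
    return;
  // Hand out the unused indices to the placeholder positions, left to right.
  unsigned NextUnused = 0;
  for (unsigned Pos : MaskedPositions) {
    while (Used[NextUnused])
      ++NextUnused;
    Order[Pos] = NextUnused++;
  }
}

int main() {
  // Example from the comment: the placeholders (9 >= size 8) become 3 and 7.
  std::vector<unsigned> Order = {6, 9, 5, 4, 9, 2, 1, 0};
  fixupOrderingIndicesSketch(Order);
  assert((Order == std::vector<unsigned>{6, 3, 5, 4, 7, 2, 1, 0}));
  return 0;
}
```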
- void findRootOrder(OrdersType &Order) { - // If the leaf has the same number of instructions to vectorize as the root - // - order must be set already. - unsigned RootSize = VectorizableTree[0]->Scalars.size(); - if (Order.size() == RootSize) - return; - SmallVector<unsigned, 4> RealOrder(Order.size()); - std::swap(Order, RealOrder); - SmallVector<int, 4> Mask; - inversePermutation(RealOrder, Mask); - Order.assign(Mask.begin(), Mask.end()); - // The leaf has less number of instructions - need to find the true order of - // the root. - // Scan the nodes starting from the leaf back to the root. - const TreeEntry *PNode = VectorizableTree.back().get(); - SmallVector<const TreeEntry *, 4> Nodes(1, PNode); - SmallPtrSet<const TreeEntry *, 4> Visited; - while (!Nodes.empty() && Order.size() != RootSize) { - const TreeEntry *PNode = Nodes.pop_back_val(); - if (!Visited.insert(PNode).second) - continue; - const TreeEntry &Node = *PNode; - for (const EdgeInfo &EI : Node.UserTreeIndices) - if (EI.UserTE) - Nodes.push_back(EI.UserTE); - if (Node.ReuseShuffleIndices.empty()) - continue; - // Build the order for the parent node. - OrdersType NewOrder(Node.ReuseShuffleIndices.size(), RootSize); - SmallVector<unsigned, 4> OrderCounter(Order.size(), 0); - // The algorithm of the order extension is: - // 1. Calculate the number of the same instructions for the order. - // 2. Calculate the index of the new order: total number of instructions - // with order less than the order of the current instruction + reuse - // number of the current instruction. - // 3. The new order is just the index of the instruction in the original - // vector of the instructions. - for (unsigned I : Node.ReuseShuffleIndices) - ++OrderCounter[Order[I]]; - SmallVector<unsigned, 4> CurrentCounter(Order.size(), 0); - for (unsigned I = 0, E = Node.ReuseShuffleIndices.size(); I < E; ++I) { - unsigned ReusedIdx = Node.ReuseShuffleIndices[I]; - unsigned OrderIdx = Order[ReusedIdx]; - unsigned NewIdx = 0; - for (unsigned J = 0; J < OrderIdx; ++J) - NewIdx += OrderCounter[J]; - NewIdx += CurrentCounter[OrderIdx]; - ++CurrentCounter[OrderIdx]; - assert(NewOrder[NewIdx] == RootSize && - "The order index should not be written already."); - NewOrder[NewIdx] = I; - } - std::swap(Order, NewOrder); - } - assert(Order.size() == RootSize && - "Root node is expected or the size of the order must be the same as " - "the number of elements in the root node."); - assert(llvm::all_of(Order, - [RootSize](unsigned Val) { return Val != RootSize; }) && - "All indices must be initialized"); - } + /// Checks if the specified gather tree entry \p TE can be represented as a + /// shuffled vector entry + (possibly) permutation with other gathers. It + /// implements the checks only for possibly ordered scalars (Loads, + /// ExtractElement, ExtractValue), which can be part of the graph. + Optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE); + + /// Reorders the current graph to the most profitable order starting from the + /// root node to the leaf nodes. The best order is chosen only from the nodes + /// of the same size (vectorization factor). Smaller nodes are considered + /// parts of subgraph with smaller VF and they are reordered independently. We + /// can make it because we still need to extend smaller nodes to the wider VF + /// and we can merge reordering shuffles with the widening shuffles. + void reorderTopToBottom(); + + /// Reorders the current graph to the most profitable order starting from + /// leaves to the root. 
It allows to rotate small subgraphs and reduce the + /// number of reshuffles if the leaf nodes use the same order. In this case we + /// can merge the orders and just shuffle user node instead of shuffling its + /// operands. Plus, even the leaf nodes have different orders, it allows to + /// sink reordering in the graph closer to the root node and merge it later + /// during analysis. + void reorderBottomToTop(bool IgnoreReorder = false); /// \return The vector element size in bits to use when vectorizing the /// expression tree ending at \p V. If V is a store, the size is the width of @@ -793,6 +818,10 @@ public: return MinVecRegSize; } + unsigned getMinVF(unsigned Sz) const { + return std::max(2U, getMinVecRegSize() / Sz); + } + unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { unsigned MaxVF = MaxVFOption.getNumOccurrences() ? MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode); @@ -809,7 +838,7 @@ public: /// \returns True if the VectorizableTree is both tiny and not fully /// vectorizable. We do not vectorize such trees. - bool isTreeTinyAndNotFullyVectorizable() const; + bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const; /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values /// can be load combined in the backend. Load combining may not be allowed in @@ -1578,10 +1607,12 @@ private: Value *vectorizeTree(ArrayRef<Value *> VL); /// \returns the scalarization cost for this type. Scalarization in this - /// context means the creation of vectors from a group of scalars. - InstructionCost - getGatherCost(FixedVectorType *Ty, - const DenseSet<unsigned> &ShuffledIndices) const; + /// context means the creation of vectors from a group of scalars. If \p + /// NeedToShuffle is true, need to add a cost of reshuffling some of the + /// vector elements. + InstructionCost getGatherCost(FixedVectorType *Ty, + const DenseSet<unsigned> &ShuffledIndices, + bool NeedToShuffle) const; /// Checks if the gathered \p VL can be represented as shuffle(s) of previous /// tree entries. @@ -1605,7 +1636,7 @@ private: /// \returns whether the VectorizableTree is fully vectorizable and will /// be beneficial even the tree height is tiny. - bool isFullyVectorizableTinyTree() const; + bool isFullyVectorizableTinyTree(bool ForReduction) const; /// Reorder commutative or alt operands to get better probability of /// generating vectorized code. @@ -1621,14 +1652,43 @@ private: /// \returns true if the scalars in VL are equal to this entry. bool isSame(ArrayRef<Value *> VL) const { - if (VL.size() == Scalars.size()) - return std::equal(VL.begin(), VL.end(), Scalars.begin()); - return VL.size() == ReuseShuffleIndices.size() && - std::equal( - VL.begin(), VL.end(), ReuseShuffleIndices.begin(), - [this](Value *V, int Idx) { return V == Scalars[Idx]; }); + auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) { + if (Mask.size() != VL.size() && VL.size() == Scalars.size()) + return std::equal(VL.begin(), VL.end(), Scalars.begin()); + return VL.size() == Mask.size() && + std::equal(VL.begin(), VL.end(), Mask.begin(), + [Scalars](Value *V, int Idx) { + return (isa<UndefValue>(V) && + Idx == UndefMaskElem) || + (Idx != UndefMaskElem && V == Scalars[Idx]); + }); + }; + if (!ReorderIndices.empty()) { + // TODO: implement matching if the nodes are just reordered, still can + // treat the vector as the same if the list of scalars matches VL + // directly, without reordering. 
+ SmallVector<int> Mask; + inversePermutation(ReorderIndices, Mask); + if (VL.size() == Scalars.size()) + return IsSame(Scalars, Mask); + if (VL.size() == ReuseShuffleIndices.size()) { + ::addMask(Mask, ReuseShuffleIndices); + return IsSame(Scalars, Mask); + } + return false; + } + return IsSame(Scalars, ReuseShuffleIndices); } + /// \return Final vectorization factor for the node. Defined by the total + /// number of vectorized scalars, including those, used several times in the + /// entry and counted in the \a ReuseShuffleIndices, if any. + unsigned getVectorFactor() const { + if (!ReuseShuffleIndices.empty()) + return ReuseShuffleIndices.size(); + return Scalars.size(); + }; + /// A vector of scalars. ValueList Scalars; @@ -1701,6 +1761,12 @@ private: } } + /// Reorders operands of the node to the given mask \p Mask. + void reorderOperands(ArrayRef<int> Mask) { + for (ValueList &Operand : Operands) + reorderScalars(Operand, Mask); + } + /// \returns the \p OpIdx operand of this TreeEntry. ValueList &getOperand(unsigned OpIdx) { assert(OpIdx < Operands.size() && "Off bounds"); @@ -1760,19 +1826,14 @@ private: return AltOp ? AltOp->getOpcode() : 0; } - /// Update operations state of this entry if reorder occurred. - bool updateStateIfReorder() { - if (ReorderIndices.empty()) - return false; - InstructionsState S = getSameOpcode(Scalars, ReorderIndices.front()); - setOperations(S); - return true; - } - /// When ReuseShuffleIndices is empty it just returns position of \p V - /// within vector of Scalars. Otherwise, try to remap on its reuse index. + /// When ReuseReorderShuffleIndices is empty it just returns position of \p + /// V within vector of Scalars. Otherwise, try to remap on its reuse index. int findLaneForValue(Value *V) const { unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V)); assert(FoundLane < Scalars.size() && "Couldn't find extract lane"); + if (!ReorderIndices.empty()) + FoundLane = ReorderIndices[FoundLane]; + assert(FoundLane < Scalars.size() && "Couldn't find extract lane"); if (!ReuseShuffleIndices.empty()) { FoundLane = std::distance(ReuseShuffleIndices.begin(), find(ReuseShuffleIndices, FoundLane)); @@ -1856,7 +1917,7 @@ private: TreeEntry *newTreeEntry(ArrayRef<Value *> VL, Optional<ScheduleData *> Bundle, const InstructionsState &S, const EdgeInfo &UserTreeIdx, - ArrayRef<unsigned> ReuseShuffleIndices = None, + ArrayRef<int> ReuseShuffleIndices = None, ArrayRef<unsigned> ReorderIndices = None) { TreeEntry::EntryState EntryState = Bundle ? 
TreeEntry::Vectorize : TreeEntry::NeedToGather; @@ -1869,7 +1930,7 @@ private: Optional<ScheduleData *> Bundle, const InstructionsState &S, const EdgeInfo &UserTreeIdx, - ArrayRef<unsigned> ReuseShuffleIndices = None, + ArrayRef<int> ReuseShuffleIndices = None, ArrayRef<unsigned> ReorderIndices = None) { assert(((!Bundle && EntryState == TreeEntry::NeedToGather) || (Bundle && EntryState != TreeEntry::NeedToGather)) && @@ -1877,12 +1938,25 @@ private: VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree)); TreeEntry *Last = VectorizableTree.back().get(); Last->Idx = VectorizableTree.size() - 1; - Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); Last->State = EntryState; Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end()); - Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end()); - Last->setOperations(S); + if (ReorderIndices.empty()) { + Last->Scalars.assign(VL.begin(), VL.end()); + Last->setOperations(S); + } else { + // Reorder scalars and build final mask. + Last->Scalars.assign(VL.size(), nullptr); + transform(ReorderIndices, Last->Scalars.begin(), + [VL](unsigned Idx) -> Value * { + if (Idx >= VL.size()) + return UndefValue::get(VL.front()->getType()); + return VL[Idx]; + }); + InstructionsState S = getSameOpcode(Last->Scalars); + Last->setOperations(S); + Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end()); + } if (Last->State != TreeEntry::NeedToGather) { for (Value *V : VL) { assert(!getTreeEntry(V) && "Scalar already in tree!"); @@ -1965,12 +2039,9 @@ private: if (result.hasValue()) { return result.getValue(); } - MemoryLocation Loc2 = getLocation(Inst2, AA); bool aliased = true; - if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) { - // Do the alias check. - aliased = !AA->isNoAlias(Loc1, Loc2); - } + if (Loc1.Ptr && isSimple(Inst1)) + aliased = isModOrRefSet(AA->getModRefInfo(Inst2, Loc1)); // Store the result in the cache. result = aliased; return aliased; @@ -2434,14 +2505,6 @@ private: } }; - /// Contains orders of operations along with the number of bundles that have - /// operations in this order. It stores only those orders that require - /// reordering, if reordering is not required it is counted using \a - /// NumOpsWantToKeepOriginalOrder. - DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> NumOpsWantToKeepOrder; - /// Number of bundles that do not require reordering. - unsigned NumOpsWantToKeepOriginalOrder = 0; - // Analysis and block reference. Function *F; ScalarEvolution *SE; @@ -2540,10 +2603,8 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) { std::string Str; raw_string_ostream OS(Str); - if (isSplat(Entry->Scalars)) { - OS << "<splat> " << *Entry->Scalars[0]; - return Str; - } + if (isSplat(Entry->Scalars)) + OS << "<splat> "; for (auto V : Entry->Scalars) { OS << *V; if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) { @@ -2594,21 +2655,539 @@ void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) { }; } -void BoUpSLP::buildTree(ArrayRef<Value *> Roots, - ArrayRef<Value *> UserIgnoreLst) { - ExtraValueToDebugLocsMap ExternallyUsedValues; - buildTree(Roots, ExternallyUsedValues, UserIgnoreLst); +/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses +/// contains original mask for the scalars reused in the node. Procedure +/// transform this mask in accordance with the given \p Mask. 
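reorderReuses, whose body follows, scatters the existing reuse mask through the reordering mask: the value at position I moves to position Mask[I], and positions addressed only by undef mask elements keep their previous contents. A small standalone model of that scatter, with -1 standing in for UndefMaskElem and std::vector for SmallVector (the name reorderReusesSketch is illustrative):

```cpp
#include <cassert>
#include <vector>

constexpr int UndefElem = -1; // stand-in for LLVM's UndefMaskElem

// Scatter-style permutation used by reorderReuses: element I of the old
// mask lands at position Mask[I]; positions not named by Mask keep their
// previous contents.
static void reorderReusesSketch(std::vector<int> &Reuses,
                                const std::vector<int> &Mask) {
  assert(!Mask.empty() && Reuses.size() == Mask.size() &&
         "Expected same-sized, non-empty masks.");
  std::vector<int> Prev = Reuses; // keep the original values around
  for (size_t I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != UndefElem)
      Reuses[Mask[I]] = Prev[I];
}

int main() {
  // Reuse mask <0, 0, 1, 1> reordered by mask <2, 3, 0, 1>:
  // old element 0 goes to slot 2, old element 2 goes to slot 0, and so on.
  std::vector<int> Reuses = {0, 0, 1, 1};
  reorderReusesSketch(Reuses, {2, 3, 0, 1});
  assert((Reuses == std::vector<int>{1, 1, 0, 0}));
  return 0;
}
```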
+static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) { + assert(!Mask.empty() && Reuses.size() == Mask.size() && + "Expected non-empty mask."); + SmallVector<int> Prev(Reuses.begin(), Reuses.end()); + Prev.swap(Reuses); + for (unsigned I = 0, E = Prev.size(); I < E; ++I) + if (Mask[I] != UndefMaskElem) + Reuses[Mask[I]] = Prev[I]; } -void BoUpSLP::buildTree(ArrayRef<Value *> Roots, - ExtraValueToDebugLocsMap &ExternallyUsedValues, - ArrayRef<Value *> UserIgnoreLst) { - deleteTree(); - UserIgnoreList = UserIgnoreLst; - if (!allSameType(Roots)) +/// Reorders the given \p Order according to the given \p Mask. \p Order - is +/// the original order of the scalars. Procedure transforms the provided order +/// in accordance with the given \p Mask. If the resulting \p Order is just an +/// identity order, \p Order is cleared. +static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask) { + assert(!Mask.empty() && "Expected non-empty mask."); + SmallVector<int> MaskOrder; + if (Order.empty()) { + MaskOrder.resize(Mask.size()); + std::iota(MaskOrder.begin(), MaskOrder.end(), 0); + } else { + inversePermutation(Order, MaskOrder); + } + reorderReuses(MaskOrder, Mask); + if (ShuffleVectorInst::isIdentityMask(MaskOrder)) { + Order.clear(); return; - buildTree_rec(Roots, 0, EdgeInfo()); + } + Order.assign(Mask.size(), Mask.size()); + for (unsigned I = 0, E = Mask.size(); I < E; ++I) + if (MaskOrder[I] != UndefMaskElem) + Order[MaskOrder[I]] = I; + fixupOrderingIndices(Order); +} +Optional<BoUpSLP::OrdersType> +BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { + assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only."); + unsigned NumScalars = TE.Scalars.size(); + OrdersType CurrentOrder(NumScalars, NumScalars); + SmallVector<int> Positions; + SmallBitVector UsedPositions(NumScalars); + const TreeEntry *STE = nullptr; + // Try to find all gathered scalars that are gets vectorized in other + // vectorize node. Here we can have only one single tree vector node to + // correctly identify order of the gathered scalars. + for (unsigned I = 0; I < NumScalars; ++I) { + Value *V = TE.Scalars[I]; + if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(V)) + continue; + if (const auto *LocalSTE = getTreeEntry(V)) { + if (!STE) + STE = LocalSTE; + else if (STE != LocalSTE) + // Take the order only from the single vector node. + return None; + unsigned Lane = + std::distance(STE->Scalars.begin(), find(STE->Scalars, V)); + if (Lane >= NumScalars) + return None; + if (CurrentOrder[Lane] != NumScalars) { + if (Lane != I) + continue; + UsedPositions.reset(CurrentOrder[Lane]); + } + // The partial identity (where only some elements of the gather node are + // in the identity order) is good. + CurrentOrder[Lane] = I; + UsedPositions.set(I); + } + } + // Need to keep the order if we have a vector entry and at least 2 scalars or + // the vectorized entry has just 2 scalars. 
+ if (STE && (UsedPositions.count() > 1 || STE->Scalars.size() == 2)) { + auto &&IsIdentityOrder = [NumScalars](ArrayRef<unsigned> CurrentOrder) { + for (unsigned I = 0; I < NumScalars; ++I) + if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars) + return false; + return true; + }; + if (IsIdentityOrder(CurrentOrder)) { + CurrentOrder.clear(); + return CurrentOrder; + } + auto *It = CurrentOrder.begin(); + for (unsigned I = 0; I < NumScalars;) { + if (UsedPositions.test(I)) { + ++I; + continue; + } + if (*It == NumScalars) { + *It = I; + ++I; + } + ++It; + } + return CurrentOrder; + } + return None; +} + +void BoUpSLP::reorderTopToBottom() { + // Maps VF to the graph nodes. + DenseMap<unsigned, SmallPtrSet<TreeEntry *, 4>> VFToOrderedEntries; + // ExtractElement gather nodes which can be vectorized and need to handle + // their ordering. + DenseMap<const TreeEntry *, OrdersType> GathersToOrders; + // Find all reorderable nodes with the given VF. + // Currently the are vectorized loads,extracts + some gathering of extracts. + for_each(VectorizableTree, [this, &VFToOrderedEntries, &GathersToOrders]( + const std::unique_ptr<TreeEntry> &TE) { + // No need to reorder if need to shuffle reuses, still need to shuffle the + // node. + if (!TE->ReuseShuffleIndices.empty()) + return; + if (TE->State == TreeEntry::Vectorize && + isa<LoadInst, ExtractElementInst, ExtractValueInst, StoreInst, + InsertElementInst>(TE->getMainOp()) && + !TE->isAltShuffle()) { + VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); + return; + } + if (TE->State == TreeEntry::NeedToGather) { + if (TE->getOpcode() == Instruction::ExtractElement && + !TE->isAltShuffle() && + isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp()) + ->getVectorOperandType()) && + allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) { + // Check that gather of extractelements can be represented as + // just a shuffle of a single vector. + OrdersType CurrentOrder; + bool Reuse = + canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder); + if (Reuse || !CurrentOrder.empty()) { + VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); + GathersToOrders.try_emplace(TE.get(), CurrentOrder); + return; + } + } + if (Optional<OrdersType> CurrentOrder = + findReusedOrderedScalars(*TE.get())) { + VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); + GathersToOrders.try_emplace(TE.get(), *CurrentOrder); + } + } + }); + + // Reorder the graph nodes according to their vectorization factor. + for (unsigned VF = VectorizableTree.front()->Scalars.size(); VF > 1; + VF /= 2) { + auto It = VFToOrderedEntries.find(VF); + if (It == VFToOrderedEntries.end()) + continue; + // Try to find the most profitable order. We just are looking for the most + // used order and reorder scalar elements in the nodes according to this + // mostly used order. + const SmallPtrSetImpl<TreeEntry *> &OrderedEntries = It->getSecond(); + // All operands are reordered and used only in this node - propagate the + // most used order to the user node. + MapVector<OrdersType, unsigned, + DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>> + OrdersUses; + SmallPtrSet<const TreeEntry *, 4> VisitedOps; + for (const TreeEntry *OpTE : OrderedEntries) { + // No need to reorder this nodes, still need to extend and to use shuffle, + // just need to merge reordering shuffle and the reuse shuffle. + if (!OpTE->ReuseShuffleIndices.empty()) + continue; + // Count number of orders uses. 
+ const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & { + if (OpTE->State == TreeEntry::NeedToGather) + return GathersToOrders.find(OpTE)->second; + return OpTE->ReorderIndices; + }(); + // Stores actually store the mask, not the order, need to invert. + if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() && + OpTE->getOpcode() == Instruction::Store && !Order.empty()) { + SmallVector<int> Mask; + inversePermutation(Order, Mask); + unsigned E = Order.size(); + OrdersType CurrentOrder(E, E); + transform(Mask, CurrentOrder.begin(), [E](int Idx) { + return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx); + }); + fixupOrderingIndices(CurrentOrder); + ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second; + } else { + ++OrdersUses.insert(std::make_pair(Order, 0)).first->second; + } + } + // Set order of the user node. + if (OrdersUses.empty()) + continue; + // Choose the most used order. + ArrayRef<unsigned> BestOrder = OrdersUses.front().first; + unsigned Cnt = OrdersUses.front().second; + for (const auto &Pair : drop_begin(OrdersUses)) { + if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) { + BestOrder = Pair.first; + Cnt = Pair.second; + } + } + // Set order of the user node. + if (BestOrder.empty()) + continue; + SmallVector<int> Mask; + inversePermutation(BestOrder, Mask); + SmallVector<int> MaskOrder(BestOrder.size(), UndefMaskElem); + unsigned E = BestOrder.size(); + transform(BestOrder, MaskOrder.begin(), [E](unsigned I) { + return I < E ? static_cast<int>(I) : UndefMaskElem; + }); + // Do an actual reordering, if profitable. + for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) { + // Just do the reordering for the nodes with the given VF. + if (TE->Scalars.size() != VF) { + if (TE->ReuseShuffleIndices.size() == VF) { + // Need to reorder the reuses masks of the operands with smaller VF to + // be able to find the match between the graph nodes and scalar + // operands of the given node during vectorization/cost estimation. + assert(all_of(TE->UserTreeIndices, + [VF, &TE](const EdgeInfo &EI) { + return EI.UserTE->Scalars.size() == VF || + EI.UserTE->Scalars.size() == + TE->Scalars.size(); + }) && + "All users must be of VF size."); + // Update ordering of the operands with the smaller VF than the given + // one. + reorderReuses(TE->ReuseShuffleIndices, Mask); + } + continue; + } + if (TE->State == TreeEntry::Vectorize && + isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst, + InsertElementInst>(TE->getMainOp()) && + !TE->isAltShuffle()) { + // Build correct orders for extract{element,value}, loads and + // stores. + reorderOrder(TE->ReorderIndices, Mask); + if (isa<InsertElementInst, StoreInst>(TE->getMainOp())) + TE->reorderOperands(Mask); + } else { + // Reorder the node and its operands. + TE->reorderOperands(Mask); + assert(TE->ReorderIndices.empty() && + "Expected empty reorder sequence."); + reorderScalars(TE->Scalars, Mask); + } + if (!TE->ReuseShuffleIndices.empty()) { + // Apply reversed order to keep the original ordering of the reused + // elements to avoid extra reorder indices shuffling. 
+ OrdersType CurrentOrder; + reorderOrder(CurrentOrder, MaskOrder); + SmallVector<int> NewReuses; + inversePermutation(CurrentOrder, NewReuses); + addMask(NewReuses, TE->ReuseShuffleIndices); + TE->ReuseShuffleIndices.swap(NewReuses); + } + } + } +} + +void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { + SetVector<TreeEntry *> OrderedEntries; + DenseMap<const TreeEntry *, OrdersType> GathersToOrders; + // Find all reorderable leaf nodes with the given VF. + // Currently the are vectorized loads,extracts without alternate operands + + // some gathering of extracts. + SmallVector<TreeEntry *> NonVectorized; + for_each(VectorizableTree, [this, &OrderedEntries, &GathersToOrders, + &NonVectorized]( + const std::unique_ptr<TreeEntry> &TE) { + if (TE->State != TreeEntry::Vectorize) + NonVectorized.push_back(TE.get()); + // No need to reorder if need to shuffle reuses, still need to shuffle the + // node. + if (!TE->ReuseShuffleIndices.empty()) + return; + if (TE->State == TreeEntry::Vectorize && + isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE->getMainOp()) && + !TE->isAltShuffle()) { + OrderedEntries.insert(TE.get()); + return; + } + if (TE->State == TreeEntry::NeedToGather) { + if (TE->getOpcode() == Instruction::ExtractElement && + !TE->isAltShuffle() && + isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp()) + ->getVectorOperandType()) && + allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) { + // Check that gather of extractelements can be represented as + // just a shuffle of a single vector with a single user only. + OrdersType CurrentOrder; + bool Reuse = + canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder); + if ((Reuse || !CurrentOrder.empty()) && + !any_of(VectorizableTree, + [&TE](const std::unique_ptr<TreeEntry> &Entry) { + return Entry->State == TreeEntry::NeedToGather && + Entry.get() != TE.get() && + Entry->isSame(TE->Scalars); + })) { + OrderedEntries.insert(TE.get()); + GathersToOrders.try_emplace(TE.get(), CurrentOrder); + return; + } + } + if (Optional<OrdersType> CurrentOrder = + findReusedOrderedScalars(*TE.get())) { + OrderedEntries.insert(TE.get()); + GathersToOrders.try_emplace(TE.get(), *CurrentOrder); + } + } + }); + + // Checks if the operands of the users are reordarable and have only single + // use. + auto &&CheckOperands = + [this, &NonVectorized](const auto &Data, + SmallVectorImpl<TreeEntry *> &GatherOps) { + for (unsigned I = 0, E = Data.first->getNumOperands(); I < E; ++I) { + if (any_of(Data.second, + [I](const std::pair<unsigned, TreeEntry *> &OpData) { + return OpData.first == I && + OpData.second->State == TreeEntry::Vectorize; + })) + continue; + ArrayRef<Value *> VL = Data.first->getOperand(I); + const TreeEntry *TE = nullptr; + const auto *It = find_if(VL, [this, &TE](Value *V) { + TE = getTreeEntry(V); + return TE; + }); + if (It != VL.end() && TE->isSame(VL)) + return false; + TreeEntry *Gather = nullptr; + if (count_if(NonVectorized, [VL, &Gather](TreeEntry *TE) { + assert(TE->State != TreeEntry::Vectorize && + "Only non-vectorized nodes are expected."); + if (TE->isSame(VL)) { + Gather = TE; + return true; + } + return false; + }) > 1) + return false; + if (Gather) + GatherOps.push_back(Gather); + } + return true; + }; + // 1. Propagate order to the graph nodes, which use only reordered nodes. + // I.e., if the node has operands, that are reordered, try to make at least + // one operand order in the natural order and reorder others + reorder the + // user node itself. 
+ SmallPtrSet<const TreeEntry *, 4> Visited; + while (!OrderedEntries.empty()) { + // 1. Filter out only reordered nodes. + // 2. If the entry has multiple uses - skip it and jump to the next node. + MapVector<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users; + SmallVector<TreeEntry *> Filtered; + for (TreeEntry *TE : OrderedEntries) { + if (!(TE->State == TreeEntry::Vectorize || + (TE->State == TreeEntry::NeedToGather && + GathersToOrders.count(TE))) || + TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() || + !all_of(drop_begin(TE->UserTreeIndices), + [TE](const EdgeInfo &EI) { + return EI.UserTE == TE->UserTreeIndices.front().UserTE; + }) || + !Visited.insert(TE).second) { + Filtered.push_back(TE); + continue; + } + // Build a map between user nodes and their operands order to speedup + // search. The graph currently does not provide this dependency directly. + for (EdgeInfo &EI : TE->UserTreeIndices) { + TreeEntry *UserTE = EI.UserTE; + auto It = Users.find(UserTE); + if (It == Users.end()) + It = Users.insert({UserTE, {}}).first; + It->second.emplace_back(EI.EdgeIdx, TE); + } + } + // Erase filtered entries. + for_each(Filtered, + [&OrderedEntries](TreeEntry *TE) { OrderedEntries.remove(TE); }); + for (const auto &Data : Users) { + // Check that operands are used only in the User node. + SmallVector<TreeEntry *> GatherOps; + if (!CheckOperands(Data, GatherOps)) { + for_each(Data.second, + [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) { + OrderedEntries.remove(Op.second); + }); + continue; + } + // All operands are reordered and used only in this node - propagate the + // most used order to the user node. + MapVector<OrdersType, unsigned, + DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>> + OrdersUses; + SmallPtrSet<const TreeEntry *, 4> VisitedOps; + for (const auto &Op : Data.second) { + TreeEntry *OpTE = Op.second; + if (!OpTE->ReuseShuffleIndices.empty() || + (IgnoreReorder && OpTE == VectorizableTree.front().get())) + continue; + const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & { + if (OpTE->State == TreeEntry::NeedToGather) + return GathersToOrders.find(OpTE)->second; + return OpTE->ReorderIndices; + }(); + // Stores actually store the mask, not the order, need to invert. + if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() && + OpTE->getOpcode() == Instruction::Store && !Order.empty()) { + SmallVector<int> Mask; + inversePermutation(Order, Mask); + unsigned E = Order.size(); + OrdersType CurrentOrder(E, E); + transform(Mask, CurrentOrder.begin(), [E](int Idx) { + return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx); + }); + fixupOrderingIndices(CurrentOrder); + ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second; + } else { + ++OrdersUses.insert(std::make_pair(Order, 0)).first->second; + } + if (VisitedOps.insert(OpTE).second) + OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second += + OpTE->UserTreeIndices.size(); + assert(OrdersUses[{}] > 0 && "Counter cannot be less than 0."); + --OrdersUses[{}]; + } + // If no orders - skip current nodes and jump to the next one, if any. + if (OrdersUses.empty()) { + for_each(Data.second, + [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) { + OrderedEntries.remove(Op.second); + }); + continue; + } + // Choose the best order. 
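Both reorder passes pick the winning order by simple voting: every candidate order accumulates a use count, and on a tie the identity order (represented as an empty vector) wins so that no shuffle has to be emitted at all. A condensed standalone model of that selection, assuming the counts have already been gathered into a map (pickBestOrder is an illustrative name, not the LLVM code):

```cpp
#include <cassert>
#include <map>
#include <vector>

using OrderTy = std::vector<unsigned>; // empty vector == identity order

// Pick the order with the highest use count; an empty (identity) order wins
// ties so that no extra shuffle is generated for the user node.
static OrderTy pickBestOrder(const std::map<OrderTy, unsigned> &OrdersUses) {
  OrderTy Best;
  unsigned BestCnt = 0;
  bool First = true;
  for (const auto &[Order, Cnt] : OrdersUses) {
    if (First || Cnt > BestCnt || (Cnt == BestCnt && Order.empty())) {
      Best = Order;
      BestCnt = Cnt;
      First = false;
    }
  }
  return Best;
}

int main() {
  // Two operands want <1, 0>, two are already in identity order:
  // the tie goes to the identity order, so nothing is reordered.
  std::map<OrderTy, unsigned> Uses = {{{1, 0}, 2}, {{}, 2}};
  assert(pickBestOrder(Uses).empty());

  // A strict majority for <1, 0> wins.
  Uses[{1, 0}] = 3;
  assert((pickBestOrder(Uses) == OrderTy{1, 0}));
  return 0;
}
```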
+ ArrayRef<unsigned> BestOrder = OrdersUses.front().first; + unsigned Cnt = OrdersUses.front().second; + for (const auto &Pair : drop_begin(OrdersUses)) { + if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) { + BestOrder = Pair.first; + Cnt = Pair.second; + } + } + // Set order of the user node (reordering of operands and user nodes). + if (BestOrder.empty()) { + for_each(Data.second, + [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) { + OrderedEntries.remove(Op.second); + }); + continue; + } + // Erase operands from OrderedEntries list and adjust their orders. + VisitedOps.clear(); + SmallVector<int> Mask; + inversePermutation(BestOrder, Mask); + SmallVector<int> MaskOrder(BestOrder.size(), UndefMaskElem); + unsigned E = BestOrder.size(); + transform(BestOrder, MaskOrder.begin(), [E](unsigned I) { + return I < E ? static_cast<int>(I) : UndefMaskElem; + }); + for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) { + TreeEntry *TE = Op.second; + OrderedEntries.remove(TE); + if (!VisitedOps.insert(TE).second) + continue; + if (!TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty()) { + // Just reorder reuses indices. + reorderReuses(TE->ReuseShuffleIndices, Mask); + continue; + } + // Gathers are processed separately. + if (TE->State != TreeEntry::Vectorize) + continue; + assert((BestOrder.size() == TE->ReorderIndices.size() || + TE->ReorderIndices.empty()) && + "Non-matching sizes of user/operand entries."); + reorderOrder(TE->ReorderIndices, Mask); + } + // For gathers just need to reorder its scalars. + for (TreeEntry *Gather : GatherOps) { + assert(Gather->ReorderIndices.empty() && + "Unexpected reordering of gathers."); + if (!Gather->ReuseShuffleIndices.empty()) { + // Just reorder reuses indices. + reorderReuses(Gather->ReuseShuffleIndices, Mask); + continue; + } + reorderScalars(Gather->Scalars, Mask); + OrderedEntries.remove(Gather); + } + // Reorder operands of the user node and set the ordering for the user + // node itself. + if (Data.first->State != TreeEntry::Vectorize || + !isa<ExtractElementInst, ExtractValueInst, LoadInst>( + Data.first->getMainOp()) || + Data.first->isAltShuffle()) + Data.first->reorderOperands(Mask); + if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) || + Data.first->isAltShuffle()) { + reorderScalars(Data.first->Scalars, Mask); + reorderOrder(Data.first->ReorderIndices, MaskOrder); + if (Data.first->ReuseShuffleIndices.empty() && + !Data.first->ReorderIndices.empty() && + !Data.first->isAltShuffle()) { + // Insert user node to the list to try to sink reordering deeper in + // the graph. + OrderedEntries.insert(Data.first); + } + } else { + reorderOrder(Data.first->ReorderIndices, Mask); + } + } + } + // If the reordering is unnecessary, just remove the reorder. + if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() && + VectorizableTree.front()->ReuseShuffleIndices.empty()) + VectorizableTree.front()->ReorderIndices.clear(); +} + +void BoUpSLP::buildExternalUses( + const ExtraValueToDebugLocsMap &ExternallyUsedValues) { // Collect the values that we need to extract from the tree. 
for (auto &TEPtr : VectorizableTree) { TreeEntry *Entry = TEPtr.get(); @@ -2636,6 +3215,9 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, if (!UserInst) continue; + if (isDeleted(UserInst)) + continue; + // Skip in-tree scalars that become vectors if (TreeEntry *UseEntry = getTreeEntry(U)) { Value *UseScalar = UseEntry->Scalars[0]; @@ -2664,14 +3246,120 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, } } +void BoUpSLP::buildTree(ArrayRef<Value *> Roots, + ArrayRef<Value *> UserIgnoreLst) { + deleteTree(); + UserIgnoreList = UserIgnoreLst; + if (!allSameType(Roots)) + return; + buildTree_rec(Roots, 0, EdgeInfo()); +} + +namespace { +/// Tracks the state we can represent the loads in the given sequence. +enum class LoadsState { Gather, Vectorize, ScatterVectorize }; +} // anonymous namespace + +/// Checks if the given array of loads can be represented as a vectorized, +/// scatter or just simple gather. +static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0, + const TargetTransformInfo &TTI, + const DataLayout &DL, ScalarEvolution &SE, + SmallVectorImpl<unsigned> &Order, + SmallVectorImpl<Value *> &PointerOps) { + // Check that a vectorized load would load the same memory as a scalar + // load. For example, we don't want to vectorize loads that are smaller + // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM + // treats loading/storing it as an i8 struct. If we vectorize loads/stores + // from such a struct, we read/write packed bits disagreeing with the + // unvectorized version. + Type *ScalarTy = VL0->getType(); + + if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy)) + return LoadsState::Gather; + + // Make sure all loads in the bundle are simple - we can't vectorize + // atomic or volatile loads. + PointerOps.clear(); + PointerOps.resize(VL.size()); + auto *POIter = PointerOps.begin(); + for (Value *V : VL) { + auto *L = cast<LoadInst>(V); + if (!L->isSimple()) + return LoadsState::Gather; + *POIter = L->getPointerOperand(); + ++POIter; + } + + Order.clear(); + // Check the order of pointer operands. + if (llvm::sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order)) { + Value *Ptr0; + Value *PtrN; + if (Order.empty()) { + Ptr0 = PointerOps.front(); + PtrN = PointerOps.back(); + } else { + Ptr0 = PointerOps[Order.front()]; + PtrN = PointerOps[Order.back()]; + } + Optional<int> Diff = + getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE); + // Check that the sorted loads are consecutive. + if (static_cast<unsigned>(*Diff) == VL.size() - 1) + return LoadsState::Vectorize; + Align CommonAlignment = cast<LoadInst>(VL0)->getAlign(); + for (Value *V : VL) + CommonAlignment = + commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign()); + if (TTI.isLegalMaskedGather(FixedVectorType::get(ScalarTy, VL.size()), + CommonAlignment)) + return LoadsState::ScatterVectorize; + } + + return LoadsState::Gather; +} + void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); + SmallVector<int> ReuseShuffleIndicies; + SmallVector<Value *> UniqueValues; + auto &&TryToFindDuplicates = [&VL, &ReuseShuffleIndicies, &UniqueValues, + &UserTreeIdx, + this](const InstructionsState &S) { + // Check that every instruction appears once in this bundle. 
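The TryToFindDuplicates lambda that continues below deduplicates the bundle: each scalar is mapped to the lane of its first occurrence, undefs map to the undef mask element, and the unique values are only kept for vectorization when their count is a power of two greater than one. A standalone sketch of that bookkeeping, with plain strings standing in for Value* and "undef" marking an undef scalar (the function name is hypothetical):

```cpp
#include <cassert>
#include <string>
#include <unordered_map>
#include <vector>

constexpr int UndefElem = -1; // stand-in for UndefMaskElem

// Deduplicate a bundle: UniqueValues keeps the first occurrence of each
// scalar, ReuseMask maps every original lane to its unique lane (or to the
// undef element for "undef" scalars). Returns false when the unique values
// cannot form a legal bundle (fewer than two, or not a power of two).
static bool tryToFindDuplicates(const std::vector<std::string> &VL,
                                std::vector<std::string> &UniqueValues,
                                std::vector<int> &ReuseMask) {
  std::unordered_map<std::string, unsigned> UniquePositions;
  for (const std::string &V : VL) {
    auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
    ReuseMask.push_back(V == "undef" ? UndefElem
                                     : static_cast<int>(Res.first->second));
    if (Res.second)
      UniqueValues.push_back(V);
  }
  size_t NumUnique = UniqueValues.size();
  if (NumUnique == VL.size()) {
    ReuseMask.clear(); // nothing was reused, no shuffle needed
    return true;
  }
  // Reject bundles whose unique part is too small or not a power of two.
  return NumUnique > 1 && (NumUnique & (NumUnique - 1)) == 0;
}

int main() {
  std::vector<std::string> Unique;
  std::vector<int> Mask;
  // %a and %b each appear twice: vectorize <%a, %b> and reuse lanes 0/1.
  bool OK = tryToFindDuplicates({"%a", "%b", "%a", "%b"}, Unique, Mask);
  assert(OK);
  assert((Unique == std::vector<std::string>{"%a", "%b"}));
  assert((Mask == std::vector<int>{0, 1, 0, 1}));
  return 0;
}
```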
+ DenseMap<Value *, unsigned> UniquePositions; + for (Value *V : VL) { + auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); + ReuseShuffleIndicies.emplace_back(isa<UndefValue>(V) ? -1 + : Res.first->second); + if (Res.second) + UniqueValues.emplace_back(V); + } + size_t NumUniqueScalarValues = UniqueValues.size(); + if (NumUniqueScalarValues == VL.size()) { + ReuseShuffleIndicies.clear(); + } else { + LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); + if (NumUniqueScalarValues <= 1 || + !llvm::isPowerOf2_32(NumUniqueScalarValues)) { + LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + return false; + } + VL = UniqueValues; + } + return true; + }; + InstructionsState S = getSameOpcode(VL); if (Depth == RecursionMaxDepth) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + if (TryToFindDuplicates(S)) + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } @@ -2680,7 +3368,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, isa<ScalableVectorType>( cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + if (TryToFindDuplicates(S)) + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } @@ -2700,9 +3390,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } // If all of the operands are identical or constant we have a simple solution. - if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) { + // If we deal with insert/extract instructions, they all must have constant + // indices, otherwise we should gather them, not try to vectorize. + if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode() || + (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(S.MainOp) && + !all_of(VL, isVectorLikeInstWithConstOps))) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + if (TryToFindDuplicates(S)) + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } @@ -2724,7 +3420,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); if (!E->isSame(VL)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + if (TryToFindDuplicates(S)) + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } // Record the reuse of the tree node. 
FIXME, currently this is only used to @@ -2743,7 +3441,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (getTreeEntry(I)) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V << ") is already in tree.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + if (TryToFindDuplicates(S)) + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } } @@ -2754,7 +3454,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (Value *V : VL) { if (MustGather.count(V) || is_contained(UserIgnoreList, V)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + if (TryToFindDuplicates(S)) + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } } @@ -2773,28 +3475,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } // Check that every instruction appears once in this bundle. - SmallVector<unsigned, 4> ReuseShuffleIndicies; - SmallVector<Value *, 4> UniqueValues; - DenseMap<Value *, unsigned> UniquePositions; - for (Value *V : VL) { - auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); - ReuseShuffleIndicies.emplace_back(Res.first->second); - if (Res.second) - UniqueValues.emplace_back(V); - } - size_t NumUniqueScalarValues = UniqueValues.size(); - if (NumUniqueScalarValues == VL.size()) { - ReuseShuffleIndicies.clear(); - } else { - LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); - if (NumUniqueScalarValues <= 1 || - !llvm::isPowerOf2_32(NumUniqueScalarValues)) { - LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); - return; - } - VL = UniqueValues; - } + if (!TryToFindDuplicates(S)) + return; auto &BSRef = BlocksSchedules[BB]; if (!BSRef) @@ -2867,7 +3549,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, bool Reuse = canReuseExtract(VL, VL0, CurrentOrder); if (Reuse) { LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); - ++NumOpsWantToKeepOriginalOrder; newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); // This is a special case, as it does not gather, but at the same time @@ -2885,12 +3566,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, dbgs() << " " << Idx; dbgs() << "\n"; }); + fixupOrderingIndices(CurrentOrder); // Insert new order with initial value 0, if it does not exist, // otherwise return the iterator to the existing one. newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies, CurrentOrder); - findRootOrder(CurrentOrder); - ++NumOpsWantToKeepOrder[CurrentOrder]; // This is a special case, as it does not gather, but at the same time // we are not extending buildTree_rec() towards the operands. ValueList Op0; @@ -2910,8 +3590,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Check that we have a buildvector and not a shuffle of 2 or more // different vectors. 
ValueSet SourceVectors; - for (Value *V : VL) + int MinIdx = std::numeric_limits<int>::max(); + for (Value *V : VL) { SourceVectors.insert(cast<Instruction>(V)->getOperand(0)); + Optional<int> Idx = *getInsertIndex(V, 0); + if (!Idx || *Idx == UndefMaskElem) + continue; + MinIdx = std::min(MinIdx, *Idx); + } if (count_if(VL, [&SourceVectors](Value *V) { return !SourceVectors.contains(V); @@ -2919,13 +3605,35 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Found 2nd source vector - cancel. LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with " "different source vectors.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); BS.cancelScheduling(VL, VL0); return; } - TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx); + auto OrdCompare = [](const std::pair<int, int> &P1, + const std::pair<int, int> &P2) { + return P1.first > P2.first; + }; + PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>, + decltype(OrdCompare)> + Indices(OrdCompare); + for (int I = 0, E = VL.size(); I < E; ++I) { + Optional<int> Idx = *getInsertIndex(VL[I], 0); + if (!Idx || *Idx == UndefMaskElem) + continue; + Indices.emplace(*Idx, I); + } + OrdersType CurrentOrder(VL.size(), VL.size()); + bool IsIdentity = true; + for (int I = 0, E = VL.size(); I < E; ++I) { + CurrentOrder[Indices.top().second] = I; + IsIdentity &= Indices.top().second == I; + Indices.pop(); + } + if (IsIdentity) + CurrentOrder.clear(); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + None, CurrentOrder); LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n"); constexpr int NumOps = 2; @@ -2936,7 +3644,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, TE->setOperand(I, VectorOperands[I]); } - buildTree_rec(VectorOperands[NumOps - 1], Depth + 1, {TE, 0}); + buildTree_rec(VectorOperands[NumOps - 1], Depth + 1, {TE, NumOps - 1}); return; } case Instruction::Load: { @@ -2946,90 +3654,52 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // treats loading/storing it as an i8 struct. If we vectorize loads/stores // from such a struct, we read/write packed bits disagreeing with the // unvectorized version. - Type *ScalarTy = VL0->getType(); - - if (DL->getTypeSizeInBits(ScalarTy) != - DL->getTypeAllocSizeInBits(ScalarTy)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); - return; - } - - // Make sure all loads in the bundle are simple - we can't vectorize - // atomic or volatile loads. - SmallVector<Value *, 4> PointerOps(VL.size()); - auto POIter = PointerOps.begin(); - for (Value *V : VL) { - auto *L = cast<LoadInst>(V); - if (!L->isSimple()) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); - return; - } - *POIter = L->getPointerOperand(); - ++POIter; - } - + SmallVector<Value *> PointerOps; OrdersType CurrentOrder; - // Check the order of pointer operands. 
- if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) { - Value *Ptr0; - Value *PtrN; + TreeEntry *TE = nullptr; + switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, CurrentOrder, + PointerOps)) { + case LoadsState::Vectorize: if (CurrentOrder.empty()) { - Ptr0 = PointerOps.front(); - PtrN = PointerOps.back(); + // Original loads are consecutive and does not require reordering. + TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { - Ptr0 = PointerOps[CurrentOrder.front()]; - PtrN = PointerOps[CurrentOrder.back()]; - } - Optional<int> Diff = getPointersDiff( - ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE); - // Check that the sorted loads are consecutive. - if (static_cast<unsigned>(*Diff) == VL.size() - 1) { - if (CurrentOrder.empty()) { - // Original loads are consecutive and does not require reordering. - ++NumOpsWantToKeepOriginalOrder; - TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, - UserTreeIdx, ReuseShuffleIndicies); - TE->setOperandsInOrder(); - LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); - } else { - // Need to reorder. - TreeEntry *TE = - newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies, CurrentOrder); - TE->setOperandsInOrder(); - LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); - findRootOrder(CurrentOrder); - ++NumOpsWantToKeepOrder[CurrentOrder]; - } - return; - } - Align CommonAlignment = cast<LoadInst>(VL0)->getAlign(); - for (Value *V : VL) - CommonAlignment = - commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign()); - if (TTI->isLegalMaskedGather(FixedVectorType::get(ScalarTy, VL.size()), - CommonAlignment)) { - // Vectorizing non-consecutive loads with `llvm.masked.gather`. - TreeEntry *TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, - S, UserTreeIdx, ReuseShuffleIndicies); - TE->setOperandsInOrder(); - buildTree_rec(PointerOps, Depth + 1, {TE, 0}); - LLVM_DEBUG(dbgs() - << "SLP: added a vector of non-consecutive loads.\n"); - return; + fixupOrderingIndices(CurrentOrder); + // Need to reorder. + TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies, CurrentOrder); + LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); } + TE->setOperandsInOrder(); + break; + case LoadsState::ScatterVectorize: + // Vectorizing non-consecutive loads with `llvm.masked.gather`. 
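The new canVectorizeLoads helper folds the old inline logic into three outcomes: pointers that form a consecutive run become a plain vector load (possibly with a reordering shuffle), non-consecutive but still addressable pointers can be vectorized through llvm.masked.gather, and everything else stays a gather of scalars. A simplified standalone model of that decision, working on already-computed element offsets and a boolean for masked-gather legality rather than on real pointers and TTI:

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

enum class LoadsState { Gather, Vectorize, ScatterVectorize };

// Decide how a bundle of loads can be vectorized, given the element offset of
// each pointer operand from a common base. Consecutive offsets (after
// sorting) give a plain vector load; otherwise fall back to a masked gather
// if the target supports it, and to scalar gathering if not.
static LoadsState classifyLoads(std::vector<int> Offsets,
                                bool MaskedGatherIsLegal) {
  if (Offsets.empty())
    return LoadsState::Gather;
  std::sort(Offsets.begin(), Offsets.end());
  // Consecutive means the distance between the first and last sorted pointer
  // is exactly the number of loads minus one (in elements).
  if (Offsets.back() - Offsets.front() ==
      static_cast<int>(Offsets.size()) - 1)
    return LoadsState::Vectorize;
  return MaskedGatherIsLegal ? LoadsState::ScatterVectorize
                             : LoadsState::Gather;
}

int main() {
  // Jumbled but consecutive loads: vectorizable with a reordering shuffle.
  assert(classifyLoads({2, 0, 3, 1}, false) == LoadsState::Vectorize);
  // Strided loads: only a masked gather can handle them.
  assert(classifyLoads({0, 2, 4, 6}, true) == LoadsState::ScatterVectorize);
  assert(classifyLoads({0, 2, 4, 6}, false) == LoadsState::Gather);
  return 0;
}
```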
+ TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S, + UserTreeIdx, ReuseShuffleIndicies); + TE->setOperandsInOrder(); + buildTree_rec(PointerOps, Depth + 1, {TE, 0}); + LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n"); + break; + case LoadsState::Gather: + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); +#ifndef NDEBUG + Type *ScalarTy = VL0->getType(); + if (DL->getTypeSizeInBits(ScalarTy) != + DL->getTypeAllocSizeInBits(ScalarTy)) + LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); + else if (any_of(VL, [](Value *V) { + return !cast<LoadInst>(V)->isSimple(); + })) + LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); + else + LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); +#endif // NDEBUG + break; } - - LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); return; } case Instruction::ZExt: @@ -3213,15 +3883,40 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); - TE->setOperandsInOrder(); - for (unsigned i = 0, e = 2; i < e; ++i) { - ValueList Operands; - // Prepare the operand vector. - for (Value *V : VL) - Operands.push_back(cast<Instruction>(V)->getOperand(i)); - - buildTree_rec(Operands, Depth + 1, {TE, i}); + SmallVector<ValueList, 2> Operands(2); + // Prepare the operand vector for pointer operands. + for (Value *V : VL) + Operands.front().push_back( + cast<GetElementPtrInst>(V)->getPointerOperand()); + TE->setOperand(0, Operands.front()); + // Need to cast all indices to the same type before vectorization to + // avoid crash. + // Required to be able to find correct matches between different gather + // nodes and reuse the vectorized values rather than trying to gather them + // again. + int IndexIdx = 1; + Type *VL0Ty = VL0->getOperand(IndexIdx)->getType(); + Type *Ty = all_of(VL, + [VL0Ty, IndexIdx](Value *V) { + return VL0Ty == cast<GetElementPtrInst>(V) + ->getOperand(IndexIdx) + ->getType(); + }) + ? VL0Ty + : DL->getIndexType(cast<GetElementPtrInst>(VL0) + ->getPointerOperandType() + ->getScalarType()); + // Prepare the operand vector. + for (Value *V : VL) { + auto *Op = cast<Instruction>(V)->getOperand(IndexIdx); + auto *CI = cast<ConstantInt>(Op); + Operands.back().push_back(ConstantExpr::getIntegerCast( + CI, Ty, CI->getValue().isSignBitSet())); } + TE->setOperand(IndexIdx, Operands.back()); + + for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I) + buildTree_rec(Operands[I], Depth + 1, {TE, I}); return; } case Instruction::Store: { @@ -3276,21 +3971,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (static_cast<unsigned>(*Dist) == VL.size() - 1) { if (CurrentOrder.empty()) { // Original stores are consecutive and does not require reordering. 
- ++NumOpsWantToKeepOriginalOrder; TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); TE->setOperandsInOrder(); buildTree_rec(Operands, Depth + 1, {TE, 0}); LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); } else { + fixupOrderingIndices(CurrentOrder); TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies, CurrentOrder); TE->setOperandsInOrder(); buildTree_rec(Operands, Depth + 1, {TE, 0}); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n"); - findRootOrder(CurrentOrder); - ++NumOpsWantToKeepOrder[CurrentOrder]; } return; } @@ -3321,7 +4014,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, return; } Function *F = CI->getCalledFunction(); - unsigned NumArgs = CI->getNumArgOperands(); + unsigned NumArgs = CI->arg_size(); SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr); for (unsigned j = 0; j != NumArgs; ++j) if (hasVectorInstrinsicScalarOpd(ID, j)) @@ -3373,7 +4066,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); TE->setOperandsInOrder(); - for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { + for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) { + // For scalar operands no need to to create an entry since no need to + // vectorize it. + if (hasVectorInstrinsicScalarOpd(ID, i)) + continue; ValueList Operands; // Prepare the operand vector. for (Value *V : VL) { @@ -3548,7 +4245,7 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, FastMathFlags FMF; if (auto *FPCI = dyn_cast<FPMathOperator>(CI)) FMF = FPCI->getFastMathFlags(); - SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end()); + SmallVector<const Value *> Arguments(CI->args()); IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, VecTys, FMF, dyn_cast<IntrinsicInst>(CI)); auto IntrinsicCost = @@ -3621,25 +4318,42 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy, return Cost; } -/// Shuffles \p Mask in accordance with the given \p SubMask. -static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) { - if (SubMask.empty()) - return; - if (Mask.empty()) { - Mask.append(SubMask.begin(), SubMask.end()); - return; - } - SmallVector<int, 4> NewMask(SubMask.size(), SubMask.size()); - int TermValue = std::min(Mask.size(), SubMask.size()); - for (int I = 0, E = SubMask.size(); I < E; ++I) { - if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem || - Mask[SubMask[I]] >= TermValue) { - NewMask[I] = UndefMaskElem; - continue; +/// Build shuffle mask for shuffle graph entries and lists of main and alternate +/// operations operands. 
+static void +buildSuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices, + ArrayRef<int> ReusesIndices, + const function_ref<bool(Instruction *)> IsAltOp, + SmallVectorImpl<int> &Mask, + SmallVectorImpl<Value *> *OpScalars = nullptr, + SmallVectorImpl<Value *> *AltScalars = nullptr) { + unsigned Sz = VL.size(); + Mask.assign(Sz, UndefMaskElem); + SmallVector<int> OrderMask; + if (!ReorderIndices.empty()) + inversePermutation(ReorderIndices, OrderMask); + for (unsigned I = 0; I < Sz; ++I) { + unsigned Idx = I; + if (!ReorderIndices.empty()) + Idx = OrderMask[I]; + auto *OpInst = cast<Instruction>(VL[Idx]); + if (IsAltOp(OpInst)) { + Mask[I] = Sz + Idx; + if (AltScalars) + AltScalars->push_back(OpInst); + } else { + Mask[I] = Idx; + if (OpScalars) + OpScalars->push_back(OpInst); } - NewMask[I] = Mask[SubMask[I]]; } - Mask.swap(NewMask); + if (!ReusesIndices.empty()) { + SmallVector<int> NewMask(ReusesIndices.size(), UndefMaskElem); + transform(ReusesIndices, NewMask.begin(), [&Mask](int Idx) { + return Idx != UndefMaskElem ? Mask[Idx] : UndefMaskElem; + }); + Mask.swap(NewMask); + } } InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, @@ -3661,13 +4375,10 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, if (MinBWs.count(VL[0])) VecTy = FixedVectorType::get( IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); - auto *FinalVecTy = VecTy; + unsigned EntryVF = E->getVectorFactor(); + auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF); - unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size(); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); - if (NeedToShuffleReuses) - FinalVecTy = - FixedVectorType::get(VecTy->getElementType(), ReuseShuffleNumbers); // FIXME: it tries to fix a problem with MSVC buildbots. TargetTransformInfo &TTIRef = *TTI; auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL, VecTy, @@ -3785,7 +4496,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // shuffle of a single/two vectors the scalars are extracted from. SmallVector<int> Mask; Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = - isShuffle(VL, Mask); + isFixedVectorShuffle(VL, Mask); if (ShuffleKind.hasValue()) { // Found the bunch of extractelement instructions that must be gathered // into a vector and can be represented as a permutation elements in a @@ -3803,6 +4514,92 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, if (NeedToShuffleReuses) ReuseShuffleCost = TTI->getShuffleCost( TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices); + // Improve gather cost for gather of loads, if we can group some of the + // loads into vector loads. 
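// A standalone model of the mask built above: lane I selects element Idx from
// the main-opcode vector or Sz + Idx from the alternate-opcode vector, after
// undoing ReorderIndices and then remapping through ReuseShuffleIndices. The
// per-scalar opcode test is reduced to an IsAlt flag, and buildAltOpMask is a
// hypothetical name for this simplified version.
#include <cstdio>
#include <vector>

static const int UndefMaskElem = -1;

static std::vector<int> buildAltOpMask(const std::vector<bool> &IsAlt,
                                       const std::vector<unsigned> &ReorderIndices,
                                       const std::vector<int> &ReusesIndices) {
  unsigned Sz = IsAlt.size();
  std::vector<int> Mask(Sz, UndefMaskElem);
  std::vector<int> OrderMask;
  if (!ReorderIndices.empty()) { // same job as inversePermutation()
    OrderMask.assign(Sz, 0);
    for (unsigned I = 0; I < Sz; ++I)
      OrderMask[ReorderIndices[I]] = I;
  }
  for (unsigned I = 0; I < Sz; ++I) {
    unsigned Idx = ReorderIndices.empty() ? I : OrderMask[I];
    Mask[I] = IsAlt[Idx] ? (int)(Sz + Idx) : (int)Idx;
  }
  if (!ReusesIndices.empty()) { // apply the reuse indices on top
    std::vector<int> NewMask(ReusesIndices.size(), UndefMaskElem);
    for (unsigned I = 0, E = (unsigned)ReusesIndices.size(); I < E; ++I)
      if (ReusesIndices[I] != UndefMaskElem)
        NewMask[I] = Mask[ReusesIndices[I]];
    Mask.swap(NewMask);
  }
  return Mask;
}

int main() {
  // Scalars {a+b, c-d, e+f, g-h}: odd lanes use the alternate opcode (sub),
  // so the select mask picks lanes 1 and 3 from the second (sub) vector.
  for (int V : buildAltOpMask({false, true, false, true}, {}, {}))
    std::printf("%d ", V); // prints: 0 5 2 7
  std::printf("\n");
}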
+ if (VL.size() > 2 && E->getOpcode() == Instruction::Load && + !E->isAltShuffle()) { + BoUpSLP::ValueSet VectorizedLoads; + unsigned StartIdx = 0; + unsigned VF = VL.size() / 2; + unsigned VectorizedCnt = 0; + unsigned ScatterVectorizeCnt = 0; + const unsigned Sz = DL->getTypeSizeInBits(E->getMainOp()->getType()); + for (unsigned MinVF = getMinVF(2 * Sz); VF >= MinVF; VF /= 2) { + for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End; + Cnt += VF) { + ArrayRef<Value *> Slice = VL.slice(Cnt, VF); + if (!VectorizedLoads.count(Slice.front()) && + !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) { + SmallVector<Value *> PointerOps; + OrdersType CurrentOrder; + LoadsState LS = canVectorizeLoads(Slice, Slice.front(), *TTI, *DL, + *SE, CurrentOrder, PointerOps); + switch (LS) { + case LoadsState::Vectorize: + case LoadsState::ScatterVectorize: + // Mark the vectorized loads so that we don't vectorize them + // again. + if (LS == LoadsState::Vectorize) + ++VectorizedCnt; + else + ++ScatterVectorizeCnt; + VectorizedLoads.insert(Slice.begin(), Slice.end()); + // If we vectorized initial block, no need to try to vectorize it + // again. + if (Cnt == StartIdx) + StartIdx += VF; + break; + case LoadsState::Gather: + break; + } + } + } + // Check if the whole array was vectorized already - exit. + if (StartIdx >= VL.size()) + break; + // Found vectorizable parts - exit. + if (!VectorizedLoads.empty()) + break; + } + if (!VectorizedLoads.empty()) { + InstructionCost GatherCost = 0; + unsigned NumParts = TTI->getNumberOfParts(VecTy); + bool NeedInsertSubvectorAnalysis = + !NumParts || (VL.size() / VF) > NumParts; + // Get the cost for gathered loads. + for (unsigned I = 0, End = VL.size(); I < End; I += VF) { + if (VectorizedLoads.contains(VL[I])) + continue; + GatherCost += getGatherCost(VL.slice(I, VF)); + } + // The cost for vectorized loads. + InstructionCost ScalarsCost = 0; + for (Value *V : VectorizedLoads) { + auto *LI = cast<LoadInst>(V); + ScalarsCost += TTI->getMemoryOpCost( + Instruction::Load, LI->getType(), LI->getAlign(), + LI->getPointerAddressSpace(), CostKind, LI); + } + auto *LI = cast<LoadInst>(E->getMainOp()); + auto *LoadTy = FixedVectorType::get(LI->getType(), VF); + Align Alignment = LI->getAlign(); + GatherCost += + VectorizedCnt * + TTI->getMemoryOpCost(Instruction::Load, LoadTy, Alignment, + LI->getPointerAddressSpace(), CostKind, LI); + GatherCost += ScatterVectorizeCnt * + TTI->getGatherScatterOpCost( + Instruction::Load, LoadTy, LI->getPointerOperand(), + /*VariableMask=*/false, Alignment, CostKind, LI); + if (NeedInsertSubvectorAnalysis) { + // Add the cost for the subvectors insert. 
+ for (int I = VF, E = VL.size(); I < E; I += VF) + GatherCost += TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, + None, I, LoadTy); + } + return ReuseShuffleCost + GatherCost - ScalarsCost; + } + } return ReuseShuffleCost + getGatherCost(VL); } InstructionCost CommonCost = 0; @@ -3852,7 +4649,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, ++Idx; } } - Idx = ReuseShuffleNumbers; + Idx = EntryVF; for (Value *V : VL) { if (ShuffleOrOp == Instruction::ExtractElement) { auto *EE = cast<ExtractElementInst>(V); @@ -3895,29 +4692,33 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, return CommonCost; } case Instruction::InsertElement: { + assert(E->ReuseShuffleIndices.empty() && + "Unique insertelements only are expected."); auto *SrcVecTy = cast<FixedVectorType>(VL0->getType()); unsigned const NumElts = SrcVecTy->getNumElements(); unsigned const NumScalars = VL.size(); - APInt DemandedElts = APInt::getNullValue(NumElts); + APInt DemandedElts = APInt::getZero(NumElts); // TODO: Add support for Instruction::InsertValue. - unsigned Offset = UINT_MAX; + SmallVector<int> Mask; + if (!E->ReorderIndices.empty()) { + inversePermutation(E->ReorderIndices, Mask); + Mask.append(NumElts - NumScalars, UndefMaskElem); + } else { + Mask.assign(NumElts, UndefMaskElem); + std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0); + } + unsigned Offset = *getInsertIndex(VL0, 0); bool IsIdentity = true; - SmallVector<int> ShuffleMask(NumElts, UndefMaskElem); + SmallVector<int> PrevMask(NumElts, UndefMaskElem); + Mask.swap(PrevMask); for (unsigned I = 0; I < NumScalars; ++I) { - Optional<int> InsertIdx = getInsertIndex(VL[I], 0); + Optional<int> InsertIdx = getInsertIndex(VL[PrevMask[I]], 0); if (!InsertIdx || *InsertIdx == UndefMaskElem) continue; - unsigned Idx = *InsertIdx; - DemandedElts.setBit(Idx); - if (Idx < Offset) { - Offset = Idx; - IsIdentity &= I == 0; - } else { - assert(Idx >= Offset && "Failed to find vector index offset"); - IsIdentity &= Idx - Offset == I; - } - ShuffleMask[Idx] = I; + DemandedElts.setBit(*InsertIdx); + IsIdentity &= *InsertIdx - Offset == I; + Mask[*InsertIdx - Offset] = I; } assert(Offset < NumElts && "Failed to find vector index offset"); @@ -3932,8 +4733,23 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, TargetTransformInfo::SK_PermuteSingleSrc, FixedVectorType::get(SrcVecTy->getElementType(), Sz)); } else if (!IsIdentity) { - Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, - ShuffleMask); + auto *FirstInsert = + cast<Instruction>(*find_if(E->Scalars, [E](Value *V) { + return !is_contained(E->Scalars, + cast<Instruction>(V)->getOperand(0)); + })); + if (isa<UndefValue>(FirstInsert->getOperand(0))) { + Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, Mask); + } else { + SmallVector<int> InsertMask(NumElts); + std::iota(InsertMask.begin(), InsertMask.end(), 0); + for (unsigned I = 0; I < NumElts; I++) { + if (Mask[I] != UndefMaskElem) + InsertMask[Offset + I] = NumElts + I; + } + Cost += + TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVecTy, InsertMask); + } } return Cost; @@ -3955,7 +4771,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, TTI::getCastContextHint(VL0), CostKind, VL0); if (NeedToShuffleReuses) { - CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; } // Calculate the cost of this instruction. 
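// A small model of the bookkeeping in the InsertElement cost case above:
// given each scalar's destination lane (in bundle order) and the lane of the
// first scalar as Offset, it records which lanes are written (the demanded
// elements priced via getScalarizationOverhead) and whether the group is a
// contiguous identity insert, in which case no extra permute is costed. Lane
// numbers are assumed valid, Offset is assumed to be the lowest lane, and
// analyzeInserts is a hypothetical helper.
#include <cstdio>
#include <vector>

struct InsertGroupInfo {
  bool IsIdentity = true;
  std::vector<bool> Demanded; // per destination lane
  std::vector<int> Mask;      // permutation applied before the insert
};

static InsertGroupInfo analyzeInserts(const std::vector<unsigned> &Lanes,
                                      unsigned NumElts, unsigned Offset) {
  InsertGroupInfo Info;
  Info.Demanded.assign(NumElts, false);
  Info.Mask.assign(NumElts, -1);
  for (unsigned I = 0; I < Lanes.size(); ++I) {
    Info.Demanded[Lanes[I]] = true;
    Info.IsIdentity &= Lanes[I] - Offset == I;
    Info.Mask[Lanes[I] - Offset] = (int)I;
  }
  return Info;
}

int main() {
  // Four scalars written to lanes 4..7 of an 8-wide vector, in order: an
  // identity insert, so only the scalarization overhead is charged.
  InsertGroupInfo A = analyzeInserts({4, 5, 6, 7}, 8, 4);
  // Same lanes written out of order: a single-source permute is also costed.
  InsertGroupInfo B = analyzeInserts({5, 4, 7, 6}, 8, 4);
  std::printf("A identity=%d  B identity=%d\n", A.IsIdentity, B.IsIdentity);
}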
@@ -3980,7 +4796,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(), CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0); if (NeedToShuffleReuses) { - CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; } auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size()); InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; @@ -4085,7 +4901,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0); if (NeedToShuffleReuses) { - CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; } InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; InstructionCost VecCost = @@ -4103,7 +4919,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost( Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK); if (NeedToShuffleReuses) { - CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; } InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; InstructionCost VecCost = TTI->getArithmeticInstrCost( @@ -4117,7 +4933,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, InstructionCost ScalarEltCost = TTI->getMemoryOpCost( Instruction::Load, ScalarTy, Alignment, 0, CostKind, VL0); if (NeedToShuffleReuses) { - CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; } InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost; InstructionCost VecLdCost; @@ -4160,7 +4976,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, InstructionCost ScalarEltCost = TTI->getIntrinsicInstrCost(CostAttrs, CostKind); if (NeedToShuffleReuses) { - CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; } InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost; @@ -4215,14 +5031,16 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, TTI::CastContextHint::None, CostKind); } - SmallVector<int> Mask(E->Scalars.size()); - for (unsigned I = 0, End = E->Scalars.size(); I < End; ++I) { - auto *OpInst = cast<Instruction>(E->Scalars[I]); - assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); - Mask[I] = I + (OpInst->getOpcode() == E->getAltOpcode() ? 
End : 0); - } - VecCost += - TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask, 0); + SmallVector<int> Mask; + buildSuffleEntryMask( + E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, + [E](Instruction *I) { + assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); + return I->getOpcode() == E->getAltOpcode(); + }, + Mask); + CommonCost = + TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy, Mask); LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); return CommonCost + VecCost - ScalarCost; } @@ -4231,13 +5049,30 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, } } -bool BoUpSLP::isFullyVectorizableTinyTree() const { +bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const { LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height " << VectorizableTree.size() << " is fully vectorizable .\n"); + auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) { + SmallVector<int> Mask; + return TE->State == TreeEntry::NeedToGather && + !any_of(TE->Scalars, + [this](Value *V) { return EphValues.contains(V); }) && + (allConstant(TE->Scalars) || isSplat(TE->Scalars) || + TE->Scalars.size() < Limit || + (TE->getOpcode() == Instruction::ExtractElement && + isFixedVectorShuffle(TE->Scalars, Mask)) || + (TE->State == TreeEntry::NeedToGather && + TE->getOpcode() == Instruction::Load && !TE->isAltShuffle())); + }; + // We only handle trees of heights 1 and 2. if (VectorizableTree.size() == 1 && - VectorizableTree[0]->State == TreeEntry::Vectorize) + (VectorizableTree[0]->State == TreeEntry::Vectorize || + (ForReduction && + AreVectorizableGathers(VectorizableTree[0].get(), + VectorizableTree[0]->Scalars.size()) && + VectorizableTree[0]->getVectorFactor() > 2))) return true; if (VectorizableTree.size() != 2) @@ -4249,19 +5084,14 @@ bool BoUpSLP::isFullyVectorizableTinyTree() const { // or they are extractelements, which form shuffle. SmallVector<int> Mask; if (VectorizableTree[0]->State == TreeEntry::Vectorize && - (allConstant(VectorizableTree[1]->Scalars) || - isSplat(VectorizableTree[1]->Scalars) || - (VectorizableTree[1]->State == TreeEntry::NeedToGather && - VectorizableTree[1]->Scalars.size() < - VectorizableTree[0]->Scalars.size()) || - (VectorizableTree[1]->State == TreeEntry::NeedToGather && - VectorizableTree[1]->getOpcode() == Instruction::ExtractElement && - isShuffle(VectorizableTree[1]->Scalars, Mask)))) + AreVectorizableGathers(VectorizableTree[1].get(), + VectorizableTree[0]->Scalars.size())) return true; // Gathering cost would be too much for tiny trees. if (VectorizableTree[0]->State == TreeEntry::NeedToGather || - VectorizableTree[1]->State == TreeEntry::NeedToGather) + (VectorizableTree[1]->State == TreeEntry::NeedToGather && + VectorizableTree[0]->State != TreeEntry::ScatterVectorize)) return false; return true; @@ -4330,7 +5160,7 @@ bool BoUpSLP::isLoadCombineCandidate() const { return true; } -bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const { +bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { // No need to vectorize inserts of gathered values. if (VectorizableTree.size() == 2 && isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) && @@ -4344,7 +5174,7 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const { // If we have a tiny tree (a tree whose size is less than MinTreeSize), we // can vectorize it if we can prove it fully vectorizable. 
- if (isFullyVectorizableTinyTree()) + if (isFullyVectorizableTinyTree(ForReduction)) return false; assert(VectorizableTree.empty() @@ -4496,7 +5326,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { // If found user is an insertelement, do not calculate extract cost but try // to detect it as a final shuffled/identity match. - if (EU.User && isa<InsertElementInst>(EU.User)) { + if (isa_and_nonnull<InsertElementInst>(EU.User)) { if (auto *FTy = dyn_cast<FixedVectorType>(EU.User->getType())) { Optional<int> InsertIdx = getInsertIndex(EU.User, 0); if (!InsertIdx || *InsertIdx == UndefMaskElem) @@ -4508,8 +5338,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { return false; auto *IE1 = cast<InsertElementInst>(VU); auto *IE2 = cast<InsertElementInst>(V); - // Go though of insertelement instructions trying to find either VU as - // the original vector for IE2 or V as the original vector for IE1. + // Go through of insertelement instructions trying to find either VU + // as the original vector for IE2 or V as the original vector for IE1. do { if (IE1 == VU || IE2 == V) return true; @@ -4525,7 +5355,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { VF.push_back(FTy->getNumElements()); ShuffleMask.emplace_back(VF.back(), UndefMaskElem); FirstUsers.push_back(EU.User); - DemandedElts.push_back(APInt::getNullValue(VF.back())); + DemandedElts.push_back(APInt::getZero(VF.back())); VecId = FirstUsers.size() - 1; } else { VecId = std::distance(FirstUsers.begin(), It); @@ -4705,18 +5535,11 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask, } else { // Try to find nodes with the same vector factor. assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries."); - // FIXME: Shall be replaced by GetVF function once non-power-2 patch is - // landed. - auto &&GetVF = [](const TreeEntry *TE) { - if (!TE->ReuseShuffleIndices.empty()) - return TE->ReuseShuffleIndices.size(); - return TE->Scalars.size(); - }; DenseMap<int, const TreeEntry *> VFToTE; for (const TreeEntry *TE : UsedTEs.front()) - VFToTE.try_emplace(GetVF(TE), TE); + VFToTE.try_emplace(TE->getVectorFactor(), TE); for (const TreeEntry *TE : UsedTEs.back()) { - auto It = VFToTE.find(GetVF(TE)); + auto It = VFToTE.find(TE->getVectorFactor()); if (It != VFToTE.end()) { VF = It->first; Entries.push_back(It->second); @@ -4757,16 +5580,17 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask, InstructionCost BoUpSLP::getGatherCost(FixedVectorType *Ty, - const DenseSet<unsigned> &ShuffledIndices) const { + const DenseSet<unsigned> &ShuffledIndices, + bool NeedToShuffle) const { unsigned NumElts = Ty->getNumElements(); - APInt DemandedElts = APInt::getNullValue(NumElts); + APInt DemandedElts = APInt::getZero(NumElts); for (unsigned I = 0; I < NumElts; ++I) if (!ShuffledIndices.count(I)) DemandedElts.setBit(I); InstructionCost Cost = TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true, /*Extract*/ false); - if (!ShuffledIndices.empty()) + if (NeedToShuffle) Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty); return Cost; } @@ -4777,6 +5601,7 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const { if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) ScalarTy = SI->getValueOperand()->getType(); auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); + bool DuplicateNonConst = false; // Find the cost of inserting/extracting values from the vector. 
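// A sketch of the lane classification the gather cost below performs:
// constant scalars are folded straight into the build vector (no
// insertelement charged), and a repeated non-constant scalar is produced by
// shuffling the lane that was already inserted, which is what sets the
// DuplicateNonConst flag and adds one SK_PermuteSingleSrc. Scalars are
// modelled as strings and classifyGatherLanes is a hypothetical name.
#include <cstdio>
#include <string>
#include <unordered_set>
#include <vector>

struct GatherCostInputs {
  std::unordered_set<unsigned> SkippedLanes; // lanes with no insert cost
  bool NeedPermute = false;                  // DuplicateNonConst in the patch
};

static GatherCostInputs
classifyGatherLanes(const std::vector<std::string> &VL,
                    const std::unordered_set<std::string> &Constants) {
  GatherCostInputs R;
  std::unordered_set<std::string> Seen;
  // Walk from the last lane, as the patch does, so the higher-cost insert of
  // a duplicated value is the one whose cost is kept.
  for (unsigned I = VL.size(); I > 0; --I) {
    unsigned Lane = I - 1;
    if (Constants.count(VL[Lane])) {
      R.SkippedLanes.insert(Lane); // constants need no shuffle either
      continue;
    }
    if (!Seen.insert(VL[Lane]).second) {
      R.NeedPermute = true; // duplicate scalar: reuse it through a shuffle
      R.SkippedLanes.insert(Lane);
    }
  }
  return R;
}

int main() {
  GatherCostInputs R = classifyGatherLanes({"a", "1", "a", "b"}, {"1"});
  std::printf("skipped=%zu permute=%d\n", R.SkippedLanes.size(), R.NeedPermute);
}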
// Check if the same elements are inserted several times and count them as // shuffle candidates. @@ -4785,12 +5610,17 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const { // Iterate in reverse order to consider insert elements with the high cost. for (unsigned I = VL.size(); I > 0; --I) { unsigned Idx = I - 1; - if (isConstant(VL[Idx])) + // No need to shuffle duplicates for constants. + if (isConstant(VL[Idx])) { + ShuffledElements.insert(Idx); continue; - if (!UniqueElements.insert(VL[Idx]).second) + } + if (!UniqueElements.insert(VL[Idx]).second) { + DuplicateNonConst = true; ShuffledElements.insert(Idx); + } } - return getGatherCost(VecTy, ShuffledElements); + return getGatherCost(VecTy, ShuffledElements, DuplicateNonConst); } // Perform operand reordering on the instructions in VL and return the reordered @@ -5006,17 +5836,18 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { // block: // %phi = phi <2 x > { .., %entry} {%shuffle, %block} - // %2 = shuffle <2 x > %phi, %poison, <4 x > <0, 0, 1, 1> + // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0> // ... (use %2) - // %shuffle = shuffle <2 x> %2, poison, <2 x> {0, 2} + // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0} // br %block - SmallVector<int> UniqueIdxs; + SmallVector<int> UniqueIdxs(VF, UndefMaskElem); SmallSet<int, 4> UsedIdxs; int Pos = 0; int Sz = VL.size(); for (int Idx : E->ReuseShuffleIndices) { - if (Idx != Sz && UsedIdxs.insert(Idx).second) - UniqueIdxs.emplace_back(Pos); + if (Idx != Sz && Idx != UndefMaskElem && + UsedIdxs.insert(Idx).second) + UniqueIdxs[Idx] = Pos; ++Pos; } assert(VF >= UsedIdxs.size() && "Expected vectorization factor " @@ -5047,11 +5878,9 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { }).base()); VF = std::max<unsigned>(VF, PowerOf2Ceil(NumValues)); int UniqueVals = 0; - bool HasUndefs = false; for (Value *V : VL.drop_back(VL.size() - VF)) { if (isa<UndefValue>(V)) { ReuseShuffleIndicies.emplace_back(UndefMaskElem); - HasUndefs = true; continue; } if (isConstant(V)) { @@ -5066,15 +5895,10 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { ++UniqueVals; } } - if (HasUndefs && UniqueVals == 1 && UniqueValues.size() == 1) { + if (UniqueVals == 1 && UniqueValues.size() == 1) { // Emit pure splat vector. - // FIXME: why it is not identified as an identity. 
- unsigned NumUndefs = count(ReuseShuffleIndicies, UndefMaskElem); - if (NumUndefs == ReuseShuffleIndicies.size() - 1) - ReuseShuffleIndicies.append(VF - ReuseShuffleIndicies.size(), - UndefMaskElem); - else - ReuseShuffleIndicies.assign(VF, 0); + ReuseShuffleIndicies.append(VF - ReuseShuffleIndicies.size(), + UndefMaskElem); } else if (UniqueValues.size() >= VF - 1 || UniqueValues.size() <= 1) { ReuseShuffleIndicies.clear(); UniqueValues.clear(); @@ -5107,12 +5931,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); - unsigned VF = E->Scalars.size(); - if (NeedToShuffleReuses) - VF = E->ReuseShuffleIndices.size(); + unsigned VF = E->getVectorFactor(); ShuffleInstructionBuilder ShuffleBuilder(Builder, VF); if (E->State == TreeEntry::NeedToGather) { - setInsertPointAfterBundle(E); + if (E->getMainOp()) + setInsertPointAfterBundle(E); Value *Vec; SmallVector<int> Mask; SmallVector<const TreeEntry *> Entries; @@ -5152,13 +5975,17 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size()); switch (ShuffleOrOp) { case Instruction::PHI: { + assert( + (E->ReorderIndices.empty() || E != VectorizableTree.front().get()) && + "PHI reordering is free."); auto *PH = cast<PHINode>(VL0); Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); Value *V = NewPhi; - if (NeedToShuffleReuses) - V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + ShuffleBuilder.addInversedMask(E->ReorderIndices); + ShuffleBuilder.addMask(E->ReuseShuffleIndices); + V = ShuffleBuilder.finalize(V); E->VectorizedValue = V; @@ -5209,53 +6036,48 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return NewV; } case Instruction::InsertElement: { - Builder.SetInsertPoint(VL0); + assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique"); + Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back())); Value *V = vectorizeTree(E->getOperand(1)); + // Create InsertVector shuffle if necessary + auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) { + return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0)); + })); const unsigned NumElts = - cast<FixedVectorType>(VL0->getType())->getNumElements(); + cast<FixedVectorType>(FirstInsert->getType())->getNumElements(); const unsigned NumScalars = E->Scalars.size(); + unsigned Offset = *getInsertIndex(VL0, 0); + assert(Offset < NumElts && "Failed to find vector index offset"); + + // Create shuffle to resize vector + SmallVector<int> Mask; + if (!E->ReorderIndices.empty()) { + inversePermutation(E->ReorderIndices, Mask); + Mask.append(NumElts - NumScalars, UndefMaskElem); + } else { + Mask.assign(NumElts, UndefMaskElem); + std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0); + } // Create InsertVector shuffle if necessary - Instruction *FirstInsert = nullptr; bool IsIdentity = true; - unsigned Offset = UINT_MAX; + SmallVector<int> PrevMask(NumElts, UndefMaskElem); + Mask.swap(PrevMask); for (unsigned I = 0; I < NumScalars; ++I) { - Value *Scalar = E->Scalars[I]; - if (!FirstInsert && - !is_contained(E->Scalars, cast<Instruction>(Scalar)->getOperand(0))) - FirstInsert = cast<Instruction>(Scalar); + Value *Scalar = E->Scalars[PrevMask[I]]; Optional<int> InsertIdx = getInsertIndex(Scalar, 0); if (!InsertIdx || *InsertIdx == UndefMaskElem) continue; - unsigned Idx = *InsertIdx; - if 
(Idx < Offset) { - Offset = Idx; - IsIdentity &= I == 0; - } else { - assert(Idx >= Offset && "Failed to find vector index offset"); - IsIdentity &= Idx - Offset == I; - } - } - assert(Offset < NumElts && "Failed to find vector index offset"); - - // Create shuffle to resize vector - SmallVector<int> Mask(NumElts, UndefMaskElem); - if (!IsIdentity) { - for (unsigned I = 0; I < NumScalars; ++I) { - Value *Scalar = E->Scalars[I]; - Optional<int> InsertIdx = getInsertIndex(Scalar, 0); - if (!InsertIdx || *InsertIdx == UndefMaskElem) - continue; - Mask[*InsertIdx - Offset] = I; - } - } else { - std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0); + IsIdentity &= *InsertIdx - Offset == I; + Mask[*InsertIdx - Offset] = I; } if (!IsIdentity || NumElts != NumScalars) V = Builder.CreateShuffleVector(V, Mask); - if (NumElts != NumScalars) { + if ((!IsIdentity || Offset != 0 || + !isa<UndefValue>(FirstInsert->getOperand(0))) && + NumElts != NumScalars) { SmallVector<int> InsertMask(NumElts); std::iota(InsertMask.begin(), InsertMask.end(), 0); for (unsigned I = 0; I < NumElts; I++) { @@ -5295,6 +6117,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { auto *CI = cast<CastInst>(VL0); Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); + ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -5317,6 +6140,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); Value *V = Builder.CreateCmp(P0, L, R); propagateIRFlags(V, E->Scalars, VL0); + ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -5337,6 +6161,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *V = Builder.CreateSelect(Cond, True, False); + ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -5360,6 +6185,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (auto *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); + ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -5403,6 +6229,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (auto *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); + ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -5414,9 +6241,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::Load: { // Loads are inserted at the head of the tree because we don't want to // sink them all the way down past store instructions. - bool IsReorder = E->updateStateIfReorder(); - if (IsReorder) - VL0 = E->getMainOp(); setInsertPointAfterBundle(E); LoadInst *LI = cast<LoadInst>(VL0); @@ -5457,9 +6281,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::Store: { - bool IsReorder = !E->ReorderIndices.empty(); - auto *SI = cast<StoreInst>( - IsReorder ? 
E->Scalars[E->ReorderIndices.front()] : VL0); + auto *SI = cast<StoreInst>(VL0); unsigned AS = SI->getPointerAddressSpace(); setInsertPointAfterBundle(E); @@ -5491,37 +6313,22 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::GetElementPtr: { + auto *GEP0 = cast<GetElementPtrInst>(VL0); setInsertPointAfterBundle(E); Value *Op0 = vectorizeTree(E->getOperand(0)); - std::vector<Value *> OpVecs; - for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e; - ++j) { - ValueList &VL = E->getOperand(j); - // Need to cast all elements to the same type before vectorization to - // avoid crash. - Type *VL0Ty = VL0->getOperand(j)->getType(); - Type *Ty = llvm::all_of( - VL, [VL0Ty](Value *V) { return VL0Ty == V->getType(); }) - ? VL0Ty - : DL->getIndexType(cast<GetElementPtrInst>(VL0) - ->getPointerOperandType() - ->getScalarType()); - for (Value *&V : VL) { - auto *CI = cast<ConstantInt>(V); - V = ConstantExpr::getIntegerCast(CI, Ty, - CI->getValue().isSignBitSet()); - } - Value *OpVec = vectorizeTree(VL); + SmallVector<Value *> OpVecs; + for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) { + Value *OpVec = vectorizeTree(E->getOperand(J)); OpVecs.push_back(OpVec); } - Value *V = Builder.CreateGEP( - cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs); + Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs); if (Instruction *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); + ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -5548,7 +6355,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { std::vector<Value *> OpVecs; SmallVector<Type *, 2> TysForDecl = {FixedVectorType::get(CI->getType(), E->Scalars.size())}; - for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) { + for (int j = 0, e = CI->arg_size(); j < e; ++j) { ValueList OpVL; // Some intrinsics have scalar arguments. This argument should not be // vectorized. @@ -5594,6 +6401,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } propagateIRFlags(V, E->Scalars, VL0); + ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -5641,19 +6449,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { // Also, gather up main and alt scalar ops to propagate IR flags to // each vector operation. 
ValueList OpScalars, AltScalars; - unsigned Sz = E->Scalars.size(); - SmallVector<int> Mask(Sz); - for (unsigned I = 0; I < Sz; ++I) { - auto *OpInst = cast<Instruction>(E->Scalars[I]); - assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); - if (OpInst->getOpcode() == E->getAltOpcode()) { - Mask[I] = Sz + I; - AltScalars.push_back(E->Scalars[I]); - } else { - Mask[I] = I; - OpScalars.push_back(E->Scalars[I]); - } - } + SmallVector<int> Mask; + buildSuffleEntryMask( + E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, + [E](Instruction *I) { + assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); + return I->getOpcode() == E->getAltOpcode(); + }, + Mask, &OpScalars, &AltScalars); propagateIRFlags(V0, OpScalars); propagateIRFlags(V1, AltScalars); @@ -5661,7 +6464,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *V = Builder.CreateShuffleVector(V0, V1, Mask); if (Instruction *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); - ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); E->VectorizedValue = V; @@ -5836,7 +6638,9 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); // It is legal to delete users in the ignorelist. - assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) && + assert((getTreeEntry(U) || is_contained(UserIgnoreList, U) || + (isa_and_nonnull<Instruction>(U) && + isDeleted(cast<Instruction>(U)))) && "Deleting out-of-tree value"); } } @@ -5911,27 +6715,28 @@ void BoUpSLP::optimizeGatherSequence() { "Worklist not sorted properly!"); BasicBlock *BB = (*I)->getBlock(); // For all instructions in blocks containing gather sequences: - for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) { - Instruction *In = &*it++; - if (isDeleted(In)) + for (Instruction &In : llvm::make_early_inc_range(*BB)) { + if (isDeleted(&In)) continue; - if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In)) + if (!isa<InsertElementInst>(&In) && !isa<ExtractElementInst>(&In) && + !isa<ShuffleVectorInst>(&In)) continue; // Check if we can replace this instruction with any of the // visited instructions. + bool Replaced = false; for (Instruction *v : Visited) { - if (In->isIdenticalTo(v) && - DT->dominates(v->getParent(), In->getParent())) { - In->replaceAllUsesWith(v); - eraseInstruction(In); - In = nullptr; + if (In.isIdenticalTo(v) && + DT->dominates(v->getParent(), In.getParent())) { + In.replaceAllUsesWith(v); + eraseInstruction(&In); + Replaced = true; break; } } - if (In) { - assert(!is_contained(Visited, In)); - Visited.push_back(In); + if (!Replaced) { + assert(!is_contained(Visited, &In)); + Visited.push_back(&In); } } } @@ -5944,7 +6749,9 @@ void BoUpSLP::optimizeGatherSequence() { Optional<BoUpSLP::ScheduleData *> BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, const InstructionsState &S) { - if (isa<PHINode>(S.OpValue) || isa<InsertElementInst>(S.OpValue)) + // No need to schedule PHIs, insertelement, extractelement and extractvalue + // instructions. + if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue)) return nullptr; // Initialize the instruction bundle. 
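// The rewritten gather-sequence cleanup above walks each block with
// llvm::make_early_inc_range so the instruction being inspected can be erased
// (replaced by an identical, dominating one) without invalidating the loop.
// The same idiom in portable C++ on a std::list, with even values standing in
// for the redundant instructions; only the iteration pattern is being shown.
#include <cstdio>
#include <list>

int main() {
  std::list<int> Insts = {1, 2, 3, 4, 5, 6};
  for (auto It = Insts.begin(); It != Insts.end();) {
    auto Cur = It++;    // early increment: step past Cur before using it
    if (*Cur % 2 == 0)
      Insts.erase(Cur); // safe: It already points at the next element
  }
  for (int V : Insts)
    std::printf("%d ", V); // prints: 1 3 5
  std::printf("\n");
}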
@@ -6040,7 +6847,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL, Value *OpValue) { - if (isa<PHINode>(OpValue) || isa<InsertElementInst>(OpValue)) + if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue)) return; ScheduleData *Bundle = getScheduleData(OpValue); @@ -6080,8 +6887,9 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, return true; Instruction *I = dyn_cast<Instruction>(V); assert(I && "bundle member must be an instruction"); - assert(!isa<PHINode>(I) && !isa<InsertElementInst>(I) && - "phi nodes/insertelements don't need to be scheduled"); + assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) && + "phi nodes/insertelements/extractelements/extractvalues don't need to " + "be scheduled"); auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool { ScheduleData *ISD = getScheduleData(I); if (!ISD) @@ -6351,7 +7159,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) { BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) { - assert((isa<InsertElementInst>(SD->Inst) || + assert((isVectorLikeInstWithConstOps(SD->Inst) || SD->isPartOfBundle() == (getTreeEntry(SD->Inst) != nullptr)) && "scheduler and vectorizer bundle mismatch"); SD->FirstInBundle->SchedulingPriority = Idx++; @@ -6694,9 +7502,7 @@ struct SLPVectorizer : public FunctionPass { initializeSLPVectorizerPass(*PassRegistry::getPassRegistry()); } - bool doInitialization(Module &M) override { - return false; - } + bool doInitialization(Module &M) override { return false; } bool runOnFunction(Function &F) override { if (skipFunction(F)) @@ -6831,44 +7637,6 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, return Changed; } -/// Order may have elements assigned special value (size) which is out of -/// bounds. Such indices only appear on places which correspond to undef values -/// (see canReuseExtract for details) and used in order to avoid undef values -/// have effect on operands ordering. -/// The first loop below simply finds all unused indices and then the next loop -/// nest assigns these indices for undef values positions. -/// As an example below Order has two undef positions and they have assigned -/// values 3 and 7 respectively: -/// before: 6 9 5 4 9 2 1 0 -/// after: 6 3 5 4 7 2 1 0 -/// \returns Fixed ordering. 
-static BoUpSLP::OrdersType fixupOrderingIndices(ArrayRef<unsigned> Order) { - BoUpSLP::OrdersType NewOrder(Order.begin(), Order.end()); - const unsigned Sz = NewOrder.size(); - SmallBitVector UsedIndices(Sz); - SmallVector<int> MaskedIndices; - for (int I = 0, E = NewOrder.size(); I < E; ++I) { - if (NewOrder[I] < Sz) - UsedIndices.set(NewOrder[I]); - else - MaskedIndices.push_back(I); - } - if (MaskedIndices.empty()) - return NewOrder; - SmallVector<int> AvailableIndices(MaskedIndices.size()); - unsigned Cnt = 0; - int Idx = UsedIndices.find_first(); - do { - AvailableIndices[Cnt] = Idx; - Idx = UsedIndices.find_next(Idx); - ++Cnt; - } while (Idx > 0); - assert(Cnt == MaskedIndices.size() && "Non-synced masked/available indices."); - for (int I = 0, E = MaskedIndices.size(); I < E; ++I) - NewOrder[MaskedIndices[I]] = AvailableIndices[I]; - return NewOrder; -} - bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, unsigned Idx) { LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size() @@ -6884,19 +7652,13 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, << "\n"); R.buildTree(Chain); - Optional<ArrayRef<unsigned>> Order = R.bestOrder(); - // TODO: Handle orders of size less than number of elements in the vector. - if (Order && Order->size() == Chain.size()) { - // TODO: reorder tree nodes without tree rebuilding. - SmallVector<Value *, 4> ReorderedOps(Chain.size()); - transform(fixupOrderingIndices(*Order), ReorderedOps.begin(), - [Chain](const unsigned Idx) { return Chain[Idx]; }); - R.buildTree(ReorderedOps); - } if (R.isTreeTinyAndNotFullyVectorizable()) return false; if (R.isLoadCombineCandidate()) return false; + R.reorderTopToBottom(); + R.reorderBottomToTop(); + R.buildExternalUses(); R.computeMinimumValueSizes(); @@ -7019,7 +7781,7 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, unsigned EltSize = R.getVectorElementSize(Operands[0]); unsigned MaxElts = llvm::PowerOf2Floor(MaxVecRegSize / EltSize); - unsigned MinVF = std::max(2U, R.getMinVecRegSize() / EltSize); + unsigned MinVF = R.getMinVF(EltSize); unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts); @@ -7092,11 +7854,11 @@ bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { if (!A || !B) return false; Value *VL[] = {A, B}; - return tryToVectorizeList(VL, R, /*AllowReorder=*/true); + return tryToVectorizeList(VL, R); } bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, - bool AllowReorder) { + bool LimitForRegisterSize) { if (VL.size() < 2) return false; @@ -7130,7 +7892,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, } unsigned Sz = R.getVectorElementSize(I0); - unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz); + unsigned MinVF = R.getMinVF(Sz); unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF); MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF); if (MaxVF < 2) { @@ -7168,7 +7930,8 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, if (!isPowerOf2_32(OpsWidth)) continue; - if ((VF > MinVF && OpsWidth <= VF / 2) || (VF == MinVF && OpsWidth < 2)) + if ((LimitForRegisterSize && OpsWidth < MaxVF) || + (VF > MinVF && OpsWidth <= VF / 2) || (VF == MinVF && OpsWidth < 2)) break; ArrayRef<Value *> Ops = VL.slice(I, OpsWidth); @@ -7183,18 +7946,11 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, << "\n"); R.buildTree(Ops); - 
if (AllowReorder) { - Optional<ArrayRef<unsigned>> Order = R.bestOrder(); - if (Order) { - // TODO: reorder tree nodes without tree rebuilding. - SmallVector<Value *, 4> ReorderedOps(Ops.size()); - transform(fixupOrderingIndices(*Order), ReorderedOps.begin(), - [Ops](const unsigned Idx) { return Ops[Idx]; }); - R.buildTree(ReorderedOps); - } - } if (R.isTreeTinyAndNotFullyVectorizable()) continue; + R.reorderTopToBottom(); + R.reorderBottomToTop(); + R.buildExternalUses(); R.computeMinimumValueSizes(); InstructionCost Cost = R.getTreeCost(); @@ -7387,10 +8143,20 @@ class HorizontalReduction { Value *RHS, const Twine &Name, bool UseSelect) { unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind); switch (Kind) { - case RecurKind::Add: - case RecurKind::Mul: case RecurKind::Or: + if (UseSelect && + LHS->getType() == CmpInst::makeCmpResultType(LHS->getType())) + return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name); + return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS, + Name); case RecurKind::And: + if (UseSelect && + LHS->getType() == CmpInst::makeCmpResultType(LHS->getType())) + return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name); + return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS, + Name); + case RecurKind::Add: + case RecurKind::Mul: case RecurKind::Xor: case RecurKind::FAdd: case RecurKind::FMul: @@ -7434,8 +8200,12 @@ class HorizontalReduction { static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS, Value *RHS, const Twine &Name, const ReductionOpsListType &ReductionOps) { - bool UseSelect = ReductionOps.size() == 2; - assert((!UseSelect || isa<SelectInst>(ReductionOps[1][0])) && + bool UseSelect = ReductionOps.size() == 2 || + // Logical or/and. + (ReductionOps.size() == 1 && + isa<SelectInst>(ReductionOps.front().front())); + assert((!UseSelect || ReductionOps.size() != 2 || + isa<SelectInst>(ReductionOps[1][0])) && "Expected cmp + select pairs for reduction"); Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect); if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) { @@ -7573,10 +8343,10 @@ class HorizontalReduction { /// Checks if the instruction is in basic block \p BB. /// For a cmp+sel min/max reduction check that both ops are in \p BB. static bool hasSameParent(Instruction *I, BasicBlock *BB) { - if (isCmpSelMinMax(I)) { + if (isCmpSelMinMax(I) || (isBoolLogicOp(I) && isa<SelectInst>(I))) { auto *Sel = cast<SelectInst>(I); - auto *Cmp = cast<Instruction>(Sel->getCondition()); - return Sel->getParent() == BB && Cmp->getParent() == BB; + auto *Cmp = dyn_cast<Instruction>(Sel->getCondition()); + return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB; } return I->getParent() == BB; } @@ -7758,13 +8528,13 @@ public: } /// Attempt to vectorize the tree found by matchAssociativeReduction. - bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) { + Value *tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) { // If there are a sufficient number of reduction values, reduce // to a nearby power-of-2. We can safely generate oversized // vectors and rely on the backend to split them to legal sizes. unsigned NumReducedVals = ReducedVals.size(); if (NumReducedVals < 4) - return false; + return nullptr; // Intersect the fast-math-flags from all reduction operations. 
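// The reduction-op builder above now emits i1 and/or as selects:
// "a | b" becomes select(a, true, b) and "a & b" becomes select(a, b, false).
// Unlike the plain binary op, a select does not propagate poison from the
// operand it does not choose, which is why this form is used for the
// poison-safe boolean-logic reductions. A truth-table check of the mapping;
// selectOr/selectAnd are illustrative helpers, not patch code.
#include <cassert>
#include <cstdio>

static bool selectOr(bool L, bool R) { return L ? true : R; }   // or
static bool selectAnd(bool L, bool R) { return L ? R : false; } // and

int main() {
  for (int L = 0; L < 2; ++L)
    for (int R = 0; R < 2; ++R) {
      assert(selectOr(L != 0, R != 0) == ((L != 0) || (R != 0)));
      assert(selectAnd(L != 0, R != 0) == ((L != 0) && (R != 0)));
    }
  std::printf("select forms match | and &\n");
}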
FastMathFlags RdxFMF; @@ -7838,22 +8608,14 @@ public: unsigned i = 0; while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) { ArrayRef<Value *> VL(&ReducedVals[i], ReduxWidth); - V.buildTree(VL, ExternallyUsedValues, IgnoreList); - Optional<ArrayRef<unsigned>> Order = V.bestOrder(); - if (Order) { - assert(Order->size() == VL.size() && - "Order size must be the same as number of vectorized " - "instructions."); - // TODO: reorder tree nodes without tree rebuilding. - SmallVector<Value *, 4> ReorderedOps(VL.size()); - transform(fixupOrderingIndices(*Order), ReorderedOps.begin(), - [VL](const unsigned Idx) { return VL[Idx]; }); - V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList); - } - if (V.isTreeTinyAndNotFullyVectorizable()) + V.buildTree(VL, IgnoreList); + if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) break; if (V.isLoadCombineReductionCandidate(RdxKind)) break; + V.reorderTopToBottom(); + V.reorderBottomToTop(/*IgnoreReorder=*/true); + V.buildExternalUses(ExternallyUsedValues); // For a poison-safe boolean logic reduction, do not replace select // instructions with logic ops. All reduced values will be frozen (see @@ -7873,7 +8635,7 @@ public: InstructionCost Cost = TreeCost + ReductionCost; if (!Cost.isValid()) { LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n"); - return false; + return nullptr; } if (Cost >= -SLPCostThreshold) { V.getORE()->emit([&]() { @@ -7953,7 +8715,7 @@ public: // vector reductions. V.eraseInstructions(IgnoreList); } - return VectorizedTree != nullptr; + return VectorizedTree; } unsigned numReductionValues() const { return ReducedVals.size(); } @@ -7963,6 +8725,7 @@ private: InstructionCost getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal, unsigned ReduxWidth, FastMathFlags FMF) { + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; Type *ScalarTy = FirstReducedVal->getType(); FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth); InstructionCost VectorCost, ScalarCost; @@ -7975,33 +8738,39 @@ private: case RecurKind::FAdd: case RecurKind::FMul: { unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind); - VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF); - ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy); + VectorCost = + TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind); + ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind); break; } case RecurKind::FMax: case RecurKind::FMin: { + auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy); auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy)); VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, - /*unsigned=*/false); - ScalarCost = - TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy) + - TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, - CmpInst::makeCmpResultType(ScalarTy)); + /*unsigned=*/false, CostKind); + CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind); + ScalarCost = TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy, + SclCondTy, RdxPred, CostKind) + + TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, + SclCondTy, RdxPred, CostKind); break; } case RecurKind::SMax: case RecurKind::SMin: case RecurKind::UMax: case RecurKind::UMin: { + auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy); auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy)); bool IsUnsigned = RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin; - VectorCost = 
TTI->getMinMaxReductionCost(VectorTy, VecCondTy, IsUnsigned); - ScalarCost = - TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy) + - TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, - CmpInst::makeCmpResultType(ScalarTy)); + VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, IsUnsigned, + CostKind); + CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind); + ScalarCost = TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy, + SclCondTy, RdxPred, CostKind) + + TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, + SclCondTy, RdxPred, CostKind); break; } default: @@ -8023,6 +8792,7 @@ private: assert(isPowerOf2_32(ReduxWidth) && "We only handle power-of-two reductions for now"); + ++NumVectorInstructions; return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind, ReductionOps.back()); } @@ -8232,32 +9002,45 @@ static bool tryToVectorizeHorReductionOrInstOperands( // Skip the analysis of CmpInsts.Compiler implements postanalysis of the // CmpInsts so we can skip extra attempts in // tryToVectorizeHorReductionOrInstOperands and save compile time. - SmallVector<std::pair<Instruction *, unsigned>, 8> Stack(1, {Root, 0}); + std::queue<std::pair<Instruction *, unsigned>> Stack; + Stack.emplace(Root, 0); SmallPtrSet<Value *, 8> VisitedInstrs; + SmallVector<WeakTrackingVH> PostponedInsts; bool Res = false; + auto &&TryToReduce = [TTI, &P, &R](Instruction *Inst, Value *&B0, + Value *&B1) -> Value * { + bool IsBinop = matchRdxBop(Inst, B0, B1); + bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value())); + if (IsBinop || IsSelect) { + HorizontalReduction HorRdx; + if (HorRdx.matchAssociativeReduction(P, Inst)) + return HorRdx.tryToReduce(R, TTI); + } + return nullptr; + }; while (!Stack.empty()) { Instruction *Inst; unsigned Level; - std::tie(Inst, Level) = Stack.pop_back_val(); + std::tie(Inst, Level) = Stack.front(); + Stack.pop(); // Do not try to analyze instruction that has already been vectorized. // This may happen when we vectorize instruction operands on a previous // iteration while stack was populated before that happened. if (R.isDeleted(Inst)) continue; - Value *B0, *B1; - bool IsBinop = matchRdxBop(Inst, B0, B1); - bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value())); - if (IsBinop || IsSelect) { - HorizontalReduction HorRdx; - if (HorRdx.matchAssociativeReduction(P, Inst)) { - if (HorRdx.tryToReduce(R, TTI)) { - Res = true; - // Set P to nullptr to avoid re-analysis of phi node in - // matchAssociativeReduction function unless this is the root node. - P = nullptr; - continue; - } + Value *B0 = nullptr, *B1 = nullptr; + if (Value *V = TryToReduce(Inst, B0, B1)) { + Res = true; + // Set P to nullptr to avoid re-analysis of phi node in + // matchAssociativeReduction function unless this is the root node. + P = nullptr; + if (auto *I = dyn_cast<Instruction>(V)) { + // Try to find another reduction. + Stack.emplace(I, Level); + continue; } + } else { + bool IsBinop = B0 && B1; if (P && IsBinop) { Inst = dyn_cast<Instruction>(B0); if (Inst == P) @@ -8269,14 +9052,14 @@ static bool tryToVectorizeHorReductionOrInstOperands( continue; } } - } - // Set P to nullptr to avoid re-analysis of phi node in - // matchAssociativeReduction function unless this is the root node. - P = nullptr; - // Do not try to vectorize CmpInst operands, this is done separately. 
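// Rough shape of the rewritten traversal above: the worklist is now a FIFO
// queue seeded with the root, a reduction is attempted first for each
// instruction, and binops that did not form a reduction are only recorded and
// re-tried as plain vectorization after the whole walk, so an early pairwise
// match no longer shadows a later reduction. The Node graph, FormsReduction
// flag and walk() are illustrative stand-ins; the real code additionally
// filters PHIs/compares and re-queues the instruction a reduction produced.
#include <cstdio>
#include <queue>
#include <string>
#include <utility>
#include <vector>

struct Node {
  std::string Name;
  bool FormsReduction;
  std::vector<int> Operands; // indices into the graph
};

static void walk(const std::vector<Node> &Graph, int Root, unsigned MaxDepth) {
  std::queue<std::pair<int, unsigned>> Worklist;
  Worklist.emplace(Root, 0u);
  std::vector<int> Postponed;
  while (!Worklist.empty()) {
    auto [Idx, Level] = Worklist.front();
    Worklist.pop();
    const Node &N = Graph[Idx];
    if (N.FormsReduction)
      std::printf("reduced %s\n", N.Name.c_str());
    else
      Postponed.push_back(Idx); // try plain vectorization after the walk
    if (Level < MaxDepth)
      for (int Op : N.Operands)
        Worklist.emplace(Op, Level + 1);
  }
  for (int Idx : Postponed)
    std::printf("late vectorization attempt: %s\n", Graph[Idx].Name.c_str());
}

int main() {
  std::vector<Node> G = {{"root add", false, {1, 2}},
                         {"fadd chain", true, {}},
                         {"mul", false, {}}};
  walk(G, 0, /*MaxDepth=*/2);
}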
- if (!isa<CmpInst>(Inst) && Vectorize(Inst, R)) { - Res = true; - continue; + // Set P to nullptr to avoid re-analysis of phi node in + // matchAssociativeReduction function unless this is the root node. + P = nullptr; + // Do not try to vectorize CmpInst operands, this is done separately. + // Final attempt for binop args vectorization should happen after the loop + // to try to find reductions. + if (!isa<CmpInst>(Inst)) + PostponedInsts.push_back(Inst); } // Try to vectorize operands. @@ -8290,8 +9073,13 @@ static bool tryToVectorizeHorReductionOrInstOperands( // separately. if (!isa<PHINode>(I) && !isa<CmpInst>(I) && !R.isDeleted(I) && I->getParent() == BB) - Stack.emplace_back(I, Level); + Stack.emplace(I, Level); } + // Try to vectorized binops where reductions were not found. + for (Value *V : PostponedInsts) + if (auto *Inst = dyn_cast<Instruction>(V)) + if (!R.isDeleted(Inst)) + Res |= Vectorize(Inst, R); return Res; } @@ -8326,7 +9114,7 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n"); // Aggregate value is unlikely to be processed in vector register, we need to // extract scalars into scalar registers, so NeedExtraction is set true. - return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false); + return tryToVectorizeList(BuildVectorOpds, R); } bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, @@ -8337,11 +9125,11 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) || (llvm::all_of(BuildVectorOpds, [](Value *V) { return isa<ExtractElementInst>(V); }) && - isShuffle(BuildVectorOpds, Mask))) + isFixedVectorShuffle(BuildVectorOpds, Mask))) return false; LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n"); - return tryToVectorizeList(BuildVectorInsts, R, /*AllowReorder=*/true); + return tryToVectorizeList(BuildVectorInsts, R); } bool SLPVectorizerPass::vectorizeSimpleInstructions( @@ -8382,6 +9170,78 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions( return OpsChanged; } +template <typename T> +static bool +tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, + function_ref<unsigned(T *)> Limit, + function_ref<bool(T *, T *)> Comparator, + function_ref<bool(T *, T *)> AreCompatible, + function_ref<bool(ArrayRef<T *>, bool)> TryToVectorize, + bool LimitForRegisterSize) { + bool Changed = false; + // Sort by type, parent, operands. + stable_sort(Incoming, Comparator); + + // Try to vectorize elements base on their type. + SmallVector<T *> Candidates; + for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;) { + // Look for the next elements with the same type, parent and operand + // kinds. + auto *SameTypeIt = IncIt; + while (SameTypeIt != E && AreCompatible(*SameTypeIt, *IncIt)) + ++SameTypeIt; + + // Try to vectorize them. + unsigned NumElts = (SameTypeIt - IncIt); + LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes (" + << NumElts << ")\n"); + // The vectorization is a 3-state attempt: + // 1. Try to vectorize instructions with the same/alternate opcodes with the + // size of maximal register at first. + // 2. Try to vectorize remaining instructions with the same type, if + // possible. This may result in the better vectorization results rather than + // if we try just to vectorize instructions with the same/alternate opcodes. + // 3. 
+    // same/alternate ops only, this may result in some extra final
+    // vectorization.
+    if (NumElts > 1 &&
+        TryToVectorize(makeArrayRef(IncIt, NumElts), LimitForRegisterSize)) {
+      // Success start over because instructions might have been changed.
+      Changed = true;
+    } else if (NumElts < Limit(*IncIt) &&
+               (Candidates.empty() ||
+                Candidates.front()->getType() == (*IncIt)->getType())) {
+      Candidates.append(IncIt, std::next(IncIt, NumElts));
+    }
+    // Final attempt to vectorize instructions with the same types.
+    if (Candidates.size() > 1 &&
+        (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
+      if (TryToVectorize(Candidates, /*LimitForRegisterSize=*/false)) {
+        // Success start over because instructions might have been changed.
+        Changed = true;
+      } else if (LimitForRegisterSize) {
+        // Try to vectorize using small vectors.
+        for (auto *It = Candidates.begin(), *End = Candidates.end();
+             It != End;) {
+          auto *SameTypeIt = It;
+          while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It))
+            ++SameTypeIt;
+          unsigned NumElts = (SameTypeIt - It);
+          if (NumElts > 1 && TryToVectorize(makeArrayRef(It, NumElts),
+                                            /*LimitForRegisterSize=*/false))
+            Changed = true;
+          It = SameTypeIt;
+        }
+      }
+      Candidates.clear();
+    }
+
+    // Start over at the next instruction of a different type (or the end).
+    IncIt = SameTypeIt;
+  }
+  return Changed;
+}
+
 bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
   bool Changed = false;
   SmallVector<Value *, 4> Incoming;
@@ -8390,11 +9250,89 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
   // node. Allows better to identify the chains that can be vectorized in the
   // better way.
   DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
+  auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
+    assert(isValidElementType(V1->getType()) &&
+           isValidElementType(V2->getType()) &&
+           "Expected vectorizable types only.");
+    // It is fine to compare type IDs here, since we expect only vectorizable
+    // types, like ints, floats and pointers, we don't care about other type.
+    if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
+      return true;
+    if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
+      return false;
+    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
+    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
+    if (Opcodes1.size() < Opcodes2.size())
+      return true;
+    if (Opcodes1.size() > Opcodes2.size())
+      return false;
+    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
+      // Undefs are compatible with any other value.
+      if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
+        continue;
+      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
+        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
+          DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
+          DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
+          if (!NodeI1)
+            return NodeI2 != nullptr;
+          if (!NodeI2)
+            return false;
+          assert((NodeI1 == NodeI2) ==
+                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
+                 "Different nodes should have different DFS numbers");
+          if (NodeI1 != NodeI2)
+            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
+          InstructionsState S = getSameOpcode({I1, I2});
+          if (S.getOpcode())
+            continue;
+          return I1->getOpcode() < I2->getOpcode();
+        }
+      if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
+        continue;
+      if (Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID())
+        return true;
+      if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID())
+        return false;
+    }
+    return false;
+  };
+  auto AreCompatiblePHIs = [&PHIToOpcodes](Value *V1, Value *V2) {
+    if (V1 == V2)
+      return true;
+    if (V1->getType() != V2->getType())
+      return false;
+    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
+    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
+    if (Opcodes1.size() != Opcodes2.size())
+      return false;
+    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
+      // Undefs are compatible with any other value.
+      if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
+        continue;
+      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
+        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
+          if (I1->getParent() != I2->getParent())
+            return false;
+          InstructionsState S = getSameOpcode({I1, I2});
+          if (S.getOpcode())
+            continue;
+          return false;
+        }
+      if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
+        continue;
+      if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
+        return false;
+    }
+    return true;
+  };
+  auto Limit = [&R](Value *V) {
+    unsigned EltSize = R.getVectorElementSize(V);
+    return std::max(2U, R.getMaxVecRegSize() / EltSize);
+  };
 
-  bool HaveVectorizedPhiNodes = true;
-  while (HaveVectorizedPhiNodes) {
-    HaveVectorizedPhiNodes = false;
-
+  bool HaveVectorizedPhiNodes = false;
+  do {
     // Collect the incoming values from the PHIs.
     Incoming.clear();
     for (Instruction &I : *BB) {
@@ -8432,132 +9370,15 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
       }
     }
 
-    // Sort by type, parent, operands.
-    stable_sort(Incoming, [this, &PHIToOpcodes](Value *V1, Value *V2) {
-      assert(isValidElementType(V1->getType()) &&
-             isValidElementType(V2->getType()) &&
-             "Expected vectorizable types only.");
-      // It is fine to compare type IDs here, since we expect only vectorizable
-      // types, like ints, floats and pointers, we don't care about other type.
-      if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
-        return true;
-      if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
-        return false;
-      ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
-      ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
-      if (Opcodes1.size() < Opcodes2.size())
-        return true;
-      if (Opcodes1.size() > Opcodes2.size())
-        return false;
-      for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
-        // Undefs are compatible with any other value.
-        if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
-          continue;
-        if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
-          if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
-            DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
-            DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
-            if (!NodeI1)
-              return NodeI2 != nullptr;
-            if (!NodeI2)
-              return false;
-            assert((NodeI1 == NodeI2) ==
-                       (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
-                   "Different nodes should have different DFS numbers");
-            if (NodeI1 != NodeI2)
-              return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
-            InstructionsState S = getSameOpcode({I1, I2});
-            if (S.getOpcode())
-              continue;
-            return I1->getOpcode() < I2->getOpcode();
-          }
-        if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
-          continue;
-        if (Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID())
-          return true;
-        if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID())
-          return false;
-      }
-      return false;
-    });
-
-    auto &&AreCompatiblePHIs = [&PHIToOpcodes](Value *V1, Value *V2) {
-      if (V1 == V2)
-        return true;
-      if (V1->getType() != V2->getType())
-        return false;
-      ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
-      ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
-      if (Opcodes1.size() != Opcodes2.size())
-        return false;
-      for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
-        // Undefs are compatible with any other value.
-        if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
-          continue;
-        if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
-          if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
-            if (I1->getParent() != I2->getParent())
-              return false;
-            InstructionsState S = getSameOpcode({I1, I2});
-            if (S.getOpcode())
-              continue;
-            return false;
-          }
-        if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
-          continue;
-        if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
-          return false;
-      }
-      return true;
-    };
-
-    // Try to vectorize elements base on their type.
-    SmallVector<Value *, 4> Candidates;
-    for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
-                                           E = Incoming.end();
-         IncIt != E;) {
-
-      // Look for the next elements with the same type, parent and operand
-      // kinds.
-      SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
-      while (SameTypeIt != E && AreCompatiblePHIs(*SameTypeIt, *IncIt)) {
-        VisitedInstrs.insert(*SameTypeIt);
-        ++SameTypeIt;
-      }
-
-      // Try to vectorize them.
-      unsigned NumElts = (SameTypeIt - IncIt);
-      LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at PHIs ("
-                        << NumElts << ")\n");
-      // The order in which the phi nodes appear in the program does not matter.
-      // So allow tryToVectorizeList to reorder them if it is beneficial. This
-      // is done when there are exactly two elements since tryToVectorizeList
-      // asserts that there are only two values when AllowReorder is true.
-      if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R,
-                                            /*AllowReorder=*/true)) {
-        // Success start over because instructions might have been changed.
-        HaveVectorizedPhiNodes = true;
-        Changed = true;
-      } else if (NumElts < 4 &&
-                 (Candidates.empty() ||
-                  Candidates.front()->getType() == (*IncIt)->getType())) {
-        Candidates.append(IncIt, std::next(IncIt, NumElts));
-      }
-      // Final attempt to vectorize phis with the same types.
-      if (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType()) {
-        if (Candidates.size() > 1 &&
-            tryToVectorizeList(Candidates, R, /*AllowReorder=*/true)) {
-          // Success start over because instructions might have been changed.
-          HaveVectorizedPhiNodes = true;
-          Changed = true;
-        }
-        Candidates.clear();
-      }
-
-      // Start over at the next instruction of a different type (or the end).
-      IncIt = SameTypeIt;
-    }
-  }
+    HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
+        Incoming, Limit, PHICompare, AreCompatiblePHIs,
+        [this, &R](ArrayRef<Value *> Candidates, bool LimitForRegisterSize) {
+          return tryToVectorizeList(Candidates, R, LimitForRegisterSize);
+        },
+        /*LimitForRegisterSize=*/true);
+    Changed |= HaveVectorizedPhiNodes;
+    VisitedInstrs.insert(Incoming.begin(), Incoming.end());
+  } while (HaveVectorizedPhiNodes);
 
   VisitedInstrs.clear();
 
@@ -8810,6 +9631,10 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
     return V1->getValueOperand()->getValueID() ==
            V2->getValueOperand()->getValueID();
   };
+  auto Limit = [&R, this](StoreInst *SI) {
+    unsigned EltSize = DL->getTypeSizeInBits(SI->getValueOperand()->getType());
+    return R.getMinVF(EltSize);
+  };
 
   // Attempt to sort and vectorize each of the store-groups.
   for (auto &Pair : Stores) {
@@ -8819,33 +9644,15 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
     LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                       << Pair.second.size() << ".\n");
 
-    stable_sort(Pair.second, StoreSorter);
-
-    // Try to vectorize elements based on their compatibility.
-    for (ArrayRef<StoreInst *>::iterator IncIt = Pair.second.begin(),
-                                         E = Pair.second.end();
-         IncIt != E;) {
-
-      // Look for the next elements with the same type.
-      ArrayRef<StoreInst *>::iterator SameTypeIt = IncIt;
-      Type *EltTy = (*IncIt)->getPointerOperand()->getType();
-
-      while (SameTypeIt != E && AreCompatibleStores(*SameTypeIt, *IncIt))
-        ++SameTypeIt;
-
-      // Try to vectorize them.
-      unsigned NumElts = (SameTypeIt - IncIt);
-      LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at stores ("
-                        << NumElts << ")\n");
-      if (NumElts > 1 && !EltTy->getPointerElementType()->isVectorTy() &&
-          vectorizeStores(makeArrayRef(IncIt, NumElts), R)) {
-        // Success start over because instructions might have been changed.
-        Changed = true;
-      }
+    if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
+      continue;
 
-      // Start over at the next instruction of a different type (or the end).
-      IncIt = SameTypeIt;
-    }
+    Changed |= tryToVectorizeSequence<StoreInst>(
+        Pair.second, Limit, StoreSorter, AreCompatibleStores,
+        [this, &R](ArrayRef<StoreInst *> Candidates, bool) {
+          return vectorizeStores(Candidates, R);
+        },
+        /*LimitForRegisterSize=*/false);
   }
   return Changed;
 }