Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 4539
1 file changed, 3019 insertions, 1520 deletions
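Besides its functional changes, the diff below repeatedly applies the LLVM-wide migration from llvm::Optional/llvm::None to std::optional/std::nullopt (note the removed include of "llvm/ADT/Optional.h" and the added <optional>). The following is a minimal standalone sketch of that pattern only, not code from the patch; the parseIndex() helper is a hypothetical name chosen for illustration.

#include <optional>
#include <string>

// After the migration: std::optional replaces llvm::Optional and
// std::nullopt replaces llvm::None as the "no value" sentinel.
static std::optional<unsigned> parseIndex(const std::string &S) {
  if (S.empty())
    return std::nullopt; // previously: return None;
  return static_cast<unsigned>(S.front() - '0');
}

int main() {
  std::optional<unsigned> Idx = parseIndex("7");
  // Dereference with *Idx (previously Idx.value()) once presence is checked.
  return Idx ? static_cast<int>(*Idx) : -1;
}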
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 53c11c58f73d..e3eb6b1804e7 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -19,7 +19,6 @@ #include "llvm/Transforms/Vectorize/SLPVectorizer.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/Optional.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/PriorityQueue.h" #include "llvm/ADT/STLExtras.h" @@ -94,6 +93,7 @@ #include <cstdint> #include <iterator> #include <memory> +#include <optional> #include <set> #include <string> #include <tuple> @@ -205,7 +205,7 @@ static bool isValidElementType(Type *Ty) { /// \returns True if the value is a constant (but not globals/constant /// expressions). static bool isConstant(Value *V) { - return isa<Constant>(V) && !isa<ConstantExpr>(V) && !isa<GlobalValue>(V); + return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V); } /// Checks if \p V is one of vector-like instructions, i.e. undef, @@ -284,24 +284,124 @@ static bool isCommutative(Instruction *I) { return false; } +/// \returns inserting index of InsertElement or InsertValue instruction, +/// using Offset as base offset for index. +static std::optional<unsigned> getInsertIndex(const Value *InsertInst, + unsigned Offset = 0) { + int Index = Offset; + if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) { + const auto *VT = dyn_cast<FixedVectorType>(IE->getType()); + if (!VT) + return std::nullopt; + const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2)); + if (!CI) + return std::nullopt; + if (CI->getValue().uge(VT->getNumElements())) + return std::nullopt; + Index *= VT->getNumElements(); + Index += CI->getZExtValue(); + return Index; + } + + const auto *IV = cast<InsertValueInst>(InsertInst); + Type *CurrentType = IV->getType(); + for (unsigned I : IV->indices()) { + if (const auto *ST = dyn_cast<StructType>(CurrentType)) { + Index *= ST->getNumElements(); + CurrentType = ST->getElementType(I); + } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) { + Index *= AT->getNumElements(); + CurrentType = AT->getElementType(); + } else { + return std::nullopt; + } + Index += I; + } + return Index; +} + +namespace { +/// Specifies the way the mask should be analyzed for undefs/poisonous elements +/// in the shuffle mask. +enum class UseMask { + FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors, + ///< check for the mask elements for the first argument (mask + ///< indices are in range [0:VF)). + SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check + ///< for the mask elements for the second argument (mask indices + ///< are in range [VF:2*VF)) + UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for + ///< future shuffle elements and mark them as ones as being used + ///< in future. Non-undef elements are considered as unused since + ///< they're already marked as used in the mask. +}; +} // namespace + +/// Prepares a use bitset for the given mask either for the first argument or +/// for the second. 
+static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask, + UseMask MaskArg) { + SmallBitVector UseMask(VF, true); + for (auto P : enumerate(Mask)) { + if (P.value() == UndefMaskElem) { + if (MaskArg == UseMask::UndefsAsMask) + UseMask.reset(P.index()); + continue; + } + if (MaskArg == UseMask::FirstArg && P.value() < VF) + UseMask.reset(P.value()); + else if (MaskArg == UseMask::SecondArg && P.value() >= VF) + UseMask.reset(P.value() - VF); + } + return UseMask; +} + /// Checks if the given value is actually an undefined constant vector. -static bool isUndefVector(const Value *V) { - if (isa<UndefValue>(V)) - return true; - auto *C = dyn_cast<Constant>(V); - if (!C) - return false; - if (!C->containsUndefOrPoisonElement()) - return false; - auto *VecTy = dyn_cast<FixedVectorType>(C->getType()); +/// Also, if the \p UseMask is not empty, tries to check if the non-masked +/// elements actually mask the insertelement buildvector, if any. +template <bool IsPoisonOnly = false> +static SmallBitVector isUndefVector(const Value *V, + const SmallBitVector &UseMask = {}) { + SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true); + using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>; + if (isa<T>(V)) + return Res; + auto *VecTy = dyn_cast<FixedVectorType>(V->getType()); if (!VecTy) - return false; + return Res.reset(); + auto *C = dyn_cast<Constant>(V); + if (!C) { + if (!UseMask.empty()) { + const Value *Base = V; + while (auto *II = dyn_cast<InsertElementInst>(Base)) { + if (isa<T>(II->getOperand(1))) + continue; + Base = II->getOperand(0); + std::optional<unsigned> Idx = getInsertIndex(II); + if (!Idx) + continue; + if (*Idx < UseMask.size() && !UseMask.test(*Idx)) + Res.reset(*Idx); + } + // TODO: Add analysis for shuffles here too. + if (V == Base) { + Res.reset(); + } else { + SmallBitVector SubMask(UseMask.size(), false); + Res &= isUndefVector<IsPoisonOnly>(Base, SubMask); + } + } else { + Res.reset(); + } + return Res; + } for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) { if (Constant *Elem = C->getAggregateElement(I)) - if (!isa<UndefValue>(Elem)) - return false; + if (!isa<T>(Elem) && + (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I)))) + Res.reset(I); } - return true; + return Res; } /// Checks if the vector of instructions can be represented as a shuffle, like: @@ -345,16 +445,16 @@ static bool isUndefVector(const Value *V) { /// InstCombiner transforms this into a shuffle and vector mul /// Mask will return the Shuffle Mask equivalent to the extracted elements. /// TODO: Can we split off and reuse the shuffle mask detection from -/// TargetTransformInfo::getInstructionThroughput? -static Optional<TargetTransformInfo::ShuffleKind> +/// ShuffleVectorInst/getShuffleCost? 
+static std::optional<TargetTransformInfo::ShuffleKind> isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) { const auto *It = find_if(VL, [](Value *V) { return isa<ExtractElementInst>(V); }); if (It == VL.end()) - return None; + return std::nullopt; auto *EI0 = cast<ExtractElementInst>(*It); if (isa<ScalableVectorType>(EI0->getVectorOperandType())) - return None; + return std::nullopt; unsigned Size = cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements(); Value *Vec1 = nullptr; @@ -368,19 +468,19 @@ isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) { continue; auto *EI = cast<ExtractElementInst>(VL[I]); if (isa<ScalableVectorType>(EI->getVectorOperandType())) - return None; + return std::nullopt; auto *Vec = EI->getVectorOperand(); // We can extractelement from undef or poison vector. - if (isUndefVector(Vec)) + if (isUndefVector(Vec).all()) continue; // All vector operands must have the same number of vector elements. if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size) - return None; + return std::nullopt; if (isa<UndefValue>(EI->getIndexOperand())) continue; auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand()); if (!Idx) - return None; + return std::nullopt; // Undefined behavior if Idx is negative or >= Size. if (Idx->getValue().uge(Size)) continue; @@ -394,7 +494,7 @@ isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) { Vec2 = Vec; Mask[I] += Size; } else { - return None; + return std::nullopt; } if (CommonShuffleMode == Permute) continue; @@ -415,6 +515,24 @@ isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) { : TargetTransformInfo::SK_PermuteSingleSrc; } +/// \returns True if Extract{Value,Element} instruction extracts element Idx. +static std::optional<unsigned> getExtractIndex(Instruction *E) { + unsigned Opcode = E->getOpcode(); + assert((Opcode == Instruction::ExtractElement || + Opcode == Instruction::ExtractValue) && + "Expected extractelement or extractvalue instruction."); + if (Opcode == Instruction::ExtractElement) { + auto *CI = dyn_cast<ConstantInt>(E->getOperand(1)); + if (!CI) + return std::nullopt; + return CI->getZExtValue(); + } + auto *EI = cast<ExtractValueInst>(E); + if (EI->getNumIndices() != 1) + return std::nullopt; + return *EI->idx_begin(); +} + namespace { /// Main data required for vectorization of instructions. @@ -473,24 +591,49 @@ static bool isValidForAlternation(unsigned Opcode) { } static InstructionsState getSameOpcode(ArrayRef<Value *> VL, + const TargetLibraryInfo &TLI, unsigned BaseIndex = 0); /// Checks if the provided operands of 2 cmp instructions are compatible, i.e. /// compatible instructions or constants, or just some other regular values. static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, - Value *Op1) { + Value *Op1, const TargetLibraryInfo &TLI) { return (isConstant(BaseOp0) && isConstant(Op0)) || (isConstant(BaseOp1) && isConstant(Op1)) || (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) && !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) || - getSameOpcode({BaseOp0, Op0}).getOpcode() || - getSameOpcode({BaseOp1, Op1}).getOpcode(); + BaseOp0 == Op0 || BaseOp1 == Op1 || + getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() || + getSameOpcode({BaseOp1, Op1}, TLI).getOpcode(); +} + +/// \returns true if a compare instruction \p CI has similar "look" and +/// same predicate as \p BaseCI, "as is" or with its operands and predicate +/// swapped, false otherwise. 
+static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, + const TargetLibraryInfo &TLI) { + assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() && + "Assessing comparisons of different types?"); + CmpInst::Predicate BasePred = BaseCI->getPredicate(); + CmpInst::Predicate Pred = CI->getPredicate(); + CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred); + + Value *BaseOp0 = BaseCI->getOperand(0); + Value *BaseOp1 = BaseCI->getOperand(1); + Value *Op0 = CI->getOperand(0); + Value *Op1 = CI->getOperand(1); + + return (BasePred == Pred && + areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) || + (BasePred == SwappedPred && + areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI)); } /// \returns analysis of the Instructions in \p VL described in /// InstructionsState, the Opcode that we suppose the whole list /// could be vectorized even if its structure is diverse. static InstructionsState getSameOpcode(ArrayRef<Value *> VL, + const TargetLibraryInfo &TLI, unsigned BaseIndex) { // Make sure these are all Instructions. if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); })) @@ -508,9 +651,19 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL, // Check for one alternate opcode from another BinaryOperator. // TODO - generalize to support all operators (types, calls etc.). + auto *IBase = cast<Instruction>(VL[BaseIndex]); + Intrinsic::ID BaseID = 0; + SmallVector<VFInfo> BaseMappings; + if (auto *CallBase = dyn_cast<CallInst>(IBase)) { + BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI); + BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase); + if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty()) + return InstructionsState(VL[BaseIndex], nullptr, nullptr); + } for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) { - unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode(); - if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) { + auto *I = cast<Instruction>(VL[Cnt]); + unsigned InstOpcode = I->getOpcode(); + if (IsBinOp && isa<BinaryOperator>(I)) { if (InstOpcode == Opcode || InstOpcode == AltOpcode) continue; if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) && @@ -519,9 +672,11 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL, AltIndex = Cnt; continue; } - } else if (IsCastOp && isa<CastInst>(VL[Cnt])) { - Type *Ty0 = cast<Instruction>(VL[BaseIndex])->getOperand(0)->getType(); - Type *Ty1 = cast<Instruction>(VL[Cnt])->getOperand(0)->getType(); + } else if (IsCastOp && isa<CastInst>(I)) { + Value *Op0 = IBase->getOperand(0); + Type *Ty0 = Op0->getType(); + Value *Op1 = I->getOperand(0); + Type *Ty1 = Op1->getType(); if (Ty0 == Ty1) { if (InstOpcode == Opcode || InstOpcode == AltOpcode) continue; @@ -534,59 +689,79 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL, continue; } } - } else if (IsCmpOp && isa<CmpInst>(VL[Cnt])) { - auto *BaseInst = cast<Instruction>(VL[BaseIndex]); - auto *Inst = cast<Instruction>(VL[Cnt]); + } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) { + auto *BaseInst = cast<CmpInst>(VL[BaseIndex]); Type *Ty0 = BaseInst->getOperand(0)->getType(); Type *Ty1 = Inst->getOperand(0)->getType(); if (Ty0 == Ty1) { - Value *BaseOp0 = BaseInst->getOperand(0); - Value *BaseOp1 = BaseInst->getOperand(1); - Value *Op0 = Inst->getOperand(0); - Value *Op1 = Inst->getOperand(1); - CmpInst::Predicate CurrentPred = - cast<CmpInst>(VL[Cnt])->getPredicate(); - CmpInst::Predicate SwappedCurrentPred = - CmpInst::getSwappedPredicate(CurrentPred); + 
assert(InstOpcode == Opcode && "Expected same CmpInst opcode."); // Check for compatible operands. If the corresponding operands are not // compatible - need to perform alternate vectorization. - if (InstOpcode == Opcode) { - if (BasePred == CurrentPred && - areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1)) - continue; - if (BasePred == SwappedCurrentPred && - areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0)) - continue; - if (E == 2 && - (BasePred == CurrentPred || BasePred == SwappedCurrentPred)) - continue; - auto *AltInst = cast<CmpInst>(VL[AltIndex]); - CmpInst::Predicate AltPred = AltInst->getPredicate(); - Value *AltOp0 = AltInst->getOperand(0); - Value *AltOp1 = AltInst->getOperand(1); - // Check if operands are compatible with alternate operands. - if (AltPred == CurrentPred && - areCompatibleCmpOps(AltOp0, AltOp1, Op0, Op1)) - continue; - if (AltPred == SwappedCurrentPred && - areCompatibleCmpOps(AltOp0, AltOp1, Op1, Op0)) + CmpInst::Predicate CurrentPred = Inst->getPredicate(); + CmpInst::Predicate SwappedCurrentPred = + CmpInst::getSwappedPredicate(CurrentPred); + + if (E == 2 && + (BasePred == CurrentPred || BasePred == SwappedCurrentPred)) + continue; + + if (isCmpSameOrSwapped(BaseInst, Inst, TLI)) + continue; + auto *AltInst = cast<CmpInst>(VL[AltIndex]); + if (AltIndex != BaseIndex) { + if (isCmpSameOrSwapped(AltInst, Inst, TLI)) continue; - } - if (BaseIndex == AltIndex && BasePred != CurrentPred) { - assert(isValidForAlternation(Opcode) && - isValidForAlternation(InstOpcode) && - "Cast isn't safe for alternation, logic needs to be updated!"); + } else if (BasePred != CurrentPred) { + assert( + isValidForAlternation(InstOpcode) && + "CmpInst isn't safe for alternation, logic needs to be updated!"); AltIndex = Cnt; continue; } - auto *AltInst = cast<CmpInst>(VL[AltIndex]); CmpInst::Predicate AltPred = AltInst->getPredicate(); if (BasePred == CurrentPred || BasePred == SwappedCurrentPred || AltPred == CurrentPred || AltPred == SwappedCurrentPred) continue; } - } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) + } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) { + if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) { + if (Gep->getNumOperands() != 2 || + Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType()) + return InstructionsState(VL[BaseIndex], nullptr, nullptr); + } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) { + if (!isVectorLikeInstWithConstOps(EI)) + return InstructionsState(VL[BaseIndex], nullptr, nullptr); + } else if (auto *LI = dyn_cast<LoadInst>(I)) { + auto *BaseLI = cast<LoadInst>(IBase); + if (!LI->isSimple() || !BaseLI->isSimple()) + return InstructionsState(VL[BaseIndex], nullptr, nullptr); + } else if (auto *Call = dyn_cast<CallInst>(I)) { + auto *CallBase = cast<CallInst>(IBase); + if (Call->getCalledFunction() != CallBase->getCalledFunction()) + return InstructionsState(VL[BaseIndex], nullptr, nullptr); + if (Call->hasOperandBundles() && + !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(), + Call->op_begin() + Call->getBundleOperandsEndIndex(), + CallBase->op_begin() + + CallBase->getBundleOperandsStartIndex())) + return InstructionsState(VL[BaseIndex], nullptr, nullptr); + Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI); + if (ID != BaseID) + return InstructionsState(VL[BaseIndex], nullptr, nullptr); + if (!ID) { + SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call); + if (Mappings.size() != BaseMappings.size() || + Mappings.front().ISA != BaseMappings.front().ISA || + 
Mappings.front().ScalarName != BaseMappings.front().ScalarName || + Mappings.front().VectorName != BaseMappings.front().VectorName || + Mappings.front().Shape.VF != BaseMappings.front().Shape.VF || + Mappings.front().Shape.Parameters != + BaseMappings.front().Shape.Parameters) + return InstructionsState(VL[BaseIndex], nullptr, nullptr); + } + } continue; + } return InstructionsState(VL[BaseIndex], nullptr, nullptr); } @@ -605,24 +780,6 @@ static bool allSameType(ArrayRef<Value *> VL) { return true; } -/// \returns True if Extract{Value,Element} instruction extracts element Idx. -static Optional<unsigned> getExtractIndex(Instruction *E) { - unsigned Opcode = E->getOpcode(); - assert((Opcode == Instruction::ExtractElement || - Opcode == Instruction::ExtractValue) && - "Expected extractelement or extractvalue instruction."); - if (Opcode == Instruction::ExtractElement) { - auto *CI = dyn_cast<ConstantInt>(E->getOperand(1)); - if (!CI) - return None; - return CI->getZExtValue(); - } - ExtractValueInst *EI = cast<ExtractValueInst>(E); - if (EI->getNumIndices() != 1) - return None; - return *EI->idx_begin(); -} - /// \returns True if in-tree use also needs extract. This refers to /// possible scalar operand in vectorized instruction. static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, @@ -644,7 +801,7 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, if (isVectorIntrinsicWithScalarOpAtArg(ID, i)) return (CI->getArgOperand(i) == Scalar); } - LLVM_FALLTHROUGH; + [[fallthrough]]; } default: return false; @@ -735,40 +892,6 @@ static void inversePermutation(ArrayRef<unsigned> Indices, Mask[Indices[I]] = I; } -/// \returns inserting index of InsertElement or InsertValue instruction, -/// using Offset as base offset for index. -static Optional<unsigned> getInsertIndex(const Value *InsertInst, - unsigned Offset = 0) { - int Index = Offset; - if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) { - if (const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) { - auto *VT = cast<FixedVectorType>(IE->getType()); - if (CI->getValue().uge(VT->getNumElements())) - return None; - Index *= VT->getNumElements(); - Index += CI->getZExtValue(); - return Index; - } - return None; - } - - const auto *IV = cast<InsertValueInst>(InsertInst); - Type *CurrentType = IV->getType(); - for (unsigned I : IV->indices()) { - if (const auto *ST = dyn_cast<StructType>(CurrentType)) { - Index *= ST->getNumElements(); - CurrentType = ST->getElementType(I); - } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) { - Index *= AT->getNumElements(); - CurrentType = AT->getElementType(); - } else { - return None; - } - Index += I; - } - return Index; -} - /// Reorders the list of scalars in accordance with the given \p Mask. static void reorderScalars(SmallVectorImpl<Value *> &Scalars, ArrayRef<int> Mask) { @@ -839,6 +962,7 @@ namespace slpvectorizer { class BoUpSLP { struct TreeEntry; struct ScheduleData; + class ShuffleInstructionBuilder; public: using ValueList = SmallVector<Value *, 8>; @@ -867,7 +991,7 @@ public: else MaxVecRegSize = TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) - .getFixedSize(); + .getFixedValue(); if (MinVectorRegSizeOption.getNumOccurrences()) MinVecRegSize = MinVectorRegSizeOption; @@ -882,7 +1006,8 @@ public: /// Vectorize the tree but with the list of externally used values \p /// ExternallyUsedValues. Values in this MapVector can be replaced but the /// generated extractvalue instructions. 
- Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues); + Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues, + Instruction *ReductionRoot = nullptr); /// \returns the cost incurred by unwanted spills and fills, caused by /// holding live values over call sites. @@ -890,7 +1015,7 @@ public: /// \returns the vectorization cost of the subtree that starts at \p VL. /// A negative number means that this is profitable. - InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = None); + InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = std::nullopt); /// Construct a vectorizable tree that starts at \p Roots, ignoring users for /// the purpose of scheduling and extraction in the \p UserIgnoreLst. @@ -900,6 +1025,24 @@ public: /// Construct a vectorizable tree that starts at \p Roots. void buildTree(ArrayRef<Value *> Roots); + /// Checks if the very first tree node is going to be vectorized. + bool isVectorizedFirstNode() const { + return !VectorizableTree.empty() && + VectorizableTree.front()->State == TreeEntry::Vectorize; + } + + /// Returns the main instruction for the very first node. + Instruction *getFirstNodeMainOp() const { + assert(!VectorizableTree.empty() && "No tree to get the first node from"); + return VectorizableTree.front()->getMainOp(); + } + + /// Returns whether the root node has in-tree uses. + bool doesRootHaveInTreeUses() const { + return !VectorizableTree.empty() && + !VectorizableTree.front()->UserTreeIndices.empty(); + } + /// Builds external uses of the vectorized scalars, i.e. the list of /// vectorized scalars to be extracted, their lanes and their scalar users. \p /// ExternallyUsedValues contains additional list of external uses to handle @@ -912,6 +1055,7 @@ public: VectorizableTree.clear(); ScalarToTreeEntry.clear(); MustGather.clear(); + EntryToLastInstruction.clear(); ExternalUses.clear(); for (auto &Iter : BlocksSchedules) { BlockScheduling *BS = Iter.second.get(); @@ -931,17 +1075,17 @@ public: /// shuffled vector entry + (possibly) permutation with other gathers. It /// implements the checks only for possibly ordered scalars (Loads, /// ExtractElement, ExtractValue), which can be part of the graph. - Optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE); + std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE); /// Sort loads into increasing pointers offsets to allow greater clustering. - Optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE); + std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE); /// Gets reordering data for the given tree entry. If the entry is vectorized /// - just return ReorderIndices, otherwise check if the scalars can be /// reordered and return the most optimal order. /// \param TopToBottom If true, include the order of vectorized stores and /// insertelement nodes, otherwise skip them. - Optional<OrdersType> getReorderingData(const TreeEntry &TE, bool TopToBottom); + std::optional<OrdersType> getReorderingData(const TreeEntry &TE, bool TopToBottom); /// Reorders the current graph to the most profitable order starting from the /// root node to the leaf nodes. The best order is chosen only from the nodes @@ -1052,6 +1196,7 @@ public: /// A helper class used for scoring candidates for two consecutive lanes. 
class LookAheadHeuristics { + const TargetLibraryInfo &TLI; const DataLayout &DL; ScalarEvolution &SE; const BoUpSLP &R; @@ -1059,9 +1204,11 @@ public: int MaxLevel; // The maximum recursion depth for accumulating score. public: - LookAheadHeuristics(const DataLayout &DL, ScalarEvolution &SE, - const BoUpSLP &R, int NumLanes, int MaxLevel) - : DL(DL), SE(SE), R(R), NumLanes(NumLanes), MaxLevel(MaxLevel) {} + LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, + ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, + int MaxLevel) + : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes), + MaxLevel(MaxLevel) {} // The hard-coded scores listed here are not very important, though it shall // be higher for better matches to improve the resulting cost. When @@ -1083,6 +1230,8 @@ public: static const int ScoreSplatLoads = 3; /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]). static const int ScoreReversedLoads = 3; + /// A load candidate for masked gather. + static const int ScoreMaskedGatherCandidate = 1; /// ExtractElementInst from same vector and consecutive indexes. static const int ScoreConsecutiveExtracts = 4; /// ExtractElementInst from same vector and reversed indices. @@ -1108,6 +1257,10 @@ public: /// MainAltOps. int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef<Value *> MainAltOps) const { + if (!isValidElementType(V1->getType()) || + !isValidElementType(V2->getType())) + return LookAheadHeuristics::ScoreFail; + if (V1 == V2) { if (isa<LoadInst>(V1)) { // Retruns true if the users of V1 and V2 won't need to be extracted. @@ -1137,18 +1290,26 @@ public: auto *LI1 = dyn_cast<LoadInst>(V1); auto *LI2 = dyn_cast<LoadInst>(V2); if (LI1 && LI2) { - if (LI1->getParent() != LI2->getParent()) + if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() || + !LI2->isSimple()) return LookAheadHeuristics::ScoreFail; - Optional<int> Dist = getPointersDiff( + std::optional<int> Dist = getPointersDiff( LI1->getType(), LI1->getPointerOperand(), LI2->getType(), LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true); - if (!Dist || *Dist == 0) + if (!Dist || *Dist == 0) { + if (getUnderlyingObject(LI1->getPointerOperand()) == + getUnderlyingObject(LI2->getPointerOperand()) && + R.TTI->isLegalMaskedGather( + FixedVectorType::get(LI1->getType(), NumLanes), + LI1->getAlign())) + return LookAheadHeuristics::ScoreMaskedGatherCandidate; return LookAheadHeuristics::ScoreFail; + } // The distance is too large - still may be profitable to use masked // loads/gathers. if (std::abs(*Dist) > NumLanes / 2) - return LookAheadHeuristics::ScoreAltOpcodes; + return LookAheadHeuristics::ScoreMaskedGatherCandidate; // This still will detect consecutive loads, but we might have "holes" // in some cases. It is ok for non-power-2 vectorization and may produce // better results. It should not affect current vectorization. @@ -1177,7 +1338,7 @@ public: // Undefs are always profitable for extractelements. 
if (!Ex2Idx) return LookAheadHeuristics::ScoreConsecutiveExtracts; - if (isUndefVector(EV2) && EV2->getType() == EV1->getType()) + if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType()) return LookAheadHeuristics::ScoreConsecutiveExtracts; if (EV2 == EV1) { int Idx1 = Ex1Idx->getZExtValue(); @@ -1205,7 +1366,7 @@ public: SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end()); Ops.push_back(I1); Ops.push_back(I2); - InstructionsState S = getSameOpcode(Ops); + InstructionsState S = getSameOpcode(Ops, TLI); // Note: Only consider instructions with <= 2 operands to avoid // complexity explosion. if (S.getOpcode() && @@ -1300,7 +1461,7 @@ public: // Recursively calculate the cost at each level int TmpScore = getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2), - I1, I2, CurrLevel + 1, None); + I1, I2, CurrLevel + 1, std::nullopt); // Look for the best score. if (TmpScore > LookAheadHeuristics::ScoreFail && TmpScore > MaxTmpScore) { @@ -1381,6 +1542,7 @@ public: /// A vector of operand vectors. SmallVector<OperandDataVec, 4> OpsVec; + const TargetLibraryInfo &TLI; const DataLayout &DL; ScalarEvolution &SE; const BoUpSLP &R; @@ -1464,7 +1626,7 @@ public: auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV); if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV)) return 0; - return R.areAllUsersVectorized(IdxLaneI, None) + return R.areAllUsersVectorized(IdxLaneI, std::nullopt) ? LookAheadHeuristics::ScoreAllUserVectorized : 0; } @@ -1482,7 +1644,7 @@ public: int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps, int Lane, unsigned OpIdx, unsigned Idx, bool &IsUsed) { - LookAheadHeuristics LookAhead(DL, SE, R, getNumLanes(), + LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(), LookAheadMaxDepth); // Keep track of the instruction stack as we recurse into the operands // during the look-ahead score exploration. @@ -1520,8 +1682,8 @@ public: // Search all operands in Ops[*][Lane] for the one that matches best // Ops[OpIdx][LastLane] and return its opreand index. - // If no good match can be found, return None. - Optional<unsigned> getBestOperand(unsigned OpIdx, int Lane, int LastLane, + // If no good match can be found, return std::nullopt. + std::optional<unsigned> getBestOperand(unsigned OpIdx, int Lane, int LastLane, ArrayRef<ReorderingMode> ReorderingModes, ArrayRef<Value *> MainAltOps) { unsigned NumOperands = getNumOperands(); @@ -1532,7 +1694,7 @@ public: // Our strategy mode for OpIdx. ReorderingMode RMode = ReorderingModes[OpIdx]; if (RMode == ReorderingMode::Failed) - return None; + return std::nullopt; // The linearized opcode of the operand at OpIdx, Lane. bool OpIdxAPO = getData(OpIdx, Lane).APO; @@ -1541,7 +1703,7 @@ public: // Sometimes we have more than one option (e.g., Opcode and Undefs), so we // are using the score to differentiate between the two. struct BestOpData { - Optional<unsigned> Idx = None; + std::optional<unsigned> Idx; unsigned Score = 0; } BestOp; BestOp.Score = @@ -1600,8 +1762,8 @@ public: getData(*BestOp.Idx, Lane).IsUsed = IsUsed; return BestOp.Idx; } - // If we could not find a good match return None. - return None; + // If we could not find a good match return std::nullopt. + return std::nullopt; } /// Helper for reorderOperandVecs. @@ -1704,7 +1866,7 @@ public: // Use Boyer-Moore majority voting for finding the majority opcode and // the number of times it occurs. 
if (auto *I = dyn_cast<Instruction>(OpData.V)) { - if (!OpcodeI || !getSameOpcode({OpcodeI, I}).getOpcode() || + if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() || I->getParent() != Parent) { if (NumOpsWithSameOpcodeParent == 0) { NumOpsWithSameOpcodeParent = 1; @@ -1806,9 +1968,9 @@ public: public: /// Initialize with all the operands of the instruction vector \p RootVL. - VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL, - ScalarEvolution &SE, const BoUpSLP &R) - : DL(DL), SE(SE), R(R) { + VLOperands(ArrayRef<Value *> RootVL, const TargetLibraryInfo &TLI, + const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R) + : TLI(TLI), DL(DL), SE(SE), R(R) { // Append all the operands of RootVL. appendOperandsOfVL(RootVL); } @@ -1930,7 +2092,7 @@ public: // Look for a good match for each operand. for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { // Search for the operand that matches SortedOps[OpIdx][Lane-1]. - Optional<unsigned> BestIdx = getBestOperand( + std::optional<unsigned> BestIdx = getBestOperand( OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]); // By not selecting a value, we allow the operands that follow to // select a better matching value. We will get a non-null value in @@ -1949,7 +2111,7 @@ public: if (MainAltOps[OpIdx].size() != 2) { OperandData &AltOp = getData(OpIdx, Lane); InstructionsState OpS = - getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}); + getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI); if (OpS.getOpcode() && OpS.isAltShuffle()) MainAltOps[OpIdx].push_back(AltOp.V); } @@ -2018,21 +2180,21 @@ public: /// Evaluate each pair in \p Candidates and return index into \p Candidates /// for a pair which have highest score deemed to have best chance to form - /// root of profitable tree to vectorize. Return None if no candidate scored - /// above the LookAheadHeuristics::ScoreFail. - /// \param Limit Lower limit of the cost, considered to be good enough score. - Optional<int> + /// root of profitable tree to vectorize. Return std::nullopt if no candidate + /// scored above the LookAheadHeuristics::ScoreFail. \param Limit Lower limit + /// of the cost, considered to be good enough score. + std::optional<int> findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates, int Limit = LookAheadHeuristics::ScoreFail) { - LookAheadHeuristics LookAhead(*DL, *SE, *this, /*NumLanes=*/2, + LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2, RootLookAheadMaxDepth); int BestScore = Limit; - Optional<int> Index = None; + std::optional<int> Index; for (int I : seq<int>(0, Candidates.size())) { int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first, Candidates[I].second, /*U1=*/nullptr, /*U2=*/nullptr, - /*Level=*/1, None); + /*Level=*/1, std::nullopt); if (Score > BestScore) { BestScore = Score; Index = I; @@ -2063,7 +2225,7 @@ public: } /// Checks if the provided list of reduced values was checked already for /// vectorization. - bool areAnalyzedReductionVals(ArrayRef<Value *> VL) { + bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const { return AnalyzedReductionVals.contains(hash_value(VL)); } /// Adds the list of reduced values to list of already checked values for the @@ -2081,6 +2243,9 @@ public: return any_of(MustGather, [&](Value *V) { return Vals.contains(V); }); } + /// Check if the value is vectorized in the tree. 
+ bool isVectorized(Value *V) const { return getTreeEntry(V); } + ~BoUpSLP(); private: @@ -2097,6 +2262,10 @@ private: ArrayRef<TreeEntry *> ReorderableGathers, SmallVectorImpl<TreeEntry *> &GatherOps); + /// Checks if the given \p TE is a gather node with clustered reused scalars + /// and reorders it per given \p Mask. + void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const; + /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph, /// if any. If it is not vectorized (gather node), returns nullptr. TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) { @@ -2123,6 +2292,11 @@ private: bool areAllUsersVectorized(Instruction *I, ArrayRef<Value *> VectorizedVals) const; + /// Return information about the vector formed for the specified index + /// of a vector of (the same) instruction. + TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> VL, + unsigned OpIdx); + /// \returns the cost of the vectorizable entry. InstructionCost getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals); @@ -2142,13 +2316,14 @@ private: /// Vectorize a single entry in the tree. Value *vectorizeTree(TreeEntry *E); - /// Vectorize a single entry in the tree, starting in \p VL. - Value *vectorizeTree(ArrayRef<Value *> VL); + /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry + /// \p E. + Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx); /// Create a new vector from a list of scalar values. Produces a sequence /// which exploits values reused across lanes, and arranges the inserts /// for ease of later optimization. - Value *createBuildVector(ArrayRef<Value *> VL); + Value *createBuildVector(const TreeEntry *E); /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. If \p @@ -2158,12 +2333,22 @@ private: const APInt &ShuffledIndices, bool NeedToShuffle) const; + /// Returns the instruction in the bundle, which can be used as a base point + /// for scheduling. Usually it is the last instruction in the bundle, except + /// for the case when all operands are external (in this case, it is the first + /// instruction in the list). + Instruction &getLastInstructionInBundle(const TreeEntry *E); + /// Checks if the gathered \p VL can be represented as shuffle(s) of previous /// tree entries. + /// \param TE Tree entry checked for permutation. + /// \param VL List of scalars (a subset of the TE scalar), checked for + /// permutations. /// \returns ShuffleKind, if gathered values can be represented as shuffles of /// previous tree entries. \p Mask is filled with the shuffle mask. - Optional<TargetTransformInfo::ShuffleKind> - isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask, + std::optional<TargetTransformInfo::ShuffleKind> + isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, + SmallVectorImpl<int> &Mask, SmallVectorImpl<const TreeEntry *> &Entries); /// \returns the scalarization cost for this list of values. Assuming that @@ -2184,12 +2369,10 @@ private: /// Reorder commutative or alt operands to get better probability of /// generating vectorized code. 
- static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, - SmallVectorImpl<Value *> &Left, - SmallVectorImpl<Value *> &Right, - const DataLayout &DL, - ScalarEvolution &SE, - const BoUpSLP &R); + static void reorderInputsAccordingToOpcode( + ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left, + SmallVectorImpl<Value *> &Right, const TargetLibraryInfo &TLI, + const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R); /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the /// users of \p TE and collects the stores. It returns the map from the store @@ -2198,10 +2381,10 @@ private: collectUserStores(const BoUpSLP::TreeEntry *TE) const; /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the - /// stores in \p StoresVec can for a vector instruction. If so it returns true + /// stores in \p StoresVec can form a vector instruction. If so it returns true /// and populates \p ReorderIndices with the shuffle indices of the the stores /// when compared to the sorted vector. - bool CanFormVector(const SmallVector<StoreInst *, 4> &StoresVec, + bool canFormVector(const SmallVector<StoreInst *, 4> &StoresVec, OrdersType &ReorderIndices) const; /// Iterates through the users of \p TE, looking for scalar stores that can be @@ -2247,6 +2430,12 @@ private: return IsSame(Scalars, ReuseShuffleIndices); } + bool isOperandGatherNode(const EdgeInfo &UserEI) const { + return State == TreeEntry::NeedToGather && + UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx && + UserTreeIndices.front().UserTE == UserEI.UserTE; + } + /// \returns true if current entry has same operands as \p TE. bool hasEqualOperands(const TreeEntry &TE) const { if (TE.getNumOperands() != getNumOperands()) @@ -2508,11 +2697,11 @@ private: #endif /// Create a new VectorizableTree entry. - TreeEntry *newTreeEntry(ArrayRef<Value *> VL, Optional<ScheduleData *> Bundle, + TreeEntry *newTreeEntry(ArrayRef<Value *> VL, std::optional<ScheduleData *> Bundle, const InstructionsState &S, const EdgeInfo &UserTreeIdx, - ArrayRef<int> ReuseShuffleIndices = None, - ArrayRef<unsigned> ReorderIndices = None) { + ArrayRef<int> ReuseShuffleIndices = std::nullopt, + ArrayRef<unsigned> ReorderIndices = std::nullopt) { TreeEntry::EntryState EntryState = Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather; return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx, @@ -2521,11 +2710,11 @@ private: TreeEntry *newTreeEntry(ArrayRef<Value *> VL, TreeEntry::EntryState EntryState, - Optional<ScheduleData *> Bundle, + std::optional<ScheduleData *> Bundle, const InstructionsState &S, const EdgeInfo &UserTreeIdx, - ArrayRef<int> ReuseShuffleIndices = None, - ArrayRef<unsigned> ReorderIndices = None) { + ArrayRef<int> ReuseShuffleIndices = std::nullopt, + ArrayRef<unsigned> ReorderIndices = std::nullopt) { assert(((!Bundle && EntryState == TreeEntry::NeedToGather) || (Bundle && EntryState != TreeEntry::NeedToGather)) && "Need to vectorize gather entry?"); @@ -2547,7 +2736,7 @@ private: return UndefValue::get(VL.front()->getType()); return VL[Idx]; }); - InstructionsState S = getSameOpcode(Last->Scalars); + InstructionsState S = getSameOpcode(Last->Scalars, *TLI); Last->setOperations(S); Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end()); } @@ -2611,6 +2800,14 @@ private: /// A list of scalars that we found that we need to keep as scalars. ValueSet MustGather; + /// A map between the vectorized entries and the last instructions in the + /// bundles. 
The bundles are built in use order, not in the def order of the + /// instructions. So, we cannot rely directly on the last instruction in the + /// bundle being the last instruction in the program order during + /// vectorization process since the basic blocks are affected, need to + /// pre-gather them before. + DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction; + /// This POD struct describes one external user in the vectorized tree. struct ExternalUser { ExternalUser(Value *S, llvm::User *U, int L) @@ -2635,9 +2832,9 @@ private: Instruction *Inst2) { // First check if the result is already in the cache. AliasCacheKey key = std::make_pair(Inst1, Inst2); - Optional<bool> &result = AliasCache[key]; + std::optional<bool> &result = AliasCache[key]; if (result) { - return result.value(); + return *result; } bool aliased = true; if (Loc1.Ptr && isSimple(Inst1)) @@ -2651,7 +2848,7 @@ private: /// Cache for alias results. /// TODO: consider moving this to the AliasAnalysis itself. - DenseMap<AliasCacheKey, Optional<bool>> AliasCache; + DenseMap<AliasCacheKey, std::optional<bool>> AliasCache; // Cache for pointerMayBeCaptured calls inside AA. This is preserved // globally through SLP because we don't perform any action which @@ -2680,8 +2877,9 @@ private: /// Values used only by @llvm.assume calls. SmallPtrSet<const Value *, 32> EphValues; - /// Holds all of the instructions that we gathered. - SetVector<Instruction *> GatherShuffleSeq; + /// Holds all of the instructions that we gathered, shuffle instructions and + /// extractelements. + SetVector<Instruction *> GatherShuffleExtractSeq; /// A list of blocks that we are going to CSE. SetVector<BasicBlock *> CSEBlocks; @@ -2994,7 +3192,7 @@ private: // okay. auto *In = BundleMember->Inst; assert(In && - (isa<ExtractValueInst>(In) || isa<ExtractElementInst>(In) || + (isa<ExtractValueInst, ExtractElementInst>(In) || In->getNumOperands() == TE->getNumOperands()) && "Missed TreeEntry operands?"); (void)In; // fake use to avoid build failure when assertions disabled @@ -3102,9 +3300,9 @@ private: /// Checks if a bundle of instructions can be scheduled, i.e. has no /// cyclic dependencies. This is only a dry-run, no instructions are /// actually moved at this stage. - /// \returns the scheduling bundle. The returned Optional value is non-None - /// if \p VL is allowed to be scheduled. - Optional<ScheduleData *> + /// \returns the scheduling bundle. The returned Optional value is not + /// std::nullopt if \p VL is allowed to be scheduled. 
+ std::optional<ScheduleData *> tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, const InstructionsState &S); @@ -3319,9 +3517,10 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) { std::string Str; raw_string_ostream OS(Str); + OS << Entry->Idx << ".\n"; if (isSplat(Entry->Scalars)) OS << "<splat> "; - for (auto V : Entry->Scalars) { + for (auto *V : Entry->Scalars) { OS << *V; if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; @@ -3336,6 +3535,8 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { const BoUpSLP *) { if (Entry->State == TreeEntry::NeedToGather) return "color=red"; + if (Entry->State == TreeEntry::ScatterVectorize) + return "color=blue"; return ""; } }; @@ -3407,7 +3608,7 @@ static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask) { fixupOrderingIndices(Order); } -Optional<BoUpSLP::OrdersType> +std::optional<BoUpSLP::OrdersType> BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only."); unsigned NumScalars = TE.Scalars.size(); @@ -3427,11 +3628,11 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { STE = LocalSTE; else if (STE != LocalSTE) // Take the order only from the single vector node. - return None; + return std::nullopt; unsigned Lane = std::distance(STE->Scalars.begin(), find(STE->Scalars, V)); if (Lane >= NumScalars) - return None; + return std::nullopt; if (CurrentOrder[Lane] != NumScalars) { if (Lane != I) continue; @@ -3470,7 +3671,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { } return CurrentOrder; } - return None; + return std::nullopt; } namespace { @@ -3478,12 +3679,31 @@ namespace { enum class LoadsState { Gather, Vectorize, ScatterVectorize }; } // anonymous namespace +static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, + const TargetLibraryInfo &TLI, + bool CompareOpcodes = true) { + if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2)) + return false; + auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1); + if (!GEP1) + return false; + auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2); + if (!GEP2) + return false; + return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 && + ((isConstant(GEP1->getOperand(1)) && + isConstant(GEP2->getOperand(1))) || + !CompareOpcodes || + getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI) + .getOpcode()); +} + /// Checks if the given array of loads can be represented as a vectorized, /// scatter or just simple gather. static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, - LoopInfo &LI, + LoopInfo &LI, const TargetLibraryInfo &TLI, SmallVectorImpl<unsigned> &Order, SmallVectorImpl<Value *> &PointerOps) { // Check that a vectorized load would load the same memory as a scalar @@ -3513,18 +3733,8 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0, Order.clear(); // Check the order of pointer operands or that all pointers are the same. 
bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order); - if (IsSorted || all_of(PointerOps, [&PointerOps](Value *P) { - if (getUnderlyingObject(P) != getUnderlyingObject(PointerOps.front())) - return false; - auto *GEP = dyn_cast<GetElementPtrInst>(P); - if (!GEP) - return false; - auto *GEP0 = cast<GetElementPtrInst>(PointerOps.front()); - return GEP->getNumOperands() == 2 && - ((isConstant(GEP->getOperand(1)) && - isConstant(GEP0->getOperand(1))) || - getSameOpcode({GEP->getOperand(1), GEP0->getOperand(1)}) - .getOpcode()); + if (IsSorted || all_of(PointerOps, [&](Value *P) { + return arePointersCompatible(P, PointerOps.front(), TLI); })) { if (IsSorted) { Value *Ptr0; @@ -3536,7 +3746,7 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0, Ptr0 = PointerOps[Order.front()]; PtrN = PointerOps[Order.back()]; } - Optional<int> Diff = + std::optional<int> Diff = getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE); // Check that the sorted loads are consecutive. if (static_cast<unsigned>(*Diff) == VL.size() - 1) @@ -3584,7 +3794,7 @@ bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy, unsigned Cnt = 1; for (Value *Ptr : VL.drop_front()) { bool Found = any_of(Bases, [&](auto &Base) { - Optional<int> Diff = + std::optional<int> Diff = getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE, /*StrictCheck=*/true); if (!Diff) @@ -3636,7 +3846,7 @@ bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy, return true; } -Optional<BoUpSLP::OrdersType> +std::optional<BoUpSLP::OrdersType> BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) { assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only."); Type *ScalarTy = TE.Scalars[0]->getType(); @@ -3646,27 +3856,176 @@ BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) { for (Value *V : TE.Scalars) { auto *L = dyn_cast<LoadInst>(V); if (!L || !L->isSimple()) - return None; + return std::nullopt; Ptrs.push_back(L->getPointerOperand()); } BoUpSLP::OrdersType Order; if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order)) return Order; - return None; + return std::nullopt; +} + +/// Check if two insertelement instructions are from the same buildvector. +static bool areTwoInsertFromSameBuildVector( + InsertElementInst *VU, InsertElementInst *V, + function_ref<Value *(InsertElementInst *)> GetBaseOperand) { + // Instructions must be from the same basic blocks. + if (VU->getParent() != V->getParent()) + return false; + // Checks if 2 insertelements are from the same buildvector. + if (VU->getType() != V->getType()) + return false; + // Multiple used inserts are separate nodes. + if (!VU->hasOneUse() && !V->hasOneUse()) + return false; + auto *IE1 = VU; + auto *IE2 = V; + std::optional<unsigned> Idx1 = getInsertIndex(IE1); + std::optional<unsigned> Idx2 = getInsertIndex(IE2); + if (Idx1 == std::nullopt || Idx2 == std::nullopt) + return false; + // Go through the vector operand of insertelement instructions trying to find + // either VU as the original vector for IE2 or V as the original vector for + // IE1. 
+ do { + if (IE2 == VU) + return VU->hasOneUse(); + if (IE1 == V) + return V->hasOneUse(); + if (IE1) { + if ((IE1 != VU && !IE1->hasOneUse()) || + getInsertIndex(IE1).value_or(*Idx2) == *Idx2) + IE1 = nullptr; + else + IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1)); + } + if (IE2) { + if ((IE2 != V && !IE2->hasOneUse()) || + getInsertIndex(IE2).value_or(*Idx1) == *Idx1) + IE2 = nullptr; + else + IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2)); + } + } while (IE1 || IE2); + return false; } -Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE, +std::optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { // No need to reorder if need to shuffle reuses, still need to shuffle the // node. - if (!TE.ReuseShuffleIndices.empty()) - return None; + if (!TE.ReuseShuffleIndices.empty()) { + // Check if reuse shuffle indices can be improved by reordering. + // For this, check that reuse mask is "clustered", i.e. each scalar values + // is used once in each submask of size <number_of_scalars>. + // Example: 4 scalar values. + // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered. + // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because + // element 3 is used twice in the second submask. + unsigned Sz = TE.Scalars.size(); + if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices, + Sz)) + return std::nullopt; + unsigned VF = TE.getVectorFactor(); + // Try build correct order for extractelement instructions. + SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(), + TE.ReuseShuffleIndices.end()); + if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() && + all_of(TE.Scalars, [Sz](Value *V) { + std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V)); + return Idx && *Idx < Sz; + })) { + SmallVector<int> ReorderMask(Sz, UndefMaskElem); + if (TE.ReorderIndices.empty()) + std::iota(ReorderMask.begin(), ReorderMask.end(), 0); + else + inversePermutation(TE.ReorderIndices, ReorderMask); + for (unsigned I = 0; I < VF; ++I) { + int &Idx = ReusedMask[I]; + if (Idx == UndefMaskElem) + continue; + Value *V = TE.Scalars[ReorderMask[Idx]]; + std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V)); + Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI)); + } + } + // Build the order of the VF size, need to reorder reuses shuffles, they are + // always of VF size. + OrdersType ResOrder(VF); + std::iota(ResOrder.begin(), ResOrder.end(), 0); + auto *It = ResOrder.begin(); + for (unsigned K = 0; K < VF; K += Sz) { + OrdersType CurrentOrder(TE.ReorderIndices); + SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)}; + if (SubMask.front() == UndefMaskElem) + std::iota(SubMask.begin(), SubMask.end(), 0); + reorderOrder(CurrentOrder, SubMask); + transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; }); + std::advance(It, Sz); + } + if (all_of(enumerate(ResOrder), + [](const auto &Data) { return Data.index() == Data.value(); })) + return {}; // Use identity order. 
+ return ResOrder; + } if (TE.State == TreeEntry::Vectorize && (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) || (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) && !TE.isAltShuffle()) return TE.ReorderIndices; + if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) { + auto PHICompare = [](llvm::Value *V1, llvm::Value *V2) { + if (!V1->hasOneUse() || !V2->hasOneUse()) + return false; + auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin()); + auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin()); + if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1)) + if (auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) { + if (!areTwoInsertFromSameBuildVector( + IE1, IE2, + [](InsertElementInst *II) { return II->getOperand(0); })) + return false; + std::optional<unsigned> Idx1 = getInsertIndex(IE1); + std::optional<unsigned> Idx2 = getInsertIndex(IE2); + if (Idx1 == std::nullopt || Idx2 == std::nullopt) + return false; + return *Idx1 < *Idx2; + } + if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1)) + if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) { + if (EE1->getOperand(0) != EE2->getOperand(0)) + return false; + std::optional<unsigned> Idx1 = getExtractIndex(EE1); + std::optional<unsigned> Idx2 = getExtractIndex(EE2); + if (Idx1 == std::nullopt || Idx2 == std::nullopt) + return false; + return *Idx1 < *Idx2; + } + return false; + }; + auto IsIdentityOrder = [](const OrdersType &Order) { + for (unsigned Idx : seq<unsigned>(0, Order.size())) + if (Idx != Order[Idx]) + return false; + return true; + }; + if (!TE.ReorderIndices.empty()) + return TE.ReorderIndices; + DenseMap<Value *, unsigned> PhiToId; + SmallVector<Value *, 4> Phis; + OrdersType ResOrder(TE.Scalars.size()); + for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id) { + PhiToId[TE.Scalars[Id]] = Id; + Phis.push_back(TE.Scalars[Id]); + } + llvm::stable_sort(Phis, PHICompare); + for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id) + ResOrder[Id] = PhiToId[Phis[Id]]; + if (IsIdentityOrder(ResOrder)) + return {}; + return ResOrder; + } if (TE.State == TreeEntry::NeedToGather) { // TODO: add analysis of other gather nodes with extractelement // instructions and other values/instructions, not only undefs. @@ -3694,13 +4053,55 @@ Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE, return CurrentOrder; } } - if (Optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE)) + if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE)) return CurrentOrder; if (TE.Scalars.size() >= 4) - if (Optional<OrdersType> Order = findPartiallyOrderedLoads(TE)) + if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE)) return Order; } - return None; + return std::nullopt; +} + +/// Checks if the given mask is a "clustered" mask with the same clusters of +/// size \p Sz, which are not identity submasks. +static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask, + unsigned Sz) { + ArrayRef<int> FirstCluster = Mask.slice(0, Sz); + if (ShuffleVectorInst::isIdentityMask(FirstCluster)) + return false; + for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) { + ArrayRef<int> Cluster = Mask.slice(I, Sz); + if (Cluster != FirstCluster) + return false; + } + return true; +} + +void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const { + // Reorder reuses mask. 
+ reorderReuses(TE.ReuseShuffleIndices, Mask); + const unsigned Sz = TE.Scalars.size(); + // For vectorized and non-clustered reused no need to do anything else. + if (TE.State != TreeEntry::NeedToGather || + !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices, + Sz) || + !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz)) + return; + SmallVector<int> NewMask; + inversePermutation(TE.ReorderIndices, NewMask); + addMask(NewMask, TE.ReuseShuffleIndices); + // Clear reorder since it is going to be applied to the new mask. + TE.ReorderIndices.clear(); + // Try to improve gathered nodes with clustered reuses, if possible. + ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz); + SmallVector<unsigned> NewOrder(Slice.begin(), Slice.end()); + inversePermutation(NewOrder, NewMask); + reorderScalars(TE.Scalars, NewMask); + // Fill the reuses mask with the identity submasks. + for (auto *It = TE.ReuseShuffleIndices.begin(), + *End = TE.ReuseShuffleIndices.end(); + It != End; std::advance(It, Sz)) + std::iota(It, std::next(It, Sz), 0); } void BoUpSLP::reorderTopToBottom() { @@ -3710,6 +4111,9 @@ void BoUpSLP::reorderTopToBottom() { // their ordering. DenseMap<const TreeEntry *, OrdersType> GathersToOrders; + // Phi nodes can have preferred ordering based on their result users + DenseMap<const TreeEntry *, OrdersType> PhisToOrders; + // AltShuffles can also have a preferred ordering that leads to fewer // instructions, e.g., the addsub instruction in x86. DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders; @@ -3724,13 +4128,13 @@ void BoUpSLP::reorderTopToBottom() { // extracts. for_each(VectorizableTree, [this, &TTIRef, &VFToOrderedEntries, &GathersToOrders, &ExternalUserReorderMap, - &AltShufflesToOrders]( + &AltShufflesToOrders, &PhisToOrders]( const std::unique_ptr<TreeEntry> &TE) { // Look for external users that will probably be vectorized. SmallVector<OrdersType, 1> ExternalUserReorderIndices = findExternalStoreUsersReorderIndices(TE.get()); if (!ExternalUserReorderIndices.empty()) { - VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); + VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get()); ExternalUserReorderMap.try_emplace(TE.get(), std::move(ExternalUserReorderIndices)); } @@ -3750,13 +4154,13 @@ void BoUpSLP::reorderTopToBottom() { OpcodeMask.set(Lane); // If this pattern is supported by the target then we consider the order. if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) { - VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); + VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get()); AltShufflesToOrders.try_emplace(TE.get(), OrdersType()); } // TODO: Check the reverse order too. } - if (Optional<OrdersType> CurrentOrder = + if (std::optional<OrdersType> CurrentOrder = getReorderingData(*TE, /*TopToBottom=*/true)) { // Do not include ordering for nodes used in the alt opcode vectorization, // better to reorder them during bottom-to-top stage. 
If follow the order @@ -3778,14 +4182,17 @@ void BoUpSLP::reorderTopToBottom() { UserTE = UserTE->UserTreeIndices.back().UserTE; ++Cnt; } - VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); - if (TE->State != TreeEntry::Vectorize) + VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get()); + if (TE->State != TreeEntry::Vectorize || !TE->ReuseShuffleIndices.empty()) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); + if (TE->State == TreeEntry::Vectorize && + TE->getOpcode() == Instruction::PHI) + PhisToOrders.try_emplace(TE.get(), *CurrentOrder); } }); // Reorder the graph nodes according to their vectorization factor. - for (unsigned VF = VectorizableTree.front()->Scalars.size(); VF > 1; + for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1; VF /= 2) { auto It = VFToOrderedEntries.find(VF); if (It == VFToOrderedEntries.end()) @@ -3803,12 +4210,13 @@ void BoUpSLP::reorderTopToBottom() { for (const TreeEntry *OpTE : OrderedEntries) { // No need to reorder this nodes, still need to extend and to use shuffle, // just need to merge reordering shuffle and the reuse shuffle. - if (!OpTE->ReuseShuffleIndices.empty()) + if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE)) continue; // Count number of orders uses. - const auto &Order = [OpTE, &GathersToOrders, - &AltShufflesToOrders]() -> const OrdersType & { - if (OpTE->State == TreeEntry::NeedToGather) { + const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders, + &PhisToOrders]() -> const OrdersType & { + if (OpTE->State == TreeEntry::NeedToGather || + !OpTE->ReuseShuffleIndices.empty()) { auto It = GathersToOrders.find(OpTE); if (It != GathersToOrders.end()) return It->second; @@ -3818,14 +4226,28 @@ void BoUpSLP::reorderTopToBottom() { if (It != AltShufflesToOrders.end()) return It->second; } + if (OpTE->State == TreeEntry::Vectorize && + OpTE->getOpcode() == Instruction::PHI) { + auto It = PhisToOrders.find(OpTE); + if (It != PhisToOrders.end()) + return It->second; + } return OpTE->ReorderIndices; }(); // First consider the order of the external scalar users. auto It = ExternalUserReorderMap.find(OpTE); if (It != ExternalUserReorderMap.end()) { const auto &ExternalUserReorderIndices = It->second; - for (const OrdersType &ExtOrder : ExternalUserReorderIndices) - ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second; + // If the OpTE vector factor != number of scalars - use natural order, + // it is an attempt to reorder node with reused scalars but with + // external uses. + if (OpTE->getVectorFactor() != OpTE->Scalars.size()) { + OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second += + ExternalUserReorderIndices.size(); + } else { + for (const OrdersType &ExtOrder : ExternalUserReorderIndices) + ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second; + } // No other useful reorder data in this entry. if (Order.empty()) continue; @@ -3885,7 +4307,7 @@ void BoUpSLP::reorderTopToBottom() { "All users must be of VF size."); // Update ordering of the operands with the smaller VF than the given // one. 
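The OrdersUses accounting above is essentially a vote count: every tree entry, and every external store user, deposits one vote for its preferred lane order, and the most popular order wins for the whole VF group. A compilable sketch of that counting scheme, assuming plain std::vector<unsigned> orders with an empty vector standing for the natural order (illustrative only, not the patch's data structures):

#include <iostream>
#include <map>
#include <vector>

using OrdersTy = std::vector<unsigned>; // empty == identity/natural order

// Picks the order with the most votes; ties go to the smallest key, which is
// good enough for a sketch.
static OrdersTy pickBestOrder(const std::vector<OrdersTy> &Preferences) {
  std::map<OrdersTy, unsigned> OrdersUses;
  for (const OrdersTy &Order : Preferences)
    ++OrdersUses[Order];
  OrdersTy Best;
  unsigned BestCount = 0;
  for (const auto &[Order, Count] : OrdersUses)
    if (Count > BestCount) {
      Best = Order;
      BestCount = Count;
    }
  return Best;
}

int main() {
  // Two entries prefer <1,0,3,2>, one prefers the natural order, so
  // <1,0,3,2> wins the vote.
  std::vector<OrdersTy> Prefs = {{1, 0, 3, 2}, {}, {1, 0, 3, 2}};
  for (unsigned Idx : pickBestOrder(Prefs))
    std::cout << Idx << ' ';
  std::cout << '\n'; // prints: 1 0 3 2
  return 0;
}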
- reorderReuses(TE->ReuseShuffleIndices, Mask); + reorderNodeWithReuses(*TE, Mask); } continue; } @@ -3982,10 +4404,10 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { const std::unique_ptr<TreeEntry> &TE) { if (TE->State != TreeEntry::Vectorize) NonVectorized.push_back(TE.get()); - if (Optional<OrdersType> CurrentOrder = + if (std::optional<OrdersType> CurrentOrder = getReorderingData(*TE, /*TopToBottom=*/false)) { OrderedEntries.insert(TE.get()); - if (TE->State != TreeEntry::Vectorize) + if (TE->State != TreeEntry::Vectorize || !TE->ReuseShuffleIndices.empty()) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); } }); @@ -4057,10 +4479,11 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { TreeEntry *OpTE = Op.second; if (!VisitedOps.insert(OpTE).second) continue; - if (!OpTE->ReuseShuffleIndices.empty()) + if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE)) continue; const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & { - if (OpTE->State == TreeEntry::NeedToGather) + if (OpTE->State == TreeEntry::NeedToGather || + !OpTE->ReuseShuffleIndices.empty()) return GathersToOrders.find(OpTE)->second; return OpTE->ReorderIndices; }(); @@ -4166,8 +4589,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { if (!VisitedOps.insert(TE).second) continue; if (TE->ReuseShuffleIndices.size() == BestOrder.size()) { - // Just reorder reuses indices. - reorderReuses(TE->ReuseShuffleIndices, Mask); + reorderNodeWithReuses(*TE, Mask); continue; } // Gathers are processed separately. @@ -4322,7 +4744,7 @@ BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const { return PtrToStoresMap; } -bool BoUpSLP::CanFormVector(const SmallVector<StoreInst *, 4> &StoresVec, +bool BoUpSLP::canFormVector(const SmallVector<StoreInst *, 4> &StoresVec, OrdersType &ReorderIndices) const { // We check whether the stores in StoreVec can form a vector by sorting them // and checking whether they are consecutive. @@ -4336,7 +4758,7 @@ bool BoUpSLP::CanFormVector(const SmallVector<StoreInst *, 4> &StoresVec, Value *S0Ptr = S0->getPointerOperand(); for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) { StoreInst *SI = StoresVec[Idx]; - Optional<int> Diff = + std::optional<int> Diff = getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE, /*StrictCheck=*/true); @@ -4416,7 +4838,7 @@ BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const { // If the stores are not consecutive then abandon this StoresVec. OrdersType ReorderIndices; - if (!CanFormVector(StoresVec, ReorderIndices)) + if (!canFormVector(StoresVec, ReorderIndices)) continue; // We now know that the scalars in StoresVec can form a vector instruction, @@ -4472,24 +4894,24 @@ static std::pair<size_t, size_t> generateKeySubkey( hash_code SubKey = hash_value(0); // Sort the loads by the distance between the pointers. if (auto *LI = dyn_cast<LoadInst>(V)) { - Key = hash_combine(hash_value(Instruction::Load), Key); + Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key); if (LI->isSimple()) SubKey = hash_value(LoadsSubkeyGenerator(Key, LI)); else - SubKey = hash_value(LI); + Key = SubKey = hash_value(LI); } else if (isVectorLikeInstWithConstOps(V)) { // Sort extracts by the vector operands. 
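generateKeySubkey buckets candidate scalars twice: a coarse Key (roughly "same kind of instruction") and a finer SubKey (for example loads grouped by base pointer, extracts by source vector), so only plausibly compatible scalars reach the expensive checks. A small sketch of such two-level bucketing using std::hash; the Scalar struct and field names are made up for illustration and this is not LLVM's hash_combine machinery:

#include <cstddef>
#include <functional>
#include <iostream>
#include <map>
#include <string>
#include <vector>

struct Scalar {
  std::string Opcode; // e.g. "load", "extractelement"
  std::string Group;  // e.g. base pointer / source vector name
};

int main() {
  std::vector<Scalar> Scalars = {{"load", "%base0"},
                                 {"load", "%base0"},
                                 {"load", "%base1"},
                                 {"extractelement", "%vec"}};
  // First level: hash of the opcode. Second level: refined by the group the
  // scalar reads from, so loads from different bases land in different
  // sub-buckets even though they share a Key.
  std::map<std::pair<std::size_t, std::size_t>, unsigned> Buckets;
  std::hash<std::string> H;
  for (const Scalar &S : Scalars)
    ++Buckets[{H(S.Opcode), H(S.Opcode) ^ (H(S.Group) << 1)}];
  std::cout << "distinct (Key, SubKey) buckets: " << Buckets.size() << '\n';
  // Expected: 3 buckets (load/%base0 twice, load/%base1, extractelement/%vec).
  return 0;
}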
if (isa<ExtractElementInst, UndefValue>(V)) Key = hash_value(Value::UndefValueVal + 1); if (auto *EI = dyn_cast<ExtractElementInst>(V)) { - if (!isUndefVector(EI->getVectorOperand()) && + if (!isUndefVector(EI->getVectorOperand()).all() && !isa<UndefValue>(EI->getIndexOperand())) SubKey = hash_value(EI->getVectorOperand()); } } else if (auto *I = dyn_cast<Instruction>(V)) { // Sort other instructions just by the opcodes except for CMPInst. // For CMP also sort by the predicate kind. - if ((isa<BinaryOperator>(I) || isa<CastInst>(I)) && + if ((isa<BinaryOperator, CastInst>(I)) && isValidForAlternation(I->getOpcode())) { if (AllowAlternate) Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0); @@ -4504,7 +4926,7 @@ static std::pair<size_t, size_t> generateKeySubkey( if (isa<CastInst>(I)) { std::pair<size_t, size_t> OpVals = generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator, - /*=AllowAlternate*/ true); + /*AllowAlternate=*/true); Key = hash_combine(OpVals.first, Key); SubKey = hash_combine(OpVals.first, SubKey); } @@ -4547,6 +4969,13 @@ static std::pair<size_t, size_t> generateKeySubkey( return std::make_pair(Key, SubKey); } +/// Checks if the specified instruction \p I is an alternate operation for +/// the given \p MainOp and \p AltOp instructions. +static bool isAlternateInstruction(const Instruction *I, + const Instruction *MainOp, + const Instruction *AltOp, + const TargetLibraryInfo &TLI); + void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); @@ -4557,7 +4986,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, &UserTreeIdx, this](const InstructionsState &S) { // Check that every instruction appears once in this bundle. 
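What the duplicate check below amounts to can be shown on a plain value list: repeated scalars collapse into a unique bundle while ReuseShuffleIndicies records, per original lane, which unique lane to copy from. A self-contained sketch with strings standing in for IR values (the names are made up):

#include <cassert>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
  std::vector<std::string> VL = {"%a", "%b", "%a", "%c"};
  std::vector<std::string> UniqueValues;
  std::vector<int> ReuseShuffleIndices; // one entry per original lane
  std::unordered_map<std::string, int> UniquePositions;
  for (const std::string &V : VL) {
    auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
    ReuseShuffleIndices.push_back(Res.first->second);
    if (Res.second)
      UniqueValues.push_back(V);
  }
  // Vectorize {%a, %b, %c} once, then shuffle it with <0,1,0,2> to
  // reproduce the original 4-lane bundle {%a, %b, %a, %c}.
  assert((UniqueValues == std::vector<std::string>{"%a", "%b", "%c"}));
  assert((ReuseShuffleIndices == std::vector<int>{0, 1, 0, 2}));
  return 0;
}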
- DenseMap<Value *, unsigned> UniquePositions; + DenseMap<Value *, unsigned> UniquePositions(VL.size()); for (Value *V : VL) { if (isConstant(V)) { ReuseShuffleIndicies.emplace_back( @@ -4583,7 +5012,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, })) || !llvm::isPowerOf2_32(NumUniqueScalarValues)) { LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); return false; } VL = UniqueValues; @@ -4591,7 +5020,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, return true; }; - InstructionsState S = getSameOpcode(VL); + InstructionsState S = getSameOpcode(VL, *TLI); // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of // a load), in which case peek through to include it in the tree, without @@ -4607,7 +5036,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, })))) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); if (TryToFindDuplicates(S)) - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); return; } @@ -4618,7 +5047,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n"); if (TryToFindDuplicates(S)) - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); return; } @@ -4627,14 +5056,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (S.OpValue->getType()->isVectorTy() && !isa<InsertElementInst>(S.OpValue)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); return; } if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue)) if (SI->getValueOperand()->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); return; } @@ -4715,14 +5144,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, BB && sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE, SortedIndices)); - if (allConstant(VL) || isSplat(VL) || !AreAllSameInsts || + if (!AreAllSameInsts || allConstant(VL) || isSplat(VL) || (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>( S.OpValue) && !all_of(VL, isVectorLikeInstWithConstOps)) || NotProfitableForVectorization(VL)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. 
\n"); if (TryToFindDuplicates(S)) - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); return; } @@ -4736,7 +5165,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (EphValues.count(V)) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V << ") is ephemeral.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); return; } } @@ -4748,7 +5177,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (!E->isSame(VL)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); if (TryToFindDuplicates(S)) - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); return; } @@ -4768,7 +5197,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V << ") is already in tree.\n"); if (TryToFindDuplicates(S)) - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); return; } @@ -4780,7 +5209,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (UserIgnoreList && UserIgnoreList->contains(V)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); if (TryToFindDuplicates(S)) - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); return; } @@ -4789,9 +5218,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Special processing for sorted pointers for ScatterVectorize node with // constant indeces only. - if (AreAllSameInsts && !(S.getOpcode() && allSameBlock(VL)) && - UserTreeIdx.UserTE && - UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize) { + if (AreAllSameInsts && UserTreeIdx.UserTE && + UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize && + !(S.getOpcode() && allSameBlock(VL))) { assert(S.OpValue->getType()->isPointerTy() && count_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); }) >= 2 && @@ -4799,7 +5228,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Reset S to make it GetElementPtr kind of node. const auto *It = find_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); }); assert(It != VL.end() && "Expected at least one GEP."); - S = getSameOpcode(*It); + S = getSameOpcode(*It, *TLI); } // Check that all of the users of the scalars that we want to vectorize are @@ -4811,7 +5240,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Don't go into unreachable blocks. They may contain instructions with // dependency cycles which confuse the final scheduling. LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); return; } @@ -4820,7 +5249,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // place to insert a shuffle if we need to, so just avoid that issue. 
if (isa<CatchSwitchInst>(BB->getTerminator())) { LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); return; } @@ -4834,7 +5263,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, BlockScheduling &BS = *BSRef; - Optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S); + std::optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S); #ifdef EXPENSIVE_CHECKS // Make sure we didn't break any internal invariants BS.verify(); @@ -4844,7 +5273,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, assert((!BS.getScheduleData(VL0) || !BS.getScheduleData(VL0)->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); return; } @@ -4864,7 +5293,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (terminator use).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); return; } @@ -4931,7 +5360,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, return; } LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); BS.cancelScheduling(VL, VL0); return; @@ -4944,7 +5373,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, ValueSet SourceVectors; for (Value *V : VL) { SourceVectors.insert(cast<Instruction>(V)->getOperand(0)); - assert(getInsertIndex(V) != None && "Non-constant or undef index?"); + assert(getInsertIndex(V) != std::nullopt && + "Non-constant or undef index?"); } if (count_if(VL, [&SourceVectors](Value *V) { @@ -4953,7 +5383,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Found 2nd source vector - cancel. LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with " "different source vectors.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); BS.cancelScheduling(VL, VL0); return; } @@ -4979,7 +5409,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (IsIdentity) CurrentOrder.clear(); TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - None, CurrentOrder); + std::nullopt, CurrentOrder); LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n"); constexpr int NumOps = 2; @@ -5003,8 +5433,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, SmallVector<Value *> PointerOps; OrdersType CurrentOrder; TreeEntry *TE = nullptr; - switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, *LI, CurrentOrder, - PointerOps)) { + switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, *LI, *TLI, + CurrentOrder, PointerOps)) { case LoadsState::Vectorize: if (CurrentOrder.empty()) { // Original loads are consecutive and does not require reordering. 
@@ -5030,7 +5460,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, break; case LoadsState::Gather: BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); #ifndef NDEBUG Type *ScalarTy = VL0->getType(); @@ -5065,7 +5495,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, Type *Ty = cast<Instruction>(V)->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); @@ -5098,7 +5528,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) || Cmp->getOperand(0)->getType() != ComparedTy) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); @@ -5115,7 +5545,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Commutative predicate - collect + sort operands of the instructions // so that each side is more likely to have the same opcode. assert(P0 == SwapP0 && "Commutative Predicate mismatch"); - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); + reorderInputsAccordingToOpcode(VL, Left, Right, *TLI, *DL, *SE, *this); } else { // Collect operands - commute if it uses the swapped predicate. for (Value *V : VL) { @@ -5162,7 +5592,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // have the same opcode. 
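reorderInputsAccordingToOpcode tries to commute per-lane operands of commutative instructions so each operand column stays as homogeneous as possible down the lanes, which keeps the operand bundles themselves vectorizable. A greedy sketch that only keys on value identity with lane 0; the real heuristic also scores opcodes, constants and splats:

#include <cassert>
#include <string>
#include <utility>
#include <vector>

using Operands = std::pair<std::string, std::string>; // (LHS, RHS) per lane

// For commutative lanes, swap operands so that values equal to lane 0's LHS
// stay in the left column.
static void reorderOperands(std::vector<Operands> &Lanes) {
  if (Lanes.empty())
    return;
  const std::string Anchor = Lanes.front().first;
  for (Operands &L : Lanes)
    if (L.second == Anchor)
      std::swap(L.first, L.second);
}

int main() {
  // Lanes: x+a, b+x, x+c  ->  the left column becomes {x, x, x}.
  std::vector<Operands> Lanes = {{"x", "a"}, {"b", "x"}, {"x", "c"}};
  reorderOperands(Lanes);
  for (const Operands &L : Lanes)
    assert(L.first == "x");
  return 0;
}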
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) { ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); + reorderInputsAccordingToOpcode(VL, Left, Right, *TLI, *DL, *SE, *this); TE->setOperand(0, Left); TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); @@ -5190,7 +5620,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (I->getNumOperands() != 2) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); return; } @@ -5208,7 +5638,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); return; } @@ -5230,7 +5660,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); return; } @@ -5298,7 +5728,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n"); return; @@ -5313,7 +5743,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, auto *SI = cast<StoreInst>(V); if (!SI->isSimple()) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n"); return; @@ -5336,7 +5766,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, Ptr0 = PointerOps[CurrentOrder.front()]; PtrN = PointerOps[CurrentOrder.back()]; } - Optional<int> Dist = + std::optional<int> Dist = getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE); // Check that the sorted pointer operands are consecutive. 
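The consecutiveness test reduces to pointer arithmetic once the accesses are sorted: the element distance between the first and last pointer must equal the number of lanes minus one. A sketch of that check over raw pointers of one element type; the real code obtains the distance from getPointersDiff via SCEV rather than direct subtraction:

#include <cassert>
#include <cstddef>

// Element distance between two pointers of the same element type.
template <typename T>
static std::ptrdiff_t elementDiff(const T *P0, const T *PN) {
  return PN - P0;
}

int main() {
  int Buf[8] = {};
  // Four stores to Buf[2..5]: the sorted first/last pointers are 3 elements
  // apart, which equals lane count - 1, so the group is consecutive.
  const int *Ptr0 = &Buf[2], *PtrN = &Buf[5];
  const std::size_t NumLanes = 4;
  assert(elementDiff(Ptr0, PtrN) == static_cast<std::ptrdiff_t>(NumLanes - 1));
  // A gap (Buf[2] and Buf[6] as extremes for 4 lanes) fails the test.
  assert(elementDiff(&Buf[2], &Buf[6]) !=
         static_cast<std::ptrdiff_t>(NumLanes - 1));
  return 0;
}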
if (static_cast<unsigned>(*Dist) == VL.size() - 1) { @@ -5361,7 +5791,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; @@ -5379,7 +5809,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (!VecFunc && !isTriviallyVectorizable(ID)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; @@ -5398,7 +5828,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) || !CI->hasIdenticalOperandBundleSchema(*CI2)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V << "\n"); @@ -5411,7 +5841,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, Value *A1J = CI2->getArgOperand(j); if (ScalarArgs[j] != A1J) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI << " argument " << ScalarArgs[j] << "!=" << A1J @@ -5426,7 +5856,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!=" << *V << '\n'); @@ -5457,7 +5887,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // then do not vectorize this instruction. if (!S.isAltShuffle()) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; @@ -5473,31 +5903,28 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (!CI || all_of(VL, [](Value *V) { return cast<CmpInst>(V)->isCommutative(); })) { - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); + reorderInputsAccordingToOpcode(VL, Left, Right, *TLI, *DL, *SE, + *this); } else { - CmpInst::Predicate P0 = CI->getPredicate(); - CmpInst::Predicate AltP0 = cast<CmpInst>(S.AltOp)->getPredicate(); - assert(P0 != AltP0 && + auto *MainCI = cast<CmpInst>(S.MainOp); + auto *AltCI = cast<CmpInst>(S.AltOp); + CmpInst::Predicate MainP = MainCI->getPredicate(); + CmpInst::Predicate AltP = AltCI->getPredicate(); + assert(MainP != AltP && "Expected different main/alternate predicates."); - CmpInst::Predicate AltP0Swapped = CmpInst::getSwappedPredicate(AltP0); - Value *BaseOp0 = VL0->getOperand(0); - Value *BaseOp1 = VL0->getOperand(1); // Collect operands - commute if it uses the swapped predicate or // alternate operation. 
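The commuting rule below rests on the identity that "a pred b" is the same comparison as "b swappedPred a" (x < y versus y > x), so lanes written with the swapped predicate can be normalized by exchanging LHS and RHS. A tiny sketch with a made-up predicate enum rather than LLVM's CmpInst API:

#include <cassert>
#include <utility>

enum class Pred { LT, GT, LE, GE };

static Pred swapped(Pred P) {
  switch (P) {
  case Pred::LT: return Pred::GT;
  case Pred::GT: return Pred::LT;
  case Pred::LE: return Pred::GE;
  case Pred::GE: return Pred::LE;
  }
  return P;
}

struct Cmp { Pred P; int LHS, RHS; };

// Normalize a lane to the main predicate by commuting operands when the
// lane uses the swapped form, mirroring the loop above.
static Cmp normalizeTo(Pred MainP, Cmp C) {
  if (C.P != MainP && swapped(C.P) == MainP) {
    std::swap(C.LHS, C.RHS);
    C.P = MainP;
  }
  return C;
}

int main() {
  // A lane written as "y > x" becomes "x < y" after commuting.
  Cmp Lane = normalizeTo(Pred::LT, {Pred::GT, /*LHS=*/2, /*RHS=*/1});
  assert(Lane.P == Pred::LT && Lane.LHS == 1 && Lane.RHS == 2);
  return 0;
}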
for (Value *V : VL) { auto *Cmp = cast<CmpInst>(V); Value *LHS = Cmp->getOperand(0); Value *RHS = Cmp->getOperand(1); - CmpInst::Predicate CurrentPred = Cmp->getPredicate(); - if (P0 == AltP0Swapped) { - if (CI != Cmp && S.AltOp != Cmp && - ((P0 == CurrentPred && - !areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)) || - (AltP0 == CurrentPred && - areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)))) + + if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) { + if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate())) + std::swap(LHS, RHS); + } else { + if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate())) std::swap(LHS, RHS); - } else if (P0 != CurrentPred && AltP0 != CurrentPred) { - std::swap(LHS, RHS); } Left.push_back(LHS); Right.push_back(RHS); @@ -5523,7 +5950,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } default: BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; @@ -5534,8 +5961,7 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { unsigned N = 1; Type *EltTy = T; - while (isa<StructType>(EltTy) || isa<ArrayType>(EltTy) || - isa<VectorType>(EltTy)) { + while (isa<StructType, ArrayType, VectorType>(EltTy)) { if (auto *ST = dyn_cast<StructType>(EltTy)) { // Check that struct is homogeneous. for (const auto *Ty : ST->elements()) @@ -5617,7 +6043,7 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) if (isa<UndefValue>(EE->getIndexOperand())) continue; - Optional<unsigned> Idx = getExtractIndex(Inst); + std::optional<unsigned> Idx = getExtractIndex(Inst); if (!Idx) break; const unsigned ExtIdx = *Idx; @@ -5785,32 +6211,388 @@ buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices, } } -/// Checks if the specified instruction \p I is an alternate operation for the -/// given \p MainOp and \p AltOp instructions. 
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, - const Instruction *AltOp) { - if (auto *CI0 = dyn_cast<CmpInst>(MainOp)) { - auto *AltCI0 = cast<CmpInst>(AltOp); + const Instruction *AltOp, + const TargetLibraryInfo &TLI) { + if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) { + auto *AltCI = cast<CmpInst>(AltOp); + CmpInst::Predicate MainP = MainCI->getPredicate(); + CmpInst::Predicate AltP = AltCI->getPredicate(); + assert(MainP != AltP && "Expected different main/alternate predicates."); auto *CI = cast<CmpInst>(I); - CmpInst::Predicate P0 = CI0->getPredicate(); - CmpInst::Predicate AltP0 = AltCI0->getPredicate(); - assert(P0 != AltP0 && "Expected different main/alternate predicates."); - CmpInst::Predicate AltP0Swapped = CmpInst::getSwappedPredicate(AltP0); - CmpInst::Predicate CurrentPred = CI->getPredicate(); - if (P0 == AltP0Swapped) - return I == AltCI0 || - (I != MainOp && - !areCompatibleCmpOps(CI0->getOperand(0), CI0->getOperand(1), - CI->getOperand(0), CI->getOperand(1))); - return AltP0 == CurrentPred || AltP0Swapped == CurrentPred; + if (isCmpSameOrSwapped(MainCI, CI, TLI)) + return false; + if (isCmpSameOrSwapped(AltCI, CI, TLI)) + return true; + CmpInst::Predicate P = CI->getPredicate(); + CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P); + + assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) && + "CmpInst expected to match either main or alternate predicate or " + "their swap."); + (void)AltP; + return MainP != P && MainP != SwappedP; } return I->getOpcode() == AltOp->getOpcode(); } +TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> VL, + unsigned OpIdx) { + assert(!VL.empty()); + const auto *I0 = cast<Instruction>(*find_if(VL, Instruction::classof)); + const auto *Op0 = I0->getOperand(OpIdx); + + const bool IsConstant = all_of(VL, [&](Value *V) { + // TODO: We should allow undef elements here + const auto *I = dyn_cast<Instruction>(V); + if (!I) + return true; + auto *Op = I->getOperand(OpIdx); + return isConstant(Op) && !isa<UndefValue>(Op); + }); + const bool IsUniform = all_of(VL, [&](Value *V) { + // TODO: We should allow undef elements here + const auto *I = dyn_cast<Instruction>(V); + if (!I) + return false; + return I->getOperand(OpIdx) == Op0; + }); + const bool IsPowerOfTwo = all_of(VL, [&](Value *V) { + // TODO: We should allow undef elements here + const auto *I = dyn_cast<Instruction>(V); + if (!I) { + assert((isa<UndefValue>(V) || + I0->getOpcode() == Instruction::GetElementPtr) && + "Expected undef or GEP."); + return true; + } + auto *Op = I->getOperand(OpIdx); + if (auto *CI = dyn_cast<ConstantInt>(Op)) + return CI->getValue().isPowerOf2(); + return false; + }); + const bool IsNegatedPowerOfTwo = all_of(VL, [&](Value *V) { + // TODO: We should allow undef elements here + const auto *I = dyn_cast<Instruction>(V); + if (!I) { + assert((isa<UndefValue>(V) || + I0->getOpcode() == Instruction::GetElementPtr) && + "Expected undef or GEP."); + return true; + } + const auto *Op = I->getOperand(OpIdx); + if (auto *CI = dyn_cast<ConstantInt>(Op)) + return CI->getValue().isNegatedPowerOf2(); + return false; + }); + + TTI::OperandValueKind VK = TTI::OK_AnyValue; + if (IsConstant && IsUniform) + VK = TTI::OK_UniformConstantValue; + else if (IsConstant) + VK = TTI::OK_NonUniformConstantValue; + else if (IsUniform) + VK = TTI::OK_UniformValue; + + TTI::OperandValueProperties VP = TTI::OP_None; + VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP; + VP = IsNegatedPowerOfTwo ? 
TTI::OP_NegatedPowerOf2 : VP; + + return {VK, VP}; +} + +namespace { +/// The base class for shuffle instruction emission and shuffle cost estimation. +class BaseShuffleAnalysis { +protected: + /// Checks if the mask is an identity mask. + /// \param IsStrict if is true the function returns false if mask size does + /// not match vector size. + static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy, + bool IsStrict) { + int Limit = Mask.size(); + int VF = VecTy->getNumElements(); + return (VF == Limit || !IsStrict) && + all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) && + ShuffleVectorInst::isIdentityMask(Mask); + } + + /// Tries to combine 2 different masks into single one. + /// \param LocalVF Vector length of the permuted input vector. \p Mask may + /// change the size of the vector, \p LocalVF is the original size of the + /// shuffled vector. + static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask, + ArrayRef<int> ExtMask) { + unsigned VF = Mask.size(); + SmallVector<int> NewMask(ExtMask.size(), UndefMaskElem); + for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) { + if (ExtMask[I] == UndefMaskElem) + continue; + int MaskedIdx = Mask[ExtMask[I] % VF]; + NewMask[I] = + MaskedIdx == UndefMaskElem ? UndefMaskElem : MaskedIdx % LocalVF; + } + Mask.swap(NewMask); + } + + /// Looks through shuffles trying to reduce final number of shuffles in the + /// code. The function looks through the previously emitted shuffle + /// instructions and properly mark indices in mask as undef. + /// For example, given the code + /// \code + /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0> + /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0> + /// \endcode + /// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will + /// look through %s1 and %s2 and select vectors %0 and %1 with mask + /// <0, 1, 2, 3> for the shuffle. + /// If 2 operands are of different size, the smallest one will be resized and + /// the mask recalculated properly. + /// For example, given the code + /// \code + /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0> + /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0> + /// \endcode + /// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will + /// look through %s1 and %s2 and select vectors %0 and %1 with mask + /// <0, 1, 2, 3> for the shuffle. + /// So, it tries to transform permutations to simple vector merge, if + /// possible. + /// \param V The input vector which must be shuffled using the given \p Mask. + /// If the better candidate is found, \p V is set to this best candidate + /// vector. + /// \param Mask The input mask for the shuffle. If the best candidate is found + /// during looking-through-shuffles attempt, it is updated accordingly. + /// \param SinglePermute true if the shuffle operation is originally a + /// single-value-permutation. In this case the look-through-shuffles procedure + /// may look for resizing shuffles as the best candidates. + /// \return true if the shuffle results in the non-resizing identity shuffle + /// (and thus can be ignored), false - otherwise. + static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask, + bool SinglePermute) { + Value *Op = V; + ShuffleVectorInst *IdentityOp = nullptr; + SmallVector<int> IdentityMask; + while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) { + // Exit if not a fixed vector type or changing size shuffle. 
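combineMasks composes two permutations: if the outer shuffle reads element ExtMask[I] of a vector that was itself produced with Mask, the combined entry is Mask[ExtMask[I]]. A standalone sketch on plain int masks that skips the resizing and undef handling of the real helper:

#include <cassert>
#include <vector>

// Composes Outer applied after Inner: Result[I] = Inner[Outer[I]].
static std::vector<int> composeMasks(const std::vector<int> &Inner,
                                     const std::vector<int> &Outer) {
  std::vector<int> Result(Outer.size(), -1);
  for (size_t I = 0; I < Outer.size(); ++I)
    if (Outer[I] >= 0)
      Result[I] = Inner[Outer[I]];
  return Result;
}

int main() {
  // %s = shuffle %v, poison, <3,2,1,0>   ; Inner reverses %v
  // %r = shuffle %s, poison, <1,0,3,2>   ; Outer swaps pairs of %s
  // Overall %r reads %v with <2,3,0,1>, so a single shuffle suffices.
  std::vector<int> Combined = composeMasks({3, 2, 1, 0}, {1, 0, 3, 2});
  assert((Combined == std::vector<int>{2, 3, 0, 1}));
  return 0;
}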
+ auto *SVTy = dyn_cast<FixedVectorType>(SV->getType()); + if (!SVTy) + break; + // Remember the identity or broadcast mask, if it is not a resizing + // shuffle. If no better candidates are found, this Op and Mask will be + // used in the final shuffle. + if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) { + if (!IdentityOp || !SinglePermute || + (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) && + !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask))) { + IdentityOp = SV; + // Store current mask in the IdentityMask so later we did not lost + // this info if IdentityOp is selected as the best candidate for the + // permutation. + IdentityMask.assign(Mask); + } + } + // Remember the broadcast mask. If no better candidates are found, this Op + // and Mask will be used in the final shuffle. + // Zero splat can be used as identity too, since it might be used with + // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling. + // E.g. if need to shuffle the vector with the mask <3, 1, 2, 0>, which is + // expensive, the analysis founds out, that the source vector is just a + // broadcast, this original mask can be transformed to identity mask <0, + // 1, 2, 3>. + // \code + // %0 = shuffle %v, poison, zeroinitalizer + // %res = shuffle %0, poison, <3, 1, 2, 0> + // \endcode + // may be transformed to + // \code + // %0 = shuffle %v, poison, zeroinitalizer + // %res = shuffle %0, poison, <0, 1, 2, 3> + // \endcode + if (SV->isZeroEltSplat()) { + IdentityOp = SV; + IdentityMask.assign(Mask); + } + int LocalVF = Mask.size(); + if (auto *SVOpTy = + dyn_cast<FixedVectorType>(SV->getOperand(0)->getType())) + LocalVF = SVOpTy->getNumElements(); + SmallVector<int> ExtMask(Mask.size(), UndefMaskElem); + for (auto [Idx, I] : enumerate(Mask)) { + if (I == UndefMaskElem) + continue; + ExtMask[Idx] = SV->getMaskValue(I); + } + bool IsOp1Undef = + isUndefVector(SV->getOperand(0), + buildUseMask(LocalVF, ExtMask, UseMask::FirstArg)) + .all(); + bool IsOp2Undef = + isUndefVector(SV->getOperand(1), + buildUseMask(LocalVF, ExtMask, UseMask::SecondArg)) + .all(); + if (!IsOp1Undef && !IsOp2Undef) { + // Update mask and mark undef elems. + for (int &I : Mask) { + if (I == UndefMaskElem) + continue; + if (SV->getMaskValue(I % SV->getShuffleMask().size()) == + UndefMaskElem) + I = UndefMaskElem; + } + break; + } + SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(), + SV->getShuffleMask().end()); + combineMasks(LocalVF, ShuffleMask, Mask); + Mask.swap(ShuffleMask); + if (IsOp2Undef) + Op = SV->getOperand(0); + else + Op = SV->getOperand(1); + } + if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType()); + !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute)) { + if (IdentityOp) { + V = IdentityOp; + assert(Mask.size() == IdentityMask.size() && + "Expected masks of same sizes."); + // Clear known poison elements. + for (auto [I, Idx] : enumerate(Mask)) + if (Idx == UndefMaskElem) + IdentityMask[I] = UndefMaskElem; + Mask.swap(IdentityMask); + auto *Shuffle = dyn_cast<ShuffleVectorInst>(V); + return SinglePermute && + (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()), + /*IsStrict=*/true) || + (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() && + Shuffle->isZeroEltSplat() && + ShuffleVectorInst::isZeroEltSplatMask(Mask))); + } + V = Op; + return false; + } + V = Op; + return true; + } + + /// Smart shuffle instruction emission, walks through shuffles trees and + /// tries to find the best matching vector for the actual shuffle + /// instruction. 
+ template <typename ShuffleBuilderTy> + static Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask, + ShuffleBuilderTy &Builder) { + assert(V1 && "Expected at least one vector value."); + int VF = Mask.size(); + if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType())) + VF = FTy->getNumElements(); + if (V2 && + !isUndefVector(V2, buildUseMask(VF, Mask, UseMask::SecondArg)).all()) { + // Peek through shuffles. + Value *Op1 = V1; + Value *Op2 = V2; + int VF = + cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue(); + SmallVector<int> CombinedMask1(Mask.size(), UndefMaskElem); + SmallVector<int> CombinedMask2(Mask.size(), UndefMaskElem); + for (int I = 0, E = Mask.size(); I < E; ++I) { + if (Mask[I] < VF) + CombinedMask1[I] = Mask[I]; + else + CombinedMask2[I] = Mask[I] - VF; + } + Value *PrevOp1; + Value *PrevOp2; + do { + PrevOp1 = Op1; + PrevOp2 = Op2; + (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false); + (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false); + // Check if we have 2 resizing shuffles - need to peek through operands + // again. + if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1)) + if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) { + SmallVector<int> ExtMask1(Mask.size(), UndefMaskElem); + for (auto [Idx, I] : enumerate(CombinedMask1)) { + if (I == UndefMaskElem) + continue; + ExtMask1[Idx] = SV1->getMaskValue(I); + } + SmallBitVector UseMask1 = buildUseMask( + cast<FixedVectorType>(SV1->getOperand(1)->getType()) + ->getNumElements(), + ExtMask1, UseMask::SecondArg); + SmallVector<int> ExtMask2(CombinedMask2.size(), UndefMaskElem); + for (auto [Idx, I] : enumerate(CombinedMask2)) { + if (I == UndefMaskElem) + continue; + ExtMask2[Idx] = SV2->getMaskValue(I); + } + SmallBitVector UseMask2 = buildUseMask( + cast<FixedVectorType>(SV2->getOperand(1)->getType()) + ->getNumElements(), + ExtMask2, UseMask::SecondArg); + if (SV1->getOperand(0)->getType() == + SV2->getOperand(0)->getType() && + SV1->getOperand(0)->getType() != SV1->getType() && + isUndefVector(SV1->getOperand(1), UseMask1).all() && + isUndefVector(SV2->getOperand(1), UseMask2).all()) { + Op1 = SV1->getOperand(0); + Op2 = SV2->getOperand(0); + SmallVector<int> ShuffleMask1(SV1->getShuffleMask().begin(), + SV1->getShuffleMask().end()); + int LocalVF = ShuffleMask1.size(); + if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType())) + LocalVF = FTy->getNumElements(); + combineMasks(LocalVF, ShuffleMask1, CombinedMask1); + CombinedMask1.swap(ShuffleMask1); + SmallVector<int> ShuffleMask2(SV2->getShuffleMask().begin(), + SV2->getShuffleMask().end()); + LocalVF = ShuffleMask2.size(); + if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType())) + LocalVF = FTy->getNumElements(); + combineMasks(LocalVF, ShuffleMask2, CombinedMask2); + CombinedMask2.swap(ShuffleMask2); + } + } + } while (PrevOp1 != Op1 || PrevOp2 != Op2); + Builder.resizeToMatch(Op1, Op2); + VF = std::max(cast<VectorType>(Op1->getType()) + ->getElementCount() + .getKnownMinValue(), + cast<VectorType>(Op2->getType()) + ->getElementCount() + .getKnownMinValue()); + for (int I = 0, E = Mask.size(); I < E; ++I) { + if (CombinedMask2[I] != UndefMaskElem) { + assert(CombinedMask1[I] == UndefMaskElem && + "Expected undefined mask element"); + CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF); + } + } + return Builder.createShuffleVector( + Op1, Op1 == Op2 ? 
PoisonValue::get(Op1->getType()) : Op2, + CombinedMask1); + } + if (isa<PoisonValue>(V1)) + return PoisonValue::get(FixedVectorType::get( + cast<VectorType>(V1->getType())->getElementType(), Mask.size())); + SmallVector<int> NewMask(Mask.begin(), Mask.end()); + bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true); + assert(V1 && "Expected non-null value after looking through shuffles."); + + if (!IsIdentity) + return Builder.createShuffleVector(V1, NewMask); + return V1; + } +}; +} // namespace + InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals) { - ArrayRef<Value*> VL = E->Scalars; + ArrayRef<Value *> VL = E->Scalars; Type *ScalarTy = VL[0]->getType(); if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) @@ -5832,9 +6614,12 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); // FIXME: it tries to fix a problem with MSVC buildbots. - TargetTransformInfo &TTIRef = *TTI; - auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL, VecTy, - VectorizedVals, E](InstructionCost &Cost) { + TargetTransformInfo *TTI = this->TTI; + auto AdjustExtractsCost = [=](InstructionCost &Cost) { + // If the resulting type is scalarized, do not adjust the cost. + unsigned VecNumParts = TTI->getNumberOfParts(VecTy); + if (VecNumParts == VecTy->getNumElements()) + return; DenseMap<Value *, int> ExtractVectorsTys; SmallPtrSet<Value *, 4> CheckedExtracts; for (auto *V : VL) { @@ -5852,12 +6637,11 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, (VE && VE != E)) continue; auto *EE = cast<ExtractElementInst>(V); - Optional<unsigned> EEIdx = getExtractIndex(EE); + std::optional<unsigned> EEIdx = getExtractIndex(EE); if (!EEIdx) continue; unsigned Idx = *EEIdx; - if (TTIRef.getNumberOfParts(VecTy) != - TTIRef.getNumberOfParts(EE->getVectorOperandType())) { + if (VecNumParts != TTI->getNumberOfParts(EE->getVectorOperandType())) { auto It = ExtractVectorsTys.try_emplace(EE->getVectorOperand(), Idx).first; It->getSecond() = std::min<int>(It->second, Idx); @@ -5865,23 +6649,23 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // Take credit for instruction that will become dead. if (EE->hasOneUse()) { Instruction *Ext = EE->user_back(); - if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) && - all_of(Ext->users(), - [](User *U) { return isa<GetElementPtrInst>(U); })) { + if (isa<SExtInst, ZExtInst>(Ext) && all_of(Ext->users(), [](User *U) { + return isa<GetElementPtrInst>(U); + })) { // Use getExtractWithExtendCost() to calculate the cost of // extractelement/ext pair. Cost -= - TTIRef.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(), - EE->getVectorOperandType(), Idx); + TTI->getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(), + EE->getVectorOperandType(), Idx); // Add back the cost of s|zext which is subtracted separately. - Cost += TTIRef.getCastInstrCost( + Cost += TTI->getCastInstrCost( Ext->getOpcode(), Ext->getType(), EE->getType(), TTI::getCastContextHint(Ext), CostKind, Ext); continue; } } - Cost -= TTIRef.getVectorInstrCost(Instruction::ExtractElement, - EE->getVectorOperandType(), Idx); + Cost -= TTI->getVectorInstrCost(*EE, EE->getVectorOperandType(), CostKind, + Idx); } // Add a cost for subvector extracts/inserts if required. 
for (const auto &Data : ExtractVectorsTys) { @@ -5889,13 +6673,13 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, unsigned NumElts = VecTy->getNumElements(); if (Data.second % NumElts == 0) continue; - if (TTIRef.getNumberOfParts(EEVTy) > TTIRef.getNumberOfParts(VecTy)) { + if (TTI->getNumberOfParts(EEVTy) > VecNumParts) { unsigned Idx = (Data.second / NumElts) * NumElts; unsigned EENumElts = EEVTy->getNumElements(); if (Idx + NumElts <= EENumElts) { Cost += - TTIRef.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - EEVTy, None, Idx, VecTy); + TTI->getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, + EEVTy, std::nullopt, CostKind, Idx, VecTy); } else { // Need to round up the subvector type vectorization factor to avoid a // crash in cost model functions. Make SubVT so that Idx + VF of SubVT @@ -5903,12 +6687,12 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, auto *SubVT = FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx); Cost += - TTIRef.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - EEVTy, None, Idx, SubVT); + TTI->getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, + EEVTy, std::nullopt, CostKind, Idx, SubVT); } } else { - Cost += TTIRef.getShuffleCost(TargetTransformInfo::SK_InsertSubvector, - VecTy, None, 0, EEVTy); + Cost += TTI->getShuffleCost(TargetTransformInfo::SK_InsertSubvector, + VecTy, std::nullopt, CostKind, 0, EEVTy); } } }; @@ -5917,13 +6701,36 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, return 0; if (isa<InsertElementInst>(VL[0])) return InstructionCost::getInvalid(); + SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end()); + // Build a mask out of the reorder indices and reorder scalars per this + // mask. + SmallVector<int> ReorderMask; + inversePermutation(E->ReorderIndices, ReorderMask); + if (!ReorderMask.empty()) + reorderScalars(GatheredScalars, ReorderMask); SmallVector<int> Mask; + std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle; SmallVector<const TreeEntry *> Entries; - Optional<TargetTransformInfo::ShuffleKind> Shuffle = - isGatherShuffledEntry(E, Mask, Entries); - if (Shuffle) { + // Do not try to look for reshuffled loads for gathered loads (they will be + // handled later), for vectorized scalars, and cases, which are definitely + // not profitable (splats and small gather nodes.) + if (E->getOpcode() != Instruction::Load || E->isAltShuffle() || + all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || + isSplat(E->Scalars) || + (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) + GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries); + if (GatherShuffle) { + // Remove shuffled elements from list of gathers. + for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { + if (Mask[I] != UndefMaskElem) + GatheredScalars[I] = PoisonValue::get(ScalarTy); + } + assert((Entries.size() == 1 || Entries.size() == 2) && + "Expected shuffle of 1 or 2 entries."); InstructionCost GatherCost = 0; - if (ShuffleVectorInst::isIdentityMask(Mask)) { + int Limit = Mask.size() * 2; + if (all_of(Mask, [=](int Idx) { return Idx < Limit; }) && + ShuffleVectorInst::isIdentityMask(Mask)) { // Perfect match in the graph, will reuse the previously vectorized // node. Cost is 0. LLVM_DEBUG( @@ -5942,8 +6749,10 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // previously vectorized nodes. Add the cost of the permutation rather // than gather. 
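The cost decision here boils down to: lanes that can be taken from already-vectorized nodes cost nothing when the mask is an identity and one permutation otherwise, and whatever is left still pays to be gathered. A toy sketch with made-up unit costs, illustrating only the shape of the formula, not TTI's actual numbers:

#include <cassert>
#include <vector>

// Toy unit costs; real numbers come from TargetTransformInfo.
constexpr int ShuffleCost = 1, InsertCost = 1;

static int gatherCost(const std::vector<int> &Mask /* -1 == not matched */) {
  bool Identity = true;
  int Remaining = 0;
  for (int I = 0, E = (int)Mask.size(); I < E; ++I) {
    if (Mask[I] == -1)
      ++Remaining;      // lane must still be gathered (insertelement)
    else if (Mask[I] != I)
      Identity = false; // reusing vectorized lanes, but permuted
  }
  int Cost = Remaining * InsertCost;
  if (Remaining < (int)Mask.size() && !Identity)
    Cost += ShuffleCost; // one permutation of the reused vector(s)
  return Cost;
}

int main() {
  assert(gatherCost({0, 1, 2, 3}) == 0);   // perfect match, free
  assert(gatherCost({1, 0, 3, 2}) == 1);   // reused but permuted
  assert(gatherCost({0, 1, -1, -1}) == 2); // half reused, half gathered
  return 0;
}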
::addMask(Mask, E->ReuseShuffleIndices); - GatherCost = TTI->getShuffleCost(*Shuffle, FinalVecTy, Mask); + GatherCost = TTI->getShuffleCost(*GatherShuffle, FinalVecTy, Mask); } + if (!all_of(GatheredScalars, UndefValue::classof)) + GatherCost += getGatherCost(GatheredScalars); return GatherCost; } if ((E->getOpcode() == Instruction::ExtractElement || @@ -5955,7 +6764,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // Check that gather of extractelements can be represented as just a // shuffle of a single/two vectors the scalars are extracted from. SmallVector<int> Mask; - Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = + std::optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isFixedVectorShuffle(VL, Mask); if (ShuffleKind) { // Found the bunch of extractelement instructions that must be gathered @@ -5975,9 +6784,24 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // broadcast. assert(VecTy == FinalVecTy && "No reused scalars expected for broadcast."); - return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, - /*Mask=*/None, /*Index=*/0, - /*SubTp=*/nullptr, /*Args=*/VL[0]); + const auto *It = + find_if(VL, [](Value *V) { return !isa<UndefValue>(V); }); + // If all values are undefs - consider cost free. + if (It == VL.end()) + return TTI::TCC_Free; + // Add broadcast for non-identity shuffle only. + bool NeedShuffle = + VL.front() != *It || !all_of(VL.drop_front(), UndefValue::classof); + InstructionCost InsertCost = + TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, + /*Index=*/0, PoisonValue::get(VecTy), *It); + return InsertCost + (NeedShuffle + ? TTI->getShuffleCost( + TargetTransformInfo::SK_Broadcast, VecTy, + /*Mask=*/std::nullopt, CostKind, + /*Index=*/0, + /*SubTp=*/nullptr, /*Args=*/VL[0]) + : TTI::TCC_Free); } InstructionCost ReuseShuffleCost = 0; if (NeedToShuffleReuses) @@ -6003,7 +6827,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, OrdersType CurrentOrder; LoadsState LS = canVectorizeLoads(Slice, Slice.front(), *TTI, *DL, *SE, *LI, - CurrentOrder, PointerOps); + *TLI, CurrentOrder, PointerOps); switch (LS) { case LoadsState::Vectorize: case LoadsState::ScatterVectorize: @@ -6046,9 +6870,10 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, InstructionCost ScalarsCost = 0; for (Value *V : VectorizedLoads) { auto *LI = cast<LoadInst>(V); - ScalarsCost += TTI->getMemoryOpCost( - Instruction::Load, LI->getType(), LI->getAlign(), - LI->getPointerAddressSpace(), CostKind, LI); + ScalarsCost += + TTI->getMemoryOpCost(Instruction::Load, LI->getType(), + LI->getAlign(), LI->getPointerAddressSpace(), + CostKind, TTI::OperandValueInfo(), LI); } auto *LI = cast<LoadInst>(E->getMainOp()); auto *LoadTy = FixedVectorType::get(LI->getType(), VF); @@ -6056,7 +6881,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, GatherCost += VectorizedCnt * TTI->getMemoryOpCost(Instruction::Load, LoadTy, Alignment, - LI->getPointerAddressSpace(), CostKind, LI); + LI->getPointerAddressSpace(), CostKind, + TTI::OperandValueInfo(), LI); GatherCost += ScatterVectorizeCnt * TTI->getGatherScatterOpCost( Instruction::Load, LoadTy, LI->getPointerOperand(), @@ -6064,8 +6890,9 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, if (NeedInsertSubvectorAnalysis) { // Add the cost for the subvectors insert. 
for (int I = VF, E = VL.size(); I < E; I += VF) - GatherCost += TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, - None, I, LoadTy); + GatherCost += + TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, + std::nullopt, CostKind, I, LoadTy); } return ReuseShuffleCost + GatherCost - ScalarsCost; } @@ -6101,240 +6928,306 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, Instruction *VL0 = E->getMainOp(); unsigned ShuffleOrOp = E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); + const unsigned Sz = VL.size(); + auto GetCostDiff = + [=](function_ref<InstructionCost(unsigned)> ScalarEltCost, + function_ref<InstructionCost(InstructionCost)> VectorCost) { + // Calculate the cost of this instruction. + InstructionCost ScalarCost = 0; + if (isa<CastInst, CmpInst, SelectInst, CallInst>(VL0)) { + // For some of the instructions no need to calculate cost for each + // particular instruction, we can use the cost of the single + // instruction x total number of scalar instructions. + ScalarCost = Sz * ScalarEltCost(0); + } else { + for (unsigned I = 0; I < Sz; ++I) + ScalarCost += ScalarEltCost(I); + } + + InstructionCost VecCost = VectorCost(CommonCost); + LLVM_DEBUG( + dumpTreeCosts(E, CommonCost, VecCost - CommonCost, ScalarCost)); + // Disable warnings for `this` and `E` are unused. Required for + // `dumpTreeCosts`. + (void)this; + (void)E; + return VecCost - ScalarCost; + }; + // Calculate cost difference from vectorizing set of GEPs. + // Negative value means vectorizing is profitable. + auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) { + InstructionCost CostSavings = 0; + for (Value *V : Ptrs) { + if (V == BasePtr) + continue; + auto *Ptr = dyn_cast<GetElementPtrInst>(V); + // GEPs may contain just addresses without instructions, considered free. + // GEPs with all constant indices also considered to have zero cost. + if (!Ptr || Ptr->hasAllConstantIndices()) + continue; + + // Here we differentiate two cases: when GEPs represent a regular + // vectorization tree node (and hence vectorized) and when the set is + // arguments of a set of loads or stores being vectorized. In the former + // case all the scalar GEPs will be removed as a result of vectorization. + // For any external uses of some lanes extract element instructions will + // be generated (which cost is estimated separately). For the latter case + // since the set of GEPs itself is not vectorized those used more than + // once will remain staying in vectorized code as well. So we should not + // count them as savings. + if (!Ptr->hasOneUse() && isa<LoadInst, StoreInst>(VL0)) + continue; + + // TODO: it is target dependent, so need to implement and then use a TTI + // interface. + CostSavings += TTI->getArithmeticInstrCost(Instruction::Add, + Ptr->getType(), CostKind); + } + LLVM_DEBUG(dbgs() << "SLP: Calculated GEPs cost savings or Tree:\n"; + E->dump()); + LLVM_DEBUG(dbgs() << "SLP: GEP cost saving = " << CostSavings << "\n"); + return InstructionCost() - CostSavings; + }; + switch (ShuffleOrOp) { - case Instruction::PHI: - return 0; + case Instruction::PHI: { + // Count reused scalars. + InstructionCost ScalarCost = 0; + SmallPtrSet<const TreeEntry *, 4> CountedOps; + for (Value *V : VL) { + auto *PHI = dyn_cast<PHINode>(V); + if (!PHI) + continue; - case Instruction::ExtractValue: - case Instruction::ExtractElement: { - // The common cost of removal ExtractElement/ExtractValue instructions + - // the cost of shuffles, if required to resuffle the original vector. 
- if (NeedToShuffleReuses) { - unsigned Idx = 0; - for (unsigned I : E->ReuseShuffleIndices) { - if (ShuffleOrOp == Instruction::ExtractElement) { - auto *EE = cast<ExtractElementInst>(VL[I]); - CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement, - EE->getVectorOperandType(), - *getExtractIndex(EE)); - } else { - CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement, - VecTy, Idx); - ++Idx; - } - } - Idx = EntryVF; - for (Value *V : VL) { - if (ShuffleOrOp == Instruction::ExtractElement) { - auto *EE = cast<ExtractElementInst>(V); - CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement, - EE->getVectorOperandType(), - *getExtractIndex(EE)); - } else { - --Idx; - CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement, - VecTy, Idx); - } - } - } - if (ShuffleOrOp == Instruction::ExtractValue) { - for (unsigned I = 0, E = VL.size(); I < E; ++I) { - auto *EI = cast<Instruction>(VL[I]); - // Take credit for instruction that will become dead. - if (EI->hasOneUse()) { - Instruction *Ext = EI->user_back(); - if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) && - all_of(Ext->users(), - [](User *U) { return isa<GetElementPtrInst>(U); })) { - // Use getExtractWithExtendCost() to calculate the cost of - // extractelement/ext pair. - CommonCost -= TTI->getExtractWithExtendCost( - Ext->getOpcode(), Ext->getType(), VecTy, I); - // Add back the cost of s|zext which is subtracted separately. - CommonCost += TTI->getCastInstrCost( - Ext->getOpcode(), Ext->getType(), EI->getType(), - TTI::getCastContextHint(Ext), CostKind, Ext); - continue; - } - } - CommonCost -= - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I); - } - } else { - AdjustExtractsCost(CommonCost); + ValueList Operands(PHI->getNumIncomingValues(), nullptr); + for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) { + Value *Op = PHI->getIncomingValue(I); + Operands[I] = Op; } - return CommonCost; + if (const TreeEntry *OpTE = getTreeEntry(Operands.front())) + if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second) + if (!OpTE->ReuseShuffleIndices.empty()) + ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() - + OpTE->Scalars.size()); } - case Instruction::InsertElement: { - assert(E->ReuseShuffleIndices.empty() && - "Unique insertelements only are expected."); - auto *SrcVecTy = cast<FixedVectorType>(VL0->getType()); - unsigned const NumElts = SrcVecTy->getNumElements(); - unsigned const NumScalars = VL.size(); - - unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy); - - unsigned OffsetBeg = *getInsertIndex(VL.front()); - unsigned OffsetEnd = OffsetBeg; - for (Value *V : VL.drop_front()) { - unsigned Idx = *getInsertIndex(V); - if (OffsetBeg > Idx) - OffsetBeg = Idx; - else if (OffsetEnd < Idx) - OffsetEnd = Idx; - } - unsigned VecScalarsSz = PowerOf2Ceil(NumElts); - if (NumOfParts > 0) - VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts); - unsigned VecSz = - (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) * - VecScalarsSz; - unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz); - unsigned InsertVecSz = std::min<unsigned>( - PowerOf2Ceil(OffsetEnd - OffsetBeg + 1), - ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * - VecScalarsSz); - bool IsWholeSubvector = - OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0); - // Check if we can safely insert a subvector. If it is not possible, just - // generate a whole-sized vector and shuffle the source vector and the new - // subvector. 
- if (OffsetBeg + InsertVecSz > VecSz) { - // Align OffsetBeg to generate correct mask. - OffsetBeg = alignDown(OffsetBeg, VecSz, Offset); - InsertVecSz = VecSz; - } - - APInt DemandedElts = APInt::getZero(NumElts); - // TODO: Add support for Instruction::InsertValue. - SmallVector<int> Mask; - if (!E->ReorderIndices.empty()) { - inversePermutation(E->ReorderIndices, Mask); - Mask.append(InsertVecSz - Mask.size(), UndefMaskElem); + + return CommonCost - ScalarCost; + } + case Instruction::ExtractValue: + case Instruction::ExtractElement: { + auto GetScalarCost = [=](unsigned Idx) { + auto *I = cast<Instruction>(VL[Idx]); + VectorType *SrcVecTy; + if (ShuffleOrOp == Instruction::ExtractElement) { + auto *EE = cast<ExtractElementInst>(I); + SrcVecTy = EE->getVectorOperandType(); } else { - Mask.assign(VecSz, UndefMaskElem); - std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0); - } - bool IsIdentity = true; - SmallVector<int> PrevMask(InsertVecSz, UndefMaskElem); - Mask.swap(PrevMask); - for (unsigned I = 0; I < NumScalars; ++I) { - unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]); - DemandedElts.setBit(InsertIdx); - IsIdentity &= InsertIdx - OffsetBeg == I; - Mask[InsertIdx - OffsetBeg] = I; + auto *EV = cast<ExtractValueInst>(I); + Type *AggregateTy = EV->getAggregateOperand()->getType(); + unsigned NumElts; + if (auto *ATy = dyn_cast<ArrayType>(AggregateTy)) + NumElts = ATy->getNumElements(); + else + NumElts = AggregateTy->getStructNumElements(); + SrcVecTy = FixedVectorType::get(ScalarTy, NumElts); } - assert(Offset < NumElts && "Failed to find vector index offset"); - - InstructionCost Cost = 0; - Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts, - /*Insert*/ true, /*Extract*/ false); - - // First cost - resize to actual vector size if not identity shuffle or - // need to shift the vector. - // Do not calculate the cost if the actual size is the register size and - // we can merge this shuffle with the following SK_Select. - auto *InsertVecTy = - FixedVectorType::get(SrcVecTy->getElementType(), InsertVecSz); - if (!IsIdentity) - Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, - InsertVecTy, Mask); - auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) { - return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0)); - })); - // Second cost - permutation with subvector, if some elements are from the - // initial vector or inserting a subvector. - // TODO: Implement the analysis of the FirstInsert->getOperand(0) - // subvector of ActualVecTy. - if (!isUndefVector(FirstInsert->getOperand(0)) && NumScalars != NumElts && - !IsWholeSubvector) { - if (InsertVecSz != VecSz) { - auto *ActualVecTy = - FixedVectorType::get(SrcVecTy->getElementType(), VecSz); - Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy, - None, OffsetBeg - Offset, InsertVecTy); - } else { - for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I) - Mask[I] = I; - for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset; - I <= End; ++I) - if (Mask[I] != UndefMaskElem) - Mask[I] = I + VecSz; - for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I) - Mask[I] = I; - Cost += TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, InsertVecTy, Mask); + if (I->hasOneUse()) { + Instruction *Ext = I->user_back(); + if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) && + all_of(Ext->users(), + [](User *U) { return isa<GetElementPtrInst>(U); })) { + // Use getExtractWithExtendCost() to calculate the cost of + // extractelement/ext pair. 
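// Aside (illustration only, not part of this patch): the IsIdentity flag
// computed above just records whether lane I of the build vector is inserted
// at position OffsetBeg + I, in which case the extra resize permutation can
// be skipped. Standalone sketch; the helper name is hypothetical.
#include <vector>

bool isOffsetIdentity(const std::vector<unsigned> &InsertIdxPerLane,
                      unsigned OffsetBeg) {
  for (unsigned Lane = 0, E = InsertIdxPerLane.size(); Lane < E; ++Lane)
    if (InsertIdxPerLane[Lane] != OffsetBeg + Lane)
      return false;
  return true;
}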
+ InstructionCost Cost = TTI->getExtractWithExtendCost( + Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I)); + // Subtract the cost of s|zext which is subtracted separately. + Cost -= TTI->getCastInstrCost( + Ext->getOpcode(), Ext->getType(), I->getType(), + TTI::getCastContextHint(Ext), CostKind, Ext); + return Cost; } } - return Cost; + return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy, + CostKind, *getExtractIndex(I)); + }; + auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; }; + return GetCostDiff(GetScalarCost, GetVectorCost); + } + case Instruction::InsertElement: { + assert(E->ReuseShuffleIndices.empty() && + "Unique insertelements only are expected."); + auto *SrcVecTy = cast<FixedVectorType>(VL0->getType()); + unsigned const NumElts = SrcVecTy->getNumElements(); + unsigned const NumScalars = VL.size(); + + unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy); + + SmallVector<int> InsertMask(NumElts, UndefMaskElem); + unsigned OffsetBeg = *getInsertIndex(VL.front()); + unsigned OffsetEnd = OffsetBeg; + InsertMask[OffsetBeg] = 0; + for (auto [I, V] : enumerate(VL.drop_front())) { + unsigned Idx = *getInsertIndex(V); + if (OffsetBeg > Idx) + OffsetBeg = Idx; + else if (OffsetEnd < Idx) + OffsetEnd = Idx; + InsertMask[Idx] = I + 1; + } + unsigned VecScalarsSz = PowerOf2Ceil(NumElts); + if (NumOfParts > 0) + VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts); + unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) * + VecScalarsSz; + unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz); + unsigned InsertVecSz = std::min<unsigned>( + PowerOf2Ceil(OffsetEnd - OffsetBeg + 1), + ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz); + bool IsWholeSubvector = + OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0); + // Check if we can safely insert a subvector. If it is not possible, just + // generate a whole-sized vector and shuffle the source vector and the new + // subvector. + if (OffsetBeg + InsertVecSz > VecSz) { + // Align OffsetBeg to generate correct mask. + OffsetBeg = alignDown(OffsetBeg, VecSz, Offset); + InsertVecSz = VecSz; + } + + APInt DemandedElts = APInt::getZero(NumElts); + // TODO: Add support for Instruction::InsertValue. + SmallVector<int> Mask; + if (!E->ReorderIndices.empty()) { + inversePermutation(E->ReorderIndices, Mask); + Mask.append(InsertVecSz - Mask.size(), UndefMaskElem); + } else { + Mask.assign(VecSz, UndefMaskElem); + std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0); + } + bool IsIdentity = true; + SmallVector<int> PrevMask(InsertVecSz, UndefMaskElem); + Mask.swap(PrevMask); + for (unsigned I = 0; I < NumScalars; ++I) { + unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]); + DemandedElts.setBit(InsertIdx); + IsIdentity &= InsertIdx - OffsetBeg == I; + Mask[InsertIdx - OffsetBeg] = I; + } + assert(Offset < NumElts && "Failed to find vector index offset"); + + InstructionCost Cost = 0; + Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts, + /*Insert*/ true, /*Extract*/ false, + CostKind); + + // First cost - resize to actual vector size if not identity shuffle or + // need to shift the vector. + // Do not calculate the cost if the actual size is the register size and + // we can merge this shuffle with the following SK_Select. 
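// Aside (illustration only, not part of this patch): the
// alignDown(OffsetBeg, VecSz, Offset) call above rounds OffsetBeg down to the
// nearest value of the form Offset + k * VecSz. A minimal standalone
// equivalent, assuming Skew < Align and Skew <= Value; the name is
// hypothetical.
unsigned alignDownSkewed(unsigned Value, unsigned Align, unsigned Skew) {
  return (Value - Skew) / Align * Align + Skew;
}
// e.g. alignDownSkewed(13, 8, 2) == 10, since 10 = 2 + 1 * 8 <= 13 < 18.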
+ auto *InsertVecTy = + FixedVectorType::get(SrcVecTy->getElementType(), InsertVecSz); + if (!IsIdentity) + Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, + InsertVecTy, Mask); + auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) { + return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0)); + })); + // Second cost - permutation with subvector, if some elements are from the + // initial vector or inserting a subvector. + // TODO: Implement the analysis of the FirstInsert->getOperand(0) + // subvector of ActualVecTy. + SmallBitVector InMask = + isUndefVector(FirstInsert->getOperand(0), + buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask)); + if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) { + if (InsertVecSz != VecSz) { + auto *ActualVecTy = + FixedVectorType::get(SrcVecTy->getElementType(), VecSz); + Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy, + std::nullopt, CostKind, OffsetBeg - Offset, + InsertVecTy); + } else { + for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I) + Mask[I] = InMask.test(I) ? UndefMaskElem : I; + for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset; + I <= End; ++I) + if (Mask[I] != UndefMaskElem) + Mask[I] = I + VecSz; + for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I) + Mask[I] = + ((I >= InMask.size()) || InMask.test(I)) ? UndefMaskElem : I; + Cost += TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, InsertVecTy, Mask); + } } - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: { + return Cost; + } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + auto GetScalarCost = [=](unsigned Idx) { + auto *VI = cast<Instruction>(VL[Idx]); + return TTI->getCastInstrCost(E->getOpcode(), ScalarTy, + VI->getOperand(0)->getType(), + TTI::getCastContextHint(VI), CostKind, VI); + }; + auto GetVectorCost = [=](InstructionCost CommonCost) { Type *SrcTy = VL0->getOperand(0)->getType(); - InstructionCost ScalarEltCost = - TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, - TTI::getCastContextHint(VL0), CostKind, VL0); - if (NeedToShuffleReuses) { - CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; - } - - // Calculate the cost of this instruction. - InstructionCost ScalarCost = VL.size() * ScalarEltCost; - auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size()); - InstructionCost VecCost = 0; + InstructionCost VecCost = CommonCost; // Check if the values are candidates to demote. - if (!MinBWs.count(VL0) || VecTy != SrcVecTy) { - VecCost = CommonCost + TTI->getCastInstrCost( - E->getOpcode(), VecTy, SrcVecTy, - TTI::getCastContextHint(VL0), CostKind, VL0); - } - LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); - return VecCost - ScalarCost; - } - case Instruction::FCmp: - case Instruction::ICmp: - case Instruction::Select: { - // Calculate the cost of this instruction. 
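// Aside (illustration only, not part of this patch): a simplified sketch of
// the SK_PermuteTwoSrc mask built above when only the slice [Beg, End] of a
// live source vector is rewritten - untouched lanes select from the original
// vector, rewritten lanes select from the new subvector (indices offset by
// VecSz). The per-lane undef handling done via the use mask is omitted; the
// name is hypothetical.
#include <vector>

std::vector<int> insertSliceMask(unsigned VecSz, unsigned Beg, unsigned End) {
  std::vector<int> Mask(VecSz);
  for (unsigned I = 0; I < VecSz; ++I)
    Mask[I] = (I >= Beg && I <= End) ? static_cast<int>(I + VecSz)
                                     : static_cast<int>(I);
  return Mask;
}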
- InstructionCost ScalarEltCost = - TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(), - CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0); - if (NeedToShuffleReuses) { - CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; - } + if (!MinBWs.count(VL0) || VecTy != SrcVecTy) + VecCost += + TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy, + TTI::getCastContextHint(VL0), CostKind, VL0); + return VecCost; + }; + return GetCostDiff(GetScalarCost, GetVectorCost); + } + case Instruction::FCmp: + case Instruction::ICmp: + case Instruction::Select: { + CmpInst::Predicate VecPred, SwappedVecPred; + auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value()); + if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) || + match(VL0, MatchCmp)) + SwappedVecPred = CmpInst::getSwappedPredicate(VecPred); + else + SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy() + ? CmpInst::BAD_FCMP_PREDICATE + : CmpInst::BAD_ICMP_PREDICATE; + auto GetScalarCost = [&](unsigned Idx) { + auto *VI = cast<Instruction>(VL[Idx]); + CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy() + ? CmpInst::BAD_FCMP_PREDICATE + : CmpInst::BAD_ICMP_PREDICATE; + auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value()); + if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) && + !match(VI, MatchCmp)) || + (CurrentPred != VecPred && CurrentPred != SwappedVecPred)) + VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy() + ? CmpInst::BAD_FCMP_PREDICATE + : CmpInst::BAD_ICMP_PREDICATE; + + return TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, + Builder.getInt1Ty(), CurrentPred, CostKind, + VI); + }; + auto GetVectorCost = [&](InstructionCost CommonCost) { auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size()); - InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; - - // Check if all entries in VL are either compares or selects with compares - // as condition that have the same predicates. - CmpInst::Predicate VecPred = CmpInst::BAD_ICMP_PREDICATE; - bool First = true; - for (auto *V : VL) { - CmpInst::Predicate CurrentPred; - auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value()); - if ((!match(V, m_Select(MatchCmp, m_Value(), m_Value())) && - !match(V, MatchCmp)) || - (!First && VecPred != CurrentPred)) { - VecPred = CmpInst::BAD_ICMP_PREDICATE; - break; - } - First = false; - VecPred = CurrentPred; - } InstructionCost VecCost = TTI->getCmpSelInstrCost( E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0); - // Check if it is possible and profitable to use min/max for selects in - // VL. + // Check if it is possible and profitable to use min/max for selects + // in VL. // auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL); if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) { @@ -6342,216 +7235,181 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, {VecTy, VecTy}); InstructionCost IntrinsicCost = TTI->getIntrinsicInstrCost(CostAttrs, CostKind); - // If the selects are the only uses of the compares, they will be dead - // and we can adjust the cost by removing their cost. + // If the selects are the only uses of the compares, they will be + // dead and we can adjust the cost by removing their cost. 
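// Aside (illustration only, not part of this patch): the predicate handling
// above keeps a single vector predicate only while every scalar compare uses
// that predicate or its swapped form; otherwise the "bad" placeholder is
// used. The enum and helpers below are hypothetical stand-ins for
// CmpInst::Predicate.
#include <vector>

enum class Pred { LT, LE, GT, GE, EQ, NE, Bad };

Pred swappedPred(Pred P) {
  switch (P) {
  case Pred::LT: return Pred::GT;
  case Pred::GT: return Pred::LT;
  case Pred::LE: return Pred::GE;
  case Pred::GE: return Pred::LE;
  default:       return P; // EQ, NE and Bad swap to themselves
  }
}

Pred commonPredicate(const std::vector<Pred> &LanePreds) {
  if (LanePreds.empty())
    return Pred::Bad;
  Pred VecPred = LanePreds.front();
  for (Pred P : LanePreds)
    if (P != VecPred && P != swappedPred(VecPred))
      return Pred::Bad;
  return VecPred;
}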
if (IntrinsicAndUse.second) IntrinsicCost -= TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy, MaskTy, VecPred, CostKind); VecCost = std::min(VecCost, IntrinsicCost); } - LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); - return CommonCost + VecCost - ScalarCost; - } - case Instruction::FNeg: - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - // Certain instructions can be cheaper to vectorize if they have a - // constant second vector operand. - TargetTransformInfo::OperandValueKind Op1VK = - TargetTransformInfo::OK_AnyValue; - TargetTransformInfo::OperandValueKind Op2VK = - TargetTransformInfo::OK_UniformConstantValue; - TargetTransformInfo::OperandValueProperties Op1VP = - TargetTransformInfo::OP_None; - TargetTransformInfo::OperandValueProperties Op2VP = - TargetTransformInfo::OP_PowerOf2; - - // If all operands are exactly the same ConstantInt then set the - // operand kind to OK_UniformConstantValue. - // If instead not all operands are constants, then set the operand kind - // to OK_AnyValue. If all operands are constants but not the same, - // then set the operand kind to OK_NonUniformConstantValue. - ConstantInt *CInt0 = nullptr; - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - const Instruction *I = cast<Instruction>(VL[i]); - unsigned OpIdx = isa<BinaryOperator>(I) ? 1 : 0; - ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(OpIdx)); - if (!CInt) { - Op2VK = TargetTransformInfo::OK_AnyValue; - Op2VP = TargetTransformInfo::OP_None; - break; - } - if (Op2VP == TargetTransformInfo::OP_PowerOf2 && - !CInt->getValue().isPowerOf2()) - Op2VP = TargetTransformInfo::OP_None; - if (i == 0) { - CInt0 = CInt; - continue; - } - if (CInt0 != CInt) - Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; - } - - SmallVector<const Value *, 4> Operands(VL0->operand_values()); - InstructionCost ScalarEltCost = - TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind, Op1VK, - Op2VK, Op1VP, Op2VP, Operands, VL0); - if (NeedToShuffleReuses) { - CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; - } - InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; - InstructionCost VecCost = - TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind, Op1VK, - Op2VK, Op1VP, Op2VP, Operands, VL0); - LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); - return CommonCost + VecCost - ScalarCost; - } - case Instruction::GetElementPtr: { - TargetTransformInfo::OperandValueKind Op1VK = - TargetTransformInfo::OK_AnyValue; - TargetTransformInfo::OperandValueKind Op2VK = - any_of(VL, - [](Value *V) { - return isa<GetElementPtrInst>(V) && - !isConstant( - cast<GetElementPtrInst>(V)->getOperand(1)); - }) - ? 
TargetTransformInfo::OK_AnyValue - : TargetTransformInfo::OK_UniformConstantValue; - - InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost( - Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK); - if (NeedToShuffleReuses) { - CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; - } - InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; - InstructionCost VecCost = TTI->getArithmeticInstrCost( - Instruction::Add, VecTy, CostKind, Op1VK, Op2VK); - LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); - return CommonCost + VecCost - ScalarCost; - } - case Instruction::Load: { - // Cost of wide load - cost of scalar loads. - Align Alignment = cast<LoadInst>(VL0)->getAlign(); - InstructionCost ScalarEltCost = TTI->getMemoryOpCost( - Instruction::Load, ScalarTy, Alignment, 0, CostKind, VL0); - if (NeedToShuffleReuses) { - CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; - } - InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost; + return VecCost + CommonCost; + }; + return GetCostDiff(GetScalarCost, GetVectorCost); + } + case Instruction::FNeg: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + auto GetScalarCost = [=](unsigned Idx) { + auto *VI = cast<Instruction>(VL[Idx]); + unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1; + TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0)); + TTI::OperandValueInfo Op2Info = + TTI::getOperandInfo(VI->getOperand(OpIdx)); + SmallVector<const Value *> Operands(VI->operand_values()); + return TTI->getArithmeticInstrCost(ShuffleOrOp, ScalarTy, CostKind, + Op1Info, Op2Info, Operands, VI); + }; + auto GetVectorCost = [=](InstructionCost CommonCost) { + unsigned OpIdx = isa<UnaryOperator>(VL0) ? 
0 : 1; + TTI::OperandValueInfo Op1Info = getOperandInfo(VL, 0); + TTI::OperandValueInfo Op2Info = getOperandInfo(VL, OpIdx); + return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info, + Op2Info) + + CommonCost; + }; + return GetCostDiff(GetScalarCost, GetVectorCost); + } + case Instruction::GetElementPtr: { + return CommonCost + GetGEPCostDiff(VL, VL0); + } + case Instruction::Load: { + auto GetScalarCost = [=](unsigned Idx) { + auto *VI = cast<LoadInst>(VL[Idx]); + return TTI->getMemoryOpCost(Instruction::Load, ScalarTy, VI->getAlign(), + VI->getPointerAddressSpace(), CostKind, + TTI::OperandValueInfo(), VI); + }; + auto *LI0 = cast<LoadInst>(VL0); + auto GetVectorCost = [=](InstructionCost CommonCost) { InstructionCost VecLdCost; if (E->State == TreeEntry::Vectorize) { - VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, Alignment, 0, - CostKind, VL0); + VecLdCost = TTI->getMemoryOpCost( + Instruction::Load, VecTy, LI0->getAlign(), + LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo()); } else { assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState"); - Align CommonAlignment = Alignment; + Align CommonAlignment = LI0->getAlign(); for (Value *V : VL) CommonAlignment = std::min(CommonAlignment, cast<LoadInst>(V)->getAlign()); VecLdCost = TTI->getGatherScatterOpCost( - Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(), - /*VariableMask=*/false, CommonAlignment, CostKind, VL0); + Instruction::Load, VecTy, LI0->getPointerOperand(), + /*VariableMask=*/false, CommonAlignment, CostKind); } - LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecLdCost, ScalarLdCost)); - return CommonCost + VecLdCost - ScalarLdCost; - } - case Instruction::Store: { + return VecLdCost + CommonCost; + }; + + InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost); + // If this node generates masked gather load then it is not a terminal node. + // Hence address operand cost is estimated separately. + if (E->State == TreeEntry::ScatterVectorize) + return Cost; + + // Estimate cost of GEPs since this tree node is a terminator. + SmallVector<Value *> PointerOps(VL.size()); + for (auto [I, V] : enumerate(VL)) + PointerOps[I] = cast<LoadInst>(V)->getPointerOperand(); + return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand()); + } + case Instruction::Store: { + bool IsReorder = !E->ReorderIndices.empty(); + auto GetScalarCost = [=](unsigned Idx) { + auto *VI = cast<StoreInst>(VL[Idx]); + TTI::OperandValueInfo OpInfo = getOperandInfo(VI, 0); + return TTI->getMemoryOpCost(Instruction::Store, ScalarTy, VI->getAlign(), + VI->getPointerAddressSpace(), CostKind, + OpInfo, VI); + }; + auto *BaseSI = + cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0); + auto GetVectorCost = [=](InstructionCost CommonCost) { // We know that we can merge the stores. Calculate the cost. - bool IsReorder = !E->ReorderIndices.empty(); - auto *SI = - cast<StoreInst>(IsReorder ? 
VL[E->ReorderIndices.front()] : VL0); - Align Alignment = SI->getAlign(); - InstructionCost ScalarEltCost = TTI->getMemoryOpCost( - Instruction::Store, ScalarTy, Alignment, 0, CostKind, VL0); - InstructionCost ScalarStCost = VecTy->getNumElements() * ScalarEltCost; - InstructionCost VecStCost = TTI->getMemoryOpCost( - Instruction::Store, VecTy, Alignment, 0, CostKind, VL0); - LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecStCost, ScalarStCost)); - return CommonCost + VecStCost - ScalarStCost; + TTI::OperandValueInfo OpInfo = getOperandInfo(VL, 0); + return TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(), + BaseSI->getPointerAddressSpace(), CostKind, + OpInfo) + + CommonCost; + }; + SmallVector<Value *> PointerOps(VL.size()); + for (auto [I, V] : enumerate(VL)) { + unsigned Idx = IsReorder ? E->ReorderIndices[I] : I; + PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand(); } - case Instruction::Call: { - CallInst *CI = cast<CallInst>(VL0); - Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - // Calculate the cost of the scalar and vector calls. - IntrinsicCostAttributes CostAttrs(ID, *CI, 1); - InstructionCost ScalarEltCost = - TTI->getIntrinsicInstrCost(CostAttrs, CostKind); - if (NeedToShuffleReuses) { - CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; + return GetCostDiff(GetScalarCost, GetVectorCost) + + GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand()); + } + case Instruction::Call: { + auto GetScalarCost = [=](unsigned Idx) { + auto *CI = cast<CallInst>(VL[Idx]); + Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); + if (ID != Intrinsic::not_intrinsic) { + IntrinsicCostAttributes CostAttrs(ID, *CI, 1); + return TTI->getIntrinsicInstrCost(CostAttrs, CostKind); } - InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost; - + return TTI->getCallInstrCost(CI->getCalledFunction(), + CI->getFunctionType()->getReturnType(), + CI->getFunctionType()->params(), CostKind); + }; + auto GetVectorCost = [=](InstructionCost CommonCost) { + auto *CI = cast<CallInst>(VL0); auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI); - InstructionCost VecCallCost = - std::min(VecCallCosts.first, VecCallCosts.second); - - LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost - << " (" << VecCallCost << "-" << ScalarCallCost << ")" - << " for " << *CI << "\n"); - - return CommonCost + VecCallCost - ScalarCallCost; - } - case Instruction::ShuffleVector: { - assert(E->isAltShuffle() && - ((Instruction::isBinaryOp(E->getOpcode()) && - Instruction::isBinaryOp(E->getAltOpcode())) || - (Instruction::isCast(E->getOpcode()) && - Instruction::isCast(E->getAltOpcode())) || - (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) && - "Invalid Shuffle Vector Operand"); - InstructionCost ScalarCost = 0; - if (NeedToShuffleReuses) { - for (unsigned Idx : E->ReuseShuffleIndices) { - Instruction *I = cast<Instruction>(VL[Idx]); - CommonCost -= TTI->getInstructionCost(I, CostKind); - } - for (Value *V : VL) { - Instruction *I = cast<Instruction>(V); - CommonCost += TTI->getInstructionCost(I, CostKind); - } - } - for (Value *V : VL) { - Instruction *I = cast<Instruction>(V); - assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); - ScalarCost += TTI->getInstructionCost(I, CostKind); + return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost; + }; + return GetCostDiff(GetScalarCost, GetVectorCost); + } + case Instruction::ShuffleVector: { + assert(E->isAltShuffle() && + ((Instruction::isBinaryOp(E->getOpcode()) && + 
Instruction::isBinaryOp(E->getAltOpcode())) || + (Instruction::isCast(E->getOpcode()) && + Instruction::isCast(E->getAltOpcode())) || + (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) && + "Invalid Shuffle Vector Operand"); + // Try to find the previous shuffle node with the same operands and same + // main/alternate ops. + auto TryFindNodeWithEqualOperands = [=]() { + for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) { + if (TE.get() == E) + break; + if (TE->isAltShuffle() && + ((TE->getOpcode() == E->getOpcode() && + TE->getAltOpcode() == E->getAltOpcode()) || + (TE->getOpcode() == E->getAltOpcode() && + TE->getAltOpcode() == E->getOpcode())) && + TE->hasEqualOperands(*E)) + return true; } + return false; + }; + auto GetScalarCost = [=](unsigned Idx) { + auto *VI = cast<Instruction>(VL[Idx]); + assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode"); + (void)E; + return TTI->getInstructionCost(VI, CostKind); + }; + // Need to clear CommonCost since the final shuffle cost is included into + // vector cost. + auto GetVectorCost = [&](InstructionCost) { // VecCost is equal to sum of the cost of creating 2 vectors // and the cost of creating shuffle. InstructionCost VecCost = 0; - // Try to find the previous shuffle node with the same operands and same - // main/alternate ops. - auto &&TryFindNodeWithEqualOperands = [this, E]() { - for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) { - if (TE.get() == E) - break; - if (TE->isAltShuffle() && - ((TE->getOpcode() == E->getOpcode() && - TE->getAltOpcode() == E->getAltOpcode()) || - (TE->getOpcode() == E->getAltOpcode() && - TE->getAltOpcode() == E->getOpcode())) && - TE->hasEqualOperands(*E)) - return true; - } - return false; - }; if (TryFindNodeWithEqualOperands()) { LLVM_DEBUG({ dbgs() << "SLP: diamond match for alternate node found.\n"; @@ -6561,8 +7419,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // same main/alternate vector ops, just do different shuffling. 
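// Aside (illustration only, not part of this patch): the "diamond match" test
// above accepts a previously built alternate-shuffle node when its
// main/alternate opcode pair equals this node's pair either directly or
// swapped. Standalone sketch over plain opcode ids; the name is hypothetical.
bool altOpcodesMatch(unsigned Op, unsigned AltOp, unsigned OtherOp,
                     unsigned OtherAltOp) {
  return (OtherOp == Op && OtherAltOp == AltOp) ||
         (OtherOp == AltOp && OtherAltOp == Op);
}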
} else if (Instruction::isBinaryOp(E->getOpcode())) { VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind); - VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, - CostKind); + VecCost += + TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind); } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) { VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(), @@ -6581,9 +7439,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty, TTI::CastContextHint::None, CostKind); } - if (E->ReuseShuffleIndices.empty()) { - CommonCost = + VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy); } else { SmallVector<int> Mask; @@ -6594,14 +7451,15 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, return I->getOpcode() == E->getAltOpcode(); }, Mask); - CommonCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, - FinalVecTy, Mask); + VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, + FinalVecTy, Mask); } - LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); - return CommonCost + VecCost - ScalarCost; - } - default: - llvm_unreachable("Unknown instruction"); + return VecCost; + }; + return GetCostDiff(GetScalarCost, GetVectorCost); + } + default: + llvm_unreachable("Unknown instruction"); } } @@ -6817,9 +7675,30 @@ InstructionCost BoUpSLP::getSpillCost() const { continue; } + auto NoCallIntrinsic = [this](Instruction *I) { + if (auto *II = dyn_cast<IntrinsicInst>(I)) { + if (II->isAssumeLikeIntrinsic()) + return true; + FastMathFlags FMF; + SmallVector<Type *, 4> Tys; + for (auto &ArgOp : II->args()) + Tys.push_back(ArgOp->getType()); + if (auto *FPMO = dyn_cast<FPMathOperator>(II)) + FMF = FPMO->getFastMathFlags(); + IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys, + FMF); + InstructionCost IntrCost = + TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput); + InstructionCost CallCost = TTI->getCallInstrCost( + nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput); + if (IntrCost < CallCost) + return true; + } + return false; + }; + // Debug information does not impact spill cost. - if ((isa<CallInst>(&*PrevInstIt) && - !isa<DbgInfoIntrinsic>(&*PrevInstIt)) && + if (isa<CallInst>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) && &*PrevInstIt != PrevInst) NumCalls++; @@ -6843,48 +7722,6 @@ InstructionCost BoUpSLP::getSpillCost() const { return Cost; } -/// Check if two insertelement instructions are from the same buildvector. -static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, - InsertElementInst *V) { - // Instructions must be from the same basic blocks. - if (VU->getParent() != V->getParent()) - return false; - // Checks if 2 insertelements are from the same buildvector. - if (VU->getType() != V->getType()) - return false; - // Multiple used inserts are separate nodes. - if (!VU->hasOneUse() && !V->hasOneUse()) - return false; - auto *IE1 = VU; - auto *IE2 = V; - unsigned Idx1 = *getInsertIndex(IE1); - unsigned Idx2 = *getInsertIndex(IE2); - // Go through the vector operand of insertelement instructions trying to find - // either VU as the original vector for IE2 or V as the original vector for - // IE1. 
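// Aside (illustration only, not part of this patch): a sketch of the select
// mask used above to blend the two per-opcode vectors of an alternate-shuffle
// node - lanes whose scalar used the alternate opcode pick from the second
// vector, i.e. index + VF. The helper name is hypothetical.
#include <vector>

std::vector<int> altOpSelectMask(const std::vector<bool> &LaneUsesAltOpcode) {
  unsigned VF = static_cast<unsigned>(LaneUsesAltOpcode.size());
  std::vector<int> Mask(VF);
  for (unsigned I = 0; I < VF; ++I)
    Mask[I] = LaneUsesAltOpcode[I] ? static_cast<int>(I + VF)
                                   : static_cast<int>(I);
  return Mask;
}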
- do { - if (IE2 == VU) - return VU->hasOneUse(); - if (IE1 == V) - return V->hasOneUse(); - if (IE1) { - if ((IE1 != VU && !IE1->hasOneUse()) || - getInsertIndex(IE1).value_or(Idx2) == Idx2) - IE1 = nullptr; - else - IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0)); - } - if (IE2) { - if ((IE2 != V && !IE2->hasOneUse()) || - getInsertIndex(IE2).value_or(Idx1) == Idx1) - IE2 = nullptr; - else - IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0)); - } - } while (IE1 || IE2); - return false; -} - /// Checks if the \p IE1 instructions is followed by \p IE2 instruction in the /// buildvector sequence. static bool isFirstInsertElement(const InsertElementInst *IE1, @@ -6919,13 +7756,11 @@ namespace { /// value, otherwise. struct ValueSelect { template <typename U> - static typename std::enable_if<std::is_same<Value *, U>::value, Value *>::type - get(Value *V) { + static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) { return V; } template <typename U> - static typename std::enable_if<!std::is_same<Value *, U>::value, U>::type - get(Value *) { + static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) { return U(); } }; @@ -6947,19 +7782,23 @@ template <typename T> static T *performExtractsShuffleAction( MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base, function_ref<unsigned(T *)> GetVF, - function_ref<std::pair<T *, bool>(T *, ArrayRef<int>)> ResizeAction, + function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction, function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) { assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts."); SmallVector<int> Mask(ShuffleMask.begin()->second); auto VMIt = std::next(ShuffleMask.begin()); T *Prev = nullptr; - bool IsBaseNotUndef = !isUndefVector(Base); - if (IsBaseNotUndef) { + SmallBitVector UseMask = + buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask); + SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask); + if (!IsBaseUndef.all()) { // Base is not undef, need to combine it with the next subvectors. - std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask); + std::pair<T *, bool> Res = + ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false); + SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask); for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) { if (Mask[Idx] == UndefMaskElem) - Mask[Idx] = Idx; + Mask[Idx] = IsBasePoison.test(Idx) ? UndefMaskElem : Idx; else Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF; } @@ -6971,7 +7810,8 @@ static T *performExtractsShuffleAction( } else if (ShuffleMask.size() == 1) { // Base is undef and only 1 vector is shuffled - perform the action only for // single vector, if the mask is not the identity mask. - std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask); + std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask, + /*ForSingleMask=*/true); if (Res.second) // Identity mask is found. Prev = Res.first; @@ -6995,9 +7835,10 @@ static T *performExtractsShuffleAction( Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first}); } else { // Vectors of different sizes - resize and reshuffle. 
- std::pair<T *, bool> Res1 = - ResizeAction(ShuffleMask.begin()->first, Mask); - std::pair<T *, bool> Res2 = ResizeAction(VMIt->first, VMIt->second); + std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask, + /*ForSingleMask=*/false); + std::pair<T *, bool> Res2 = + ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false); ArrayRef<int> SecMask = VMIt->second; for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) { if (Mask[I] != UndefMaskElem) { @@ -7013,10 +7854,13 @@ static T *performExtractsShuffleAction( } VMIt = std::next(VMIt); } + bool IsBaseNotUndef = !IsBaseUndef.all(); + (void)IsBaseNotUndef; // Perform requested actions for the remaining masks/vectors. for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) { // Shuffle other input vectors, if any. - std::pair<T *, bool> Res = ResizeAction(VMIt->first, VMIt->second); + std::pair<T *, bool> Res = + ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false); ArrayRef<int> SecMask = VMIt->second; for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) { if (SecMask[I] != UndefMaskElem) { @@ -7041,6 +7885,18 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { TreeEntry &TE = *VectorizableTree[I]; + if (TE.State == TreeEntry::NeedToGather) { + if (const TreeEntry *E = getTreeEntry(TE.getMainOp()); + E && E->getVectorFactor() == TE.getVectorFactor() && + E->isSame(TE.Scalars)) { + // Some gather nodes might be absolutely the same as some vectorizable + // nodes after reordering, need to handle it. + LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle that starts with " + << *TE.Scalars[0] << ".\n" + << "SLP: Current total cost = " << Cost << "\n"); + continue; + } + } InstructionCost C = getEntryCost(&TE, VectorizedVals); Cost += C; @@ -7071,24 +7927,25 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { if (isa<FixedVectorType>(EU.Scalar->getType())) continue; - // Already counted the cost for external uses when tried to adjust the cost - // for extractelements, no need to add it again. - if (isa<ExtractElementInst>(EU.Scalar)) - continue; - // If found user is an insertelement, do not calculate extract cost but try // to detect it as a final shuffled/identity match. if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) { if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) { - Optional<unsigned> InsertIdx = getInsertIndex(VU); + std::optional<unsigned> InsertIdx = getInsertIndex(VU); if (InsertIdx) { const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar); - auto *It = - find_if(FirstUsers, - [VU](const std::pair<Value *, const TreeEntry *> &Pair) { - return areTwoInsertFromSameBuildVector( - VU, cast<InsertElementInst>(Pair.first)); - }); + auto *It = find_if( + FirstUsers, + [this, VU](const std::pair<Value *, const TreeEntry *> &Pair) { + return areTwoInsertFromSameBuildVector( + VU, cast<InsertElementInst>(Pair.first), + [this](InsertElementInst *II) -> Value * { + Value *Op0 = II->getOperand(0); + if (getTreeEntry(II) && !getTreeEntry(Op0)) + return nullptr; + return Op0; + }); + }); int VecId = -1; if (It == FirstUsers.end()) { (void)ShuffleMasks.emplace_back(); @@ -7140,6 +7997,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { // extend the extracted value back to the original type. Here, we account // for the extract and the added cost of the sign extend if needed. 
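// Aside (illustration only, not part of this patch): a simplified sketch of
// the mask merge performed in the resize-and-reshuffle branch above - lanes
// coming from the first partial mask are kept as-is, lanes coming from the
// second are offset by VF so they select from the second shuffle operand. The
// identity-mask fast paths of the real code are ignored; -1 stands for
// UndefMaskElem and the name is hypothetical.
#include <vector>

std::vector<int> mergeTwoSourceMasks(const std::vector<int> &FirstMask,
                                     const std::vector<int> &SecondMask) {
  // Both masks are assumed to have the same destination width VF.
  unsigned VF = static_cast<unsigned>(FirstMask.size());
  std::vector<int> Merged(VF, -1);
  for (unsigned I = 0; I < VF; ++I) {
    if (FirstMask[I] >= 0)
      Merged[I] = FirstMask[I];
    if (SecondMask[I] >= 0)
      Merged[I] = SecondMask[I] + static_cast<int>(VF);
  }
  return Merged;
}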
auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth); + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; auto *ScalarRoot = VectorizableTree[0]->Scalars[0]; if (MinBWs.count(ScalarRoot)) { auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); @@ -7149,14 +8007,15 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), VecTy, EU.Lane); } else { - ExtractCost += - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); + ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, + CostKind, EU.Lane); } } InstructionCost SpillCost = getSpillCost(); Cost += SpillCost + ExtractCost; - auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask) { + auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask, + bool) { InstructionCost C = 0; unsigned VF = Mask.size(); unsigned VecVF = TE->getVectorFactor(); @@ -7218,12 +8077,12 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { return TEs.back(); }; (void)performExtractsShuffleAction<const TreeEntry>( - makeMutableArrayRef(Vector.data(), Vector.size()), Base, + MutableArrayRef(Vector.data(), Vector.size()), Base, [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF, EstimateShufflesCost); InstructionCost InsertCost = TTI->getScalarizationOverhead( cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I], - /*Insert*/ true, /*Extract*/ false); + /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput); Cost -= InsertCost; } @@ -7243,22 +8102,89 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { return Cost; } -Optional<TargetTransformInfo::ShuffleKind> -BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask, +std::optional<TargetTransformInfo::ShuffleKind> +BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, + SmallVectorImpl<int> &Mask, SmallVectorImpl<const TreeEntry *> &Entries) { + Entries.clear(); + // No need to check for the topmost gather node. + if (TE == VectorizableTree.front().get()) + return std::nullopt; + Mask.assign(VL.size(), UndefMaskElem); + assert(TE->UserTreeIndices.size() == 1 && + "Expected only single user of the gather node."); // TODO: currently checking only for Scalars in the tree entry, need to count // reused elements too for better cost estimation. - Mask.assign(TE->Scalars.size(), UndefMaskElem); - Entries.clear(); + Instruction &UserInst = + getLastInstructionInBundle(TE->UserTreeIndices.front().UserTE); + auto *PHI = dyn_cast<PHINode>(&UserInst); + auto *NodeUI = DT->getNode( + PHI ? PHI->getIncomingBlock(TE->UserTreeIndices.front().EdgeIdx) + : UserInst.getParent()); + assert(NodeUI && "Should only process reachable instructions"); + SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end()); + auto CheckOrdering = [&](Instruction *LastEI) { + // Check if the user node of the TE comes after user node of EntryPtr, + // otherwise EntryPtr depends on TE. + // Gather nodes usually are not scheduled and inserted before their first + // user node. So, instead of checking dependency between the gather nodes + // themselves, we check the dependency between their user nodes. + // If one user node comes before the second one, we cannot use the second + // gather node as the source vector for the first gather node, because in + // the list of instructions it will be emitted later. 
+ auto *EntryParent = LastEI->getParent(); + auto *NodeEUI = DT->getNode(EntryParent); + if (!NodeEUI) + return false; + assert((NodeUI == NodeEUI) == + (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) && + "Different nodes should have different DFS numbers"); + // Check the order of the gather nodes users. + if (UserInst.getParent() != EntryParent && + (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI))) + return false; + if (UserInst.getParent() == EntryParent && UserInst.comesBefore(LastEI)) + return false; + return true; + }; // Build a lists of values to tree entries. DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>> ValueToTEs; for (const std::unique_ptr<TreeEntry> &EntryPtr : VectorizableTree) { if (EntryPtr.get() == TE) - break; + continue; if (EntryPtr->State != TreeEntry::NeedToGather) continue; + if (!any_of(EntryPtr->Scalars, [&GatheredScalars](Value *V) { + return GatheredScalars.contains(V); + })) + continue; + assert(EntryPtr->UserTreeIndices.size() == 1 && + "Expected only single user of the gather node."); + Instruction &EntryUserInst = + getLastInstructionInBundle(EntryPtr->UserTreeIndices.front().UserTE); + if (&UserInst == &EntryUserInst) { + // If 2 gathers are operands of the same entry, compare operands indices, + // use the earlier one as the base. + if (TE->UserTreeIndices.front().UserTE == + EntryPtr->UserTreeIndices.front().UserTE && + TE->UserTreeIndices.front().EdgeIdx < + EntryPtr->UserTreeIndices.front().EdgeIdx) + continue; + } + // Check if the user node of the TE comes after user node of EntryPtr, + // otherwise EntryPtr depends on TE. + auto *EntryPHI = dyn_cast<PHINode>(&EntryUserInst); + auto *EntryI = + EntryPHI + ? EntryPHI + ->getIncomingBlock(EntryPtr->UserTreeIndices.front().EdgeIdx) + ->getTerminator() + : &EntryUserInst; + if (!CheckOrdering(EntryI)) + continue; for (Value *V : EntryPtr->Scalars) - ValueToTEs.try_emplace(V).first->getSecond().insert(EntryPtr.get()); + if (!isConstant(V)) + ValueToTEs.try_emplace(V).first->getSecond().insert(EntryPtr.get()); } // Find all tree entries used by the gathered values. If no common entries // found - not a shuffle. @@ -7270,7 +8196,7 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask, SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs; DenseMap<Value *, int> UsedValuesEntry; for (Value *V : TE->Scalars) { - if (isa<UndefValue>(V)) + if (isConstant(V)) continue; // Build a list of tree entries where V is used. SmallPtrSet<const TreeEntry *, 4> VToTEs; @@ -7280,10 +8206,11 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask, if (const TreeEntry *VTE = getTreeEntry(V)) VToTEs.insert(VTE); if (VToTEs.empty()) - return None; + continue; if (UsedTEs.empty()) { // The first iteration, just insert the list of nodes to vector. UsedTEs.push_back(VToTEs); + UsedValuesEntry.try_emplace(V, 0); } else { // Need to check if there are any previously used tree nodes which use V. // If there are no such nodes, consider that we have another one input @@ -7308,8 +8235,9 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask, if (Idx == UsedTEs.size()) { // If the number of input vectors is greater than 2 - not a permutation, // fallback to the regular gather. + // TODO: support multiple reshuffled nodes. 
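// Aside (illustration only, not part of this patch): a drastically simplified
// sketch of the grouping above - each gathered scalar is attributed to one of
// at most two source tree entries, and needing a third distinct source turns
// the bundle back into a plain gather (the TODO above). It assumes each
// scalar maps to at most one candidate entry; all names are hypothetical.
#include <map>
#include <optional>
#include <vector>

std::optional<std::vector<unsigned>>
assignToTwoSources(const std::vector<int> &Scalars,
                   const std::map<int, unsigned> &ScalarToEntryId) {
  std::vector<unsigned> Sources;                       // at most two entry ids
  std::vector<unsigned> SourcePerScalar(Scalars.size(), 0);
  for (unsigned I = 0; I < Scalars.size(); ++I) {
    auto It = ScalarToEntryId.find(Scalars[I]);
    if (It == ScalarToEntryId.end())
      continue;                                        // scalar stays gathered
    unsigned SrcIdx = 0;
    while (SrcIdx < Sources.size() && Sources[SrcIdx] != It->second)
      ++SrcIdx;
    if (SrcIdx == Sources.size()) {
      if (Sources.size() == 2)
        return std::nullopt;                           // >2 sources: give up
      Sources.push_back(It->second);
    }
    SourcePerScalar[I] = SrcIdx;
  }
  return SourcePerScalar;
}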
if (UsedTEs.size() == 2) - return None; + continue; UsedTEs.push_back(SavedVToTEs); Idx = UsedTEs.size() - 1; } @@ -7317,32 +8245,55 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask, } } - if (UsedTEs.empty()) { - assert(all_of(TE->Scalars, UndefValue::classof) && - "Expected vector of undefs only."); - return None; - } + if (UsedTEs.empty()) + return std::nullopt; unsigned VF = 0; if (UsedTEs.size() == 1) { + // Keep the order to avoid non-determinism. + SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(), + UsedTEs.front().end()); + sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) { + return TE1->Idx < TE2->Idx; + }); // Try to find the perfect match in another gather node at first. - auto It = find_if(UsedTEs.front(), [TE](const TreeEntry *EntryPtr) { - return EntryPtr->isSame(TE->Scalars); + auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) { + return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars); }); - if (It != UsedTEs.front().end()) { + if (It != FirstEntries.end()) { Entries.push_back(*It); std::iota(Mask.begin(), Mask.end(), 0); + // Clear undef scalars. + for (int I = 0, Sz = VL.size(); I < Sz; ++I) + if (isa<PoisonValue>(TE->Scalars[I])) + Mask[I] = UndefMaskElem; return TargetTransformInfo::SK_PermuteSingleSrc; } - // No perfect match, just shuffle, so choose the first tree node. - Entries.push_back(*UsedTEs.front().begin()); + // No perfect match, just shuffle, so choose the first tree node from the + // tree. + Entries.push_back(FirstEntries.front()); } else { // Try to find nodes with the same vector factor. assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries."); + // Keep the order of tree nodes to avoid non-determinism. DenseMap<int, const TreeEntry *> VFToTE; - for (const TreeEntry *TE : UsedTEs.front()) - VFToTE.try_emplace(TE->getVectorFactor(), TE); - for (const TreeEntry *TE : UsedTEs.back()) { + for (const TreeEntry *TE : UsedTEs.front()) { + unsigned VF = TE->getVectorFactor(); + auto It = VFToTE.find(VF); + if (It != VFToTE.end()) { + if (It->second->Idx > TE->Idx) + It->getSecond() = TE; + continue; + } + VFToTE.try_emplace(VF, TE); + } + // Same, keep the order to avoid non-determinism. + SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(), + UsedTEs.back().end()); + sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) { + return TE1->Idx < TE2->Idx; + }); + for (const TreeEntry *TE : SecondEntries) { auto It = VFToTE.find(TE->getVectorFactor()); if (It != VFToTE.end()) { VF = It->first; @@ -7354,40 +8305,135 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask, // No 2 source vectors with the same vector factor - give up and do regular // gather. if (Entries.empty()) - return None; - } - + return std::nullopt; + } + + bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, UndefValue::classof); + // Checks if the 2 PHIs are compatible in terms of high possibility to be + // vectorized. + auto AreCompatiblePHIs = [&](Value *V, Value *V1) { + auto *PHI = cast<PHINode>(V); + auto *PHI1 = cast<PHINode>(V1); + // Check that all incoming values are compatible/from same parent (if they + // are instructions). + // The incoming values are compatible if they all are constants, or + // instruction with the same/alternate opcodes from the same basic block. 
+ for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) { + Value *In = PHI->getIncomingValue(I); + Value *In1 = PHI1->getIncomingValue(I); + if (isConstant(In) && isConstant(In1)) + continue; + if (!getSameOpcode({In, In1}, *TLI).getOpcode()) + return false; + if (cast<Instruction>(In)->getParent() != + cast<Instruction>(In1)->getParent()) + return false; + } + return true; + }; + // Check if the value can be ignored during analysis for shuffled gathers. + // We suppose it is better to ignore instruction, which do not form splats, + // are not vectorized/not extractelements (these instructions will be handled + // by extractelements processing) or may form vector node in future. + auto MightBeIgnored = [=](Value *V) { + auto *I = dyn_cast<Instruction>(V); + SmallVector<Value *> IgnoredVals; + if (UserIgnoreList) + IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end()); + return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) && + !isVectorLikeInstWithConstOps(I) && + !areAllUsersVectorized(I, IgnoredVals) && isSimple(I); + }; + // Check that the neighbor instruction may form a full vector node with the + // current instruction V. It is possible, if they have same/alternate opcode + // and same parent basic block. + auto NeighborMightBeIgnored = [&](Value *V, int Idx) { + Value *V1 = VL[Idx]; + bool UsedInSameVTE = false; + auto It = UsedValuesEntry.find(V1); + if (It != UsedValuesEntry.end()) + UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second; + return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE && + getSameOpcode({V, V1}, *TLI).getOpcode() && + cast<Instruction>(V)->getParent() == + cast<Instruction>(V1)->getParent() && + (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1)); + }; // Build a shuffle mask for better cost estimation and vector emission. - for (int I = 0, E = TE->Scalars.size(); I < E; ++I) { - Value *V = TE->Scalars[I]; - if (isa<UndefValue>(V)) + SmallBitVector UsedIdxs(Entries.size()); + SmallVector<std::pair<unsigned, int>> EntryLanes; + for (int I = 0, E = VL.size(); I < E; ++I) { + Value *V = VL[I]; + auto It = UsedValuesEntry.find(V); + if (It == UsedValuesEntry.end()) + continue; + // Do not try to shuffle scalars, if they are constants, or instructions + // that can be vectorized as a result of the following vector build + // vectorization. + if (isConstant(V) || (MightBeIgnored(V) && + ((I > 0 && NeighborMightBeIgnored(V, I - 1)) || + (I != E - 1 && NeighborMightBeIgnored(V, I + 1))))) + continue; + unsigned Idx = It->second; + EntryLanes.emplace_back(Idx, I); + UsedIdxs.set(Idx); + } + // Iterate through all shuffled scalars and select entries, which can be used + // for final shuffle. + SmallVector<const TreeEntry *> TempEntries; + for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) { + if (!UsedIdxs.test(I)) continue; - unsigned Idx = UsedValuesEntry.lookup(V); - const TreeEntry *VTE = Entries[Idx]; - int FoundLane = VTE->findLaneForValue(V); - Mask[I] = Idx * VF + FoundLane; - // Extra check required by isSingleSourceMaskImpl function (called by - // ShuffleVectorInst::isSingleSourceMask). - if (Mask[I] >= 2 * E) - return None; + // Fix the entry number for the given scalar. If it is the first entry, set + // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes). + // These indices are used when calculating final shuffle mask as the vector + // offset. 
+ for (std::pair<unsigned, int> &Pair : EntryLanes) + if (Pair.first == I) + Pair.first = TempEntries.size(); + TempEntries.push_back(Entries[I]); + } + Entries.swap(TempEntries); + if (EntryLanes.size() == Entries.size() && !VL.equals(TE->Scalars)) { + // We may have here 1 or 2 entries only. If the number of scalars is equal + // to the number of entries, no need to do the analysis, it is not very + // profitable. Since VL is not the same as TE->Scalars, it means we already + // have some shuffles before. Cut off not profitable case. + Entries.clear(); + return std::nullopt; + } + // Build the final mask, check for the identity shuffle, if possible. + bool IsIdentity = Entries.size() == 1; + // Pair.first is the offset to the vector, while Pair.second is the index of + // scalar in the list. + for (const std::pair<unsigned, int> &Pair : EntryLanes) { + Mask[Pair.second] = Pair.first * VF + + Entries[Pair.first]->findLaneForValue(VL[Pair.second]); + IsIdentity &= Mask[Pair.second] == Pair.second; } switch (Entries.size()) { case 1: - return TargetTransformInfo::SK_PermuteSingleSrc; + if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2) + return TargetTransformInfo::SK_PermuteSingleSrc; + break; case 2: - return TargetTransformInfo::SK_PermuteTwoSrc; + if (EntryLanes.size() > 2 || VL.size() <= 2) + return TargetTransformInfo::SK_PermuteTwoSrc; + break; default: break; } - return None; + Entries.clear(); + return std::nullopt; } InstructionCost BoUpSLP::getGatherCost(FixedVectorType *Ty, const APInt &ShuffledIndices, bool NeedToShuffle) const { + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; InstructionCost Cost = TTI->getScalarizationOverhead(Ty, ~ShuffledIndices, /*Insert*/ true, - /*Extract*/ false); + /*Extract*/ false, CostKind); if (NeedToShuffle) Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty); return Cost; @@ -7423,22 +8469,20 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const { // Perform operand reordering on the instructions in VL and return the reordered // operands in Left and Right. -void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, - SmallVectorImpl<Value *> &Left, - SmallVectorImpl<Value *> &Right, - const DataLayout &DL, - ScalarEvolution &SE, - const BoUpSLP &R) { +void BoUpSLP::reorderInputsAccordingToOpcode( + ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left, + SmallVectorImpl<Value *> &Right, const TargetLibraryInfo &TLI, + const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R) { if (VL.empty()) return; - VLOperands Ops(VL, DL, SE, R); + VLOperands Ops(VL, TLI, DL, SE, R); // Reorder the operands in place. Ops.reorder(); Left = Ops.getVL(0); Right = Ops.getVL(1); } -void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { +Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { // Get the basic block this bundle is in. All instructions in the bundle // should be in this block (except for extractelement-like instructions with // constant indeces). 
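// Aside (illustration only, not part of this patch): a sketch of the final
// gather-shuffle mask built earlier in this hunk - every shuffled scalar is
// encoded as SourceIndex * VF + LaneInSource, and the result is an identity
// permutation only if each scalar already sits in its own lane of the first
// source. -1 stands for UndefMaskElem; the name is hypothetical.
#include <utility>
#include <vector>

std::pair<std::vector<int>, bool>
buildGatherShuffleMask(const std::vector<std::pair<unsigned, unsigned>> &SrcLane,
                       unsigned VF) {
  std::vector<int> Mask(SrcLane.size(), -1);
  bool IsIdentity = true;
  for (unsigned I = 0; I < SrcLane.size(); ++I) {
    Mask[I] = static_cast<int>(SrcLane[I].first * VF + SrcLane[I].second);
    IsIdentity &= Mask[I] == static_cast<int>(I);
  }
  return {Mask, IsIdentity};
}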
@@ -7487,13 +8531,34 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { return LastInst; }; - auto &&FindFirstInst = [E, Front]() { + auto &&FindFirstInst = [E, Front, this]() { Instruction *FirstInst = Front; for (Value *V : E->Scalars) { auto *I = dyn_cast<Instruction>(V); if (!I) continue; - if (I->comesBefore(FirstInst)) + if (FirstInst->getParent() == I->getParent()) { + if (I->comesBefore(FirstInst)) + FirstInst = I; + continue; + } + assert(isVectorLikeInstWithConstOps(FirstInst) && + isVectorLikeInstWithConstOps(I) && + "Expected vector-like insts only."); + if (!DT->isReachableFromEntry(FirstInst->getParent())) { + FirstInst = I; + continue; + } + if (!DT->isReachableFromEntry(I->getParent())) + continue; + auto *NodeA = DT->getNode(FirstInst->getParent()); + auto *NodeB = DT->getNode(I->getParent()); + assert(NodeA && "Should only process reachable instructions"); + assert(NodeB && "Should only process reachable instructions"); + assert((NodeA == NodeB) == + (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) && + "Different nodes should have different DFS numbers"); + if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn()) FirstInst = I; } return FirstInst; @@ -7502,19 +8567,16 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { // Set the insert point to the beginning of the basic block if the entry // should not be scheduled. if (E->State != TreeEntry::NeedToGather && - doesNotNeedToSchedule(E->Scalars)) { + (doesNotNeedToSchedule(E->Scalars) || + all_of(E->Scalars, isVectorLikeInstWithConstOps))) { Instruction *InsertInst; - if (all_of(E->Scalars, isUsedOutsideBlock)) + if (all_of(E->Scalars, [](Value *V) { + return !isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V); + })) InsertInst = FindLastInst(); else InsertInst = FindFirstInst(); - // If the instruction is PHI, set the insert point after all the PHIs. - if (isa<PHINode>(InsertInst)) - InsertInst = BB->getFirstNonPHI(); - BasicBlock::iterator InsertPt = InsertInst->getIterator(); - Builder.SetInsertPoint(BB, InsertPt); - Builder.SetCurrentDebugLocation(Front->getDebugLoc()); - return; + return *InsertInst; } // The last instruction in the bundle in program order. @@ -7553,17 +8615,29 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { // not ideal. However, this should be exceedingly rare since it requires that // we both exit early from buildTree_rec and that the bundle be out-of-order // (causing us to iterate all the way to the end of the block). - if (!LastInst) { + if (!LastInst) LastInst = FindLastInst(); - // If the instruction is PHI, set the insert point after all the PHIs. - if (isa<PHINode>(LastInst)) - LastInst = BB->getFirstNonPHI()->getPrevNode(); - } assert(LastInst && "Failed to find last instruction in bundle"); + return *LastInst; +} - // Set the insertion point after the last instruction in the bundle. Set the - // debug location to Front. - Builder.SetInsertPoint(BB, std::next(LastInst->getIterator())); +void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { + auto *Front = E->getMainOp(); + Instruction *LastInst = EntryToLastInstruction.lookup(E); + assert(LastInst && "Failed to find last instruction in bundle"); + // If the instruction is PHI, set the insert point after all the PHIs. 
+ bool IsPHI = isa<PHINode>(LastInst); + if (IsPHI) + LastInst = LastInst->getParent()->getFirstNonPHI(); + if (IsPHI || (E->State != TreeEntry::NeedToGather && + doesNotNeedToSchedule(E->Scalars))) { + Builder.SetInsertPoint(LastInst); + } else { + // Set the insertion point after the last instruction in the bundle. Set the + // debug location to Front. + Builder.SetInsertPoint(LastInst->getParent(), + std::next(LastInst->getIterator())); + } Builder.SetCurrentDebugLocation(Front->getDebugLoc()); } @@ -7594,7 +8668,7 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL) { auto *InsElt = dyn_cast<InsertElementInst>(Vec); if (!InsElt) return Vec; - GatherShuffleSeq.insert(InsElt); + GatherShuffleExtractSeq.insert(InsElt); CSEBlocks.insert(InsElt->getParent()); // Add to our 'need-to-extract' list. if (TreeEntry *Entry = getTreeEntry(V)) { @@ -7630,196 +8704,452 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL) { return Vec; } -namespace { -/// Merges shuffle masks and emits final shuffle instruction, if required. -class ShuffleInstructionBuilder { - IRBuilderBase &Builder; - const unsigned VF = 0; +/// Merges shuffle masks and emits final shuffle instruction, if required. It +/// supports shuffling of 2 input vectors. It implements lazy shuffles emission, +/// when the actual shuffle instruction is generated only if this is actually +/// required. Otherwise, the shuffle instruction emission is delayed till the +/// end of the process, to reduce the number of emitted instructions and further +/// analysis/transformations. +/// The class also will look through the previously emitted shuffle instructions +/// and properly mark indices in mask as undef. +/// For example, given the code +/// \code +/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0> +/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0> +/// \endcode +/// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will +/// look through %s1 and %s2 and emit +/// \code +/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3> +/// \endcode +/// instead. +/// If 2 operands are of different size, the smallest one will be resized and +/// the mask recalculated properly. +/// For example, given the code +/// \code +/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0> +/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0> +/// \endcode +/// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will +/// look through %s1 and %s2 and emit +/// \code +/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3> +/// \endcode +/// instead. +class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { bool IsFinalized = false; - SmallVector<int, 4> Mask; - /// Holds all of the instructions that we gathered. - SetVector<Instruction *> &GatherShuffleSeq; - /// A list of blocks that we are going to CSE. - SetVector<BasicBlock *> &CSEBlocks; + /// Combined mask for all applied operands and masks. It is built during + /// analysis and actual emission of shuffle vector instructions. + SmallVector<int> CommonMask; + /// List of operands for the shuffle vector instruction. It hold at max 2 + /// operands, if the 3rd is going to be added, the first 2 are combined into + /// shuffle with \p CommonMask mask, the first operand sets to be the + /// resulting shuffle and the second operand sets to be the newly added + /// operand. The \p CommonMask is transformed in the proper way after that. 
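The look-through described in the class comment above is a plain mask composition: every lane of the outer mask is redirected through the single-source mask of whichever operand it selects, so a shuffle of two shuffles collapses into one shuffle of the original vectors. A standalone sketch with std::vector masks and -1 for undef (this is not the builder's real entry point, just the index arithmetic):

#include <cstddef>
#include <vector>

// Outer is a two-source mask over the results of two single-source shuffles
// whose masks are Inner0 and Inner1; VF is the width of each inner source.
std::vector<int> lookThrough(const std::vector<int> &Outer,
                             const std::vector<int> &Inner0,
                             const std::vector<int> &Inner1, int VF) {
  std::vector<int> Folded(Outer.size(), -1);
  for (std::size_t I = 0; I < Outer.size(); ++I) {
    int Idx = Outer[I];
    if (Idx < 0)
      continue;                          // undef lane stays undef
    if (Idx < VF)
      Folded[I] = Inner0[Idx];           // lane originates in the first source
    else
      Folded[I] = Inner1[Idx - VF] + VF; // lane originates in the second source
  }
  return Folded;
}

// With Inner0 == Inner1 == {1, 0} and Outer == {1, 0, 3, 2} this returns
// {0, 1, 2, 3}, matching the example in the comment above.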
+ SmallVector<Value *, 2> InVectors; + IRBuilderBase &Builder; + BoUpSLP &R; -public: - ShuffleInstructionBuilder(IRBuilderBase &Builder, unsigned VF, - SetVector<Instruction *> &GatherShuffleSeq, - SetVector<BasicBlock *> &CSEBlocks) - : Builder(Builder), VF(VF), GatherShuffleSeq(GatherShuffleSeq), - CSEBlocks(CSEBlocks) {} - - /// Adds a mask, inverting it before applying. - void addInversedMask(ArrayRef<unsigned> SubMask) { - if (SubMask.empty()) - return; - SmallVector<int, 4> NewMask; - inversePermutation(SubMask, NewMask); - addMask(NewMask); - } + class ShuffleIRBuilder { + IRBuilderBase &Builder; + /// Holds all of the instructions that we gathered. + SetVector<Instruction *> &GatherShuffleExtractSeq; + /// A list of blocks that we are going to CSE. + SetVector<BasicBlock *> &CSEBlocks; + + public: + ShuffleIRBuilder(IRBuilderBase &Builder, + SetVector<Instruction *> &GatherShuffleExtractSeq, + SetVector<BasicBlock *> &CSEBlocks) + : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq), + CSEBlocks(CSEBlocks) {} + ~ShuffleIRBuilder() = default; + /// Creates shufflevector for the 2 operands with the given mask. + Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) { + Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask); + if (auto *I = dyn_cast<Instruction>(Vec)) { + GatherShuffleExtractSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + return Vec; + } + /// Creates permutation of the single vector operand with the given mask, if + /// it is not identity mask. + Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) { + if (Mask.empty()) + return V1; + unsigned VF = Mask.size(); + unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements(); + if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask)) + return V1; + Value *Vec = Builder.CreateShuffleVector(V1, Mask); + if (auto *I = dyn_cast<Instruction>(Vec)) { + GatherShuffleExtractSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + return Vec; + } + /// Resizes 2 input vector to match the sizes, if the they are not equal + /// yet. The smallest vector is resized to the size of the larger vector. + void resizeToMatch(Value *&V1, Value *&V2) { + if (V1->getType() == V2->getType()) + return; + int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements(); + int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements(); + int VF = std::max(V1VF, V2VF); + int MinVF = std::min(V1VF, V2VF); + SmallVector<int> IdentityMask(VF, UndefMaskElem); + std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF), + 0); + Value *&Op = MinVF == V1VF ? V1 : V2; + Op = Builder.CreateShuffleVector(Op, IdentityMask); + if (auto *I = dyn_cast<Instruction>(Op)) { + GatherShuffleExtractSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + if (MinVF == V1VF) + V1 = Op; + else + V2 = Op; + } + }; - /// Functions adds masks, merging them into single one. - void addMask(ArrayRef<unsigned> SubMask) { - SmallVector<int, 4> NewMask(SubMask.begin(), SubMask.end()); - addMask(NewMask); + /// Smart shuffle instruction emission, walks through shuffles trees and + /// tries to find the best matching vector for the actual shuffle + /// instruction. 
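resizeToMatch above widens the narrower operand with a shuffle whose mask keeps the existing lanes in place and leaves the tail undefined, so both operands of the outer shuffle end up with the same width. The mask itself is easy to state in isolation (illustrative helper, not part of the patch):

#include <numeric>
#include <vector>

// Mask that widens a vector of MinVF lanes to VF lanes: identity over the
// original lanes, -1 (undef) for the padding lanes.
std::vector<int> widenMask(int MinVF, int VF) {
  std::vector<int> Mask(VF, -1);
  std::iota(Mask.begin(), Mask.begin() + MinVF, 0);
  return Mask;
}

// widenMask(2, 4) == {0, 1, -1, -1}: the two original lanes stay in place and
// the upper half is left undefined.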
+ Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) { + assert(V1 && "Expected at least one vector value."); + ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq, + R.CSEBlocks); + return BaseShuffleAnalysis::createShuffle(V1, V2, Mask, ShuffleBuilder); } - void addMask(ArrayRef<int> SubMask) { ::addMask(Mask, SubMask); } + /// Transforms mask \p CommonMask per given \p Mask to make proper set after + /// shuffle emission. + static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask, + ArrayRef<int> Mask) { + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (Mask[Idx] != UndefMaskElem) + CommonMask[Idx] = Idx; + } - Value *finalize(Value *V) { +public: + ShuffleInstructionBuilder(IRBuilderBase &Builder, BoUpSLP &R) + : Builder(Builder), R(R) {} + + /// Adds 2 input vectors and the mask for their shuffling. + void add(Value *V1, Value *V2, ArrayRef<int> Mask) { + assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors."); + if (InVectors.empty()) { + InVectors.push_back(V1); + InVectors.push_back(V2); + CommonMask.assign(Mask.begin(), Mask.end()); + return; + } + Value *Vec = InVectors.front(); + if (InVectors.size() == 2) { + Vec = createShuffle(Vec, InVectors.back(), CommonMask); + transformMaskAfterShuffle(CommonMask, Mask); + } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() != + Mask.size()) { + Vec = createShuffle(Vec, nullptr, CommonMask); + transformMaskAfterShuffle(CommonMask, Mask); + } + V1 = createShuffle(V1, V2, Mask); + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (Mask[Idx] != UndefMaskElem) + CommonMask[Idx] = Idx + Sz; + InVectors.front() = Vec; + if (InVectors.size() == 2) + InVectors.back() = V1; + else + InVectors.push_back(V1); + } + /// Adds another one input vector and the mask for the shuffling. + void add(Value *V1, ArrayRef<int> Mask) { + if (InVectors.empty()) { + if (!isa<FixedVectorType>(V1->getType())) { + V1 = createShuffle(V1, nullptr, CommonMask); + CommonMask.assign(Mask.size(), UndefMaskElem); + transformMaskAfterShuffle(CommonMask, Mask); + } + InVectors.push_back(V1); + CommonMask.assign(Mask.begin(), Mask.end()); + return; + } + const auto *It = find(InVectors, V1); + if (It == InVectors.end()) { + if (InVectors.size() == 2 || + InVectors.front()->getType() != V1->getType() || + !isa<FixedVectorType>(V1->getType())) { + Value *V = InVectors.front(); + if (InVectors.size() == 2) { + V = createShuffle(InVectors.front(), InVectors.back(), CommonMask); + transformMaskAfterShuffle(CommonMask, CommonMask); + } else if (cast<FixedVectorType>(V->getType())->getNumElements() != + CommonMask.size()) { + V = createShuffle(InVectors.front(), nullptr, CommonMask); + transformMaskAfterShuffle(CommonMask, CommonMask); + } + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (CommonMask[Idx] == UndefMaskElem && Mask[Idx] != UndefMaskElem) + CommonMask[Idx] = + V->getType() != V1->getType() + ? Idx + Sz + : Mask[Idx] + cast<FixedVectorType>(V1->getType()) + ->getNumElements(); + if (V->getType() != V1->getType()) + V1 = createShuffle(V1, nullptr, Mask); + InVectors.front() = V; + if (InVectors.size() == 2) + InVectors.back() = V1; + else + InVectors.push_back(V1); + return; + } + // Check if second vector is required if the used elements are already + // used from the first one. 
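Once a shuffle has actually been emitted for the accumulated operands, the lanes it defines live at their own positions in the freshly produced vector; that is all transformMaskAfterShuffle above records. The same bookkeeping in standalone form (plain std::vector, -1 for undef):

#include <cstddef>
#include <vector>

// Rewrites the accumulated mask to the identity on every lane the just-emitted
// shuffle (described by Mask) actually defines.
void remapAfterEmission(std::vector<int> &Common, const std::vector<int> &Mask) {
  for (std::size_t Idx = 0; Idx < Common.size(); ++Idx)
    if (Mask[Idx] != -1)
      Common[Idx] = static_cast<int>(Idx);
}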
+ for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (Mask[Idx] != UndefMaskElem && CommonMask[Idx] == UndefMaskElem) { + InVectors.push_back(V1); + break; + } + } + int VF = CommonMask.size(); + if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType())) + VF = FTy->getNumElements(); + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (Mask[Idx] != UndefMaskElem && CommonMask[Idx] == UndefMaskElem) + CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF); + } + /// Adds another one input vector and the mask for the shuffling. + void addOrdered(Value *V1, ArrayRef<unsigned> Order) { + SmallVector<int> NewMask; + inversePermutation(Order, NewMask); + add(V1, NewMask); + } + /// Finalize emission of the shuffles. + Value * + finalize(ArrayRef<int> ExtMask = std::nullopt) { IsFinalized = true; - unsigned ValueVF = cast<FixedVectorType>(V->getType())->getNumElements(); - if (VF == ValueVF && Mask.empty()) - return V; - SmallVector<int, 4> NormalizedMask(VF, UndefMaskElem); - std::iota(NormalizedMask.begin(), NormalizedMask.end(), 0); - addMask(NormalizedMask); - - if (VF == ValueVF && ShuffleVectorInst::isIdentityMask(Mask)) - return V; - Value *Vec = Builder.CreateShuffleVector(V, Mask, "shuffle"); - if (auto *I = dyn_cast<Instruction>(Vec)) { - GatherShuffleSeq.insert(I); - CSEBlocks.insert(I->getParent()); + if (!ExtMask.empty()) { + if (CommonMask.empty()) { + CommonMask.assign(ExtMask.begin(), ExtMask.end()); + } else { + SmallVector<int> NewMask(ExtMask.size(), UndefMaskElem); + for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) { + if (ExtMask[I] == UndefMaskElem) + continue; + NewMask[I] = CommonMask[ExtMask[I]]; + } + CommonMask.swap(NewMask); + } } - return Vec; + if (CommonMask.empty()) { + assert(InVectors.size() == 1 && "Expected only one vector with no mask"); + return InVectors.front(); + } + if (InVectors.size() == 2) + return createShuffle(InVectors.front(), InVectors.back(), CommonMask); + return createShuffle(InVectors.front(), nullptr, CommonMask); } ~ShuffleInstructionBuilder() { - assert((IsFinalized || Mask.empty()) && + assert((IsFinalized || CommonMask.empty()) && "Shuffle construction must be finalized."); } }; -} // namespace -Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { +Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) { + ArrayRef<Value *> VL = E->getOperand(NodeIdx); const unsigned VF = VL.size(); - InstructionsState S = getSameOpcode(VL); + InstructionsState S = getSameOpcode(VL, *TLI); // Special processing for GEPs bundle, which may include non-gep values. if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) { const auto *It = find_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); }); if (It != VL.end()) - S = getSameOpcode(*It); + S = getSameOpcode(*It, *TLI); } if (S.getOpcode()) { - if (TreeEntry *E = getTreeEntry(S.OpValue)) - if (E->isSame(VL)) { - Value *V = vectorizeTree(E); - if (VF != cast<FixedVectorType>(V->getType())->getNumElements()) { - if (!E->ReuseShuffleIndices.empty()) { - // Reshuffle to get only unique values. - // If some of the scalars are duplicated in the vectorization tree - // entry, we do not vectorize them but instead generate a mask for - // the reuses. But if there are several users of the same entry, - // they may have different vectorization factors. This is especially - // important for PHI nodes. 
In this case, we need to adapt the - // resulting instruction for the user vectorization factor and have - // to reshuffle it again to take only unique elements of the vector. - // Without this code the function incorrectly returns reduced vector - // instruction with the same elements, not with the unique ones. - - // block: - // %phi = phi <2 x > { .., %entry} {%shuffle, %block} - // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0> - // ... (use %2) - // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0} - // br %block - SmallVector<int> UniqueIdxs(VF, UndefMaskElem); - SmallSet<int, 4> UsedIdxs; - int Pos = 0; - int Sz = VL.size(); - for (int Idx : E->ReuseShuffleIndices) { - if (Idx != Sz && Idx != UndefMaskElem && - UsedIdxs.insert(Idx).second) - UniqueIdxs[Idx] = Pos; - ++Pos; - } - assert(VF >= UsedIdxs.size() && "Expected vectorization factor " - "less than original vector size."); - UniqueIdxs.append(VF - UsedIdxs.size(), UndefMaskElem); - V = Builder.CreateShuffleVector(V, UniqueIdxs, "shrink.shuffle"); - } else { - assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() && - "Expected vectorization factor less " - "than original vector size."); - SmallVector<int> UniformMask(VF, 0); - std::iota(UniformMask.begin(), UniformMask.end(), 0); - V = Builder.CreateShuffleVector(V, UniformMask, "shrink.shuffle"); - } - if (auto *I = dyn_cast<Instruction>(V)) { - GatherShuffleSeq.insert(I); - CSEBlocks.insert(I->getParent()); + if (TreeEntry *VE = getTreeEntry(S.OpValue); + VE && VE->isSame(VL) && + (any_of(VE->UserTreeIndices, + [E, NodeIdx](const EdgeInfo &EI) { + return EI.UserTE == E && EI.EdgeIdx == NodeIdx; + }) || + any_of(VectorizableTree, + [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) { + return TE->isOperandGatherNode({E, NodeIdx}) && + VE->isSame(TE->Scalars); + }))) { + auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) { + ShuffleInstructionBuilder ShuffleBuilder(Builder, *this); + ShuffleBuilder.add(V, Mask); + return ShuffleBuilder.finalize(std::nullopt); + }; + Value *V = vectorizeTree(VE); + if (VF != cast<FixedVectorType>(V->getType())->getNumElements()) { + if (!VE->ReuseShuffleIndices.empty()) { + // Reshuffle to get only unique values. + // If some of the scalars are duplicated in the vectorization + // tree entry, we do not vectorize them but instead generate a + // mask for the reuses. But if there are several users of the + // same entry, they may have different vectorization factors. + // This is especially important for PHI nodes. In this case, we + // need to adapt the resulting instruction for the user + // vectorization factor and have to reshuffle it again to take + // only unique elements of the vector. Without this code the + // function incorrectly returns reduced vector instruction with + // the same elements, not with the unique ones. + + // block: + // %phi = phi <2 x > { .., %entry} {%shuffle, %block} + // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0> + // ... 
(use %2) + // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0} + // br %block + SmallVector<int> UniqueIdxs(VF, UndefMaskElem); + SmallSet<int, 4> UsedIdxs; + int Pos = 0; + for (int Idx : VE->ReuseShuffleIndices) { + if (Idx != static_cast<int>(VF) && Idx != UndefMaskElem && + UsedIdxs.insert(Idx).second) + UniqueIdxs[Idx] = Pos; + ++Pos; } + assert(VF >= UsedIdxs.size() && "Expected vectorization factor " + "less than original vector size."); + UniqueIdxs.append(VF - UsedIdxs.size(), UndefMaskElem); + V = FinalShuffle(V, UniqueIdxs); + } else { + assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() && + "Expected vectorization factor less " + "than original vector size."); + SmallVector<int> UniformMask(VF, 0); + std::iota(UniformMask.begin(), UniformMask.end(), 0); + V = FinalShuffle(V, UniformMask); } - return V; } + return V; + } } - // Can't vectorize this, so simply build a new vector with each lane - // corresponding to the requested value. - return createBuildVector(VL); + // Find the corresponding gather entry and vectorize it. + // Allows to be more accurate with tree/graph transformations, checks for the + // correctness of the transformations in many cases. + auto *I = find_if(VectorizableTree, + [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) { + return TE->isOperandGatherNode({E, NodeIdx}); + }); + assert(I != VectorizableTree.end() && "Gather node is not in the graph."); + assert(I->get()->UserTreeIndices.size() == 1 && + "Expected only single user for the gather node."); + assert(I->get()->isSame(VL) && "Expected same list of scalars."); + IRBuilder<>::InsertPointGuard Guard(Builder); + if (E->getOpcode() != Instruction::InsertElement && + E->getOpcode() != Instruction::PHI) { + Instruction *LastInst = EntryToLastInstruction.lookup(E); + assert(LastInst && "Failed to find last instruction in bundle"); + Builder.SetInsertPoint(LastInst); + } + return vectorizeTree(I->get()); } -Value *BoUpSLP::createBuildVector(ArrayRef<Value *> VL) { - assert(any_of(VectorizableTree, - [VL](const std::unique_ptr<TreeEntry> &TE) { - return TE->State == TreeEntry::NeedToGather && TE->isSame(VL); - }) && - "Non-matching gather node."); - unsigned VF = VL.size(); - // Exploit possible reuse of values across lanes. - SmallVector<int> ReuseShuffleIndicies; - SmallVector<Value *> UniqueValues; - if (VL.size() > 2) { + +Value *BoUpSLP::createBuildVector(const TreeEntry *E) { + assert(E->State == TreeEntry::NeedToGather && "Expected gather node."); + unsigned VF = E->getVectorFactor(); + + ShuffleInstructionBuilder ShuffleBuilder(Builder, *this); + SmallVector<Value *> Gathered( + VF, PoisonValue::get(E->Scalars.front()->getType())); + bool NeedFreeze = false; + SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end()); + // Build a mask out of the redorder indices and reorder scalars per this mask. + SmallVector<int> ReorderMask; + inversePermutation(E->ReorderIndices, ReorderMask); + if (!ReorderMask.empty()) + reorderScalars(VL, ReorderMask); + SmallVector<int> ReuseMask(VF, UndefMaskElem); + if (!allConstant(VL)) { + // For splats with can emit broadcasts instead of gathers, so try to find + // such sequences. 
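createBuildVector above, like several other places in the patch, turns ReorderIndices into a shuffle mask with inversePermutation. Assuming the usual convention that element I of the input is sent to position Order[I], the inversion is just the scatter below (illustrative helper name, -1 for positions the order does not mention):

#include <vector>

std::vector<int> invertOrder(const std::vector<unsigned> &Order) {
  std::vector<int> Mask(Order.size(), -1);
  for (unsigned I = 0; I < Order.size(); ++I)
    Mask[Order[I]] = static_cast<int>(I); // output lane Order[I] reads input lane I
  return Mask;
}

// invertOrder({2, 0, 1}) == {1, 2, 0}: read as a shufflevector mask (output
// lane J takes input lane Mask[J]), it places input element I at position
// Order[I].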
+ bool IsSplat = isSplat(VL) && (VL.size() > 2 || VL.front() == VL.back()); + SmallVector<int> UndefPos; DenseMap<Value *, unsigned> UniquePositions; - unsigned NumValues = - std::distance(VL.begin(), find_if(reverse(VL), [](Value *V) { - return !isa<UndefValue>(V); - }).base()); - VF = std::max<unsigned>(VF, PowerOf2Ceil(NumValues)); - int UniqueVals = 0; - for (Value *V : VL.drop_back(VL.size() - VF)) { + // Gather unique non-const values and all constant values. + // For repeated values, just shuffle them. + for (auto [I, V] : enumerate(VL)) { if (isa<UndefValue>(V)) { - ReuseShuffleIndicies.emplace_back(UndefMaskElem); + if (!isa<PoisonValue>(V)) { + Gathered[I] = V; + ReuseMask[I] = I; + UndefPos.push_back(I); + } continue; } if (isConstant(V)) { - ReuseShuffleIndicies.emplace_back(UniqueValues.size()); - UniqueValues.emplace_back(V); + Gathered[I] = V; + ReuseMask[I] = I; continue; } - auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); - ReuseShuffleIndicies.emplace_back(Res.first->second); - if (Res.second) { - UniqueValues.emplace_back(V); - ++UniqueVals; - } - } - if (UniqueVals == 1 && UniqueValues.size() == 1) { - // Emit pure splat vector. - ReuseShuffleIndicies.append(VF - ReuseShuffleIndicies.size(), - UndefMaskElem); - } else if (UniqueValues.size() >= VF - 1 || UniqueValues.size() <= 1) { - if (UniqueValues.empty()) { - assert(all_of(VL, UndefValue::classof) && "Expected list of undefs."); - NumValues = VF; + if (IsSplat) { + Gathered.front() = V; + ReuseMask[I] = 0; + } else { + const auto Res = UniquePositions.try_emplace(V, I); + Gathered[Res.first->second] = V; + ReuseMask[I] = Res.first->second; + } + } + if (!UndefPos.empty() && IsSplat) { + // For undef values, try to replace them with the simple broadcast. + // We can do it if the broadcasted value is guaranteed to be + // non-poisonous, or by freezing the incoming scalar value first. + auto *It = find_if(Gathered, [this, E](Value *V) { + return !isa<UndefValue>(V) && + (getTreeEntry(V) || isGuaranteedNotToBePoison(V) || + any_of(V->uses(), [E](const Use &U) { + // Check if the value already used in the same operation in + // one of the nodes already. + return E->UserTreeIndices.size() == 1 && + is_contained( + E->UserTreeIndices.front().UserTE->Scalars, + U.getUser()) && + E->UserTreeIndices.front().EdgeIdx != U.getOperandNo(); + })); + }); + if (It != Gathered.end()) { + // Replace undefs by the non-poisoned scalars and emit broadcast. + int Pos = std::distance(Gathered.begin(), It); + for_each(UndefPos, [&](int I) { + // Set the undef position to the non-poisoned scalar. + ReuseMask[I] = Pos; + // Replace the undef by the poison, in the mask it is replaced by non-poisoned scalar already. + if (I != Pos) + Gathered[I] = PoisonValue::get(Gathered[I]->getType()); + }); + } else { + // Replace undefs by the poisons, emit broadcast and then emit + // freeze. 
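Before the splat and undef special cases above apply, the non-constant path deduplicates the gathered scalars: each distinct value is materialized once and ReuseMask records, for every original lane, which slot of the gathered vector to read back. A standalone sketch of just that step, with strings standing in for scalars and the constant, undef and splat handling of the patch left out:

#include <cstddef>
#include <string>
#include <unordered_map>
#include <vector>

void dedupGather(const std::vector<std::string> &VL,
                 std::vector<std::string> &Gathered,
                 std::vector<int> &ReuseMask) {
  Gathered.assign(VL.size(), "");
  ReuseMask.assign(VL.size(), -1);
  std::unordered_map<std::string, int> UniquePositions;
  for (std::size_t I = 0; I < VL.size(); ++I) {
    auto Res = UniquePositions.emplace(VL[I], static_cast<int>(I));
    Gathered[Res.first->second] = VL[I]; // first occurrence claims the slot
    ReuseMask[I] = Res.first->second;    // later lanes just point at it
  }
}

// For VL == {a, b, a, b}: Gathered == {a, b, "", ""} and ReuseMask ==
// {0, 1, 0, 1}, so only two elements are inserted and the final shuffle
// recreates the duplicates.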
+ for_each(UndefPos, [&](int I) { + ReuseMask[I] = UndefMaskElem; + if (isa<UndefValue>(Gathered[I])) + Gathered[I] = PoisonValue::get(Gathered[I]->getType()); + }); + NeedFreeze = true; } - ReuseShuffleIndicies.clear(); - UniqueValues.clear(); - UniqueValues.append(VL.begin(), std::next(VL.begin(), NumValues)); } - UniqueValues.append(VF - UniqueValues.size(), - PoisonValue::get(VL[0]->getType())); - VL = UniqueValues; - } - - ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleSeq, - CSEBlocks); - Value *Vec = gather(VL); - if (!ReuseShuffleIndicies.empty()) { - ShuffleBuilder.addMask(ReuseShuffleIndicies); - Vec = ShuffleBuilder.finalize(Vec); - } + } else { + ReuseMask.clear(); + copy(VL, Gathered.begin()); + } + // Gather unique scalars and all constants. + Value *Vec = gather(Gathered); + ShuffleBuilder.add(Vec, ReuseMask); + Vec = ShuffleBuilder.finalize(E->ReuseShuffleIndices); + if (NeedFreeze) + Vec = Builder.CreateFreeze(Vec); return Vec; } @@ -7831,34 +9161,55 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return E->VectorizedValue; } - bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); - unsigned VF = E->getVectorFactor(); - ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleSeq, - CSEBlocks); + auto FinalShuffle = [&](Value *V, const TreeEntry *E) { + ShuffleInstructionBuilder ShuffleBuilder(Builder, *this); + if (E->State != TreeEntry::NeedToGather && + E->getOpcode() == Instruction::Store) { + ArrayRef<int> Mask = + ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()), + E->ReorderIndices.size()); + ShuffleBuilder.add(V, Mask); + } else { + ShuffleBuilder.addOrdered(V, E->ReorderIndices); + } + return ShuffleBuilder.finalize(E->ReuseShuffleIndices); + }; + if (E->State == TreeEntry::NeedToGather) { + if (E->Idx > 0) { + // We are in the middle of a vectorizable chain. We need to gather the + // scalars from the users. + Value *Vec = createBuildVector(E); + E->VectorizedValue = Vec; + return Vec; + } if (E->getMainOp()) setInsertPointAfterBundle(E); + SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end()); + // Build a mask out of the reorder indices and reorder scalars per this + // mask. 
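For both gathered and vectorized nodes, the FinalShuffle step composes two masks: first the inverted ReorderIndices accumulated through addOrdered, then the ReuseShuffleIndices handed to finalize, which simply indexes into the accumulated mask (NewMask[I] = CommonMask[ExtMask[I]] in the builder above). The composition in isolation (illustrative helper; indices are assumed to be in range):

#include <cstddef>
#include <vector>

// Out[I] = Common[Ext[I]]; -1 lanes of Ext stay undef.
std::vector<int> composeMasks(const std::vector<int> &Common,
                              const std::vector<int> &Ext) {
  std::vector<int> Out(Ext.size(), -1);
  for (std::size_t I = 0; I < Ext.size(); ++I)
    if (Ext[I] != -1)
      Out[I] = Common[Ext[I]];
  return Out;
}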
+ SmallVector<int> ReorderMask; + inversePermutation(E->ReorderIndices, ReorderMask); + if (!ReorderMask.empty()) + reorderScalars(GatheredScalars, ReorderMask); Value *Vec; SmallVector<int> Mask; SmallVector<const TreeEntry *> Entries; - Optional<TargetTransformInfo::ShuffleKind> Shuffle = - isGatherShuffledEntry(E, Mask, Entries); + std::optional<TargetTransformInfo::ShuffleKind> Shuffle = + isGatherShuffledEntry(E, GatheredScalars, Mask, Entries); if (Shuffle) { assert((Entries.size() == 1 || Entries.size() == 2) && "Expected shuffle of 1 or 2 entries."); Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue, Entries.back()->VectorizedValue, Mask); if (auto *I = dyn_cast<Instruction>(Vec)) { - GatherShuffleSeq.insert(I); + GatherShuffleExtractSeq.insert(I); CSEBlocks.insert(I->getParent()); } } else { Vec = gather(E->Scalars); } - if (NeedToShuffleReuses) { - ShuffleBuilder.addMask(E->ReuseShuffleIndices); - Vec = ShuffleBuilder.finalize(Vec); - } + Vec = FinalShuffle(Vec, E); E->VectorizedValue = Vec; return Vec; } @@ -7891,9 +9242,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Builder.SetInsertPoint(&*PH->getParent()->getFirstInsertionPt()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); - ShuffleBuilder.addInversedMask(E->ReorderIndices); - ShuffleBuilder.addMask(E->ReuseShuffleIndices); - V = ShuffleBuilder.finalize(V); + V = FinalShuffle(V, E); E->VectorizedValue = V; @@ -7905,6 +9254,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ValueList Operands; BasicBlock *IBB = PH->getIncomingBlock(i); + // Stop emission if all incoming values are generated. + if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) { + LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); + return V; + } + if (!VisitedBBs.insert(IBB).second) { NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB); continue; @@ -7912,7 +9267,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Builder.SetInsertPoint(IBB->getTerminator()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); - Value *Vec = vectorizeTree(E->getOperand(i)); + Value *Vec = vectorizeOperand(E, i); NewPhi->addIncoming(Vec, IBB); } @@ -7923,10 +9278,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::ExtractElement: { Value *V = E->getSingleOperand(0); - Builder.SetInsertPoint(VL0); - ShuffleBuilder.addInversedMask(E->ReorderIndices); - ShuffleBuilder.addMask(E->ReuseShuffleIndices); - V = ShuffleBuilder.finalize(V); + setInsertPointAfterBundle(E); + V = FinalShuffle(V, E); E->VectorizedValue = V; return V; } @@ -7937,16 +9290,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy); LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign()); Value *NewV = propagateMetadata(V, E->Scalars); - ShuffleBuilder.addInversedMask(E->ReorderIndices); - ShuffleBuilder.addMask(E->ReuseShuffleIndices); - NewV = ShuffleBuilder.finalize(NewV); + NewV = FinalShuffle(NewV, E); E->VectorizedValue = NewV; return NewV; } case Instruction::InsertElement: { assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique"); Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back())); - Value *V = vectorizeTree(E->getOperand(1)); + Value *V = vectorizeOperand(E, 1); // Create InsertVector shuffle if necessary auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) { @@ -7981,27 +9332,58 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (!IsIdentity || NumElts != NumScalars) { V = 
Builder.CreateShuffleVector(V, Mask); if (auto *I = dyn_cast<Instruction>(V)) { - GatherShuffleSeq.insert(I); + GatherShuffleExtractSeq.insert(I); CSEBlocks.insert(I->getParent()); } } - if ((!IsIdentity || Offset != 0 || - !isUndefVector(FirstInsert->getOperand(0))) && + SmallVector<int> InsertMask(NumElts, UndefMaskElem); + for (unsigned I = 0; I < NumElts; I++) { + if (Mask[I] != UndefMaskElem) + InsertMask[Offset + I] = I; + } + SmallBitVector UseMask = + buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask); + SmallBitVector IsFirstUndef = + isUndefVector(FirstInsert->getOperand(0), UseMask); + if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) && NumElts != NumScalars) { - SmallVector<int> InsertMask(NumElts); - std::iota(InsertMask.begin(), InsertMask.end(), 0); - for (unsigned I = 0; I < NumElts; I++) { - if (Mask[I] != UndefMaskElem) - InsertMask[Offset + I] = NumElts + I; - } - - V = Builder.CreateShuffleVector( - FirstInsert->getOperand(0), V, InsertMask, - cast<Instruction>(E->Scalars.back())->getName()); - if (auto *I = dyn_cast<Instruction>(V)) { - GatherShuffleSeq.insert(I); - CSEBlocks.insert(I->getParent()); + if (IsFirstUndef.all()) { + if (!ShuffleVectorInst::isIdentityMask(InsertMask)) { + SmallBitVector IsFirstPoison = + isUndefVector<true>(FirstInsert->getOperand(0), UseMask); + if (!IsFirstPoison.all()) { + for (unsigned I = 0; I < NumElts; I++) { + if (InsertMask[I] == UndefMaskElem && !IsFirstPoison.test(I)) + InsertMask[I] = I + NumElts; + } + } + V = Builder.CreateShuffleVector( + V, + IsFirstPoison.all() ? PoisonValue::get(V->getType()) + : FirstInsert->getOperand(0), + InsertMask, cast<Instruction>(E->Scalars.back())->getName()); + if (auto *I = dyn_cast<Instruction>(V)) { + GatherShuffleExtractSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + } + } else { + SmallBitVector IsFirstPoison = + isUndefVector<true>(FirstInsert->getOperand(0), UseMask); + for (unsigned I = 0; I < NumElts; I++) { + if (InsertMask[I] == UndefMaskElem) + InsertMask[I] = IsFirstPoison.test(I) ? 
UndefMaskElem : I; + else + InsertMask[I] += NumElts; + } + V = Builder.CreateShuffleVector( + FirstInsert->getOperand(0), V, InsertMask, + cast<Instruction>(E->Scalars.back())->getName()); + if (auto *I = dyn_cast<Instruction>(V)) { + GatherShuffleExtractSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } } } @@ -8023,8 +9405,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::BitCast: { setInsertPointAfterBundle(E); - Value *InVec = vectorizeTree(E->getOperand(0)); - + Value *InVec = vectorizeOperand(E, 0); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; @@ -8032,9 +9413,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { auto *CI = cast<CastInst>(VL0); Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); - ShuffleBuilder.addInversedMask(E->ReorderIndices); - ShuffleBuilder.addMask(E->ReuseShuffleIndices); - V = ShuffleBuilder.finalize(V); + V = FinalShuffle(V, E); E->VectorizedValue = V; ++NumVectorInstructions; @@ -8044,9 +9423,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::ICmp: { setInsertPointAfterBundle(E); - Value *L = vectorizeTree(E->getOperand(0)); - Value *R = vectorizeTree(E->getOperand(1)); - + Value *L = vectorizeOperand(E, 0); + if (E->VectorizedValue) { + LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); + return E->VectorizedValue; + } + Value *R = vectorizeOperand(E, 1); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; @@ -8055,9 +9437,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); Value *V = Builder.CreateCmp(P0, L, R); propagateIRFlags(V, E->Scalars, VL0); - ShuffleBuilder.addInversedMask(E->ReorderIndices); - ShuffleBuilder.addMask(E->ReuseShuffleIndices); - V = ShuffleBuilder.finalize(V); + V = FinalShuffle(V, E); E->VectorizedValue = V; ++NumVectorInstructions; @@ -8066,19 +9446,24 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::Select: { setInsertPointAfterBundle(E); - Value *Cond = vectorizeTree(E->getOperand(0)); - Value *True = vectorizeTree(E->getOperand(1)); - Value *False = vectorizeTree(E->getOperand(2)); - + Value *Cond = vectorizeOperand(E, 0); + if (E->VectorizedValue) { + LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); + return E->VectorizedValue; + } + Value *True = vectorizeOperand(E, 1); + if (E->VectorizedValue) { + LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); + return E->VectorizedValue; + } + Value *False = vectorizeOperand(E, 2); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } Value *V = Builder.CreateSelect(Cond, True, False); - ShuffleBuilder.addInversedMask(E->ReorderIndices); - ShuffleBuilder.addMask(E->ReuseShuffleIndices); - V = ShuffleBuilder.finalize(V); + V = FinalShuffle(V, E); E->VectorizedValue = V; ++NumVectorInstructions; @@ -8087,7 +9472,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::FNeg: { setInsertPointAfterBundle(E); - Value *Op = vectorizeTree(E->getOperand(0)); + Value *Op = vectorizeOperand(E, 0); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -8100,9 +9485,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (auto *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); - ShuffleBuilder.addInversedMask(E->ReorderIndices); - 
ShuffleBuilder.addMask(E->ReuseShuffleIndices); - V = ShuffleBuilder.finalize(V); + V = FinalShuffle(V, E); E->VectorizedValue = V; ++NumVectorInstructions; @@ -8129,9 +9512,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::Xor: { setInsertPointAfterBundle(E); - Value *LHS = vectorizeTree(E->getOperand(0)); - Value *RHS = vectorizeTree(E->getOperand(1)); - + Value *LHS = vectorizeOperand(E, 0); + if (E->VectorizedValue) { + LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); + return E->VectorizedValue; + } + Value *RHS = vectorizeOperand(E, 1); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; @@ -8144,9 +9530,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (auto *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); - ShuffleBuilder.addInversedMask(E->ReorderIndices); - ShuffleBuilder.addMask(E->ReuseShuffleIndices); - V = ShuffleBuilder.finalize(V); + V = FinalShuffle(V, E); E->VectorizedValue = V; ++NumVectorInstructions; @@ -8177,7 +9561,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } } else { assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state"); - Value *VecPtr = vectorizeTree(E->getOperand(0)); + Value *VecPtr = vectorizeOperand(E, 0); + if (E->VectorizedValue) { + LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); + return E->VectorizedValue; + } // Use the minimum alignment of the gathered loads. Align CommonAlignment = LI->getAlign(); for (Value *V : E->Scalars) @@ -8187,9 +9575,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *V = propagateMetadata(NewLI, E->Scalars); - ShuffleBuilder.addInversedMask(E->ReorderIndices); - ShuffleBuilder.addMask(E->ReuseShuffleIndices); - V = ShuffleBuilder.finalize(V); + V = FinalShuffle(V, E); E->VectorizedValue = V; ++NumVectorInstructions; return V; @@ -8200,9 +9586,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { setInsertPointAfterBundle(E); - Value *VecValue = vectorizeTree(E->getOperand(0)); - ShuffleBuilder.addMask(E->ReorderIndices); - VecValue = ShuffleBuilder.finalize(VecValue); + Value *VecValue = vectorizeOperand(E, 0); + VecValue = FinalShuffle(VecValue, E); Value *ScalarPtr = SI->getPointerOperand(); Value *VecPtr = Builder.CreateBitCast( @@ -8231,11 +9616,19 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { auto *GEP0 = cast<GetElementPtrInst>(VL0); setInsertPointAfterBundle(E); - Value *Op0 = vectorizeTree(E->getOperand(0)); + Value *Op0 = vectorizeOperand(E, 0); + if (E->VectorizedValue) { + LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); + return E->VectorizedValue; + } SmallVector<Value *> OpVecs; for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) { - Value *OpVec = vectorizeTree(E->getOperand(J)); + Value *OpVec = vectorizeOperand(E, J); + if (E->VectorizedValue) { + LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); + return E->VectorizedValue; + } OpVecs.push_back(OpVec); } @@ -8249,9 +9642,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { V = propagateMetadata(I, GEPs); } - ShuffleBuilder.addInversedMask(E->ReorderIndices); - ShuffleBuilder.addMask(E->ReuseShuffleIndices); - V = ShuffleBuilder.finalize(V); + V = FinalShuffle(V, E); E->VectorizedValue = V; ++NumVectorInstructions; @@ -8289,7 +9680,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { continue; } - Value *OpVec = vectorizeTree(E->getOperand(j)); + Value *OpVec = vectorizeOperand(E, j); + if (E->VectorizedValue) { + LLVM_DEBUG(dbgs() << "SLP: 
Diamond merged for " << *VL0 << ".\n"); + return E->VectorizedValue; + } LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); OpVecs.push_back(OpVec); if (isVectorIntrinsicWithOverloadTypeAtArg(IID, j)) @@ -8324,9 +9719,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } propagateIRFlags(V, E->Scalars, VL0); - ShuffleBuilder.addInversedMask(E->ReorderIndices); - ShuffleBuilder.addMask(E->ReuseShuffleIndices); - V = ShuffleBuilder.finalize(V); + V = FinalShuffle(V, E); E->VectorizedValue = V; ++NumVectorInstructions; @@ -8344,13 +9737,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *LHS = nullptr, *RHS = nullptr; if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) { setInsertPointAfterBundle(E); - LHS = vectorizeTree(E->getOperand(0)); - RHS = vectorizeTree(E->getOperand(1)); + LHS = vectorizeOperand(E, 0); + if (E->VectorizedValue) { + LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); + return E->VectorizedValue; + } + RHS = vectorizeOperand(E, 1); } else { setInsertPointAfterBundle(E); - LHS = vectorizeTree(E->getOperand(0)); + LHS = vectorizeOperand(E, 0); } - if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; @@ -8377,7 +9773,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { // instruction, if any. for (Value *V : {V0, V1}) { if (auto *I = dyn_cast<Instruction>(V)) { - GatherShuffleSeq.insert(I); + GatherShuffleExtractSeq.insert(I); CSEBlocks.insert(I->getParent()); } } @@ -8389,9 +9785,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { SmallVector<int> Mask; buildShuffleEntryMask( E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, - [E](Instruction *I) { + [E, this](Instruction *I) { assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); - return isAlternateInstruction(I, E->getMainOp(), E->getAltOp()); + return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(), + *TLI); }, Mask, &OpScalars, &AltScalars); @@ -8401,10 +9798,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *V = Builder.CreateShuffleVector(V0, V1, Mask); if (auto *I = dyn_cast<Instruction>(V)) { V = propagateMetadata(I, E->Scalars); - GatherShuffleSeq.insert(I); + GatherShuffleExtractSeq.insert(I); CSEBlocks.insert(I->getParent()); } - V = ShuffleBuilder.finalize(V); E->VectorizedValue = V; ++NumVectorInstructions; @@ -8433,14 +9829,27 @@ struct ShuffledInsertData { }; } // namespace -Value * -BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { +Value *BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues, + Instruction *ReductionRoot) { // All blocks must be scheduled before any instructions are inserted. for (auto &BSIter : BlocksSchedules) { scheduleBlock(BSIter.second.get()); } - Builder.SetInsertPoint(&F->getEntryBlock().front()); + // Pre-gather last instructions. + for (const std::unique_ptr<TreeEntry> &E : VectorizableTree) { + if ((E->State == TreeEntry::NeedToGather && + (!E->getMainOp() || E->Idx > 0)) || + (E->State != TreeEntry::NeedToGather && + E->getOpcode() == Instruction::ExtractValue) || + E->getOpcode() == Instruction::InsertElement) + continue; + Instruction *LastInst = &getLastInstructionInBundle(E.get()); + EntryToLastInstruction.try_emplace(E.get(), LastInst); + } + + Builder.SetInsertPoint(ReductionRoot ? 
ReductionRoot + : &F->getEntryBlock().front()); auto *VectorRoot = vectorizeTree(VectorizableTree[0].get()); // If the vectorized tree can be rewritten in a smaller type, we truncate the @@ -8469,6 +9878,9 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { SmallVector<ShuffledInsertData> ShuffledInserts; // Maps vector instruction to original insertelement instruction DenseMap<Value *, InsertElementInst *> VectorToInsertElement; + // Maps extract Scalar to the corresponding extractelement instruction in the + // basic block. Only one extractelement per block should be emitted. + DenseMap<Value *, DenseMap<BasicBlock *, Instruction *>> ScalarToEEs; // Extract all of the elements with the external uses. for (const auto &ExternalUse : ExternalUses) { Value *Scalar = ExternalUse.Scalar; @@ -8493,13 +9905,36 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { Value *Lane = Builder.getInt32(ExternalUse.Lane); auto ExtractAndExtendIfNeeded = [&](Value *Vec) { if (Scalar->getType() != Vec->getType()) { - Value *Ex; - // "Reuse" the existing extract to improve final codegen. - if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) { - Ex = Builder.CreateExtractElement(ES->getOperand(0), - ES->getOperand(1)); - } else { - Ex = Builder.CreateExtractElement(Vec, Lane); + Value *Ex = nullptr; + auto It = ScalarToEEs.find(Scalar); + if (It != ScalarToEEs.end()) { + // No need to emit many extracts, just move the only one in the + // current block. + auto EEIt = It->second.find(Builder.GetInsertBlock()); + if (EEIt != It->second.end()) { + Instruction *I = EEIt->second; + if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() && + Builder.GetInsertPoint()->comesBefore(I)) + I->moveBefore(&*Builder.GetInsertPoint()); + Ex = I; + } + } + if (!Ex) { + // "Reuse" the existing extract to improve final codegen. + if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) { + Ex = Builder.CreateExtractElement(ES->getOperand(0), + ES->getOperand(1)); + } else { + Ex = Builder.CreateExtractElement(Vec, Lane); + } + if (auto *I = dyn_cast<Instruction>(Ex)) + ScalarToEEs[Scalar].try_emplace(Builder.GetInsertBlock(), I); + } + // The then branch of the previous if may produce constants, since 0 + // operand might be a constant. + if (auto *ExI = dyn_cast<Instruction>(Ex)) { + GatherShuffleExtractSeq.insert(ExI); + CSEBlocks.insert(ExI->getParent()); } // If necessary, sign-extend or zero-extend ScalarRoot // to the larger type. @@ -8524,13 +9959,15 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { "Scalar with nullptr as an external user must be registered in " "ExternallyUsedValues map"); if (auto *VecI = dyn_cast<Instruction>(Vec)) { - Builder.SetInsertPoint(VecI->getParent(), - std::next(VecI->getIterator())); + if (auto *PHI = dyn_cast<PHINode>(VecI)) + Builder.SetInsertPoint(PHI->getParent()->getFirstNonPHI()); + else + Builder.SetInsertPoint(VecI->getParent(), + std::next(VecI->getIterator())); } else { Builder.SetInsertPoint(&F->getEntryBlock().front()); } Value *NewInst = ExtractAndExtendIfNeeded(Vec); - CSEBlocks.insert(cast<Instruction>(Scalar)->getParent()); auto &NewInstLocs = ExternallyUsedValues[NewInst]; auto It = ExternallyUsedValues.find(Scalar); assert(It != ExternallyUsedValues.end() && @@ -8546,7 +9983,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { // Skip if the scalar is another vector op or Vec is not an instruction. 
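The ScalarToEEs map above keeps at most one extractelement per externally used scalar and per basic block, so several users in the same block share a single extract (which may also get hoisted up to the current insertion point). The caching pattern reduced to plain std containers, with strings as placeholders; none of this is the real IR API:

#include <map>
#include <string>
#include <utility>

class ExtractCache {
  // (scalar, block) -> name of the extract already emitted in that block.
  std::map<std::pair<std::string, std::string>, std::string> Cache;

public:
  std::string getOrCreate(const std::string &Scalar, const std::string &Block) {
    auto It = Cache.find({Scalar, Block});
    if (It != Cache.end())
      return It->second; // reuse the extract emitted earlier in this block
    std::string Name = "extract." + Scalar + "." + Block;
    Cache.emplace(std::make_pair(Scalar, Block), Name);
    return Name;
  }
};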
if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) { if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) { - Optional<unsigned> InsertIdx = getInsertIndex(VU); + std::optional<unsigned> InsertIdx = getInsertIndex(VU); if (InsertIdx) { // Need to use original vector, if the root is truncated. if (MinBWs.count(Scalar) && @@ -8556,7 +9993,9 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) { // Checks if 2 insertelements are from the same buildvector. InsertElementInst *VecInsert = Data.InsertElements.front(); - return areTwoInsertFromSameBuildVector(VU, VecInsert); + return areTwoInsertFromSameBuildVector( + VU, VecInsert, + [](InsertElementInst *II) { return II->getOperand(0); }); }); unsigned Idx = *InsertIdx; if (It == ShuffledInserts.end()) { @@ -8620,157 +10059,42 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator()); } Value *NewInst = ExtractAndExtendIfNeeded(Vec); - CSEBlocks.insert(PH->getIncomingBlock(i)); PH->setOperand(i, NewInst); } } } else { Builder.SetInsertPoint(cast<Instruction>(User)); Value *NewInst = ExtractAndExtendIfNeeded(Vec); - CSEBlocks.insert(cast<Instruction>(User)->getParent()); User->replaceUsesOfWith(Scalar, NewInst); } } else { Builder.SetInsertPoint(&F->getEntryBlock().front()); Value *NewInst = ExtractAndExtendIfNeeded(Vec); - CSEBlocks.insert(&F->getEntryBlock()); User->replaceUsesOfWith(Scalar, NewInst); } LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n"); } - // Checks if the mask is an identity mask. - auto &&IsIdentityMask = [](ArrayRef<int> Mask, FixedVectorType *VecTy) { - int Limit = Mask.size(); - return VecTy->getNumElements() == Mask.size() && - all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) && - ShuffleVectorInst::isIdentityMask(Mask); - }; - // Tries to combine 2 different masks into single one. - auto &&CombineMasks = [](SmallVectorImpl<int> &Mask, ArrayRef<int> ExtMask) { - SmallVector<int> NewMask(ExtMask.size(), UndefMaskElem); - for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) { - if (ExtMask[I] == UndefMaskElem) - continue; - NewMask[I] = Mask[ExtMask[I]]; - } - Mask.swap(NewMask); - }; - // Peek through shuffles, trying to simplify the final shuffle code. - auto &&PeekThroughShuffles = - [&IsIdentityMask, &CombineMasks](Value *&V, SmallVectorImpl<int> &Mask, - bool CheckForLengthChange = false) { - while (auto *SV = dyn_cast<ShuffleVectorInst>(V)) { - // Exit if not a fixed vector type or changing size shuffle. - if (!isa<FixedVectorType>(SV->getType()) || - (CheckForLengthChange && SV->changesLength())) - break; - // Exit if the identity or broadcast mask is found. - if (IsIdentityMask(Mask, cast<FixedVectorType>(SV->getType())) || - SV->isZeroEltSplat()) - break; - bool IsOp1Undef = isUndefVector(SV->getOperand(0)); - bool IsOp2Undef = isUndefVector(SV->getOperand(1)); - if (!IsOp1Undef && !IsOp2Undef) - break; - SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(), - SV->getShuffleMask().end()); - CombineMasks(ShuffleMask, Mask); - Mask.swap(ShuffleMask); - if (IsOp2Undef) - V = SV->getOperand(0); - else - V = SV->getOperand(1); - } - }; - // Smart shuffle instruction emission, walks through shuffles trees and - // tries to find the best matching vector for the actual shuffle - // instruction. 
- auto &&CreateShuffle = [this, &IsIdentityMask, &PeekThroughShuffles, - &CombineMasks](Value *V1, Value *V2, - ArrayRef<int> Mask) -> Value * { - assert(V1 && "Expected at least one vector value."); - if (V2 && !isUndefVector(V2)) { - // Peek through shuffles. - Value *Op1 = V1; - Value *Op2 = V2; - int VF = - cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue(); - SmallVector<int> CombinedMask1(Mask.size(), UndefMaskElem); - SmallVector<int> CombinedMask2(Mask.size(), UndefMaskElem); - for (int I = 0, E = Mask.size(); I < E; ++I) { - if (Mask[I] < VF) - CombinedMask1[I] = Mask[I]; - else - CombinedMask2[I] = Mask[I] - VF; - } - Value *PrevOp1; - Value *PrevOp2; - do { - PrevOp1 = Op1; - PrevOp2 = Op2; - PeekThroughShuffles(Op1, CombinedMask1, /*CheckForLengthChange=*/true); - PeekThroughShuffles(Op2, CombinedMask2, /*CheckForLengthChange=*/true); - // Check if we have 2 resizing shuffles - need to peek through operands - // again. - if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1)) - if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) - if (SV1->getOperand(0)->getType() == - SV2->getOperand(0)->getType() && - SV1->getOperand(0)->getType() != SV1->getType() && - isUndefVector(SV1->getOperand(1)) && - isUndefVector(SV2->getOperand(1))) { - Op1 = SV1->getOperand(0); - Op2 = SV2->getOperand(0); - SmallVector<int> ShuffleMask1(SV1->getShuffleMask().begin(), - SV1->getShuffleMask().end()); - CombineMasks(ShuffleMask1, CombinedMask1); - CombinedMask1.swap(ShuffleMask1); - SmallVector<int> ShuffleMask2(SV2->getShuffleMask().begin(), - SV2->getShuffleMask().end()); - CombineMasks(ShuffleMask2, CombinedMask2); - CombinedMask2.swap(ShuffleMask2); - } - } while (PrevOp1 != Op1 || PrevOp2 != Op2); - VF = cast<VectorType>(Op1->getType()) - ->getElementCount() - .getKnownMinValue(); - for (int I = 0, E = Mask.size(); I < E; ++I) { - if (CombinedMask2[I] != UndefMaskElem) { - assert(CombinedMask1[I] == UndefMaskElem && - "Expected undefined mask element"); - CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF); - } - } - Value *Vec = Builder.CreateShuffleVector( - Op1, Op1 == Op2 ? 
PoisonValue::get(Op1->getType()) : Op2, - CombinedMask1); - if (auto *I = dyn_cast<Instruction>(Vec)) { - GatherShuffleSeq.insert(I); - CSEBlocks.insert(I->getParent()); - } - return Vec; - } - if (isa<PoisonValue>(V1)) - return PoisonValue::get(FixedVectorType::get( - cast<VectorType>(V1->getType())->getElementType(), Mask.size())); - Value *Op = V1; - SmallVector<int> CombinedMask(Mask.begin(), Mask.end()); - PeekThroughShuffles(Op, CombinedMask); - if (!isa<FixedVectorType>(Op->getType()) || - !IsIdentityMask(CombinedMask, cast<FixedVectorType>(Op->getType()))) { - Value *Vec = Builder.CreateShuffleVector(Op, CombinedMask); - if (auto *I = dyn_cast<Instruction>(Vec)) { - GatherShuffleSeq.insert(I); - CSEBlocks.insert(I->getParent()); - } - return Vec; + auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) { + SmallVector<int> CombinedMask1(Mask.size(), UndefMaskElem); + SmallVector<int> CombinedMask2(Mask.size(), UndefMaskElem); + int VF = cast<FixedVectorType>(V1->getType())->getNumElements(); + for (int I = 0, E = Mask.size(); I < E; ++I) { + if (Mask[I] < VF) + CombinedMask1[I] = Mask[I]; + else + CombinedMask2[I] = Mask[I] - VF; } - return Op; + ShuffleInstructionBuilder ShuffleBuilder(Builder, *this); + ShuffleBuilder.add(V1, CombinedMask1); + if (V2) + ShuffleBuilder.add(V2, CombinedMask2); + return ShuffleBuilder.finalize(std::nullopt); }; - auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask) { + auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask, + bool ForSingleMask) { unsigned VF = Mask.size(); unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements(); if (VF != VecVF) { @@ -8778,12 +10102,14 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { Vec = CreateShuffle(Vec, nullptr, Mask); return std::make_pair(Vec, true); } - SmallVector<int> ResizeMask(VF, UndefMaskElem); - for (unsigned I = 0; I < VF; ++I) { - if (Mask[I] != UndefMaskElem) - ResizeMask[Mask[I]] = Mask[I]; + if (!ForSingleMask) { + SmallVector<int> ResizeMask(VF, UndefMaskElem); + for (unsigned I = 0; I < VF; ++I) { + if (Mask[I] != UndefMaskElem) + ResizeMask[Mask[I]] = Mask[I]; + } + Vec = CreateShuffle(Vec, nullptr, ResizeMask); } - Vec = CreateShuffle(Vec, nullptr, ResizeMask); } return std::make_pair(Vec, false); @@ -8798,7 +10124,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { Builder.SetInsertPoint(LastInsert); auto Vector = ShuffledInserts[I].ValueMasks.takeVector(); Value *NewInst = performExtractsShuffleAction<Value>( - makeMutableArrayRef(Vector.data(), Vector.size()), + MutableArrayRef(Vector.data(), Vector.size()), FirstInsert->getOperand(0), [](Value *Vec) { return cast<VectorType>(Vec->getType()) @@ -8855,6 +10181,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { CSEBlocks.insert(LastInsert->getParent()); } + SmallVector<Instruction *> RemovedInsts; // For each vectorized value: for (auto &TEPtr : VectorizableTree) { TreeEntry *Entry = TEPtr.get(); @@ -8889,9 +10216,18 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { #endif LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n"); eraseInstruction(cast<Instruction>(Scalar)); + // Retain to-be-deleted instructions for some debug-info + // bookkeeping. NOTE: eraseInstruction only marks the instruction for + // deletion - instructions are not deleted until later. 
+ RemovedInsts.push_back(cast<Instruction>(Scalar)); } } + // Merge the DIAssignIDs from the about-to-be-deleted instructions into the + // new vector instruction. + if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue)) + V->mergeDIAssignID(RemovedInsts); + Builder.ClearInsertionPoint(); InstrElementSize.clear(); @@ -8899,10 +10235,10 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { } void BoUpSLP::optimizeGatherSequence() { - LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleSeq.size() + LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size() << " gather sequences instructions.\n"); // LICM InsertElementInst sequences. - for (Instruction *I : GatherShuffleSeq) { + for (Instruction *I : GatherShuffleExtractSeq) { if (isDeleted(I)) continue; @@ -8927,6 +10263,7 @@ void BoUpSLP::optimizeGatherSequence() { // We can hoist this instruction. Move it to the pre-header. I->moveBefore(PreHeader->getTerminator()); + CSEBlocks.insert(PreHeader); } // Make a list of all reachable blocks in our CSE queue. @@ -9002,8 +10339,8 @@ void BoUpSLP::optimizeGatherSequence() { for (Instruction &In : llvm::make_early_inc_range(*BB)) { if (isDeleted(&In)) continue; - if (!isa<InsertElementInst>(&In) && !isa<ExtractElementInst>(&In) && - !isa<ShuffleVectorInst>(&In) && !GatherShuffleSeq.contains(&In)) + if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) && + !GatherShuffleExtractSeq.contains(&In)) continue; // Check if we can replace this instruction with any of the @@ -9022,7 +10359,7 @@ void BoUpSLP::optimizeGatherSequence() { break; } if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) && - GatherShuffleSeq.contains(V) && + GatherShuffleExtractSeq.contains(V) && IsIdenticalOrLessDefined(V, &In, NewMask) && DT->dominates(In.getParent(), V->getParent())) { In.moveAfter(V); @@ -9043,7 +10380,7 @@ void BoUpSLP::optimizeGatherSequence() { } } CSEBlocks.clear(); - GatherShuffleSeq.clear(); + GatherShuffleExtractSeq.clear(); } BoUpSLP::ScheduleData * @@ -9075,7 +10412,7 @@ BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) { // Groups the instructions to a bundle (which is then a single scheduling entity) // and schedules instructions until the bundle gets ready. -Optional<BoUpSLP::ScheduleData *> +std::optional<BoUpSLP::ScheduleData *> BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, const InstructionsState &S) { // No need to schedule PHIs, insertelement, extractelement and extractvalue @@ -9137,7 +10474,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, // dependencies and emit instruction in the wrong order at the actual // scheduling. TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr); - return None; + return std::nullopt; } } @@ -9167,7 +10504,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, TryScheduleBundleImpl(ReSchedule, Bundle); if (!Bundle->isReady()) { cancelScheduling(VL, S.OpValue); - return None; + return std::nullopt; } return Bundle; } @@ -9395,13 +10732,13 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, WorkList.push_back(DestBundle); }; - // Any instruction which isn't safe to speculate at the begining of the + // Any instruction which isn't safe to speculate at the beginning of the // block is control dependend on any early exit or non-willreturn call // which proceeds it. 
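The rule in the comment above (and the scan that follows it) pins every later instruction that is not safe to speculate behind an instruction that may not transfer execution to its successor, such as a call that is not guaranteed to return. A standalone sketch of that forward scan over a flat block of stand-in instructions:

#include <cstddef>
#include <vector>

struct Inst {
  bool SafeToSpeculate;         // may this instruction be hoisted freely?
  std::vector<int> ControlDeps; // instructions this one must stay after
};

// Barrier is the index of an instruction that is not guaranteed to transfer
// execution to its successor; everything after it that cannot be speculated
// becomes control dependent on it.
void addControlDependencies(std::vector<Inst> &Block, int Barrier) {
  for (std::size_t I = Barrier + 1; I < Block.size(); ++I)
    if (!Block[I].SafeToSpeculate)
      Block[I].ControlDeps.push_back(Barrier);
}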
if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) { for (Instruction *I = BundleMember->Inst->getNextNode(); I != ScheduleEnd; I = I->getNextNode()) { - if (isSafeToSpeculativelyExecute(I, &*BB->begin())) + if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC)) continue; // Add the dependency @@ -9436,9 +10773,12 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, } // In addition to the cases handle just above, we need to prevent - // allocas from moving below a stacksave. The stackrestore case - // is currently thought to be conservatism. - if (isa<AllocaInst>(BundleMember->Inst)) { + // allocas and loads/stores from moving below a stacksave or a + // stackrestore. Avoiding moving allocas below stackrestore is currently + // thought to be conservatism. Moving loads/stores below a stackrestore + // can lead to incorrect code. + if (isa<AllocaInst>(BundleMember->Inst) || + BundleMember->Inst->mayReadOrWriteMemory()) { for (Instruction *I = BundleMember->Inst->getNextNode(); I != ScheduleEnd; I = I->getNextNode()) { if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) && @@ -9661,17 +11001,15 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) { // If the current instruction is a load, update MaxWidth to reflect the // width of the loaded value. - if (isa<LoadInst>(I) || isa<ExtractElementInst>(I) || - isa<ExtractValueInst>(I)) + if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I)) Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty)); // Otherwise, we need to visit the operands of the instruction. We only // handle the interesting cases from buildTree here. If an operand is an // instruction we haven't yet visited and from the same basic block as the // user or the use is a PHI node, we add it to the worklist. - else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) || - isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I) || - isa<UnaryOperator>(I)) { + else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst, + BinaryOperator, UnaryOperator>(I)) { for (Use &U : I->operands()) if (auto *J = dyn_cast<Instruction>(U.get())) if (Visited.insert(J).second && @@ -9724,8 +11062,7 @@ static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr, break; case Instruction::ZExt: case Instruction::SExt: - if (isa<ExtractElementInst>(I->getOperand(0)) || - isa<InsertElementInst>(I->getOperand(0))) + if (isa<ExtractElementInst, InsertElementInst>(I->getOperand(0))) return false; break; @@ -10026,7 +11363,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, DT->updateDFSNumbers(); // Scan the blocks in the function in post order. - for (auto BB : post_order(&F.getEntryBlock())) { + for (auto *BB : post_order(&F.getEntryBlock())) { // Start new block - clear the list of reduction roots. 
R.clearReductionData(); collectSeedInstructions(BB); @@ -10084,7 +11421,7 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, InstructionCost Cost = R.getTreeCost(); - LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF =" << VF << "\n"); + LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n"); if (Cost < -SLPCostThreshold) { LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n"); @@ -10128,7 +11465,7 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, ++IterCnt; CheckedPairs[Idx].set(K); CheckedPairs[K].set(Idx); - Optional<int> Diff = getPointersDiff( + std::optional<int> Diff = getPointersDiff( Stores[K]->getValueOperand()->getType(), Stores[K]->getPointerOperand(), Stores[Idx]->getValueOperand()->getType(), Stores[Idx]->getPointerOperand(), *DL, *SE, /*StrictCheck=*/true); @@ -10211,12 +11548,17 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, unsigned MinVF = TTI->getStoreMinimumVF( R.getMinVF(DL->getTypeSizeInBits(ValueTy)), StoreTy, ValueTy); + if (MaxVF <= MinVF) { + LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF << ") <= " + << "MinVF (" << MinVF << ")\n"); + } + // FIXME: Is division-by-2 the correct step? Should we assert that the // register size is a power-of-2? unsigned StartIdx = 0; for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) { for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) { - ArrayRef<Value *> Slice = makeArrayRef(Operands).slice(Cnt, Size); + ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size); if (!VectorizedStores.count(Slice.front()) && !VectorizedStores.count(Slice.back()) && vectorizeStoreChain(Slice, R, Cnt, MinVF)) { @@ -10295,7 +11637,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, // Check that all of the parts are instructions of the same type, // we permit an alternate opcode via InstructionsState. - InstructionsState S = getSameOpcode(VL); + InstructionsState S = getSameOpcode(VL, *TLI); if (!S.getOpcode()) return false; @@ -10377,7 +11719,9 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, if (R.isTreeTinyAndNotFullyVectorizable()) continue; R.reorderTopToBottom(); - R.reorderBottomToTop(!isa<InsertElementInst>(Ops.front())); + R.reorderBottomToTop( + /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) && + !R.doesRootHaveInTreeUses()); R.buildExternalUses(); R.computeMinimumValueSizes(); @@ -10385,6 +11729,8 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, CandidateFound = true; MinCost = std::min(MinCost, Cost); + LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost + << " for VF=" << OpsWidth << "\n"); if (Cost < -SLPCostThreshold) { LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n"); R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList", @@ -10423,8 +11769,7 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) { if (!I) return false; - if ((!isa<BinaryOperator>(I) && !isa<CmpInst>(I)) || - isa<VectorType>(I->getType())) + if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType())) return false; Value *P = I->getParent(); @@ -10464,7 +11809,7 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) { return tryToVectorizePair(Op0, Op1, R); // We have multiple options. Try to pick the single best. 
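The vectorizeStores loop earlier in this hunk starts at the maximum vector factor and halves it down to MinVF, retrying the not-yet-vectorized tail of the chain at each width. The slicing schedule can be sketched on plain integers (the VF values below are arbitrary; the real loop also rescans failed slices at an offset of one):

    #include <cstdio>

    // Greedy descending power-of-two scan over a chain of NumOperands stores,
    // assuming every attempted slice vectorizes successfully.
    static void sliceSchedule(unsigned NumOperands, unsigned MaxVF,
                              unsigned MinVF) {
      unsigned Cnt = 0;
      for (unsigned Size = MaxVF; Size >= MinVF && Cnt < NumOperands; Size /= 2)
        while (Cnt + Size <= NumOperands) {
          std::printf("vectorize %u stores at offset %u\n", Size, Cnt);
          Cnt += Size;
        }
    }

    int main() {
      // Prints "8 stores at offset 0" followed by "2 stores at offset 8".
      sliceSchedule(/*NumOperands=*/10, /*MaxVF=*/8, /*MinVF=*/2);
      return 0;
    }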
- Optional<int> BestCandidate = R.findBestRootPair(Candidates); + std::optional<int> BestCandidate = R.findBestRootPair(Candidates); if (!BestCandidate) return false; return tryToVectorizePair(Candidates[*BestCandidate].first, @@ -10522,8 +11867,8 @@ class HorizontalReduction { // select x, y, false // select x, true, y static bool isBoolLogicOp(Instruction *I) { - return match(I, m_LogicalAnd(m_Value(), m_Value())) || - match(I, m_LogicalOr(m_Value(), m_Value())); + return isa<SelectInst>(I) && + (match(I, m_LogicalAnd()) || match(I, m_LogicalOr())); } /// Checks if instruction is associative and can be vectorized. @@ -10749,7 +12094,7 @@ class HorizontalReduction { /// Checks if the instruction is in basic block \p BB. /// For a cmp+sel min/max reduction check that both ops are in \p BB. static bool hasSameParent(Instruction *I, BasicBlock *BB) { - if (isCmpSelMinMax(I) || (isBoolLogicOp(I) && isa<SelectInst>(I))) { + if (isCmpSelMinMax(I) || isBoolLogicOp(I)) { auto *Sel = cast<SelectInst>(I); auto *Cmp = dyn_cast<Instruction>(Sel->getCondition()); return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB; @@ -10800,6 +12145,13 @@ class HorizontalReduction { return I->getOperand(getFirstOperandIndex(I) + 1); } + static bool isGoodForReduction(ArrayRef<Value *> Data) { + int Sz = Data.size(); + auto *I = dyn_cast<Instruction>(Data.front()); + return Sz > 1 || isConstant(Data.front()) || + (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode())); + } + public: HorizontalReduction() = default; @@ -10895,6 +12247,9 @@ public: MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>> PossibleReducedVals; initReductionOps(Inst); + DenseMap<Value *, SmallVector<LoadInst *>> LoadsMap; + SmallSet<size_t, 2> LoadKeyUsed; + SmallPtrSet<Value *, 4> DoNotReverseVals; while (!Worklist.empty()) { Instruction *TreeN = Worklist.pop_back_val(); SmallVector<Value *> Args; @@ -10916,18 +12271,36 @@ public: size_t Key, Idx; std::tie(Key, Idx) = generateKeySubkey( V, &TLI, - [&PossibleReducedVals, &DL, &SE](size_t Key, LoadInst *LI) { - auto It = PossibleReducedVals.find(Key); - if (It != PossibleReducedVals.end()) { - for (const auto &LoadData : It->second) { - auto *RLI = cast<LoadInst>(LoadData.second.front().first); - if (getPointersDiff(RLI->getType(), - RLI->getPointerOperand(), LI->getType(), - LI->getPointerOperand(), DL, SE, - /*StrictCheck=*/true)) - return hash_value(RLI->getPointerOperand()); + [&](size_t Key, LoadInst *LI) { + Value *Ptr = getUnderlyingObject(LI->getPointerOperand()); + if (LoadKeyUsed.contains(Key)) { + auto LIt = LoadsMap.find(Ptr); + if (LIt != LoadsMap.end()) { + for (LoadInst *RLI: LIt->second) { + if (getPointersDiff( + RLI->getType(), RLI->getPointerOperand(), + LI->getType(), LI->getPointerOperand(), DL, SE, + /*StrictCheck=*/true)) + return hash_value(RLI->getPointerOperand()); + } + for (LoadInst *RLI : LIt->second) { + if (arePointersCompatible(RLI->getPointerOperand(), + LI->getPointerOperand(), TLI)) { + hash_code SubKey = hash_value(RLI->getPointerOperand()); + DoNotReverseVals.insert(RLI); + return SubKey; + } + } + if (LIt->second.size() > 2) { + hash_code SubKey = + hash_value(LIt->second.back()->getPointerOperand()); + DoNotReverseVals.insert(LIt->second.back()); + return SubKey; + } } } + LoadKeyUsed.insert(Key); + LoadsMap.try_emplace(Ptr).first->second.push_back(LI); return hash_value(LI->getPointerOperand()); }, /*AllowAlternate=*/false); @@ -10941,17 +12314,35 @@ public: size_t Key, Idx; std::tie(Key, Idx) = generateKeySubkey( TreeN, 
&TLI, - [&PossibleReducedVals, &DL, &SE](size_t Key, LoadInst *LI) { - auto It = PossibleReducedVals.find(Key); - if (It != PossibleReducedVals.end()) { - for (const auto &LoadData : It->second) { - auto *RLI = cast<LoadInst>(LoadData.second.front().first); - if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(), - LI->getType(), LI->getPointerOperand(), - DL, SE, /*StrictCheck=*/true)) - return hash_value(RLI->getPointerOperand()); + [&](size_t Key, LoadInst *LI) { + Value *Ptr = getUnderlyingObject(LI->getPointerOperand()); + if (LoadKeyUsed.contains(Key)) { + auto LIt = LoadsMap.find(Ptr); + if (LIt != LoadsMap.end()) { + for (LoadInst *RLI: LIt->second) { + if (getPointersDiff(RLI->getType(), + RLI->getPointerOperand(), LI->getType(), + LI->getPointerOperand(), DL, SE, + /*StrictCheck=*/true)) + return hash_value(RLI->getPointerOperand()); + } + for (LoadInst *RLI : LIt->second) { + if (arePointersCompatible(RLI->getPointerOperand(), + LI->getPointerOperand(), TLI)) { + hash_code SubKey = hash_value(RLI->getPointerOperand()); + DoNotReverseVals.insert(RLI); + return SubKey; + } + } + if (LIt->second.size() > 2) { + hash_code SubKey = hash_value(LIt->second.back()->getPointerOperand()); + DoNotReverseVals.insert(LIt->second.back()); + return SubKey; + } } } + LoadKeyUsed.insert(Key); + LoadsMap.try_emplace(Ptr).first->second.push_back(LI); return hash_value(LI->getPointerOperand()); }, /*AllowAlternate=*/false); @@ -10977,9 +12368,27 @@ public: stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) { return P1.size() > P2.size(); }); - ReducedVals.emplace_back(); - for (ArrayRef<Value *> Data : PossibleRedValsVect) - ReducedVals.back().append(Data.rbegin(), Data.rend()); + int NewIdx = -1; + for (ArrayRef<Value *> Data : PossibleRedValsVect) { + if (isGoodForReduction(Data) || + (isa<LoadInst>(Data.front()) && NewIdx >= 0 && + isa<LoadInst>(ReducedVals[NewIdx].front()) && + getUnderlyingObject( + cast<LoadInst>(Data.front())->getPointerOperand()) == + getUnderlyingObject(cast<LoadInst>(ReducedVals[NewIdx].front()) + ->getPointerOperand()))) { + if (NewIdx < 0) { + NewIdx = ReducedVals.size(); + ReducedVals.emplace_back(); + } + if (DoNotReverseVals.contains(Data.front())) + ReducedVals[NewIdx].append(Data.begin(), Data.end()); + else + ReducedVals[NewIdx].append(Data.rbegin(), Data.rend()); + } else { + ReducedVals.emplace_back().append(Data.rbegin(), Data.rend()); + } + } } // Sort the reduced values by number of same/alternate opcode and/or pointer // operand. @@ -10990,25 +12399,36 @@ public: } /// Attempt to vectorize the tree found by matchAssociativeReduction. - Value *tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) { + Value *tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI, + const TargetLibraryInfo &TLI) { constexpr int ReductionLimit = 4; constexpr unsigned RegMaxNumber = 4; constexpr unsigned RedValsMaxNumber = 128; // If there are a sufficient number of reduction values, reduce // to a nearby power-of-2. We can safely generate oversized // vectors and rely on the backend to split them to legal sizes. 
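As the comment above says, each vectorization attempt covers the largest power-of-two prefix of the remaining reduced values, leaving oversized vectors for the backend to legalize and anything left over for a scalar remainder. A standalone sketch of that width schedule (powerOf2Floor is reimplemented locally; the pass's exact bounds and remainder handling are more involved):

    #include <cstdio>

    // Largest power of two that is <= N (returns 1 for N < 2).
    static unsigned powerOf2Floor(unsigned N) {
      unsigned P = 1;
      while (P * 2 <= N)
        P *= 2;
      return P;
    }

    int main() {
      unsigned NumReducedVals = 11, Pos = 0;
      unsigned ReduxWidth = powerOf2Floor(NumReducedVals);
      while (ReduxWidth >= 2 && Pos + ReduxWidth <= NumReducedVals) {
        std::printf("reduce %u values starting at %u\n", ReduxWidth, Pos);
        Pos += ReduxWidth;
        ReduxWidth = powerOf2Floor(NumReducedVals - Pos);
      }
      std::printf("%u values left for the scalar remainder\n",
                  NumReducedVals - Pos);
      return 0;
    }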
- unsigned NumReducedVals = std::accumulate( - ReducedVals.begin(), ReducedVals.end(), 0, - [](int Num, ArrayRef<Value *> Vals) { return Num + Vals.size(); }); - if (NumReducedVals < ReductionLimit) + size_t NumReducedVals = + std::accumulate(ReducedVals.begin(), ReducedVals.end(), 0, + [](size_t Num, ArrayRef<Value *> Vals) { + if (!isGoodForReduction(Vals)) + return Num; + return Num + Vals.size(); + }); + if (NumReducedVals < ReductionLimit) { + for (ReductionOpsType &RdxOps : ReductionOps) + for (Value *RdxOp : RdxOps) + V.analyzedReductionRoot(cast<Instruction>(RdxOp)); return nullptr; + } IRBuilder<> Builder(cast<Instruction>(ReductionRoot)); // Track the reduced values in case if they are replaced by extractelement // because of the vectorization. - DenseMap<Value *, WeakTrackingVH> TrackedVals; + DenseMap<Value *, WeakTrackingVH> TrackedVals( + ReducedVals.size() * ReducedVals.front().size() + ExtraArgs.size()); BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues; + ExternallyUsedValues.reserve(ExtraArgs.size() + 1); // The same extra argument may be used several times, so log each attempt // to use it. for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) { @@ -11031,7 +12451,8 @@ public: // The reduction root is used as the insertion point for new instructions, // so set it as externally used to prevent it from being deleted. ExternallyUsedValues[ReductionRoot]; - SmallDenseSet<Value *> IgnoreList; + SmallDenseSet<Value *> IgnoreList(ReductionOps.size() * + ReductionOps.front().size()); for (ReductionOpsType &RdxOps : ReductionOps) for (Value *RdxOp : RdxOps) { if (!RdxOp) @@ -11046,15 +12467,19 @@ public: for (Value *V : Candidates) TrackedVals.try_emplace(V, V); - DenseMap<Value *, unsigned> VectorizedVals; + DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size()); + // List of the values that were reduced in other trees as part of gather + // nodes and thus requiring extract if fully vectorized in other trees. + SmallPtrSet<Value *, 4> RequiredExtract; Value *VectorizedTree = nullptr; bool CheckForReusedReductionOps = false; // Try to vectorize elements based on their type. for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) { ArrayRef<Value *> OrigReducedVals = ReducedVals[I]; - InstructionsState S = getSameOpcode(OrigReducedVals); + InstructionsState S = getSameOpcode(OrigReducedVals, TLI); SmallVector<Value *> Candidates; - DenseMap<Value *, Value *> TrackedToOrig; + Candidates.reserve(2 * OrigReducedVals.size()); + DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size()); for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) { Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second; // Check if the reduction value was not overriden by the extractelement @@ -11071,7 +12496,7 @@ public: // Try to handle shuffled extractelements. if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() && I + 1 < E) { - InstructionsState NextS = getSameOpcode(ReducedVals[I + 1]); + InstructionsState NextS = getSameOpcode(ReducedVals[I + 1], TLI); if (NextS.getOpcode() == Instruction::ExtractElement && !NextS.isAltShuffle()) { SmallVector<Value *> CommonCandidates(Candidates); @@ -11179,37 +12604,49 @@ public: }); } // Number of uses of the candidates in the vector of values. 
- SmallDenseMap<Value *, unsigned> NumUses; + SmallDenseMap<Value *, unsigned> NumUses(Candidates.size()); for (unsigned Cnt = 0; Cnt < Pos; ++Cnt) { Value *V = Candidates[Cnt]; - if (NumUses.count(V) > 0) - continue; - NumUses[V] = std::count(VL.begin(), VL.end(), V); + ++NumUses.try_emplace(V, 0).first->getSecond(); } for (unsigned Cnt = Pos + ReduxWidth; Cnt < NumReducedVals; ++Cnt) { Value *V = Candidates[Cnt]; - if (NumUses.count(V) > 0) - continue; - NumUses[V] = std::count(VL.begin(), VL.end(), V); + ++NumUses.try_emplace(V, 0).first->getSecond(); } + SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end()); // Gather externally used values. SmallPtrSet<Value *, 4> Visited; for (unsigned Cnt = 0; Cnt < Pos; ++Cnt) { - Value *V = Candidates[Cnt]; - if (!Visited.insert(V).second) + Value *RdxVal = Candidates[Cnt]; + if (!Visited.insert(RdxVal).second) continue; - unsigned NumOps = VectorizedVals.lookup(V) + NumUses[V]; - if (NumOps != ReducedValsToOps.find(V)->second.size()) - LocalExternallyUsedValues[V]; + // Check if the scalar was vectorized as part of the vectorization + // tree but not the top node. + if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) { + LocalExternallyUsedValues[RdxVal]; + continue; + } + unsigned NumOps = VectorizedVals.lookup(RdxVal) + NumUses[RdxVal]; + if (NumOps != ReducedValsToOps.find(RdxVal)->second.size()) + LocalExternallyUsedValues[RdxVal]; } for (unsigned Cnt = Pos + ReduxWidth; Cnt < NumReducedVals; ++Cnt) { - Value *V = Candidates[Cnt]; - if (!Visited.insert(V).second) + Value *RdxVal = Candidates[Cnt]; + if (!Visited.insert(RdxVal).second) continue; - unsigned NumOps = VectorizedVals.lookup(V) + NumUses[V]; - if (NumOps != ReducedValsToOps.find(V)->second.size()) - LocalExternallyUsedValues[V]; + // Check if the scalar was vectorized as part of the vectorization + // tree but not the top node. + if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) { + LocalExternallyUsedValues[RdxVal]; + continue; + } + unsigned NumOps = VectorizedVals.lookup(RdxVal) + NumUses[RdxVal]; + if (NumOps != ReducedValsToOps.find(RdxVal)->second.size()) + LocalExternallyUsedValues[RdxVal]; } + for (Value *RdxVal : VL) + if (RequiredExtract.contains(RdxVal)) + LocalExternallyUsedValues[RdxVal]; V.buildExternalUses(LocalExternallyUsedValues); V.computeMinimumValueSizes(); @@ -11224,11 +12661,25 @@ public: InstructionCost TreeCost = V.getTreeCost(VL); InstructionCost ReductionCost = getReductionCost(TTI, VL, ReduxWidth, RdxFMF); + if (V.isVectorizedFirstNode() && isa<LoadInst>(VL.front())) { + Instruction *MainOp = V.getFirstNodeMainOp(); + for (Value *V : VL) { + auto *VI = dyn_cast<LoadInst>(V); + // Add the costs of scalar GEP pointers, to be removed from the + // code. + if (!VI || VI == MainOp) + continue; + auto *Ptr = dyn_cast<GetElementPtrInst>(VI->getPointerOperand()); + if (!Ptr || !Ptr->hasOneUse() || Ptr->hasAllConstantIndices()) + continue; + TreeCost -= TTI->getArithmeticInstrCost( + Instruction::Add, Ptr->getType(), TTI::TCK_RecipThroughput); + } + } InstructionCost Cost = TreeCost + ReductionCost; - if (!Cost.isValid()) { - LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n"); + LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for reduction\n"); + if (!Cost.isValid()) return nullptr; - } if (Cost >= -SLPCostThreshold) { V.getORE()->emit([&]() { return OptimizationRemarkMissed( @@ -11257,21 +12708,23 @@ public: Builder.setFastMathFlags(RdxFMF); - // Vectorize a tree. 
- Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues); - // Emit a reduction. If the root is a select (min/max idiom), the insert // point is the compare condition of that select. Instruction *RdxRootInst = cast<Instruction>(ReductionRoot); + Instruction *InsertPt = RdxRootInst; if (IsCmpSelMinMax) - Builder.SetInsertPoint(GetCmpForMinMaxReduction(RdxRootInst)); - else - Builder.SetInsertPoint(RdxRootInst); + InsertPt = GetCmpForMinMaxReduction(RdxRootInst); + + // Vectorize a tree. + Value *VectorizedRoot = + V.vectorizeTree(LocalExternallyUsedValues, InsertPt); + + Builder.SetInsertPoint(InsertPt); // To prevent poison from leaking across what used to be sequential, // safe, scalar boolean logic operations, the reduction operand must be // frozen. - if (isa<SelectInst>(RdxRootInst) && isBoolLogicOp(RdxRootInst)) + if (isBoolLogicOp(RdxRootInst)) VectorizedRoot = Builder.CreateFreeze(VectorizedRoot); Value *ReducedSubTree = @@ -11288,23 +12741,59 @@ public: ReducedSubTree, "op.rdx", ReductionOps); } // Count vectorized reduced values to exclude them from final reduction. - for (Value *V : VL) - ++VectorizedVals.try_emplace(TrackedToOrig.find(V)->second, 0) + for (Value *RdxVal : VL) { + ++VectorizedVals.try_emplace(TrackedToOrig.find(RdxVal)->second, 0) .first->getSecond(); + if (!V.isVectorized(RdxVal)) + RequiredExtract.insert(RdxVal); + } Pos += ReduxWidth; Start = Pos; ReduxWidth = PowerOf2Floor(NumReducedVals - Pos); } } if (VectorizedTree) { + // Reorder operands of bool logical op in the natural order to avoid + // possible problem with poison propagation. If not possible to reorder + // (both operands are originally RHS), emit an extra freeze instruction + // for the LHS operand. + //I.e., if we have original code like this: + // RedOp1 = select i1 ?, i1 LHS, i1 false + // RedOp2 = select i1 RHS, i1 ?, i1 false + + // Then, we swap LHS/RHS to create a new op that matches the poison + // semantics of the original code. + + // If we have original code like this and both values could be poison: + // RedOp1 = select i1 ?, i1 LHS, i1 false + // RedOp2 = select i1 ?, i1 RHS, i1 false + + // Then, we must freeze LHS in the new op. + auto &&FixBoolLogicalOps = + [&Builder, VectorizedTree](Value *&LHS, Value *&RHS, + Instruction *RedOp1, Instruction *RedOp2) { + if (!isBoolLogicOp(RedOp1)) + return; + if (LHS == VectorizedTree || getRdxOperand(RedOp1, 0) == LHS || + isGuaranteedNotToBePoison(LHS)) + return; + if (!isBoolLogicOp(RedOp2)) + return; + if (RHS == VectorizedTree || getRdxOperand(RedOp2, 0) == RHS || + isGuaranteedNotToBePoison(RHS)) { + std::swap(LHS, RHS); + return; + } + LHS = Builder.CreateFreeze(LHS); + }; // Finish the reduction. // Need to add extra arguments and not vectorized possible reduction // values. // Try to avoid dependencies between the scalar remainders after // reductions. auto &&FinalGen = - [this, &Builder, - &TrackedVals](ArrayRef<std::pair<Instruction *, Value *>> InstVals) { + [this, &Builder, &TrackedVals, &FixBoolLogicalOps]( + ArrayRef<std::pair<Instruction *, Value *>> InstVals) { unsigned Sz = InstVals.size(); SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 + Sz % 2); @@ -11321,6 +12810,11 @@ public: auto It2 = TrackedVals.find(RdxVal2); if (It2 != TrackedVals.end()) StableRdxVal2 = It2->second; + // To prevent poison from leaking across what used to be + // sequential, safe, scalar boolean logic operations, the + // reduction operand must be frozen. 
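The freeze of the vectorized root above and the FixBoolLogicalOps helper defined in this hunk guard the same hazard: select i1 c, i1 x, i1 false only observes x when c is true, but once the chain is rewritten into unconditional and/or reduction operations every operand is observed, so poison that used to be masked out can leak into the result. A toy model of those semantics, with poison represented as an empty std::optional (purely illustrative; real IR poison is more subtle):

    #include <cassert>
    #include <optional>

    using Bit = std::optional<bool>; // nullopt models a poison i1
    static const Bit Poison = std::nullopt;

    // select c, x, false : x is not observed when c is false.
    static Bit logicalAnd(Bit C, Bit X) {
      if (C && !*C)
        return false; // short-circuit masks out poison in X
      if (!C || !X)
        return Poison;
      return *C && *X;
    }

    // and c, x : both operands are always observed, poison propagates.
    static Bit bitwiseAnd(Bit C, Bit X) {
      if (!C || !X)
        return Poison;
      return *C && *X;
    }

    // freeze : replace poison by some arbitrary but fixed value.
    static Bit freeze(Bit X) { return X ? X : Bit(false); }

    int main() {
      // c = false, x = poison: the original scalar select chain yields false...
      assert(logicalAnd(false, Poison) == Bit(false));
      // ...the unconditional form yields poison...
      assert(bitwiseAnd(false, Poison) == Poison);
      // ...unless the poisonous operand is frozen first.
      assert(bitwiseAnd(false, freeze(Poison)) == Bit(false));
      return 0;
    }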
+ FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first, + RedOp); Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1, StableRdxVal2, "op.rdx", ReductionOps); ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed); @@ -11330,6 +12824,8 @@ public: return ExtraReds; }; SmallVector<std::pair<Instruction *, Value *>> ExtraReductions; + ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot), + VectorizedTree); SmallPtrSet<Value *, 8> Visited; for (ArrayRef<Value *> Candidates : ReducedVals) { for (Value *RdxVal : Candidates) { @@ -11337,7 +12833,7 @@ public: continue; unsigned NumOps = VectorizedVals.lookup(RdxVal); for (Instruction *RedOp : - makeArrayRef(ReducedValsToOps.find(RdxVal)->second) + ArrayRef(ReducedValsToOps.find(RdxVal)->second) .drop_back(NumOps)) ExtraReductions.emplace_back(RedOp, RdxVal); } @@ -11349,22 +12845,12 @@ public: } // Iterate through all not-vectorized reduction values/extra arguments. while (ExtraReductions.size() > 1) { + VectorizedTree = ExtraReductions.front().second; SmallVector<std::pair<Instruction *, Value *>> NewReds = FinalGen(ExtraReductions); ExtraReductions.swap(NewReds); } - // Final reduction. - if (ExtraReductions.size() == 1) { - Instruction *RedOp = ExtraReductions.back().first; - Builder.SetCurrentDebugLocation(RedOp->getDebugLoc()); - Value *RdxVal = ExtraReductions.back().second; - Value *StableRdxVal = RdxVal; - auto It = TrackedVals.find(RdxVal); - if (It != TrackedVals.end()) - StableRdxVal = It->second; - VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, - StableRdxVal, "op.rdx", ReductionOps); - } + VectorizedTree = ExtraReductions.front().second; ReductionRoot->replaceAllUsesWith(VectorizedTree); @@ -11495,7 +12981,7 @@ private: } // end anonymous namespace -static Optional<unsigned> getAggregateSize(Instruction *InsertInst) { +static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) { if (auto *IE = dyn_cast<InsertElementInst>(InsertInst)) return cast<FixedVectorType>(IE->getType())->getNumElements(); @@ -11506,7 +12992,7 @@ static Optional<unsigned> getAggregateSize(Instruction *InsertInst) { if (auto *ST = dyn_cast<StructType>(CurrentType)) { for (auto *Elt : ST->elements()) if (Elt != ST->getElementType(0)) // check homogeneity - return None; + return std::nullopt; AggregateSize *= ST->getNumElements(); CurrentType = ST->getElementType(0); } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) { @@ -11518,7 +13004,7 @@ static Optional<unsigned> getAggregateSize(Instruction *InsertInst) { } else if (CurrentType->isSingleValueType()) { return AggregateSize; } else { - return None; + return std::nullopt; } } while (true); } @@ -11530,12 +13016,11 @@ static void findBuildAggregate_rec(Instruction *LastInsertInst, unsigned OperandOffset) { do { Value *InsertedOperand = LastInsertInst->getOperand(1); - Optional<unsigned> OperandIndex = + std::optional<unsigned> OperandIndex = getInsertIndex(LastInsertInst, OperandOffset); if (!OperandIndex) return; - if (isa<InsertElementInst>(InsertedOperand) || - isa<InsertValueInst>(InsertedOperand)) { + if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) { findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI, BuildVectorOpds, InsertElts, *OperandIndex); @@ -11545,8 +13030,7 @@ static void findBuildAggregate_rec(Instruction *LastInsertInst, } LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0)); } while (LastInsertInst != nullptr && - (isa<InsertValueInst>(LastInsertInst) || - 
isa<InsertElementInst>(LastInsertInst)) && + isa<InsertValueInst, InsertElementInst>(LastInsertInst) && LastInsertInst->hasOneUse()); } @@ -11576,7 +13060,7 @@ static bool findBuildAggregate(Instruction *LastInsertInst, assert((BuildVectorOpds.empty() && InsertElts.empty()) && "Expected empty result vectors!"); - Optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst); + std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst); if (!AggregateSize) return false; BuildVectorOpds.resize(*AggregateSize); @@ -11660,28 +13144,19 @@ static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) { return false; } -/// Attempt to reduce a horizontal reduction. -/// If it is legal to match a horizontal reduction feeding the phi node \a P -/// with reduction operators \a Root (or one of its operands) in a basic block -/// \a BB, then check if it can be done. If horizontal reduction is not found -/// and root instruction is a binary operation, vectorization of the operands is -/// attempted. -/// \returns true if a horizontal reduction was matched and reduced or operands -/// of one of the binary instruction were vectorized. -/// \returns false if a horizontal reduction was not matched (or not possible) -/// or no vectorization of any binary operation feeding \a Root instruction was -/// performed. -static bool tryToVectorizeHorReductionOrInstOperands( - PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, - TargetTransformInfo *TTI, ScalarEvolution &SE, const DataLayout &DL, - const TargetLibraryInfo &TLI, - const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) { +bool SLPVectorizerPass::vectorizeHorReduction( + PHINode *P, Value *V, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI, + SmallVectorImpl<WeakTrackingVH> &PostponedInsts) { if (!ShouldVectorizeHor) return false; + auto *Root = dyn_cast_or_null<Instruction>(V); if (!Root) return false; + if (!isa<BinaryOperator>(Root)) + P = nullptr; + if (Root->getParent() != BB || isa<PHINode>(Root)) return false; // Start analysis starting from Root instruction. If horizontal reduction is @@ -11693,25 +13168,22 @@ static bool tryToVectorizeHorReductionOrInstOperands( // horizontal reduction. // Interrupt the process if the Root instruction itself was vectorized or all // sub-trees not higher that RecursionMaxDepth were analyzed/vectorized. - // Skip the analysis of CmpInsts. Compiler implements postanalysis of the - // CmpInsts so we can skip extra attempts in - // tryToVectorizeHorReductionOrInstOperands and save compile time. + // If a horizintal reduction was not matched or vectorized we collect + // instructions for possible later attempts for vectorization. 
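vectorizeHorReduction, as split out here, walks the use-def chain from the root with a depth bound, tries to match a reduction at each node, and records the remaining candidates so the caller can attempt plain vectorization on them afterwards, as the comment above says. The control flow can be sketched abstractly (Node, the matcher flag and the depth limit are placeholders, not the pass's types):

    #include <cstdio>
    #include <queue>
    #include <set>
    #include <utility>
    #include <vector>

    struct Node {
      int Id;
      bool MatchesReduction; // stand-in for TryToReduce succeeding
      std::vector<Node *> Operands;
    };

    constexpr unsigned RecursionMaxDepth = 12;

    // Walk from Root: matched reductions are handled on the spot, everything
    // else is postponed for a later, separate vectorization attempt.
    static void walk(Node *Root, std::vector<Node *> &Postponed) {
      std::queue<std::pair<Node *, unsigned>> Stack;
      std::set<Node *> Visited;
      Stack.emplace(Root, 0u);
      while (!Stack.empty()) {
        auto [N, Level] = Stack.front();
        Stack.pop();
        if (!Visited.insert(N).second)
          continue;
        if (N->MatchesReduction) {
          std::printf("node %d: matched a reduction\n", N->Id);
          continue; // its operands are consumed by the reduction itself
        }
        Postponed.push_back(N);
        if (++Level < RecursionMaxDepth)
          for (Node *Op : N->Operands)
            Stack.emplace(Op, Level);
      }
    }

    int main() {
      Node C{2, true, {}}, B{1, false, {&C}}, A{0, false, {&B}};
      std::vector<Node *> Postponed;
      walk(&A, Postponed);
      for (Node *N : Postponed)
        std::printf("node %d postponed for plain vectorization\n", N->Id);
      return 0;
    }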
std::queue<std::pair<Instruction *, unsigned>> Stack; Stack.emplace(Root, 0); SmallPtrSet<Value *, 8> VisitedInstrs; - SmallVector<WeakTrackingVH> PostponedInsts; bool Res = false; - auto &&TryToReduce = [TTI, &SE, &DL, &P, &R, &TLI](Instruction *Inst, - Value *&B0, - Value *&B1) -> Value * { + auto &&TryToReduce = [this, TTI, &P, &R](Instruction *Inst, Value *&B0, + Value *&B1) -> Value * { if (R.isAnalyzedReductionRoot(Inst)) return nullptr; bool IsBinop = matchRdxBop(Inst, B0, B1); bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value())); if (IsBinop || IsSelect) { HorizontalReduction HorRdx; - if (HorRdx.matchAssociativeReduction(P, Inst, SE, DL, TLI)) - return HorRdx.tryToReduce(R, TTI); + if (HorRdx.matchAssociativeReduction(P, Inst, *SE, *DL, *TLI)) + return HorRdx.tryToReduce(R, TTI, *TLI); } return nullptr; }; @@ -11752,9 +13224,8 @@ static bool tryToVectorizeHorReductionOrInstOperands( // Set P to nullptr to avoid re-analysis of phi node in // matchAssociativeReduction function unless this is the root node. P = nullptr; - // Do not try to vectorize CmpInst operands, this is done separately. - // Final attempt for binop args vectorization should happen after the loop - // to try to find reductions. + // Do not collect CmpInst or InsertElementInst/InsertValueInst as their + // analysis is done separately. if (!isa<CmpInst, InsertElementInst, InsertValueInst>(Inst)) PostponedInsts.push_back(Inst); } @@ -11772,29 +13243,25 @@ static bool tryToVectorizeHorReductionOrInstOperands( !R.isDeleted(I) && I->getParent() == BB) Stack.emplace(I, Level); } - // Try to vectorized binops where reductions were not found. - for (Value *V : PostponedInsts) - if (auto *Inst = dyn_cast<Instruction>(V)) - if (!R.isDeleted(Inst)) - Res |= Vectorize(Inst, R); return Res; } bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI) { - auto *I = dyn_cast_or_null<Instruction>(V); - if (!I) - return false; + SmallVector<WeakTrackingVH> PostponedInsts; + bool Res = vectorizeHorReduction(P, V, BB, R, TTI, PostponedInsts); + Res |= tryToVectorize(PostponedInsts, R); + return Res; +} - if (!isa<BinaryOperator>(I)) - P = nullptr; - // Try to match and vectorize a horizontal reduction. - auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool { - return tryToVectorize(I, R); - }; - return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI, *SE, *DL, - *TLI, ExtraVectorization); +bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts, + BoUpSLP &R) { + bool Res = false; + for (Value *V : Insts) + if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst)) + Res |= tryToVectorize(Inst, R); + return Res; } bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, @@ -11864,7 +13331,7 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, // same/alternate ops only, this may result in some extra final // vectorization. if (NumElts > 1 && - TryToVectorizeHelper(makeArrayRef(IncIt, NumElts), LimitForRegisterSize)) { + TryToVectorizeHelper(ArrayRef(IncIt, NumElts), LimitForRegisterSize)) { // Success start over because instructions might have been changed. 
Changed = true; } else if (NumElts < Limit(*IncIt) && @@ -11886,8 +13353,9 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It)) ++SameTypeIt; unsigned NumElts = (SameTypeIt - It); - if (NumElts > 1 && TryToVectorizeHelper(makeArrayRef(It, NumElts), - /*LimitForRegisterSize=*/false)) + if (NumElts > 1 && + TryToVectorizeHelper(ArrayRef(It, NumElts), + /*LimitForRegisterSize=*/false)) Changed = true; It = SameTypeIt; } @@ -11909,7 +13377,7 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, /// predicate of the second or the operands IDs are less than the operands IDs /// of the second cmp instruction. template <bool IsCompatibility> -static bool compareCmp(Value *V, Value *V2, +static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, function_ref<bool(Instruction *)> IsDeleted) { auto *CI1 = cast<CmpInst>(V); auto *CI2 = cast<CmpInst>(V2); @@ -11945,7 +13413,7 @@ static bool compareCmp(Value *V, Value *V2, if (auto *I2 = dyn_cast<Instruction>(Op2)) { if (I1->getParent() != I2->getParent()) return false; - InstructionsState S = getSameOpcode({I1, I2}); + InstructionsState S = getSameOpcode({I1, I2}, TLI); if (S.getOpcode()) continue; return false; @@ -11954,25 +13422,35 @@ static bool compareCmp(Value *V, Value *V2, return IsCompatibility; } -bool SLPVectorizerPass::vectorizeSimpleInstructions( - SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R, - bool AtTerminator) { +bool SLPVectorizerPass::vectorizeSimpleInstructions(InstSetVector &Instructions, + BasicBlock *BB, BoUpSLP &R, + bool AtTerminator) { bool OpsChanged = false; SmallVector<Instruction *, 4> PostponedCmps; + SmallVector<WeakTrackingVH> PostponedInsts; + // pass1 - try to vectorize reductions only for (auto *I : reverse(Instructions)) { if (R.isDeleted(I)) continue; + if (isa<CmpInst>(I)) { + PostponedCmps.push_back(I); + continue; + } + OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, TTI, PostponedInsts); + } + // pass2 - try to match and vectorize a buildvector sequence. + for (auto *I : reverse(Instructions)) { + if (R.isDeleted(I) || isa<CmpInst>(I)) + continue; if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) { OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R); } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) { OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R); - } else if (isa<CmpInst>(I)) { - PostponedCmps.push_back(I); - continue; } - // Try to find reductions in buildvector sequnces. - OpsChanged |= vectorizeRootInstruction(nullptr, I, BB, R, TTI); } + // Now try to vectorize postponed instructions. + OpsChanged |= tryToVectorize(PostponedInsts, R); + if (AtTerminator) { // Try to find reductions first. for (Instruction *I : PostponedCmps) { @@ -11989,15 +13467,15 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions( } // Try to vectorize list of compares. // Sort by type, compare predicate, etc. 
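The compare handling introduced below follows the same idiom as tryToVectorizeSequence above: sort the candidates so that compatible ones become adjacent, scan for maximal runs, and hand every run longer than one element to the vectorizer. A generic sketch of that run detection (Item, lessThan and areCompatible are stand-ins for the real predicate/operand comparisons):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    struct Item {
      int Kind; // e.g. a compare predicate
      int Value;
    };

    static bool lessThan(const Item &A, const Item &B) { return A.Kind < B.Kind; }
    static bool areCompatible(const Item &A, const Item &B) {
      return A.Kind == B.Kind;
    }

    int main() {
      std::vector<Item> Incoming = {{2, 10}, {1, 11}, {2, 12}, {1, 13}, {3, 14}};
      // Sort so that compatible items are adjacent before scanning for runs.
      std::stable_sort(Incoming.begin(), Incoming.end(), lessThan);

      for (auto It = Incoming.begin(), End = Incoming.end(); It != End;) {
        auto SameTypeIt = It;
        while (SameTypeIt != End && areCompatible(*SameTypeIt, *It))
          ++SameTypeIt;
        auto NumElts = SameTypeIt - It;
        if (NumElts > 1)
          std::printf("try to vectorize a run of %td items of kind %d\n",
                      NumElts, It->Kind);
        It = SameTypeIt;
      }
      return 0;
    }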
- auto &&CompareSorter = [&R](Value *V, Value *V2) { - return compareCmp<false>(V, V2, + auto CompareSorter = [&](Value *V, Value *V2) { + return compareCmp<false>(V, V2, *TLI, [&R](Instruction *I) { return R.isDeleted(I); }); }; - auto &&AreCompatibleCompares = [&R](Value *V1, Value *V2) { + auto AreCompatibleCompares = [&](Value *V1, Value *V2) { if (V1 == V2) return true; - return compareCmp<true>(V1, V2, + return compareCmp<true>(V1, V2, *TLI, [&R](Instruction *I) { return R.isDeleted(I); }); }; auto Limit = [&R](Value *V) { @@ -12025,9 +13503,10 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions( /*LimitForRegisterSize=*/true); Instructions.clear(); } else { + Instructions.clear(); // Insert in reverse order since the PostponedCmps vector was filled in // reverse order. - Instructions.assign(PostponedCmps.rbegin(), PostponedCmps.rend()); + Instructions.insert(PostponedCmps.rbegin(), PostponedCmps.rend()); } return OpsChanged; } @@ -12056,7 +13535,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { return true; if (Opcodes1.size() > Opcodes2.size()) return false; - Optional<bool> ConstOrder; + std::optional<bool> ConstOrder; for (int I = 0, E = Opcodes1.size(); I < E; ++I) { // Undefs are compatible with any other value. if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I])) { @@ -12078,7 +13557,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { "Different nodes should have different DFS numbers"); if (NodeI1 != NodeI2) return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn(); - InstructionsState S = getSameOpcode({I1, I2}); + InstructionsState S = getSameOpcode({I1, I2}, *TLI); if (S.getOpcode()) continue; return I1->getOpcode() < I2->getOpcode(); @@ -12095,7 +13574,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { } return ConstOrder && *ConstOrder; }; - auto AreCompatiblePHIs = [&PHIToOpcodes](Value *V1, Value *V2) { + auto AreCompatiblePHIs = [&PHIToOpcodes, this](Value *V1, Value *V2) { if (V1 == V2) return true; if (V1->getType() != V2->getType()) @@ -12112,7 +13591,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) { if (I1->getParent() != I2->getParent()) return false; - InstructionsState S = getSameOpcode({I1, I2}); + InstructionsState S = getSameOpcode({I1, I2}, *TLI); if (S.getOpcode()) continue; return false; @@ -12180,7 +13659,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { VisitedInstrs.clear(); - SmallVector<Instruction *, 8> PostProcessInstructions; + InstSetVector PostProcessInstructions; SmallDenseSet<Instruction *, 4> KeyNodes; for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { // Skip instructions with scalable type. The num of elements is unknown at @@ -12232,8 +13711,12 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { !DT->isReachableFromEntry(P->getIncomingBlock(I))) continue; - Changed |= vectorizeRootInstruction(nullptr, P->getIncomingValue(I), - P->getIncomingBlock(I), R, TTI); + // Postponed instructions should not be vectorized here, delay their + // vectorization. 
+ if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I)); + PI && !PostProcessInstructions.contains(PI)) + Changed |= vectorizeRootInstruction(nullptr, P->getIncomingValue(I), + P->getIncomingBlock(I), R, TTI); } continue; } @@ -12241,14 +13724,31 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // Ran into an instruction without users, like terminator, or function call // with ignored return value, store. Ignore unused instructions (basing on // instruction type, except for CallInst and InvokeInst). - if (it->use_empty() && (it->getType()->isVoidTy() || isa<CallInst>(it) || - isa<InvokeInst>(it))) { + if (it->use_empty() && + (it->getType()->isVoidTy() || isa<CallInst, InvokeInst>(it))) { KeyNodes.insert(&*it); bool OpsChanged = false; - if (ShouldStartVectorizeHorAtStore || !isa<StoreInst>(it)) { + auto *SI = dyn_cast<StoreInst>(it); + bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI; + if (SI) { + auto I = Stores.find(getUnderlyingObject(SI->getPointerOperand())); + // Try to vectorize chain in store, if this is the only store to the + // address in the block. + // TODO: This is just a temporarily solution to save compile time. Need + // to investigate if we can safely turn on slp-vectorize-hor-store + // instead to allow lookup for reduction chains in all non-vectorized + // stores (need to check side effects and compile time). + TryToVectorizeRoot = (I == Stores.end() || I->second.size() == 1) && + SI->getValueOperand()->hasOneUse(); + } + if (TryToVectorizeRoot) { for (auto *V : it->operand_values()) { - // Try to match and vectorize a horizontal reduction. - OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI); + // Postponed instructions should not be vectorized here, delay their + // vectorization. + if (auto *VI = dyn_cast<Instruction>(V); + VI && !PostProcessInstructions.contains(VI)) + // Try to match and vectorize a horizontal reduction. + OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI); } } // Start vectorization of post-process list of instructions from the @@ -12266,9 +13766,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { } } - if (isa<InsertElementInst>(it) || isa<CmpInst>(it) || - isa<InsertValueInst>(it)) - PostProcessInstructions.push_back(&*it); + if (isa<CmpInst, InsertElementInst, InsertValueInst>(it)) + PostProcessInstructions.insert(&*it); } return Changed; @@ -12395,7 +13894,7 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { "Different nodes should have different DFS numbers"); if (NodeI1 != NodeI2) return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn(); - InstructionsState S = getSameOpcode({I1, I2}); + InstructionsState S = getSameOpcode({I1, I2}, *TLI); if (S.getOpcode()) return false; return I1->getOpcode() < I2->getOpcode(); @@ -12407,7 +13906,7 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { V2->getValueOperand()->getValueID(); }; - auto &&AreCompatibleStores = [](StoreInst *V1, StoreInst *V2) { + auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) { if (V1 == V2) return true; if (V1->getPointerOperandType() != V2->getPointerOperandType()) @@ -12420,7 +13919,7 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) { if (I1->getParent() != I2->getParent()) return false; - InstructionsState S = getSameOpcode({I1, I2}); + InstructionsState S = getSameOpcode({I1, I2}, *TLI); return S.getOpcode() > 0; } if (isa<Constant>(V1->getValueOperand()) && |
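One heuristic from the vectorizeChainsInBlock changes above is worth spelling out: a store is only taken as a horizontal-reduction root when it is the sole store to its underlying object in the block and its stored value has a single use; everything else is left to the store-chain vectorizer. A reduced model of that decision (StoreRec and its fields are illustrative; the pass keys its Stores map by getUnderlyingObject of the pointer operand):

    #include <cstdio>
    #include <map>
    #include <string>
    #include <vector>

    // Simplified store record: the underlying object of its pointer operand
    // and whether the stored value has exactly one use.
    struct StoreRec {
      std::string UnderlyingObject;
      bool ValueHasOneUse;
      const char *Name;
    };

    int main() {
      std::vector<StoreRec> Stores = {{"A", true, "s0"},
                                      {"A", true, "s1"},
                                      {"B", true, "s2"},
                                      {"C", false, "s3"}};
      // Count the stores per underlying object, as the seed collection does.
      std::map<std::string, unsigned> StoresPerObject;
      for (const StoreRec &S : Stores)
        ++StoresPerObject[S.UnderlyingObject];

      // Only the lone store to an object, whose value has one use, becomes a
      // reduction root; the rest wait for store-chain vectorization.
      for (const StoreRec &S : Stores) {
        bool TryToVectorizeRoot =
            StoresPerObject[S.UnderlyingObject] == 1 && S.ValueHasOneUse;
        std::printf("%s: %s\n", S.Name,
                    TryToVectorizeRoot ? "try a reduction from this store"
                                       : "leave for store-chain vectorization");
      }
      return 0;
    }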