diff options
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 69 |
1 file changed, 49 insertions, 20 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 8e22b54f002d..055fbb00871f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -6894,6 +6894,31 @@ protected: }; } // namespace +/// Returns the cost of the shuffle instructions with the given \p Kind, vector +/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for insert +/// subvector pattern. +/// Any \p Kind other than TTI::SK_PermuteTwoSrc is forwarded to \p TTI as is. +/// For TTI::SK_PermuteTwoSrc with a \p Mask of more than 2 elements that is an +/// insert-subvector mask with Index + NumSubElts > NumSrcElts and +/// Index + NumSrcElts <= Mask.size(), the cost is instead queried as +/// TTI::SK_InsertSubvector on a Mask.size()-element vector type with \p Tp as +/// the subvector type; every other case falls back to the plain \p TTI query. +/// NOTE(review): the insert-subvector query passes TTI::TCK_RecipThroughput +/// rather than \p CostKind - confirm this is intended. +static InstructionCost +getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, + VectorType *Tp, ArrayRef<int> Mask = std::nullopt, + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, + int Index = 0, VectorType *SubTp = nullptr, + ArrayRef<const Value *> Args = std::nullopt) { + if (Kind != TTI::SK_PermuteTwoSrc) + return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args); + int NumSrcElts = Tp->getElementCount().getKnownMinValue(); + int NumSubElts; + if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask( + Mask, NumSrcElts, NumSubElts, Index)) { + if (Index + NumSubElts > NumSrcElts && + Index + NumSrcElts <= static_cast<int>(Mask.size())) + return TTI.getShuffleCost( + TTI::SK_InsertSubvector, + FixedVectorType::get(Tp->getElementType(), Mask.size()), std::nullopt, + TTI::TCK_RecipThroughput, Index, Tp); + } + return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args); +} + /// Merges shuffle masks and emits final shuffle instruction, if required. It /// supports shuffling of 2 input vectors. 
It implements lazy shuffles emission, /// when the actual shuffle instruction is generated only if this is actually @@ -7141,15 +7166,15 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { std::optional<TTI::ShuffleKind> RegShuffleKind = CheckPerRegistersShuffle(SubMask); if (!RegShuffleKind) { - Cost += TTI.getShuffleCost( - *ShuffleKinds[Part], + Cost += ::getShuffleCost( + TTI, *ShuffleKinds[Part], FixedVectorType::get(VL.front()->getType(), NumElts), MaskSlice); continue; } if (*RegShuffleKind != TTI::SK_PermuteSingleSrc || !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) { - Cost += TTI.getShuffleCost( - *RegShuffleKind, + Cost += ::getShuffleCost( + TTI, *RegShuffleKind, FixedVectorType::get(VL.front()->getType(), EltsPerVector), SubMask); } @@ -7222,8 +7247,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue(); if (isEmptyOrIdentity(Mask, VF)) return TTI::TCC_Free; - return TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, - cast<VectorType>(V1->getType()), Mask); + return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc, + cast<VectorType>(V1->getType()), Mask); } InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const { // Empty mask or identity mask are free. @@ -8101,7 +8126,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I) Mask[I] = ((I >= InMask.size()) || InMask.test(I)) ? 
PoisonMaskElem : I; - Cost += TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, InsertVecTy, Mask); + Cost += + ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask); } } return Cost; @@ -8428,8 +8454,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, return I->getOpcode() == E->getAltOpcode(); }, Mask); - VecCost += TTIRef.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, - FinalVecTy, Mask); + VecCost += ::getShuffleCost(TTIRef, TargetTransformInfo::SK_PermuteTwoSrc, + FinalVecTy, Mask); // Patterns like [fadd,fsub] can be combined into a single instruction // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we // need to take into account their order when looking for the most used @@ -9133,7 +9159,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { auto *FTy = FixedVectorType::get(TEs.back()->Scalars.front()->getType(), VF); InstructionCost C = - TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, FTy, Mask); + ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask); LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for final shuffle of vector node and external " "insertelement users.\n"; @@ -11991,8 +12017,12 @@ Value *BoUpSLP::vectorizeTree( IRBuilder<>::InsertPointGuard Guard(Builder); if (auto *IVec = dyn_cast<Instruction>(Vec)) Builder.SetInsertPoint(IVec->getNextNonDebugInstruction()); - Vec = Builder.CreateIntCast(Vec, VU->getType(), - BWIt->second.second); + Vec = Builder.CreateIntCast( + Vec, + FixedVectorType::get( + cast<VectorType>(VU->getType())->getElementType(), + cast<FixedVectorType>(Vec->getType())->getNumElements()), + BWIt->second.second); VectorCasts.try_emplace(Scalar, Vec); } else { Vec = VecIt->second; @@ -13070,10 +13100,14 @@ bool BoUpSLP::collectValuesToDemote( if (isa<Constant>(V)) return true; - // If the value is not a vectorized instruction in the expression with only - // one use, it cannot be demoted. 
+ // If the value is not a vectorized instruction in the expression and not used + // by the insertelement instruction and not used in multiple vector nodes, it + // cannot be demoted. auto *I = dyn_cast<Instruction>(V); - if (!I || !I->hasOneUse() || !getTreeEntry(I) || !Visited.insert(I).second) + if (!I || !getTreeEntry(I) || MultiNodeScalars.contains(I) || + !Visited.insert(I).second || all_of(I->users(), [&](User *U) { + return isa<InsertElementInst>(U) && !getTreeEntry(U); + })) return false; unsigned Start = 0; @@ -13144,11 +13178,6 @@ bool BoUpSLP::collectValuesToDemote( } void BoUpSLP::computeMinimumValueSizes() { - // If there are no external uses, the expression tree must be rooted by a - // store. We can't demote in-memory values, so there is nothing to do here. - if (ExternalUses.empty()) - return; - // We only attempt to truncate integer expressions. auto &TreeRoot = VectorizableTree[0]->Scalars; auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType()); |