diff options
author | Dimitry Andric <dim@FreeBSD.org> | 2021-12-02 21:49:08 +0000 |
---|---|---|
committer | Dimitry Andric <dim@FreeBSD.org> | 2022-05-14 11:43:49 +0000 |
commit | 4824e7fd18a1223177218d4aec1b3c6c5c4a444e (patch) | |
tree | 5ca6493b1b0bf6a41f257794c0116d5e50fbf37c /contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | |
parent | 5e801ac66d24704442eba426ed13c3effb8a34e7 (diff) | |
parent | f65dcba83ce5035ab88a85fe17628b447eb56e1b (diff) |
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 612 |
1 files changed, 469 insertions, 143 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index e3ef0b794f68..95061e9053fa 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -283,6 +283,26 @@ static bool isCommutative(Instruction *I) { return false; } +/// Checks if the given value is actually an undefined constant vector. +static bool isUndefVector(const Value *V) { + if (isa<UndefValue>(V)) + return true; + auto *C = dyn_cast<Constant>(V); + if (!C) + return false; + if (!C->containsUndefOrPoisonElement()) + return false; + auto *VecTy = dyn_cast<FixedVectorType>(C->getType()); + if (!VecTy) + return false; + for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) { + if (Constant *Elem = C->getAggregateElement(I)) + if (!isa<UndefValue>(Elem)) + return false; + } + return true; +} + /// Checks if the vector of instructions can be represented as a shuffle, like: /// %x0 = extractelement <4 x i8> %x, i32 0 /// %x3 = extractelement <4 x i8> %x, i32 3 @@ -327,7 +347,11 @@ static bool isCommutative(Instruction *I) { /// TargetTransformInfo::getInstructionThroughput? static Optional<TargetTransformInfo::ShuffleKind> isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) { - auto *EI0 = cast<ExtractElementInst>(VL[0]); + const auto *It = + find_if(VL, [](Value *V) { return isa<ExtractElementInst>(V); }); + if (It == VL.end()) + return None; + auto *EI0 = cast<ExtractElementInst>(*It); if (isa<ScalableVectorType>(EI0->getVectorOperandType())) return None; unsigned Size = @@ -336,33 +360,41 @@ isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) { Value *Vec2 = nullptr; enum ShuffleMode { Unknown, Select, Permute }; ShuffleMode CommonShuffleMode = Unknown; + Mask.assign(VL.size(), UndefMaskElem); for (unsigned I = 0, E = VL.size(); I < E; ++I) { + // Undef can be represented as an undef element in a vector. + if (isa<UndefValue>(VL[I])) + continue; auto *EI = cast<ExtractElementInst>(VL[I]); + if (isa<ScalableVectorType>(EI->getVectorOperandType())) + return None; auto *Vec = EI->getVectorOperand(); + // We can extractelement from undef or poison vector. + if (isUndefVector(Vec)) + continue; // All vector operands must have the same number of vector elements. if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size) return None; + if (isa<UndefValue>(EI->getIndexOperand())) + continue; auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand()); if (!Idx) return None; // Undefined behavior if Idx is negative or >= Size. - if (Idx->getValue().uge(Size)) { - Mask.push_back(UndefMaskElem); + if (Idx->getValue().uge(Size)) continue; - } unsigned IntIdx = Idx->getValue().getZExtValue(); - Mask.push_back(IntIdx); - // We can extractelement from undef or poison vector. - if (isa<UndefValue>(Vec)) - continue; + Mask[I] = IntIdx; // For correct shuffling we have to have at most 2 different vector operands // in all extractelement instructions. - if (!Vec1 || Vec1 == Vec) + if (!Vec1 || Vec1 == Vec) { Vec1 = Vec; - else if (!Vec2 || Vec2 == Vec) + } else if (!Vec2 || Vec2 == Vec) { Vec2 = Vec; - else + Mask[I] += Size; + } else { return None; + } if (CommonShuffleMode == Permute) continue; // If the extract index is not the same as the operation number, it is a @@ -1680,6 +1712,28 @@ private: return IsSame(Scalars, ReuseShuffleIndices); } + /// \returns true if current entry has same operands as \p TE. + bool hasEqualOperands(const TreeEntry &TE) const { + if (TE.getNumOperands() != getNumOperands()) + return false; + SmallBitVector Used(getNumOperands()); + for (unsigned I = 0, E = getNumOperands(); I < E; ++I) { + unsigned PrevCount = Used.count(); + for (unsigned K = 0; K < E; ++K) { + if (Used.test(K)) + continue; + if (getOperand(K) == TE.getOperand(I)) { + Used.set(K); + break; + } + } + // Check if we actually found the matching operand. + if (PrevCount == Used.count()) + return false; + } + return true; + } + /// \return Final vectorization factor for the node. Defined by the total /// number of vectorized scalars, including those, used several times in the /// entry and counted in the \a ReuseShuffleIndices, if any. @@ -1773,6 +1827,12 @@ private: return Operands[OpIdx]; } + /// \returns the \p OpIdx operand of this TreeEntry. + ArrayRef<Value *> getOperand(unsigned OpIdx) const { + assert(OpIdx < Operands.size() && "Off bounds"); + return Operands[OpIdx]; + } + /// \returns the number of operands. unsigned getNumOperands() const { return Operands.size(); } @@ -2078,7 +2138,7 @@ private: SmallPtrSet<const Value *, 32> EphValues; /// Holds all of the instructions that we gathered. - SetVector<Instruction *> GatherSeq; + SetVector<Instruction *> GatherShuffleSeq; /// A list of blocks that we are going to CSE. SetVector<BasicBlock *> CSEBlocks; @@ -4386,15 +4446,19 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, bool IsGather) { DenseMap<Value *, int> ExtractVectorsTys; for (auto *V : VL) { + if (isa<UndefValue>(V)) + continue; // If all users of instruction are going to be vectorized and this // instruction itself is not going to be vectorized, consider this // instruction as dead and remove its cost from the final cost of the // vectorized tree. - if (!areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) || - (IsGather && ScalarToTreeEntry.count(V))) + if (!areAllUsersVectorized(cast<Instruction>(V), VectorizedVals)) continue; auto *EE = cast<ExtractElementInst>(V); - unsigned Idx = *getExtractIndex(EE); + Optional<unsigned> EEIdx = getExtractIndex(EE); + if (!EEIdx) + continue; + unsigned Idx = *EEIdx; if (TTIRef.getNumberOfParts(VecTy) != TTIRef.getNumberOfParts(EE->getVectorOperandType())) { auto It = @@ -4426,6 +4490,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, for (const auto &Data : ExtractVectorsTys) { auto *EEVTy = cast<FixedVectorType>(Data.first->getType()); unsigned NumElts = VecTy->getNumElements(); + if (Data.second % NumElts == 0) + continue; if (TTIRef.getNumberOfParts(EEVTy) > TTIRef.getNumberOfParts(VecTy)) { unsigned Idx = (Data.second / NumElts) * NumElts; unsigned EENumElts = EEVTy->getNumElements(); @@ -4488,10 +4554,12 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // broadcast. return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy); } - if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) && - allSameBlock(VL) && - !isa<ScalableVectorType>( - cast<ExtractElementInst>(E->getMainOp())->getVectorOperandType())) { + if ((E->getOpcode() == Instruction::ExtractElement || + all_of(E->Scalars, + [](Value *V) { + return isa<ExtractElementInst, UndefValue>(V); + })) && + allSameType(VL)) { // Check that gather of extractelements can be represented as just a // shuffle of a single/two vectors the scalars are extracted from. SmallVector<int> Mask; @@ -4738,7 +4806,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0)); })); - if (isa<UndefValue>(FirstInsert->getOperand(0))) { + if (isUndefVector(FirstInsert->getOperand(0))) { Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, Mask); } else { SmallVector<int> InsertMask(NumElts); @@ -5016,7 +5084,30 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // VecCost is equal to sum of the cost of creating 2 vectors // and the cost of creating shuffle. InstructionCost VecCost = 0; - if (Instruction::isBinaryOp(E->getOpcode())) { + // Try to find the previous shuffle node with the same operands and same + // main/alternate ops. + auto &&TryFindNodeWithEqualOperands = [this, E]() { + for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) { + if (TE.get() == E) + break; + if (TE->isAltShuffle() && + ((TE->getOpcode() == E->getOpcode() && + TE->getAltOpcode() == E->getAltOpcode()) || + (TE->getOpcode() == E->getAltOpcode() && + TE->getAltOpcode() == E->getOpcode())) && + TE->hasEqualOperands(*E)) + return true; + } + return false; + }; + if (TryFindNodeWithEqualOperands()) { + LLVM_DEBUG({ + dbgs() << "SLP: diamond match for alternate node found.\n"; + E->dump(); + }); + // No need to add new vector costs here since we're going to reuse + // same main/alternate vector ops, just do different shuffling. + } else if (Instruction::isBinaryOp(E->getOpcode())) { VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind); VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind); @@ -5060,7 +5151,11 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const { [this](Value *V) { return EphValues.contains(V); }) && (allConstant(TE->Scalars) || isSplat(TE->Scalars) || TE->Scalars.size() < Limit || - (TE->getOpcode() == Instruction::ExtractElement && + ((TE->getOpcode() == Instruction::ExtractElement || + all_of(TE->Scalars, + [](Value *V) { + return isa<ExtractElementInst, UndefValue>(V); + })) && isFixedVectorShuffle(TE->Scalars, Mask)) || (TE->State == TreeEntry::NeedToGather && TE->getOpcode() == Instruction::Load && !TE->isAltShuffle())); @@ -5280,6 +5375,42 @@ InstructionCost BoUpSLP::getSpillCost() const { return Cost; } +/// Check if two insertelement instructions are from the same buildvector. +static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, + InsertElementInst *V) { + // Instructions must be from the same basic blocks. + if (VU->getParent() != V->getParent()) + return false; + // Checks if 2 insertelements are from the same buildvector. + if (VU->getType() != V->getType()) + return false; + // Multiple used inserts are separate nodes. + if (!VU->hasOneUse() && !V->hasOneUse()) + return false; + auto *IE1 = VU; + auto *IE2 = V; + // Go through the vector operand of insertelement instructions trying to find + // either VU as the original vector for IE2 or V as the original vector for + // IE1. + do { + if (IE2 == VU || IE1 == V) + return true; + if (IE1) { + if (IE1 != VU && !IE1->hasOneUse()) + IE1 = nullptr; + else + IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0)); + } + if (IE2) { + if (IE2 != V && !IE2->hasOneUse()) + IE2 = nullptr; + else + IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0)); + } + } while (IE1 || IE2); + return false; +} + InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { InstructionCost Cost = 0; LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " @@ -5306,7 +5437,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { SmallVector<APInt> DemandedElts; for (ExternalUser &EU : ExternalUses) { // We only add extract cost once for the same scalar. - if (!ExtractCostCalculated.insert(EU.Scalar).second) + if (!isa_and_nonnull<InsertElementInst>(EU.User) && + !ExtractCostCalculated.insert(EU.Scalar).second) continue; // Uses by ephemeral values are free (because the ephemeral value will be @@ -5326,35 +5458,35 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { // If found user is an insertelement, do not calculate extract cost but try // to detect it as a final shuffled/identity match. - if (isa_and_nonnull<InsertElementInst>(EU.User)) { - if (auto *FTy = dyn_cast<FixedVectorType>(EU.User->getType())) { - Optional<int> InsertIdx = getInsertIndex(EU.User, 0); + if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) { + if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) { + Optional<int> InsertIdx = getInsertIndex(VU, 0); if (!InsertIdx || *InsertIdx == UndefMaskElem) continue; - Value *VU = EU.User; auto *It = find_if(FirstUsers, [VU](Value *V) { - // Checks if 2 insertelements are from the same buildvector. - if (VU->getType() != V->getType()) - return false; - auto *IE1 = cast<InsertElementInst>(VU); - auto *IE2 = cast<InsertElementInst>(V); - // Go through of insertelement instructions trying to find either VU - // as the original vector for IE2 or V as the original vector for IE1. - do { - if (IE1 == VU || IE2 == V) - return true; - if (IE1) - IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0)); - if (IE2) - IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0)); - } while (IE1 || IE2); - return false; + return areTwoInsertFromSameBuildVector(VU, + cast<InsertElementInst>(V)); }); int VecId = -1; if (It == FirstUsers.end()) { VF.push_back(FTy->getNumElements()); ShuffleMask.emplace_back(VF.back(), UndefMaskElem); - FirstUsers.push_back(EU.User); + // Find the insertvector, vectorized in tree, if any. + Value *Base = VU; + while (isa<InsertElementInst>(Base)) { + // Build the mask for the vectorized insertelement instructions. + if (const TreeEntry *E = getTreeEntry(Base)) { + VU = cast<InsertElementInst>(Base); + do { + int Idx = E->findLaneForValue(Base); + ShuffleMask.back()[Idx] = Idx; + Base = cast<InsertElementInst>(Base)->getOperand(0); + } while (E == getTreeEntry(Base)); + break; + } + Base = cast<InsertElementInst>(Base)->getOperand(0); + } + FirstUsers.push_back(VU); DemandedElts.push_back(APInt::getZero(VF.back())); VecId = FirstUsers.size() - 1; } else { @@ -5363,6 +5495,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { int Idx = *InsertIdx; ShuffleMask[VecId][Idx] = EU.Lane; DemandedElts[VecId].setBit(Idx); + continue; } } @@ -5386,47 +5519,86 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { InstructionCost SpillCost = getSpillCost(); Cost += SpillCost + ExtractCost; - for (int I = 0, E = FirstUsers.size(); I < E; ++I) { - // For the very first element - simple shuffle of the source vector. - int Limit = ShuffleMask[I].size() * 2; - if (I == 0 && - all_of(ShuffleMask[I], [Limit](int Idx) { return Idx < Limit; }) && - !ShuffleVectorInst::isIdentityMask(ShuffleMask[I])) { + if (FirstUsers.size() == 1) { + int Limit = ShuffleMask.front().size() * 2; + if (all_of(ShuffleMask.front(), [Limit](int Idx) { return Idx < Limit; }) && + !ShuffleVectorInst::isIdentityMask(ShuffleMask.front())) { InstructionCost C = TTI->getShuffleCost( TTI::SK_PermuteSingleSrc, - cast<FixedVectorType>(FirstUsers[I]->getType()), ShuffleMask[I]); + cast<FixedVectorType>(FirstUsers.front()->getType()), + ShuffleMask.front()); LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for final shuffle of insertelement external users " << *VectorizableTree.front()->Scalars.front() << ".\n" << "SLP: Current total cost = " << Cost << "\n"); Cost += C; - continue; } - // Other elements - permutation of 2 vectors (the initial one and the next - // Ith incoming vector). - unsigned VF = ShuffleMask[I].size(); - for (unsigned Idx = 0; Idx < VF; ++Idx) { - int &Mask = ShuffleMask[I][Idx]; - Mask = Mask == UndefMaskElem ? Idx : VF + Mask; - } - InstructionCost C = TTI->getShuffleCost( - TTI::SK_PermuteTwoSrc, cast<FixedVectorType>(FirstUsers[I]->getType()), - ShuffleMask[I]); - LLVM_DEBUG( - dbgs() - << "SLP: Adding cost " << C - << " for final shuffle of vector node and external insertelement users " - << *VectorizableTree.front()->Scalars.front() << ".\n" - << "SLP: Current total cost = " << Cost << "\n"); - Cost += C; InstructionCost InsertCost = TTI->getScalarizationOverhead( - cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I], - /*Insert*/ true, - /*Extract*/ false); + cast<FixedVectorType>(FirstUsers.front()->getType()), + DemandedElts.front(), /*Insert*/ true, /*Extract*/ false); + LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost + << " for insertelements gather.\n" + << "SLP: Current total cost = " << Cost << "\n"); Cost -= InsertCost; + } else if (FirstUsers.size() >= 2) { + unsigned MaxVF = *std::max_element(VF.begin(), VF.end()); + // Combined masks of the first 2 vectors. + SmallVector<int> CombinedMask(MaxVF, UndefMaskElem); + copy(ShuffleMask.front(), CombinedMask.begin()); + APInt CombinedDemandedElts = DemandedElts.front().zextOrSelf(MaxVF); + auto *VecTy = FixedVectorType::get( + cast<VectorType>(FirstUsers.front()->getType())->getElementType(), + MaxVF); + for (int I = 0, E = ShuffleMask[1].size(); I < E; ++I) { + if (ShuffleMask[1][I] != UndefMaskElem) { + CombinedMask[I] = ShuffleMask[1][I] + MaxVF; + CombinedDemandedElts.setBit(I); + } + } + InstructionCost C = + TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + << " for final shuffle of vector node and external " + "insertelement users " + << *VectorizableTree.front()->Scalars.front() << ".\n" + << "SLP: Current total cost = " << Cost << "\n"); + Cost += C; + InstructionCost InsertCost = TTI->getScalarizationOverhead( + VecTy, CombinedDemandedElts, /*Insert*/ true, /*Extract*/ false); LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost << " for insertelements gather.\n" << "SLP: Current total cost = " << Cost << "\n"); + Cost -= InsertCost; + for (int I = 2, E = FirstUsers.size(); I < E; ++I) { + // Other elements - permutation of 2 vectors (the initial one and the + // next Ith incoming vector). + unsigned VF = ShuffleMask[I].size(); + for (unsigned Idx = 0; Idx < VF; ++Idx) { + int Mask = ShuffleMask[I][Idx]; + if (Mask != UndefMaskElem) + CombinedMask[Idx] = MaxVF + Mask; + else if (CombinedMask[Idx] != UndefMaskElem) + CombinedMask[Idx] = Idx; + } + for (unsigned Idx = VF; Idx < MaxVF; ++Idx) + if (CombinedMask[Idx] != UndefMaskElem) + CombinedMask[Idx] = Idx; + InstructionCost C = + TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + << " for final shuffle of vector node and external " + "insertelement users " + << *VectorizableTree.front()->Scalars.front() << ".\n" + << "SLP: Current total cost = " << Cost << "\n"); + Cost += C; + InstructionCost InsertCost = TTI->getScalarizationOverhead( + cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I], + /*Insert*/ true, /*Extract*/ false); + LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost + << " for insertelements gather.\n" + << "SLP: Current total cost = " << Cost << "\n"); + Cost -= InsertCost; + } } #ifndef NDEBUG @@ -5728,7 +5900,7 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL) { auto *InsElt = dyn_cast<InsertElementInst>(Vec); if (!InsElt) return Vec; - GatherSeq.insert(InsElt); + GatherShuffleSeq.insert(InsElt); CSEBlocks.insert(InsElt->getParent()); // Add to our 'need-to-extract' list. if (TreeEntry *Entry = getTreeEntry(V)) { @@ -5771,10 +5943,17 @@ class ShuffleInstructionBuilder { const unsigned VF = 0; bool IsFinalized = false; SmallVector<int, 4> Mask; + /// Holds all of the instructions that we gathered. + SetVector<Instruction *> &GatherShuffleSeq; + /// A list of blocks that we are going to CSE. + SetVector<BasicBlock *> &CSEBlocks; public: - ShuffleInstructionBuilder(IRBuilderBase &Builder, unsigned VF) - : Builder(Builder), VF(VF) {} + ShuffleInstructionBuilder(IRBuilderBase &Builder, unsigned VF, + SetVector<Instruction *> &GatherShuffleSeq, + SetVector<BasicBlock *> &CSEBlocks) + : Builder(Builder), VF(VF), GatherShuffleSeq(GatherShuffleSeq), + CSEBlocks(CSEBlocks) {} /// Adds a mask, inverting it before applying. void addInversedMask(ArrayRef<unsigned> SubMask) { @@ -5804,7 +5983,12 @@ public: if (VF == ValueVF && ShuffleVectorInst::isIdentityMask(Mask)) return V; - return Builder.CreateShuffleVector(V, Mask, "shuffle"); + Value *Vec = Builder.CreateShuffleVector(V, Mask, "shuffle"); + if (auto *I = dyn_cast<Instruction>(Vec)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + return Vec; } ~ShuffleInstructionBuilder() { @@ -5862,6 +6046,10 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { std::iota(UniformMask.begin(), UniformMask.end(), 0); V = Builder.CreateShuffleVector(V, UniformMask, "shrink.shuffle"); } + if (auto *I = dyn_cast<Instruction>(V)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } } return V; } @@ -5909,15 +6097,12 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { VL = UniqueValues; } - ShuffleInstructionBuilder ShuffleBuilder(Builder, VF); + ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleSeq, + CSEBlocks); Value *Vec = gather(VL); if (!ReuseShuffleIndicies.empty()) { ShuffleBuilder.addMask(ReuseShuffleIndicies); Vec = ShuffleBuilder.finalize(Vec); - if (auto *I = dyn_cast<Instruction>(Vec)) { - GatherSeq.insert(I); - CSEBlocks.insert(I->getParent()); - } } return Vec; } @@ -5932,7 +6117,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); unsigned VF = E->getVectorFactor(); - ShuffleInstructionBuilder ShuffleBuilder(Builder, VF); + ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleSeq, + CSEBlocks); if (E->State == TreeEntry::NeedToGather) { if (E->getMainOp()) setInsertPointAfterBundle(E); @@ -5946,16 +6132,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { "Expected shuffle of 1 or 2 entries."); Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue, Entries.back()->VectorizedValue, Mask); + if (auto *I = dyn_cast<Instruction>(Vec)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } } else { Vec = gather(E->Scalars); } if (NeedToShuffleReuses) { ShuffleBuilder.addMask(E->ReuseShuffleIndices); Vec = ShuffleBuilder.finalize(Vec); - if (auto *I = dyn_cast<Instruction>(Vec)) { - GatherSeq.insert(I); - CSEBlocks.insert(I->getParent()); - } } E->VectorizedValue = Vec; return Vec; @@ -6072,11 +6258,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { IsIdentity &= *InsertIdx - Offset == I; Mask[*InsertIdx - Offset] = I; } - if (!IsIdentity || NumElts != NumScalars) + if (!IsIdentity || NumElts != NumScalars) { V = Builder.CreateShuffleVector(V, Mask); + if (auto *I = dyn_cast<Instruction>(V)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + } if ((!IsIdentity || Offset != 0 || - !isa<UndefValue>(FirstInsert->getOperand(0))) && + !isUndefVector(FirstInsert->getOperand(0))) && NumElts != NumScalars) { SmallVector<int> InsertMask(NumElts); std::iota(InsertMask.begin(), InsertMask.end(), 0); @@ -6088,6 +6279,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { V = Builder.CreateShuffleVector( FirstInsert->getOperand(0), V, InsertMask, cast<Instruction>(E->Scalars.back())->getName()); + if (auto *I = dyn_cast<Instruction>(V)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } } ++NumVectorInstructions; @@ -6444,6 +6639,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { V1 = Builder.CreateCast( static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy); } + // Add V0 and V1 to later analysis to try to find and remove matching + // instruction, if any. + for (Value *V : {V0, V1}) { + if (auto *I = dyn_cast<Instruction>(V)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + } // Create shuffle to take alternate operations from the vector. // Also, gather up main and alt scalar ops to propagate IR flags to @@ -6462,8 +6665,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { propagateIRFlags(V1, AltScalars); Value *V = Builder.CreateShuffleVector(V0, V1, Mask); - if (Instruction *I = dyn_cast<Instruction>(V)) + if (auto *I = dyn_cast<Instruction>(V)) { V = propagateMetadata(I, E->Scalars); + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } V = ShuffleBuilder.finalize(V); E->VectorizedValue = V; @@ -6657,10 +6863,10 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { } void BoUpSLP::optimizeGatherSequence() { - LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size() + LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleSeq.size() << " gather sequences instructions.\n"); // LICM InsertElementInst sequences. - for (Instruction *I : GatherSeq) { + for (Instruction *I : GatherShuffleSeq) { if (isDeleted(I)) continue; @@ -6677,11 +6883,10 @@ void BoUpSLP::optimizeGatherSequence() { // If the vector or the element that we insert into it are // instructions that are defined in this basic block then we can't // hoist this instruction. - auto *Op0 = dyn_cast<Instruction>(I->getOperand(0)); - auto *Op1 = dyn_cast<Instruction>(I->getOperand(1)); - if (Op0 && L->contains(Op0)) - continue; - if (Op1 && L->contains(Op1)) + if (any_of(I->operands(), [L](Value *V) { + auto *OpI = dyn_cast<Instruction>(V); + return OpI && L->contains(OpI); + })) continue; // We can hoist this instruction. Move it to the pre-header. @@ -6705,7 +6910,50 @@ void BoUpSLP::optimizeGatherSequence() { return A->getDFSNumIn() < B->getDFSNumIn(); }); - // Perform O(N^2) search over the gather sequences and merge identical + // Less defined shuffles can be replaced by the more defined copies. + // Between two shuffles one is less defined if it has the same vector operands + // and its mask indeces are the same as in the first one or undefs. E.g. + // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0, + // poison, <0, 0, 0, 0>. + auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2, + SmallVectorImpl<int> &NewMask) { + if (I1->getType() != I2->getType()) + return false; + auto *SI1 = dyn_cast<ShuffleVectorInst>(I1); + auto *SI2 = dyn_cast<ShuffleVectorInst>(I2); + if (!SI1 || !SI2) + return I1->isIdenticalTo(I2); + if (SI1->isIdenticalTo(SI2)) + return true; + for (int I = 0, E = SI1->getNumOperands(); I < E; ++I) + if (SI1->getOperand(I) != SI2->getOperand(I)) + return false; + // Check if the second instruction is more defined than the first one. + NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end()); + ArrayRef<int> SM1 = SI1->getShuffleMask(); + // Count trailing undefs in the mask to check the final number of used + // registers. + unsigned LastUndefsCnt = 0; + for (int I = 0, E = NewMask.size(); I < E; ++I) { + if (SM1[I] == UndefMaskElem) + ++LastUndefsCnt; + else + LastUndefsCnt = 0; + if (NewMask[I] != UndefMaskElem && SM1[I] != UndefMaskElem && + NewMask[I] != SM1[I]) + return false; + if (NewMask[I] == UndefMaskElem) + NewMask[I] = SM1[I]; + } + // Check if the last undefs actually change the final number of used vector + // registers. + return SM1.size() - LastUndefsCnt > 1 && + TTI->getNumberOfParts(SI1->getType()) == + TTI->getNumberOfParts( + FixedVectorType::get(SI1->getType()->getElementType(), + SM1.size() - LastUndefsCnt)); + }; + // Perform O(N^2) search over the gather/shuffle sequences and merge identical // instructions. TODO: We can further optimize this scan if we split the // instructions into different buckets based on the insert lane. SmallVector<Instruction *, 16> Visited; @@ -6719,17 +6967,35 @@ void BoUpSLP::optimizeGatherSequence() { if (isDeleted(&In)) continue; if (!isa<InsertElementInst>(&In) && !isa<ExtractElementInst>(&In) && - !isa<ShuffleVectorInst>(&In)) + !isa<ShuffleVectorInst>(&In) && !GatherShuffleSeq.contains(&In)) continue; // Check if we can replace this instruction with any of the // visited instructions. bool Replaced = false; - for (Instruction *v : Visited) { - if (In.isIdenticalTo(v) && - DT->dominates(v->getParent(), In.getParent())) { - In.replaceAllUsesWith(v); + for (Instruction *&V : Visited) { + SmallVector<int> NewMask; + if (IsIdenticalOrLessDefined(&In, V, NewMask) && + DT->dominates(V->getParent(), In.getParent())) { + In.replaceAllUsesWith(V); eraseInstruction(&In); + if (auto *SI = dyn_cast<ShuffleVectorInst>(V)) + if (!NewMask.empty()) + SI->setShuffleMask(NewMask); + Replaced = true; + break; + } + if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) && + GatherShuffleSeq.contains(V) && + IsIdenticalOrLessDefined(V, &In, NewMask) && + DT->dominates(In.getParent(), V->getParent())) { + In.moveAfter(V); + V->replaceAllUsesWith(&In); + eraseInstruction(V); + if (auto *SI = dyn_cast<ShuffleVectorInst>(&In)) + if (!NewMask.empty()) + SI->setShuffleMask(NewMask); + V = &In; Replaced = true; break; } @@ -6741,7 +7007,7 @@ void BoUpSLP::optimizeGatherSequence() { } } CSEBlocks.clear(); - GatherSeq.clear(); + GatherShuffleSeq.clear(); } // Groups the instructions to a bundle (which is then a single scheduling entity) @@ -8791,6 +9057,8 @@ private: assert(VectorizedValue && "Need to have a vectorized tree node"); assert(isPowerOf2_32(ReduxWidth) && "We only handle power-of-two reductions for now"); + assert(RdxKind != RecurKind::FMulAdd && + "A call to the llvm.fmuladd intrinsic is not handled yet"); ++NumVectorInstructions; return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind, @@ -9123,8 +9391,9 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, SmallVector<Value *, 16> BuildVectorOpds; SmallVector<int> Mask; if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) || - (llvm::all_of(BuildVectorOpds, - [](Value *V) { return isa<ExtractElementInst>(V); }) && + (llvm::all_of( + BuildVectorOpds, + [](Value *V) { return isa<ExtractElementInst, UndefValue>(V); }) && isFixedVectorShuffle(BuildVectorOpds, Mask))) return false; @@ -9132,44 +9401,6 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, return tryToVectorizeList(BuildVectorInsts, R); } -bool SLPVectorizerPass::vectorizeSimpleInstructions( - SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R, - bool AtTerminator) { - bool OpsChanged = false; - SmallVector<Instruction *, 4> PostponedCmps; - for (auto *I : reverse(Instructions)) { - if (R.isDeleted(I)) - continue; - if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) - OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R); - else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) - OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R); - else if (isa<CmpInst>(I)) - PostponedCmps.push_back(I); - } - if (AtTerminator) { - // Try to find reductions first. - for (Instruction *I : PostponedCmps) { - if (R.isDeleted(I)) - continue; - for (Value *Op : I->operands()) - OpsChanged |= vectorizeRootInstruction(nullptr, Op, BB, R, TTI); - } - // Try to vectorize operands as vector bundles. - for (Instruction *I : PostponedCmps) { - if (R.isDeleted(I)) - continue; - OpsChanged |= tryToVectorize(I, R); - } - Instructions.clear(); - } else { - // Insert in reverse order since the PostponedCmps vector was filled in - // reverse order. - Instructions.assign(PostponedCmps.rbegin(), PostponedCmps.rend()); - } - return OpsChanged; -} - template <typename T> static bool tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, @@ -9242,6 +9473,101 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, return Changed; } +bool SLPVectorizerPass::vectorizeSimpleInstructions( + SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R, + bool AtTerminator) { + bool OpsChanged = false; + SmallVector<Instruction *, 4> PostponedCmps; + for (auto *I : reverse(Instructions)) { + if (R.isDeleted(I)) + continue; + if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) + OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R); + else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) + OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R); + else if (isa<CmpInst>(I)) + PostponedCmps.push_back(I); + } + if (AtTerminator) { + // Try to find reductions first. + for (Instruction *I : PostponedCmps) { + if (R.isDeleted(I)) + continue; + for (Value *Op : I->operands()) + OpsChanged |= vectorizeRootInstruction(nullptr, Op, BB, R, TTI); + } + // Try to vectorize operands as vector bundles. + for (Instruction *I : PostponedCmps) { + if (R.isDeleted(I)) + continue; + OpsChanged |= tryToVectorize(I, R); + } + // Try to vectorize list of compares. + // Sort by type, compare predicate, etc. + // TODO: Add analysis on the operand opcodes (profitable to vectorize + // instructions with same/alternate opcodes/const values). + auto &&CompareSorter = [&R](Value *V, Value *V2) { + auto *CI1 = cast<CmpInst>(V); + auto *CI2 = cast<CmpInst>(V2); + if (R.isDeleted(CI2) || !isValidElementType(CI2->getType())) + return false; + if (CI1->getOperand(0)->getType()->getTypeID() < + CI2->getOperand(0)->getType()->getTypeID()) + return true; + if (CI1->getOperand(0)->getType()->getTypeID() > + CI2->getOperand(0)->getType()->getTypeID()) + return false; + return CI1->getPredicate() < CI2->getPredicate() || + (CI1->getPredicate() > CI2->getPredicate() && + CI1->getPredicate() < + CmpInst::getSwappedPredicate(CI2->getPredicate())); + }; + + auto &&AreCompatibleCompares = [&R](Value *V1, Value *V2) { + if (V1 == V2) + return true; + auto *CI1 = cast<CmpInst>(V1); + auto *CI2 = cast<CmpInst>(V2); + if (R.isDeleted(CI2) || !isValidElementType(CI2->getType())) + return false; + if (CI1->getOperand(0)->getType() != CI2->getOperand(0)->getType()) + return false; + return CI1->getPredicate() == CI2->getPredicate() || + CI1->getPredicate() == + CmpInst::getSwappedPredicate(CI2->getPredicate()); + }; + auto Limit = [&R](Value *V) { + unsigned EltSize = R.getVectorElementSize(V); + return std::max(2U, R.getMaxVecRegSize() / EltSize); + }; + + SmallVector<Value *> Vals(PostponedCmps.begin(), PostponedCmps.end()); + OpsChanged |= tryToVectorizeSequence<Value>( + Vals, Limit, CompareSorter, AreCompatibleCompares, + [this, &R](ArrayRef<Value *> Candidates, bool LimitForRegisterSize) { + // Exclude possible reductions from other blocks. + bool ArePossiblyReducedInOtherBlock = + any_of(Candidates, [](Value *V) { + return any_of(V->users(), [V](User *U) { + return isa<SelectInst>(U) && + cast<SelectInst>(U)->getParent() != + cast<Instruction>(V)->getParent(); + }); + }); + if (ArePossiblyReducedInOtherBlock) + return false; + return tryToVectorizeList(Candidates, R, LimitForRegisterSize); + }, + /*LimitForRegisterSize=*/true); + Instructions.clear(); + } else { + // Insert in reverse order since the PostponedCmps vector was filled in + // reverse order. + Instructions.assign(PostponedCmps.rbegin(), PostponedCmps.rend()); + } + return OpsChanged; +} + bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { bool Changed = false; SmallVector<Value *, 4> Incoming; |