aboutsummaryrefslogtreecommitdiff
path: root/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
diff options
context:
space:
mode:
author Dimitry Andric <dim@FreeBSD.org> 2021-12-02 21:49:08 +0000
committer Dimitry Andric <dim@FreeBSD.org> 2022-05-14 11:43:49 +0000
commit 4824e7fd18a1223177218d4aec1b3c6c5c4a444e (patch)
tree 5ca6493b1b0bf6a41f257794c0116d5e50fbf37c /contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
parent 5e801ac66d24704442eba426ed13c3effb8a34e7 (diff)
parent f65dcba83ce5035ab88a85fe17628b447eb56e1b (diff)
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp 612
1 files changed, 469 insertions, 143 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e3ef0b794f68..95061e9053fa 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -283,6 +283,26 @@ static bool isCommutative(Instruction *I) {
return false;
}
+/// Checks if the given value is actually an undefined constant vector.
+static bool isUndefVector(const Value *V) {
+ if (isa<UndefValue>(V))
+ return true;
+ auto *C = dyn_cast<Constant>(V);
+ if (!C)
+ return false;
+ if (!C->containsUndefOrPoisonElement())
+ return false;
+ auto *VecTy = dyn_cast<FixedVectorType>(C->getType());
+ if (!VecTy)
+ return false;
+ for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
+ if (Constant *Elem = C->getAggregateElement(I))
+ if (!isa<UndefValue>(Elem))
+ return false;
+ }
+ return true;
+}
+
/// Checks if the vector of instructions can be represented as a shuffle, like:
/// %x0 = extractelement <4 x i8> %x, i32 0
/// %x3 = extractelement <4 x i8> %x, i32 3
@@ -327,7 +347,11 @@ static bool isCommutative(Instruction *I) {
/// TargetTransformInfo::getInstructionThroughput?
static Optional<TargetTransformInfo::ShuffleKind>
isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
- auto *EI0 = cast<ExtractElementInst>(VL[0]);
+ const auto *It =
+ find_if(VL, [](Value *V) { return isa<ExtractElementInst>(V); });
+ if (It == VL.end())
+ return None;
+ auto *EI0 = cast<ExtractElementInst>(*It);
if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
return None;
unsigned Size =
@@ -336,33 +360,41 @@ isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
Value *Vec2 = nullptr;
enum ShuffleMode { Unknown, Select, Permute };
ShuffleMode CommonShuffleMode = Unknown;
+ Mask.assign(VL.size(), UndefMaskElem);
for (unsigned I = 0, E = VL.size(); I < E; ++I) {
+ // Undef can be represented as an undef element in a vector.
+ if (isa<UndefValue>(VL[I]))
+ continue;
auto *EI = cast<ExtractElementInst>(VL[I]);
+ if (isa<ScalableVectorType>(EI->getVectorOperandType()))
+ return None;
auto *Vec = EI->getVectorOperand();
+ // We can extractelement from undef or poison vector.
+ if (isUndefVector(Vec))
+ continue;
// All vector operands must have the same number of vector elements.
if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
return None;
+ if (isa<UndefValue>(EI->getIndexOperand()))
+ continue;
auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
if (!Idx)
return None;
// Undefined behavior if Idx is negative or >= Size.
- if (Idx->getValue().uge(Size)) {
- Mask.push_back(UndefMaskElem);
+ if (Idx->getValue().uge(Size))
continue;
- }
unsigned IntIdx = Idx->getValue().getZExtValue();
- Mask.push_back(IntIdx);
- // We can extractelement from undef or poison vector.
- if (isa<UndefValue>(Vec))
- continue;
+ Mask[I] = IntIdx;
// For correct shuffling we have to have at most 2 different vector operands
// in all extractelement instructions.
- if (!Vec1 || Vec1 == Vec)
+ if (!Vec1 || Vec1 == Vec) {
Vec1 = Vec;
- else if (!Vec2 || Vec2 == Vec)
+ } else if (!Vec2 || Vec2 == Vec) {
Vec2 = Vec;
- else
+ Mask[I] += Size;
+ } else {
return None;
+ }
if (CommonShuffleMode == Permute)
continue;
// If the extract index is not the same as the operation number, it is a
@@ -1680,6 +1712,28 @@ private:
return IsSame(Scalars, ReuseShuffleIndices);
}
+ /// \returns true if the current entry has the same operands as \p TE, in any order.
+ bool hasEqualOperands(const TreeEntry &TE) const {
+ if (TE.getNumOperands() != getNumOperands())
+ return false;
+ SmallBitVector Used(getNumOperands());
+ for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
+ unsigned PrevCount = Used.count();
+ for (unsigned K = 0; K < E; ++K) {
+ if (Used.test(K))
+ continue;
+ if (getOperand(K) == TE.getOperand(I)) {
+ Used.set(K);
+ break;
+ }
+ }
+ // Check if we actually found the matching operand.
+ if (PrevCount == Used.count())
+ return false;
+ }
+ return true;
+ }
+
/// \return Final vectorization factor for the node. Defined by the total
/// number of vectorized scalars, including those, used several times in the
/// entry and counted in the \a ReuseShuffleIndices, if any.
@@ -1773,6 +1827,12 @@ private:
return Operands[OpIdx];
}
+ /// \returns the \p OpIdx operand of this TreeEntry.
+ ArrayRef<Value *> getOperand(unsigned OpIdx) const {
+ assert(OpIdx < Operands.size() && "Off bounds");
+ return Operands[OpIdx];
+ }
+
/// \returns the number of operands.
unsigned getNumOperands() const { return Operands.size(); }
@@ -2078,7 +2138,7 @@ private:
SmallPtrSet<const Value *, 32> EphValues;
/// Holds all of the instructions that we gathered.
- SetVector<Instruction *> GatherSeq;
+ SetVector<Instruction *> GatherShuffleSeq;
/// A list of blocks that we are going to CSE.
SetVector<BasicBlock *> CSEBlocks;
@@ -4386,15 +4446,19 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
bool IsGather) {
DenseMap<Value *, int> ExtractVectorsTys;
for (auto *V : VL) {
+ if (isa<UndefValue>(V))
+ continue;
// If all users of instruction are going to be vectorized and this
// instruction itself is not going to be vectorized, consider this
// instruction as dead and remove its cost from the final cost of the
// vectorized tree.
- if (!areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) ||
- (IsGather && ScalarToTreeEntry.count(V)))
+ if (!areAllUsersVectorized(cast<Instruction>(V), VectorizedVals))
continue;
auto *EE = cast<ExtractElementInst>(V);
- unsigned Idx = *getExtractIndex(EE);
+ Optional<unsigned> EEIdx = getExtractIndex(EE);
+ if (!EEIdx)
+ continue;
+ unsigned Idx = *EEIdx;
if (TTIRef.getNumberOfParts(VecTy) !=
TTIRef.getNumberOfParts(EE->getVectorOperandType())) {
auto It =
@@ -4426,6 +4490,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
for (const auto &Data : ExtractVectorsTys) {
auto *EEVTy = cast<FixedVectorType>(Data.first->getType());
unsigned NumElts = VecTy->getNumElements();
+ if (Data.second % NumElts == 0)
+ continue;
if (TTIRef.getNumberOfParts(EEVTy) > TTIRef.getNumberOfParts(VecTy)) {
unsigned Idx = (Data.second / NumElts) * NumElts;
unsigned EENumElts = EEVTy->getNumElements();
@@ -4488,10 +4554,12 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
// broadcast.
return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy);
}
- if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) &&
- allSameBlock(VL) &&
- !isa<ScalableVectorType>(
- cast<ExtractElementInst>(E->getMainOp())->getVectorOperandType())) {
+ if ((E->getOpcode() == Instruction::ExtractElement ||
+ all_of(E->Scalars,
+ [](Value *V) {
+ return isa<ExtractElementInst, UndefValue>(V);
+ })) &&
+ allSameType(VL)) {
// Check that gather of extractelements can be represented as just a
// shuffle of a single/two vectors the scalars are extracted from.
SmallVector<int> Mask;
@@ -4738,7 +4806,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
return !is_contained(E->Scalars,
cast<Instruction>(V)->getOperand(0));
}));
- if (isa<UndefValue>(FirstInsert->getOperand(0))) {
+ if (isUndefVector(FirstInsert->getOperand(0))) {
Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, Mask);
} else {
SmallVector<int> InsertMask(NumElts);
@@ -5016,7 +5084,30 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
// VecCost is equal to sum of the cost of creating 2 vectors
// and the cost of creating shuffle.
InstructionCost VecCost = 0;
- if (Instruction::isBinaryOp(E->getOpcode())) {
+ // Try to find the previous shuffle node with the same operands and same
+ // main/alternate ops.
+ auto &&TryFindNodeWithEqualOperands = [this, E]() {
+ for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+ if (TE.get() == E)
+ break;
+ if (TE->isAltShuffle() &&
+ ((TE->getOpcode() == E->getOpcode() &&
+ TE->getAltOpcode() == E->getAltOpcode()) ||
+ (TE->getOpcode() == E->getAltOpcode() &&
+ TE->getAltOpcode() == E->getOpcode())) &&
+ TE->hasEqualOperands(*E))
+ return true;
+ }
+ return false;
+ };
+ if (TryFindNodeWithEqualOperands()) {
+ LLVM_DEBUG({
+ dbgs() << "SLP: diamond match for alternate node found.\n";
+ E->dump();
+ });
+ // No need to add new vector costs here since we're going to reuse
+ // same main/alternate vector ops, just do different shuffling.
+ } else if (Instruction::isBinaryOp(E->getOpcode())) {
VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy,
CostKind);
@@ -5060,7 +5151,11 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
[this](Value *V) { return EphValues.contains(V); }) &&
(allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
TE->Scalars.size() < Limit ||
- (TE->getOpcode() == Instruction::ExtractElement &&
+ ((TE->getOpcode() == Instruction::ExtractElement ||
+ all_of(TE->Scalars,
+ [](Value *V) {
+ return isa<ExtractElementInst, UndefValue>(V);
+ })) &&
isFixedVectorShuffle(TE->Scalars, Mask)) ||
(TE->State == TreeEntry::NeedToGather &&
TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()));
@@ -5280,6 +5375,42 @@ InstructionCost BoUpSLP::getSpillCost() const {
return Cost;
}
+/// Check if two insertelement instructions are from the same buildvector.
+static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU,
+ InsertElementInst *V) {
+ // Instructions must be from the same basic block.
+ if (VU->getParent() != V->getParent())
+ return false;
+ // Checks if 2 insertelements are from the same buildvector.
+ if (VU->getType() != V->getType())
+ return false;
+ // If both inserts have multiple uses, they belong to separate nodes.
+ if (!VU->hasOneUse() && !V->hasOneUse())
+ return false;
+ auto *IE1 = VU;
+ auto *IE2 = V;
+ // Go through the vector operand of insertelement instructions trying to find
+ // either VU as the original vector for IE2 or V as the original vector for
+ // IE1.
+ do {
+ if (IE2 == VU || IE1 == V)
+ return true;
+ if (IE1) {
+ if (IE1 != VU && !IE1->hasOneUse())
+ IE1 = nullptr;
+ else
+ IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0));
+ }
+ if (IE2) {
+ if (IE2 != V && !IE2->hasOneUse())
+ IE2 = nullptr;
+ else
+ IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0));
+ }
+ } while (IE1 || IE2);
+ return false;
+}
+
InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
InstructionCost Cost = 0;
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
@@ -5306,7 +5437,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
SmallVector<APInt> DemandedElts;
for (ExternalUser &EU : ExternalUses) {
// We only add extract cost once for the same scalar.
- if (!ExtractCostCalculated.insert(EU.Scalar).second)
+ if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
+ !ExtractCostCalculated.insert(EU.Scalar).second)
continue;
// Uses by ephemeral values are free (because the ephemeral value will be
@@ -5326,35 +5458,35 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
// If found user is an insertelement, do not calculate extract cost but try
// to detect it as a final shuffled/identity match.
- if (isa_and_nonnull<InsertElementInst>(EU.User)) {
- if (auto *FTy = dyn_cast<FixedVectorType>(EU.User->getType())) {
- Optional<int> InsertIdx = getInsertIndex(EU.User, 0);
+ if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
+ if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
+ Optional<int> InsertIdx = getInsertIndex(VU, 0);
if (!InsertIdx || *InsertIdx == UndefMaskElem)
continue;
- Value *VU = EU.User;
auto *It = find_if(FirstUsers, [VU](Value *V) {
- // Checks if 2 insertelements are from the same buildvector.
- if (VU->getType() != V->getType())
- return false;
- auto *IE1 = cast<InsertElementInst>(VU);
- auto *IE2 = cast<InsertElementInst>(V);
- // Go through of insertelement instructions trying to find either VU
- // as the original vector for IE2 or V as the original vector for IE1.
- do {
- if (IE1 == VU || IE2 == V)
- return true;
- if (IE1)
- IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0));
- if (IE2)
- IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0));
- } while (IE1 || IE2);
- return false;
+ return areTwoInsertFromSameBuildVector(VU,
+ cast<InsertElementInst>(V));
});
int VecId = -1;
if (It == FirstUsers.end()) {
VF.push_back(FTy->getNumElements());
ShuffleMask.emplace_back(VF.back(), UndefMaskElem);
- FirstUsers.push_back(EU.User);
+ // Find the insertelement instruction vectorized in the tree, if any.
+ Value *Base = VU;
+ while (isa<InsertElementInst>(Base)) {
+ // Build the mask for the vectorized insertelement instructions.
+ if (const TreeEntry *E = getTreeEntry(Base)) {
+ VU = cast<InsertElementInst>(Base);
+ do {
+ int Idx = E->findLaneForValue(Base);
+ ShuffleMask.back()[Idx] = Idx;
+ Base = cast<InsertElementInst>(Base)->getOperand(0);
+ } while (E == getTreeEntry(Base));
+ break;
+ }
+ Base = cast<InsertElementInst>(Base)->getOperand(0);
+ }
+ FirstUsers.push_back(VU);
DemandedElts.push_back(APInt::getZero(VF.back()));
VecId = FirstUsers.size() - 1;
} else {
@@ -5363,6 +5495,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
int Idx = *InsertIdx;
ShuffleMask[VecId][Idx] = EU.Lane;
DemandedElts[VecId].setBit(Idx);
+ continue;
}
}
@@ -5386,47 +5519,86 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
InstructionCost SpillCost = getSpillCost();
Cost += SpillCost + ExtractCost;
- for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
- // For the very first element - simple shuffle of the source vector.
- int Limit = ShuffleMask[I].size() * 2;
- if (I == 0 &&
- all_of(ShuffleMask[I], [Limit](int Idx) { return Idx < Limit; }) &&
- !ShuffleVectorInst::isIdentityMask(ShuffleMask[I])) {
+ if (FirstUsers.size() == 1) {
+ int Limit = ShuffleMask.front().size() * 2;
+ if (all_of(ShuffleMask.front(), [Limit](int Idx) { return Idx < Limit; }) &&
+ !ShuffleVectorInst::isIdentityMask(ShuffleMask.front())) {
InstructionCost C = TTI->getShuffleCost(
TTI::SK_PermuteSingleSrc,
- cast<FixedVectorType>(FirstUsers[I]->getType()), ShuffleMask[I]);
+ cast<FixedVectorType>(FirstUsers.front()->getType()),
+ ShuffleMask.front());
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
<< " for final shuffle of insertelement external users "
<< *VectorizableTree.front()->Scalars.front() << ".\n"
<< "SLP: Current total cost = " << Cost << "\n");
Cost += C;
- continue;
}
- // Other elements - permutation of 2 vectors (the initial one and the next
- // Ith incoming vector).
- unsigned VF = ShuffleMask[I].size();
- for (unsigned Idx = 0; Idx < VF; ++Idx) {
- int &Mask = ShuffleMask[I][Idx];
- Mask = Mask == UndefMaskElem ? Idx : VF + Mask;
- }
- InstructionCost C = TTI->getShuffleCost(
- TTI::SK_PermuteTwoSrc, cast<FixedVectorType>(FirstUsers[I]->getType()),
- ShuffleMask[I]);
- LLVM_DEBUG(
- dbgs()
- << "SLP: Adding cost " << C
- << " for final shuffle of vector node and external insertelement users "
- << *VectorizableTree.front()->Scalars.front() << ".\n"
- << "SLP: Current total cost = " << Cost << "\n");
- Cost += C;
InstructionCost InsertCost = TTI->getScalarizationOverhead(
- cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I],
- /*Insert*/ true,
- /*Extract*/ false);
+ cast<FixedVectorType>(FirstUsers.front()->getType()),
+ DemandedElts.front(), /*Insert*/ true, /*Extract*/ false);
+ LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost
+ << " for insertelements gather.\n"
+ << "SLP: Current total cost = " << Cost << "\n");
Cost -= InsertCost;
+ } else if (FirstUsers.size() >= 2) {
+ unsigned MaxVF = *std::max_element(VF.begin(), VF.end());
+ // Combined masks of the first 2 vectors.
+ SmallVector<int> CombinedMask(MaxVF, UndefMaskElem);
+ copy(ShuffleMask.front(), CombinedMask.begin());
+ APInt CombinedDemandedElts = DemandedElts.front().zextOrSelf(MaxVF);
+ auto *VecTy = FixedVectorType::get(
+ cast<VectorType>(FirstUsers.front()->getType())->getElementType(),
+ MaxVF);
+ for (int I = 0, E = ShuffleMask[1].size(); I < E; ++I) {
+ if (ShuffleMask[1][I] != UndefMaskElem) {
+ CombinedMask[I] = ShuffleMask[1][I] + MaxVF;
+ CombinedDemandedElts.setBit(I);
+ }
+ }
+ InstructionCost C =
+ TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask);
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
+ << " for final shuffle of vector node and external "
+ "insertelement users "
+ << *VectorizableTree.front()->Scalars.front() << ".\n"
+ << "SLP: Current total cost = " << Cost << "\n");
+ Cost += C;
+ InstructionCost InsertCost = TTI->getScalarizationOverhead(
+ VecTy, CombinedDemandedElts, /*Insert*/ true, /*Extract*/ false);
LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost
<< " for insertelements gather.\n"
<< "SLP: Current total cost = " << Cost << "\n");
+ Cost -= InsertCost;
+ for (int I = 2, E = FirstUsers.size(); I < E; ++I) {
+ // Other elements - permutation of 2 vectors (the initial one and the
+ // next Ith incoming vector).
+ unsigned VF = ShuffleMask[I].size();
+ for (unsigned Idx = 0; Idx < VF; ++Idx) {
+ int Mask = ShuffleMask[I][Idx];
+ if (Mask != UndefMaskElem)
+ CombinedMask[Idx] = MaxVF + Mask;
+ else if (CombinedMask[Idx] != UndefMaskElem)
+ CombinedMask[Idx] = Idx;
+ }
+ for (unsigned Idx = VF; Idx < MaxVF; ++Idx)
+ if (CombinedMask[Idx] != UndefMaskElem)
+ CombinedMask[Idx] = Idx;
+ InstructionCost C =
+ TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask);
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
+ << " for final shuffle of vector node and external "
+ "insertelement users "
+ << *VectorizableTree.front()->Scalars.front() << ".\n"
+ << "SLP: Current total cost = " << Cost << "\n");
+ Cost += C;
+ InstructionCost InsertCost = TTI->getScalarizationOverhead(
+ cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I],
+ /*Insert*/ true, /*Extract*/ false);
+ LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost
+ << " for insertelements gather.\n"
+ << "SLP: Current total cost = " << Cost << "\n");
+ Cost -= InsertCost;
+ }
}
#ifndef NDEBUG
@@ -5728,7 +5900,7 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL) {
auto *InsElt = dyn_cast<InsertElementInst>(Vec);
if (!InsElt)
return Vec;
- GatherSeq.insert(InsElt);
+ GatherShuffleSeq.insert(InsElt);
CSEBlocks.insert(InsElt->getParent());
// Add to our 'need-to-extract' list.
if (TreeEntry *Entry = getTreeEntry(V)) {
@@ -5771,10 +5943,17 @@ class ShuffleInstructionBuilder {
const unsigned VF = 0;
bool IsFinalized = false;
SmallVector<int, 4> Mask;
+ /// Holds all of the instructions that we gathered.
+ SetVector<Instruction *> &GatherShuffleSeq;
+ /// A list of blocks that we are going to CSE.
+ SetVector<BasicBlock *> &CSEBlocks;
public:
- ShuffleInstructionBuilder(IRBuilderBase &Builder, unsigned VF)
- : Builder(Builder), VF(VF) {}
+ ShuffleInstructionBuilder(IRBuilderBase &Builder, unsigned VF,
+ SetVector<Instruction *> &GatherShuffleSeq,
+ SetVector<BasicBlock *> &CSEBlocks)
+ : Builder(Builder), VF(VF), GatherShuffleSeq(GatherShuffleSeq),
+ CSEBlocks(CSEBlocks) {}
/// Adds a mask, inverting it before applying.
void addInversedMask(ArrayRef<unsigned> SubMask) {
@@ -5804,7 +5983,12 @@ public:
if (VF == ValueVF && ShuffleVectorInst::isIdentityMask(Mask))
return V;
- return Builder.CreateShuffleVector(V, Mask, "shuffle");
+ Value *Vec = Builder.CreateShuffleVector(V, Mask, "shuffle");
+ if (auto *I = dyn_cast<Instruction>(Vec)) {
+ GatherShuffleSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ return Vec;
}
~ShuffleInstructionBuilder() {
@@ -5862,6 +6046,10 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
std::iota(UniformMask.begin(), UniformMask.end(), 0);
V = Builder.CreateShuffleVector(V, UniformMask, "shrink.shuffle");
}
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ GatherShuffleSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
}
return V;
}
@@ -5909,15 +6097,12 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
VL = UniqueValues;
}
- ShuffleInstructionBuilder ShuffleBuilder(Builder, VF);
+ ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleSeq,
+ CSEBlocks);
Value *Vec = gather(VL);
if (!ReuseShuffleIndicies.empty()) {
ShuffleBuilder.addMask(ReuseShuffleIndicies);
Vec = ShuffleBuilder.finalize(Vec);
- if (auto *I = dyn_cast<Instruction>(Vec)) {
- GatherSeq.insert(I);
- CSEBlocks.insert(I->getParent());
- }
}
return Vec;
}
@@ -5932,7 +6117,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
unsigned VF = E->getVectorFactor();
- ShuffleInstructionBuilder ShuffleBuilder(Builder, VF);
+ ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleSeq,
+ CSEBlocks);
if (E->State == TreeEntry::NeedToGather) {
if (E->getMainOp())
setInsertPointAfterBundle(E);
@@ -5946,16 +6132,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
"Expected shuffle of 1 or 2 entries.");
Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue,
Entries.back()->VectorizedValue, Mask);
+ if (auto *I = dyn_cast<Instruction>(Vec)) {
+ GatherShuffleSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
} else {
Vec = gather(E->Scalars);
}
if (NeedToShuffleReuses) {
ShuffleBuilder.addMask(E->ReuseShuffleIndices);
Vec = ShuffleBuilder.finalize(Vec);
- if (auto *I = dyn_cast<Instruction>(Vec)) {
- GatherSeq.insert(I);
- CSEBlocks.insert(I->getParent());
- }
}
E->VectorizedValue = Vec;
return Vec;
@@ -6072,11 +6258,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
IsIdentity &= *InsertIdx - Offset == I;
Mask[*InsertIdx - Offset] = I;
}
- if (!IsIdentity || NumElts != NumScalars)
+ if (!IsIdentity || NumElts != NumScalars) {
V = Builder.CreateShuffleVector(V, Mask);
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ GatherShuffleSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ }
if ((!IsIdentity || Offset != 0 ||
- !isa<UndefValue>(FirstInsert->getOperand(0))) &&
+ !isUndefVector(FirstInsert->getOperand(0))) &&
NumElts != NumScalars) {
SmallVector<int> InsertMask(NumElts);
std::iota(InsertMask.begin(), InsertMask.end(), 0);
@@ -6088,6 +6279,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
V = Builder.CreateShuffleVector(
FirstInsert->getOperand(0), V, InsertMask,
cast<Instruction>(E->Scalars.back())->getName());
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ GatherShuffleSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
}
++NumVectorInstructions;
@@ -6444,6 +6639,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
V1 = Builder.CreateCast(
static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
}
+ // Add V0 and V1 for later analysis to try to find and remove matching
+ // instructions, if any.
+ for (Value *V : {V0, V1}) {
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ GatherShuffleSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ }
// Create shuffle to take alternate operations from the vector.
// Also, gather up main and alt scalar ops to propagate IR flags to
@@ -6462,8 +6665,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
propagateIRFlags(V1, AltScalars);
Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
- if (Instruction *I = dyn_cast<Instruction>(V))
+ if (auto *I = dyn_cast<Instruction>(V)) {
V = propagateMetadata(I, E->Scalars);
+ GatherShuffleSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
V = ShuffleBuilder.finalize(V);
E->VectorizedValue = V;
@@ -6657,10 +6863,10 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
}
void BoUpSLP::optimizeGatherSequence() {
- LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
+ LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleSeq.size()
<< " gather sequences instructions.\n");
// LICM InsertElementInst sequences.
- for (Instruction *I : GatherSeq) {
+ for (Instruction *I : GatherShuffleSeq) {
if (isDeleted(I))
continue;
@@ -6677,11 +6883,10 @@ void BoUpSLP::optimizeGatherSequence() {
// If the vector or the element that we insert into it are
// instructions that are defined in this basic block then we can't
// hoist this instruction.
- auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
- auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
- if (Op0 && L->contains(Op0))
- continue;
- if (Op1 && L->contains(Op1))
+ if (any_of(I->operands(), [L](Value *V) {
+ auto *OpI = dyn_cast<Instruction>(V);
+ return OpI && L->contains(OpI);
+ }))
continue;
// We can hoist this instruction. Move it to the pre-header.
@@ -6705,7 +6910,50 @@ void BoUpSLP::optimizeGatherSequence() {
return A->getDFSNumIn() < B->getDFSNumIn();
});
- // Perform O(N^2) search over the gather sequences and merge identical
+ // Less defined shuffles can be replaced by their more defined copies.
+ // Of two shuffles, one is less defined if it has the same vector operands
+ // and each of its mask indices is either undef or equal to the corresponding
+ // index of the other one. E.g. shuffle %0, poison, <0, 0, 0, undef> is less
+ // defined than shuffle %0, poison, <0, 0, 0, 0>.
+ auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
+ SmallVectorImpl<int> &NewMask) {
+ if (I1->getType() != I2->getType())
+ return false;
+ auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
+ auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
+ if (!SI1 || !SI2)
+ return I1->isIdenticalTo(I2);
+ if (SI1->isIdenticalTo(SI2))
+ return true;
+ for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
+ if (SI1->getOperand(I) != SI2->getOperand(I))
+ return false;
+ // Check if the second instruction is more defined than the first one.
+ NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
+ ArrayRef<int> SM1 = SI1->getShuffleMask();
+ // Count trailing undefs in the mask to check the final number of used
+ // registers.
+ unsigned LastUndefsCnt = 0;
+ for (int I = 0, E = NewMask.size(); I < E; ++I) {
+ if (SM1[I] == UndefMaskElem)
+ ++LastUndefsCnt;
+ else
+ LastUndefsCnt = 0;
+ if (NewMask[I] != UndefMaskElem && SM1[I] != UndefMaskElem &&
+ NewMask[I] != SM1[I])
+ return false;
+ if (NewMask[I] == UndefMaskElem)
+ NewMask[I] = SM1[I];
+ }
+ // Check if the last undefs actually change the final number of used vector
+ // registers.
+ return SM1.size() - LastUndefsCnt > 1 &&
+ TTI->getNumberOfParts(SI1->getType()) ==
+ TTI->getNumberOfParts(
+ FixedVectorType::get(SI1->getType()->getElementType(),
+ SM1.size() - LastUndefsCnt));
+ };
+ // Perform O(N^2) search over the gather/shuffle sequences and merge identical
// instructions. TODO: We can further optimize this scan if we split the
// instructions into different buckets based on the insert lane.
SmallVector<Instruction *, 16> Visited;
@@ -6719,17 +6967,35 @@ void BoUpSLP::optimizeGatherSequence() {
if (isDeleted(&In))
continue;
if (!isa<InsertElementInst>(&In) && !isa<ExtractElementInst>(&In) &&
- !isa<ShuffleVectorInst>(&In))
+ !isa<ShuffleVectorInst>(&In) && !GatherShuffleSeq.contains(&In))
continue;
// Check if we can replace this instruction with any of the
// visited instructions.
bool Replaced = false;
- for (Instruction *v : Visited) {
- if (In.isIdenticalTo(v) &&
- DT->dominates(v->getParent(), In.getParent())) {
- In.replaceAllUsesWith(v);
+ for (Instruction *&V : Visited) {
+ SmallVector<int> NewMask;
+ if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
+ DT->dominates(V->getParent(), In.getParent())) {
+ In.replaceAllUsesWith(V);
eraseInstruction(&In);
+ if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
+ if (!NewMask.empty())
+ SI->setShuffleMask(NewMask);
+ Replaced = true;
+ break;
+ }
+ if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
+ GatherShuffleSeq.contains(V) &&
+ IsIdenticalOrLessDefined(V, &In, NewMask) &&
+ DT->dominates(In.getParent(), V->getParent())) {
+ In.moveAfter(V);
+ V->replaceAllUsesWith(&In);
+ eraseInstruction(V);
+ if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
+ if (!NewMask.empty())
+ SI->setShuffleMask(NewMask);
+ V = &In;
Replaced = true;
break;
}
@@ -6741,7 +7007,7 @@ void BoUpSLP::optimizeGatherSequence() {
}
}
CSEBlocks.clear();
- GatherSeq.clear();
+ GatherShuffleSeq.clear();
}
// Groups the instructions to a bundle (which is then a single scheduling entity)
@@ -8791,6 +9057,8 @@ private:
assert(VectorizedValue && "Need to have a vectorized tree node");
assert(isPowerOf2_32(ReduxWidth) &&
"We only handle power-of-two reductions for now");
+ assert(RdxKind != RecurKind::FMulAdd &&
+ "A call to the llvm.fmuladd intrinsic is not handled yet");
++NumVectorInstructions;
return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind,
@@ -9123,8 +9391,9 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
SmallVector<Value *, 16> BuildVectorOpds;
SmallVector<int> Mask;
if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
- (llvm::all_of(BuildVectorOpds,
- [](Value *V) { return isa<ExtractElementInst>(V); }) &&
+ (llvm::all_of(
+ BuildVectorOpds,
+ [](Value *V) { return isa<ExtractElementInst, UndefValue>(V); }) &&
isFixedVectorShuffle(BuildVectorOpds, Mask)))
return false;
@@ -9132,44 +9401,6 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
return tryToVectorizeList(BuildVectorInsts, R);
}
-bool SLPVectorizerPass::vectorizeSimpleInstructions(
- SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R,
- bool AtTerminator) {
- bool OpsChanged = false;
- SmallVector<Instruction *, 4> PostponedCmps;
- for (auto *I : reverse(Instructions)) {
- if (R.isDeleted(I))
- continue;
- if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I))
- OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
- else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I))
- OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
- else if (isa<CmpInst>(I))
- PostponedCmps.push_back(I);
- }
- if (AtTerminator) {
- // Try to find reductions first.
- for (Instruction *I : PostponedCmps) {
- if (R.isDeleted(I))
- continue;
- for (Value *Op : I->operands())
- OpsChanged |= vectorizeRootInstruction(nullptr, Op, BB, R, TTI);
- }
- // Try to vectorize operands as vector bundles.
- for (Instruction *I : PostponedCmps) {
- if (R.isDeleted(I))
- continue;
- OpsChanged |= tryToVectorize(I, R);
- }
- Instructions.clear();
- } else {
- // Insert in reverse order since the PostponedCmps vector was filled in
- // reverse order.
- Instructions.assign(PostponedCmps.rbegin(), PostponedCmps.rend());
- }
- return OpsChanged;
-}
-
template <typename T>
static bool
tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming,
@@ -9242,6 +9473,101 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming,
return Changed;
}
+bool SLPVectorizerPass::vectorizeSimpleInstructions(
+ SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R,
+ bool AtTerminator) {
+ bool OpsChanged = false;
+ SmallVector<Instruction *, 4> PostponedCmps;
+ for (auto *I : reverse(Instructions)) {
+ if (R.isDeleted(I))
+ continue;
+ if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I))
+ OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
+ else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I))
+ OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
+ else if (isa<CmpInst>(I))
+ PostponedCmps.push_back(I);
+ }
+ if (AtTerminator) {
+ // Try to find reductions first.
+ for (Instruction *I : PostponedCmps) {
+ if (R.isDeleted(I))
+ continue;
+ for (Value *Op : I->operands())
+ OpsChanged |= vectorizeRootInstruction(nullptr, Op, BB, R, TTI);
+ }
+ // Try to vectorize operands as vector bundles.
+ for (Instruction *I : PostponedCmps) {
+ if (R.isDeleted(I))
+ continue;
+ OpsChanged |= tryToVectorize(I, R);
+ }
+ // Try to vectorize list of compares.
+ // Sort by type, compare predicate, etc.
+ // TODO: Add analysis on the operand opcodes (profitable to vectorize
+ // instructions with same/alternate opcodes/const values).
+ auto &&CompareSorter = [&R](Value *V, Value *V2) {
+ auto *CI1 = cast<CmpInst>(V);
+ auto *CI2 = cast<CmpInst>(V2);
+ if (R.isDeleted(CI2) || !isValidElementType(CI2->getType()))
+ return false;
+ if (CI1->getOperand(0)->getType()->getTypeID() <
+ CI2->getOperand(0)->getType()->getTypeID())
+ return true;
+ if (CI1->getOperand(0)->getType()->getTypeID() >
+ CI2->getOperand(0)->getType()->getTypeID())
+ return false;
+ return CI1->getPredicate() < CI2->getPredicate() ||
+ (CI1->getPredicate() > CI2->getPredicate() &&
+ CI1->getPredicate() <
+ CmpInst::getSwappedPredicate(CI2->getPredicate()));
+ };
+
+ auto &&AreCompatibleCompares = [&R](Value *V1, Value *V2) {
+ if (V1 == V2)
+ return true;
+ auto *CI1 = cast<CmpInst>(V1);
+ auto *CI2 = cast<CmpInst>(V2);
+ if (R.isDeleted(CI2) || !isValidElementType(CI2->getType()))
+ return false;
+ if (CI1->getOperand(0)->getType() != CI2->getOperand(0)->getType())
+ return false;
+ return CI1->getPredicate() == CI2->getPredicate() ||
+ CI1->getPredicate() ==
+ CmpInst::getSwappedPredicate(CI2->getPredicate());
+ };
+ auto Limit = [&R](Value *V) {
+ unsigned EltSize = R.getVectorElementSize(V);
+ return std::max(2U, R.getMaxVecRegSize() / EltSize);
+ };
+
+ SmallVector<Value *> Vals(PostponedCmps.begin(), PostponedCmps.end());
+ OpsChanged |= tryToVectorizeSequence<Value>(
+ Vals, Limit, CompareSorter, AreCompatibleCompares,
+ [this, &R](ArrayRef<Value *> Candidates, bool LimitForRegisterSize) {
+ // Exclude possible reductions from other blocks.
+ bool ArePossiblyReducedInOtherBlock =
+ any_of(Candidates, [](Value *V) {
+ return any_of(V->users(), [V](User *U) {
+ return isa<SelectInst>(U) &&
+ cast<SelectInst>(U)->getParent() !=
+ cast<Instruction>(V)->getParent();
+ });
+ });
+ if (ArePossiblyReducedInOtherBlock)
+ return false;
+ return tryToVectorizeList(Candidates, R, LimitForRegisterSize);
+ },
+ /*LimitForRegisterSize=*/true);
+ Instructions.clear();
+ } else {
+ // Insert in reverse order since the PostponedCmps vector was filled in
+ // reverse order.
+ Instructions.assign(PostponedCmps.rbegin(), PostponedCmps.rend());
+ }
+ return OpsChanged;
+}
+
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
bool Changed = false;
SmallVector<Value *, 4> Incoming;