path: root/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r--   contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp   4539
1 file changed, 3019 insertions(+), 1520 deletions(-)
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 53c11c58f73d..e3eb6b1804e7 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -19,7 +19,6 @@
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/PriorityQueue.h"
#include "llvm/ADT/STLExtras.h"
@@ -94,6 +93,7 @@
#include <cstdint>
#include <iterator>
#include <memory>
+#include <optional>
#include <set>
#include <string>
#include <tuple>
@@ -205,7 +205,7 @@ static bool isValidElementType(Type *Ty) {
/// \returns True if the value is a constant (but not globals/constant
/// expressions).
static bool isConstant(Value *V) {
- return isa<Constant>(V) && !isa<ConstantExpr>(V) && !isa<GlobalValue>(V);
+ return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
}
/// Checks if \p V is one of vector-like instructions, i.e. undef,
@@ -284,24 +284,124 @@ static bool isCommutative(Instruction *I) {
return false;
}
+/// \returns inserting index of InsertElement or InsertValue instruction,
+/// using Offset as base offset for index.
+static std::optional<unsigned> getInsertIndex(const Value *InsertInst,
+ unsigned Offset = 0) {
+ int Index = Offset;
+ if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
+ const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
+ if (!VT)
+ return std::nullopt;
+ const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
+ if (!CI)
+ return std::nullopt;
+ if (CI->getValue().uge(VT->getNumElements()))
+ return std::nullopt;
+ Index *= VT->getNumElements();
+ Index += CI->getZExtValue();
+ return Index;
+ }
+
+ const auto *IV = cast<InsertValueInst>(InsertInst);
+ Type *CurrentType = IV->getType();
+ for (unsigned I : IV->indices()) {
+ if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
+ Index *= ST->getNumElements();
+ CurrentType = ST->getElementType(I);
+ } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
+ Index *= AT->getNumElements();
+ CurrentType = AT->getElementType();
+ } else {
+ return std::nullopt;
+ }
+ Index += I;
+ }
+ return Index;
+}
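As an aside (not part of the patch above): the flattened index that getInsertIndex computes for insertvalue follows a simple mixed-radix recurrence, Index = Index * NumElements + I at each aggregate level. A minimal standalone sketch, using a hypothetical aggregate { [4 x float], [4 x float] } and the index path {1, 2}, just to illustrate the arithmetic:

// Sketch only: the mixed-radix flattening used by getInsertIndex,
// evaluated by hand for a made-up nested aggregate.
#include <cstdio>

int main() {
  unsigned Index = 0;      // Offset defaults to 0.
  // Struct level: 2 members, descend into member 1.
  Index = Index * 2 + 1;   // Index == 1
  // Array level: 4 elements, descend into element 2.
  Index = Index * 4 + 2;   // Index == 6
  std::printf("flattened insert index = %u\n", Index); // prints 6
  return 0;
}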
+
+namespace {
+/// Specifies the way the mask should be analyzed for undefs/poisonous elements
+/// in the shuffle mask.
+enum class UseMask {
+ FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
+ ///< check for the mask elements for the first argument (mask
+ ///< indices are in range [0:VF)).
+ SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
+ ///< for the mask elements for the second argument (mask indices
+ ///< are in range [VF:2*VF))
+ UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
+ ///< future shuffle elements and mark them as ones as being used
+ ///< in future. Non-undef elements are considered as unused since
+ ///< they're already marked as used in the mask.
+};
+} // namespace
+
+/// Prepares a use bitset for the given mask either for the first argument or
+/// for the second.
+static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
+ UseMask MaskArg) {
+ SmallBitVector UseMask(VF, true);
+ for (auto P : enumerate(Mask)) {
+ if (P.value() == UndefMaskElem) {
+ if (MaskArg == UseMask::UndefsAsMask)
+ UseMask.reset(P.index());
+ continue;
+ }
+ if (MaskArg == UseMask::FirstArg && P.value() < VF)
+ UseMask.reset(P.value());
+ else if (MaskArg == UseMask::SecondArg && P.value() >= VF)
+ UseMask.reset(P.value() - VF);
+ }
+ return UseMask;
+}
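To make the bitset semantics concrete, the following standalone sketch (assumed example values; std::vector<bool> stands in for SmallBitVector, which is not available outside LLVM) mirrors what buildUseMask would compute for VF = 4 and the shuffle mask {0, 5, -1, 3}:

// Sketch only: buildUseMask's loop on standard containers.
// For VF = 4 and Mask = {0, 5, -1, 3}:
//   FirstArg     -> lanes 0 and 3 of the first source are consumed  -> 0110
//   SecondArg    -> lane 1 (5 - VF) of the second source is consumed -> 1011
//   UndefsAsMask -> position 2 is an undef placeholder               -> 1101
#include <cstdio>
#include <vector>

enum class UseMaskKind { FirstArg, SecondArg, UndefsAsMask };

std::vector<bool> buildUseMaskSketch(int VF, const std::vector<int> &Mask,
                                     UseMaskKind Kind) {
  std::vector<bool> Use(VF, true); // a cleared (false) bit marks a consumed lane
  for (size_t I = 0; I != Mask.size(); ++I) {
    int M = Mask[I];
    if (M == -1) { // UndefMaskElem
      if (Kind == UseMaskKind::UndefsAsMask)
        Use[I] = false;
      continue;
    }
    if (Kind == UseMaskKind::FirstArg && M < VF)
      Use[M] = false;
    else if (Kind == UseMaskKind::SecondArg && M >= VF)
      Use[M - VF] = false;
  }
  return Use;
}

int main() {
  auto Print = [](const std::vector<bool> &V) {
    for (bool B : V)
      std::printf("%d", B ? 1 : 0);
    std::printf("\n");
  };
  std::vector<int> Mask = {0, 5, -1, 3};
  Print(buildUseMaskSketch(4, Mask, UseMaskKind::FirstArg));     // 0110
  Print(buildUseMaskSketch(4, Mask, UseMaskKind::SecondArg));    // 1011
  Print(buildUseMaskSketch(4, Mask, UseMaskKind::UndefsAsMask)); // 1101
  return 0;
}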
+
/// Checks if the given value is actually an undefined constant vector.
-static bool isUndefVector(const Value *V) {
- if (isa<UndefValue>(V))
- return true;
- auto *C = dyn_cast<Constant>(V);
- if (!C)
- return false;
- if (!C->containsUndefOrPoisonElement())
- return false;
- auto *VecTy = dyn_cast<FixedVectorType>(C->getType());
+/// Also, if the \p UseMask is not empty, tries to check if the non-masked
+/// elements actually mask the insertelement buildvector, if any.
+template <bool IsPoisonOnly = false>
+static SmallBitVector isUndefVector(const Value *V,
+ const SmallBitVector &UseMask = {}) {
+ SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
+ using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
+ if (isa<T>(V))
+ return Res;
+ auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
if (!VecTy)
- return false;
+ return Res.reset();
+ auto *C = dyn_cast<Constant>(V);
+ if (!C) {
+ if (!UseMask.empty()) {
+ const Value *Base = V;
+ while (auto *II = dyn_cast<InsertElementInst>(Base)) {
+ if (isa<T>(II->getOperand(1)))
+ continue;
+ Base = II->getOperand(0);
+ std::optional<unsigned> Idx = getInsertIndex(II);
+ if (!Idx)
+ continue;
+ if (*Idx < UseMask.size() && !UseMask.test(*Idx))
+ Res.reset(*Idx);
+ }
+ // TODO: Add analysis for shuffles here too.
+ if (V == Base) {
+ Res.reset();
+ } else {
+ SmallBitVector SubMask(UseMask.size(), false);
+ Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
+ }
+ } else {
+ Res.reset();
+ }
+ return Res;
+ }
for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
if (Constant *Elem = C->getAggregateElement(I))
- if (!isa<UndefValue>(Elem))
- return false;
+ if (!isa<T>(Elem) &&
+ (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
+ Res.reset(I);
}
- return true;
+ return Res;
}
/// Checks if the vector of instructions can be represented as a shuffle, like:
@@ -345,16 +445,16 @@ static bool isUndefVector(const Value *V) {
/// InstCombiner transforms this into a shuffle and vector mul
/// Mask will return the Shuffle Mask equivalent to the extracted elements.
/// TODO: Can we split off and reuse the shuffle mask detection from
-/// TargetTransformInfo::getInstructionThroughput?
-static Optional<TargetTransformInfo::ShuffleKind>
+/// ShuffleVectorInst/getShuffleCost?
+static std::optional<TargetTransformInfo::ShuffleKind>
isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
const auto *It =
find_if(VL, [](Value *V) { return isa<ExtractElementInst>(V); });
if (It == VL.end())
- return None;
+ return std::nullopt;
auto *EI0 = cast<ExtractElementInst>(*It);
if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
- return None;
+ return std::nullopt;
unsigned Size =
cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
Value *Vec1 = nullptr;
@@ -368,19 +468,19 @@ isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
continue;
auto *EI = cast<ExtractElementInst>(VL[I]);
if (isa<ScalableVectorType>(EI->getVectorOperandType()))
- return None;
+ return std::nullopt;
auto *Vec = EI->getVectorOperand();
// We can extractelement from undef or poison vector.
- if (isUndefVector(Vec))
+ if (isUndefVector(Vec).all())
continue;
// All vector operands must have the same number of vector elements.
if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
- return None;
+ return std::nullopt;
if (isa<UndefValue>(EI->getIndexOperand()))
continue;
auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
if (!Idx)
- return None;
+ return std::nullopt;
// Undefined behavior if Idx is negative or >= Size.
if (Idx->getValue().uge(Size))
continue;
@@ -394,7 +494,7 @@ isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
Vec2 = Vec;
Mask[I] += Size;
} else {
- return None;
+ return std::nullopt;
}
if (CommonShuffleMode == Permute)
continue;
@@ -415,6 +515,24 @@ isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
: TargetTransformInfo::SK_PermuteSingleSrc;
}
+/// \returns True if Extract{Value,Element} instruction extracts element Idx.
+static std::optional<unsigned> getExtractIndex(Instruction *E) {
+ unsigned Opcode = E->getOpcode();
+ assert((Opcode == Instruction::ExtractElement ||
+ Opcode == Instruction::ExtractValue) &&
+ "Expected extractelement or extractvalue instruction.");
+ if (Opcode == Instruction::ExtractElement) {
+ auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
+ if (!CI)
+ return std::nullopt;
+ return CI->getZExtValue();
+ }
+ auto *EI = cast<ExtractValueInst>(E);
+ if (EI->getNumIndices() != 1)
+ return std::nullopt;
+ return *EI->idx_begin();
+}
+
namespace {
/// Main data required for vectorization of instructions.
@@ -473,24 +591,49 @@ static bool isValidForAlternation(unsigned Opcode) {
}
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
+ const TargetLibraryInfo &TLI,
unsigned BaseIndex = 0);
/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
/// compatible instructions or constants, or just some other regular values.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
- Value *Op1) {
+ Value *Op1, const TargetLibraryInfo &TLI) {
return (isConstant(BaseOp0) && isConstant(Op0)) ||
(isConstant(BaseOp1) && isConstant(Op1)) ||
(!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
!isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
- getSameOpcode({BaseOp0, Op0}).getOpcode() ||
- getSameOpcode({BaseOp1, Op1}).getOpcode();
+ BaseOp0 == Op0 || BaseOp1 == Op1 ||
+ getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
+ getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
+}
+
+/// \returns true if a compare instruction \p CI has similar "look" and
+/// same predicate as \p BaseCI, "as is" or with its operands and predicate
+/// swapped, false otherwise.
+static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
+ const TargetLibraryInfo &TLI) {
+ assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
+ "Assessing comparisons of different types?");
+ CmpInst::Predicate BasePred = BaseCI->getPredicate();
+ CmpInst::Predicate Pred = CI->getPredicate();
+ CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
+
+ Value *BaseOp0 = BaseCI->getOperand(0);
+ Value *BaseOp1 = BaseCI->getOperand(1);
+ Value *Op0 = CI->getOperand(0);
+ Value *Op1 = CI->getOperand(1);
+
+ return (BasePred == Pred &&
+ areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
+ (BasePred == SwappedPred &&
+ areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
}
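The swapped-predicate case above relies on the usual identity that a comparison with swapped operands is equivalent once the predicate is swapped as well (e.g. "a < b" is the same test as "b > a"). A tiny standalone sketch of that identity, using a stripped-down two-value predicate enum instead of CmpInst::Predicate:

// Sketch only: the predicate-swap identity exploited by isCmpSameOrSwapped,
// reduced to signed less-than / greater-than.
#include <cassert>

enum class Pred { SLT, SGT };

// Mirrors the idea of CmpInst::getSwappedPredicate for this reduced enum.
Pred swapped(Pred P) { return P == Pred::SLT ? Pred::SGT : Pred::SLT; }

bool cmp(Pred P, int A, int B) { return P == Pred::SLT ? A < B : A > B; }

int main() {
  for (int A = -2; A <= 2; ++A)
    for (int B = -2; B <= 2; ++B)
      // "A < B" and "B > A" always agree.
      assert(cmp(Pred::SLT, A, B) == cmp(swapped(Pred::SLT), B, A));
  return 0;
}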
/// \returns analysis of the Instructions in \p VL described in
/// InstructionsState, the Opcode that we suppose the whole list
/// could be vectorized even if its structure is diverse.
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
+ const TargetLibraryInfo &TLI,
unsigned BaseIndex) {
// Make sure these are all Instructions.
if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
@@ -508,9 +651,19 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
// Check for one alternate opcode from another BinaryOperator.
// TODO - generalize to support all operators (types, calls etc.).
+ auto *IBase = cast<Instruction>(VL[BaseIndex]);
+ Intrinsic::ID BaseID = 0;
+ SmallVector<VFInfo> BaseMappings;
+ if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
+ BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
+ BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
+ if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
+ return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+ }
for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
- unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode();
- if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) {
+ auto *I = cast<Instruction>(VL[Cnt]);
+ unsigned InstOpcode = I->getOpcode();
+ if (IsBinOp && isa<BinaryOperator>(I)) {
if (InstOpcode == Opcode || InstOpcode == AltOpcode)
continue;
if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
@@ -519,9 +672,11 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
AltIndex = Cnt;
continue;
}
- } else if (IsCastOp && isa<CastInst>(VL[Cnt])) {
- Type *Ty0 = cast<Instruction>(VL[BaseIndex])->getOperand(0)->getType();
- Type *Ty1 = cast<Instruction>(VL[Cnt])->getOperand(0)->getType();
+ } else if (IsCastOp && isa<CastInst>(I)) {
+ Value *Op0 = IBase->getOperand(0);
+ Type *Ty0 = Op0->getType();
+ Value *Op1 = I->getOperand(0);
+ Type *Ty1 = Op1->getType();
if (Ty0 == Ty1) {
if (InstOpcode == Opcode || InstOpcode == AltOpcode)
continue;
@@ -534,59 +689,79 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
continue;
}
}
- } else if (IsCmpOp && isa<CmpInst>(VL[Cnt])) {
- auto *BaseInst = cast<Instruction>(VL[BaseIndex]);
- auto *Inst = cast<Instruction>(VL[Cnt]);
+ } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
+ auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
Type *Ty0 = BaseInst->getOperand(0)->getType();
Type *Ty1 = Inst->getOperand(0)->getType();
if (Ty0 == Ty1) {
- Value *BaseOp0 = BaseInst->getOperand(0);
- Value *BaseOp1 = BaseInst->getOperand(1);
- Value *Op0 = Inst->getOperand(0);
- Value *Op1 = Inst->getOperand(1);
- CmpInst::Predicate CurrentPred =
- cast<CmpInst>(VL[Cnt])->getPredicate();
- CmpInst::Predicate SwappedCurrentPred =
- CmpInst::getSwappedPredicate(CurrentPred);
+ assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
// Check for compatible operands. If the corresponding operands are not
// compatible - need to perform alternate vectorization.
- if (InstOpcode == Opcode) {
- if (BasePred == CurrentPred &&
- areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1))
- continue;
- if (BasePred == SwappedCurrentPred &&
- areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0))
- continue;
- if (E == 2 &&
- (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
- continue;
- auto *AltInst = cast<CmpInst>(VL[AltIndex]);
- CmpInst::Predicate AltPred = AltInst->getPredicate();
- Value *AltOp0 = AltInst->getOperand(0);
- Value *AltOp1 = AltInst->getOperand(1);
- // Check if operands are compatible with alternate operands.
- if (AltPred == CurrentPred &&
- areCompatibleCmpOps(AltOp0, AltOp1, Op0, Op1))
- continue;
- if (AltPred == SwappedCurrentPred &&
- areCompatibleCmpOps(AltOp0, AltOp1, Op1, Op0))
+ CmpInst::Predicate CurrentPred = Inst->getPredicate();
+ CmpInst::Predicate SwappedCurrentPred =
+ CmpInst::getSwappedPredicate(CurrentPred);
+
+ if (E == 2 &&
+ (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
+ continue;
+
+ if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
+ continue;
+ auto *AltInst = cast<CmpInst>(VL[AltIndex]);
+ if (AltIndex != BaseIndex) {
+ if (isCmpSameOrSwapped(AltInst, Inst, TLI))
continue;
- }
- if (BaseIndex == AltIndex && BasePred != CurrentPred) {
- assert(isValidForAlternation(Opcode) &&
- isValidForAlternation(InstOpcode) &&
- "Cast isn't safe for alternation, logic needs to be updated!");
+ } else if (BasePred != CurrentPred) {
+ assert(
+ isValidForAlternation(InstOpcode) &&
+ "CmpInst isn't safe for alternation, logic needs to be updated!");
AltIndex = Cnt;
continue;
}
- auto *AltInst = cast<CmpInst>(VL[AltIndex]);
CmpInst::Predicate AltPred = AltInst->getPredicate();
if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
AltPred == CurrentPred || AltPred == SwappedCurrentPred)
continue;
}
- } else if (InstOpcode == Opcode || InstOpcode == AltOpcode)
+ } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
+ if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
+ if (Gep->getNumOperands() != 2 ||
+ Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
+ return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+ } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
+ if (!isVectorLikeInstWithConstOps(EI))
+ return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+ } else if (auto *LI = dyn_cast<LoadInst>(I)) {
+ auto *BaseLI = cast<LoadInst>(IBase);
+ if (!LI->isSimple() || !BaseLI->isSimple())
+ return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+ } else if (auto *Call = dyn_cast<CallInst>(I)) {
+ auto *CallBase = cast<CallInst>(IBase);
+ if (Call->getCalledFunction() != CallBase->getCalledFunction())
+ return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+ if (Call->hasOperandBundles() &&
+ !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
+ Call->op_begin() + Call->getBundleOperandsEndIndex(),
+ CallBase->op_begin() +
+ CallBase->getBundleOperandsStartIndex()))
+ return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
+ if (ID != BaseID)
+ return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+ if (!ID) {
+ SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
+ if (Mappings.size() != BaseMappings.size() ||
+ Mappings.front().ISA != BaseMappings.front().ISA ||
+ Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
+ Mappings.front().VectorName != BaseMappings.front().VectorName ||
+ Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
+ Mappings.front().Shape.Parameters !=
+ BaseMappings.front().Shape.Parameters)
+ return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+ }
+ }
continue;
+ }
return InstructionsState(VL[BaseIndex], nullptr, nullptr);
}
@@ -605,24 +780,6 @@ static bool allSameType(ArrayRef<Value *> VL) {
return true;
}
-/// \returns True if Extract{Value,Element} instruction extracts element Idx.
-static Optional<unsigned> getExtractIndex(Instruction *E) {
- unsigned Opcode = E->getOpcode();
- assert((Opcode == Instruction::ExtractElement ||
- Opcode == Instruction::ExtractValue) &&
- "Expected extractelement or extractvalue instruction.");
- if (Opcode == Instruction::ExtractElement) {
- auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
- if (!CI)
- return None;
- return CI->getZExtValue();
- }
- ExtractValueInst *EI = cast<ExtractValueInst>(E);
- if (EI->getNumIndices() != 1)
- return None;
- return *EI->idx_begin();
-}
-
/// \returns True if in-tree use also needs extract. This refers to
/// possible scalar operand in vectorized instruction.
static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
@@ -644,7 +801,7 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
if (isVectorIntrinsicWithScalarOpAtArg(ID, i))
return (CI->getArgOperand(i) == Scalar);
}
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
}
default:
return false;
@@ -735,40 +892,6 @@ static void inversePermutation(ArrayRef<unsigned> Indices,
Mask[Indices[I]] = I;
}
-/// \returns inserting index of InsertElement or InsertValue instruction,
-/// using Offset as base offset for index.
-static Optional<unsigned> getInsertIndex(const Value *InsertInst,
- unsigned Offset = 0) {
- int Index = Offset;
- if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
- if (const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {
- auto *VT = cast<FixedVectorType>(IE->getType());
- if (CI->getValue().uge(VT->getNumElements()))
- return None;
- Index *= VT->getNumElements();
- Index += CI->getZExtValue();
- return Index;
- }
- return None;
- }
-
- const auto *IV = cast<InsertValueInst>(InsertInst);
- Type *CurrentType = IV->getType();
- for (unsigned I : IV->indices()) {
- if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
- Index *= ST->getNumElements();
- CurrentType = ST->getElementType(I);
- } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
- Index *= AT->getNumElements();
- CurrentType = AT->getElementType();
- } else {
- return None;
- }
- Index += I;
- }
- return Index;
-}
-
/// Reorders the list of scalars in accordance with the given \p Mask.
static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
ArrayRef<int> Mask) {
@@ -839,6 +962,7 @@ namespace slpvectorizer {
class BoUpSLP {
struct TreeEntry;
struct ScheduleData;
+ class ShuffleInstructionBuilder;
public:
using ValueList = SmallVector<Value *, 8>;
@@ -867,7 +991,7 @@ public:
else
MaxVecRegSize =
TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
- .getFixedSize();
+ .getFixedValue();
if (MinVectorRegSizeOption.getNumOccurrences())
MinVecRegSize = MinVectorRegSizeOption;
@@ -882,7 +1006,8 @@ public:
/// Vectorize the tree but with the list of externally used values \p
/// ExternallyUsedValues. Values in this MapVector can be replaced but the
/// generated extractvalue instructions.
- Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues);
+ Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues,
+ Instruction *ReductionRoot = nullptr);
/// \returns the cost incurred by unwanted spills and fills, caused by
/// holding live values over call sites.
@@ -890,7 +1015,7 @@ public:
/// \returns the vectorization cost of the subtree that starts at \p VL.
/// A negative number means that this is profitable.
- InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = None);
+ InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = std::nullopt);
/// Construct a vectorizable tree that starts at \p Roots, ignoring users for
/// the purpose of scheduling and extraction in the \p UserIgnoreLst.
@@ -900,6 +1025,24 @@ public:
/// Construct a vectorizable tree that starts at \p Roots.
void buildTree(ArrayRef<Value *> Roots);
+ /// Checks if the very first tree node is going to be vectorized.
+ bool isVectorizedFirstNode() const {
+ return !VectorizableTree.empty() &&
+ VectorizableTree.front()->State == TreeEntry::Vectorize;
+ }
+
+ /// Returns the main instruction for the very first node.
+ Instruction *getFirstNodeMainOp() const {
+ assert(!VectorizableTree.empty() && "No tree to get the first node from");
+ return VectorizableTree.front()->getMainOp();
+ }
+
+ /// Returns whether the root node has in-tree uses.
+ bool doesRootHaveInTreeUses() const {
+ return !VectorizableTree.empty() &&
+ !VectorizableTree.front()->UserTreeIndices.empty();
+ }
+
/// Builds external uses of the vectorized scalars, i.e. the list of
/// vectorized scalars to be extracted, their lanes and their scalar users. \p
/// ExternallyUsedValues contains additional list of external uses to handle
@@ -912,6 +1055,7 @@ public:
VectorizableTree.clear();
ScalarToTreeEntry.clear();
MustGather.clear();
+ EntryToLastInstruction.clear();
ExternalUses.clear();
for (auto &Iter : BlocksSchedules) {
BlockScheduling *BS = Iter.second.get();
@@ -931,17 +1075,17 @@ public:
/// shuffled vector entry + (possibly) permutation with other gathers. It
/// implements the checks only for possibly ordered scalars (Loads,
/// ExtractElement, ExtractValue), which can be part of the graph.
- Optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
+ std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
/// Sort loads into increasing pointers offsets to allow greater clustering.
- Optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
+ std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
/// Gets reordering data for the given tree entry. If the entry is vectorized
/// - just return ReorderIndices, otherwise check if the scalars can be
/// reordered and return the most optimal order.
/// \param TopToBottom If true, include the order of vectorized stores and
/// insertelement nodes, otherwise skip them.
- Optional<OrdersType> getReorderingData(const TreeEntry &TE, bool TopToBottom);
+ std::optional<OrdersType> getReorderingData(const TreeEntry &TE, bool TopToBottom);
/// Reorders the current graph to the most profitable order starting from the
/// root node to the leaf nodes. The best order is chosen only from the nodes
@@ -1052,6 +1196,7 @@ public:
/// A helper class used for scoring candidates for two consecutive lanes.
class LookAheadHeuristics {
+ const TargetLibraryInfo &TLI;
const DataLayout &DL;
ScalarEvolution &SE;
const BoUpSLP &R;
@@ -1059,9 +1204,11 @@ public:
int MaxLevel; // The maximum recursion depth for accumulating score.
public:
- LookAheadHeuristics(const DataLayout &DL, ScalarEvolution &SE,
- const BoUpSLP &R, int NumLanes, int MaxLevel)
- : DL(DL), SE(SE), R(R), NumLanes(NumLanes), MaxLevel(MaxLevel) {}
+ LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
+ ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
+ int MaxLevel)
+ : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
+ MaxLevel(MaxLevel) {}
// The hard-coded scores listed here are not very important, though it shall
// be higher for better matches to improve the resulting cost. When
@@ -1083,6 +1230,8 @@ public:
static const int ScoreSplatLoads = 3;
/// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreReversedLoads = 3;
+ /// A load candidate for masked gather.
+ static const int ScoreMaskedGatherCandidate = 1;
/// ExtractElementInst from same vector and consecutive indexes.
static const int ScoreConsecutiveExtracts = 4;
/// ExtractElementInst from same vector and reversed indices.
@@ -1108,6 +1257,10 @@ public:
/// MainAltOps.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
ArrayRef<Value *> MainAltOps) const {
+ if (!isValidElementType(V1->getType()) ||
+ !isValidElementType(V2->getType()))
+ return LookAheadHeuristics::ScoreFail;
+
if (V1 == V2) {
if (isa<LoadInst>(V1)) {
// Returns true if the users of V1 and V2 won't need to be extracted.
@@ -1137,18 +1290,26 @@ public:
auto *LI1 = dyn_cast<LoadInst>(V1);
auto *LI2 = dyn_cast<LoadInst>(V2);
if (LI1 && LI2) {
- if (LI1->getParent() != LI2->getParent())
+ if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
+ !LI2->isSimple())
return LookAheadHeuristics::ScoreFail;
- Optional<int> Dist = getPointersDiff(
+ std::optional<int> Dist = getPointersDiff(
LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
- if (!Dist || *Dist == 0)
+ if (!Dist || *Dist == 0) {
+ if (getUnderlyingObject(LI1->getPointerOperand()) ==
+ getUnderlyingObject(LI2->getPointerOperand()) &&
+ R.TTI->isLegalMaskedGather(
+ FixedVectorType::get(LI1->getType(), NumLanes),
+ LI1->getAlign()))
+ return LookAheadHeuristics::ScoreMaskedGatherCandidate;
return LookAheadHeuristics::ScoreFail;
+ }
// The distance is too large - still may be profitable to use masked
// loads/gathers.
if (std::abs(*Dist) > NumLanes / 2)
- return LookAheadHeuristics::ScoreAltOpcodes;
+ return LookAheadHeuristics::ScoreMaskedGatherCandidate;
// This still will detect consecutive loads, but we might have "holes"
// in some cases. It is ok for non-power-2 vectorization and may produce
// better results. It should not affect current vectorization.
@@ -1177,7 +1338,7 @@ public:
// Undefs are always profitable for extractelements.
if (!Ex2Idx)
return LookAheadHeuristics::ScoreConsecutiveExtracts;
- if (isUndefVector(EV2) && EV2->getType() == EV1->getType())
+ if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
return LookAheadHeuristics::ScoreConsecutiveExtracts;
if (EV2 == EV1) {
int Idx1 = Ex1Idx->getZExtValue();
@@ -1205,7 +1366,7 @@ public:
SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
Ops.push_back(I1);
Ops.push_back(I2);
- InstructionsState S = getSameOpcode(Ops);
+ InstructionsState S = getSameOpcode(Ops, TLI);
// Note: Only consider instructions with <= 2 operands to avoid
// complexity explosion.
if (S.getOpcode() &&
@@ -1300,7 +1461,7 @@ public:
// Recursively calculate the cost at each level
int TmpScore =
getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
- I1, I2, CurrLevel + 1, None);
+ I1, I2, CurrLevel + 1, std::nullopt);
// Look for the best score.
if (TmpScore > LookAheadHeuristics::ScoreFail &&
TmpScore > MaxTmpScore) {
@@ -1381,6 +1542,7 @@ public:
/// A vector of operand vectors.
SmallVector<OperandDataVec, 4> OpsVec;
+ const TargetLibraryInfo &TLI;
const DataLayout &DL;
ScalarEvolution &SE;
const BoUpSLP &R;
@@ -1464,7 +1626,7 @@ public:
auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
return 0;
- return R.areAllUsersVectorized(IdxLaneI, None)
+ return R.areAllUsersVectorized(IdxLaneI, std::nullopt)
? LookAheadHeuristics::ScoreAllUserVectorized
: 0;
}
@@ -1482,7 +1644,7 @@ public:
int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
int Lane, unsigned OpIdx, unsigned Idx,
bool &IsUsed) {
- LookAheadHeuristics LookAhead(DL, SE, R, getNumLanes(),
+ LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
LookAheadMaxDepth);
// Keep track of the instruction stack as we recurse into the operands
// during the look-ahead score exploration.
@@ -1520,8 +1682,8 @@ public:
// Search all operands in Ops[*][Lane] for the one that matches best
// Ops[OpIdx][LastLane] and return its operand index.
- // If no good match can be found, return None.
- Optional<unsigned> getBestOperand(unsigned OpIdx, int Lane, int LastLane,
+ // If no good match can be found, return std::nullopt.
+ std::optional<unsigned> getBestOperand(unsigned OpIdx, int Lane, int LastLane,
ArrayRef<ReorderingMode> ReorderingModes,
ArrayRef<Value *> MainAltOps) {
unsigned NumOperands = getNumOperands();
@@ -1532,7 +1694,7 @@ public:
// Our strategy mode for OpIdx.
ReorderingMode RMode = ReorderingModes[OpIdx];
if (RMode == ReorderingMode::Failed)
- return None;
+ return std::nullopt;
// The linearized opcode of the operand at OpIdx, Lane.
bool OpIdxAPO = getData(OpIdx, Lane).APO;
@@ -1541,7 +1703,7 @@ public:
// Sometimes we have more than one option (e.g., Opcode and Undefs), so we
// are using the score to differentiate between the two.
struct BestOpData {
- Optional<unsigned> Idx = None;
+ std::optional<unsigned> Idx;
unsigned Score = 0;
} BestOp;
BestOp.Score =
@@ -1600,8 +1762,8 @@ public:
getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
return BestOp.Idx;
}
- // If we could not find a good match return None.
- return None;
+ // If we could not find a good match return std::nullopt.
+ return std::nullopt;
}
/// Helper for reorderOperandVecs.
@@ -1704,7 +1866,7 @@ public:
// Use Boyer-Moore majority voting for finding the majority opcode and
// the number of times it occurs.
if (auto *I = dyn_cast<Instruction>(OpData.V)) {
- if (!OpcodeI || !getSameOpcode({OpcodeI, I}).getOpcode() ||
+ if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
I->getParent() != Parent) {
if (NumOpsWithSameOpcodeParent == 0) {
NumOpsWithSameOpcodeParent = 1;
@@ -1806,9 +1968,9 @@ public:
public:
/// Initialize with all the operands of the instruction vector \p RootVL.
- VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL,
- ScalarEvolution &SE, const BoUpSLP &R)
- : DL(DL), SE(SE), R(R) {
+ VLOperands(ArrayRef<Value *> RootVL, const TargetLibraryInfo &TLI,
+ const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R)
+ : TLI(TLI), DL(DL), SE(SE), R(R) {
// Append all the operands of RootVL.
appendOperandsOfVL(RootVL);
}
@@ -1930,7 +2092,7 @@ public:
// Look for a good match for each operand.
for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
// Search for the operand that matches SortedOps[OpIdx][Lane-1].
- Optional<unsigned> BestIdx = getBestOperand(
+ std::optional<unsigned> BestIdx = getBestOperand(
OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
// By not selecting a value, we allow the operands that follow to
// select a better matching value. We will get a non-null value in
@@ -1949,7 +2111,7 @@ public:
if (MainAltOps[OpIdx].size() != 2) {
OperandData &AltOp = getData(OpIdx, Lane);
InstructionsState OpS =
- getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V});
+ getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
if (OpS.getOpcode() && OpS.isAltShuffle())
MainAltOps[OpIdx].push_back(AltOp.V);
}
@@ -2018,21 +2180,21 @@ public:
/// Evaluate each pair in \p Candidates and return index into \p Candidates
/// for a pair which have highest score deemed to have best chance to form
- /// root of profitable tree to vectorize. Return None if no candidate scored
- /// above the LookAheadHeuristics::ScoreFail.
- /// \param Limit Lower limit of the cost, considered to be good enough score.
- Optional<int>
+ /// root of profitable tree to vectorize. Return std::nullopt if no candidate
+ /// scored above the LookAheadHeuristics::ScoreFail. \param Limit Lower limit
+ /// of the cost, considered to be good enough score.
+ std::optional<int>
findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
int Limit = LookAheadHeuristics::ScoreFail) {
- LookAheadHeuristics LookAhead(*DL, *SE, *this, /*NumLanes=*/2,
+ LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
RootLookAheadMaxDepth);
int BestScore = Limit;
- Optional<int> Index = None;
+ std::optional<int> Index;
for (int I : seq<int>(0, Candidates.size())) {
int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
Candidates[I].second,
/*U1=*/nullptr, /*U2=*/nullptr,
- /*Level=*/1, None);
+ /*Level=*/1, std::nullopt);
if (Score > BestScore) {
BestScore = Score;
Index = I;
@@ -2063,7 +2225,7 @@ public:
}
/// Checks if the provided list of reduced values was checked already for
/// vectorization.
- bool areAnalyzedReductionVals(ArrayRef<Value *> VL) {
+ bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
return AnalyzedReductionVals.contains(hash_value(VL));
}
/// Adds the list of reduced values to list of already checked values for the
@@ -2081,6 +2243,9 @@ public:
return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
}
+ /// Check if the value is vectorized in the tree.
+ bool isVectorized(Value *V) const { return getTreeEntry(V); }
+
~BoUpSLP();
private:
@@ -2097,6 +2262,10 @@ private:
ArrayRef<TreeEntry *> ReorderableGathers,
SmallVectorImpl<TreeEntry *> &GatherOps);
+ /// Checks if the given \p TE is a gather node with clustered reused scalars
+ /// and reorders it per given \p Mask.
+ void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
+
/// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
/// if any. If it is not vectorized (gather node), returns nullptr.
TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
@@ -2123,6 +2292,11 @@ private:
bool areAllUsersVectorized(Instruction *I,
ArrayRef<Value *> VectorizedVals) const;
+ /// Return information about the vector formed for the specified index
+ /// of a vector of (the same) instruction.
+ TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> VL,
+ unsigned OpIdx);
+
/// \returns the cost of the vectorizable entry.
InstructionCost getEntryCost(const TreeEntry *E,
ArrayRef<Value *> VectorizedVals);
@@ -2142,13 +2316,14 @@ private:
/// Vectorize a single entry in the tree.
Value *vectorizeTree(TreeEntry *E);
- /// Vectorize a single entry in the tree, starting in \p VL.
- Value *vectorizeTree(ArrayRef<Value *> VL);
+ /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
+ /// \p E.
+ Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
/// Create a new vector from a list of scalar values. Produces a sequence
/// which exploits values reused across lanes, and arranges the inserts
/// for ease of later optimization.
- Value *createBuildVector(ArrayRef<Value *> VL);
+ Value *createBuildVector(const TreeEntry *E);
/// \returns the scalarization cost for this type. Scalarization in this
/// context means the creation of vectors from a group of scalars. If \p
@@ -2158,12 +2333,22 @@ private:
const APInt &ShuffledIndices,
bool NeedToShuffle) const;
+ /// Returns the instruction in the bundle, which can be used as a base point
+ /// for scheduling. Usually it is the last instruction in the bundle, except
+ /// for the case when all operands are external (in this case, it is the first
+ /// instruction in the list).
+ Instruction &getLastInstructionInBundle(const TreeEntry *E);
+
/// Checks if the gathered \p VL can be represented as shuffle(s) of previous
/// tree entries.
+ /// \param TE Tree entry checked for permutation.
+ /// \param VL List of scalars (a subset of the TE scalar), checked for
+ /// permutations.
/// \returns ShuffleKind, if gathered values can be represented as shuffles of
/// previous tree entries. \p Mask is filled with the shuffle mask.
- Optional<TargetTransformInfo::ShuffleKind>
- isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
+ std::optional<TargetTransformInfo::ShuffleKind>
+ isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
+ SmallVectorImpl<int> &Mask,
SmallVectorImpl<const TreeEntry *> &Entries);
/// \returns the scalarization cost for this list of values. Assuming that
@@ -2184,12 +2369,10 @@ private:
/// Reorder commutative or alt operands to get better probability of
/// generating vectorized code.
- static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
- SmallVectorImpl<Value *> &Left,
- SmallVectorImpl<Value *> &Right,
- const DataLayout &DL,
- ScalarEvolution &SE,
- const BoUpSLP &R);
+ static void reorderInputsAccordingToOpcode(
+ ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left,
+ SmallVectorImpl<Value *> &Right, const TargetLibraryInfo &TLI,
+ const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R);
/// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
/// users of \p TE and collects the stores. It returns the map from the store
@@ -2198,10 +2381,10 @@ private:
collectUserStores(const BoUpSLP::TreeEntry *TE) const;
/// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
- /// stores in \p StoresVec can for a vector instruction. If so it returns true
+ /// stores in \p StoresVec can form a vector instruction. If so it returns true
/// and populates \p ReorderIndices with the shuffle indices of the the stores
/// when compared to the sorted vector.
- bool CanFormVector(const SmallVector<StoreInst *, 4> &StoresVec,
+ bool canFormVector(const SmallVector<StoreInst *, 4> &StoresVec,
OrdersType &ReorderIndices) const;
/// Iterates through the users of \p TE, looking for scalar stores that can be
@@ -2247,6 +2430,12 @@ private:
return IsSame(Scalars, ReuseShuffleIndices);
}
+ bool isOperandGatherNode(const EdgeInfo &UserEI) const {
+ return State == TreeEntry::NeedToGather &&
+ UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
+ UserTreeIndices.front().UserTE == UserEI.UserTE;
+ }
+
/// \returns true if current entry has same operands as \p TE.
bool hasEqualOperands(const TreeEntry &TE) const {
if (TE.getNumOperands() != getNumOperands())
@@ -2508,11 +2697,11 @@ private:
#endif
/// Create a new VectorizableTree entry.
- TreeEntry *newTreeEntry(ArrayRef<Value *> VL, Optional<ScheduleData *> Bundle,
+ TreeEntry *newTreeEntry(ArrayRef<Value *> VL, std::optional<ScheduleData *> Bundle,
const InstructionsState &S,
const EdgeInfo &UserTreeIdx,
- ArrayRef<int> ReuseShuffleIndices = None,
- ArrayRef<unsigned> ReorderIndices = None) {
+ ArrayRef<int> ReuseShuffleIndices = std::nullopt,
+ ArrayRef<unsigned> ReorderIndices = std::nullopt) {
TreeEntry::EntryState EntryState =
Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
@@ -2521,11 +2710,11 @@ private:
TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
TreeEntry::EntryState EntryState,
- Optional<ScheduleData *> Bundle,
+ std::optional<ScheduleData *> Bundle,
const InstructionsState &S,
const EdgeInfo &UserTreeIdx,
- ArrayRef<int> ReuseShuffleIndices = None,
- ArrayRef<unsigned> ReorderIndices = None) {
+ ArrayRef<int> ReuseShuffleIndices = std::nullopt,
+ ArrayRef<unsigned> ReorderIndices = std::nullopt) {
assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
(Bundle && EntryState != TreeEntry::NeedToGather)) &&
"Need to vectorize gather entry?");
@@ -2547,7 +2736,7 @@ private:
return UndefValue::get(VL.front()->getType());
return VL[Idx];
});
- InstructionsState S = getSameOpcode(Last->Scalars);
+ InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
Last->setOperations(S);
Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
}
@@ -2611,6 +2800,14 @@ private:
/// A list of scalars that we found that we need to keep as scalars.
ValueSet MustGather;
+ /// A map between the vectorized entries and the last instructions in the
+ /// bundles. The bundles are built in use order, not in the def order of the
+ /// instructions. So, we cannot rely directly on the last instruction in the
+ /// bundle being the last instruction in the program order during
+ /// vectorization process since the basic blocks are affected, need to
+ /// pre-gather them before.
+ DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
+
/// This POD struct describes one external user in the vectorized tree.
struct ExternalUser {
ExternalUser(Value *S, llvm::User *U, int L)
@@ -2635,9 +2832,9 @@ private:
Instruction *Inst2) {
// First check if the result is already in the cache.
AliasCacheKey key = std::make_pair(Inst1, Inst2);
- Optional<bool> &result = AliasCache[key];
+ std::optional<bool> &result = AliasCache[key];
if (result) {
- return result.value();
+ return *result;
}
bool aliased = true;
if (Loc1.Ptr && isSimple(Inst1))
@@ -2651,7 +2848,7 @@ private:
/// Cache for alias results.
/// TODO: consider moving this to the AliasAnalysis itself.
- DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
+ DenseMap<AliasCacheKey, std::optional<bool>> AliasCache;
// Cache for pointerMayBeCaptured calls inside AA. This is preserved
// globally through SLP because we don't perform any action which
@@ -2680,8 +2877,9 @@ private:
/// Values used only by @llvm.assume calls.
SmallPtrSet<const Value *, 32> EphValues;
- /// Holds all of the instructions that we gathered.
- SetVector<Instruction *> GatherShuffleSeq;
+ /// Holds all of the instructions that we gathered, shuffle instructions and
+ /// extractelements.
+ SetVector<Instruction *> GatherShuffleExtractSeq;
/// A list of blocks that we are going to CSE.
SetVector<BasicBlock *> CSEBlocks;
@@ -2994,7 +3192,7 @@ private:
// okay.
auto *In = BundleMember->Inst;
assert(In &&
- (isa<ExtractValueInst>(In) || isa<ExtractElementInst>(In) ||
+ (isa<ExtractValueInst, ExtractElementInst>(In) ||
In->getNumOperands() == TE->getNumOperands()) &&
"Missed TreeEntry operands?");
(void)In; // fake use to avoid build failure when assertions disabled
@@ -3102,9 +3300,9 @@ private:
/// Checks if a bundle of instructions can be scheduled, i.e. has no
/// cyclic dependencies. This is only a dry-run, no instructions are
/// actually moved at this stage.
- /// \returns the scheduling bundle. The returned Optional value is non-None
- /// if \p VL is allowed to be scheduled.
- Optional<ScheduleData *>
+ /// \returns the scheduling bundle. The returned Optional value is not
+ /// std::nullopt if \p VL is allowed to be scheduled.
+ std::optional<ScheduleData *>
tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
const InstructionsState &S);
@@ -3319,9 +3517,10 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
std::string Str;
raw_string_ostream OS(Str);
+ OS << Entry->Idx << ".\n";
if (isSplat(Entry->Scalars))
OS << "<splat> ";
- for (auto V : Entry->Scalars) {
+ for (auto *V : Entry->Scalars) {
OS << *V;
if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
return EU.Scalar == V;
@@ -3336,6 +3535,8 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
const BoUpSLP *) {
if (Entry->State == TreeEntry::NeedToGather)
return "color=red";
+ if (Entry->State == TreeEntry::ScatterVectorize)
+ return "color=blue";
return "";
}
};
@@ -3407,7 +3608,7 @@ static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask) {
fixupOrderingIndices(Order);
}
-Optional<BoUpSLP::OrdersType>
+std::optional<BoUpSLP::OrdersType>
BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
unsigned NumScalars = TE.Scalars.size();
@@ -3427,11 +3628,11 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
STE = LocalSTE;
else if (STE != LocalSTE)
// Take the order only from the single vector node.
- return None;
+ return std::nullopt;
unsigned Lane =
std::distance(STE->Scalars.begin(), find(STE->Scalars, V));
if (Lane >= NumScalars)
- return None;
+ return std::nullopt;
if (CurrentOrder[Lane] != NumScalars) {
if (Lane != I)
continue;
@@ -3470,7 +3671,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
}
return CurrentOrder;
}
- return None;
+ return std::nullopt;
}
namespace {
@@ -3478,12 +3679,31 @@ namespace {
enum class LoadsState { Gather, Vectorize, ScatterVectorize };
} // anonymous namespace
+static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
+ const TargetLibraryInfo &TLI,
+ bool CompareOpcodes = true) {
+ if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
+ return false;
+ auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
+ if (!GEP1)
+ return false;
+ auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
+ if (!GEP2)
+ return false;
+ return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
+ ((isConstant(GEP1->getOperand(1)) &&
+ isConstant(GEP2->getOperand(1))) ||
+ !CompareOpcodes ||
+ getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
+ .getOpcode());
+}
+
/// Checks if the given array of loads can be represented as a vectorized,
/// scatter or just simple gather.
static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
const TargetTransformInfo &TTI,
const DataLayout &DL, ScalarEvolution &SE,
- LoopInfo &LI,
+ LoopInfo &LI, const TargetLibraryInfo &TLI,
SmallVectorImpl<unsigned> &Order,
SmallVectorImpl<Value *> &PointerOps) {
// Check that a vectorized load would load the same memory as a scalar
@@ -3513,18 +3733,8 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
Order.clear();
// Check the order of pointer operands or that all pointers are the same.
bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order);
- if (IsSorted || all_of(PointerOps, [&PointerOps](Value *P) {
- if (getUnderlyingObject(P) != getUnderlyingObject(PointerOps.front()))
- return false;
- auto *GEP = dyn_cast<GetElementPtrInst>(P);
- if (!GEP)
- return false;
- auto *GEP0 = cast<GetElementPtrInst>(PointerOps.front());
- return GEP->getNumOperands() == 2 &&
- ((isConstant(GEP->getOperand(1)) &&
- isConstant(GEP0->getOperand(1))) ||
- getSameOpcode({GEP->getOperand(1), GEP0->getOperand(1)})
- .getOpcode());
+ if (IsSorted || all_of(PointerOps, [&](Value *P) {
+ return arePointersCompatible(P, PointerOps.front(), TLI);
})) {
if (IsSorted) {
Value *Ptr0;
@@ -3536,7 +3746,7 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
Ptr0 = PointerOps[Order.front()];
PtrN = PointerOps[Order.back()];
}
- Optional<int> Diff =
+ std::optional<int> Diff =
getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
// Check that the sorted loads are consecutive.
if (static_cast<unsigned>(*Diff) == VL.size() - 1)
@@ -3584,7 +3794,7 @@ bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
unsigned Cnt = 1;
for (Value *Ptr : VL.drop_front()) {
bool Found = any_of(Bases, [&](auto &Base) {
- Optional<int> Diff =
+ std::optional<int> Diff =
getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
/*StrictCheck=*/true);
if (!Diff)
@@ -3636,7 +3846,7 @@ bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
return true;
}
-Optional<BoUpSLP::OrdersType>
+std::optional<BoUpSLP::OrdersType>
BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
Type *ScalarTy = TE.Scalars[0]->getType();
@@ -3646,27 +3856,176 @@ BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
for (Value *V : TE.Scalars) {
auto *L = dyn_cast<LoadInst>(V);
if (!L || !L->isSimple())
- return None;
+ return std::nullopt;
Ptrs.push_back(L->getPointerOperand());
}
BoUpSLP::OrdersType Order;
if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
return Order;
- return None;
+ return std::nullopt;
+}
+
+/// Check if two insertelement instructions are from the same buildvector.
+static bool areTwoInsertFromSameBuildVector(
+ InsertElementInst *VU, InsertElementInst *V,
+ function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
+ // Instructions must be from the same basic blocks.
+ if (VU->getParent() != V->getParent())
+ return false;
+ // Checks if 2 insertelements are from the same buildvector.
+ if (VU->getType() != V->getType())
+ return false;
+ // Multiple used inserts are separate nodes.
+ if (!VU->hasOneUse() && !V->hasOneUse())
+ return false;
+ auto *IE1 = VU;
+ auto *IE2 = V;
+ std::optional<unsigned> Idx1 = getInsertIndex(IE1);
+ std::optional<unsigned> Idx2 = getInsertIndex(IE2);
+ if (Idx1 == std::nullopt || Idx2 == std::nullopt)
+ return false;
+ // Go through the vector operand of insertelement instructions trying to find
+ // either VU as the original vector for IE2 or V as the original vector for
+ // IE1.
+ do {
+ if (IE2 == VU)
+ return VU->hasOneUse();
+ if (IE1 == V)
+ return V->hasOneUse();
+ if (IE1) {
+ if ((IE1 != VU && !IE1->hasOneUse()) ||
+ getInsertIndex(IE1).value_or(*Idx2) == *Idx2)
+ IE1 = nullptr;
+ else
+ IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
+ }
+ if (IE2) {
+ if ((IE2 != V && !IE2->hasOneUse()) ||
+ getInsertIndex(IE2).value_or(*Idx1) == *Idx1)
+ IE2 = nullptr;
+ else
+ IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
+ }
+ } while (IE1 || IE2);
+ return false;
}
-Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE,
+std::optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE,
bool TopToBottom) {
// No need to reorder if need to shuffle reuses, still need to shuffle the
// node.
- if (!TE.ReuseShuffleIndices.empty())
- return None;
+ if (!TE.ReuseShuffleIndices.empty()) {
+ // Check if reuse shuffle indices can be improved by reordering.
+ // For this, check that reuse mask is "clustered", i.e. each scalar values
+ // is used once in each submask of size <number_of_scalars>.
+ // Example: 4 scalar values.
+ // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
+ // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
+ // element 3 is used twice in the second submask.
+ unsigned Sz = TE.Scalars.size();
+ if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
+ Sz))
+ return std::nullopt;
+ unsigned VF = TE.getVectorFactor();
+ // Try build correct order for extractelement instructions.
+ SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
+ TE.ReuseShuffleIndices.end());
+ if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
+ all_of(TE.Scalars, [Sz](Value *V) {
+ std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
+ return Idx && *Idx < Sz;
+ })) {
+ SmallVector<int> ReorderMask(Sz, UndefMaskElem);
+ if (TE.ReorderIndices.empty())
+ std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
+ else
+ inversePermutation(TE.ReorderIndices, ReorderMask);
+ for (unsigned I = 0; I < VF; ++I) {
+ int &Idx = ReusedMask[I];
+ if (Idx == UndefMaskElem)
+ continue;
+ Value *V = TE.Scalars[ReorderMask[Idx]];
+ std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
+ Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
+ }
+ }
+ // Build the order of the VF size, need to reorder reuses shuffles, they are
+ // always of VF size.
+ OrdersType ResOrder(VF);
+ std::iota(ResOrder.begin(), ResOrder.end(), 0);
+ auto *It = ResOrder.begin();
+ for (unsigned K = 0; K < VF; K += Sz) {
+ OrdersType CurrentOrder(TE.ReorderIndices);
+ SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
+ if (SubMask.front() == UndefMaskElem)
+ std::iota(SubMask.begin(), SubMask.end(), 0);
+ reorderOrder(CurrentOrder, SubMask);
+ transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
+ std::advance(It, Sz);
+ }
+ if (all_of(enumerate(ResOrder),
+ [](const auto &Data) { return Data.index() == Data.value(); }))
+ return {}; // Use identity order.
+ return ResOrder;
+ }
if (TE.State == TreeEntry::Vectorize &&
(isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
(TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
!TE.isAltShuffle())
return TE.ReorderIndices;
+ if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
+ auto PHICompare = [](llvm::Value *V1, llvm::Value *V2) {
+ if (!V1->hasOneUse() || !V2->hasOneUse())
+ return false;
+ auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
+ auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
+ if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
+ if (auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
+ if (!areTwoInsertFromSameBuildVector(
+ IE1, IE2,
+ [](InsertElementInst *II) { return II->getOperand(0); }))
+ return false;
+ std::optional<unsigned> Idx1 = getInsertIndex(IE1);
+ std::optional<unsigned> Idx2 = getInsertIndex(IE2);
+ if (Idx1 == std::nullopt || Idx2 == std::nullopt)
+ return false;
+ return *Idx1 < *Idx2;
+ }
+ if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
+ if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
+ if (EE1->getOperand(0) != EE2->getOperand(0))
+ return false;
+ std::optional<unsigned> Idx1 = getExtractIndex(EE1);
+ std::optional<unsigned> Idx2 = getExtractIndex(EE2);
+ if (Idx1 == std::nullopt || Idx2 == std::nullopt)
+ return false;
+ return *Idx1 < *Idx2;
+ }
+ return false;
+ };
+ auto IsIdentityOrder = [](const OrdersType &Order) {
+ for (unsigned Idx : seq<unsigned>(0, Order.size()))
+ if (Idx != Order[Idx])
+ return false;
+ return true;
+ };
+ if (!TE.ReorderIndices.empty())
+ return TE.ReorderIndices;
+ DenseMap<Value *, unsigned> PhiToId;
+ SmallVector<Value *, 4> Phis;
+ OrdersType ResOrder(TE.Scalars.size());
+ for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id) {
+ PhiToId[TE.Scalars[Id]] = Id;
+ Phis.push_back(TE.Scalars[Id]);
+ }
+ llvm::stable_sort(Phis, PHICompare);
+ for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
+ ResOrder[Id] = PhiToId[Phis[Id]];
+ if (IsIdentityOrder(ResOrder))
+ return {};
+ return ResOrder;
+ }
if (TE.State == TreeEntry::NeedToGather) {
// TODO: add analysis of other gather nodes with extractelement
// instructions and other values/instructions, not only undefs.
@@ -3694,13 +4053,55 @@ Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE,
return CurrentOrder;
}
}
- if (Optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
+ if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
return CurrentOrder;
if (TE.Scalars.size() >= 4)
- if (Optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
+ if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
return Order;
}
- return None;
+ return std::nullopt;
+}
+
+/// Checks if the given mask is a "clustered" mask with the same clusters of
+/// size \p Sz, which are not identity submasks.
+static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
+ unsigned Sz) {
+ ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
+ if (ShuffleVectorInst::isIdentityMask(FirstCluster))
+ return false;
+ for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
+ ArrayRef<int> Cluster = Mask.slice(I, Sz);
+ if (Cluster != FirstCluster)
+ return false;
+ }
+ return true;
+}
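For illustration only (not part of the patch): a mask such as {1,0,3,2, 1,0,3,2} with Sz = 4 repeats the same non-identity cluster and so matches, while {0,1,2,3, 0,1,2,3} (identity cluster) and {1,0,3,2, 3,2,1,0} (differing clusters) do not. A standalone sketch of the same check on standard containers, ignoring undef mask elements:

// Sketch only: the repeated non-identity cluster check.
#include <algorithm>
#include <cassert>
#include <vector>

static bool isRepeatedNonIdentityClustered(const std::vector<int> &Mask,
                                           unsigned Sz) {
  std::vector<int> First(Mask.begin(), Mask.begin() + Sz);
  bool Identity = true;
  for (unsigned I = 0; I < Sz; ++I)
    Identity &= (First[I] == int(I));
  if (Identity)
    return false; // identity submask, nothing to gain
  for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz)
    if (!std::equal(First.begin(), First.end(), Mask.begin() + I))
      return false; // clusters differ
  return true;
}

int main() {
  assert(isRepeatedNonIdentityClustered({1, 0, 3, 2, 1, 0, 3, 2}, 4));
  assert(!isRepeatedNonIdentityClustered({0, 1, 2, 3, 0, 1, 2, 3}, 4));
  assert(!isRepeatedNonIdentityClustered({1, 0, 3, 2, 3, 2, 1, 0}, 4));
  return 0;
}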
+
+void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
+ // Reorder reuses mask.
+ reorderReuses(TE.ReuseShuffleIndices, Mask);
+ const unsigned Sz = TE.Scalars.size();
+ // For vectorized and non-clustered reused no need to do anything else.
+ if (TE.State != TreeEntry::NeedToGather ||
+ !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
+ Sz) ||
+ !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
+ return;
+ SmallVector<int> NewMask;
+ inversePermutation(TE.ReorderIndices, NewMask);
+ addMask(NewMask, TE.ReuseShuffleIndices);
+ // Clear reorder since it is going to be applied to the new mask.
+ TE.ReorderIndices.clear();
+ // Try to improve gathered nodes with clustered reuses, if possible.
+ ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
+ SmallVector<unsigned> NewOrder(Slice.begin(), Slice.end());
+ inversePermutation(NewOrder, NewMask);
+ reorderScalars(TE.Scalars, NewMask);
+ // Fill the reuses mask with the identity submasks.
+ for (auto *It = TE.ReuseShuffleIndices.begin(),
+ *End = TE.ReuseShuffleIndices.end();
+ It != End; std::advance(It, Sz))
+ std::iota(It, std::next(It, Sz), 0);
}
void BoUpSLP::reorderTopToBottom() {
@@ -3710,6 +4111,9 @@ void BoUpSLP::reorderTopToBottom() {
// their ordering.
DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
+  // PHI nodes can have a preferred ordering based on their result users.
+ DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
+
// AltShuffles can also have a preferred ordering that leads to fewer
// instructions, e.g., the addsub instruction in x86.
DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
@@ -3724,13 +4128,13 @@ void BoUpSLP::reorderTopToBottom() {
// extracts.
for_each(VectorizableTree, [this, &TTIRef, &VFToOrderedEntries,
&GathersToOrders, &ExternalUserReorderMap,
- &AltShufflesToOrders](
+ &AltShufflesToOrders, &PhisToOrders](
const std::unique_ptr<TreeEntry> &TE) {
// Look for external users that will probably be vectorized.
SmallVector<OrdersType, 1> ExternalUserReorderIndices =
findExternalStoreUsersReorderIndices(TE.get());
if (!ExternalUserReorderIndices.empty()) {
- VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
+ VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
ExternalUserReorderMap.try_emplace(TE.get(),
std::move(ExternalUserReorderIndices));
}
@@ -3750,13 +4154,13 @@ void BoUpSLP::reorderTopToBottom() {
OpcodeMask.set(Lane);
// If this pattern is supported by the target then we consider the order.
if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
- VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
+ VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
}
// TODO: Check the reverse order too.
}
- if (Optional<OrdersType> CurrentOrder =
+ if (std::optional<OrdersType> CurrentOrder =
getReorderingData(*TE, /*TopToBottom=*/true)) {
// Do not include ordering for nodes used in the alt opcode vectorization,
// better to reorder them during bottom-to-top stage. If follow the order
@@ -3778,14 +4182,17 @@ void BoUpSLP::reorderTopToBottom() {
UserTE = UserTE->UserTreeIndices.back().UserTE;
++Cnt;
}
- VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
- if (TE->State != TreeEntry::Vectorize)
+ VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
+ if (TE->State != TreeEntry::Vectorize || !TE->ReuseShuffleIndices.empty())
GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
+ if (TE->State == TreeEntry::Vectorize &&
+ TE->getOpcode() == Instruction::PHI)
+ PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
}
});
// Reorder the graph nodes according to their vectorization factor.
- for (unsigned VF = VectorizableTree.front()->Scalars.size(); VF > 1;
+ for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
VF /= 2) {
auto It = VFToOrderedEntries.find(VF);
if (It == VFToOrderedEntries.end())
@@ -3803,12 +4210,13 @@ void BoUpSLP::reorderTopToBottom() {
for (const TreeEntry *OpTE : OrderedEntries) {
      // No need to reorder these nodes; still need to extend and to use a
      // shuffle, just need to merge the reordering shuffle and the reuse
      // shuffle.
- if (!OpTE->ReuseShuffleIndices.empty())
+ if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
continue;
      // Count the number of uses of each order.
- const auto &Order = [OpTE, &GathersToOrders,
- &AltShufflesToOrders]() -> const OrdersType & {
- if (OpTE->State == TreeEntry::NeedToGather) {
+ const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
+ &PhisToOrders]() -> const OrdersType & {
+ if (OpTE->State == TreeEntry::NeedToGather ||
+ !OpTE->ReuseShuffleIndices.empty()) {
auto It = GathersToOrders.find(OpTE);
if (It != GathersToOrders.end())
return It->second;
@@ -3818,14 +4226,28 @@ void BoUpSLP::reorderTopToBottom() {
if (It != AltShufflesToOrders.end())
return It->second;
}
+ if (OpTE->State == TreeEntry::Vectorize &&
+ OpTE->getOpcode() == Instruction::PHI) {
+ auto It = PhisToOrders.find(OpTE);
+ if (It != PhisToOrders.end())
+ return It->second;
+ }
return OpTE->ReorderIndices;
}();
// First consider the order of the external scalar users.
auto It = ExternalUserReorderMap.find(OpTE);
if (It != ExternalUserReorderMap.end()) {
const auto &ExternalUserReorderIndices = It->second;
- for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
- ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
+        // If the OpTE vector factor != number of scalars, use the natural
+        // order; it is an attempt to reorder a node with reused scalars but
+        // with external uses.
+ if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
+ OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
+ ExternalUserReorderIndices.size();
+ } else {
+ for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
+ ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
+ }
// No other useful reorder data in this entry.
if (Order.empty())
continue;
@@ -3885,7 +4307,7 @@ void BoUpSLP::reorderTopToBottom() {
"All users must be of VF size.");
// Update ordering of the operands with the smaller VF than the given
// one.
- reorderReuses(TE->ReuseShuffleIndices, Mask);
+ reorderNodeWithReuses(*TE, Mask);
}
continue;
}
@@ -3982,10 +4404,10 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
const std::unique_ptr<TreeEntry> &TE) {
if (TE->State != TreeEntry::Vectorize)
NonVectorized.push_back(TE.get());
- if (Optional<OrdersType> CurrentOrder =
+ if (std::optional<OrdersType> CurrentOrder =
getReorderingData(*TE, /*TopToBottom=*/false)) {
OrderedEntries.insert(TE.get());
- if (TE->State != TreeEntry::Vectorize)
+ if (TE->State != TreeEntry::Vectorize || !TE->ReuseShuffleIndices.empty())
GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
}
});
@@ -4057,10 +4479,11 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
TreeEntry *OpTE = Op.second;
if (!VisitedOps.insert(OpTE).second)
continue;
- if (!OpTE->ReuseShuffleIndices.empty())
+ if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
continue;
const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & {
- if (OpTE->State == TreeEntry::NeedToGather)
+ if (OpTE->State == TreeEntry::NeedToGather ||
+ !OpTE->ReuseShuffleIndices.empty())
return GathersToOrders.find(OpTE)->second;
return OpTE->ReorderIndices;
}();
@@ -4166,8 +4589,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
if (!VisitedOps.insert(TE).second)
continue;
if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
- // Just reorder reuses indices.
- reorderReuses(TE->ReuseShuffleIndices, Mask);
+ reorderNodeWithReuses(*TE, Mask);
continue;
}
// Gathers are processed separately.
@@ -4322,7 +4744,7 @@ BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
return PtrToStoresMap;
}
-bool BoUpSLP::CanFormVector(const SmallVector<StoreInst *, 4> &StoresVec,
+bool BoUpSLP::canFormVector(const SmallVector<StoreInst *, 4> &StoresVec,
OrdersType &ReorderIndices) const {
// We check whether the stores in StoreVec can form a vector by sorting them
// and checking whether they are consecutive.
@@ -4336,7 +4758,7 @@ bool BoUpSLP::CanFormVector(const SmallVector<StoreInst *, 4> &StoresVec,
Value *S0Ptr = S0->getPointerOperand();
for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
StoreInst *SI = StoresVec[Idx];
- Optional<int> Diff =
+ std::optional<int> Diff =
getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
SI->getPointerOperand(), *DL, *SE,
/*StrictCheck=*/true);
@@ -4416,7 +4838,7 @@ BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
// If the stores are not consecutive then abandon this StoresVec.
OrdersType ReorderIndices;
- if (!CanFormVector(StoresVec, ReorderIndices))
+ if (!canFormVector(StoresVec, ReorderIndices))
continue;
// We now know that the scalars in StoresVec can form a vector instruction,
@@ -4472,24 +4894,24 @@ static std::pair<size_t, size_t> generateKeySubkey(
hash_code SubKey = hash_value(0);
// Sort the loads by the distance between the pointers.
if (auto *LI = dyn_cast<LoadInst>(V)) {
- Key = hash_combine(hash_value(Instruction::Load), Key);
+ Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
if (LI->isSimple())
SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
else
- SubKey = hash_value(LI);
+ Key = SubKey = hash_value(LI);
} else if (isVectorLikeInstWithConstOps(V)) {
// Sort extracts by the vector operands.
if (isa<ExtractElementInst, UndefValue>(V))
Key = hash_value(Value::UndefValueVal + 1);
if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
- if (!isUndefVector(EI->getVectorOperand()) &&
+ if (!isUndefVector(EI->getVectorOperand()).all() &&
!isa<UndefValue>(EI->getIndexOperand()))
SubKey = hash_value(EI->getVectorOperand());
}
} else if (auto *I = dyn_cast<Instruction>(V)) {
// Sort other instructions just by the opcodes except for CMPInst.
// For CMP also sort by the predicate kind.
- if ((isa<BinaryOperator>(I) || isa<CastInst>(I)) &&
+ if ((isa<BinaryOperator, CastInst>(I)) &&
isValidForAlternation(I->getOpcode())) {
if (AllowAlternate)
Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
@@ -4504,7 +4926,7 @@ static std::pair<size_t, size_t> generateKeySubkey(
if (isa<CastInst>(I)) {
std::pair<size_t, size_t> OpVals =
generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
- /*=AllowAlternate*/ true);
+ /*AllowAlternate=*/true);
Key = hash_combine(OpVals.first, Key);
SubKey = hash_combine(OpVals.first, SubKey);
}
@@ -4547,6 +4969,13 @@ static std::pair<size_t, size_t> generateKeySubkey(
return std::make_pair(Key, SubKey);
}
+/// Checks if the specified instruction \p I is an alternate operation for
+/// the given \p MainOp and \p AltOp instructions.
+static bool isAlternateInstruction(const Instruction *I,
+ const Instruction *MainOp,
+ const Instruction *AltOp,
+ const TargetLibraryInfo &TLI);
+
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
const EdgeInfo &UserTreeIdx) {
assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
@@ -4557,7 +4986,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
&UserTreeIdx,
this](const InstructionsState &S) {
// Check that every instruction appears once in this bundle.
- DenseMap<Value *, unsigned> UniquePositions;
+ DenseMap<Value *, unsigned> UniquePositions(VL.size());
for (Value *V : VL) {
if (isConstant(V)) {
ReuseShuffleIndicies.emplace_back(
@@ -4583,7 +5012,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
})) ||
!llvm::isPowerOf2_32(NumUniqueScalarValues)) {
LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
return false;
}
VL = UniqueValues;
@@ -4591,7 +5020,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return true;
};
- InstructionsState S = getSameOpcode(VL);
+ InstructionsState S = getSameOpcode(VL, *TLI);
// Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
// a load), in which case peek through to include it in the tree, without
@@ -4607,7 +5036,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
})))) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
if (TryToFindDuplicates(S))
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
return;
}
@@ -4618,7 +5047,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
if (TryToFindDuplicates(S))
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
return;
}
@@ -4627,14 +5056,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (S.OpValue->getType()->isVectorTy() &&
!isa<InsertElementInst>(S.OpValue)) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
return;
}
if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
if (SI->getValueOperand()->getType()->isVectorTy()) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
return;
}
@@ -4715,14 +5144,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
BB &&
sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
SortedIndices));
- if (allConstant(VL) || isSplat(VL) || !AreAllSameInsts ||
+ if (!AreAllSameInsts || allConstant(VL) || isSplat(VL) ||
(isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
S.OpValue) &&
!all_of(VL, isVectorLikeInstWithConstOps)) ||
NotProfitableForVectorization(VL)) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
if (TryToFindDuplicates(S))
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
return;
}
@@ -4736,7 +5165,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (EphValues.count(V)) {
LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
<< ") is ephemeral.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
return;
}
}
@@ -4748,7 +5177,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (!E->isSame(VL)) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
if (TryToFindDuplicates(S))
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
return;
}
@@ -4768,7 +5197,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
<< ") is already in tree.\n");
if (TryToFindDuplicates(S))
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
return;
}
@@ -4780,7 +5209,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (UserIgnoreList && UserIgnoreList->contains(V)) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
if (TryToFindDuplicates(S))
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
return;
}
@@ -4789,9 +5218,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Special processing for sorted pointers for ScatterVectorize node with
  // constant indices only.
- if (AreAllSameInsts && !(S.getOpcode() && allSameBlock(VL)) &&
- UserTreeIdx.UserTE &&
- UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize) {
+ if (AreAllSameInsts && UserTreeIdx.UserTE &&
+ UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize &&
+ !(S.getOpcode() && allSameBlock(VL))) {
assert(S.OpValue->getType()->isPointerTy() &&
count_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); }) >=
2 &&
@@ -4799,7 +5228,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Reset S to make it GetElementPtr kind of node.
const auto *It = find_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); });
assert(It != VL.end() && "Expected at least one GEP.");
- S = getSameOpcode(*It);
+ S = getSameOpcode(*It, *TLI);
}
// Check that all of the users of the scalars that we want to vectorize are
@@ -4811,7 +5240,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Don't go into unreachable blocks. They may contain instructions with
// dependency cycles which confuse the final scheduling.
LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
return;
}
@@ -4820,7 +5249,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// place to insert a shuffle if we need to, so just avoid that issue.
if (isa<CatchSwitchInst>(BB->getTerminator())) {
LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
return;
}
@@ -4834,7 +5263,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
BlockScheduling &BS = *BSRef;
- Optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S);
+ std::optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S);
#ifdef EXPENSIVE_CHECKS
// Make sure we didn't break any internal invariants
BS.verify();
@@ -4844,7 +5273,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
assert((!BS.getScheduleData(VL0) ||
!BS.getScheduleData(VL0)->isPartOfBundle()) &&
"tryScheduleBundle should cancelScheduling on failure");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
return;
}
@@ -4864,7 +5293,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
LLVM_DEBUG(dbgs()
<< "SLP: Need to swizzle PHINodes (terminator use).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
return;
}
@@ -4931,7 +5360,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return;
}
LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
BS.cancelScheduling(VL, VL0);
return;
@@ -4944,7 +5373,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
ValueSet SourceVectors;
for (Value *V : VL) {
SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
- assert(getInsertIndex(V) != None && "Non-constant or undef index?");
+ assert(getInsertIndex(V) != std::nullopt &&
+ "Non-constant or undef index?");
}
if (count_if(VL, [&SourceVectors](Value *V) {
@@ -4953,7 +5383,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Found 2nd source vector - cancel.
LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
"different source vectors.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
BS.cancelScheduling(VL, VL0);
return;
}
@@ -4979,7 +5409,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (IsIdentity)
CurrentOrder.clear();
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- None, CurrentOrder);
+ std::nullopt, CurrentOrder);
LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
constexpr int NumOps = 2;
@@ -5003,8 +5433,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
SmallVector<Value *> PointerOps;
OrdersType CurrentOrder;
TreeEntry *TE = nullptr;
- switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, *LI, CurrentOrder,
- PointerOps)) {
+ switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, *LI, *TLI,
+ CurrentOrder, PointerOps)) {
case LoadsState::Vectorize:
if (CurrentOrder.empty()) {
        // Original loads are consecutive and do not require reordering.
@@ -5030,7 +5460,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
break;
case LoadsState::Gather:
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
#ifndef NDEBUG
Type *ScalarTy = VL0->getType();
@@ -5065,7 +5495,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
if (Ty != SrcTy || !isValidElementType(Ty)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs()
<< "SLP: Gathering casts with different src types.\n");
@@ -5098,7 +5528,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
Cmp->getOperand(0)->getType() != ComparedTy) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs()
<< "SLP: Gathering cmp with different predicate.\n");
@@ -5115,7 +5545,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Commutative predicate - collect + sort operands of the instructions
// so that each side is more likely to have the same opcode.
assert(P0 == SwapP0 && "Commutative Predicate mismatch");
- reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
+ reorderInputsAccordingToOpcode(VL, Left, Right, *TLI, *DL, *SE, *this);
} else {
// Collect operands - commute if it uses the swapped predicate.
for (Value *V : VL) {
@@ -5162,7 +5592,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// have the same opcode.
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
ValueList Left, Right;
- reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
+ reorderInputsAccordingToOpcode(VL, Left, Right, *TLI, *DL, *SE, *this);
TE->setOperand(0, Left);
TE->setOperand(1, Right);
buildTree_rec(Left, Depth + 1, {TE, 0});
@@ -5190,7 +5620,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (I->getNumOperands() != 2) {
LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
return;
}
@@ -5208,7 +5638,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
LLVM_DEBUG(dbgs()
<< "SLP: not-vectorizable GEP (different types).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
return;
}
@@ -5230,7 +5660,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
LLVM_DEBUG(dbgs()
<< "SLP: not-vectorizable GEP (non-constant indexes).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
return;
}
@@ -5298,7 +5728,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (DL->getTypeSizeInBits(ScalarTy) !=
DL->getTypeAllocSizeInBits(ScalarTy)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
return;
@@ -5313,7 +5743,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
auto *SI = cast<StoreInst>(V);
if (!SI->isSimple()) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
return;
@@ -5336,7 +5766,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Ptr0 = PointerOps[CurrentOrder.front()];
PtrN = PointerOps[CurrentOrder.back()];
}
- Optional<int> Dist =
+ std::optional<int> Dist =
getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
// Check that the sorted pointer operands are consecutive.
if (static_cast<unsigned>(*Dist) == VL.size() - 1) {
@@ -5361,7 +5791,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
return;
@@ -5379,7 +5809,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (!VecFunc && !isTriviallyVectorizable(ID)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
return;
@@ -5398,7 +5828,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
!CI->hasIdenticalOperandBundleSchema(*CI2)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
<< "\n");
@@ -5411,7 +5841,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Value *A1J = CI2->getArgOperand(j);
if (ScalarArgs[j] != A1J) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
<< " argument " << ScalarArgs[j] << "!=" << A1J
@@ -5426,7 +5856,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
CI->op_begin() + CI->getBundleOperandsEndIndex(),
CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:"
<< *CI << "!=" << *V << '\n');
@@ -5457,7 +5887,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// then do not vectorize this instruction.
if (!S.isAltShuffle()) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
return;
@@ -5473,31 +5903,28 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (!CI || all_of(VL, [](Value *V) {
return cast<CmpInst>(V)->isCommutative();
})) {
- reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
+ reorderInputsAccordingToOpcode(VL, Left, Right, *TLI, *DL, *SE,
+ *this);
} else {
- CmpInst::Predicate P0 = CI->getPredicate();
- CmpInst::Predicate AltP0 = cast<CmpInst>(S.AltOp)->getPredicate();
- assert(P0 != AltP0 &&
+ auto *MainCI = cast<CmpInst>(S.MainOp);
+ auto *AltCI = cast<CmpInst>(S.AltOp);
+ CmpInst::Predicate MainP = MainCI->getPredicate();
+ CmpInst::Predicate AltP = AltCI->getPredicate();
+ assert(MainP != AltP &&
"Expected different main/alternate predicates.");
- CmpInst::Predicate AltP0Swapped = CmpInst::getSwappedPredicate(AltP0);
- Value *BaseOp0 = VL0->getOperand(0);
- Value *BaseOp1 = VL0->getOperand(1);
// Collect operands - commute if it uses the swapped predicate or
// alternate operation.
for (Value *V : VL) {
auto *Cmp = cast<CmpInst>(V);
Value *LHS = Cmp->getOperand(0);
Value *RHS = Cmp->getOperand(1);
- CmpInst::Predicate CurrentPred = Cmp->getPredicate();
- if (P0 == AltP0Swapped) {
- if (CI != Cmp && S.AltOp != Cmp &&
- ((P0 == CurrentPred &&
- !areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)) ||
- (AltP0 == CurrentPred &&
- areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS))))
+
+ if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
+ if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
+ std::swap(LHS, RHS);
+ } else {
+ if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
std::swap(LHS, RHS);
- } else if (P0 != CurrentPred && AltP0 != CurrentPred) {
- std::swap(LHS, RHS);
}
Left.push_back(LHS);
Right.push_back(RHS);
@@ -5523,7 +5950,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
default:
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
return;
@@ -5534,8 +5961,7 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
unsigned N = 1;
Type *EltTy = T;
- while (isa<StructType>(EltTy) || isa<ArrayType>(EltTy) ||
- isa<VectorType>(EltTy)) {
+ while (isa<StructType, ArrayType, VectorType>(EltTy)) {
if (auto *ST = dyn_cast<StructType>(EltTy)) {
// Check that struct is homogeneous.
for (const auto *Ty : ST->elements())
@@ -5617,7 +6043,7 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
if (isa<UndefValue>(EE->getIndexOperand()))
continue;
- Optional<unsigned> Idx = getExtractIndex(Inst);
+ std::optional<unsigned> Idx = getExtractIndex(Inst);
if (!Idx)
break;
const unsigned ExtIdx = *Idx;
@@ -5785,32 +6211,388 @@ buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices,
}
}
-/// Checks if the specified instruction \p I is an alternate operation for the
-/// given \p MainOp and \p AltOp instructions.
static bool isAlternateInstruction(const Instruction *I,
const Instruction *MainOp,
- const Instruction *AltOp) {
- if (auto *CI0 = dyn_cast<CmpInst>(MainOp)) {
- auto *AltCI0 = cast<CmpInst>(AltOp);
+ const Instruction *AltOp,
+ const TargetLibraryInfo &TLI) {
+ if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
+ auto *AltCI = cast<CmpInst>(AltOp);
+ CmpInst::Predicate MainP = MainCI->getPredicate();
+ CmpInst::Predicate AltP = AltCI->getPredicate();
+ assert(MainP != AltP && "Expected different main/alternate predicates.");
auto *CI = cast<CmpInst>(I);
- CmpInst::Predicate P0 = CI0->getPredicate();
- CmpInst::Predicate AltP0 = AltCI0->getPredicate();
- assert(P0 != AltP0 && "Expected different main/alternate predicates.");
- CmpInst::Predicate AltP0Swapped = CmpInst::getSwappedPredicate(AltP0);
- CmpInst::Predicate CurrentPred = CI->getPredicate();
- if (P0 == AltP0Swapped)
- return I == AltCI0 ||
- (I != MainOp &&
- !areCompatibleCmpOps(CI0->getOperand(0), CI0->getOperand(1),
- CI->getOperand(0), CI->getOperand(1)));
- return AltP0 == CurrentPred || AltP0Swapped == CurrentPred;
+ if (isCmpSameOrSwapped(MainCI, CI, TLI))
+ return false;
+ if (isCmpSameOrSwapped(AltCI, CI, TLI))
+ return true;
+ CmpInst::Predicate P = CI->getPredicate();
+ CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
+
+ assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
+ "CmpInst expected to match either main or alternate predicate or "
+ "their swap.");
+ (void)AltP;
+ return MainP != P && MainP != SwappedP;
}
return I->getOpcode() == AltOp->getOpcode();
}
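A standalone analog of the predicate-level classification used here. The real function additionally recognizes same-or-swapped compares and TLI-compatible calls, which this sketch omits; the enum and swap table below are purely illustrative:

#include <cassert>

enum Pred { LT, GT, LE, GE, EQ, NE };

// Swapping the operands of a compare swaps ordered predicates.
static Pred swapped(Pred P) {
  switch (P) {
  case LT: return GT;
  case GT: return LT;
  case LE: return GE;
  case GE: return LE;
  default: return P; // EQ/NE are symmetric.
  }
}

// A compare belongs to the alternate opcode if neither its predicate nor the
// swapped form matches the main predicate.
static bool isAlternateCmp(Pred P, Pred MainP, Pred AltP) {
  if (P == MainP || swapped(P) == MainP)
    return false;
  // In a well-formed bundle the remaining case must match the alternate
  // predicate or its swap.
  assert(P == AltP || swapped(P) == AltP);
  return true;
}

int main() {
  // Main predicate LT, alternate predicate GE.
  assert(!isAlternateCmp(LT, LT, GE)); // matches main
  assert(!isAlternateCmp(GT, LT, GE)); // swapped form of main
  assert(isAlternateCmp(GE, LT, GE));  // matches alternate
  assert(isAlternateCmp(LE, LT, GE));  // swapped form of alternate
  return 0;
}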
+TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> VL,
+ unsigned OpIdx) {
+ assert(!VL.empty());
+ const auto *I0 = cast<Instruction>(*find_if(VL, Instruction::classof));
+ const auto *Op0 = I0->getOperand(OpIdx);
+
+ const bool IsConstant = all_of(VL, [&](Value *V) {
+ // TODO: We should allow undef elements here
+ const auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return true;
+ auto *Op = I->getOperand(OpIdx);
+ return isConstant(Op) && !isa<UndefValue>(Op);
+ });
+ const bool IsUniform = all_of(VL, [&](Value *V) {
+ // TODO: We should allow undef elements here
+ const auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return false;
+ return I->getOperand(OpIdx) == Op0;
+ });
+ const bool IsPowerOfTwo = all_of(VL, [&](Value *V) {
+ // TODO: We should allow undef elements here
+ const auto *I = dyn_cast<Instruction>(V);
+ if (!I) {
+ assert((isa<UndefValue>(V) ||
+ I0->getOpcode() == Instruction::GetElementPtr) &&
+ "Expected undef or GEP.");
+ return true;
+ }
+ auto *Op = I->getOperand(OpIdx);
+ if (auto *CI = dyn_cast<ConstantInt>(Op))
+ return CI->getValue().isPowerOf2();
+ return false;
+ });
+ const bool IsNegatedPowerOfTwo = all_of(VL, [&](Value *V) {
+ // TODO: We should allow undef elements here
+ const auto *I = dyn_cast<Instruction>(V);
+ if (!I) {
+ assert((isa<UndefValue>(V) ||
+ I0->getOpcode() == Instruction::GetElementPtr) &&
+ "Expected undef or GEP.");
+ return true;
+ }
+ const auto *Op = I->getOperand(OpIdx);
+ if (auto *CI = dyn_cast<ConstantInt>(Op))
+ return CI->getValue().isNegatedPowerOf2();
+ return false;
+ });
+
+ TTI::OperandValueKind VK = TTI::OK_AnyValue;
+ if (IsConstant && IsUniform)
+ VK = TTI::OK_UniformConstantValue;
+ else if (IsConstant)
+ VK = TTI::OK_NonUniformConstantValue;
+ else if (IsUniform)
+ VK = TTI::OK_UniformValue;
+
+ TTI::OperandValueProperties VP = TTI::OP_None;
+ VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
+ VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
+
+ return {VK, VP};
+}
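A rough standalone analog of this operand classification over plain integers (the real code inspects LLVM Values per lane and also derives power-of-two operand properties, which are omitted here):

#include <cassert>
#include <vector>

enum OperandKind { AnyValue, UniformConstant, NonUniformConstant, UniformValue };

struct Operand {
  int Val;
  bool IsConstant;
};

// All-equal constants -> uniform constant; all constants -> non-uniform
// constant; all-equal values -> uniform; otherwise "any value".
static OperandKind classify(const std::vector<Operand> &Ops) {
  assert(!Ops.empty() && "Expected at least one operand.");
  bool AllConstant = true, AllSame = true;
  for (const Operand &Op : Ops) {
    AllConstant = AllConstant && Op.IsConstant;
    AllSame = AllSame && Op.Val == Ops.front().Val;
  }
  if (AllConstant && AllSame)
    return UniformConstant;
  if (AllConstant)
    return NonUniformConstant;
  if (AllSame)
    return UniformValue;
  return AnyValue;
}

int main() {
  assert(classify({{4, true}, {4, true}}) == UniformConstant);
  assert(classify({{4, true}, {8, true}}) == NonUniformConstant);
  assert(classify({{7, false}, {7, false}}) == UniformValue);
  assert(classify({{7, false}, {9, true}}) == AnyValue);
  return 0;
}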
+
+namespace {
+/// The base class for shuffle instruction emission and shuffle cost estimation.
+class BaseShuffleAnalysis {
+protected:
+ /// Checks if the mask is an identity mask.
+ /// \param IsStrict if is true the function returns false if mask size does
+ /// not match vector size.
+  /// \param IsStrict if true, the function returns false if the mask size
+  /// does not match the vector size.
+ int Limit = Mask.size();
+ int VF = VecTy->getNumElements();
+ return (VF == Limit || !IsStrict) &&
+ all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) &&
+ ShuffleVectorInst::isIdentityMask(Mask);
+ }
+
+  /// Tries to combine 2 different masks into a single one.
+ /// \param LocalVF Vector length of the permuted input vector. \p Mask may
+ /// change the size of the vector, \p LocalVF is the original size of the
+ /// shuffled vector.
+ static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
+ ArrayRef<int> ExtMask) {
+ unsigned VF = Mask.size();
+ SmallVector<int> NewMask(ExtMask.size(), UndefMaskElem);
+ for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
+ if (ExtMask[I] == UndefMaskElem)
+ continue;
+ int MaskedIdx = Mask[ExtMask[I] % VF];
+ NewMask[I] =
+ MaskedIdx == UndefMaskElem ? UndefMaskElem : MaskedIdx % LocalVF;
+ }
+ Mask.swap(NewMask);
+ }
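Mask composition as performed by combineMasks, sketched standalone with -1 as the undef element; the modular arithmetic that handles resizing shuffles is omitted:

#include <cassert>
#include <vector>

// Applying Outer to the result of Inner is the same as a single shuffle with
// Combined[I] = Inner[Outer[I]].
static std::vector<int> composeMasks(const std::vector<int> &Inner,
                                     const std::vector<int> &Outer) {
  std::vector<int> Combined(Outer.size(), -1);
  for (unsigned I = 0; I < Outer.size(); ++I) {
    if (Outer[I] == -1)
      continue;
    Combined[I] = Inner[Outer[I]];
  }
  return Combined;
}

int main() {
  // %t = shuffle %v, poison, <1, 0>        ; Inner
  // %r = shuffle %t, poison, <1, 1, 0, -1> ; Outer
  // is equivalent to a single shuffle %v, poison, <0, 0, 1, -1>.
  std::vector<int> Combined = composeMasks({1, 0}, {1, 1, 0, -1});
  assert((Combined == std::vector<int>{0, 0, 1, -1}));
  return 0;
}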
+
+ /// Looks through shuffles trying to reduce final number of shuffles in the
+ /// code. The function looks through the previously emitted shuffle
+  /// instructions and properly marks indices in the mask as undef.
+ /// For example, given the code
+ /// \code
+ /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
+ /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
+ /// \endcode
+  /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
+ /// look through %s1 and %s2 and select vectors %0 and %1 with mask
+ /// <0, 1, 2, 3> for the shuffle.
+ /// If 2 operands are of different size, the smallest one will be resized and
+ /// the mask recalculated properly.
+ /// For example, given the code
+ /// \code
+ /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
+ /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
+ /// \endcode
+  /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
+ /// look through %s1 and %s2 and select vectors %0 and %1 with mask
+ /// <0, 1, 2, 3> for the shuffle.
+ /// So, it tries to transform permutations to simple vector merge, if
+ /// possible.
+ /// \param V The input vector which must be shuffled using the given \p Mask.
+ /// If the better candidate is found, \p V is set to this best candidate
+ /// vector.
+ /// \param Mask The input mask for the shuffle. If the best candidate is found
+ /// during looking-through-shuffles attempt, it is updated accordingly.
+ /// \param SinglePermute true if the shuffle operation is originally a
+ /// single-value-permutation. In this case the look-through-shuffles procedure
+ /// may look for resizing shuffles as the best candidates.
+ /// \return true if the shuffle results in the non-resizing identity shuffle
+  /// (and thus can be ignored), false otherwise.
+ static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
+ bool SinglePermute) {
+ Value *Op = V;
+ ShuffleVectorInst *IdentityOp = nullptr;
+ SmallVector<int> IdentityMask;
+ while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
+ // Exit if not a fixed vector type or changing size shuffle.
+ auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
+ if (!SVTy)
+ break;
+ // Remember the identity or broadcast mask, if it is not a resizing
+ // shuffle. If no better candidates are found, this Op and Mask will be
+ // used in the final shuffle.
+ if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
+ if (!IdentityOp || !SinglePermute ||
+ (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
+ !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask))) {
+ IdentityOp = SV;
+          // Store the current mask in IdentityMask so we do not lose this
+          // info if IdentityOp is selected as the best candidate for the
+          // permutation.
+ IdentityMask.assign(Mask);
+ }
+ }
+ // Remember the broadcast mask. If no better candidates are found, this Op
+ // and Mask will be used in the final shuffle.
+ // Zero splat can be used as identity too, since it might be used with
+ // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
+      // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>,
+      // which is expensive, and the analysis finds that the source vector is
+      // just a broadcast, the original mask can be transformed to the
+      // identity mask <0, 1, 2, 3>.
+ // \code
+      // %0 = shuffle %v, poison, zeroinitializer
+ // %res = shuffle %0, poison, <3, 1, 2, 0>
+ // \endcode
+ // may be transformed to
+ // \code
+      // %0 = shuffle %v, poison, zeroinitializer
+ // %res = shuffle %0, poison, <0, 1, 2, 3>
+ // \endcode
+ if (SV->isZeroEltSplat()) {
+ IdentityOp = SV;
+ IdentityMask.assign(Mask);
+ }
+ int LocalVF = Mask.size();
+ if (auto *SVOpTy =
+ dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
+ LocalVF = SVOpTy->getNumElements();
+ SmallVector<int> ExtMask(Mask.size(), UndefMaskElem);
+ for (auto [Idx, I] : enumerate(Mask)) {
+ if (I == UndefMaskElem)
+ continue;
+ ExtMask[Idx] = SV->getMaskValue(I);
+ }
+ bool IsOp1Undef =
+ isUndefVector(SV->getOperand(0),
+ buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
+ .all();
+ bool IsOp2Undef =
+ isUndefVector(SV->getOperand(1),
+ buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
+ .all();
+ if (!IsOp1Undef && !IsOp2Undef) {
+ // Update mask and mark undef elems.
+ for (int &I : Mask) {
+ if (I == UndefMaskElem)
+ continue;
+ if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
+ UndefMaskElem)
+ I = UndefMaskElem;
+ }
+ break;
+ }
+ SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(),
+ SV->getShuffleMask().end());
+ combineMasks(LocalVF, ShuffleMask, Mask);
+ Mask.swap(ShuffleMask);
+ if (IsOp2Undef)
+ Op = SV->getOperand(0);
+ else
+ Op = SV->getOperand(1);
+ }
+ if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
+ !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute)) {
+ if (IdentityOp) {
+ V = IdentityOp;
+ assert(Mask.size() == IdentityMask.size() &&
+ "Expected masks of same sizes.");
+ // Clear known poison elements.
+ for (auto [I, Idx] : enumerate(Mask))
+ if (Idx == UndefMaskElem)
+ IdentityMask[I] = UndefMaskElem;
+ Mask.swap(IdentityMask);
+ auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
+ return SinglePermute &&
+ (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
+ /*IsStrict=*/true) ||
+ (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
+ Shuffle->isZeroEltSplat() &&
+ ShuffleVectorInst::isZeroEltSplatMask(Mask)));
+ }
+ V = Op;
+ return false;
+ }
+ V = Op;
+ return true;
+ }
+
+ /// Smart shuffle instruction emission, walks through shuffles trees and
+ /// tries to find the best matching vector for the actual shuffle
+ /// instruction.
+ template <typename ShuffleBuilderTy>
+ static Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
+ ShuffleBuilderTy &Builder) {
+ assert(V1 && "Expected at least one vector value.");
+ int VF = Mask.size();
+ if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
+ VF = FTy->getNumElements();
+ if (V2 &&
+ !isUndefVector(V2, buildUseMask(VF, Mask, UseMask::SecondArg)).all()) {
+ // Peek through shuffles.
+ Value *Op1 = V1;
+ Value *Op2 = V2;
+ int VF =
+ cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
+ SmallVector<int> CombinedMask1(Mask.size(), UndefMaskElem);
+ SmallVector<int> CombinedMask2(Mask.size(), UndefMaskElem);
+ for (int I = 0, E = Mask.size(); I < E; ++I) {
+ if (Mask[I] < VF)
+ CombinedMask1[I] = Mask[I];
+ else
+ CombinedMask2[I] = Mask[I] - VF;
+ }
+ Value *PrevOp1;
+ Value *PrevOp2;
+ do {
+ PrevOp1 = Op1;
+ PrevOp2 = Op2;
+ (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
+ (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
+ // Check if we have 2 resizing shuffles - need to peek through operands
+ // again.
+ if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
+ if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
+ SmallVector<int> ExtMask1(Mask.size(), UndefMaskElem);
+ for (auto [Idx, I] : enumerate(CombinedMask1)) {
+ if (I == UndefMaskElem)
+ continue;
+ ExtMask1[Idx] = SV1->getMaskValue(I);
+ }
+ SmallBitVector UseMask1 = buildUseMask(
+ cast<FixedVectorType>(SV1->getOperand(1)->getType())
+ ->getNumElements(),
+ ExtMask1, UseMask::SecondArg);
+ SmallVector<int> ExtMask2(CombinedMask2.size(), UndefMaskElem);
+ for (auto [Idx, I] : enumerate(CombinedMask2)) {
+ if (I == UndefMaskElem)
+ continue;
+ ExtMask2[Idx] = SV2->getMaskValue(I);
+ }
+ SmallBitVector UseMask2 = buildUseMask(
+ cast<FixedVectorType>(SV2->getOperand(1)->getType())
+ ->getNumElements(),
+ ExtMask2, UseMask::SecondArg);
+ if (SV1->getOperand(0)->getType() ==
+ SV2->getOperand(0)->getType() &&
+ SV1->getOperand(0)->getType() != SV1->getType() &&
+ isUndefVector(SV1->getOperand(1), UseMask1).all() &&
+ isUndefVector(SV2->getOperand(1), UseMask2).all()) {
+ Op1 = SV1->getOperand(0);
+ Op2 = SV2->getOperand(0);
+ SmallVector<int> ShuffleMask1(SV1->getShuffleMask().begin(),
+ SV1->getShuffleMask().end());
+ int LocalVF = ShuffleMask1.size();
+ if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
+ LocalVF = FTy->getNumElements();
+ combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
+ CombinedMask1.swap(ShuffleMask1);
+ SmallVector<int> ShuffleMask2(SV2->getShuffleMask().begin(),
+ SV2->getShuffleMask().end());
+ LocalVF = ShuffleMask2.size();
+ if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
+ LocalVF = FTy->getNumElements();
+ combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
+ CombinedMask2.swap(ShuffleMask2);
+ }
+ }
+ } while (PrevOp1 != Op1 || PrevOp2 != Op2);
+ Builder.resizeToMatch(Op1, Op2);
+ VF = std::max(cast<VectorType>(Op1->getType())
+ ->getElementCount()
+ .getKnownMinValue(),
+ cast<VectorType>(Op2->getType())
+ ->getElementCount()
+ .getKnownMinValue());
+ for (int I = 0, E = Mask.size(); I < E; ++I) {
+ if (CombinedMask2[I] != UndefMaskElem) {
+ assert(CombinedMask1[I] == UndefMaskElem &&
+ "Expected undefined mask element");
+ CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
+ }
+ }
+ return Builder.createShuffleVector(
+ Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
+ CombinedMask1);
+ }
+ if (isa<PoisonValue>(V1))
+ return PoisonValue::get(FixedVectorType::get(
+ cast<VectorType>(V1->getType())->getElementType(), Mask.size()));
+ SmallVector<int> NewMask(Mask.begin(), Mask.end());
+ bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
+ assert(V1 && "Expected non-null value after looking through shuffles.");
+
+ if (!IsIdentity)
+ return Builder.createShuffleVector(V1, NewMask);
+ return V1;
+ }
+};
+} // namespace
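The first step of createShuffle above splits a two-source shuffle mask into one mask per operand, so that each operand can be peeked through independently. A minimal standalone sketch (VF is assumed to be the element count of the first operand):

#include <cassert>
#include <utility>
#include <vector>

// Indices below VF select from the first operand; the rest select from the
// second operand and are rebased to start at 0. -1 is the undef element.
static std::pair<std::vector<int>, std::vector<int>>
splitTwoSourceMask(const std::vector<int> &Mask, int VF) {
  std::vector<int> M1(Mask.size(), -1), M2(Mask.size(), -1);
  for (unsigned I = 0; I < Mask.size(); ++I) {
    if (Mask[I] < 0)
      continue;
    if (Mask[I] < VF)
      M1[I] = Mask[I];
    else
      M2[I] = Mask[I] - VF;
  }
  return {M1, M2};
}

int main() {
  // Mask <0, 5, 2, 7> over two 4-element vectors selects lanes 0/2 from the
  // first operand and lanes 1/3 from the second.
  auto [M1, M2] = splitTwoSourceMask({0, 5, 2, 7}, 4);
  assert((M1 == std::vector<int>{0, -1, 2, -1}));
  assert((M2 == std::vector<int>{-1, 1, -1, 3}));
  return 0;
}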
+
InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
ArrayRef<Value *> VectorizedVals) {
- ArrayRef<Value*> VL = E->Scalars;
+ ArrayRef<Value *> VL = E->Scalars;
Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
@@ -5832,9 +6614,12 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
// FIXME: it tries to fix a problem with MSVC buildbots.
- TargetTransformInfo &TTIRef = *TTI;
- auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL, VecTy,
- VectorizedVals, E](InstructionCost &Cost) {
+ TargetTransformInfo *TTI = this->TTI;
+ auto AdjustExtractsCost = [=](InstructionCost &Cost) {
+ // If the resulting type is scalarized, do not adjust the cost.
+ unsigned VecNumParts = TTI->getNumberOfParts(VecTy);
+ if (VecNumParts == VecTy->getNumElements())
+ return;
DenseMap<Value *, int> ExtractVectorsTys;
SmallPtrSet<Value *, 4> CheckedExtracts;
for (auto *V : VL) {
@@ -5852,12 +6637,11 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
(VE && VE != E))
continue;
auto *EE = cast<ExtractElementInst>(V);
- Optional<unsigned> EEIdx = getExtractIndex(EE);
+ std::optional<unsigned> EEIdx = getExtractIndex(EE);
if (!EEIdx)
continue;
unsigned Idx = *EEIdx;
- if (TTIRef.getNumberOfParts(VecTy) !=
- TTIRef.getNumberOfParts(EE->getVectorOperandType())) {
+ if (VecNumParts != TTI->getNumberOfParts(EE->getVectorOperandType())) {
auto It =
ExtractVectorsTys.try_emplace(EE->getVectorOperand(), Idx).first;
It->getSecond() = std::min<int>(It->second, Idx);
@@ -5865,23 +6649,23 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
// Take credit for instruction that will become dead.
if (EE->hasOneUse()) {
Instruction *Ext = EE->user_back();
- if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
- all_of(Ext->users(),
- [](User *U) { return isa<GetElementPtrInst>(U); })) {
+ if (isa<SExtInst, ZExtInst>(Ext) && all_of(Ext->users(), [](User *U) {
+ return isa<GetElementPtrInst>(U);
+ })) {
// Use getExtractWithExtendCost() to calculate the cost of
// extractelement/ext pair.
Cost -=
- TTIRef.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
- EE->getVectorOperandType(), Idx);
+ TTI->getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
+ EE->getVectorOperandType(), Idx);
// Add back the cost of s|zext which is subtracted separately.
- Cost += TTIRef.getCastInstrCost(
+ Cost += TTI->getCastInstrCost(
Ext->getOpcode(), Ext->getType(), EE->getType(),
TTI::getCastContextHint(Ext), CostKind, Ext);
continue;
}
}
- Cost -= TTIRef.getVectorInstrCost(Instruction::ExtractElement,
- EE->getVectorOperandType(), Idx);
+ Cost -= TTI->getVectorInstrCost(*EE, EE->getVectorOperandType(), CostKind,
+ Idx);
}
// Add a cost for subvector extracts/inserts if required.
for (const auto &Data : ExtractVectorsTys) {
@@ -5889,13 +6673,13 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
unsigned NumElts = VecTy->getNumElements();
if (Data.second % NumElts == 0)
continue;
- if (TTIRef.getNumberOfParts(EEVTy) > TTIRef.getNumberOfParts(VecTy)) {
+ if (TTI->getNumberOfParts(EEVTy) > VecNumParts) {
unsigned Idx = (Data.second / NumElts) * NumElts;
unsigned EENumElts = EEVTy->getNumElements();
if (Idx + NumElts <= EENumElts) {
Cost +=
- TTIRef.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
- EEVTy, None, Idx, VecTy);
+ TTI->getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
+ EEVTy, std::nullopt, CostKind, Idx, VecTy);
} else {
// Need to round up the subvector type vectorization factor to avoid a
// crash in cost model functions. Make SubVT so that Idx + VF of SubVT
@@ -5903,12 +6687,12 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
auto *SubVT =
FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx);
Cost +=
- TTIRef.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
- EEVTy, None, Idx, SubVT);
+ TTI->getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
+ EEVTy, std::nullopt, CostKind, Idx, SubVT);
}
} else {
- Cost += TTIRef.getShuffleCost(TargetTransformInfo::SK_InsertSubvector,
- VecTy, None, 0, EEVTy);
+ Cost += TTI->getShuffleCost(TargetTransformInfo::SK_InsertSubvector,
+ VecTy, std::nullopt, CostKind, 0, EEVTy);
}
}
};
@@ -5917,13 +6701,36 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
return 0;
if (isa<InsertElementInst>(VL[0]))
return InstructionCost::getInvalid();
+ SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
+ // Build a mask out of the reorder indices and reorder scalars per this
+ // mask.
+ SmallVector<int> ReorderMask;
+ inversePermutation(E->ReorderIndices, ReorderMask);
+ if (!ReorderMask.empty())
+ reorderScalars(GatheredScalars, ReorderMask);
SmallVector<int> Mask;
+ std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle;
SmallVector<const TreeEntry *> Entries;
- Optional<TargetTransformInfo::ShuffleKind> Shuffle =
- isGatherShuffledEntry(E, Mask, Entries);
- if (Shuffle) {
+    // Do not try to look for reshuffled loads for gathered loads (they will
+    // be handled later), for vectorized scalars, and for cases which are
+    // definitely not profitable (splats and small gather nodes).
+ if (E->getOpcode() != Instruction::Load || E->isAltShuffle() ||
+ all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
+ isSplat(E->Scalars) ||
+ (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2))
+ GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries);
+ if (GatherShuffle) {
+ // Remove shuffled elements from list of gathers.
+ for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
+ if (Mask[I] != UndefMaskElem)
+ GatheredScalars[I] = PoisonValue::get(ScalarTy);
+ }
+ assert((Entries.size() == 1 || Entries.size() == 2) &&
+ "Expected shuffle of 1 or 2 entries.");
InstructionCost GatherCost = 0;
- if (ShuffleVectorInst::isIdentityMask(Mask)) {
+ int Limit = Mask.size() * 2;
+ if (all_of(Mask, [=](int Idx) { return Idx < Limit; }) &&
+ ShuffleVectorInst::isIdentityMask(Mask)) {
// Perfect match in the graph, will reuse the previously vectorized
// node. Cost is 0.
LLVM_DEBUG(
@@ -5942,8 +6749,10 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
// previously vectorized nodes. Add the cost of the permutation rather
// than gather.
::addMask(Mask, E->ReuseShuffleIndices);
- GatherCost = TTI->getShuffleCost(*Shuffle, FinalVecTy, Mask);
+ GatherCost = TTI->getShuffleCost(*GatherShuffle, FinalVecTy, Mask);
}
+ if (!all_of(GatheredScalars, UndefValue::classof))
+ GatherCost += getGatherCost(GatheredScalars);
return GatherCost;
}
if ((E->getOpcode() == Instruction::ExtractElement ||
@@ -5955,7 +6764,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
// Check that gather of extractelements can be represented as just a
    // shuffle of one or two vectors from which the scalars are extracted.
SmallVector<int> Mask;
- Optional<TargetTransformInfo::ShuffleKind> ShuffleKind =
+ std::optional<TargetTransformInfo::ShuffleKind> ShuffleKind =
isFixedVectorShuffle(VL, Mask);
if (ShuffleKind) {
// Found the bunch of extractelement instructions that must be gathered
@@ -5975,9 +6784,24 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
// broadcast.
assert(VecTy == FinalVecTy &&
"No reused scalars expected for broadcast.");
- return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy,
- /*Mask=*/None, /*Index=*/0,
- /*SubTp=*/nullptr, /*Args=*/VL[0]);
+ const auto *It =
+ find_if(VL, [](Value *V) { return !isa<UndefValue>(V); });
+ // If all values are undefs - consider cost free.
+ if (It == VL.end())
+ return TTI::TCC_Free;
+ // Add broadcast for non-identity shuffle only.
+ bool NeedShuffle =
+ VL.front() != *It || !all_of(VL.drop_front(), UndefValue::classof);
+ InstructionCost InsertCost =
+ TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
+ /*Index=*/0, PoisonValue::get(VecTy), *It);
+ return InsertCost + (NeedShuffle
+ ? TTI->getShuffleCost(
+ TargetTransformInfo::SK_Broadcast, VecTy,
+ /*Mask=*/std::nullopt, CostKind,
+ /*Index=*/0,
+ /*SubTp=*/nullptr, /*Args=*/VL[0])
+ : TTI::TCC_Free);
}
InstructionCost ReuseShuffleCost = 0;
if (NeedToShuffleReuses)
@@ -6003,7 +6827,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
OrdersType CurrentOrder;
LoadsState LS =
canVectorizeLoads(Slice, Slice.front(), *TTI, *DL, *SE, *LI,
- CurrentOrder, PointerOps);
+ *TLI, CurrentOrder, PointerOps);
switch (LS) {
case LoadsState::Vectorize:
case LoadsState::ScatterVectorize:
@@ -6046,9 +6870,10 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
InstructionCost ScalarsCost = 0;
for (Value *V : VectorizedLoads) {
auto *LI = cast<LoadInst>(V);
- ScalarsCost += TTI->getMemoryOpCost(
- Instruction::Load, LI->getType(), LI->getAlign(),
- LI->getPointerAddressSpace(), CostKind, LI);
+ ScalarsCost +=
+ TTI->getMemoryOpCost(Instruction::Load, LI->getType(),
+ LI->getAlign(), LI->getPointerAddressSpace(),
+ CostKind, TTI::OperandValueInfo(), LI);
}
auto *LI = cast<LoadInst>(E->getMainOp());
auto *LoadTy = FixedVectorType::get(LI->getType(), VF);
@@ -6056,7 +6881,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
GatherCost +=
VectorizedCnt *
TTI->getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
- LI->getPointerAddressSpace(), CostKind, LI);
+ LI->getPointerAddressSpace(), CostKind,
+ TTI::OperandValueInfo(), LI);
GatherCost += ScatterVectorizeCnt *
TTI->getGatherScatterOpCost(
Instruction::Load, LoadTy, LI->getPointerOperand(),
@@ -6064,8 +6890,9 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
if (NeedInsertSubvectorAnalysis) {
// Add the cost for the subvectors insert.
for (int I = VF, E = VL.size(); I < E; I += VF)
- GatherCost += TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy,
- None, I, LoadTy);
+ GatherCost +=
+ TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy,
+ std::nullopt, CostKind, I, LoadTy);
}
return ReuseShuffleCost + GatherCost - ScalarsCost;
}
@@ -6101,240 +6928,306 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
Instruction *VL0 = E->getMainOp();
unsigned ShuffleOrOp =
E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
+ const unsigned Sz = VL.size();
+ auto GetCostDiff =
+ [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
+ function_ref<InstructionCost(InstructionCost)> VectorCost) {
+ // Calculate the cost of this instruction.
+ InstructionCost ScalarCost = 0;
+ if (isa<CastInst, CmpInst, SelectInst, CallInst>(VL0)) {
+          // For some of the instructions there is no need to calculate the
+          // cost for each particular instruction; we can use the cost of a
+          // single instruction x the total number of scalar instructions.
+ ScalarCost = Sz * ScalarEltCost(0);
+ } else {
+ for (unsigned I = 0; I < Sz; ++I)
+ ScalarCost += ScalarEltCost(I);
+ }
+
+ InstructionCost VecCost = VectorCost(CommonCost);
+ LLVM_DEBUG(
+ dumpTreeCosts(E, CommonCost, VecCost - CommonCost, ScalarCost));
+        // Disable warnings about `this` and `E` being unused. Required for
+        // `dumpTreeCosts`.
+ (void)this;
+ (void)E;
+ return VecCost - ScalarCost;
+ };
+ // Calculate cost difference from vectorizing set of GEPs.
+ // Negative value means vectorizing is profitable.
+ auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
+ InstructionCost CostSavings = 0;
+ for (Value *V : Ptrs) {
+ if (V == BasePtr)
+ continue;
+ auto *Ptr = dyn_cast<GetElementPtrInst>(V);
+      // Ptrs may contain just addresses without GEP instructions; those are
+      // considered free. GEPs with all-constant indices are also considered
+      // to have zero cost.
+ if (!Ptr || Ptr->hasAllConstantIndices())
+ continue;
+
+ // Here we differentiate two cases: when GEPs represent a regular
+ // vectorization tree node (and hence vectorized) and when the set is
+ // arguments of a set of loads or stores being vectorized. In the former
+ // case all the scalar GEPs will be removed as a result of vectorization.
+      // For any external uses of some lanes, extractelement instructions will
+      // be generated (whose cost is estimated separately). In the latter case,
+      // since the set of GEPs itself is not vectorized, those used more than
+      // once will remain in the vectorized code as well. So we should not
+      // count them as savings.
+ if (!Ptr->hasOneUse() && isa<LoadInst, StoreInst>(VL0))
+ continue;
+
+ // TODO: it is target dependent, so need to implement and then use a TTI
+ // interface.
+ CostSavings += TTI->getArithmeticInstrCost(Instruction::Add,
+ Ptr->getType(), CostKind);
+ }
+    LLVM_DEBUG(dbgs() << "SLP: Calculated GEPs cost savings for Tree:\n";
+ E->dump());
+ LLVM_DEBUG(dbgs() << "SLP: GEP cost saving = " << CostSavings << "\n");
+ return InstructionCost() - CostSavings;
+ };
+
switch (ShuffleOrOp) {
- case Instruction::PHI:
- return 0;
+ case Instruction::PHI: {
+ // Count reused scalars.
+ InstructionCost ScalarCost = 0;
+ SmallPtrSet<const TreeEntry *, 4> CountedOps;
+ for (Value *V : VL) {
+ auto *PHI = dyn_cast<PHINode>(V);
+ if (!PHI)
+ continue;
- case Instruction::ExtractValue:
- case Instruction::ExtractElement: {
- // The common cost of removal ExtractElement/ExtractValue instructions +
- // the cost of shuffles, if required to resuffle the original vector.
- if (NeedToShuffleReuses) {
- unsigned Idx = 0;
- for (unsigned I : E->ReuseShuffleIndices) {
- if (ShuffleOrOp == Instruction::ExtractElement) {
- auto *EE = cast<ExtractElementInst>(VL[I]);
- CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement,
- EE->getVectorOperandType(),
- *getExtractIndex(EE));
- } else {
- CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement,
- VecTy, Idx);
- ++Idx;
- }
- }
- Idx = EntryVF;
- for (Value *V : VL) {
- if (ShuffleOrOp == Instruction::ExtractElement) {
- auto *EE = cast<ExtractElementInst>(V);
- CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement,
- EE->getVectorOperandType(),
- *getExtractIndex(EE));
- } else {
- --Idx;
- CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement,
- VecTy, Idx);
- }
- }
- }
- if (ShuffleOrOp == Instruction::ExtractValue) {
- for (unsigned I = 0, E = VL.size(); I < E; ++I) {
- auto *EI = cast<Instruction>(VL[I]);
- // Take credit for instruction that will become dead.
- if (EI->hasOneUse()) {
- Instruction *Ext = EI->user_back();
- if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
- all_of(Ext->users(),
- [](User *U) { return isa<GetElementPtrInst>(U); })) {
- // Use getExtractWithExtendCost() to calculate the cost of
- // extractelement/ext pair.
- CommonCost -= TTI->getExtractWithExtendCost(
- Ext->getOpcode(), Ext->getType(), VecTy, I);
- // Add back the cost of s|zext which is subtracted separately.
- CommonCost += TTI->getCastInstrCost(
- Ext->getOpcode(), Ext->getType(), EI->getType(),
- TTI::getCastContextHint(Ext), CostKind, Ext);
- continue;
- }
- }
- CommonCost -=
- TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I);
- }
- } else {
- AdjustExtractsCost(CommonCost);
+ ValueList Operands(PHI->getNumIncomingValues(), nullptr);
+ for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
+ Value *Op = PHI->getIncomingValue(I);
+ Operands[I] = Op;
}
- return CommonCost;
+ if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
+ if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second)
+ if (!OpTE->ReuseShuffleIndices.empty())
+ ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
+ OpTE->Scalars.size());
}
- case Instruction::InsertElement: {
- assert(E->ReuseShuffleIndices.empty() &&
- "Unique insertelements only are expected.");
- auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
- unsigned const NumElts = SrcVecTy->getNumElements();
- unsigned const NumScalars = VL.size();
-
- unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
-
- unsigned OffsetBeg = *getInsertIndex(VL.front());
- unsigned OffsetEnd = OffsetBeg;
- for (Value *V : VL.drop_front()) {
- unsigned Idx = *getInsertIndex(V);
- if (OffsetBeg > Idx)
- OffsetBeg = Idx;
- else if (OffsetEnd < Idx)
- OffsetEnd = Idx;
- }
- unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
- if (NumOfParts > 0)
- VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
- unsigned VecSz =
- (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
- VecScalarsSz;
- unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
- unsigned InsertVecSz = std::min<unsigned>(
- PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
- ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) *
- VecScalarsSz);
- bool IsWholeSubvector =
- OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
- // Check if we can safely insert a subvector. If it is not possible, just
- // generate a whole-sized vector and shuffle the source vector and the new
- // subvector.
- if (OffsetBeg + InsertVecSz > VecSz) {
- // Align OffsetBeg to generate correct mask.
- OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
- InsertVecSz = VecSz;
- }
-
- APInt DemandedElts = APInt::getZero(NumElts);
- // TODO: Add support for Instruction::InsertValue.
- SmallVector<int> Mask;
- if (!E->ReorderIndices.empty()) {
- inversePermutation(E->ReorderIndices, Mask);
- Mask.append(InsertVecSz - Mask.size(), UndefMaskElem);
+
+ return CommonCost - ScalarCost;
+ }
+ case Instruction::ExtractValue:
+ case Instruction::ExtractElement: {
+ auto GetScalarCost = [=](unsigned Idx) {
+ auto *I = cast<Instruction>(VL[Idx]);
+ VectorType *SrcVecTy;
+ if (ShuffleOrOp == Instruction::ExtractElement) {
+ auto *EE = cast<ExtractElementInst>(I);
+ SrcVecTy = EE->getVectorOperandType();
} else {
- Mask.assign(VecSz, UndefMaskElem);
- std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
- }
- bool IsIdentity = true;
- SmallVector<int> PrevMask(InsertVecSz, UndefMaskElem);
- Mask.swap(PrevMask);
- for (unsigned I = 0; I < NumScalars; ++I) {
- unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]);
- DemandedElts.setBit(InsertIdx);
- IsIdentity &= InsertIdx - OffsetBeg == I;
- Mask[InsertIdx - OffsetBeg] = I;
+ auto *EV = cast<ExtractValueInst>(I);
+ Type *AggregateTy = EV->getAggregateOperand()->getType();
+ unsigned NumElts;
+ if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
+ NumElts = ATy->getNumElements();
+ else
+ NumElts = AggregateTy->getStructNumElements();
+ SrcVecTy = FixedVectorType::get(ScalarTy, NumElts);
}
- assert(Offset < NumElts && "Failed to find vector index offset");
-
- InstructionCost Cost = 0;
- Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
- /*Insert*/ true, /*Extract*/ false);
-
- // First cost - resize to actual vector size if not identity shuffle or
- // need to shift the vector.
- // Do not calculate the cost if the actual size is the register size and
- // we can merge this shuffle with the following SK_Select.
- auto *InsertVecTy =
- FixedVectorType::get(SrcVecTy->getElementType(), InsertVecSz);
- if (!IsIdentity)
- Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
- InsertVecTy, Mask);
- auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
- return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
- }));
- // Second cost - permutation with subvector, if some elements are from the
- // initial vector or inserting a subvector.
- // TODO: Implement the analysis of the FirstInsert->getOperand(0)
- // subvector of ActualVecTy.
- if (!isUndefVector(FirstInsert->getOperand(0)) && NumScalars != NumElts &&
- !IsWholeSubvector) {
- if (InsertVecSz != VecSz) {
- auto *ActualVecTy =
- FixedVectorType::get(SrcVecTy->getElementType(), VecSz);
- Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy,
- None, OffsetBeg - Offset, InsertVecTy);
- } else {
- for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
- Mask[I] = I;
- for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
- I <= End; ++I)
- if (Mask[I] != UndefMaskElem)
- Mask[I] = I + VecSz;
- for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
- Mask[I] = I;
- Cost += TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
+ if (I->hasOneUse()) {
+ Instruction *Ext = I->user_back();
+ if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
+ all_of(Ext->users(),
+ [](User *U) { return isa<GetElementPtrInst>(U); })) {
+ // Use getExtractWithExtendCost() to calculate the cost of
+ // extractelement/ext pair.
+ InstructionCost Cost = TTI->getExtractWithExtendCost(
+ Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I));
+ // Subtract the cost of s|zext which is subtracted separately.
+ Cost -= TTI->getCastInstrCost(
+ Ext->getOpcode(), Ext->getType(), I->getType(),
+ TTI::getCastContextHint(Ext), CostKind, Ext);
+ return Cost;
}
}
- return Cost;
+ return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
+ CostKind, *getExtractIndex(I));
+ };
+ auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
+ return GetCostDiff(GetScalarCost, GetVectorCost);
+ }
+ case Instruction::InsertElement: {
+ assert(E->ReuseShuffleIndices.empty() &&
+ "Unique insertelements only are expected.");
+ auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
+ unsigned const NumElts = SrcVecTy->getNumElements();
+ unsigned const NumScalars = VL.size();
+
+ unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
+
+ SmallVector<int> InsertMask(NumElts, UndefMaskElem);
+ unsigned OffsetBeg = *getInsertIndex(VL.front());
+ unsigned OffsetEnd = OffsetBeg;
+ InsertMask[OffsetBeg] = 0;
+ for (auto [I, V] : enumerate(VL.drop_front())) {
+ unsigned Idx = *getInsertIndex(V);
+ if (OffsetBeg > Idx)
+ OffsetBeg = Idx;
+ else if (OffsetEnd < Idx)
+ OffsetEnd = Idx;
+ InsertMask[Idx] = I + 1;
+ }
+ unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
+ if (NumOfParts > 0)
+ VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
+ unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
+ VecScalarsSz;
+ unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
+ unsigned InsertVecSz = std::min<unsigned>(
+ PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
+ ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
+ bool IsWholeSubvector =
+ OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
+ // Check if we can safely insert a subvector. If it is not possible, just
+ // generate a whole-sized vector and shuffle the source vector and the new
+ // subvector.
+ if (OffsetBeg + InsertVecSz > VecSz) {
+ // Align OffsetBeg to generate correct mask.
+ OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
+ InsertVecSz = VecSz;
+ }
+
+ APInt DemandedElts = APInt::getZero(NumElts);
+ // TODO: Add support for Instruction::InsertValue.
+ SmallVector<int> Mask;
+ if (!E->ReorderIndices.empty()) {
+ inversePermutation(E->ReorderIndices, Mask);
+ Mask.append(InsertVecSz - Mask.size(), UndefMaskElem);
+ } else {
+ Mask.assign(VecSz, UndefMaskElem);
+ std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
+ }
+ bool IsIdentity = true;
+ SmallVector<int> PrevMask(InsertVecSz, UndefMaskElem);
+ Mask.swap(PrevMask);
+ for (unsigned I = 0; I < NumScalars; ++I) {
+ unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]);
+ DemandedElts.setBit(InsertIdx);
+ IsIdentity &= InsertIdx - OffsetBeg == I;
+ Mask[InsertIdx - OffsetBeg] = I;
+ }
+ assert(Offset < NumElts && "Failed to find vector index offset");
+
+ InstructionCost Cost = 0;
+ Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
+ /*Insert*/ true, /*Extract*/ false,
+ CostKind);
+
+ // First cost - resize to actual vector size if not identity shuffle or
+ // need to shift the vector.
+ // Do not calculate the cost if the actual size is the register size and
+ // we can merge this shuffle with the following SK_Select.
+ auto *InsertVecTy =
+ FixedVectorType::get(SrcVecTy->getElementType(), InsertVecSz);
+ if (!IsIdentity)
+ Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+ InsertVecTy, Mask);
+ auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
+ return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
+ }));
+ // Second cost - permutation with subvector, if some elements are from the
+ // initial vector or inserting a subvector.
+ // TODO: Implement the analysis of the FirstInsert->getOperand(0)
+ // subvector of ActualVecTy.
+ SmallBitVector InMask =
+ isUndefVector(FirstInsert->getOperand(0),
+ buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
+ if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
+ if (InsertVecSz != VecSz) {
+ auto *ActualVecTy =
+ FixedVectorType::get(SrcVecTy->getElementType(), VecSz);
+ Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy,
+ std::nullopt, CostKind, OffsetBeg - Offset,
+ InsertVecTy);
+ } else {
+ for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
+ Mask[I] = InMask.test(I) ? UndefMaskElem : I;
+ for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
+ I <= End; ++I)
+ if (Mask[I] != UndefMaskElem)
+ Mask[I] = I + VecSz;
+ for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
+ Mask[I] =
+ ((I >= InMask.size()) || InMask.test(I)) ? UndefMaskElem : I;
+ Cost += TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
+ }
}
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- case Instruction::Trunc:
- case Instruction::FPTrunc:
- case Instruction::BitCast: {
+ return Cost;
+ }
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ auto GetScalarCost = [=](unsigned Idx) {
+ auto *VI = cast<Instruction>(VL[Idx]);
+ return TTI->getCastInstrCost(E->getOpcode(), ScalarTy,
+ VI->getOperand(0)->getType(),
+ TTI::getCastContextHint(VI), CostKind, VI);
+ };
+ auto GetVectorCost = [=](InstructionCost CommonCost) {
Type *SrcTy = VL0->getOperand(0)->getType();
- InstructionCost ScalarEltCost =
- TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy,
- TTI::getCastContextHint(VL0), CostKind, VL0);
- if (NeedToShuffleReuses) {
- CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
- }
-
- // Calculate the cost of this instruction.
- InstructionCost ScalarCost = VL.size() * ScalarEltCost;
-
auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size());
- InstructionCost VecCost = 0;
+ InstructionCost VecCost = CommonCost;
// Check if the values are candidates to demote.
- if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
- VecCost = CommonCost + TTI->getCastInstrCost(
- E->getOpcode(), VecTy, SrcVecTy,
- TTI::getCastContextHint(VL0), CostKind, VL0);
- }
- LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
- return VecCost - ScalarCost;
- }
- case Instruction::FCmp:
- case Instruction::ICmp:
- case Instruction::Select: {
- // Calculate the cost of this instruction.
- InstructionCost ScalarEltCost =
- TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
- CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0);
- if (NeedToShuffleReuses) {
- CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
- }
+ if (!MinBWs.count(VL0) || VecTy != SrcVecTy)
+ VecCost +=
+ TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy,
+ TTI::getCastContextHint(VL0), CostKind, VL0);
+ return VecCost;
+ };
+ return GetCostDiff(GetScalarCost, GetVectorCost);
+ }
+ case Instruction::FCmp:
+ case Instruction::ICmp:
+ case Instruction::Select: {
+ CmpInst::Predicate VecPred, SwappedVecPred;
+ auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
+ if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
+ match(VL0, MatchCmp))
+ SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
+ else
+ SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
+ ? CmpInst::BAD_FCMP_PREDICATE
+ : CmpInst::BAD_ICMP_PREDICATE;
+ auto GetScalarCost = [&](unsigned Idx) {
+ auto *VI = cast<Instruction>(VL[Idx]);
+ CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy()
+ ? CmpInst::BAD_FCMP_PREDICATE
+ : CmpInst::BAD_ICMP_PREDICATE;
+ auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
+ if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
+ !match(VI, MatchCmp)) ||
+ (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
+ VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
+ ? CmpInst::BAD_FCMP_PREDICATE
+ : CmpInst::BAD_ICMP_PREDICATE;
+
+ return TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
+ Builder.getInt1Ty(), CurrentPred, CostKind,
+ VI);
+ };
+ auto GetVectorCost = [&](InstructionCost CommonCost) {
auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
- InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
-
- // Check if all entries in VL are either compares or selects with compares
- // as condition that have the same predicates.
- CmpInst::Predicate VecPred = CmpInst::BAD_ICMP_PREDICATE;
- bool First = true;
- for (auto *V : VL) {
- CmpInst::Predicate CurrentPred;
- auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
- if ((!match(V, m_Select(MatchCmp, m_Value(), m_Value())) &&
- !match(V, MatchCmp)) ||
- (!First && VecPred != CurrentPred)) {
- VecPred = CmpInst::BAD_ICMP_PREDICATE;
- break;
- }
- First = false;
- VecPred = CurrentPred;
- }
InstructionCost VecCost = TTI->getCmpSelInstrCost(
E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);
- // Check if it is possible and profitable to use min/max for selects in
- // VL.
+ // Check if it is possible and profitable to use min/max for selects
+ // in VL.
//
auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL);
if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) {
@@ -6342,216 +7235,181 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
{VecTy, VecTy});
InstructionCost IntrinsicCost =
TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
- // If the selects are the only uses of the compares, they will be dead
- // and we can adjust the cost by removing their cost.
+ // If the selects are the only uses of the compares, they will be
+ // dead and we can adjust the cost by removing their cost.
if (IntrinsicAndUse.second)
IntrinsicCost -= TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy,
MaskTy, VecPred, CostKind);
VecCost = std::min(VecCost, IntrinsicCost);
}
- LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
- return CommonCost + VecCost - ScalarCost;
- }
- case Instruction::FNeg:
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor: {
- // Certain instructions can be cheaper to vectorize if they have a
- // constant second vector operand.
- TargetTransformInfo::OperandValueKind Op1VK =
- TargetTransformInfo::OK_AnyValue;
- TargetTransformInfo::OperandValueKind Op2VK =
- TargetTransformInfo::OK_UniformConstantValue;
- TargetTransformInfo::OperandValueProperties Op1VP =
- TargetTransformInfo::OP_None;
- TargetTransformInfo::OperandValueProperties Op2VP =
- TargetTransformInfo::OP_PowerOf2;
-
- // If all operands are exactly the same ConstantInt then set the
- // operand kind to OK_UniformConstantValue.
- // If instead not all operands are constants, then set the operand kind
- // to OK_AnyValue. If all operands are constants but not the same,
- // then set the operand kind to OK_NonUniformConstantValue.
- ConstantInt *CInt0 = nullptr;
- for (unsigned i = 0, e = VL.size(); i < e; ++i) {
- const Instruction *I = cast<Instruction>(VL[i]);
- unsigned OpIdx = isa<BinaryOperator>(I) ? 1 : 0;
- ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(OpIdx));
- if (!CInt) {
- Op2VK = TargetTransformInfo::OK_AnyValue;
- Op2VP = TargetTransformInfo::OP_None;
- break;
- }
- if (Op2VP == TargetTransformInfo::OP_PowerOf2 &&
- !CInt->getValue().isPowerOf2())
- Op2VP = TargetTransformInfo::OP_None;
- if (i == 0) {
- CInt0 = CInt;
- continue;
- }
- if (CInt0 != CInt)
- Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
- }
-
- SmallVector<const Value *, 4> Operands(VL0->operand_values());
- InstructionCost ScalarEltCost =
- TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind, Op1VK,
- Op2VK, Op1VP, Op2VP, Operands, VL0);
- if (NeedToShuffleReuses) {
- CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
- }
- InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
- InstructionCost VecCost =
- TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind, Op1VK,
- Op2VK, Op1VP, Op2VP, Operands, VL0);
- LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
- return CommonCost + VecCost - ScalarCost;
- }
- case Instruction::GetElementPtr: {
- TargetTransformInfo::OperandValueKind Op1VK =
- TargetTransformInfo::OK_AnyValue;
- TargetTransformInfo::OperandValueKind Op2VK =
- any_of(VL,
- [](Value *V) {
- return isa<GetElementPtrInst>(V) &&
- !isConstant(
- cast<GetElementPtrInst>(V)->getOperand(1));
- })
- ? TargetTransformInfo::OK_AnyValue
- : TargetTransformInfo::OK_UniformConstantValue;
-
- InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost(
- Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK);
- if (NeedToShuffleReuses) {
- CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
- }
- InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
- InstructionCost VecCost = TTI->getArithmeticInstrCost(
- Instruction::Add, VecTy, CostKind, Op1VK, Op2VK);
- LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
- return CommonCost + VecCost - ScalarCost;
- }
- case Instruction::Load: {
- // Cost of wide load - cost of scalar loads.
- Align Alignment = cast<LoadInst>(VL0)->getAlign();
- InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
- Instruction::Load, ScalarTy, Alignment, 0, CostKind, VL0);
- if (NeedToShuffleReuses) {
- CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
- }
- InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
+ return VecCost + CommonCost;
+ };
+ return GetCostDiff(GetScalarCost, GetVectorCost);
+ }
+ case Instruction::FNeg:
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ auto GetScalarCost = [=](unsigned Idx) {
+ auto *VI = cast<Instruction>(VL[Idx]);
+ unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
+ TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
+ TTI::OperandValueInfo Op2Info =
+ TTI::getOperandInfo(VI->getOperand(OpIdx));
+ SmallVector<const Value *> Operands(VI->operand_values());
+ return TTI->getArithmeticInstrCost(ShuffleOrOp, ScalarTy, CostKind,
+ Op1Info, Op2Info, Operands, VI);
+ };
+ auto GetVectorCost = [=](InstructionCost CommonCost) {
+ unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
+ TTI::OperandValueInfo Op1Info = getOperandInfo(VL, 0);
+ TTI::OperandValueInfo Op2Info = getOperandInfo(VL, OpIdx);
+ return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
+ Op2Info) +
+ CommonCost;
+ };
+ return GetCostDiff(GetScalarCost, GetVectorCost);
+ }
+ case Instruction::GetElementPtr: {
+ return CommonCost + GetGEPCostDiff(VL, VL0);
+ }
+ case Instruction::Load: {
+ auto GetScalarCost = [=](unsigned Idx) {
+ auto *VI = cast<LoadInst>(VL[Idx]);
+ return TTI->getMemoryOpCost(Instruction::Load, ScalarTy, VI->getAlign(),
+ VI->getPointerAddressSpace(), CostKind,
+ TTI::OperandValueInfo(), VI);
+ };
+ auto *LI0 = cast<LoadInst>(VL0);
+ auto GetVectorCost = [=](InstructionCost CommonCost) {
InstructionCost VecLdCost;
if (E->State == TreeEntry::Vectorize) {
- VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, Alignment, 0,
- CostKind, VL0);
+ VecLdCost = TTI->getMemoryOpCost(
+ Instruction::Load, VecTy, LI0->getAlign(),
+ LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
} else {
assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
- Align CommonAlignment = Alignment;
+ Align CommonAlignment = LI0->getAlign();
for (Value *V : VL)
CommonAlignment =
std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
VecLdCost = TTI->getGatherScatterOpCost(
- Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
- /*VariableMask=*/false, CommonAlignment, CostKind, VL0);
+ Instruction::Load, VecTy, LI0->getPointerOperand(),
+ /*VariableMask=*/false, CommonAlignment, CostKind);
}
- LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecLdCost, ScalarLdCost));
- return CommonCost + VecLdCost - ScalarLdCost;
- }
- case Instruction::Store: {
+ return VecLdCost + CommonCost;
+ };
+
+ InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
+    // If this node generates a masked gather load then it is not a terminal
+    // node. Hence the address operand cost is estimated separately.
+ if (E->State == TreeEntry::ScatterVectorize)
+ return Cost;
+
+    // Estimate the cost of the GEPs since this tree node is a terminal node.
+ SmallVector<Value *> PointerOps(VL.size());
+ for (auto [I, V] : enumerate(VL))
+ PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
+ return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
+ }
+ case Instruction::Store: {
+ bool IsReorder = !E->ReorderIndices.empty();
+ auto GetScalarCost = [=](unsigned Idx) {
+ auto *VI = cast<StoreInst>(VL[Idx]);
+ TTI::OperandValueInfo OpInfo = getOperandInfo(VI, 0);
+ return TTI->getMemoryOpCost(Instruction::Store, ScalarTy, VI->getAlign(),
+ VI->getPointerAddressSpace(), CostKind,
+ OpInfo, VI);
+ };
+ auto *BaseSI =
+ cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
+ auto GetVectorCost = [=](InstructionCost CommonCost) {
// We know that we can merge the stores. Calculate the cost.
- bool IsReorder = !E->ReorderIndices.empty();
- auto *SI =
- cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
- Align Alignment = SI->getAlign();
- InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
- Instruction::Store, ScalarTy, Alignment, 0, CostKind, VL0);
- InstructionCost ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
- InstructionCost VecStCost = TTI->getMemoryOpCost(
- Instruction::Store, VecTy, Alignment, 0, CostKind, VL0);
- LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecStCost, ScalarStCost));
- return CommonCost + VecStCost - ScalarStCost;
+ TTI::OperandValueInfo OpInfo = getOperandInfo(VL, 0);
+ return TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
+ BaseSI->getPointerAddressSpace(), CostKind,
+ OpInfo) +
+ CommonCost;
+ };
+ SmallVector<Value *> PointerOps(VL.size());
+ for (auto [I, V] : enumerate(VL)) {
+ unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
+ PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
}
- case Instruction::Call: {
- CallInst *CI = cast<CallInst>(VL0);
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- // Calculate the cost of the scalar and vector calls.
- IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
- InstructionCost ScalarEltCost =
- TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
- if (NeedToShuffleReuses) {
- CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
+ return GetCostDiff(GetScalarCost, GetVectorCost) +
+ GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
+ }
+ case Instruction::Call: {
+ auto GetScalarCost = [=](unsigned Idx) {
+ auto *CI = cast<CallInst>(VL[Idx]);
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ if (ID != Intrinsic::not_intrinsic) {
+ IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
+ return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
}
- InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
-
+ return TTI->getCallInstrCost(CI->getCalledFunction(),
+ CI->getFunctionType()->getReturnType(),
+ CI->getFunctionType()->params(), CostKind);
+ };
+ auto GetVectorCost = [=](InstructionCost CommonCost) {
+ auto *CI = cast<CallInst>(VL0);
auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
- InstructionCost VecCallCost =
- std::min(VecCallCosts.first, VecCallCosts.second);
-
- LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
- << " (" << VecCallCost << "-" << ScalarCallCost << ")"
- << " for " << *CI << "\n");
-
- return CommonCost + VecCallCost - ScalarCallCost;
- }
- case Instruction::ShuffleVector: {
- assert(E->isAltShuffle() &&
- ((Instruction::isBinaryOp(E->getOpcode()) &&
- Instruction::isBinaryOp(E->getAltOpcode())) ||
- (Instruction::isCast(E->getOpcode()) &&
- Instruction::isCast(E->getAltOpcode())) ||
- (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
- "Invalid Shuffle Vector Operand");
- InstructionCost ScalarCost = 0;
- if (NeedToShuffleReuses) {
- for (unsigned Idx : E->ReuseShuffleIndices) {
- Instruction *I = cast<Instruction>(VL[Idx]);
- CommonCost -= TTI->getInstructionCost(I, CostKind);
- }
- for (Value *V : VL) {
- Instruction *I = cast<Instruction>(V);
- CommonCost += TTI->getInstructionCost(I, CostKind);
- }
- }
- for (Value *V : VL) {
- Instruction *I = cast<Instruction>(V);
- assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
- ScalarCost += TTI->getInstructionCost(I, CostKind);
+ return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
+ };
+ return GetCostDiff(GetScalarCost, GetVectorCost);
+ }
+ case Instruction::ShuffleVector: {
+ assert(E->isAltShuffle() &&
+ ((Instruction::isBinaryOp(E->getOpcode()) &&
+ Instruction::isBinaryOp(E->getAltOpcode())) ||
+ (Instruction::isCast(E->getOpcode()) &&
+ Instruction::isCast(E->getAltOpcode())) ||
+ (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
+ "Invalid Shuffle Vector Operand");
+ // Try to find the previous shuffle node with the same operands and same
+ // main/alternate ops.
+ auto TryFindNodeWithEqualOperands = [=]() {
+ for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+ if (TE.get() == E)
+ break;
+ if (TE->isAltShuffle() &&
+ ((TE->getOpcode() == E->getOpcode() &&
+ TE->getAltOpcode() == E->getAltOpcode()) ||
+ (TE->getOpcode() == E->getAltOpcode() &&
+ TE->getAltOpcode() == E->getOpcode())) &&
+ TE->hasEqualOperands(*E))
+ return true;
}
+ return false;
+ };
+ auto GetScalarCost = [=](unsigned Idx) {
+ auto *VI = cast<Instruction>(VL[Idx]);
+ assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
+ (void)E;
+ return TTI->getInstructionCost(VI, CostKind);
+ };
+    // Need to clear CommonCost since the final shuffle cost is included in
+    // the vector cost.
+ auto GetVectorCost = [&](InstructionCost) {
      // VecCost is equal to the sum of the cost of creating 2 vectors
      // and the cost of creating the shuffle.
InstructionCost VecCost = 0;
- // Try to find the previous shuffle node with the same operands and same
- // main/alternate ops.
- auto &&TryFindNodeWithEqualOperands = [this, E]() {
- for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
- if (TE.get() == E)
- break;
- if (TE->isAltShuffle() &&
- ((TE->getOpcode() == E->getOpcode() &&
- TE->getAltOpcode() == E->getAltOpcode()) ||
- (TE->getOpcode() == E->getAltOpcode() &&
- TE->getAltOpcode() == E->getOpcode())) &&
- TE->hasEqualOperands(*E))
- return true;
- }
- return false;
- };
if (TryFindNodeWithEqualOperands()) {
LLVM_DEBUG({
dbgs() << "SLP: diamond match for alternate node found.\n";
@@ -6561,8 +7419,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
// same main/alternate vector ops, just do different shuffling.
} else if (Instruction::isBinaryOp(E->getOpcode())) {
VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
- VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy,
- CostKind);
+ VecCost +=
+ TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
} else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
Builder.getInt1Ty(),
@@ -6581,9 +7439,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,
TTI::CastContextHint::None, CostKind);
}
-
if (E->ReuseShuffleIndices.empty()) {
- CommonCost =
+ VecCost +=
TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy);
} else {
SmallVector<int> Mask;
@@ -6594,14 +7451,15 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
return I->getOpcode() == E->getAltOpcode();
},
Mask);
- CommonCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
- FinalVecTy, Mask);
+ VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
+ FinalVecTy, Mask);
}
- LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
- return CommonCost + VecCost - ScalarCost;
- }
- default:
- llvm_unreachable("Unknown instruction");
+ return VecCost;
+ };
+ return GetCostDiff(GetScalarCost, GetVectorCost);
+ }
+ default:
+ llvm_unreachable("Unknown instruction");
}
}
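For the alternate-shuffle (ShuffleVector) case above, the final blend keeps lane I from the main-opcode vector or takes lane I + VF from the alternate-opcode vector. A small standalone illustration of that mask construction (assumed semantics; the helper below is illustrative, not taken from the source):

  #include <cstdio>
  #include <vector>

  // Build a two-source blend mask: lane I stays I, or becomes I + VF when the
  // original scalar used the alternate opcode.
  static std::vector<int> buildAltBlendMask(const std::vector<bool> &IsAltOp) {
    const int VF = static_cast<int>(IsAltOp.size());
    std::vector<int> Mask(VF);
    for (int I = 0; I < VF; ++I)
      Mask[I] = IsAltOp[I] ? I + VF : I;
    return Mask;
  }

  int main() {
    // An add/sub/add/sub bundle with VF = 4 yields the mask {0, 5, 2, 7}.
    for (int M : buildAltBlendMask({false, true, false, true}))
      std::printf("%d ", M);
    std::printf("\n");
    return 0;
  }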
@@ -6817,9 +7675,30 @@ InstructionCost BoUpSLP::getSpillCost() const {
continue;
}
+ auto NoCallIntrinsic = [this](Instruction *I) {
+ if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+ if (II->isAssumeLikeIntrinsic())
+ return true;
+ FastMathFlags FMF;
+ SmallVector<Type *, 4> Tys;
+ for (auto &ArgOp : II->args())
+ Tys.push_back(ArgOp->getType());
+ if (auto *FPMO = dyn_cast<FPMathOperator>(II))
+ FMF = FPMO->getFastMathFlags();
+ IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
+ FMF);
+ InstructionCost IntrCost =
+ TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
+ InstructionCost CallCost = TTI->getCallInstrCost(
+ nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
+ if (IntrCost < CallCost)
+ return true;
+ }
+ return false;
+ };
+
// Debug information does not impact spill cost.
- if ((isa<CallInst>(&*PrevInstIt) &&
- !isa<DbgInfoIntrinsic>(&*PrevInstIt)) &&
+ if (isa<CallInst>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
&*PrevInstIt != PrevInst)
NumCalls++;
@@ -6843,48 +7722,6 @@ InstructionCost BoUpSLP::getSpillCost() const {
return Cost;
}
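The getSpillCost change above stops counting an intrinsic as a register-clobbering call when the cost model says lowering the intrinsic is cheaper than a real call (and never counts assume-like intrinsics). A hedged sketch of just that decision, with the two costs passed in as plain integers and an illustrative helper name:

  #include <cstdio>

  // Treat an intrinsic as a call for spill purposes only if it is assumed to
  // lower to something at least as expensive as a libcall.
  static bool countsAsCall(int IntrinsicCost, int CallCost, bool IsAssumeLike) {
    if (IsAssumeLike)
      return false;
    return IntrinsicCost >= CallCost;
  }

  int main() {
    // An intrinsic lowered to a single cheap instruction (cost 1) vs. a call
    // costed at 10 does not bump NumCalls.
    std::printf("%s\n", countsAsCall(1, 10, false) ? "call" : "not a call");
    return 0;
  }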
-/// Check if two insertelement instructions are from the same buildvector.
-static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU,
- InsertElementInst *V) {
- // Instructions must be from the same basic blocks.
- if (VU->getParent() != V->getParent())
- return false;
- // Checks if 2 insertelements are from the same buildvector.
- if (VU->getType() != V->getType())
- return false;
- // Multiple used inserts are separate nodes.
- if (!VU->hasOneUse() && !V->hasOneUse())
- return false;
- auto *IE1 = VU;
- auto *IE2 = V;
- unsigned Idx1 = *getInsertIndex(IE1);
- unsigned Idx2 = *getInsertIndex(IE2);
- // Go through the vector operand of insertelement instructions trying to find
- // either VU as the original vector for IE2 or V as the original vector for
- // IE1.
- do {
- if (IE2 == VU)
- return VU->hasOneUse();
- if (IE1 == V)
- return V->hasOneUse();
- if (IE1) {
- if ((IE1 != VU && !IE1->hasOneUse()) ||
- getInsertIndex(IE1).value_or(Idx2) == Idx2)
- IE1 = nullptr;
- else
- IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0));
- }
- if (IE2) {
- if ((IE2 != V && !IE2->hasOneUse()) ||
- getInsertIndex(IE2).value_or(Idx1) == Idx1)
- IE2 = nullptr;
- else
- IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0));
- }
- } while (IE1 || IE2);
- return false;
-}
-
/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
/// the buildvector sequence.
static bool isFirstInsertElement(const InsertElementInst *IE1,
@@ -6919,13 +7756,11 @@ namespace {
/// value, otherwise.
struct ValueSelect {
template <typename U>
- static typename std::enable_if<std::is_same<Value *, U>::value, Value *>::type
- get(Value *V) {
+ static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
return V;
}
template <typename U>
- static typename std::enable_if<!std::is_same<Value *, U>::value, U>::type
- get(Value *) {
+ static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
return U();
}
};
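The ValueSelect change is a pure spelling cleanup: the trait-based return type is rewritten with the C++14/17 `_t` and `_v` aliases, with no behavioral difference. A generic, self-contained illustration of the equivalence (not LLVM code; names are illustrative):

  #include <type_traits>

  // Pre-C++14 spelling of a SFINAE-constrained overload.
  template <typename U>
  typename std::enable_if<std::is_same<int, U>::value, int>::type
  oldGet(int V) {
    return V;
  }

  // Equivalent using the alias templates adopted by the patch.
  template <typename U>
  std::enable_if_t<std::is_same_v<int, U>, int> newGet(int V) {
    return V;
  }

  int main() { return oldGet<int>(0) + newGet<int>(0); }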
@@ -6947,19 +7782,23 @@ template <typename T>
static T *performExtractsShuffleAction(
MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
function_ref<unsigned(T *)> GetVF,
- function_ref<std::pair<T *, bool>(T *, ArrayRef<int>)> ResizeAction,
+ function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
SmallVector<int> Mask(ShuffleMask.begin()->second);
auto VMIt = std::next(ShuffleMask.begin());
T *Prev = nullptr;
- bool IsBaseNotUndef = !isUndefVector(Base);
- if (IsBaseNotUndef) {
+ SmallBitVector UseMask =
+ buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
+ SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
+ if (!IsBaseUndef.all()) {
// Base is not undef, need to combine it with the next subvectors.
- std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask);
+ std::pair<T *, bool> Res =
+ ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
+ SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
if (Mask[Idx] == UndefMaskElem)
- Mask[Idx] = Idx;
+ Mask[Idx] = IsBasePoison.test(Idx) ? UndefMaskElem : Idx;
else
Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
}
@@ -6971,7 +7810,8 @@ static T *performExtractsShuffleAction(
} else if (ShuffleMask.size() == 1) {
// Base is undef and only 1 vector is shuffled - perform the action only for
// single vector, if the mask is not the identity mask.
- std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask);
+ std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
+ /*ForSingleMask=*/true);
if (Res.second)
// Identity mask is found.
Prev = Res.first;
@@ -6995,9 +7835,10 @@ static T *performExtractsShuffleAction(
Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
} else {
// Vectors of different sizes - resize and reshuffle.
- std::pair<T *, bool> Res1 =
- ResizeAction(ShuffleMask.begin()->first, Mask);
- std::pair<T *, bool> Res2 = ResizeAction(VMIt->first, VMIt->second);
+ std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
+ /*ForSingleMask=*/false);
+ std::pair<T *, bool> Res2 =
+ ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
ArrayRef<int> SecMask = VMIt->second;
for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
if (Mask[I] != UndefMaskElem) {
@@ -7013,10 +7854,13 @@ static T *performExtractsShuffleAction(
}
VMIt = std::next(VMIt);
}
+ bool IsBaseNotUndef = !IsBaseUndef.all();
+ (void)IsBaseNotUndef;
// Perform requested actions for the remaining masks/vectors.
for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
// Shuffle other input vectors, if any.
- std::pair<T *, bool> Res = ResizeAction(VMIt->first, VMIt->second);
+ std::pair<T *, bool> Res =
+ ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
ArrayRef<int> SecMask = VMIt->second;
for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
if (SecMask[I] != UndefMaskElem) {
@@ -7041,6 +7885,18 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
TreeEntry &TE = *VectorizableTree[I];
+ if (TE.State == TreeEntry::NeedToGather) {
+ if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
+ E && E->getVectorFactor() == TE.getVectorFactor() &&
+ E->isSame(TE.Scalars)) {
+        // Some gather nodes might be exactly the same as some vectorizable
+        // nodes after reordering; this case needs to be handled.
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle that starts with "
+ << *TE.Scalars[0] << ".\n"
+ << "SLP: Current total cost = " << Cost << "\n");
+ continue;
+ }
+ }
InstructionCost C = getEntryCost(&TE, VectorizedVals);
Cost += C;
@@ -7071,24 +7927,25 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
if (isa<FixedVectorType>(EU.Scalar->getType()))
continue;
- // Already counted the cost for external uses when tried to adjust the cost
- // for extractelements, no need to add it again.
- if (isa<ExtractElementInst>(EU.Scalar))
- continue;
-
// If found user is an insertelement, do not calculate extract cost but try
// to detect it as a final shuffled/identity match.
if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
- Optional<unsigned> InsertIdx = getInsertIndex(VU);
+ std::optional<unsigned> InsertIdx = getInsertIndex(VU);
if (InsertIdx) {
const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
- auto *It =
- find_if(FirstUsers,
- [VU](const std::pair<Value *, const TreeEntry *> &Pair) {
- return areTwoInsertFromSameBuildVector(
- VU, cast<InsertElementInst>(Pair.first));
- });
+ auto *It = find_if(
+ FirstUsers,
+ [this, VU](const std::pair<Value *, const TreeEntry *> &Pair) {
+ return areTwoInsertFromSameBuildVector(
+ VU, cast<InsertElementInst>(Pair.first),
+ [this](InsertElementInst *II) -> Value * {
+ Value *Op0 = II->getOperand(0);
+ if (getTreeEntry(II) && !getTreeEntry(Op0))
+ return nullptr;
+ return Op0;
+ });
+ });
int VecId = -1;
if (It == FirstUsers.end()) {
(void)ShuffleMasks.emplace_back();
@@ -7140,6 +7997,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
// extend the extracted value back to the original type. Here, we account
// for the extract and the added cost of the sign extend if needed.
auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
if (MinBWs.count(ScalarRoot)) {
auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
@@ -7149,14 +8007,15 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
VecTy, EU.Lane);
} else {
- ExtractCost +=
- TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
+ ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
+ CostKind, EU.Lane);
}
}
InstructionCost SpillCost = getSpillCost();
Cost += SpillCost + ExtractCost;
- auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask) {
+ auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
+ bool) {
InstructionCost C = 0;
unsigned VF = Mask.size();
unsigned VecVF = TE->getVectorFactor();
@@ -7218,12 +8077,12 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
return TEs.back();
};
(void)performExtractsShuffleAction<const TreeEntry>(
- makeMutableArrayRef(Vector.data(), Vector.size()), Base,
+ MutableArrayRef(Vector.data(), Vector.size()), Base,
[](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
EstimateShufflesCost);
InstructionCost InsertCost = TTI->getScalarizationOverhead(
cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I],
- /*Insert*/ true, /*Extract*/ false);
+ /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
Cost -= InsertCost;
}
@@ -7243,22 +8102,89 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
return Cost;
}
-Optional<TargetTransformInfo::ShuffleKind>
-BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
+std::optional<TargetTransformInfo::ShuffleKind>
+BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
+ SmallVectorImpl<int> &Mask,
SmallVectorImpl<const TreeEntry *> &Entries) {
+ Entries.clear();
+ // No need to check for the topmost gather node.
+ if (TE == VectorizableTree.front().get())
+ return std::nullopt;
+ Mask.assign(VL.size(), UndefMaskElem);
+ assert(TE->UserTreeIndices.size() == 1 &&
+ "Expected only single user of the gather node.");
// TODO: currently checking only for Scalars in the tree entry, need to count
// reused elements too for better cost estimation.
- Mask.assign(TE->Scalars.size(), UndefMaskElem);
- Entries.clear();
+ Instruction &UserInst =
+ getLastInstructionInBundle(TE->UserTreeIndices.front().UserTE);
+ auto *PHI = dyn_cast<PHINode>(&UserInst);
+ auto *NodeUI = DT->getNode(
+ PHI ? PHI->getIncomingBlock(TE->UserTreeIndices.front().EdgeIdx)
+ : UserInst.getParent());
+ assert(NodeUI && "Should only process reachable instructions");
+ SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
+ auto CheckOrdering = [&](Instruction *LastEI) {
+    // Check if the user node of TE comes after the user node of EntryPtr;
+    // otherwise EntryPtr depends on TE.
+    // Gather nodes usually are not scheduled; they are inserted just before
+    // their first user node. So, instead of checking the dependency between
+    // the gather nodes themselves, we check the dependency between their user
+    // nodes. If one user node comes before the second one, we cannot use the
+    // second gather node as the source vector for the first gather node,
+    // because in the instruction list it will be emitted later.
+ auto *EntryParent = LastEI->getParent();
+ auto *NodeEUI = DT->getNode(EntryParent);
+ if (!NodeEUI)
+ return false;
+ assert((NodeUI == NodeEUI) ==
+ (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
+ "Different nodes should have different DFS numbers");
+ // Check the order of the gather nodes users.
+ if (UserInst.getParent() != EntryParent &&
+ (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
+ return false;
+ if (UserInst.getParent() == EntryParent && UserInst.comesBefore(LastEI))
+ return false;
+ return true;
+ };
  // Build a map from values to the tree entries that contain them.
DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>> ValueToTEs;
for (const std::unique_ptr<TreeEntry> &EntryPtr : VectorizableTree) {
if (EntryPtr.get() == TE)
- break;
+ continue;
if (EntryPtr->State != TreeEntry::NeedToGather)
continue;
+ if (!any_of(EntryPtr->Scalars, [&GatheredScalars](Value *V) {
+ return GatheredScalars.contains(V);
+ }))
+ continue;
+ assert(EntryPtr->UserTreeIndices.size() == 1 &&
+ "Expected only single user of the gather node.");
+ Instruction &EntryUserInst =
+ getLastInstructionInBundle(EntryPtr->UserTreeIndices.front().UserTE);
+ if (&UserInst == &EntryUserInst) {
+      // If 2 gathers are operands of the same entry, compare operand indices
+      // and use the earlier one as the base.
+ if (TE->UserTreeIndices.front().UserTE ==
+ EntryPtr->UserTreeIndices.front().UserTE &&
+ TE->UserTreeIndices.front().EdgeIdx <
+ EntryPtr->UserTreeIndices.front().EdgeIdx)
+ continue;
+ }
+    // Check if the user node of TE comes after the user node of EntryPtr;
+    // otherwise EntryPtr depends on TE.
+ auto *EntryPHI = dyn_cast<PHINode>(&EntryUserInst);
+ auto *EntryI =
+ EntryPHI
+ ? EntryPHI
+ ->getIncomingBlock(EntryPtr->UserTreeIndices.front().EdgeIdx)
+ ->getTerminator()
+ : &EntryUserInst;
+ if (!CheckOrdering(EntryI))
+ continue;
for (Value *V : EntryPtr->Scalars)
- ValueToTEs.try_emplace(V).first->getSecond().insert(EntryPtr.get());
+ if (!isConstant(V))
+ ValueToTEs.try_emplace(V).first->getSecond().insert(EntryPtr.get());
}
  // Find all tree entries used by the gathered values. If no common entries
  // are found, this is not a shuffle.
@@ -7270,7 +8196,7 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
DenseMap<Value *, int> UsedValuesEntry;
for (Value *V : TE->Scalars) {
- if (isa<UndefValue>(V))
+ if (isConstant(V))
continue;
// Build a list of tree entries where V is used.
SmallPtrSet<const TreeEntry *, 4> VToTEs;
@@ -7280,10 +8206,11 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
if (const TreeEntry *VTE = getTreeEntry(V))
VToTEs.insert(VTE);
if (VToTEs.empty())
- return None;
+ continue;
if (UsedTEs.empty()) {
// The first iteration, just insert the list of nodes to vector.
UsedTEs.push_back(VToTEs);
+ UsedValuesEntry.try_emplace(V, 0);
} else {
// Need to check if there are any previously used tree nodes which use V.
// If there are no such nodes, consider that we have another one input
@@ -7308,8 +8235,9 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
if (Idx == UsedTEs.size()) {
// If the number of input vectors is greater than 2 - not a permutation,
// fallback to the regular gather.
+ // TODO: support multiple reshuffled nodes.
if (UsedTEs.size() == 2)
- return None;
+ continue;
UsedTEs.push_back(SavedVToTEs);
Idx = UsedTEs.size() - 1;
}
@@ -7317,32 +8245,55 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
}
}
- if (UsedTEs.empty()) {
- assert(all_of(TE->Scalars, UndefValue::classof) &&
- "Expected vector of undefs only.");
- return None;
- }
+ if (UsedTEs.empty())
+ return std::nullopt;
unsigned VF = 0;
if (UsedTEs.size() == 1) {
+ // Keep the order to avoid non-determinism.
+ SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
+ UsedTEs.front().end());
+ sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
+ return TE1->Idx < TE2->Idx;
+ });
// Try to find the perfect match in another gather node at first.
- auto It = find_if(UsedTEs.front(), [TE](const TreeEntry *EntryPtr) {
- return EntryPtr->isSame(TE->Scalars);
+ auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
+ return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
});
- if (It != UsedTEs.front().end()) {
+ if (It != FirstEntries.end()) {
Entries.push_back(*It);
std::iota(Mask.begin(), Mask.end(), 0);
+ // Clear undef scalars.
+ for (int I = 0, Sz = VL.size(); I < Sz; ++I)
+ if (isa<PoisonValue>(TE->Scalars[I]))
+ Mask[I] = UndefMaskElem;
return TargetTransformInfo::SK_PermuteSingleSrc;
}
- // No perfect match, just shuffle, so choose the first tree node.
- Entries.push_back(*UsedTEs.front().begin());
+ // No perfect match, just shuffle, so choose the first tree node from the
+ // tree.
+ Entries.push_back(FirstEntries.front());
} else {
// Try to find nodes with the same vector factor.
assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
+ // Keep the order of tree nodes to avoid non-determinism.
DenseMap<int, const TreeEntry *> VFToTE;
- for (const TreeEntry *TE : UsedTEs.front())
- VFToTE.try_emplace(TE->getVectorFactor(), TE);
- for (const TreeEntry *TE : UsedTEs.back()) {
+ for (const TreeEntry *TE : UsedTEs.front()) {
+ unsigned VF = TE->getVectorFactor();
+ auto It = VFToTE.find(VF);
+ if (It != VFToTE.end()) {
+ if (It->second->Idx > TE->Idx)
+ It->getSecond() = TE;
+ continue;
+ }
+ VFToTE.try_emplace(VF, TE);
+ }
+ // Same, keep the order to avoid non-determinism.
+ SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
+ UsedTEs.back().end());
+ sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
+ return TE1->Idx < TE2->Idx;
+ });
+ for (const TreeEntry *TE : SecondEntries) {
auto It = VFToTE.find(TE->getVectorFactor());
if (It != VFToTE.end()) {
VF = It->first;
@@ -7354,40 +8305,135 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
// No 2 source vectors with the same vector factor - give up and do regular
// gather.
if (Entries.empty())
- return None;
- }
-
+ return std::nullopt;
+ }
+
+ bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, UndefValue::classof);
+  // Checks if the 2 PHIs are compatible, i.e. whether they are likely to be
+  // vectorized together.
+ auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
+ auto *PHI = cast<PHINode>(V);
+ auto *PHI1 = cast<PHINode>(V1);
+    // Check that all incoming values are compatible/from the same parent (if
+    // they are instructions).
+    // The incoming values are compatible if they are all constants, or
+    // instructions with the same/alternate opcodes from the same basic block.
+ for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
+ Value *In = PHI->getIncomingValue(I);
+ Value *In1 = PHI1->getIncomingValue(I);
+ if (isConstant(In) && isConstant(In1))
+ continue;
+ if (!getSameOpcode({In, In1}, *TLI).getOpcode())
+ return false;
+ if (cast<Instruction>(In)->getParent() !=
+ cast<Instruction>(In1)->getParent())
+ return false;
+ }
+ return true;
+ };
+  // Check if the value can be ignored during analysis for shuffled gathers.
+  // We suppose it is better to ignore instructions which do not form splats,
+  // are not vectorized and are not extractelements (those are handled by the
+  // extractelement processing), or which may form a vector node in the future.
+ auto MightBeIgnored = [=](Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ SmallVector<Value *> IgnoredVals;
+ if (UserIgnoreList)
+ IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end());
+ return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
+ !isVectorLikeInstWithConstOps(I) &&
+ !areAllUsersVectorized(I, IgnoredVals) && isSimple(I);
+ };
+  // Check that the neighbor instruction may form a full vector node with the
+  // current instruction V. This is possible if they have the same/alternate
+  // opcode and the same parent basic block.
+ auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
+ Value *V1 = VL[Idx];
+ bool UsedInSameVTE = false;
+ auto It = UsedValuesEntry.find(V1);
+ if (It != UsedValuesEntry.end())
+ UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
+ return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
+ getSameOpcode({V, V1}, *TLI).getOpcode() &&
+ cast<Instruction>(V)->getParent() ==
+ cast<Instruction>(V1)->getParent() &&
+ (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
+ };
// Build a shuffle mask for better cost estimation and vector emission.
- for (int I = 0, E = TE->Scalars.size(); I < E; ++I) {
- Value *V = TE->Scalars[I];
- if (isa<UndefValue>(V))
+ SmallBitVector UsedIdxs(Entries.size());
+ SmallVector<std::pair<unsigned, int>> EntryLanes;
+ for (int I = 0, E = VL.size(); I < E; ++I) {
+ Value *V = VL[I];
+ auto It = UsedValuesEntry.find(V);
+ if (It == UsedValuesEntry.end())
+ continue;
+    // Do not try to shuffle scalars if they are constants, or instructions
+    // that may be vectorized as part of a subsequent vector build.
+ if (isConstant(V) || (MightBeIgnored(V) &&
+ ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
+ (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
+ continue;
+ unsigned Idx = It->second;
+ EntryLanes.emplace_back(Idx, I);
+ UsedIdxs.set(Idx);
+ }
+  // Iterate through all shuffled scalars and select entries which can be used
+  // for the final shuffle.
+ SmallVector<const TreeEntry *> TempEntries;
+ for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
+ if (!UsedIdxs.test(I))
continue;
- unsigned Idx = UsedValuesEntry.lookup(V);
- const TreeEntry *VTE = Entries[Idx];
- int FoundLane = VTE->findLaneForValue(V);
- Mask[I] = Idx * VF + FoundLane;
- // Extra check required by isSingleSourceMaskImpl function (called by
- // ShuffleVectorInst::isSingleSourceMask).
- if (Mask[I] >= 2 * E)
- return None;
+    // Fix the entry number for the given scalar. If it is the first entry,
+    // set Pair.first to 0, otherwise to 1 (we currently select at most 2
+    // nodes). These indices are used as the vector offset when calculating
+    // the final shuffle mask.
+ for (std::pair<unsigned, int> &Pair : EntryLanes)
+ if (Pair.first == I)
+ Pair.first = TempEntries.size();
+ TempEntries.push_back(Entries[I]);
+ }
+ Entries.swap(TempEntries);
+ if (EntryLanes.size() == Entries.size() && !VL.equals(TE->Scalars)) {
+    // We may have only 1 or 2 entries here. If the number of scalars is equal
+    // to the number of entries, there is no need to do the analysis, as it is
+    // not very profitable. Since VL is not the same as TE->Scalars, we already
+    // have some shuffles before it. Cut off this unprofitable case.
+ Entries.clear();
+ return std::nullopt;
+ }
+ // Build the final mask, check for the identity shuffle, if possible.
+ bool IsIdentity = Entries.size() == 1;
+  // Pair.first is the offset into the vector, while Pair.second is the index
+  // of the scalar in the list.
+ for (const std::pair<unsigned, int> &Pair : EntryLanes) {
+ Mask[Pair.second] = Pair.first * VF +
+ Entries[Pair.first]->findLaneForValue(VL[Pair.second]);
+ IsIdentity &= Mask[Pair.second] == Pair.second;
}
switch (Entries.size()) {
case 1:
- return TargetTransformInfo::SK_PermuteSingleSrc;
+ if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
+ return TargetTransformInfo::SK_PermuteSingleSrc;
+ break;
case 2:
- return TargetTransformInfo::SK_PermuteTwoSrc;
+ if (EntryLanes.size() > 2 || VL.size() <= 2)
+ return TargetTransformInfo::SK_PermuteTwoSrc;
+ break;
default:
break;
}
- return None;
+ Entries.clear();
+ return std::nullopt;
}
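The tail of isGatherShuffledEntry now reports a shuffle only when it looks profitable: a single source entry must be an identity permutation, cover more than one lane, or come from a tiny bundle; two entries must cover more than two lanes or come from a tiny bundle. A compressed restatement of that decision, under those assumptions and with illustrative names:

  #include <cstdio>
  #include <optional>

  enum class ShuffleKind { PermuteSingleSrc, PermuteTwoSrc };

  // NumEntries: source tree entries selected; NumLanes: lanes covered by them;
  // BundleSize: number of scalars in the gather bundle.
  static std::optional<ShuffleKind>
  pickShuffleKind(unsigned NumEntries, unsigned NumLanes, unsigned BundleSize,
                  bool IsIdentity) {
    if (NumEntries == 1 && (IsIdentity || NumLanes > 1 || BundleSize <= 2))
      return ShuffleKind::PermuteSingleSrc;
    if (NumEntries == 2 && (NumLanes > 2 || BundleSize <= 2))
      return ShuffleKind::PermuteTwoSrc;
    return std::nullopt; // fall back to a regular gather
  }

  int main() {
    std::printf("%d\n",
                pickShuffleKind(1, 4, 8, /*IsIdentity=*/false).has_value());
    return 0;
  }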
InstructionCost BoUpSLP::getGatherCost(FixedVectorType *Ty,
const APInt &ShuffledIndices,
bool NeedToShuffle) const {
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost Cost =
TTI->getScalarizationOverhead(Ty, ~ShuffledIndices, /*Insert*/ true,
- /*Extract*/ false);
+ /*Extract*/ false, CostKind);
if (NeedToShuffle)
Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
return Cost;
@@ -7423,22 +8469,20 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
// Perform operand reordering on the instructions in VL and return the reordered
// operands in Left and Right.
-void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
- SmallVectorImpl<Value *> &Left,
- SmallVectorImpl<Value *> &Right,
- const DataLayout &DL,
- ScalarEvolution &SE,
- const BoUpSLP &R) {
+void BoUpSLP::reorderInputsAccordingToOpcode(
+ ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left,
+ SmallVectorImpl<Value *> &Right, const TargetLibraryInfo &TLI,
+ const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R) {
if (VL.empty())
return;
- VLOperands Ops(VL, DL, SE, R);
+ VLOperands Ops(VL, TLI, DL, SE, R);
// Reorder the operands in place.
Ops.reorder();
Left = Ops.getVL(0);
Right = Ops.getVL(1);
}
-void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
+Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
// Get the basic block this bundle is in. All instructions in the bundle
// should be in this block (except for extractelement-like instructions with
  // constant indices).
@@ -7487,13 +8531,34 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
return LastInst;
};
- auto &&FindFirstInst = [E, Front]() {
+ auto &&FindFirstInst = [E, Front, this]() {
Instruction *FirstInst = Front;
for (Value *V : E->Scalars) {
auto *I = dyn_cast<Instruction>(V);
if (!I)
continue;
- if (I->comesBefore(FirstInst))
+ if (FirstInst->getParent() == I->getParent()) {
+ if (I->comesBefore(FirstInst))
+ FirstInst = I;
+ continue;
+ }
+ assert(isVectorLikeInstWithConstOps(FirstInst) &&
+ isVectorLikeInstWithConstOps(I) &&
+ "Expected vector-like insts only.");
+ if (!DT->isReachableFromEntry(FirstInst->getParent())) {
+ FirstInst = I;
+ continue;
+ }
+ if (!DT->isReachableFromEntry(I->getParent()))
+ continue;
+ auto *NodeA = DT->getNode(FirstInst->getParent());
+ auto *NodeB = DT->getNode(I->getParent());
+ assert(NodeA && "Should only process reachable instructions");
+ assert(NodeB && "Should only process reachable instructions");
+ assert((NodeA == NodeB) ==
+ (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
+ "Different nodes should have different DFS numbers");
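+      // The node with the smaller DFS-in number is visited earlier in the
+      // dominator tree walk, so its instruction is treated as the first one.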
+ if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
FirstInst = I;
}
return FirstInst;
@@ -7502,19 +8567,16 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
// Set the insert point to the beginning of the basic block if the entry
// should not be scheduled.
if (E->State != TreeEntry::NeedToGather &&
- doesNotNeedToSchedule(E->Scalars)) {
+ (doesNotNeedToSchedule(E->Scalars) ||
+ all_of(E->Scalars, isVectorLikeInstWithConstOps))) {
Instruction *InsertInst;
- if (all_of(E->Scalars, isUsedOutsideBlock))
+ if (all_of(E->Scalars, [](Value *V) {
+ return !isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V);
+ }))
InsertInst = FindLastInst();
else
InsertInst = FindFirstInst();
- // If the instruction is PHI, set the insert point after all the PHIs.
- if (isa<PHINode>(InsertInst))
- InsertInst = BB->getFirstNonPHI();
- BasicBlock::iterator InsertPt = InsertInst->getIterator();
- Builder.SetInsertPoint(BB, InsertPt);
- Builder.SetCurrentDebugLocation(Front->getDebugLoc());
- return;
+ return *InsertInst;
}
// The last instruction in the bundle in program order.
@@ -7553,17 +8615,29 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
// not ideal. However, this should be exceedingly rare since it requires that
// we both exit early from buildTree_rec and that the bundle be out-of-order
// (causing us to iterate all the way to the end of the block).
- if (!LastInst) {
+ if (!LastInst)
LastInst = FindLastInst();
- // If the instruction is PHI, set the insert point after all the PHIs.
- if (isa<PHINode>(LastInst))
- LastInst = BB->getFirstNonPHI()->getPrevNode();
- }
assert(LastInst && "Failed to find last instruction in bundle");
+ return *LastInst;
+}
- // Set the insertion point after the last instruction in the bundle. Set the
- // debug location to Front.
- Builder.SetInsertPoint(BB, std::next(LastInst->getIterator()));
+void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
+ auto *Front = E->getMainOp();
+ Instruction *LastInst = EntryToLastInstruction.lookup(E);
+ assert(LastInst && "Failed to find last instruction in bundle");
+ // If the instruction is PHI, set the insert point after all the PHIs.
+ bool IsPHI = isa<PHINode>(LastInst);
+ if (IsPHI)
+ LastInst = LastInst->getParent()->getFirstNonPHI();
+ if (IsPHI || (E->State != TreeEntry::NeedToGather &&
+ doesNotNeedToSchedule(E->Scalars))) {
+ Builder.SetInsertPoint(LastInst);
+ } else {
+ // Set the insertion point after the last instruction in the bundle. Set the
+ // debug location to Front.
+ Builder.SetInsertPoint(LastInst->getParent(),
+ std::next(LastInst->getIterator()));
+ }
Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
@@ -7594,7 +8668,7 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL) {
auto *InsElt = dyn_cast<InsertElementInst>(Vec);
if (!InsElt)
return Vec;
- GatherShuffleSeq.insert(InsElt);
+ GatherShuffleExtractSeq.insert(InsElt);
CSEBlocks.insert(InsElt->getParent());
// Add to our 'need-to-extract' list.
if (TreeEntry *Entry = getTreeEntry(V)) {
@@ -7630,196 +8704,452 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL) {
return Vec;
}
-namespace {
-/// Merges shuffle masks and emits final shuffle instruction, if required.
-class ShuffleInstructionBuilder {
- IRBuilderBase &Builder;
- const unsigned VF = 0;
+/// Merges shuffle masks and emits the final shuffle instruction, if required.
+/// It supports shuffling of 2 input vectors. It implements lazy shuffle
+/// emission: the actual shuffle instruction is generated only if it is really
+/// required; otherwise its emission is delayed until the end of the process,
+/// to reduce the number of emitted instructions and simplify further
+/// analysis/transformations.
+/// The class will also look through the previously emitted shuffle
+/// instructions and properly mark indices in the mask as undef.
+/// For example, given the code
+/// \code
+/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
+/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
+/// \endcode
+/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it
+/// will look through %s1 and %s2 and emit
+/// \code
+/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
+/// \endcode
+/// instead.
+/// If the 2 operands are of different sizes, the smaller one is resized and
+/// the mask is recalculated accordingly.
+/// For example, given the code
+/// \code
+/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
+/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
+/// \endcode
+/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it
+/// will look through %s1 and %s2 and emit
+/// \code
+/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
+/// \endcode
+/// instead.
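+/// A typical (illustrative) use from the vectorizer looks like
+/// \code
+///   ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
+///   ShuffleBuilder.add(Vec, Mask);   // operands/masks are recorded lazily
+///   Value *Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
+/// \endcode
+/// where the names above are only placeholders for the caller's values.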
+class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
bool IsFinalized = false;
- SmallVector<int, 4> Mask;
- /// Holds all of the instructions that we gathered.
- SetVector<Instruction *> &GatherShuffleSeq;
- /// A list of blocks that we are going to CSE.
- SetVector<BasicBlock *> &CSEBlocks;
+ /// Combined mask for all applied operands and masks. It is built during
+ /// analysis and actual emission of shuffle vector instructions.
+ SmallVector<int> CommonMask;
+  /// List of operands for the shuffle vector instruction. It holds at most 2
+  /// operands. If a 3rd one is going to be added, the first 2 are combined
+  /// into a shuffle with the \p CommonMask mask, the first operand is set to
+  /// the resulting shuffle and the second operand is set to the newly added
+  /// operand. The \p CommonMask is transformed accordingly after that.
+ SmallVector<Value *, 2> InVectors;
+ IRBuilderBase &Builder;
+ BoUpSLP &R;
-public:
- ShuffleInstructionBuilder(IRBuilderBase &Builder, unsigned VF,
- SetVector<Instruction *> &GatherShuffleSeq,
- SetVector<BasicBlock *> &CSEBlocks)
- : Builder(Builder), VF(VF), GatherShuffleSeq(GatherShuffleSeq),
- CSEBlocks(CSEBlocks) {}
-
- /// Adds a mask, inverting it before applying.
- void addInversedMask(ArrayRef<unsigned> SubMask) {
- if (SubMask.empty())
- return;
- SmallVector<int, 4> NewMask;
- inversePermutation(SubMask, NewMask);
- addMask(NewMask);
- }
+ class ShuffleIRBuilder {
+ IRBuilderBase &Builder;
+ /// Holds all of the instructions that we gathered.
+ SetVector<Instruction *> &GatherShuffleExtractSeq;
+ /// A list of blocks that we are going to CSE.
+ SetVector<BasicBlock *> &CSEBlocks;
+
+ public:
+ ShuffleIRBuilder(IRBuilderBase &Builder,
+ SetVector<Instruction *> &GatherShuffleExtractSeq,
+ SetVector<BasicBlock *> &CSEBlocks)
+ : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
+ CSEBlocks(CSEBlocks) {}
+ ~ShuffleIRBuilder() = default;
+ /// Creates shufflevector for the 2 operands with the given mask.
+ Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
+ Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
+ if (auto *I = dyn_cast<Instruction>(Vec)) {
+ GatherShuffleExtractSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ return Vec;
+ }
+    /// Creates a permutation of the single vector operand with the given
+    /// mask, if it is not the identity mask.
+ Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
+ if (Mask.empty())
+ return V1;
+ unsigned VF = Mask.size();
+ unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
+ if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask))
+ return V1;
+ Value *Vec = Builder.CreateShuffleVector(V1, Mask);
+ if (auto *I = dyn_cast<Instruction>(Vec)) {
+ GatherShuffleExtractSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ return Vec;
+ }
+    /// Resizes the 2 input vectors to match their sizes, if they are not equal
+    /// yet. The smaller vector is resized to the size of the larger vector.
+ void resizeToMatch(Value *&V1, Value *&V2) {
+ if (V1->getType() == V2->getType())
+ return;
+ int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
+ int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
+ int VF = std::max(V1VF, V2VF);
+ int MinVF = std::min(V1VF, V2VF);
+ SmallVector<int> IdentityMask(VF, UndefMaskElem);
+ std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
+ 0);
+ Value *&Op = MinVF == V1VF ? V1 : V2;
+ Op = Builder.CreateShuffleVector(Op, IdentityMask);
+ if (auto *I = dyn_cast<Instruction>(Op)) {
+ GatherShuffleExtractSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ if (MinVF == V1VF)
+ V1 = Op;
+ else
+ V2 = Op;
+ }
+ };
- /// Functions adds masks, merging them into single one.
- void addMask(ArrayRef<unsigned> SubMask) {
- SmallVector<int, 4> NewMask(SubMask.begin(), SubMask.end());
- addMask(NewMask);
+  /// Smart shuffle instruction emission, walks through the shuffle trees and
+ /// tries to find the best matching vector for the actual shuffle
+ /// instruction.
+ Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
+ assert(V1 && "Expected at least one vector value.");
+ ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
+ R.CSEBlocks);
+ return BaseShuffleAnalysis::createShuffle(V1, V2, Mask, ShuffleBuilder);
}
- void addMask(ArrayRef<int> SubMask) { ::addMask(Mask, SubMask); }
+  /// Transforms the \p CommonMask mask according to the given \p Mask, so it
+  /// is properly set up after the shuffle has been emitted.
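+  /// For illustration: transforming CommonMask = <1, 0, -1, 2> against itself
+  /// after the shuffle for it has been emitted yields <0, 1, -1, 3>, i.e. the
+  /// defined lanes now read the just-emitted vector in place.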
+ static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
+ ArrayRef<int> Mask) {
+ for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
+ if (Mask[Idx] != UndefMaskElem)
+ CommonMask[Idx] = Idx;
+ }
- Value *finalize(Value *V) {
+public:
+ ShuffleInstructionBuilder(IRBuilderBase &Builder, BoUpSLP &R)
+ : Builder(Builder), R(R) {}
+
+ /// Adds 2 input vectors and the mask for their shuffling.
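+  /// If 2 operands are already recorded, they are first combined into a
+  /// single shuffle using the current \p CommonMask, and that shuffle becomes
+  /// the first operand for the newly added pair. E.g. (illustrative):
+  /// \code
+  ///   ShuffleBuilder.add(A, B, M1);
+  ///   ShuffleBuilder.add(C, D, M2);
+  ///   // InVectors = {shuffle(A, B, M1), shuffle(C, D, M2)}
+  /// \endcode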
+ void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
+ assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
+ if (InVectors.empty()) {
+ InVectors.push_back(V1);
+ InVectors.push_back(V2);
+ CommonMask.assign(Mask.begin(), Mask.end());
+ return;
+ }
+ Value *Vec = InVectors.front();
+ if (InVectors.size() == 2) {
+ Vec = createShuffle(Vec, InVectors.back(), CommonMask);
+ transformMaskAfterShuffle(CommonMask, Mask);
+ } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
+ Mask.size()) {
+ Vec = createShuffle(Vec, nullptr, CommonMask);
+ transformMaskAfterShuffle(CommonMask, Mask);
+ }
+ V1 = createShuffle(V1, V2, Mask);
+ for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
+ if (Mask[Idx] != UndefMaskElem)
+ CommonMask[Idx] = Idx + Sz;
+ InVectors.front() = Vec;
+ if (InVectors.size() == 2)
+ InVectors.back() = V1;
+ else
+ InVectors.push_back(V1);
+ }
+  /// Adds one more input vector and the mask for the shuffling.
+ void add(Value *V1, ArrayRef<int> Mask) {
+ if (InVectors.empty()) {
+ if (!isa<FixedVectorType>(V1->getType())) {
+ V1 = createShuffle(V1, nullptr, CommonMask);
+ CommonMask.assign(Mask.size(), UndefMaskElem);
+ transformMaskAfterShuffle(CommonMask, Mask);
+ }
+ InVectors.push_back(V1);
+ CommonMask.assign(Mask.begin(), Mask.end());
+ return;
+ }
+ const auto *It = find(InVectors, V1);
+ if (It == InVectors.end()) {
+ if (InVectors.size() == 2 ||
+ InVectors.front()->getType() != V1->getType() ||
+ !isa<FixedVectorType>(V1->getType())) {
+ Value *V = InVectors.front();
+ if (InVectors.size() == 2) {
+ V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
+ transformMaskAfterShuffle(CommonMask, CommonMask);
+ } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
+ CommonMask.size()) {
+ V = createShuffle(InVectors.front(), nullptr, CommonMask);
+ transformMaskAfterShuffle(CommonMask, CommonMask);
+ }
+ for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
+ if (CommonMask[Idx] == UndefMaskElem && Mask[Idx] != UndefMaskElem)
+ CommonMask[Idx] =
+ V->getType() != V1->getType()
+ ? Idx + Sz
+ : Mask[Idx] + cast<FixedVectorType>(V1->getType())
+ ->getNumElements();
+ if (V->getType() != V1->getType())
+ V1 = createShuffle(V1, nullptr, Mask);
+ InVectors.front() = V;
+ if (InVectors.size() == 2)
+ InVectors.back() = V1;
+ else
+ InVectors.push_back(V1);
+ return;
+ }
+      // Check if the second vector is required, i.e. if some of the used
+      // elements are not already covered by the first one.
+ for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
+ if (Mask[Idx] != UndefMaskElem && CommonMask[Idx] == UndefMaskElem) {
+ InVectors.push_back(V1);
+ break;
+ }
+ }
+ int VF = CommonMask.size();
+ if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
+ VF = FTy->getNumElements();
+ for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
+ if (Mask[Idx] != UndefMaskElem && CommonMask[Idx] == UndefMaskElem)
+ CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
+ }
+  /// Adds one more input vector and the mask for the shuffling.
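+  /// The \p Order is inverted into a shuffle mask first, e.g. (illustrative)
+  /// the order <2, 0, 1> becomes the mask <1, 2, 0>.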
+ void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
+ SmallVector<int> NewMask;
+ inversePermutation(Order, NewMask);
+ add(V1, NewMask);
+ }
+ /// Finalize emission of the shuffles.
+  Value *finalize(ArrayRef<int> ExtMask = std::nullopt) {
IsFinalized = true;
- unsigned ValueVF = cast<FixedVectorType>(V->getType())->getNumElements();
- if (VF == ValueVF && Mask.empty())
- return V;
- SmallVector<int, 4> NormalizedMask(VF, UndefMaskElem);
- std::iota(NormalizedMask.begin(), NormalizedMask.end(), 0);
- addMask(NormalizedMask);
-
- if (VF == ValueVF && ShuffleVectorInst::isIdentityMask(Mask))
- return V;
- Value *Vec = Builder.CreateShuffleVector(V, Mask, "shuffle");
- if (auto *I = dyn_cast<Instruction>(Vec)) {
- GatherShuffleSeq.insert(I);
- CSEBlocks.insert(I->getParent());
+ if (!ExtMask.empty()) {
+ if (CommonMask.empty()) {
+ CommonMask.assign(ExtMask.begin(), ExtMask.end());
+ } else {
+ SmallVector<int> NewMask(ExtMask.size(), UndefMaskElem);
+ for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
+ if (ExtMask[I] == UndefMaskElem)
+ continue;
+ NewMask[I] = CommonMask[ExtMask[I]];
+ }
+ CommonMask.swap(NewMask);
+ }
}
- return Vec;
+ if (CommonMask.empty()) {
+ assert(InVectors.size() == 1 && "Expected only one vector with no mask");
+ return InVectors.front();
+ }
+ if (InVectors.size() == 2)
+ return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
+ return createShuffle(InVectors.front(), nullptr, CommonMask);
}
~ShuffleInstructionBuilder() {
- assert((IsFinalized || Mask.empty()) &&
+ assert((IsFinalized || CommonMask.empty()) &&
"Shuffle construction must be finalized.");
}
};
-} // namespace
-Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
+Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
+ ArrayRef<Value *> VL = E->getOperand(NodeIdx);
const unsigned VF = VL.size();
- InstructionsState S = getSameOpcode(VL);
+ InstructionsState S = getSameOpcode(VL, *TLI);
// Special processing for GEPs bundle, which may include non-gep values.
if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
const auto *It =
find_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); });
if (It != VL.end())
- S = getSameOpcode(*It);
+ S = getSameOpcode(*It, *TLI);
}
if (S.getOpcode()) {
- if (TreeEntry *E = getTreeEntry(S.OpValue))
- if (E->isSame(VL)) {
- Value *V = vectorizeTree(E);
- if (VF != cast<FixedVectorType>(V->getType())->getNumElements()) {
- if (!E->ReuseShuffleIndices.empty()) {
- // Reshuffle to get only unique values.
- // If some of the scalars are duplicated in the vectorization tree
- // entry, we do not vectorize them but instead generate a mask for
- // the reuses. But if there are several users of the same entry,
- // they may have different vectorization factors. This is especially
- // important for PHI nodes. In this case, we need to adapt the
- // resulting instruction for the user vectorization factor and have
- // to reshuffle it again to take only unique elements of the vector.
- // Without this code the function incorrectly returns reduced vector
- // instruction with the same elements, not with the unique ones.
-
- // block:
- // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
- // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
- // ... (use %2)
- // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
- // br %block
- SmallVector<int> UniqueIdxs(VF, UndefMaskElem);
- SmallSet<int, 4> UsedIdxs;
- int Pos = 0;
- int Sz = VL.size();
- for (int Idx : E->ReuseShuffleIndices) {
- if (Idx != Sz && Idx != UndefMaskElem &&
- UsedIdxs.insert(Idx).second)
- UniqueIdxs[Idx] = Pos;
- ++Pos;
- }
- assert(VF >= UsedIdxs.size() && "Expected vectorization factor "
- "less than original vector size.");
- UniqueIdxs.append(VF - UsedIdxs.size(), UndefMaskElem);
- V = Builder.CreateShuffleVector(V, UniqueIdxs, "shrink.shuffle");
- } else {
- assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
- "Expected vectorization factor less "
- "than original vector size.");
- SmallVector<int> UniformMask(VF, 0);
- std::iota(UniformMask.begin(), UniformMask.end(), 0);
- V = Builder.CreateShuffleVector(V, UniformMask, "shrink.shuffle");
- }
- if (auto *I = dyn_cast<Instruction>(V)) {
- GatherShuffleSeq.insert(I);
- CSEBlocks.insert(I->getParent());
+ if (TreeEntry *VE = getTreeEntry(S.OpValue);
+ VE && VE->isSame(VL) &&
+ (any_of(VE->UserTreeIndices,
+ [E, NodeIdx](const EdgeInfo &EI) {
+ return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
+ }) ||
+ any_of(VectorizableTree,
+ [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
+ return TE->isOperandGatherNode({E, NodeIdx}) &&
+ VE->isSame(TE->Scalars);
+ }))) {
+ auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
+ ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
+ ShuffleBuilder.add(V, Mask);
+ return ShuffleBuilder.finalize(std::nullopt);
+ };
+ Value *V = vectorizeTree(VE);
+ if (VF != cast<FixedVectorType>(V->getType())->getNumElements()) {
+ if (!VE->ReuseShuffleIndices.empty()) {
+ // Reshuffle to get only unique values.
+ // If some of the scalars are duplicated in the vectorization
+ // tree entry, we do not vectorize them but instead generate a
+ // mask for the reuses. But if there are several users of the
+ // same entry, they may have different vectorization factors.
+ // This is especially important for PHI nodes. In this case, we
+ // need to adapt the resulting instruction for the user
+ // vectorization factor and have to reshuffle it again to take
+ // only unique elements of the vector. Without this code the
+ // function incorrectly returns reduced vector instruction with
+ // the same elements, not with the unique ones.
+
+ // block:
+ // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
+ // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
+ // ... (use %2)
+ // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
+ // br %block
+ SmallVector<int> UniqueIdxs(VF, UndefMaskElem);
+ SmallSet<int, 4> UsedIdxs;
+ int Pos = 0;
+ for (int Idx : VE->ReuseShuffleIndices) {
+ if (Idx != static_cast<int>(VF) && Idx != UndefMaskElem &&
+ UsedIdxs.insert(Idx).second)
+ UniqueIdxs[Idx] = Pos;
+ ++Pos;
}
+ assert(VF >= UsedIdxs.size() && "Expected vectorization factor "
+ "less than original vector size.");
+ UniqueIdxs.append(VF - UsedIdxs.size(), UndefMaskElem);
+ V = FinalShuffle(V, UniqueIdxs);
+ } else {
+ assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
+ "Expected vectorization factor less "
+ "than original vector size.");
+ SmallVector<int> UniformMask(VF, 0);
+ std::iota(UniformMask.begin(), UniformMask.end(), 0);
+ V = FinalShuffle(V, UniformMask);
}
- return V;
}
+ return V;
+ }
}
- // Can't vectorize this, so simply build a new vector with each lane
- // corresponding to the requested value.
- return createBuildVector(VL);
+ // Find the corresponding gather entry and vectorize it.
+  // This allows us to be more accurate with the tree/graph transformations and
+  // checks the correctness of the transformations in many cases.
+ auto *I = find_if(VectorizableTree,
+ [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
+ return TE->isOperandGatherNode({E, NodeIdx});
+ });
+ assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
+ assert(I->get()->UserTreeIndices.size() == 1 &&
+ "Expected only single user for the gather node.");
+ assert(I->get()->isSame(VL) && "Expected same list of scalars.");
+ IRBuilder<>::InsertPointGuard Guard(Builder);
+ if (E->getOpcode() != Instruction::InsertElement &&
+ E->getOpcode() != Instruction::PHI) {
+ Instruction *LastInst = EntryToLastInstruction.lookup(E);
+ assert(LastInst && "Failed to find last instruction in bundle");
+ Builder.SetInsertPoint(LastInst);
+ }
+ return vectorizeTree(I->get());
}
-Value *BoUpSLP::createBuildVector(ArrayRef<Value *> VL) {
- assert(any_of(VectorizableTree,
- [VL](const std::unique_ptr<TreeEntry> &TE) {
- return TE->State == TreeEntry::NeedToGather && TE->isSame(VL);
- }) &&
- "Non-matching gather node.");
- unsigned VF = VL.size();
- // Exploit possible reuse of values across lanes.
- SmallVector<int> ReuseShuffleIndicies;
- SmallVector<Value *> UniqueValues;
- if (VL.size() > 2) {
+
+Value *BoUpSLP::createBuildVector(const TreeEntry *E) {
+ assert(E->State == TreeEntry::NeedToGather && "Expected gather node.");
+ unsigned VF = E->getVectorFactor();
+
+ ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
+ SmallVector<Value *> Gathered(
+ VF, PoisonValue::get(E->Scalars.front()->getType()));
+ bool NeedFreeze = false;
+ SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
+  // Build a mask out of the reorder indices and reorder the scalars per this
+  // mask.
+ SmallVector<int> ReorderMask;
+ inversePermutation(E->ReorderIndices, ReorderMask);
+ if (!ReorderMask.empty())
+ reorderScalars(VL, ReorderMask);
+ SmallVector<int> ReuseMask(VF, UndefMaskElem);
+ if (!allConstant(VL)) {
+    // For splats we can emit broadcasts instead of gathers, so try to find
+    // such sequences.
+ bool IsSplat = isSplat(VL) && (VL.size() > 2 || VL.front() == VL.back());
+ SmallVector<int> UndefPos;
DenseMap<Value *, unsigned> UniquePositions;
- unsigned NumValues =
- std::distance(VL.begin(), find_if(reverse(VL), [](Value *V) {
- return !isa<UndefValue>(V);
- }).base());
- VF = std::max<unsigned>(VF, PowerOf2Ceil(NumValues));
- int UniqueVals = 0;
- for (Value *V : VL.drop_back(VL.size() - VF)) {
+ // Gather unique non-const values and all constant values.
+ // For repeated values, just shuffle them.
+ for (auto [I, V] : enumerate(VL)) {
if (isa<UndefValue>(V)) {
- ReuseShuffleIndicies.emplace_back(UndefMaskElem);
+ if (!isa<PoisonValue>(V)) {
+ Gathered[I] = V;
+ ReuseMask[I] = I;
+ UndefPos.push_back(I);
+ }
continue;
}
if (isConstant(V)) {
- ReuseShuffleIndicies.emplace_back(UniqueValues.size());
- UniqueValues.emplace_back(V);
+ Gathered[I] = V;
+ ReuseMask[I] = I;
continue;
}
- auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
- ReuseShuffleIndicies.emplace_back(Res.first->second);
- if (Res.second) {
- UniqueValues.emplace_back(V);
- ++UniqueVals;
- }
- }
- if (UniqueVals == 1 && UniqueValues.size() == 1) {
- // Emit pure splat vector.
- ReuseShuffleIndicies.append(VF - ReuseShuffleIndicies.size(),
- UndefMaskElem);
- } else if (UniqueValues.size() >= VF - 1 || UniqueValues.size() <= 1) {
- if (UniqueValues.empty()) {
- assert(all_of(VL, UndefValue::classof) && "Expected list of undefs.");
- NumValues = VF;
+ if (IsSplat) {
+ Gathered.front() = V;
+ ReuseMask[I] = 0;
+ } else {
+ const auto Res = UniquePositions.try_emplace(V, I);
+ Gathered[Res.first->second] = V;
+ ReuseMask[I] = Res.first->second;
+ }
+ }
+ if (!UndefPos.empty() && IsSplat) {
+      // For undef values, try to replace them with a simple broadcast.
+ // We can do it if the broadcasted value is guaranteed to be
+ // non-poisonous, or by freezing the incoming scalar value first.
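+      // E.g. (illustrative) for the scalars <%a, undef, undef, %a>: if %a is
+      // known not to be poison, the undef lanes simply reuse lane 0 and a
+      // plain broadcast of %a is emitted; otherwise the undef lanes are kept
+      // undefined and the final vector is frozen.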
+ auto *It = find_if(Gathered, [this, E](Value *V) {
+ return !isa<UndefValue>(V) &&
+ (getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
+ any_of(V->uses(), [E](const Use &U) {
+                 // Check if the value is already used in the same operation
+                 // in one of the nodes.
+ return E->UserTreeIndices.size() == 1 &&
+ is_contained(
+ E->UserTreeIndices.front().UserTE->Scalars,
+ U.getUser()) &&
+ E->UserTreeIndices.front().EdgeIdx != U.getOperandNo();
+ }));
+ });
+ if (It != Gathered.end()) {
+ // Replace undefs by the non-poisoned scalars and emit broadcast.
+ int Pos = std::distance(Gathered.begin(), It);
+ for_each(UndefPos, [&](int I) {
+ // Set the undef position to the non-poisoned scalar.
+ ReuseMask[I] = Pos;
+            // Replace the undef by poison; in the mask it is already replaced
+            // by the non-poisoned scalar.
+ if (I != Pos)
+ Gathered[I] = PoisonValue::get(Gathered[I]->getType());
+ });
+ } else {
+ // Replace undefs by the poisons, emit broadcast and then emit
+ // freeze.
+ for_each(UndefPos, [&](int I) {
+ ReuseMask[I] = UndefMaskElem;
+ if (isa<UndefValue>(Gathered[I]))
+ Gathered[I] = PoisonValue::get(Gathered[I]->getType());
+ });
+ NeedFreeze = true;
}
- ReuseShuffleIndicies.clear();
- UniqueValues.clear();
- UniqueValues.append(VL.begin(), std::next(VL.begin(), NumValues));
}
- UniqueValues.append(VF - UniqueValues.size(),
- PoisonValue::get(VL[0]->getType()));
- VL = UniqueValues;
- }
-
- ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleSeq,
- CSEBlocks);
- Value *Vec = gather(VL);
- if (!ReuseShuffleIndicies.empty()) {
- ShuffleBuilder.addMask(ReuseShuffleIndicies);
- Vec = ShuffleBuilder.finalize(Vec);
- }
+ } else {
+ ReuseMask.clear();
+ copy(VL, Gathered.begin());
+ }
+ // Gather unique scalars and all constants.
+ Value *Vec = gather(Gathered);
+ ShuffleBuilder.add(Vec, ReuseMask);
+ Vec = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
+ if (NeedFreeze)
+ Vec = Builder.CreateFreeze(Vec);
return Vec;
}
@@ -7831,34 +9161,55 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return E->VectorizedValue;
}
- bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
- unsigned VF = E->getVectorFactor();
- ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleSeq,
- CSEBlocks);
+ auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
+ ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
+ if (E->State != TreeEntry::NeedToGather &&
+ E->getOpcode() == Instruction::Store) {
+ ArrayRef<int> Mask =
+ ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
+ E->ReorderIndices.size());
+ ShuffleBuilder.add(V, Mask);
+ } else {
+ ShuffleBuilder.addOrdered(V, E->ReorderIndices);
+ }
+ return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
+ };
+
if (E->State == TreeEntry::NeedToGather) {
+ if (E->Idx > 0) {
+ // We are in the middle of a vectorizable chain. We need to gather the
+ // scalars from the users.
+ Value *Vec = createBuildVector(E);
+ E->VectorizedValue = Vec;
+ return Vec;
+ }
if (E->getMainOp())
setInsertPointAfterBundle(E);
+ SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
+ // Build a mask out of the reorder indices and reorder scalars per this
+ // mask.
+ SmallVector<int> ReorderMask;
+ inversePermutation(E->ReorderIndices, ReorderMask);
+ if (!ReorderMask.empty())
+ reorderScalars(GatheredScalars, ReorderMask);
Value *Vec;
SmallVector<int> Mask;
SmallVector<const TreeEntry *> Entries;
- Optional<TargetTransformInfo::ShuffleKind> Shuffle =
- isGatherShuffledEntry(E, Mask, Entries);
+ std::optional<TargetTransformInfo::ShuffleKind> Shuffle =
+ isGatherShuffledEntry(E, GatheredScalars, Mask, Entries);
if (Shuffle) {
assert((Entries.size() == 1 || Entries.size() == 2) &&
"Expected shuffle of 1 or 2 entries.");
Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue,
Entries.back()->VectorizedValue, Mask);
if (auto *I = dyn_cast<Instruction>(Vec)) {
- GatherShuffleSeq.insert(I);
+ GatherShuffleExtractSeq.insert(I);
CSEBlocks.insert(I->getParent());
}
} else {
Vec = gather(E->Scalars);
}
- if (NeedToShuffleReuses) {
- ShuffleBuilder.addMask(E->ReuseShuffleIndices);
- Vec = ShuffleBuilder.finalize(Vec);
- }
+ Vec = FinalShuffle(Vec, E);
E->VectorizedValue = Vec;
return Vec;
}
@@ -7891,9 +9242,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Builder.SetInsertPoint(&*PH->getParent()->getFirstInsertionPt());
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
- ShuffleBuilder.addInversedMask(E->ReorderIndices);
- ShuffleBuilder.addMask(E->ReuseShuffleIndices);
- V = ShuffleBuilder.finalize(V);
+ V = FinalShuffle(V, E);
E->VectorizedValue = V;
@@ -7905,6 +9254,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
ValueList Operands;
BasicBlock *IBB = PH->getIncomingBlock(i);
+ // Stop emission if all incoming values are generated.
+ if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return V;
+ }
+
if (!VisitedBBs.insert(IBB).second) {
NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
continue;
@@ -7912,7 +9267,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Builder.SetInsertPoint(IBB->getTerminator());
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
- Value *Vec = vectorizeTree(E->getOperand(i));
+ Value *Vec = vectorizeOperand(E, i);
NewPhi->addIncoming(Vec, IBB);
}
@@ -7923,10 +9278,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::ExtractElement: {
Value *V = E->getSingleOperand(0);
- Builder.SetInsertPoint(VL0);
- ShuffleBuilder.addInversedMask(E->ReorderIndices);
- ShuffleBuilder.addMask(E->ReuseShuffleIndices);
- V = ShuffleBuilder.finalize(V);
+ setInsertPointAfterBundle(E);
+ V = FinalShuffle(V, E);
E->VectorizedValue = V;
return V;
}
@@ -7937,16 +9290,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
Value *NewV = propagateMetadata(V, E->Scalars);
- ShuffleBuilder.addInversedMask(E->ReorderIndices);
- ShuffleBuilder.addMask(E->ReuseShuffleIndices);
- NewV = ShuffleBuilder.finalize(NewV);
+ NewV = FinalShuffle(NewV, E);
E->VectorizedValue = NewV;
return NewV;
}
case Instruction::InsertElement: {
assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
- Value *V = vectorizeTree(E->getOperand(1));
+ Value *V = vectorizeOperand(E, 1);
// Create InsertVector shuffle if necessary
auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
@@ -7981,27 +9332,58 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (!IsIdentity || NumElts != NumScalars) {
V = Builder.CreateShuffleVector(V, Mask);
if (auto *I = dyn_cast<Instruction>(V)) {
- GatherShuffleSeq.insert(I);
+ GatherShuffleExtractSeq.insert(I);
CSEBlocks.insert(I->getParent());
}
}
- if ((!IsIdentity || Offset != 0 ||
- !isUndefVector(FirstInsert->getOperand(0))) &&
+ SmallVector<int> InsertMask(NumElts, UndefMaskElem);
+ for (unsigned I = 0; I < NumElts; I++) {
+ if (Mask[I] != UndefMaskElem)
+ InsertMask[Offset + I] = I;
+ }
+ SmallBitVector UseMask =
+ buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
+ SmallBitVector IsFirstUndef =
+ isUndefVector(FirstInsert->getOperand(0), UseMask);
+ if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
NumElts != NumScalars) {
- SmallVector<int> InsertMask(NumElts);
- std::iota(InsertMask.begin(), InsertMask.end(), 0);
- for (unsigned I = 0; I < NumElts; I++) {
- if (Mask[I] != UndefMaskElem)
- InsertMask[Offset + I] = NumElts + I;
- }
-
- V = Builder.CreateShuffleVector(
- FirstInsert->getOperand(0), V, InsertMask,
- cast<Instruction>(E->Scalars.back())->getName());
- if (auto *I = dyn_cast<Instruction>(V)) {
- GatherShuffleSeq.insert(I);
- CSEBlocks.insert(I->getParent());
+ if (IsFirstUndef.all()) {
+ if (!ShuffleVectorInst::isIdentityMask(InsertMask)) {
+ SmallBitVector IsFirstPoison =
+ isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
+ if (!IsFirstPoison.all()) {
+ for (unsigned I = 0; I < NumElts; I++) {
+ if (InsertMask[I] == UndefMaskElem && !IsFirstPoison.test(I))
+ InsertMask[I] = I + NumElts;
+ }
+ }
+ V = Builder.CreateShuffleVector(
+ V,
+ IsFirstPoison.all() ? PoisonValue::get(V->getType())
+ : FirstInsert->getOperand(0),
+ InsertMask, cast<Instruction>(E->Scalars.back())->getName());
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ GatherShuffleExtractSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ }
+ } else {
+ SmallBitVector IsFirstPoison =
+ isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
+ for (unsigned I = 0; I < NumElts; I++) {
+ if (InsertMask[I] == UndefMaskElem)
+ InsertMask[I] = IsFirstPoison.test(I) ? UndefMaskElem : I;
+ else
+ InsertMask[I] += NumElts;
+ }
+ V = Builder.CreateShuffleVector(
+ FirstInsert->getOperand(0), V, InsertMask,
+ cast<Instruction>(E->Scalars.back())->getName());
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ GatherShuffleExtractSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
}
}
@@ -8023,8 +9405,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::BitCast: {
setInsertPointAfterBundle(E);
- Value *InVec = vectorizeTree(E->getOperand(0));
-
+ Value *InVec = vectorizeOperand(E, 0);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
@@ -8032,9 +9413,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
auto *CI = cast<CastInst>(VL0);
Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
- ShuffleBuilder.addInversedMask(E->ReorderIndices);
- ShuffleBuilder.addMask(E->ReuseShuffleIndices);
- V = ShuffleBuilder.finalize(V);
+ V = FinalShuffle(V, E);
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -8044,9 +9423,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::ICmp: {
setInsertPointAfterBundle(E);
- Value *L = vectorizeTree(E->getOperand(0));
- Value *R = vectorizeTree(E->getOperand(1));
-
+ Value *L = vectorizeOperand(E, 0);
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
+ Value *R = vectorizeOperand(E, 1);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
@@ -8055,9 +9437,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
Value *V = Builder.CreateCmp(P0, L, R);
propagateIRFlags(V, E->Scalars, VL0);
- ShuffleBuilder.addInversedMask(E->ReorderIndices);
- ShuffleBuilder.addMask(E->ReuseShuffleIndices);
- V = ShuffleBuilder.finalize(V);
+ V = FinalShuffle(V, E);
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -8066,19 +9446,24 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::Select: {
setInsertPointAfterBundle(E);
- Value *Cond = vectorizeTree(E->getOperand(0));
- Value *True = vectorizeTree(E->getOperand(1));
- Value *False = vectorizeTree(E->getOperand(2));
-
+ Value *Cond = vectorizeOperand(E, 0);
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
+ Value *True = vectorizeOperand(E, 1);
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
+ Value *False = vectorizeOperand(E, 2);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
}
Value *V = Builder.CreateSelect(Cond, True, False);
- ShuffleBuilder.addInversedMask(E->ReorderIndices);
- ShuffleBuilder.addMask(E->ReuseShuffleIndices);
- V = ShuffleBuilder.finalize(V);
+ V = FinalShuffle(V, E);
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -8087,7 +9472,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::FNeg: {
setInsertPointAfterBundle(E);
- Value *Op = vectorizeTree(E->getOperand(0));
+ Value *Op = vectorizeOperand(E, 0);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
@@ -8100,9 +9485,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (auto *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
- ShuffleBuilder.addInversedMask(E->ReorderIndices);
- ShuffleBuilder.addMask(E->ReuseShuffleIndices);
- V = ShuffleBuilder.finalize(V);
+ V = FinalShuffle(V, E);
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -8129,9 +9512,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::Xor: {
setInsertPointAfterBundle(E);
- Value *LHS = vectorizeTree(E->getOperand(0));
- Value *RHS = vectorizeTree(E->getOperand(1));
-
+ Value *LHS = vectorizeOperand(E, 0);
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
+ Value *RHS = vectorizeOperand(E, 1);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
@@ -8144,9 +9530,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (auto *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
- ShuffleBuilder.addInversedMask(E->ReorderIndices);
- ShuffleBuilder.addMask(E->ReuseShuffleIndices);
- V = ShuffleBuilder.finalize(V);
+ V = FinalShuffle(V, E);
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -8177,7 +9561,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
} else {
assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
- Value *VecPtr = vectorizeTree(E->getOperand(0));
+ Value *VecPtr = vectorizeOperand(E, 0);
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
// Use the minimum alignment of the gathered loads.
Align CommonAlignment = LI->getAlign();
for (Value *V : E->Scalars)
@@ -8187,9 +9575,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
Value *V = propagateMetadata(NewLI, E->Scalars);
- ShuffleBuilder.addInversedMask(E->ReorderIndices);
- ShuffleBuilder.addMask(E->ReuseShuffleIndices);
- V = ShuffleBuilder.finalize(V);
+ V = FinalShuffle(V, E);
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
@@ -8200,9 +9586,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
setInsertPointAfterBundle(E);
- Value *VecValue = vectorizeTree(E->getOperand(0));
- ShuffleBuilder.addMask(E->ReorderIndices);
- VecValue = ShuffleBuilder.finalize(VecValue);
+ Value *VecValue = vectorizeOperand(E, 0);
+ VecValue = FinalShuffle(VecValue, E);
Value *ScalarPtr = SI->getPointerOperand();
Value *VecPtr = Builder.CreateBitCast(
@@ -8231,11 +9616,19 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
auto *GEP0 = cast<GetElementPtrInst>(VL0);
setInsertPointAfterBundle(E);
- Value *Op0 = vectorizeTree(E->getOperand(0));
+ Value *Op0 = vectorizeOperand(E, 0);
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
SmallVector<Value *> OpVecs;
for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
- Value *OpVec = vectorizeTree(E->getOperand(J));
+ Value *OpVec = vectorizeOperand(E, J);
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
OpVecs.push_back(OpVec);
}
@@ -8249,9 +9642,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
V = propagateMetadata(I, GEPs);
}
- ShuffleBuilder.addInversedMask(E->ReorderIndices);
- ShuffleBuilder.addMask(E->ReuseShuffleIndices);
- V = ShuffleBuilder.finalize(V);
+ V = FinalShuffle(V, E);
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -8289,7 +9680,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
continue;
}
- Value *OpVec = vectorizeTree(E->getOperand(j));
+ Value *OpVec = vectorizeOperand(E, j);
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
OpVecs.push_back(OpVec);
if (isVectorIntrinsicWithOverloadTypeAtArg(IID, j))
@@ -8324,9 +9719,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
propagateIRFlags(V, E->Scalars, VL0);
- ShuffleBuilder.addInversedMask(E->ReorderIndices);
- ShuffleBuilder.addMask(E->ReuseShuffleIndices);
- V = ShuffleBuilder.finalize(V);
+ V = FinalShuffle(V, E);
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -8344,13 +9737,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *LHS = nullptr, *RHS = nullptr;
if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
setInsertPointAfterBundle(E);
- LHS = vectorizeTree(E->getOperand(0));
- RHS = vectorizeTree(E->getOperand(1));
+ LHS = vectorizeOperand(E, 0);
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
+ RHS = vectorizeOperand(E, 1);
} else {
setInsertPointAfterBundle(E);
- LHS = vectorizeTree(E->getOperand(0));
+ LHS = vectorizeOperand(E, 0);
}
-
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
@@ -8377,7 +9773,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
// instruction, if any.
for (Value *V : {V0, V1}) {
if (auto *I = dyn_cast<Instruction>(V)) {
- GatherShuffleSeq.insert(I);
+ GatherShuffleExtractSeq.insert(I);
CSEBlocks.insert(I->getParent());
}
}
@@ -8389,9 +9785,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
SmallVector<int> Mask;
buildShuffleEntryMask(
E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
- [E](Instruction *I) {
+ [E, this](Instruction *I) {
assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
- return isAlternateInstruction(I, E->getMainOp(), E->getAltOp());
+ return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
+ *TLI);
},
Mask, &OpScalars, &AltScalars);
@@ -8401,10 +9798,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
if (auto *I = dyn_cast<Instruction>(V)) {
V = propagateMetadata(I, E->Scalars);
- GatherShuffleSeq.insert(I);
+ GatherShuffleExtractSeq.insert(I);
CSEBlocks.insert(I->getParent());
}
- V = ShuffleBuilder.finalize(V);
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -8433,14 +9829,27 @@ struct ShuffledInsertData {
};
} // namespace
-Value *
-BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
+Value *BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues,
+ Instruction *ReductionRoot) {
// All blocks must be scheduled before any instructions are inserted.
for (auto &BSIter : BlocksSchedules) {
scheduleBlock(BSIter.second.get());
}
- Builder.SetInsertPoint(&F->getEntryBlock().front());
+ // Pre-gather last instructions.
+ for (const std::unique_ptr<TreeEntry> &E : VectorizableTree) {
+ if ((E->State == TreeEntry::NeedToGather &&
+ (!E->getMainOp() || E->Idx > 0)) ||
+ (E->State != TreeEntry::NeedToGather &&
+ E->getOpcode() == Instruction::ExtractValue) ||
+ E->getOpcode() == Instruction::InsertElement)
+ continue;
+ Instruction *LastInst = &getLastInstructionInBundle(E.get());
+ EntryToLastInstruction.try_emplace(E.get(), LastInst);
+ }
+
+ Builder.SetInsertPoint(ReductionRoot ? ReductionRoot
+ : &F->getEntryBlock().front());
auto *VectorRoot = vectorizeTree(VectorizableTree[0].get());
// If the vectorized tree can be rewritten in a smaller type, we truncate the
@@ -8469,6 +9878,9 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
SmallVector<ShuffledInsertData> ShuffledInserts;
// Maps vector instruction to original insertelement instruction
DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
+  // Maps an extracted scalar to the corresponding extractelement instruction
+  // in the basic block. Only one extractelement per block should be emitted.
+ DenseMap<Value *, DenseMap<BasicBlock *, Instruction *>> ScalarToEEs;
// Extract all of the elements with the external uses.
for (const auto &ExternalUse : ExternalUses) {
Value *Scalar = ExternalUse.Scalar;
@@ -8493,13 +9905,36 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
Value *Lane = Builder.getInt32(ExternalUse.Lane);
auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
if (Scalar->getType() != Vec->getType()) {
- Value *Ex;
- // "Reuse" the existing extract to improve final codegen.
- if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
- Ex = Builder.CreateExtractElement(ES->getOperand(0),
- ES->getOperand(1));
- } else {
- Ex = Builder.CreateExtractElement(Vec, Lane);
+ Value *Ex = nullptr;
+ auto It = ScalarToEEs.find(Scalar);
+ if (It != ScalarToEEs.end()) {
+          // No need to emit many extracts, just move the single one already
+          // emitted in the current block.
+ auto EEIt = It->second.find(Builder.GetInsertBlock());
+ if (EEIt != It->second.end()) {
+ Instruction *I = EEIt->second;
+ if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
+ Builder.GetInsertPoint()->comesBefore(I))
+ I->moveBefore(&*Builder.GetInsertPoint());
+ Ex = I;
+ }
+ }
+ if (!Ex) {
+ // "Reuse" the existing extract to improve final codegen.
+ if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
+ Ex = Builder.CreateExtractElement(ES->getOperand(0),
+ ES->getOperand(1));
+ } else {
+ Ex = Builder.CreateExtractElement(Vec, Lane);
+ }
+ if (auto *I = dyn_cast<Instruction>(Ex))
+ ScalarToEEs[Scalar].try_emplace(Builder.GetInsertBlock(), I);
+ }
+          // The 'then' branch of the previous 'if' may produce constants,
+          // since operand 0 might be a constant.
+ if (auto *ExI = dyn_cast<Instruction>(Ex)) {
+ GatherShuffleExtractSeq.insert(ExI);
+ CSEBlocks.insert(ExI->getParent());
}
// If necessary, sign-extend or zero-extend ScalarRoot
// to the larger type.
@@ -8524,13 +9959,15 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
"Scalar with nullptr as an external user must be registered in "
"ExternallyUsedValues map");
if (auto *VecI = dyn_cast<Instruction>(Vec)) {
- Builder.SetInsertPoint(VecI->getParent(),
- std::next(VecI->getIterator()));
+ if (auto *PHI = dyn_cast<PHINode>(VecI))
+ Builder.SetInsertPoint(PHI->getParent()->getFirstNonPHI());
+ else
+ Builder.SetInsertPoint(VecI->getParent(),
+ std::next(VecI->getIterator()));
} else {
Builder.SetInsertPoint(&F->getEntryBlock().front());
}
Value *NewInst = ExtractAndExtendIfNeeded(Vec);
- CSEBlocks.insert(cast<Instruction>(Scalar)->getParent());
auto &NewInstLocs = ExternallyUsedValues[NewInst];
auto It = ExternallyUsedValues.find(Scalar);
assert(It != ExternallyUsedValues.end() &&
@@ -8546,7 +9983,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
// Skip if the scalar is another vector op or Vec is not an instruction.
if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
- Optional<unsigned> InsertIdx = getInsertIndex(VU);
+ std::optional<unsigned> InsertIdx = getInsertIndex(VU);
if (InsertIdx) {
// Need to use original vector, if the root is truncated.
if (MinBWs.count(Scalar) &&
@@ -8556,7 +9993,9 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) {
// Checks if 2 insertelements are from the same buildvector.
InsertElementInst *VecInsert = Data.InsertElements.front();
- return areTwoInsertFromSameBuildVector(VU, VecInsert);
+ return areTwoInsertFromSameBuildVector(
+ VU, VecInsert,
+ [](InsertElementInst *II) { return II->getOperand(0); });
});
unsigned Idx = *InsertIdx;
if (It == ShuffledInserts.end()) {
@@ -8620,157 +10059,42 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
}
Value *NewInst = ExtractAndExtendIfNeeded(Vec);
- CSEBlocks.insert(PH->getIncomingBlock(i));
PH->setOperand(i, NewInst);
}
}
} else {
Builder.SetInsertPoint(cast<Instruction>(User));
Value *NewInst = ExtractAndExtendIfNeeded(Vec);
- CSEBlocks.insert(cast<Instruction>(User)->getParent());
User->replaceUsesOfWith(Scalar, NewInst);
}
} else {
Builder.SetInsertPoint(&F->getEntryBlock().front());
Value *NewInst = ExtractAndExtendIfNeeded(Vec);
- CSEBlocks.insert(&F->getEntryBlock());
User->replaceUsesOfWith(Scalar, NewInst);
}
LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
}
- // Checks if the mask is an identity mask.
- auto &&IsIdentityMask = [](ArrayRef<int> Mask, FixedVectorType *VecTy) {
- int Limit = Mask.size();
- return VecTy->getNumElements() == Mask.size() &&
- all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) &&
- ShuffleVectorInst::isIdentityMask(Mask);
- };
- // Tries to combine 2 different masks into single one.
- auto &&CombineMasks = [](SmallVectorImpl<int> &Mask, ArrayRef<int> ExtMask) {
- SmallVector<int> NewMask(ExtMask.size(), UndefMaskElem);
- for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
- if (ExtMask[I] == UndefMaskElem)
- continue;
- NewMask[I] = Mask[ExtMask[I]];
- }
- Mask.swap(NewMask);
- };
- // Peek through shuffles, trying to simplify the final shuffle code.
- auto &&PeekThroughShuffles =
- [&IsIdentityMask, &CombineMasks](Value *&V, SmallVectorImpl<int> &Mask,
- bool CheckForLengthChange = false) {
- while (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
- // Exit if not a fixed vector type or changing size shuffle.
- if (!isa<FixedVectorType>(SV->getType()) ||
- (CheckForLengthChange && SV->changesLength()))
- break;
- // Exit if the identity or broadcast mask is found.
- if (IsIdentityMask(Mask, cast<FixedVectorType>(SV->getType())) ||
- SV->isZeroEltSplat())
- break;
- bool IsOp1Undef = isUndefVector(SV->getOperand(0));
- bool IsOp2Undef = isUndefVector(SV->getOperand(1));
- if (!IsOp1Undef && !IsOp2Undef)
- break;
- SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(),
- SV->getShuffleMask().end());
- CombineMasks(ShuffleMask, Mask);
- Mask.swap(ShuffleMask);
- if (IsOp2Undef)
- V = SV->getOperand(0);
- else
- V = SV->getOperand(1);
- }
- };
- // Smart shuffle instruction emission, walks through shuffles trees and
- // tries to find the best matching vector for the actual shuffle
- // instruction.
- auto &&CreateShuffle = [this, &IsIdentityMask, &PeekThroughShuffles,
- &CombineMasks](Value *V1, Value *V2,
- ArrayRef<int> Mask) -> Value * {
- assert(V1 && "Expected at least one vector value.");
- if (V2 && !isUndefVector(V2)) {
- // Peek through shuffles.
- Value *Op1 = V1;
- Value *Op2 = V2;
- int VF =
- cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
- SmallVector<int> CombinedMask1(Mask.size(), UndefMaskElem);
- SmallVector<int> CombinedMask2(Mask.size(), UndefMaskElem);
- for (int I = 0, E = Mask.size(); I < E; ++I) {
- if (Mask[I] < VF)
- CombinedMask1[I] = Mask[I];
- else
- CombinedMask2[I] = Mask[I] - VF;
- }
- Value *PrevOp1;
- Value *PrevOp2;
- do {
- PrevOp1 = Op1;
- PrevOp2 = Op2;
- PeekThroughShuffles(Op1, CombinedMask1, /*CheckForLengthChange=*/true);
- PeekThroughShuffles(Op2, CombinedMask2, /*CheckForLengthChange=*/true);
- // Check if we have 2 resizing shuffles - need to peek through operands
- // again.
- if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
- if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2))
- if (SV1->getOperand(0)->getType() ==
- SV2->getOperand(0)->getType() &&
- SV1->getOperand(0)->getType() != SV1->getType() &&
- isUndefVector(SV1->getOperand(1)) &&
- isUndefVector(SV2->getOperand(1))) {
- Op1 = SV1->getOperand(0);
- Op2 = SV2->getOperand(0);
- SmallVector<int> ShuffleMask1(SV1->getShuffleMask().begin(),
- SV1->getShuffleMask().end());
- CombineMasks(ShuffleMask1, CombinedMask1);
- CombinedMask1.swap(ShuffleMask1);
- SmallVector<int> ShuffleMask2(SV2->getShuffleMask().begin(),
- SV2->getShuffleMask().end());
- CombineMasks(ShuffleMask2, CombinedMask2);
- CombinedMask2.swap(ShuffleMask2);
- }
- } while (PrevOp1 != Op1 || PrevOp2 != Op2);
- VF = cast<VectorType>(Op1->getType())
- ->getElementCount()
- .getKnownMinValue();
- for (int I = 0, E = Mask.size(); I < E; ++I) {
- if (CombinedMask2[I] != UndefMaskElem) {
- assert(CombinedMask1[I] == UndefMaskElem &&
- "Expected undefined mask element");
- CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
- }
- }
- Value *Vec = Builder.CreateShuffleVector(
- Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
- CombinedMask1);
- if (auto *I = dyn_cast<Instruction>(Vec)) {
- GatherShuffleSeq.insert(I);
- CSEBlocks.insert(I->getParent());
- }
- return Vec;
- }
- if (isa<PoisonValue>(V1))
- return PoisonValue::get(FixedVectorType::get(
- cast<VectorType>(V1->getType())->getElementType(), Mask.size()));
- Value *Op = V1;
- SmallVector<int> CombinedMask(Mask.begin(), Mask.end());
- PeekThroughShuffles(Op, CombinedMask);
- if (!isa<FixedVectorType>(Op->getType()) ||
- !IsIdentityMask(CombinedMask, cast<FixedVectorType>(Op->getType()))) {
- Value *Vec = Builder.CreateShuffleVector(Op, CombinedMask);
- if (auto *I = dyn_cast<Instruction>(Vec)) {
- GatherShuffleSeq.insert(I);
- CSEBlocks.insert(I->getParent());
- }
- return Vec;
+ auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
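+    // Split the 2-source mask into per-operand masks, e.g. (illustrative)
+    // with VF = 4 the mask <0, 5, 2, 7> becomes <0, -1, 2, -1> for the first
+    // operand and <-1, 1, -1, 3> for the second one.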
+ SmallVector<int> CombinedMask1(Mask.size(), UndefMaskElem);
+ SmallVector<int> CombinedMask2(Mask.size(), UndefMaskElem);
+ int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
+ for (int I = 0, E = Mask.size(); I < E; ++I) {
+ if (Mask[I] < VF)
+ CombinedMask1[I] = Mask[I];
+ else
+ CombinedMask2[I] = Mask[I] - VF;
}
- return Op;
+ ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
+ ShuffleBuilder.add(V1, CombinedMask1);
+ if (V2)
+ ShuffleBuilder.add(V2, CombinedMask2);
+ return ShuffleBuilder.finalize(std::nullopt);
};
- auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask) {
+ auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
+ bool ForSingleMask) {
unsigned VF = Mask.size();
unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
if (VF != VecVF) {
@@ -8778,12 +10102,14 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
Vec = CreateShuffle(Vec, nullptr, Mask);
return std::make_pair(Vec, true);
}
- SmallVector<int> ResizeMask(VF, UndefMaskElem);
- for (unsigned I = 0; I < VF; ++I) {
- if (Mask[I] != UndefMaskElem)
- ResizeMask[Mask[I]] = Mask[I];
+ if (!ForSingleMask) {
+ SmallVector<int> ResizeMask(VF, UndefMaskElem);
+ for (unsigned I = 0; I < VF; ++I) {
+ if (Mask[I] != UndefMaskElem)
+ ResizeMask[Mask[I]] = Mask[I];
+ }
+ Vec = CreateShuffle(Vec, nullptr, ResizeMask);
}
- Vec = CreateShuffle(Vec, nullptr, ResizeMask);
}
return std::make_pair(Vec, false);
@@ -8798,7 +10124,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
Builder.SetInsertPoint(LastInsert);
auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
Value *NewInst = performExtractsShuffleAction<Value>(
- makeMutableArrayRef(Vector.data(), Vector.size()),
+ MutableArrayRef(Vector.data(), Vector.size()),
FirstInsert->getOperand(0),
[](Value *Vec) {
return cast<VectorType>(Vec->getType())
@@ -8855,6 +10181,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
CSEBlocks.insert(LastInsert->getParent());
}
+ SmallVector<Instruction *> RemovedInsts;
// For each vectorized value:
for (auto &TEPtr : VectorizableTree) {
TreeEntry *Entry = TEPtr.get();
@@ -8889,9 +10216,18 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
#endif
LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
eraseInstruction(cast<Instruction>(Scalar));
+ // Retain to-be-deleted instructions for some debug-info
+ // bookkeeping. NOTE: eraseInstruction only marks the instruction for
+ // deletion - instructions are not deleted until later.
+ RemovedInsts.push_back(cast<Instruction>(Scalar));
}
}
+ // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
+ // new vector instruction.
+ if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
+ V->mergeDIAssignID(RemovedInsts);
+
Builder.ClearInsertionPoint();
InstrElementSize.clear();
@@ -8899,10 +10235,10 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
}
void BoUpSLP::optimizeGatherSequence() {
- LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleSeq.size()
+ LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
<< " gather sequences instructions.\n");
// LICM InsertElementInst sequences.
- for (Instruction *I : GatherShuffleSeq) {
+ for (Instruction *I : GatherShuffleExtractSeq) {
if (isDeleted(I))
continue;
@@ -8927,6 +10263,7 @@ void BoUpSLP::optimizeGatherSequence() {
// We can hoist this instruction. Move it to the pre-header.
I->moveBefore(PreHeader->getTerminator());
+ CSEBlocks.insert(PreHeader);
}
// Make a list of all reachable blocks in our CSE queue.
@@ -9002,8 +10339,8 @@ void BoUpSLP::optimizeGatherSequence() {
for (Instruction &In : llvm::make_early_inc_range(*BB)) {
if (isDeleted(&In))
continue;
- if (!isa<InsertElementInst>(&In) && !isa<ExtractElementInst>(&In) &&
- !isa<ShuffleVectorInst>(&In) && !GatherShuffleSeq.contains(&In))
+ if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
+ !GatherShuffleExtractSeq.contains(&In))
continue;
// Check if we can replace this instruction with any of the
@@ -9022,7 +10359,7 @@ void BoUpSLP::optimizeGatherSequence() {
break;
}
if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
- GatherShuffleSeq.contains(V) &&
+ GatherShuffleExtractSeq.contains(V) &&
IsIdenticalOrLessDefined(V, &In, NewMask) &&
DT->dominates(In.getParent(), V->getParent())) {
In.moveAfter(V);
@@ -9043,7 +10380,7 @@ void BoUpSLP::optimizeGatherSequence() {
}
}
CSEBlocks.clear();
- GatherShuffleSeq.clear();
+ GatherShuffleExtractSeq.clear();
}
BoUpSLP::ScheduleData *
@@ -9075,7 +10412,7 @@ BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
// Groups the instructions to a bundle (which is then a single scheduling entity)
// and schedules instructions until the bundle gets ready.
-Optional<BoUpSLP::ScheduleData *>
+std::optional<BoUpSLP::ScheduleData *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
const InstructionsState &S) {
// No need to schedule PHIs, insertelement, extractelement and extractvalue
@@ -9137,7 +10474,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
// dependencies and emit instruction in the wrong order at the actual
// scheduling.
TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
- return None;
+ return std::nullopt;
}
}
@@ -9167,7 +10504,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
TryScheduleBundleImpl(ReSchedule, Bundle);
if (!Bundle->isReady()) {
cancelScheduling(VL, S.OpValue);
- return None;
+ return std::nullopt;
}
return Bundle;
}
@@ -9395,13 +10732,13 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
WorkList.push_back(DestBundle);
};
- // Any instruction which isn't safe to speculate at the begining of the
+ // Any instruction which isn't safe to speculate at the beginning of the
// block is control dependent on any early exit or non-willreturn call
// which precedes it.
if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
for (Instruction *I = BundleMember->Inst->getNextNode();
I != ScheduleEnd; I = I->getNextNode()) {
- if (isSafeToSpeculativelyExecute(I, &*BB->begin()))
+ if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
continue;
// Add the dependency
@@ -9436,9 +10773,12 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
}
// In addition to the cases handled just above, we need to prevent
- // allocas from moving below a stacksave. The stackrestore case
- // is currently thought to be conservatism.
- if (isa<AllocaInst>(BundleMember->Inst)) {
+ // allocas and loads/stores from moving below a stacksave or a
+ // stackrestore. Avoiding moving allocas below a stackrestore is currently
+ // thought to be conservative. Moving loads/stores below a stackrestore
+ // can lead to incorrect code.
+ if (isa<AllocaInst>(BundleMember->Inst) ||
+ BundleMember->Inst->mayReadOrWriteMemory()) {
for (Instruction *I = BundleMember->Inst->getNextNode();
I != ScheduleEnd; I = I->getNextNode()) {
if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
@@ -9661,17 +11001,15 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) {
// If the current instruction is a load, update MaxWidth to reflect the
// width of the loaded value.
- if (isa<LoadInst>(I) || isa<ExtractElementInst>(I) ||
- isa<ExtractValueInst>(I))
+ if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
// Otherwise, we need to visit the operands of the instruction. We only
// handle the interesting cases from buildTree here. If an operand is an
// instruction we haven't yet visited and from the same basic block as the
// user or the use is a PHI node, we add it to the worklist.
- else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
- isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I) ||
- isa<UnaryOperator>(I)) {
+ else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
+ BinaryOperator, UnaryOperator>(I)) {
for (Use &U : I->operands())
if (auto *J = dyn_cast<Instruction>(U.get()))
if (Visited.insert(J).second &&
@@ -9724,8 +11062,7 @@ static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
break;
case Instruction::ZExt:
case Instruction::SExt:
- if (isa<ExtractElementInst>(I->getOperand(0)) ||
- isa<InsertElementInst>(I->getOperand(0)))
+ if (isa<ExtractElementInst, InsertElementInst>(I->getOperand(0)))
return false;
break;
@@ -10026,7 +11363,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
DT->updateDFSNumbers();
// Scan the blocks in the function in post order.
- for (auto BB : post_order(&F.getEntryBlock())) {
+ for (auto *BB : post_order(&F.getEntryBlock())) {
// Start new block - clear the list of reduction roots.
R.clearReductionData();
collectSeedInstructions(BB);
@@ -10084,7 +11421,7 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
InstructionCost Cost = R.getTreeCost();
- LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF =" << VF << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
if (Cost < -SLPCostThreshold) {
LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
@@ -10128,7 +11465,7 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
++IterCnt;
CheckedPairs[Idx].set(K);
CheckedPairs[K].set(Idx);
- Optional<int> Diff = getPointersDiff(
+ std::optional<int> Diff = getPointersDiff(
Stores[K]->getValueOperand()->getType(), Stores[K]->getPointerOperand(),
Stores[Idx]->getValueOperand()->getType(),
Stores[Idx]->getPointerOperand(), *DL, *SE, /*StrictCheck=*/true);
@@ -10211,12 +11548,17 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
unsigned MinVF = TTI->getStoreMinimumVF(
R.getMinVF(DL->getTypeSizeInBits(ValueTy)), StoreTy, ValueTy);
+ if (MaxVF <= MinVF) {
+ LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
+ << ") <= MinVF (" << MinVF << ")\n");
+ }
+
// FIXME: Is division-by-2 the correct step? Should we assert that the
// register size is a power-of-2?
unsigned StartIdx = 0;
for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) {
for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
- ArrayRef<Value *> Slice = makeArrayRef(Operands).slice(Cnt, Size);
+ ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
if (!VectorizedStores.count(Slice.front()) &&
!VectorizedStores.count(Slice.back()) &&
vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
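The store-chain loop above walks candidate vector factors from MaxVF down to MinVF, halving each time, and tries contiguous slices of the collected stores at each width. A rough standalone sketch of that slicing schedule (plain C++; tryWidth is a hypothetical stand-in for vectorizeStoreChain, and the success handling is simplified):

#include <cstdio>
#include <vector>

// Hypothetical stand-in for vectorizeStoreChain: pretend only the slice
// [0, 4) can be vectorized.
static bool tryWidth(unsigned Begin, unsigned Size) {
  return Size == 4 && Begin == 0;
}

int main() {
  std::vector<int> Chain(10);            // ten consecutive candidate stores
  const unsigned MaxVF = 8, MinVF = 2, StartIdx = 0;
  for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) {
    for (unsigned Cnt = StartIdx; Cnt + Size <= Chain.size();) {
      if (tryWidth(Cnt, Size)) {
        std::printf("vectorized slice [%u, %u) at VF=%u\n", Cnt, Cnt + Size,
                    Size);
        Cnt += Size;                     // jump over the vectorized slice
        continue;
      }
      ++Cnt;                             // otherwise slide the window by one
    }
  }
  return 0;
}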
@@ -10295,7 +11637,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
// Check that all of the parts are instructions of the same type,
// we permit an alternate opcode via InstructionsState.
- InstructionsState S = getSameOpcode(VL);
+ InstructionsState S = getSameOpcode(VL, *TLI);
if (!S.getOpcode())
return false;
@@ -10377,7 +11719,9 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
if (R.isTreeTinyAndNotFullyVectorizable())
continue;
R.reorderTopToBottom();
- R.reorderBottomToTop(!isa<InsertElementInst>(Ops.front()));
+ R.reorderBottomToTop(
+ /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
+ !R.doesRootHaveInTreeUses());
R.buildExternalUses();
R.computeMinimumValueSizes();
@@ -10385,6 +11729,8 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
CandidateFound = true;
MinCost = std::min(MinCost, Cost);
+ LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
+ << " for VF=" << OpsWidth << "\n");
if (Cost < -SLPCostThreshold) {
LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
@@ -10423,8 +11769,7 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
if (!I)
return false;
- if ((!isa<BinaryOperator>(I) && !isa<CmpInst>(I)) ||
- isa<VectorType>(I->getType()))
+ if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
return false;
Value *P = I->getParent();
@@ -10464,7 +11809,7 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
return tryToVectorizePair(Op0, Op1, R);
// We have multiple options. Try to pick the single best.
- Optional<int> BestCandidate = R.findBestRootPair(Candidates);
+ std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
if (!BestCandidate)
return false;
return tryToVectorizePair(Candidates[*BestCandidate].first,
@@ -10522,8 +11867,8 @@ class HorizontalReduction {
// select x, y, false
// select x, true, y
static bool isBoolLogicOp(Instruction *I) {
- return match(I, m_LogicalAnd(m_Value(), m_Value())) ||
- match(I, m_LogicalOr(m_Value(), m_Value()));
+ return isa<SelectInst>(I) &&
+ (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
}
/// Checks if instruction is associative and can be vectorized.
@@ -10749,7 +12094,7 @@ class HorizontalReduction {
/// Checks if the instruction is in basic block \p BB.
/// For a cmp+sel min/max reduction check that both ops are in \p BB.
static bool hasSameParent(Instruction *I, BasicBlock *BB) {
- if (isCmpSelMinMax(I) || (isBoolLogicOp(I) && isa<SelectInst>(I))) {
+ if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
auto *Sel = cast<SelectInst>(I);
auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
@@ -10800,6 +12145,13 @@ class HorizontalReduction {
return I->getOperand(getFirstOperandIndex(I) + 1);
}
+ static bool isGoodForReduction(ArrayRef<Value *> Data) {
+ int Sz = Data.size();
+ auto *I = dyn_cast<Instruction>(Data.front());
+ return Sz > 1 || isConstant(Data.front()) ||
+ (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
+ }
+
public:
HorizontalReduction() = default;
@@ -10895,6 +12247,9 @@ public:
MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
PossibleReducedVals;
initReductionOps(Inst);
+ DenseMap<Value *, SmallVector<LoadInst *>> LoadsMap;
+ SmallSet<size_t, 2> LoadKeyUsed;
+ SmallPtrSet<Value *, 4> DoNotReverseVals;
while (!Worklist.empty()) {
Instruction *TreeN = Worklist.pop_back_val();
SmallVector<Value *> Args;
@@ -10916,18 +12271,36 @@ public:
size_t Key, Idx;
std::tie(Key, Idx) = generateKeySubkey(
V, &TLI,
- [&PossibleReducedVals, &DL, &SE](size_t Key, LoadInst *LI) {
- auto It = PossibleReducedVals.find(Key);
- if (It != PossibleReducedVals.end()) {
- for (const auto &LoadData : It->second) {
- auto *RLI = cast<LoadInst>(LoadData.second.front().first);
- if (getPointersDiff(RLI->getType(),
- RLI->getPointerOperand(), LI->getType(),
- LI->getPointerOperand(), DL, SE,
- /*StrictCheck=*/true))
- return hash_value(RLI->getPointerOperand());
+ [&](size_t Key, LoadInst *LI) {
+ Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
+ if (LoadKeyUsed.contains(Key)) {
+ auto LIt = LoadsMap.find(Ptr);
+ if (LIt != LoadsMap.end()) {
+ for (LoadInst *RLI : LIt->second) {
+ if (getPointersDiff(
+ RLI->getType(), RLI->getPointerOperand(),
+ LI->getType(), LI->getPointerOperand(), DL, SE,
+ /*StrictCheck=*/true))
+ return hash_value(RLI->getPointerOperand());
+ }
+ for (LoadInst *RLI : LIt->second) {
+ if (arePointersCompatible(RLI->getPointerOperand(),
+ LI->getPointerOperand(), TLI)) {
+ hash_code SubKey = hash_value(RLI->getPointerOperand());
+ DoNotReverseVals.insert(RLI);
+ return SubKey;
+ }
+ }
+ if (LIt->second.size() > 2) {
+ hash_code SubKey =
+ hash_value(LIt->second.back()->getPointerOperand());
+ DoNotReverseVals.insert(LIt->second.back());
+ return SubKey;
+ }
}
}
+ LoadKeyUsed.insert(Key);
+ LoadsMap.try_emplace(Ptr).first->second.push_back(LI);
return hash_value(LI->getPointerOperand());
},
/*AllowAlternate=*/false);
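The keying callback above tries to give related loads the same sub-key: if a load shares an underlying base object with an earlier load and their pointers are a computable constant distance apart (or are otherwise compatible), it reuses that earlier load's pointer hash. A toy plain-C++ sketch of the grouping idea; the getPointersDiff / arePointersCompatible checks are omitted and (base, offset) pairs stand in for pointers:

#include <cstdio>
#include <map>
#include <vector>

// Toy "pointer operand": an underlying base object id plus a byte offset.
struct Ptr { int Base; long Off; };

int main() {
  std::map<int, std::vector<Ptr>> LoadsMap;   // underlying base -> seen loads
  Ptr Loads[] = {{0, 0}, {0, 8}, {1, 0}, {0, 24}};
  for (const Ptr &P : Loads) {
    std::vector<Ptr> &Seen = LoadsMap[P.Base];
    // Key this load by the first related load from the same base (mirroring
    // hash_value(RLI->getPointerOperand())); the first load from a base keys
    // itself.
    long Key = Seen.empty() ? P.Off : Seen.front().Off;
    Seen.push_back(P);
    std::printf("load (base %d, +%ld) -> sub-key (base %d, +%ld)\n", P.Base,
                P.Off, P.Base, Key);
  }
  return 0;
}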
@@ -10941,17 +12314,35 @@ public:
size_t Key, Idx;
std::tie(Key, Idx) = generateKeySubkey(
TreeN, &TLI,
- [&PossibleReducedVals, &DL, &SE](size_t Key, LoadInst *LI) {
- auto It = PossibleReducedVals.find(Key);
- if (It != PossibleReducedVals.end()) {
- for (const auto &LoadData : It->second) {
- auto *RLI = cast<LoadInst>(LoadData.second.front().first);
- if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
- LI->getType(), LI->getPointerOperand(),
- DL, SE, /*StrictCheck=*/true))
- return hash_value(RLI->getPointerOperand());
+ [&](size_t Key, LoadInst *LI) {
+ Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
+ if (LoadKeyUsed.contains(Key)) {
+ auto LIt = LoadsMap.find(Ptr);
+ if (LIt != LoadsMap.end()) {
+ for (LoadInst *RLI : LIt->second) {
+ if (getPointersDiff(RLI->getType(),
+ RLI->getPointerOperand(), LI->getType(),
+ LI->getPointerOperand(), DL, SE,
+ /*StrictCheck=*/true))
+ return hash_value(RLI->getPointerOperand());
+ }
+ for (LoadInst *RLI : LIt->second) {
+ if (arePointersCompatible(RLI->getPointerOperand(),
+ LI->getPointerOperand(), TLI)) {
+ hash_code SubKey = hash_value(RLI->getPointerOperand());
+ DoNotReverseVals.insert(RLI);
+ return SubKey;
+ }
+ }
+ if (LIt->second.size() > 2) {
+ hash_code SubKey = hash_value(LIt->second.back()->getPointerOperand());
+ DoNotReverseVals.insert(LIt->second.back());
+ return SubKey;
+ }
}
}
+ LoadKeyUsed.insert(Key);
+ LoadsMap.try_emplace(Ptr).first->second.push_back(LI);
return hash_value(LI->getPointerOperand());
},
/*AllowAlternate=*/false);
@@ -10977,9 +12368,27 @@ public:
stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
return P1.size() > P2.size();
});
- ReducedVals.emplace_back();
- for (ArrayRef<Value *> Data : PossibleRedValsVect)
- ReducedVals.back().append(Data.rbegin(), Data.rend());
+ int NewIdx = -1;
+ for (ArrayRef<Value *> Data : PossibleRedValsVect) {
+ if (isGoodForReduction(Data) ||
+ (isa<LoadInst>(Data.front()) && NewIdx >= 0 &&
+ isa<LoadInst>(ReducedVals[NewIdx].front()) &&
+ getUnderlyingObject(
+ cast<LoadInst>(Data.front())->getPointerOperand()) ==
+ getUnderlyingObject(cast<LoadInst>(ReducedVals[NewIdx].front())
+ ->getPointerOperand()))) {
+ if (NewIdx < 0) {
+ NewIdx = ReducedVals.size();
+ ReducedVals.emplace_back();
+ }
+ if (DoNotReverseVals.contains(Data.front()))
+ ReducedVals[NewIdx].append(Data.begin(), Data.end());
+ else
+ ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
+ } else {
+ ReducedVals.emplace_back().append(Data.rbegin(), Data.rend());
+ }
+ }
}
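The grouping loop above either merges a candidate group into the most recently opened bucket (when the group is good for reduction on its own, or it is a load group with the same underlying base object as that bucket's front load) or starts a new bucket, reversing the appended order unless DoNotReverseVals says otherwise. A toy plain-C++ sketch of that bucketing decision with illustrative stand-in types:

#include <cstdio>
#include <vector>

// Toy model of one candidate group: its values, whether it is "good for
// reduction" on its own, and an id for the underlying base object of its
// loads (illustrative stand-ins, not LLVM types).
struct Group { std::vector<int> Vals; bool Good; int Base; };

int main() {
  std::vector<Group> Cands = {{{1, 2, 3}, true, 0},   // good, opens a bucket
                              {{4}, false, 0},        // same base -> merged
                              {{5}, false, 7}};       // unrelated -> own bucket
  std::vector<std::vector<int>> Buckets;
  int NewIdx = -1, BucketBase = -1;
  for (const Group &G : Cands) {
    bool SameBase = NewIdx >= 0 && G.Base == BucketBase;
    if (G.Good || SameBase) {
      if (NewIdx < 0) {
        NewIdx = (int)Buckets.size();
        BucketBase = G.Base;
        Buckets.emplace_back();
      }
      // The source usually appends in reverse; DoNotReverseVals opts out.
      Buckets[NewIdx].insert(Buckets[NewIdx].end(), G.Vals.rbegin(),
                             G.Vals.rend());
    } else {
      Buckets.emplace_back(G.Vals.rbegin(), G.Vals.rend());
    }
  }
  for (size_t I = 0; I < Buckets.size(); ++I)
    std::printf("bucket %zu holds %zu value(s)\n", I, Buckets[I].size());
  return 0;
}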
// Sort the reduced values by number of same/alternate opcode and/or pointer
// operand.
@@ -10990,25 +12399,36 @@ public:
}
/// Attempt to vectorize the tree found by matchAssociativeReduction.
- Value *tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
+ Value *tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI,
+ const TargetLibraryInfo &TLI) {
constexpr int ReductionLimit = 4;
constexpr unsigned RegMaxNumber = 4;
constexpr unsigned RedValsMaxNumber = 128;
// If there are a sufficient number of reduction values, reduce
// to a nearby power-of-2. We can safely generate oversized
// vectors and rely on the backend to split them to legal sizes.
- unsigned NumReducedVals = std::accumulate(
- ReducedVals.begin(), ReducedVals.end(), 0,
- [](int Num, ArrayRef<Value *> Vals) { return Num + Vals.size(); });
- if (NumReducedVals < ReductionLimit)
+ size_t NumReducedVals =
+ std::accumulate(ReducedVals.begin(), ReducedVals.end(), 0,
+ [](size_t Num, ArrayRef<Value *> Vals) {
+ if (!isGoodForReduction(Vals))
+ return Num;
+ return Num + Vals.size();
+ });
+ if (NumReducedVals < ReductionLimit) {
+ for (ReductionOpsType &RdxOps : ReductionOps)
+ for (Value *RdxOp : RdxOps)
+ V.analyzedReductionRoot(cast<Instruction>(RdxOp));
return nullptr;
+ }
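The reduction itself is emitted in power-of-2 chunks: the width starts at the largest power of two not exceeding the number of reduction values and, after each chunk, is recomputed from the remaining tail (later in this hunk: ReduxWidth = PowerOf2Floor(NumReducedVals - Pos)). A quick standalone illustration of that width schedule; floorPow2 is a local stand-in for llvm::PowerOf2Floor and the limit of 4 mirrors the ReductionLimit constant above:

#include <cstdio>

// Local stand-in for llvm::PowerOf2Floor.
static unsigned floorPow2(unsigned N) {
  unsigned P = 1;
  while (P * 2 <= N)
    P *= 2;
  return N == 0 ? 0 : P;
}

int main() {
  unsigned NumReducedVals = 13, Pos = 0;
  unsigned ReduxWidth = floorPow2(NumReducedVals);
  while (ReduxWidth >= 4) {                       // ReductionLimit == 4 here
    std::printf("reduce lanes [%u, %u) with VF=%u\n", Pos, Pos + ReduxWidth,
                ReduxWidth);
    Pos += ReduxWidth;
    ReduxWidth = floorPow2(NumReducedVals - Pos);
  }
  std::printf("%u scalar value(s) remain for the final scalar ops\n",
              NumReducedVals - Pos);
  return 0;
}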
IRBuilder<> Builder(cast<Instruction>(ReductionRoot));
// Track the reduced values in case they are replaced by extractelement
// because of the vectorization.
- DenseMap<Value *, WeakTrackingVH> TrackedVals;
+ DenseMap<Value *, WeakTrackingVH> TrackedVals(
+ ReducedVals.size() * ReducedVals.front().size() + ExtraArgs.size());
BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
+ ExternallyUsedValues.reserve(ExtraArgs.size() + 1);
// The same extra argument may be used several times, so log each attempt
// to use it.
for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
@@ -11031,7 +12451,8 @@ public:
// The reduction root is used as the insertion point for new instructions,
// so set it as externally used to prevent it from being deleted.
ExternallyUsedValues[ReductionRoot];
- SmallDenseSet<Value *> IgnoreList;
+ SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
+ ReductionOps.front().size());
for (ReductionOpsType &RdxOps : ReductionOps)
for (Value *RdxOp : RdxOps) {
if (!RdxOp)
@@ -11046,15 +12467,19 @@ public:
for (Value *V : Candidates)
TrackedVals.try_emplace(V, V);
- DenseMap<Value *, unsigned> VectorizedVals;
+ DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
+ // List of the values that were reduced in other trees as part of gather
+ // nodes and thus require an extract if fully vectorized in other trees.
+ SmallPtrSet<Value *, 4> RequiredExtract;
Value *VectorizedTree = nullptr;
bool CheckForReusedReductionOps = false;
// Try to vectorize elements based on their type.
for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
- InstructionsState S = getSameOpcode(OrigReducedVals);
+ InstructionsState S = getSameOpcode(OrigReducedVals, TLI);
SmallVector<Value *> Candidates;
- DenseMap<Value *, Value *> TrackedToOrig;
+ Candidates.reserve(2 * OrigReducedVals.size());
+ DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;
// Check if the reduction value was not overridden by the extractelement
@@ -11071,7 +12496,7 @@ public:
// Try to handle shuffled extractelements.
if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
I + 1 < E) {
- InstructionsState NextS = getSameOpcode(ReducedVals[I + 1]);
+ InstructionsState NextS = getSameOpcode(ReducedVals[I + 1], TLI);
if (NextS.getOpcode() == Instruction::ExtractElement &&
!NextS.isAltShuffle()) {
SmallVector<Value *> CommonCandidates(Candidates);
@@ -11179,37 +12604,49 @@ public:
});
}
// Number of uses of the candidates in the vector of values.
- SmallDenseMap<Value *, unsigned> NumUses;
+ SmallDenseMap<Value *, unsigned> NumUses(Candidates.size());
for (unsigned Cnt = 0; Cnt < Pos; ++Cnt) {
Value *V = Candidates[Cnt];
- if (NumUses.count(V) > 0)
- continue;
- NumUses[V] = std::count(VL.begin(), VL.end(), V);
+ ++NumUses.try_emplace(V, 0).first->getSecond();
}
for (unsigned Cnt = Pos + ReduxWidth; Cnt < NumReducedVals; ++Cnt) {
Value *V = Candidates[Cnt];
- if (NumUses.count(V) > 0)
- continue;
- NumUses[V] = std::count(VL.begin(), VL.end(), V);
+ ++NumUses.try_emplace(V, 0).first->getSecond();
}
+ SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
// Gather externally used values.
SmallPtrSet<Value *, 4> Visited;
for (unsigned Cnt = 0; Cnt < Pos; ++Cnt) {
- Value *V = Candidates[Cnt];
- if (!Visited.insert(V).second)
+ Value *RdxVal = Candidates[Cnt];
+ if (!Visited.insert(RdxVal).second)
continue;
- unsigned NumOps = VectorizedVals.lookup(V) + NumUses[V];
- if (NumOps != ReducedValsToOps.find(V)->second.size())
- LocalExternallyUsedValues[V];
+ // Check if the scalar was vectorized as part of the vectorization
+ // tree but not the top node.
+ if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
+ LocalExternallyUsedValues[RdxVal];
+ continue;
+ }
+ unsigned NumOps = VectorizedVals.lookup(RdxVal) + NumUses[RdxVal];
+ if (NumOps != ReducedValsToOps.find(RdxVal)->second.size())
+ LocalExternallyUsedValues[RdxVal];
}
for (unsigned Cnt = Pos + ReduxWidth; Cnt < NumReducedVals; ++Cnt) {
- Value *V = Candidates[Cnt];
- if (!Visited.insert(V).second)
+ Value *RdxVal = Candidates[Cnt];
+ if (!Visited.insert(RdxVal).second)
continue;
- unsigned NumOps = VectorizedVals.lookup(V) + NumUses[V];
- if (NumOps != ReducedValsToOps.find(V)->second.size())
- LocalExternallyUsedValues[V];
+ // Check if the scalar was vectorized as part of the vectorization
+ // tree but not the top node.
+ if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
+ LocalExternallyUsedValues[RdxVal];
+ continue;
+ }
+ unsigned NumOps = VectorizedVals.lookup(RdxVal) + NumUses[RdxVal];
+ if (NumOps != ReducedValsToOps.find(RdxVal)->second.size())
+ LocalExternallyUsedValues[RdxVal];
}
+ for (Value *RdxVal : VL)
+ if (RequiredExtract.contains(RdxVal))
+ LocalExternallyUsedValues[RdxVal];
V.buildExternalUses(LocalExternallyUsedValues);
V.computeMinimumValueSizes();
@@ -11224,11 +12661,25 @@ public:
InstructionCost TreeCost = V.getTreeCost(VL);
InstructionCost ReductionCost =
getReductionCost(TTI, VL, ReduxWidth, RdxFMF);
+ if (V.isVectorizedFirstNode() && isa<LoadInst>(VL.front())) {
+ Instruction *MainOp = V.getFirstNodeMainOp();
+ for (Value *V : VL) {
+ auto *VI = dyn_cast<LoadInst>(V);
+ // Add the costs of scalar GEP pointers, to be removed from the
+ // code.
+ if (!VI || VI == MainOp)
+ continue;
+ auto *Ptr = dyn_cast<GetElementPtrInst>(VI->getPointerOperand());
+ if (!Ptr || !Ptr->hasOneUse() || Ptr->hasAllConstantIndices())
+ continue;
+ TreeCost -= TTI->getArithmeticInstrCost(
+ Instruction::Add, Ptr->getType(), TTI::TCK_RecipThroughput);
+ }
+ }
InstructionCost Cost = TreeCost + ReductionCost;
- if (!Cost.isValid()) {
- LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n");
+ LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for reduction\n");
+ if (!Cost.isValid())
return nullptr;
- }
if (Cost >= -SLPCostThreshold) {
V.getORE()->emit([&]() {
return OptimizationRemarkMissed(
@@ -11257,21 +12708,23 @@ public:
Builder.setFastMathFlags(RdxFMF);
- // Vectorize a tree.
- Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues);
-
// Emit a reduction. If the root is a select (min/max idiom), the insert
// point is the compare condition of that select.
Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
+ Instruction *InsertPt = RdxRootInst;
if (IsCmpSelMinMax)
- Builder.SetInsertPoint(GetCmpForMinMaxReduction(RdxRootInst));
- else
- Builder.SetInsertPoint(RdxRootInst);
+ InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
+
+ // Vectorize a tree.
+ Value *VectorizedRoot =
+ V.vectorizeTree(LocalExternallyUsedValues, InsertPt);
+
+ Builder.SetInsertPoint(InsertPt);
// To prevent poison from leaking across what used to be sequential,
// safe, scalar boolean logic operations, the reduction operand must be
// frozen.
- if (isa<SelectInst>(RdxRootInst) && isBoolLogicOp(RdxRootInst))
+ if (isBoolLogicOp(RdxRootInst))
VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
Value *ReducedSubTree =
@@ -11288,23 +12741,59 @@ public:
ReducedSubTree, "op.rdx", ReductionOps);
}
// Count vectorized reduced values to exclude them from final reduction.
- for (Value *V : VL)
- ++VectorizedVals.try_emplace(TrackedToOrig.find(V)->second, 0)
+ for (Value *RdxVal : VL) {
+ ++VectorizedVals.try_emplace(TrackedToOrig.find(RdxVal)->second, 0)
.first->getSecond();
+ if (!V.isVectorized(RdxVal))
+ RequiredExtract.insert(RdxVal);
+ }
Pos += ReduxWidth;
Start = Pos;
ReduxWidth = PowerOf2Floor(NumReducedVals - Pos);
}
}
if (VectorizedTree) {
+ // Reorder operands of a bool logical op into their natural order to avoid
+ // possible problems with poison propagation. If reordering is not possible
+ // (both operands are originally RHS operands), emit an extra freeze
+ // instruction for the LHS operand.
+ // I.e., if we have original code like this:
+ // RedOp1 = select i1 ?, i1 LHS, i1 false
+ // RedOp2 = select i1 RHS, i1 ?, i1 false
+
+ // Then, we swap LHS/RHS to create a new op that matches the poison
+ // semantics of the original code.
+
+ // If we have original code like this and both values could be poison:
+ // RedOp1 = select i1 ?, i1 LHS, i1 false
+ // RedOp2 = select i1 ?, i1 RHS, i1 false
+
+ // Then, we must freeze LHS in the new op.
+ auto &&FixBoolLogicalOps =
+ [&Builder, VectorizedTree](Value *&LHS, Value *&RHS,
+ Instruction *RedOp1, Instruction *RedOp2) {
+ if (!isBoolLogicOp(RedOp1))
+ return;
+ if (LHS == VectorizedTree || getRdxOperand(RedOp1, 0) == LHS ||
+ isGuaranteedNotToBePoison(LHS))
+ return;
+ if (!isBoolLogicOp(RedOp2))
+ return;
+ if (RHS == VectorizedTree || getRdxOperand(RedOp2, 0) == RHS ||
+ isGuaranteedNotToBePoison(RHS)) {
+ std::swap(LHS, RHS);
+ return;
+ }
+ LHS = Builder.CreateFreeze(LHS);
+ };
// Finish the reduction.
// Need to add extra arguments and not vectorized possible reduction
// values.
// Try to avoid dependencies between the scalar remainders after
// reductions.
auto &&FinalGen =
- [this, &Builder,
- &TrackedVals](ArrayRef<std::pair<Instruction *, Value *>> InstVals) {
+ [this, &Builder, &TrackedVals, &FixBoolLogicalOps](
+ ArrayRef<std::pair<Instruction *, Value *>> InstVals) {
unsigned Sz = InstVals.size();
SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
Sz % 2);
@@ -11321,6 +12810,11 @@ public:
auto It2 = TrackedVals.find(RdxVal2);
if (It2 != TrackedVals.end())
StableRdxVal2 = It2->second;
+ // To prevent poison from leaking across what used to be
+ // sequential, safe, scalar boolean logic operations, the
+ // reduction operand must be frozen.
+ FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
+ RedOp);
Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
StableRdxVal2, "op.rdx", ReductionOps);
ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
@@ -11330,6 +12824,8 @@ public:
return ExtraReds;
};
SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
+ ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
+ VectorizedTree);
SmallPtrSet<Value *, 8> Visited;
for (ArrayRef<Value *> Candidates : ReducedVals) {
for (Value *RdxVal : Candidates) {
@@ -11337,7 +12833,7 @@ public:
continue;
unsigned NumOps = VectorizedVals.lookup(RdxVal);
for (Instruction *RedOp :
- makeArrayRef(ReducedValsToOps.find(RdxVal)->second)
+ ArrayRef(ReducedValsToOps.find(RdxVal)->second)
.drop_back(NumOps))
ExtraReductions.emplace_back(RedOp, RdxVal);
}
@@ -11349,22 +12845,12 @@ public:
}
// Iterate through all not-vectorized reduction values/extra arguments.
while (ExtraReductions.size() > 1) {
+ VectorizedTree = ExtraReductions.front().second;
SmallVector<std::pair<Instruction *, Value *>> NewReds =
FinalGen(ExtraReductions);
ExtraReductions.swap(NewReds);
}
- // Final reduction.
- if (ExtraReductions.size() == 1) {
- Instruction *RedOp = ExtraReductions.back().first;
- Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
- Value *RdxVal = ExtraReductions.back().second;
- Value *StableRdxVal = RdxVal;
- auto It = TrackedVals.find(RdxVal);
- if (It != TrackedVals.end())
- StableRdxVal = It->second;
- VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
- StableRdxVal, "op.rdx", ReductionOps);
- }
+ VectorizedTree = ExtraReductions.front().second;
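FinalGen above halves the list of leftover (instruction, value) pairs on every call by combining neighbours with createOp, and the surrounding while loop repeats this until a single value remains, which becomes the final reduced tree. A standalone plain-C++ sketch of that pairwise folding, with integer addition standing in for the reduction operation:

#include <cstdio>
#include <vector>

// Combine neighbouring elements pairwise; an odd trailing element is carried
// over unchanged, mirroring the Sz / 2 + Sz % 2 sizing of ExtraReds.
static std::vector<int> finalGen(const std::vector<int> &Vals) {
  std::vector<int> Out((Vals.size() + 1) / 2);
  for (size_t I = 0; I + 1 < Vals.size(); I += 2)
    Out[I / 2] = Vals[I] + Vals[I + 1];          // createOp(..., RdxKind, ...)
  if (Vals.size() % 2)
    Out.back() = Vals.back();
  return Out;
}

int main() {
  std::vector<int> Extra = {1, 2, 3, 4, 5};      // leftover scalar remainders
  while (Extra.size() > 1)
    Extra = finalGen(Extra);
  std::printf("final reduced value: %d\n", Extra.front());
  return 0;
}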
ReductionRoot->replaceAllUsesWith(VectorizedTree);
@@ -11495,7 +12981,7 @@ private:
} // end anonymous namespace
-static Optional<unsigned> getAggregateSize(Instruction *InsertInst) {
+static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
return cast<FixedVectorType>(IE->getType())->getNumElements();
@@ -11506,7 +12992,7 @@ static Optional<unsigned> getAggregateSize(Instruction *InsertInst) {
if (auto *ST = dyn_cast<StructType>(CurrentType)) {
for (auto *Elt : ST->elements())
if (Elt != ST->getElementType(0)) // check homogeneity
- return None;
+ return std::nullopt;
AggregateSize *= ST->getNumElements();
CurrentType = ST->getElementType(0);
} else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
@@ -11518,7 +13004,7 @@ static Optional<unsigned> getAggregateSize(Instruction *InsertInst) {
} else if (CurrentType->isSingleValueType()) {
return AggregateSize;
} else {
- return None;
+ return std::nullopt;
}
} while (true);
}
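getAggregateSize flattens a homogeneous nested aggregate into its total number of scalar elements by multiplying the element counts of each struct/array level until a single-value type is reached, and bails out with std::nullopt otherwise. A small standalone model of the same computation over a toy type description (a list of per-level element counts; the encoding is illustrative only):

#include <cstdio>
#include <optional>
#include <vector>

// Toy model: each nesting level contributes a fixed element count; a scalar
// leaf ends the walk. Returns std::nullopt for an "unsupported" level (0),
// mirroring the bail-out for non-homogeneous or unhandled types.
static std::optional<unsigned> aggregateSize(const std::vector<unsigned> &Lvls) {
  unsigned Total = 1;
  for (unsigned Count : Lvls) {
    if (Count == 0)
      return std::nullopt;
    Total *= Count;
  }
  return Total;
}

int main() {
  std::vector<unsigned> ArrayOfArrays = {2, 3}; // e.g. [2 x [3 x float]]
  if (std::optional<unsigned> Sz = aggregateSize(ArrayOfArrays))
    std::printf("flattened element count: %u\n", *Sz);
  return 0;
}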
@@ -11530,12 +13016,11 @@ static void findBuildAggregate_rec(Instruction *LastInsertInst,
unsigned OperandOffset) {
do {
Value *InsertedOperand = LastInsertInst->getOperand(1);
- Optional<unsigned> OperandIndex =
+ std::optional<unsigned> OperandIndex =
getInsertIndex(LastInsertInst, OperandOffset);
if (!OperandIndex)
return;
- if (isa<InsertElementInst>(InsertedOperand) ||
- isa<InsertValueInst>(InsertedOperand)) {
+ if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
BuildVectorOpds, InsertElts, *OperandIndex);
@@ -11545,8 +13030,7 @@ static void findBuildAggregate_rec(Instruction *LastInsertInst,
}
LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
} while (LastInsertInst != nullptr &&
- (isa<InsertValueInst>(LastInsertInst) ||
- isa<InsertElementInst>(LastInsertInst)) &&
+ isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
LastInsertInst->hasOneUse());
}
@@ -11576,7 +13060,7 @@ static bool findBuildAggregate(Instruction *LastInsertInst,
assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
"Expected empty result vectors!");
- Optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
+ std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
if (!AggregateSize)
return false;
BuildVectorOpds.resize(*AggregateSize);
@@ -11660,28 +13144,19 @@ static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
return false;
}
-/// Attempt to reduce a horizontal reduction.
-/// If it is legal to match a horizontal reduction feeding the phi node \a P
-/// with reduction operators \a Root (or one of its operands) in a basic block
-/// \a BB, then check if it can be done. If horizontal reduction is not found
-/// and root instruction is a binary operation, vectorization of the operands is
-/// attempted.
-/// \returns true if a horizontal reduction was matched and reduced or operands
-/// of one of the binary instruction were vectorized.
-/// \returns false if a horizontal reduction was not matched (or not possible)
-/// or no vectorization of any binary operation feeding \a Root instruction was
-/// performed.
-static bool tryToVectorizeHorReductionOrInstOperands(
- PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
- TargetTransformInfo *TTI, ScalarEvolution &SE, const DataLayout &DL,
- const TargetLibraryInfo &TLI,
- const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) {
+bool SLPVectorizerPass::vectorizeHorReduction(
+ PHINode *P, Value *V, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI,
+ SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
if (!ShouldVectorizeHor)
return false;
+ auto *Root = dyn_cast_or_null<Instruction>(V);
if (!Root)
return false;
+ if (!isa<BinaryOperator>(Root))
+ P = nullptr;
+
if (Root->getParent() != BB || isa<PHINode>(Root))
return false;
// Start analysis starting from Root instruction. If horizontal reduction is
@@ -11693,25 +13168,22 @@ static bool tryToVectorizeHorReductionOrInstOperands(
// horizontal reduction.
// Interrupt the process if the Root instruction itself was vectorized or all
// sub-trees not higher than RecursionMaxDepth were analyzed/vectorized.
- // Skip the analysis of CmpInsts. Compiler implements postanalysis of the
- // CmpInsts so we can skip extra attempts in
- // tryToVectorizeHorReductionOrInstOperands and save compile time.
+ // If a horizontal reduction was not matched or vectorized, we collect
+ // instructions for possible later vectorization attempts.
std::queue<std::pair<Instruction *, unsigned>> Stack;
Stack.emplace(Root, 0);
SmallPtrSet<Value *, 8> VisitedInstrs;
- SmallVector<WeakTrackingVH> PostponedInsts;
bool Res = false;
- auto &&TryToReduce = [TTI, &SE, &DL, &P, &R, &TLI](Instruction *Inst,
- Value *&B0,
- Value *&B1) -> Value * {
+ auto &&TryToReduce = [this, TTI, &P, &R](Instruction *Inst, Value *&B0,
+ Value *&B1) -> Value * {
if (R.isAnalyzedReductionRoot(Inst))
return nullptr;
bool IsBinop = matchRdxBop(Inst, B0, B1);
bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value()));
if (IsBinop || IsSelect) {
HorizontalReduction HorRdx;
- if (HorRdx.matchAssociativeReduction(P, Inst, SE, DL, TLI))
- return HorRdx.tryToReduce(R, TTI);
+ if (HorRdx.matchAssociativeReduction(P, Inst, *SE, *DL, *TLI))
+ return HorRdx.tryToReduce(R, TTI, *TLI);
}
return nullptr;
};
@@ -11752,9 +13224,8 @@ static bool tryToVectorizeHorReductionOrInstOperands(
// Set P to nullptr to avoid re-analysis of phi node in
// matchAssociativeReduction function unless this is the root node.
P = nullptr;
- // Do not try to vectorize CmpInst operands, this is done separately.
- // Final attempt for binop args vectorization should happen after the loop
- // to try to find reductions.
+ // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
+ // analysis is done separately.
if (!isa<CmpInst, InsertElementInst, InsertValueInst>(Inst))
PostponedInsts.push_back(Inst);
}
@@ -11772,29 +13243,25 @@ static bool tryToVectorizeHorReductionOrInstOperands(
!R.isDeleted(I) && I->getParent() == BB)
Stack.emplace(I, Level);
}
- // Try to vectorized binops where reductions were not found.
- for (Value *V : PostponedInsts)
- if (auto *Inst = dyn_cast<Instruction>(V))
- if (!R.isDeleted(Inst))
- Res |= Vectorize(Inst, R);
return Res;
}
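vectorizeHorReduction walks from the root through operand instructions with an explicit (instruction, depth) queue, stops at RecursionMaxDepth, tries to match a reduction at each node, and records the remaining candidates in PostponedInsts for the caller's later plain-vectorization attempt. A rough standalone sketch of that traversal shape over a toy operand graph (node layout and the depth limit are illustrative; the queue is named Stack to mirror the source):

#include <cstdio>
#include <queue>
#include <set>
#include <utility>
#include <vector>

struct Node {
  int Id;
  bool IsReduction;              // "matched a horizontal reduction" stand-in
  std::vector<Node *> Operands;
};

int main() {
  const unsigned RecursionMaxDepth = 2;           // illustrative limit
  Node C{2, false, {}}, B{1, true, {&C}}, Root{0, false, {&B, &C}};
  std::queue<std::pair<Node *, unsigned>> Stack;  // name mirrors the source
  Stack.emplace(&Root, 0);
  std::set<int> Visited;
  std::vector<Node *> Postponed;
  while (!Stack.empty()) {
    auto [N, Level] = Stack.front();
    Stack.pop();
    if (!Visited.insert(N->Id).second)
      continue;
    if (N->IsReduction) {
      std::printf("node %d: matched and reduced\n", N->Id);
      continue;                  // do not descend into a reduced subtree
    }
    Postponed.push_back(N);      // retry as a plain vectorization candidate
    if (Level < RecursionMaxDepth)
      for (Node *Op : N->Operands)
        Stack.emplace(Op, Level + 1);
  }
  std::printf("%zu postponed instruction(s)\n", Postponed.size());
  return 0;
}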
bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
BasicBlock *BB, BoUpSLP &R,
TargetTransformInfo *TTI) {
- auto *I = dyn_cast_or_null<Instruction>(V);
- if (!I)
- return false;
+ SmallVector<WeakTrackingVH> PostponedInsts;
+ bool Res = vectorizeHorReduction(P, V, BB, R, TTI, PostponedInsts);
+ Res |= tryToVectorize(PostponedInsts, R);
+ return Res;
+}
- if (!isa<BinaryOperator>(I))
- P = nullptr;
- // Try to match and vectorize a horizontal reduction.
- auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool {
- return tryToVectorize(I, R);
- };
- return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI, *SE, *DL,
- *TLI, ExtraVectorization);
+bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
+ BoUpSLP &R) {
+ bool Res = false;
+ for (Value *V : Insts)
+ if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
+ Res |= tryToVectorize(Inst, R);
+ return Res;
}
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
@@ -11864,7 +13331,7 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming,
// same/alternate ops only, this may result in some extra final
// vectorization.
if (NumElts > 1 &&
- TryToVectorizeHelper(makeArrayRef(IncIt, NumElts), LimitForRegisterSize)) {
+ TryToVectorizeHelper(ArrayRef(IncIt, NumElts), LimitForRegisterSize)) {
// Success start over because instructions might have been changed.
Changed = true;
} else if (NumElts < Limit(*IncIt) &&
@@ -11886,8 +13353,9 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming,
while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It))
++SameTypeIt;
unsigned NumElts = (SameTypeIt - It);
- if (NumElts > 1 && TryToVectorizeHelper(makeArrayRef(It, NumElts),
- /*LimitForRegisterSize=*/false))
+ if (NumElts > 1 &&
+ TryToVectorizeHelper(ArrayRef(It, NumElts),
+ /*LimitForRegisterSize=*/false))
Changed = true;
It = SameTypeIt;
}
@@ -11909,7 +13377,7 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming,
/// predicate of the second or the operands IDs are less than the operands IDs
/// of the second cmp instruction.
template <bool IsCompatibility>
-static bool compareCmp(Value *V, Value *V2,
+static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
function_ref<bool(Instruction *)> IsDeleted) {
auto *CI1 = cast<CmpInst>(V);
auto *CI2 = cast<CmpInst>(V2);
@@ -11945,7 +13413,7 @@ static bool compareCmp(Value *V, Value *V2,
if (auto *I2 = dyn_cast<Instruction>(Op2)) {
if (I1->getParent() != I2->getParent())
return false;
- InstructionsState S = getSameOpcode({I1, I2});
+ InstructionsState S = getSameOpcode({I1, I2}, TLI);
if (S.getOpcode())
continue;
return false;
@@ -11954,25 +13422,35 @@ static bool compareCmp(Value *V, Value *V2,
return IsCompatibility;
}
-bool SLPVectorizerPass::vectorizeSimpleInstructions(
- SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R,
- bool AtTerminator) {
+bool SLPVectorizerPass::vectorizeSimpleInstructions(InstSetVector &Instructions,
+ BasicBlock *BB, BoUpSLP &R,
+ bool AtTerminator) {
bool OpsChanged = false;
SmallVector<Instruction *, 4> PostponedCmps;
+ SmallVector<WeakTrackingVH> PostponedInsts;
+ // pass1 - try to vectorize reductions only
for (auto *I : reverse(Instructions)) {
if (R.isDeleted(I))
continue;
+ if (isa<CmpInst>(I)) {
+ PostponedCmps.push_back(I);
+ continue;
+ }
+ OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, TTI, PostponedInsts);
+ }
+ // pass2 - try to match and vectorize a buildvector sequence.
+ for (auto *I : reverse(Instructions)) {
+ if (R.isDeleted(I) || isa<CmpInst>(I))
+ continue;
if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
} else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
- } else if (isa<CmpInst>(I)) {
- PostponedCmps.push_back(I);
- continue;
}
- // Try to find reductions in buildvector sequnces.
- OpsChanged |= vectorizeRootInstruction(nullptr, I, BB, R, TTI);
}
+ // Now try to vectorize postponed instructions.
+ OpsChanged |= tryToVectorize(PostponedInsts, R);
+
if (AtTerminator) {
// Try to find reductions first.
for (Instruction *I : PostponedCmps) {
@@ -11989,15 +13467,15 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions(
}
// Try to vectorize list of compares.
// Sort by type, compare predicate, etc.
- auto &&CompareSorter = [&R](Value *V, Value *V2) {
- return compareCmp<false>(V, V2,
+ auto CompareSorter = [&](Value *V, Value *V2) {
+ return compareCmp<false>(V, V2, *TLI,
[&R](Instruction *I) { return R.isDeleted(I); });
};
- auto &&AreCompatibleCompares = [&R](Value *V1, Value *V2) {
+ auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
if (V1 == V2)
return true;
- return compareCmp<true>(V1, V2,
+ return compareCmp<true>(V1, V2, *TLI,
[&R](Instruction *I) { return R.isDeleted(I); });
};
auto Limit = [&R](Value *V) {
@@ -12025,9 +13503,10 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions(
/*LimitForRegisterSize=*/true);
Instructions.clear();
} else {
+ Instructions.clear();
// Insert in reverse order since the PostponedCmps vector was filled in
// reverse order.
- Instructions.assign(PostponedCmps.rbegin(), PostponedCmps.rend());
+ Instructions.insert(PostponedCmps.rbegin(), PostponedCmps.rend());
}
return OpsChanged;
}
@@ -12056,7 +13535,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
return true;
if (Opcodes1.size() > Opcodes2.size())
return false;
- Optional<bool> ConstOrder;
+ std::optional<bool> ConstOrder;
for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
// Undefs are compatible with any other value.
if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I])) {
@@ -12078,7 +13557,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
"Different nodes should have different DFS numbers");
if (NodeI1 != NodeI2)
return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
- InstructionsState S = getSameOpcode({I1, I2});
+ InstructionsState S = getSameOpcode({I1, I2}, *TLI);
if (S.getOpcode())
continue;
return I1->getOpcode() < I2->getOpcode();
@@ -12095,7 +13574,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
}
return ConstOrder && *ConstOrder;
};
- auto AreCompatiblePHIs = [&PHIToOpcodes](Value *V1, Value *V2) {
+ auto AreCompatiblePHIs = [&PHIToOpcodes, this](Value *V1, Value *V2) {
if (V1 == V2)
return true;
if (V1->getType() != V2->getType())
@@ -12112,7 +13591,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
if (I1->getParent() != I2->getParent())
return false;
- InstructionsState S = getSameOpcode({I1, I2});
+ InstructionsState S = getSameOpcode({I1, I2}, *TLI);
if (S.getOpcode())
continue;
return false;
@@ -12180,7 +13659,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
VisitedInstrs.clear();
- SmallVector<Instruction *, 8> PostProcessInstructions;
+ InstSetVector PostProcessInstructions;
SmallDenseSet<Instruction *, 4> KeyNodes;
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
// Skip instructions with scalable type. The num of elements is unknown at
@@ -12232,8 +13711,12 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
!DT->isReachableFromEntry(P->getIncomingBlock(I)))
continue;
- Changed |= vectorizeRootInstruction(nullptr, P->getIncomingValue(I),
- P->getIncomingBlock(I), R, TTI);
+ // Postponed instructions should not be vectorized here, delay their
+ // vectorization.
+ if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
+ PI && !PostProcessInstructions.contains(PI))
+ Changed |= vectorizeRootInstruction(nullptr, P->getIncomingValue(I),
+ P->getIncomingBlock(I), R, TTI);
}
continue;
}
@@ -12241,14 +13724,31 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// Ran into an instruction without users, like terminator, or function call
// with ignored return value, store. Ignore unused instructions (based on
// instruction type, except for CallInst and InvokeInst).
- if (it->use_empty() && (it->getType()->isVoidTy() || isa<CallInst>(it) ||
- isa<InvokeInst>(it))) {
+ if (it->use_empty() &&
+ (it->getType()->isVoidTy() || isa<CallInst, InvokeInst>(it))) {
KeyNodes.insert(&*it);
bool OpsChanged = false;
- if (ShouldStartVectorizeHorAtStore || !isa<StoreInst>(it)) {
+ auto *SI = dyn_cast<StoreInst>(it);
+ bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
+ if (SI) {
+ auto I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
+ // Try to vectorize the chain that feeds this store, if this is the only
+ // store to the address in the block.
+ // TODO: This is just a temporary solution to save compile time. Need
+ // to investigate if we can safely turn on slp-vectorize-hor-store
+ // instead to allow lookup for reduction chains in all non-vectorized
+ // stores (need to check side effects and compile time).
+ TryToVectorizeRoot = (I == Stores.end() || I->second.size() == 1) &&
+ SI->getValueOperand()->hasOneUse();
+ }
+ if (TryToVectorizeRoot) {
for (auto *V : it->operand_values()) {
- // Try to match and vectorize a horizontal reduction.
- OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI);
+ // Postponed instructions should not be vectorized here, delay their
+ // vectorization.
+ if (auto *VI = dyn_cast<Instruction>(V);
+ VI && !PostProcessInstructions.contains(VI))
+ // Try to match and vectorize a horizontal reduction.
+ OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI);
}
}
// Start vectorization of post-process list of instructions from the
@@ -12266,9 +13766,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
}
}
- if (isa<InsertElementInst>(it) || isa<CmpInst>(it) ||
- isa<InsertValueInst>(it))
- PostProcessInstructions.push_back(&*it);
+ if (isa<CmpInst, InsertElementInst, InsertValueInst>(it))
+ PostProcessInstructions.insert(&*it);
}
return Changed;
@@ -12395,7 +13894,7 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
"Different nodes should have different DFS numbers");
if (NodeI1 != NodeI2)
return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
- InstructionsState S = getSameOpcode({I1, I2});
+ InstructionsState S = getSameOpcode({I1, I2}, *TLI);
if (S.getOpcode())
return false;
return I1->getOpcode() < I2->getOpcode();
@@ -12407,7 +13906,7 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
V2->getValueOperand()->getValueID();
};
- auto &&AreCompatibleStores = [](StoreInst *V1, StoreInst *V2) {
+ auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
if (V1 == V2)
return true;
if (V1->getPointerOperandType() != V2->getPointerOperandType())
@@ -12420,7 +13919,7 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
if (I1->getParent() != I2->getParent())
return false;
- InstructionsState S = getSameOpcode({I1, I2});
+ InstructionsState S = getSameOpcode({I1, I2}, *TLI);
return S.getOpcode() > 0;
}
if (isa<Constant>(V1->getValueOperand()) &&