path: root/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
author	Dimitry Andric <dim@FreeBSD.org>	2022-07-04 19:20:19 +0000
committer	Dimitry Andric <dim@FreeBSD.org>	2023-02-08 19:02:26 +0000
commit	81ad626541db97eb356e2c1d4a20eb2a26a766ab (patch)
tree	311b6a8987c32b1e1dcbab65c54cfac3fdb56175 /contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
parent	5fff09660e06a66bed6482da9c70df328e16bbb6 (diff)
parent	145449b1e420787bb99721a429341fa6be3adfb6 (diff)
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r--	contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp	4331
1 file changed, 3266 insertions, 1065 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 644372483edd..019a09665a67 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -53,7 +53,6 @@
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
@@ -64,7 +63,6 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
-#include "llvm/IR/NoFolder.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
@@ -72,8 +70,9 @@
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
+#ifdef EXPENSIVE_CHECKS
#include "llvm/IR/Verifier.h"
-#include "llvm/InitializePasses.h"
+#endif
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
@@ -87,6 +86,7 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
@@ -164,13 +164,14 @@ static cl::opt<int> LookAheadMaxDepth(
"slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
cl::desc("The maximum look-ahead depth for operand reordering scores"));
-// The Look-ahead heuristic goes through the users of the bundle to calculate
-// the users cost in getExternalUsesCost(). To avoid compilation time increase
-// we limit the number of users visited to this value.
-static cl::opt<unsigned> LookAheadUsersBudget(
- "slp-look-ahead-users-budget", cl::init(2), cl::Hidden,
- cl::desc("The maximum number of users to visit while visiting the "
- "predecessors. This prevents compilation time increase."));
+// The maximum depth that the look-ahead score heuristic will explore
+// when probing among candidates for vectorization tree roots.
+// The higher this value, the higher the compilation time overhead, but unlike
+// the similar limit for operand reordering this is used less frequently, so
+// the impact of a higher value is less noticeable.
+static cl::opt<int> RootLookAheadMaxDepth(
+ "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
+ cl::desc("The maximum look-ahead depth for searching best rooting option"));
static cl::opt<bool>
ViewSLPTree("view-slp-tree", cl::Hidden,
@@ -471,17 +472,36 @@ static bool isValidForAlternation(unsigned Opcode) {
return true;
}
+static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
+ unsigned BaseIndex = 0);
+
+/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
+/// compatible instructions or constants, or just some other regular values.
+static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
+ Value *Op1) {
+ return (isConstant(BaseOp0) && isConstant(Op0)) ||
+ (isConstant(BaseOp1) && isConstant(Op1)) ||
+ (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
+ !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
+ getSameOpcode({BaseOp0, Op0}).getOpcode() ||
+ getSameOpcode({BaseOp1, Op1}).getOpcode();
+}
+
/// \returns analysis of the Instructions in \p VL described in
/// InstructionsState, the Opcode that we suppose the whole list
/// could be vectorized even if its structure is diverse.
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
- unsigned BaseIndex = 0) {
+ unsigned BaseIndex) {
// Make sure these are all Instructions.
if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
return InstructionsState(VL[BaseIndex], nullptr, nullptr);
bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
+ bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
+ CmpInst::Predicate BasePred =
+ IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
+ : CmpInst::BAD_ICMP_PREDICATE;
unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
unsigned AltOpcode = Opcode;
unsigned AltIndex = BaseIndex;
@@ -514,6 +534,57 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
continue;
}
}
+ } else if (IsCmpOp && isa<CmpInst>(VL[Cnt])) {
+ auto *BaseInst = cast<Instruction>(VL[BaseIndex]);
+ auto *Inst = cast<Instruction>(VL[Cnt]);
+ Type *Ty0 = BaseInst->getOperand(0)->getType();
+ Type *Ty1 = Inst->getOperand(0)->getType();
+ if (Ty0 == Ty1) {
+ Value *BaseOp0 = BaseInst->getOperand(0);
+ Value *BaseOp1 = BaseInst->getOperand(1);
+ Value *Op0 = Inst->getOperand(0);
+ Value *Op1 = Inst->getOperand(1);
+ CmpInst::Predicate CurrentPred =
+ cast<CmpInst>(VL[Cnt])->getPredicate();
+ CmpInst::Predicate SwappedCurrentPred =
+ CmpInst::getSwappedPredicate(CurrentPred);
+ // Check for compatible operands. If the corresponding operands are not
+          // compatible, we need to perform alternate vectorization.
+ if (InstOpcode == Opcode) {
+ if (BasePred == CurrentPred &&
+ areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1))
+ continue;
+ if (BasePred == SwappedCurrentPred &&
+ areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0))
+ continue;
+ if (E == 2 &&
+ (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
+ continue;
+ auto *AltInst = cast<CmpInst>(VL[AltIndex]);
+ CmpInst::Predicate AltPred = AltInst->getPredicate();
+ Value *AltOp0 = AltInst->getOperand(0);
+ Value *AltOp1 = AltInst->getOperand(1);
+ // Check if operands are compatible with alternate operands.
+ if (AltPred == CurrentPred &&
+ areCompatibleCmpOps(AltOp0, AltOp1, Op0, Op1))
+ continue;
+ if (AltPred == SwappedCurrentPred &&
+ areCompatibleCmpOps(AltOp0, AltOp1, Op1, Op0))
+ continue;
+ }
+ if (BaseIndex == AltIndex && BasePred != CurrentPred) {
+ assert(isValidForAlternation(Opcode) &&
+ isValidForAlternation(InstOpcode) &&
+ "Cast isn't safe for alternation, logic needs to be updated!");
+ AltIndex = Cnt;
+ continue;
+ }
+ auto *AltInst = cast<CmpInst>(VL[AltIndex]);
+ CmpInst::Predicate AltPred = AltInst->getPredicate();
+ if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
+ AltPred == CurrentPred || AltPred == SwappedCurrentPred)
+ continue;
+ }
} else if (InstOpcode == Opcode || InstOpcode == AltOpcode)
continue;
return InstructionsState(VL[BaseIndex], nullptr, nullptr);
@@ -570,7 +641,7 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
CallInst *CI = cast<CallInst>(UserInst);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) {
- if (hasVectorInstrinsicScalarOpd(ID, i))
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, i))
return (CI->getArgOperand(i) == Scalar);
}
LLVM_FALLTHROUGH;
@@ -666,11 +737,11 @@ static void inversePermutation(ArrayRef<unsigned> Indices,
/// \returns inserting index of InsertElement or InsertValue instruction,
/// using Offset as base offset for index.
-static Optional<unsigned> getInsertIndex(Value *InsertInst,
+static Optional<unsigned> getInsertIndex(const Value *InsertInst,
unsigned Offset = 0) {
int Index = Offset;
- if (auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
- if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {
+ if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
+ if (const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {
auto *VT = cast<FixedVectorType>(IE->getType());
if (CI->getValue().uge(VT->getNumElements()))
return None;
@@ -681,13 +752,13 @@ static Optional<unsigned> getInsertIndex(Value *InsertInst,
return None;
}
- auto *IV = cast<InsertValueInst>(InsertInst);
+ const auto *IV = cast<InsertValueInst>(InsertInst);
Type *CurrentType = IV->getType();
for (unsigned I : IV->indices()) {
- if (auto *ST = dyn_cast<StructType>(CurrentType)) {
+ if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
Index *= ST->getNumElements();
CurrentType = ST->getElementType(I);
- } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
+ } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
Index *= AT->getNumElements();
CurrentType = AT->getElementType();
} else {
@@ -698,11 +769,7 @@ static Optional<unsigned> getInsertIndex(Value *InsertInst,
return Index;
}
-/// Reorders the list of scalars in accordance with the given \p Order and then
-/// the \p Mask. \p Order - is the original order of the scalars, need to
-/// reorder scalars into an unordered state at first according to the given
-/// order. Then the ordered scalars are shuffled once again in accordance with
-/// the provided mask.
+/// Reorders the list of scalars in accordance with the given \p Mask.
static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
ArrayRef<int> Mask) {
assert(!Mask.empty() && "Expected non-empty mask.");
@@ -714,6 +781,58 @@ static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
Scalars[Mask[I]] = Prev[I];
}
+/// Checks if the provided value does not require scheduling. It does not
+/// require scheduling if this is not an instruction or it is an instruction
+/// that does not read/write memory and all operands are either not instructions
+/// or phi nodes or instructions from different blocks.
+static bool areAllOperandsNonInsts(Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return true;
+ return !mayHaveNonDefUseDependency(*I) &&
+ all_of(I->operands(), [I](Value *V) {
+ auto *IO = dyn_cast<Instruction>(V);
+ if (!IO)
+ return true;
+ return isa<PHINode>(IO) || IO->getParent() != I->getParent();
+ });
+}
+
+/// Checks if the provided value does not require scheduling. It does not
+/// require scheduling if this is not an instruction or it is an instruction
+/// that does not read/write memory and all users are phi nodes or instructions
+/// from the different blocks.
+static bool isUsedOutsideBlock(Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return true;
+ // Limits the number of uses to save compile time.
+ constexpr int UsesLimit = 8;
+ return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
+ all_of(I->users(), [I](User *U) {
+ auto *IU = dyn_cast<Instruction>(U);
+ if (!IU)
+ return true;
+ return IU->getParent() != I->getParent() || isa<PHINode>(IU);
+ });
+}
+
+/// Checks if the specified value does not require scheduling. It does not
+/// require scheduling if all operands and all users do not need to be scheduled
+/// in the current basic block.
+static bool doesNotNeedToBeScheduled(Value *V) {
+ return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
+}
+
+/// Checks if the specified array of instructions does not require scheduling.
+/// It is so if either all instructions have operands that do not require
+/// scheduling, or all their users do not require scheduling since they are
+/// phis or in other basic blocks.
+static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
+ return !VL.empty() &&
+ (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
+}
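// [Editor's sketch, not part of this patch] A minimal standalone model of the
// decomposition used by the helpers above: a value needs no scheduling when
// both its operand side and its user side are "external" to its block. The
// ToyInst type and its fields are hypothetical stand-ins for llvm::Instruction
// (mayHaveNonDefUseDependency(), operands(), users(), getParent()).
#include <algorithm>
#include <vector>

struct ToyInst {
  bool MayTouchMemory = false;        // stands in for the memory-effect checks
  bool IsPhi = false;
  int Block = 0;                      // stands in for the parent basic block
  std::vector<const ToyInst *> Ops;   // operand instructions, if any
  std::vector<const ToyInst *> Users; // user instructions, if any
};

// Mirrors areAllOperandsNonInsts(): no memory effects, and every operand is a
// phi or lives in another block.
static bool toyOperandsAreExternal(const ToyInst &I) {
  return !I.MayTouchMemory &&
         std::all_of(I.Ops.begin(), I.Ops.end(), [&I](const ToyInst *Op) {
           return Op->IsPhi || Op->Block != I.Block;
         });
}

// Mirrors isUsedOutsideBlock(): no memory effects, and every user is a phi or
// lives in another block (the real code also caps the number of uses checked).
static bool toyUsedOutsideBlock(const ToyInst &I) {
  return !I.MayTouchMemory &&
         std::all_of(I.Users.begin(), I.Users.end(), [&I](const ToyInst *U) {
           return U->IsPhi || U->Block != I.Block;
         });
}

// Mirrors doesNotNeedToBeScheduled(): both sides must be external.
static bool toyDoesNotNeedToBeScheduled(const ToyInst &I) {
  return toyOperandsAreExternal(I) && toyUsedOutsideBlock(I);
}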
+
namespace slpvectorizer {
/// Bottom Up SLP Vectorizer.
@@ -734,8 +853,8 @@ public:
TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
const DataLayout *DL, OptimizationRemarkEmitter *ORE)
- : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC),
- DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
+ : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li),
+ DT(Dt), AC(AC), DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
CodeMetrics::collectEphemeralValues(F, AC, EphValues);
// Use the vector register size specified by the target unless overridden
// by a command-line option.
@@ -776,7 +895,10 @@ public:
/// Construct a vectorizable tree that starts at \p Roots, ignoring users for
/// the purpose of scheduling and extraction in the \p UserIgnoreLst.
void buildTree(ArrayRef<Value *> Roots,
- ArrayRef<Value *> UserIgnoreLst = None);
+ const SmallDenseSet<Value *> &UserIgnoreLst);
+
+ /// Construct a vectorizable tree that starts at \p Roots.
+ void buildTree(ArrayRef<Value *> Roots);
/// Builds external uses of the vectorized scalars, i.e. the list of
/// vectorized scalars to be extracted, their lanes and their scalar users. \p
@@ -797,6 +919,7 @@ public:
}
MinBWs.clear();
InstrElementSize.clear();
+ UserIgnoreList = nullptr;
}
unsigned getTreeSize() const { return VectorizableTree.size(); }
@@ -810,6 +933,9 @@ public:
/// ExtractElement, ExtractValue), which can be part of the graph.
Optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
+  /// Sort loads into increasing pointer offsets to allow greater clustering.
+ Optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
+
/// Gets reordering data for the given tree entry. If the entry is vectorized
/// - just return ReorderIndices, otherwise check if the scalars can be
/// reordered and return the most optimal order.
@@ -924,96 +1050,18 @@ public:
#endif
};
- /// A helper data structure to hold the operands of a vector of instructions.
- /// This supports a fixed vector length for all operand vectors.
- class VLOperands {
- /// For each operand we need (i) the value, and (ii) the opcode that it
- /// would be attached to if the expression was in a left-linearized form.
- /// This is required to avoid illegal operand reordering.
- /// For example:
- /// \verbatim
- /// 0 Op1
- /// |/
- /// Op1 Op2 Linearized + Op2
- /// \ / ----------> |/
- /// - -
- ///
- /// Op1 - Op2 (0 + Op1) - Op2
- /// \endverbatim
- ///
- /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
- ///
- /// Another way to think of this is to track all the operations across the
- /// path from the operand all the way to the root of the tree and to
- /// calculate the operation that corresponds to this path. For example, the
- /// path from Op2 to the root crosses the RHS of the '-', therefore the
- /// corresponding operation is a '-' (which matches the one in the
- /// linearized tree, as shown above).
- ///
- /// For lack of a better term, we refer to this operation as Accumulated
- /// Path Operation (APO).
- struct OperandData {
- OperandData() = default;
- OperandData(Value *V, bool APO, bool IsUsed)
- : V(V), APO(APO), IsUsed(IsUsed) {}
- /// The operand value.
- Value *V = nullptr;
- /// TreeEntries only allow a single opcode, or an alternate sequence of
- /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
- /// APO. It is set to 'true' if 'V' is attached to an inverse operation
- /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
- /// (e.g., Add/Mul)
- bool APO = false;
- /// Helper data for the reordering function.
- bool IsUsed = false;
- };
-
- /// During operand reordering, we are trying to select the operand at lane
- /// that matches best with the operand at the neighboring lane. Our
- /// selection is based on the type of value we are looking for. For example,
- /// if the neighboring lane has a load, we need to look for a load that is
- /// accessing a consecutive address. These strategies are summarized in the
- /// 'ReorderingMode' enumerator.
- enum class ReorderingMode {
- Load, ///< Matching loads to consecutive memory addresses
- Opcode, ///< Matching instructions based on opcode (same or alternate)
- Constant, ///< Matching constants
- Splat, ///< Matching the same instruction multiple times (broadcast)
- Failed, ///< We failed to create a vectorizable group
- };
-
- using OperandDataVec = SmallVector<OperandData, 2>;
-
- /// A vector of operand vectors.
- SmallVector<OperandDataVec, 4> OpsVec;
-
+ /// A helper class used for scoring candidates for two consecutive lanes.
+ class LookAheadHeuristics {
const DataLayout &DL;
ScalarEvolution &SE;
const BoUpSLP &R;
+ int NumLanes; // Total number of lanes (aka vectorization factor).
+ int MaxLevel; // The maximum recursion depth for accumulating score.
- /// \returns the operand data at \p OpIdx and \p Lane.
- OperandData &getData(unsigned OpIdx, unsigned Lane) {
- return OpsVec[OpIdx][Lane];
- }
-
- /// \returns the operand data at \p OpIdx and \p Lane. Const version.
- const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
- return OpsVec[OpIdx][Lane];
- }
-
- /// Clears the used flag for all entries.
- void clearUsed() {
- for (unsigned OpIdx = 0, NumOperands = getNumOperands();
- OpIdx != NumOperands; ++OpIdx)
- for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
- ++Lane)
- OpsVec[OpIdx][Lane].IsUsed = false;
- }
-
- /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
- void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
- std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
- }
+ public:
+ LookAheadHeuristics(const DataLayout &DL, ScalarEvolution &SE,
+ const BoUpSLP &R, int NumLanes, int MaxLevel)
+ : DL(DL), SE(SE), R(R), NumLanes(NumLanes), MaxLevel(MaxLevel) {}
// The hard-coded scores listed here are not very important, though it shall
// be higher for better matches to improve the resulting cost. When
@@ -1028,6 +1076,11 @@ public:
/// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreConsecutiveLoads = 4;
+    /// The same load multiple times. This should have a better score than
+    /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
+    /// with `movddup (%reg), xmm0`, which has a throughput of 0.5, versus 0.5
+    /// for a vector load and 1.0 for a broadcast.
+ static const int ScoreSplatLoads = 3;
/// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreReversedLoads = 3;
/// ExtractElementInst from same vector and consecutive indexes.
@@ -1046,43 +1099,67 @@ public:
static const int ScoreUndef = 1;
/// Score for failing to find a decent match.
static const int ScoreFail = 0;
- /// User exteranl to the vectorized code.
- static const int ExternalUseCost = 1;
- /// The user is internal but in a different lane.
- static const int UserInDiffLaneCost = ExternalUseCost;
+ /// Score if all users are vectorized.
+ static const int ScoreAllUserVectorized = 1;
/// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
- static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL,
- ScalarEvolution &SE, int NumLanes) {
- if (V1 == V2)
- return VLOperands::ScoreSplat;
+ /// \p U1 and \p U2 are the users of \p V1 and \p V2.
+ /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
+ /// MainAltOps.
+ int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
+ ArrayRef<Value *> MainAltOps) const {
+ if (V1 == V2) {
+ if (isa<LoadInst>(V1)) {
+        // Returns true if the users of V1 and V2 won't need to be extracted.
+ auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
+ // Bail out if we have too many uses to save compilation time.
+ static constexpr unsigned Limit = 8;
+ if (V1->hasNUsesOrMore(Limit) || V2->hasNUsesOrMore(Limit))
+ return false;
+
+ auto AllUsersVectorized = [U1, U2, this](Value *V) {
+ return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
+ return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
+ });
+ };
+ return AllUsersVectorized(V1) && AllUsersVectorized(V2);
+ };
+ // A broadcast of a load can be cheaper on some targets.
+ if (R.TTI->isLegalBroadcastLoad(V1->getType(),
+ ElementCount::getFixed(NumLanes)) &&
+ ((int)V1->getNumUses() == NumLanes ||
+ AllUsersAreInternal(V1, V2)))
+ return LookAheadHeuristics::ScoreSplatLoads;
+ }
+ return LookAheadHeuristics::ScoreSplat;
+ }
auto *LI1 = dyn_cast<LoadInst>(V1);
auto *LI2 = dyn_cast<LoadInst>(V2);
if (LI1 && LI2) {
if (LI1->getParent() != LI2->getParent())
- return VLOperands::ScoreFail;
+ return LookAheadHeuristics::ScoreFail;
Optional<int> Dist = getPointersDiff(
LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
- if (!Dist)
- return VLOperands::ScoreFail;
+ if (!Dist || *Dist == 0)
+ return LookAheadHeuristics::ScoreFail;
// The distance is too large - still may be profitable to use masked
// loads/gathers.
if (std::abs(*Dist) > NumLanes / 2)
- return VLOperands::ScoreAltOpcodes;
+ return LookAheadHeuristics::ScoreAltOpcodes;
// This still will detect consecutive loads, but we might have "holes"
// in some cases. It is ok for non-power-2 vectorization and may produce
// better results. It should not affect current vectorization.
- return (*Dist > 0) ? VLOperands::ScoreConsecutiveLoads
- : VLOperands::ScoreReversedLoads;
+ return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
+ : LookAheadHeuristics::ScoreReversedLoads;
}
auto *C1 = dyn_cast<Constant>(V1);
auto *C2 = dyn_cast<Constant>(V2);
if (C1 && C2)
- return VLOperands::ScoreConstants;
+ return LookAheadHeuristics::ScoreConstants;
    // Extracts from consecutive indexes of the same vector get a better score
    // as the extracts could be optimized away.
@@ -1091,7 +1168,7 @@ public:
if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
// Undefs are always profitable for extractelements.
if (isa<UndefValue>(V2))
- return VLOperands::ScoreConsecutiveExtracts;
+ return LookAheadHeuristics::ScoreConsecutiveExtracts;
Value *EV2 = nullptr;
ConstantInt *Ex2Idx = nullptr;
if (match(V2,
@@ -1099,108 +1176,62 @@ public:
m_Undef())))) {
// Undefs are always profitable for extractelements.
if (!Ex2Idx)
- return VLOperands::ScoreConsecutiveExtracts;
+ return LookAheadHeuristics::ScoreConsecutiveExtracts;
if (isUndefVector(EV2) && EV2->getType() == EV1->getType())
- return VLOperands::ScoreConsecutiveExtracts;
+ return LookAheadHeuristics::ScoreConsecutiveExtracts;
if (EV2 == EV1) {
int Idx1 = Ex1Idx->getZExtValue();
int Idx2 = Ex2Idx->getZExtValue();
int Dist = Idx2 - Idx1;
// The distance is too large - still may be profitable to use
// shuffles.
+ if (std::abs(Dist) == 0)
+ return LookAheadHeuristics::ScoreSplat;
if (std::abs(Dist) > NumLanes / 2)
- return VLOperands::ScoreAltOpcodes;
- return (Dist > 0) ? VLOperands::ScoreConsecutiveExtracts
- : VLOperands::ScoreReversedExtracts;
+ return LookAheadHeuristics::ScoreSameOpcode;
+ return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
+ : LookAheadHeuristics::ScoreReversedExtracts;
}
+ return LookAheadHeuristics::ScoreAltOpcodes;
}
+ return LookAheadHeuristics::ScoreFail;
}
auto *I1 = dyn_cast<Instruction>(V1);
auto *I2 = dyn_cast<Instruction>(V2);
if (I1 && I2) {
if (I1->getParent() != I2->getParent())
- return VLOperands::ScoreFail;
- InstructionsState S = getSameOpcode({I1, I2});
+ return LookAheadHeuristics::ScoreFail;
+ SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
+ Ops.push_back(I1);
+ Ops.push_back(I2);
+ InstructionsState S = getSameOpcode(Ops);
// Note: Only consider instructions with <= 2 operands to avoid
// complexity explosion.
- if (S.getOpcode() && S.MainOp->getNumOperands() <= 2)
- return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes
- : VLOperands::ScoreSameOpcode;
+ if (S.getOpcode() &&
+ (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
+ !S.isAltShuffle()) &&
+ all_of(Ops, [&S](Value *V) {
+ return cast<Instruction>(V)->getNumOperands() ==
+ S.MainOp->getNumOperands();
+ }))
+ return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
+ : LookAheadHeuristics::ScoreSameOpcode;
}
if (isa<UndefValue>(V2))
- return VLOperands::ScoreUndef;
-
- return VLOperands::ScoreFail;
- }
-
- /// Holds the values and their lanes that are taking part in the look-ahead
- /// score calculation. This is used in the external uses cost calculation.
- /// Need to hold all the lanes in case of splat/broadcast at least to
- /// correctly check for the use in the different lane.
- SmallDenseMap<Value *, SmallSet<int, 4>> InLookAheadValues;
-
- /// \returns the additional cost due to uses of \p LHS and \p RHS that are
- /// either external to the vectorized code, or require shuffling.
- int getExternalUsesCost(const std::pair<Value *, int> &LHS,
- const std::pair<Value *, int> &RHS) {
- int Cost = 0;
- std::array<std::pair<Value *, int>, 2> Values = {{LHS, RHS}};
- for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) {
- Value *V = Values[Idx].first;
- if (isa<Constant>(V)) {
- // Since this is a function pass, it doesn't make semantic sense to
- // walk the users of a subclass of Constant. The users could be in
- // another function, or even another module that happens to be in
- // the same LLVMContext.
- continue;
- }
+ return LookAheadHeuristics::ScoreUndef;
- // Calculate the absolute lane, using the minimum relative lane of LHS
- // and RHS as base and Idx as the offset.
- int Ln = std::min(LHS.second, RHS.second) + Idx;
- assert(Ln >= 0 && "Bad lane calculation");
- unsigned UsersBudget = LookAheadUsersBudget;
- for (User *U : V->users()) {
- if (const TreeEntry *UserTE = R.getTreeEntry(U)) {
- // The user is in the VectorizableTree. Check if we need to insert.
- int UserLn = UserTE->findLaneForValue(U);
- assert(UserLn >= 0 && "Bad lane");
- // If the values are different, check just the line of the current
- // value. If the values are the same, need to add UserInDiffLaneCost
- // only if UserLn does not match both line numbers.
- if ((LHS.first != RHS.first && UserLn != Ln) ||
- (LHS.first == RHS.first && UserLn != LHS.second &&
- UserLn != RHS.second)) {
- Cost += UserInDiffLaneCost;
- break;
- }
- } else {
- // Check if the user is in the look-ahead code.
- auto It2 = InLookAheadValues.find(U);
- if (It2 != InLookAheadValues.end()) {
- // The user is in the look-ahead code. Check the lane.
- if (!It2->getSecond().contains(Ln)) {
- Cost += UserInDiffLaneCost;
- break;
- }
- } else {
- // The user is neither in SLP tree nor in the look-ahead code.
- Cost += ExternalUseCost;
- break;
- }
- }
- // Limit the number of visited uses to cap compilation time.
- if (--UsersBudget == 0)
- break;
- }
- }
- return Cost;
+ return LookAheadHeuristics::ScoreFail;
}
- /// Go through the operands of \p LHS and \p RHS recursively until \p
- /// MaxLevel, and return the cummulative score. For example:
+ /// Go through the operands of \p LHS and \p RHS recursively until
+    /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
+ /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
+ /// of \p U1 and \p U2), except at the beginning of the recursion where
+ /// these are set to nullptr.
+ ///
+ /// For example:
/// \verbatim
/// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
/// \ / \ / \ / \ /
@@ -1211,8 +1242,8 @@ public:
/// each level recursively, accumulating the score. It starts from matching
/// the additions at level 0, then moves on to the loads (level 1). The
/// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
- /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while
- /// {A[0],C[0]} has a score of VLOperands::ScoreFail.
+ /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
+ /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
/// Please note that the order of the operands does not matter, as we
/// evaluate the score of all profitable combinations of operands. In
/// other words the score of G1 and G4 is the same as G1 and G2. This
@@ -1220,18 +1251,13 @@ public:
/// Look-ahead SLP: Auto-vectorization in the presence of commutative
/// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
/// Luís F. W. Góes
- int getScoreAtLevelRec(const std::pair<Value *, int> &LHS,
- const std::pair<Value *, int> &RHS, int CurrLevel,
- int MaxLevel) {
+ int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
+ Instruction *U2, int CurrLevel,
+ ArrayRef<Value *> MainAltOps) const {
- Value *V1 = LHS.first;
- Value *V2 = RHS.first;
// Get the shallow score of V1 and V2.
- int ShallowScoreAtThisLevel = std::max(
- (int)ScoreFail, getShallowScore(V1, V2, DL, SE, getNumLanes()) -
- getExternalUsesCost(LHS, RHS));
- int Lane1 = LHS.second;
- int Lane2 = RHS.second;
+ int ShallowScoreAtThisLevel =
+ getShallowScore(LHS, RHS, U1, U2, MainAltOps);
// If reached MaxLevel,
// or if V1 and V2 are not instructions,
@@ -1239,20 +1265,17 @@ public:
// or if they are not consecutive,
// or if profitable to vectorize loads or extractelements, early return
// the current cost.
- auto *I1 = dyn_cast<Instruction>(V1);
- auto *I2 = dyn_cast<Instruction>(V2);
+ auto *I1 = dyn_cast<Instruction>(LHS);
+ auto *I2 = dyn_cast<Instruction>(RHS);
if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
- ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
+ ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
(((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
+ (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
(isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
ShallowScoreAtThisLevel))
return ShallowScoreAtThisLevel;
assert(I1 && I2 && "Should have early exited.");
- // Keep track of in-tree values for determining the external-use cost.
- InLookAheadValues[V1].insert(Lane1);
- InLookAheadValues[V2].insert(Lane2);
-
// Contains the I2 operand indexes that got matched with I1 operands.
SmallSet<unsigned, 4> Op2Used;
@@ -1275,11 +1298,12 @@ public:
if (Op2Used.count(OpIdx2))
continue;
// Recursively calculate the cost at each level
- int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1},
- {I2->getOperand(OpIdx2), Lane2},
- CurrLevel + 1, MaxLevel);
+ int TmpScore =
+ getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
+ I1, I2, CurrLevel + 1, None);
// Look for the best score.
- if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) {
+ if (TmpScore > LookAheadHeuristics::ScoreFail &&
+ TmpScore > MaxTmpScore) {
MaxTmpScore = TmpScore;
MaxOpIdx2 = OpIdx2;
FoundBest = true;
@@ -1293,24 +1317,213 @@ public:
}
return ShallowScoreAtThisLevel;
}
+ };
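// [Editor's sketch, not part of this patch] The recursion above, reduced to a
// toy expression tree: score the two roots, then greedily pair each operand of
// one node with the best-matching unused operand of the other, accumulating
// scores until MaxLevel. ToyNode and toyShallowScore are hypothetical stand-ins
// for llvm::Value and getShallowScore(); the score values are arbitrary.
#include <cstddef>
#include <vector>

struct ToyNode {
  int Kind = 0;                     // stands in for opcode / value category
  std::vector<const ToyNode *> Ops; // operands
};

static int toyShallowScore(const ToyNode &A, const ToyNode &B) {
  return A.Kind == B.Kind ? 2 : 0;  // crude "same opcode" vs. "fail" score
}

static int toyScoreAtLevel(const ToyNode &A, const ToyNode &B, int Level,
                           int MaxLevel) {
  int Score = toyShallowScore(A, B);
  if (Level == MaxLevel || Score == 0)
    return Score;
  std::vector<bool> Used(B.Ops.size(), false);
  for (const ToyNode *OpA : A.Ops) {
    // Find the best not-yet-used operand of B for OpA (greedy matching, just
    // like the Op2Used set in getScoreAtLevelRec).
    int BestScore = 0;
    int BestIdx = -1;
    for (std::size_t I = 0; I < B.Ops.size(); ++I) {
      if (Used[I])
        continue;
      int Tmp = toyScoreAtLevel(*OpA, *B.Ops[I], Level + 1, MaxLevel);
      if (Tmp > BestScore) {
        BestScore = Tmp;
        BestIdx = static_cast<int>(I);
      }
    }
    if (BestIdx >= 0) {
      Used[static_cast<std::size_t>(BestIdx)] = true;
      Score += BestScore; // accumulate the best match found at this level
    }
  }
  return Score;
}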
+ /// A helper data structure to hold the operands of a vector of instructions.
+ /// This supports a fixed vector length for all operand vectors.
+ class VLOperands {
+ /// For each operand we need (i) the value, and (ii) the opcode that it
+ /// would be attached to if the expression was in a left-linearized form.
+ /// This is required to avoid illegal operand reordering.
+ /// For example:
+ /// \verbatim
+ /// 0 Op1
+ /// |/
+ /// Op1 Op2 Linearized + Op2
+ /// \ / ----------> |/
+ /// - -
+ ///
+ /// Op1 - Op2 (0 + Op1) - Op2
+ /// \endverbatim
+ ///
+ /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
+ ///
+ /// Another way to think of this is to track all the operations across the
+ /// path from the operand all the way to the root of the tree and to
+ /// calculate the operation that corresponds to this path. For example, the
+ /// path from Op2 to the root crosses the RHS of the '-', therefore the
+ /// corresponding operation is a '-' (which matches the one in the
+ /// linearized tree, as shown above).
+ ///
+ /// For lack of a better term, we refer to this operation as Accumulated
+ /// Path Operation (APO).
+ struct OperandData {
+ OperandData() = default;
+ OperandData(Value *V, bool APO, bool IsUsed)
+ : V(V), APO(APO), IsUsed(IsUsed) {}
+ /// The operand value.
+ Value *V = nullptr;
+ /// TreeEntries only allow a single opcode, or an alternate sequence of
+ /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
+ /// APO. It is set to 'true' if 'V' is attached to an inverse operation
+ /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
+ /// (e.g., Add/Mul)
+ bool APO = false;
+ /// Helper data for the reordering function.
+ bool IsUsed = false;
+ };
+
+ /// During operand reordering, we are trying to select the operand at lane
+ /// that matches best with the operand at the neighboring lane. Our
+ /// selection is based on the type of value we are looking for. For example,
+ /// if the neighboring lane has a load, we need to look for a load that is
+ /// accessing a consecutive address. These strategies are summarized in the
+ /// 'ReorderingMode' enumerator.
+ enum class ReorderingMode {
+ Load, ///< Matching loads to consecutive memory addresses
+ Opcode, ///< Matching instructions based on opcode (same or alternate)
+ Constant, ///< Matching constants
+ Splat, ///< Matching the same instruction multiple times (broadcast)
+ Failed, ///< We failed to create a vectorizable group
+ };
+
+ using OperandDataVec = SmallVector<OperandData, 2>;
+
+ /// A vector of operand vectors.
+ SmallVector<OperandDataVec, 4> OpsVec;
+
+ const DataLayout &DL;
+ ScalarEvolution &SE;
+ const BoUpSLP &R;
+
+ /// \returns the operand data at \p OpIdx and \p Lane.
+ OperandData &getData(unsigned OpIdx, unsigned Lane) {
+ return OpsVec[OpIdx][Lane];
+ }
+
+ /// \returns the operand data at \p OpIdx and \p Lane. Const version.
+ const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
+ return OpsVec[OpIdx][Lane];
+ }
+
+ /// Clears the used flag for all entries.
+ void clearUsed() {
+ for (unsigned OpIdx = 0, NumOperands = getNumOperands();
+ OpIdx != NumOperands; ++OpIdx)
+ for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
+ ++Lane)
+ OpsVec[OpIdx][Lane].IsUsed = false;
+ }
+
+ /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
+ void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
+ std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
+ }
+
+ /// \param Lane lane of the operands under analysis.
+    /// \param OpIdx operand index in lane \p Lane for which we're looking for
+    /// the best candidate.
+ /// \param Idx operand index of the current candidate value.
+ /// \returns The additional score due to possible broadcasting of the
+    /// elements in the lane. It is more profitable to have a power-of-2 number
+    /// of unique elements in the lane, as it will be vectorized with higher
+    /// probability after removing duplicates. Currently the SLP vectorizer
+    /// supports only vectorization of a power-of-2 number of unique scalars.
+ int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
+ Value *IdxLaneV = getData(Idx, Lane).V;
+ if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
+ return 0;
+ SmallPtrSet<Value *, 4> Uniques;
+ for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
+ if (Ln == Lane)
+ continue;
+ Value *OpIdxLnV = getData(OpIdx, Ln).V;
+ if (!isa<Instruction>(OpIdxLnV))
+ return 0;
+ Uniques.insert(OpIdxLnV);
+ }
+ int UniquesCount = Uniques.size();
+ int UniquesCntWithIdxLaneV =
+ Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
+ Value *OpIdxLaneV = getData(OpIdx, Lane).V;
+ int UniquesCntWithOpIdxLaneV =
+ Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
+ if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
+ return 0;
+ return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
+ UniquesCntWithOpIdxLaneV) -
+ (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
+ }
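// [Editor's sketch, not part of this patch] The arithmetic above in isolation:
// NextPow2 stands in for llvm::PowerOf2Ceil, and the result is the padding the
// lane would need if the current operand is kept minus the padding it would
// need with the candidate, so a positive value rewards candidates that move
// the lane closer to a power-of-2 number of unique scalars.
static int toySplatScore(int UniquesWithCandidate, int UniquesWithCurrent) {
  auto NextPow2 = [](int N) { // smallest power of 2 >= N
    int P = 1;
    while (P < N)
      P <<= 1;
    return P;
  };
  return (NextPow2(UniquesWithCurrent) - UniquesWithCurrent) -
         (NextPow2(UniquesWithCandidate) - UniquesWithCandidate);
}
// Example: toySplatScore(/*UniquesWithCandidate=*/3, /*UniquesWithCurrent=*/4)
// == (4 - 4) - (4 - 3) == -1: the candidate would leave 3 unique scalars
// (needing padding to 4), while the current operand already yields a
// power-of-2 count, so the candidate is penalized.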
+
+ /// \param Lane lane of the operands under analysis.
+    /// \param OpIdx operand index in lane \p Lane for which we're looking for
+    /// the best candidate.
+ /// \param Idx operand index of the current candidate value.
+    /// \returns The additional score for the scalar whose users are all
+    /// vectorized.
+ int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
+ Value *IdxLaneV = getData(Idx, Lane).V;
+ Value *OpIdxLaneV = getData(OpIdx, Lane).V;
+ // Do not care about number of uses for vector-like instructions
+ // (extractelement/extractvalue with constant indices), they are extracts
+ // themselves and already externally used. Vectorization of such
+ // instructions does not add extra extractelement instruction, just may
+ // remove it.
+ if (isVectorLikeInstWithConstOps(IdxLaneV) &&
+ isVectorLikeInstWithConstOps(OpIdxLaneV))
+ return LookAheadHeuristics::ScoreAllUserVectorized;
+ auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
+ if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
+ return 0;
+ return R.areAllUsersVectorized(IdxLaneI, None)
+ ? LookAheadHeuristics::ScoreAllUserVectorized
+ : 0;
+ }
+
+    /// Score scaling factor for fully compatible instructions that differ in
+    /// the number of external uses. Allows better selection of the
+    /// instructions with fewer external uses.
+ static const int ScoreScaleFactor = 10;
/// \Returns the look-ahead score, which tells us how much the sub-trees
/// rooted at \p LHS and \p RHS match, the more they match the higher the
/// score. This helps break ties in an informed way when we cannot decide on
/// the order of the operands by just considering the immediate
/// predecessors.
- int getLookAheadScore(const std::pair<Value *, int> &LHS,
- const std::pair<Value *, int> &RHS) {
- InLookAheadValues.clear();
- return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth);
+ int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
+ int Lane, unsigned OpIdx, unsigned Idx,
+ bool &IsUsed) {
+ LookAheadHeuristics LookAhead(DL, SE, R, getNumLanes(),
+ LookAheadMaxDepth);
+ // Keep track of the instruction stack as we recurse into the operands
+ // during the look-ahead score exploration.
+ int Score =
+ LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
+ /*CurrLevel=*/1, MainAltOps);
+ if (Score) {
+ int SplatScore = getSplatScore(Lane, OpIdx, Idx);
+ if (Score <= -SplatScore) {
+ // Set the minimum score for splat-like sequence to avoid setting
+ // failed state.
+ Score = 1;
+ } else {
+ Score += SplatScore;
+          // Scale the score to distinguish between different operands and
+          // operands that are similar but differ in whether all of their
+          // uses are vectorized. It does not affect the actual selection of
+          // the best compatible operand in general, it just allows selecting
+          // the operand with all vectorized uses.
+ Score *= ScoreScaleFactor;
+ Score += getExternalUseScore(Lane, OpIdx, Idx);
+ IsUsed = true;
+ }
+ }
+ return Score;
}
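// [Editor's sketch, not part of this patch] Why the scaling above only breaks
// ties: the external-use bonus added afterwards is at most
// ScoreAllUserVectorized (1), so once the look-ahead score is multiplied by
// ScoreScaleFactor (10) the bonus can reorder two candidates only when their
// unscaled scores are equal.
static_assert(3 * 10 + 1 > 2 * 10 + 1, "a higher raw score still wins");
static_assert(2 * 10 + 1 > 2 * 10 + 0, "the bonus only breaks exact ties");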
+ /// Best defined scores per lanes between the passes. Used to choose the
+ /// best operand (with the highest score) between the passes.
+ /// The key - {Operand Index, Lane}.
+ /// The value - the best score between the passes for the lane and the
+ /// operand.
+ SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
+ BestScoresPerLanes;
+
// Search all operands in Ops[*][Lane] for the one that matches best
    // Ops[OpIdx][LastLane] and return its operand index.
// If no good match can be found, return None.
- Optional<unsigned>
- getBestOperand(unsigned OpIdx, int Lane, int LastLane,
- ArrayRef<ReorderingMode> ReorderingModes) {
+ Optional<unsigned> getBestOperand(unsigned OpIdx, int Lane, int LastLane,
+ ArrayRef<ReorderingMode> ReorderingModes,
+ ArrayRef<Value *> MainAltOps) {
unsigned NumOperands = getNumOperands();
// The operand of the previous lane at OpIdx.
@@ -1318,6 +1531,8 @@ public:
// Our strategy mode for OpIdx.
ReorderingMode RMode = ReorderingModes[OpIdx];
+ if (RMode == ReorderingMode::Failed)
+ return None;
// The linearized opcode of the operand at OpIdx, Lane.
bool OpIdxAPO = getData(OpIdx, Lane).APO;
@@ -1329,7 +1544,15 @@ public:
Optional<unsigned> Idx = None;
unsigned Score = 0;
} BestOp;
-
+ BestOp.Score =
+ BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
+ .first->second;
+
+ // Track if the operand must be marked as used. If the operand is set to
+      // Score 1 explicitly (because of non-power-of-2 unique scalars), we may
+      // want to reestimate the operands on the following iterations.
+ bool IsUsed =
+ RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant;
// Iterate through all unused operands and look for the best.
for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
// Get the operand at Idx and Lane.
@@ -1355,11 +1578,12 @@ public:
bool LeftToRight = Lane > LastLane;
Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
Value *OpRight = (LeftToRight) ? Op : OpLastLane;
- unsigned Score =
- getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane});
- if (Score > BestOp.Score) {
+ int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
+ OpIdx, Idx, IsUsed);
+ if (Score > static_cast<int>(BestOp.Score)) {
BestOp.Idx = Idx;
BestOp.Score = Score;
+ BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
}
break;
}
@@ -1368,12 +1592,12 @@ public:
BestOp.Idx = Idx;
break;
case ReorderingMode::Failed:
- return None;
+ llvm_unreachable("Not expected Failed reordering mode.");
}
}
if (BestOp.Idx) {
- getData(BestOp.Idx.getValue(), Lane).IsUsed = true;
+ getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
return BestOp.Idx;
}
// If we could not find a good match return None.
@@ -1690,6 +1914,10 @@ public:
// rest of the lanes. We are visiting the nodes in a circular fashion,
// using FirstLane as the center point and increasing the radius
// distance.
+ SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
+ for (unsigned I = 0; I < NumOperands; ++I)
+ MainAltOps[I].push_back(getData(I, FirstLane).V);
+
for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
// Visit the lane on the right and then the lane on the left.
for (int Direction : {+1, -1}) {
@@ -1702,21 +1930,29 @@ public:
// Look for a good match for each operand.
for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
// Search for the operand that matches SortedOps[OpIdx][Lane-1].
- Optional<unsigned> BestIdx =
- getBestOperand(OpIdx, Lane, LastLane, ReorderingModes);
+ Optional<unsigned> BestIdx = getBestOperand(
+ OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
// By not selecting a value, we allow the operands that follow to
// select a better matching value. We will get a non-null value in
// the next run of getBestOperand().
if (BestIdx) {
// Swap the current operand with the one returned by
// getBestOperand().
- swap(OpIdx, BestIdx.getValue(), Lane);
+ swap(OpIdx, *BestIdx, Lane);
} else {
// We failed to find a best operand, set mode to 'Failed'.
ReorderingModes[OpIdx] = ReorderingMode::Failed;
// Enable the second pass.
StrategyFailed = true;
}
+ // Try to get the alternate opcode and follow it during analysis.
+ if (MainAltOps[OpIdx].size() != 2) {
+ OperandData &AltOp = getData(OpIdx, Lane);
+ InstructionsState OpS =
+ getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V});
+ if (OpS.getOpcode() && OpS.isAltShuffle())
+ MainAltOps[OpIdx].push_back(AltOp.V);
+ }
}
}
}
@@ -1780,15 +2016,109 @@ public:
#endif
};
+ /// Evaluate each pair in \p Candidates and return index into \p Candidates
+  /// for the pair with the highest score, deemed to have the best chance to
+  /// form the root of a profitable tree to vectorize. Return None if no
+  /// candidate scored above LookAheadHeuristics::ScoreFail.
+  /// \param Limit The lower limit of the score, considered a good enough score.
+ Optional<int>
+ findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
+ int Limit = LookAheadHeuristics::ScoreFail) {
+ LookAheadHeuristics LookAhead(*DL, *SE, *this, /*NumLanes=*/2,
+ RootLookAheadMaxDepth);
+ int BestScore = Limit;
+ Optional<int> Index = None;
+ for (int I : seq<int>(0, Candidates.size())) {
+ int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
+ Candidates[I].second,
+ /*U1=*/nullptr, /*U2=*/nullptr,
+ /*Level=*/1, None);
+ if (Score > BestScore) {
+ BestScore = Score;
+ Index = I;
+ }
+ }
+ return Index;
+ }
+
/// Checks if the instruction is marked for deletion.
bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
- /// Marks values operands for later deletion by replacing them with Undefs.
- void eraseInstructions(ArrayRef<Value *> AV);
+ /// Removes an instruction from its block and eventually deletes it.
+ /// It's like Instruction::eraseFromParent() except that the actual deletion
+ /// is delayed until BoUpSLP is destructed.
+ void eraseInstruction(Instruction *I) {
+ DeletedInstructions.insert(I);
+ }
+
+ /// Checks if the instruction was already analyzed for being possible
+ /// reduction root.
+ bool isAnalyzedReductionRoot(Instruction *I) const {
+ return AnalyzedReductionsRoots.count(I);
+ }
+ /// Register given instruction as already analyzed for being possible
+ /// reduction root.
+ void analyzedReductionRoot(Instruction *I) {
+ AnalyzedReductionsRoots.insert(I);
+ }
+ /// Checks if the provided list of reduced values was checked already for
+ /// vectorization.
+ bool areAnalyzedReductionVals(ArrayRef<Value *> VL) {
+ return AnalyzedReductionVals.contains(hash_value(VL));
+ }
+ /// Adds the list of reduced values to list of already checked values for the
+ /// vectorization.
+ void analyzedReductionVals(ArrayRef<Value *> VL) {
+ AnalyzedReductionVals.insert(hash_value(VL));
+ }
+ /// Clear the list of the analyzed reduction root instructions.
+ void clearReductionData() {
+ AnalyzedReductionsRoots.clear();
+ AnalyzedReductionVals.clear();
+ }
+ /// Checks if the given value is gathered in one of the nodes.
+ bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
+ return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
+ }
~BoUpSLP();
private:
+  /// Check if the operands on the edges \p Edges of the \p UserTE allow
+  /// reordering (i.e. the operands can be reordered because they have only one
+  /// user and are reorderable).
+  /// \param ReorderableGathers List of all gather nodes that require reordering
+  /// (e.g., gather of extractelements or partially vectorizable loads).
+ /// \param GatherOps List of gather operand nodes for \p UserTE that require
+ /// reordering, subset of \p NonVectorized.
+ bool
+ canReorderOperands(TreeEntry *UserTE,
+ SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
+ ArrayRef<TreeEntry *> ReorderableGathers,
+ SmallVectorImpl<TreeEntry *> &GatherOps);
+
+ /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
+ /// if any. If it is not vectorized (gather node), returns nullptr.
+ TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
+ ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
+ TreeEntry *TE = nullptr;
+ const auto *It = find_if(VL, [this, &TE](Value *V) {
+ TE = getTreeEntry(V);
+ return TE;
+ });
+ if (It != VL.end() && TE->isSame(VL))
+ return TE;
+ return nullptr;
+ }
+
+ /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
+ /// if any. If it is not vectorized (gather node), returns nullptr.
+ const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
+ unsigned OpIdx) const {
+ return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
+ const_cast<TreeEntry *>(UserTE), OpIdx);
+ }
+
/// Checks if all users of \p I are the part of the vectorization tree.
bool areAllUsersVectorized(Instruction *I,
ArrayRef<Value *> VectorizedVals) const;
@@ -1815,12 +2145,17 @@ private:
/// Vectorize a single entry in the tree, starting in \p VL.
Value *vectorizeTree(ArrayRef<Value *> VL);
+ /// Create a new vector from a list of scalar values. Produces a sequence
+ /// which exploits values reused across lanes, and arranges the inserts
+ /// for ease of later optimization.
+ Value *createBuildVector(ArrayRef<Value *> VL);
+
/// \returns the scalarization cost for this type. Scalarization in this
/// context means the creation of vectors from a group of scalars. If \p
/// NeedToShuffle is true, need to add a cost of reshuffling some of the
/// vector elements.
InstructionCost getGatherCost(FixedVectorType *Ty,
- const DenseSet<unsigned> &ShuffledIndices,
+ const APInt &ShuffledIndices,
bool NeedToShuffle) const;
/// Checks if the gathered \p VL can be represented as shuffle(s) of previous
@@ -1855,6 +2190,29 @@ private:
const DataLayout &DL,
ScalarEvolution &SE,
const BoUpSLP &R);
+
+ /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
+ /// users of \p TE and collects the stores. It returns the map from the store
+ /// pointers to the collected stores.
+ DenseMap<Value *, SmallVector<StoreInst *, 4>>
+ collectUserStores(const BoUpSLP::TreeEntry *TE) const;
+
+ /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
+  /// stores in \p StoresVec can form a vector instruction. If so it returns
+  /// true and populates \p ReorderIndices with the shuffle indices of the
+  /// stores when compared to the sorted vector.
+ bool CanFormVector(const SmallVector<StoreInst *, 4> &StoresVec,
+ OrdersType &ReorderIndices) const;
+
+ /// Iterates through the users of \p TE, looking for scalar stores that can be
+ /// potentially vectorized in a future SLP-tree. If found, it keeps track of
+ /// their order and builds an order index vector for each store bundle. It
+ /// returns all these order vectors found.
+ /// We run this after the tree has formed, otherwise we may come across user
+ /// instructions that are not yet in the tree.
+ SmallVector<OrdersType, 1>
+ findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
+
struct TreeEntry {
using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
TreeEntry(VecTreeTy &Container) : Container(Container) {}
@@ -2199,15 +2557,21 @@ private:
ScalarToTreeEntry[V] = Last;
}
// Update the scheduler bundle to point to this TreeEntry.
- unsigned Lane = 0;
- for (ScheduleData *BundleMember = Bundle.getValue(); BundleMember;
- BundleMember = BundleMember->NextInBundle) {
- BundleMember->TE = Last;
- BundleMember->Lane = Lane;
- ++Lane;
- }
- assert((!Bundle.getValue() || Lane == VL.size()) &&
+ ScheduleData *BundleMember = *Bundle;
+ assert((BundleMember || isa<PHINode>(S.MainOp) ||
+ isVectorLikeInstWithConstOps(S.MainOp) ||
+ doesNotNeedToSchedule(VL)) &&
"Bundle and VL out of sync");
+ if (BundleMember) {
+ for (Value *V : VL) {
+ if (doesNotNeedToBeScheduled(V))
+ continue;
+ assert(BundleMember && "Unexpected end of bundle.");
+ BundleMember->TE = Last;
+ BundleMember = BundleMember->NextInBundle;
+ }
+ }
+ assert(!BundleMember && "Bundle and VL out of sync");
} else {
MustGather.insert(VL.begin(), VL.end());
}
@@ -2241,7 +2605,7 @@ private:
/// Maps a specific scalar to its tree entry.
SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry;
- /// Maps a value to the proposed vectorizable size.
+ /// Maps a value to the proposed vectorizable size.
SmallDenseMap<Value *, unsigned> InstrElementSize;
/// A list of scalars that we found that we need to keep as scalars.
@@ -2272,12 +2636,12 @@ private:
// First check if the result is already in the cache.
AliasCacheKey key = std::make_pair(Inst1, Inst2);
Optional<bool> &result = AliasCache[key];
- if (result.hasValue()) {
+ if (result) {
return result.getValue();
}
bool aliased = true;
if (Loc1.Ptr && isSimple(Inst1))
- aliased = isModOrRefSet(AA->getModRefInfo(Inst2, Loc1));
+ aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
// Store the result in the cache.
result = aliased;
return aliased;
@@ -2289,20 +2653,23 @@ private:
/// TODO: consider moving this to the AliasAnalysis itself.
DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
- /// Removes an instruction from its block and eventually deletes it.
- /// It's like Instruction::eraseFromParent() except that the actual deletion
- /// is delayed until BoUpSLP is destructed.
- /// This is required to ensure that there are no incorrect collisions in the
- /// AliasCache, which can happen if a new instruction is allocated at the
- /// same address as a previously deleted instruction.
- void eraseInstruction(Instruction *I, bool ReplaceOpsWithUndef = false) {
- auto It = DeletedInstructions.try_emplace(I, ReplaceOpsWithUndef).first;
- It->getSecond() = It->getSecond() && ReplaceOpsWithUndef;
- }
+ // Cache for pointerMayBeCaptured calls inside AA. This is preserved
+ // globally through SLP because we don't perform any action which
+ // invalidates capture results.
+ BatchAAResults BatchAA;
/// Temporary store for deleted instructions. Instructions will be deleted
- /// eventually when the BoUpSLP is destructed.
- DenseMap<Instruction *, bool> DeletedInstructions;
+ /// eventually when the BoUpSLP is destructed. The deferral is required to
+ /// ensure that there are no incorrect collisions in the AliasCache, which
+ /// can happen if a new instruction is allocated at the same address as a
+ /// previously deleted instruction.
+ DenseSet<Instruction *> DeletedInstructions;
+
+  /// Set of instructions already analyzed for being possible reduction roots.
+ SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
+
+ /// Set of hashes for the list of reduction values already being analyzed.
+ DenseSet<size_t> AnalyzedReductionVals;
/// A list of values that need to extracted out of the tree.
/// This list holds pairs of (Internal Scalar : External User). External User
@@ -2336,14 +2703,39 @@ private:
NextLoadStore = nullptr;
IsScheduled = false;
SchedulingRegionID = BlockSchedulingRegionID;
- UnscheduledDepsInBundle = UnscheduledDeps;
clearDependencies();
OpValue = OpVal;
TE = nullptr;
- Lane = -1;
+ }
+
+ /// Verify basic self consistency properties
+ void verify() {
+ if (hasValidDependencies()) {
+ assert(UnscheduledDeps <= Dependencies && "invariant");
+ } else {
+ assert(UnscheduledDeps == Dependencies && "invariant");
+ }
+
+ if (IsScheduled) {
+ assert(isSchedulingEntity() &&
+ "unexpected scheduled state");
+ for (const ScheduleData *BundleMember = this; BundleMember;
+ BundleMember = BundleMember->NextInBundle) {
+ assert(BundleMember->hasValidDependencies() &&
+ BundleMember->UnscheduledDeps == 0 &&
+ "unexpected scheduled state");
+ assert((BundleMember == this || !BundleMember->IsScheduled) &&
+ "only bundle is marked scheduled");
+ }
+ }
+
+ assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
+ "all bundle members must be in same basic block");
}
/// Returns true if the dependency information has been calculated.
+    /// Note that dependency validity can vary between instructions within
+ /// a single bundle.
bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
/// Returns true for single instructions and for bundle representatives
@@ -2353,7 +2745,7 @@ private:
/// Returns true if it represents an instruction bundle and not only a
/// single instruction.
bool isPartOfBundle() const {
- return NextInBundle != nullptr || FirstInBundle != this;
+ return NextInBundle != nullptr || FirstInBundle != this || TE;
}
/// Returns true if it is ready for scheduling, i.e. it has no more
@@ -2361,20 +2753,23 @@ private:
bool isReady() const {
assert(isSchedulingEntity() &&
"can't consider non-scheduling entity for ready list");
- return UnscheduledDepsInBundle == 0 && !IsScheduled;
+ return unscheduledDepsInBundle() == 0 && !IsScheduled;
}
- /// Modifies the number of unscheduled dependencies, also updating it for
- /// the whole bundle.
+ /// Modifies the number of unscheduled dependencies for this instruction,
+ /// and returns the number of remaining dependencies for the containing
+ /// bundle.
int incrementUnscheduledDeps(int Incr) {
+ assert(hasValidDependencies() &&
+ "increment of unscheduled deps would be meaningless");
UnscheduledDeps += Incr;
- return FirstInBundle->UnscheduledDepsInBundle += Incr;
+ return FirstInBundle->unscheduledDepsInBundle();
}
/// Sets the number of unscheduled dependencies to the number of
/// dependencies.
void resetUnscheduledDeps() {
- incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
+ UnscheduledDeps = Dependencies;
}
/// Clears all dependency information.
@@ -2382,6 +2777,19 @@ private:
Dependencies = InvalidDeps;
resetUnscheduledDeps();
MemoryDependencies.clear();
+ ControlDependencies.clear();
+ }
+
+ int unscheduledDepsInBundle() const {
+ assert(isSchedulingEntity() && "only meaningful on the bundle");
+ int Sum = 0;
+ for (const ScheduleData *BundleMember = this; BundleMember;
+ BundleMember = BundleMember->NextInBundle) {
+ if (BundleMember->UnscheduledDeps == InvalidDeps)
+ return InvalidDeps;
+ Sum += BundleMember->UnscheduledDeps;
+ }
+ return Sum;
}
void dump(raw_ostream &os) const {
@@ -2402,6 +2810,12 @@ private:
Instruction *Inst = nullptr;
+ /// Opcode of the current instruction in the schedule data.
+ Value *OpValue = nullptr;
+
+ /// The TreeEntry that this instruction corresponds to.
+ TreeEntry *TE = nullptr;
+
/// Points to the head in an instruction bundle (and always to this for
/// single instructions).
ScheduleData *FirstInBundle = nullptr;
@@ -2418,6 +2832,12 @@ private:
/// This list is derived on demand in calculateDependencies().
SmallVector<ScheduleData *, 4> MemoryDependencies;
+ /// List of instructions which this instruction could be control dependent
+ /// on. Allowing such nodes to be scheduled below this one could introduce
+ /// a runtime fault which didn't exist in the original program.
+ /// e.g., a load or udiv following a readonly call which loops infinitely.
+ SmallVector<ScheduleData *, 4> ControlDependencies;
+
/// This ScheduleData is in the current scheduling region if this matches
/// the current SchedulingRegionID of BlockScheduling.
int SchedulingRegionID = 0;
@@ -2437,22 +2857,9 @@ private:
/// Note that this is negative as long as Dependencies is not calculated.
int UnscheduledDeps = InvalidDeps;
- /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
- /// single instructions.
- int UnscheduledDepsInBundle = InvalidDeps;
-
/// True if this instruction is scheduled (or considered as scheduled in the
/// dry-run).
bool IsScheduled = false;
-
- /// Opcode of the current instruction in the schedule data.
- Value *OpValue = nullptr;
-
- /// The TreeEntry that this instruction corresponds to.
- TreeEntry *TE = nullptr;
-
- /// The lane of this node in the TreeEntry.
- int Lane = -1;
};
#ifndef NDEBUG
@@ -2467,6 +2874,21 @@ private:
friend struct DOTGraphTraits<BoUpSLP *>;
/// Contains all scheduling data for a basic block.
+ /// It does not schedule instructions which are not memory read/write
+ /// instructions and whose operands are either constants, arguments, phis,
+ /// or instructions from other blocks, or whose users are phis or belong to
+ /// other blocks. The resulting vector instructions can be placed at the
+ /// beginning of the basic block without scheduling (if their operands do
+ /// not need to be scheduled) or at the end of the block (if their users are
+ /// outside of the block). This saves some compile time and memory used by
+ /// the compiler.
+ /// ScheduleData is assigned to each instruction between the boundaries of
+ /// the tree entry, even to those which are not part of the graph. It is
+ /// required to correctly follow the dependencies between the instructions
+ /// and to schedule them correctly. ScheduleData is not allocated for
+ /// instructions which do not require scheduling, like phis, nodes with only
+ /// extractelements/insertelements, or nodes whose instructions have
+ /// uses/operands outside of the block.
struct BlockScheduling {
BlockScheduling(BasicBlock *BB)
: BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
@@ -2477,6 +2899,7 @@ private:
ScheduleEnd = nullptr;
FirstLoadStoreInRegion = nullptr;
LastLoadStoreInRegion = nullptr;
+ RegionHasStackSave = false;
// Reduce the maximum schedule region size by the size of the
// previous scheduling run.
@@ -2490,20 +2913,29 @@ private:
++SchedulingRegionID;
}
- ScheduleData *getScheduleData(Value *V) {
- ScheduleData *SD = ScheduleDataMap[V];
- if (SD && SD->SchedulingRegionID == SchedulingRegionID)
+ ScheduleData *getScheduleData(Instruction *I) {
+ if (BB != I->getParent())
+ // Avoid the lookup if the instruction can't possibly be in the map.
+ return nullptr;
+ ScheduleData *SD = ScheduleDataMap.lookup(I);
+ if (SD && isInSchedulingRegion(SD))
return SD;
return nullptr;
}
+ ScheduleData *getScheduleData(Value *V) {
+ if (auto *I = dyn_cast<Instruction>(V))
+ return getScheduleData(I);
+ return nullptr;
+ }
+
ScheduleData *getScheduleData(Value *V, Value *Key) {
if (V == Key)
return getScheduleData(V);
auto I = ExtraScheduleDataMap.find(V);
if (I != ExtraScheduleDataMap.end()) {
- ScheduleData *SD = I->second[Key];
- if (SD && SD->SchedulingRegionID == SchedulingRegionID)
+ ScheduleData *SD = I->second.lookup(Key);
+ if (SD && isInSchedulingRegion(SD))
return SD;
}
return nullptr;
@@ -2524,7 +2956,7 @@ private:
BundleMember = BundleMember->NextInBundle) {
if (BundleMember->Inst != BundleMember->OpValue)
continue;
-
+
// Handle the def-use chain dependencies.
// Decrement the unscheduled counter and insert to ready list if ready.
@@ -2546,10 +2978,12 @@ private:
};
// If BundleMember is a vector bundle, its operands may have been
- // reordered duiring buildTree(). We therefore need to get its operands
+ // reordered during buildTree(). We therefore need to get its operands
// through the TreeEntry.
if (TreeEntry *TE = BundleMember->TE) {
- int Lane = BundleMember->Lane;
+ // Need to search for the lane since the tree entry can be reordered.
+ int Lane = std::distance(TE->Scalars.begin(),
+ find(TE->Scalars, BundleMember->Inst));
assert(Lane >= 0 && "Lane not set");
// Since vectorization tree is being built recursively this assertion
@@ -2558,7 +2992,7 @@ private:
// where their second (immediate) operand is not added. Since
// immediates do not affect scheduler behavior this is considered
// okay.
- auto *In = TE->getMainOp();
+ auto *In = BundleMember->Inst;
assert(In &&
(isa<ExtractValueInst>(In) || isa<ExtractElementInst>(In) ||
In->getNumOperands() == TE->getNumOperands()) &&
@@ -2578,7 +3012,8 @@ private:
}
// Handle the memory dependencies.
for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
- if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
+ if (MemoryDepSD->hasValidDependencies() &&
+ MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
// There are no more unscheduled dependencies after decrementing,
// so we can put the dependent instruction into the ready list.
ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
@@ -2589,6 +3024,48 @@ private:
<< "SLP: gets ready (mem): " << *DepBundle << "\n");
}
}
+ // Handle the control dependencies.
+ for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
+ if (DepSD->incrementUnscheduledDeps(-1) == 0) {
+ // There are no more unscheduled dependencies after decrementing,
+ // so we can put the dependent instruction into the ready list.
+ ScheduleData *DepBundle = DepSD->FirstInBundle;
+ assert(!DepBundle->IsScheduled &&
+ "already scheduled bundle gets ready");
+ ReadyList.insert(DepBundle);
+ LLVM_DEBUG(dbgs()
+ << "SLP: gets ready (ctl): " << *DepBundle << "\n");
+ }
+ }
+
+ }
+ }
+
+ /// Verify basic self-consistency properties of the data structure.
+ void verify() {
+ if (!ScheduleStart)
+ return;
+
+ assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
+ ScheduleStart->comesBefore(ScheduleEnd) &&
+ "Not a valid scheduling region?");
+
+ for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
+ auto *SD = getScheduleData(I);
+ if (!SD)
+ continue;
+ assert(isInSchedulingRegion(SD) &&
+ "primary schedule data not in window?");
+ assert(isInSchedulingRegion(SD->FirstInBundle) &&
+ "entire bundle in window!");
+ (void)SD;
+ doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
+ }
+
+ for (auto *SD : ReadyInsts) {
+ assert(SD->isSchedulingEntity() && SD->isReady() &&
+ "item in ready list not ready?");
+ (void)SD;
}
}
@@ -2599,7 +3076,7 @@ private:
auto I = ExtraScheduleDataMap.find(V);
if (I != ExtraScheduleDataMap.end())
for (auto &P : I->second)
- if (P.second->SchedulingRegionID == SchedulingRegionID)
+ if (isInSchedulingRegion(P.second))
Action(P.second);
}
@@ -2608,10 +3085,11 @@ private:
void initialFillReadyList(ReadyListType &ReadyList) {
for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
doForAllOpcodes(I, [&](ScheduleData *SD) {
- if (SD->isSchedulingEntity() && SD->isReady()) {
+ if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
+ SD->isReady()) {
ReadyList.insert(SD);
LLVM_DEBUG(dbgs()
- << "SLP: initially in ready list: " << *I << "\n");
+ << "SLP: initially in ready list: " << *SD << "\n");
}
});
}
@@ -2669,18 +3147,14 @@ private:
/// Attaches ScheduleData to Instruction.
/// Note that the mapping survives during all vectorization iterations, i.e.
/// ScheduleData structures are recycled.
- DenseMap<Value *, ScheduleData *> ScheduleDataMap;
+ DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
/// Attaches ScheduleData to Instruction with the leading key.
DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
ExtraScheduleDataMap;
- struct ReadyList : SmallVector<ScheduleData *, 8> {
- void insert(ScheduleData *SD) { push_back(SD); }
- };
-
/// The ready-list for scheduling (only used for the dry-run).
- ReadyList ReadyInsts;
+ SetVector<ScheduleData *> ReadyInsts;
/// The first instruction of the scheduling region.
Instruction *ScheduleStart = nullptr;
@@ -2696,6 +3170,11 @@ private:
/// (can be null).
ScheduleData *LastLoadStoreInRegion = nullptr;
+ /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
+ /// region? Used to optimize the dependence calculation for the
+ /// common case where there isn't.
+ bool RegionHasStackSave = false;
+
/// The current size of the scheduling region.
int ScheduleRegionSize = 0;
@@ -2704,8 +3183,8 @@ private:
/// The ID of the scheduling region. For a new vectorization iteration this
/// is incremented which "removes" all ScheduleData from the region.
- // Make sure that the initial SchedulingRegionID is greater than the
- // initial SchedulingRegionID in ScheduleData (which is 0).
+ /// Make sure that the initial SchedulingRegionID is greater than the
+ /// initial SchedulingRegionID in ScheduleData (which is 0).
int SchedulingRegionID = 1;
};
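
Switching ReadyInsts from a plain vector to a SetVector means a bundle that becomes ready through more than one dependency kind (def-use, memory, control) is queued only once, in discovery order. A rough sketch of that insertion-ordered-set behaviour with standard containers; the OrderedSet name and implementation are stand-ins, not llvm::SetVector:

#include <cassert>
#include <unordered_set>
#include <vector>

// Insertion-ordered set: keeps the first insertion position of each element
// and ignores later duplicates, similar in spirit to llvm::SetVector.
template <typename T> class OrderedSet {
  std::vector<T> Order;
  std::unordered_set<T> Seen;

public:
  bool insert(const T &V) {
    if (!Seen.insert(V).second)
      return false; // already present, keep original position
    Order.push_back(V);
    return true;
  }
  const std::vector<T> &values() const { return Order; }
};

int main() {
  OrderedSet<int> Ready;
  Ready.insert(1); // becomes ready via a def-use dependency
  Ready.insert(2);
  Ready.insert(1); // becomes ready again via a memory dependency: ignored
  assert(Ready.values().size() == 2);
  return 0;
}
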
@@ -2717,7 +3196,7 @@ private:
void scheduleBlock(BlockScheduling *BS);
/// List of users to ignore during scheduling and that don't need extracting.
- ArrayRef<Value *> UserIgnoreList;
+ const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
/// sorted SmallVectors of unsigned.
@@ -2748,7 +3227,6 @@ private:
ScalarEvolution *SE;
TargetTransformInfo *TTI;
TargetLibraryInfo *TLI;
- AAResults *AA;
LoopInfo *LI;
DominatorTree *DT;
AssumptionCache *AC;
@@ -2865,20 +3343,25 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
} // end namespace llvm
BoUpSLP::~BoUpSLP() {
- for (const auto &Pair : DeletedInstructions) {
- // Replace operands of ignored instructions with Undefs in case if they were
- // marked for deletion.
- if (Pair.getSecond()) {
- Value *Undef = UndefValue::get(Pair.getFirst()->getType());
- Pair.getFirst()->replaceAllUsesWith(Undef);
- }
- Pair.getFirst()->dropAllReferences();
- }
- for (const auto &Pair : DeletedInstructions) {
- assert(Pair.getFirst()->use_empty() &&
+ SmallVector<WeakTrackingVH> DeadInsts;
+ for (auto *I : DeletedInstructions) {
+ for (Use &U : I->operands()) {
+ auto *Op = dyn_cast<Instruction>(U.get());
+ if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
+ wouldInstructionBeTriviallyDead(Op, TLI))
+ DeadInsts.emplace_back(Op);
+ }
+ I->dropAllReferences();
+ }
+ for (auto *I : DeletedInstructions) {
+ assert(I->use_empty() &&
"trying to erase instruction with users.");
- Pair.getFirst()->eraseFromParent();
+ I->eraseFromParent();
}
+
+ // Clean up any dead scalar code feeding the vectorized instructions.
+ RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
+
#ifdef EXPENSIVE_CHECKS
// If we could guarantee that this call is not extremely slow, we could
// remove the ifdef limitation (see PR47712).
@@ -2886,13 +3369,6 @@ BoUpSLP::~BoUpSLP() {
#endif
}
-void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) {
- for (auto *V : AV) {
- if (auto *I = dyn_cast<Instruction>(V))
- eraseInstruction(I, /*ReplaceOpsWithUndef=*/true);
- };
-}
-
/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
/// contains the original mask for the scalars reused in the node. The
/// procedure transforms this mask in accordance with the given \p Mask.
@@ -2997,6 +3473,189 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
return None;
}
+namespace {
+/// Tracks the state we can represent the loads in the given sequence.
+enum class LoadsState { Gather, Vectorize, ScatterVectorize };
+} // anonymous namespace
+
+/// Checks if the given array of loads can be represented as a vectorized,
+/// scatter or just simple gather.
+static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
+ const TargetTransformInfo &TTI,
+ const DataLayout &DL, ScalarEvolution &SE,
+ LoopInfo &LI,
+ SmallVectorImpl<unsigned> &Order,
+ SmallVectorImpl<Value *> &PointerOps) {
+ // Check that a vectorized load would load the same memory as a scalar
+ // load. For example, we don't want to vectorize loads that are smaller
+ // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
+ // treats loading/storing it as an i8 struct. If we vectorize loads/stores
+ // from such a struct, we read/write packed bits disagreeing with the
+ // unvectorized version.
+ Type *ScalarTy = VL0->getType();
+
+ if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy))
+ return LoadsState::Gather;
+
+ // Make sure all loads in the bundle are simple - we can't vectorize
+ // atomic or volatile loads.
+ PointerOps.clear();
+ PointerOps.resize(VL.size());
+ auto *POIter = PointerOps.begin();
+ for (Value *V : VL) {
+ auto *L = cast<LoadInst>(V);
+ if (!L->isSimple())
+ return LoadsState::Gather;
+ *POIter = L->getPointerOperand();
+ ++POIter;
+ }
+
+ Order.clear();
+ // Check the order of pointer operands or that all pointers are the same.
+ bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order);
+ if (IsSorted || all_of(PointerOps, [&PointerOps](Value *P) {
+ if (getUnderlyingObject(P) != getUnderlyingObject(PointerOps.front()))
+ return false;
+ auto *GEP = dyn_cast<GetElementPtrInst>(P);
+ if (!GEP)
+ return false;
+ auto *GEP0 = cast<GetElementPtrInst>(PointerOps.front());
+ return GEP->getNumOperands() == 2 &&
+ ((isConstant(GEP->getOperand(1)) &&
+ isConstant(GEP0->getOperand(1))) ||
+ getSameOpcode({GEP->getOperand(1), GEP0->getOperand(1)})
+ .getOpcode());
+ })) {
+ if (IsSorted) {
+ Value *Ptr0;
+ Value *PtrN;
+ if (Order.empty()) {
+ Ptr0 = PointerOps.front();
+ PtrN = PointerOps.back();
+ } else {
+ Ptr0 = PointerOps[Order.front()];
+ PtrN = PointerOps[Order.back()];
+ }
+ Optional<int> Diff =
+ getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
+ // Check that the sorted loads are consecutive.
+ if (static_cast<unsigned>(*Diff) == VL.size() - 1)
+ return LoadsState::Vectorize;
+ }
+ // TODO: need to improve analysis of the pointers; if not all of them are
+ // GEPs or they have > 2 operands, we end up with a gather node, which
+ // just increases the cost.
+ Loop *L = LI.getLoopFor(cast<LoadInst>(VL0)->getParent());
+ bool ProfitableGatherPointers =
+ static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
+ return L && L->isLoopInvariant(V);
+ })) <= VL.size() / 2 && VL.size() > 2;
+ if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
+ auto *GEP = dyn_cast<GetElementPtrInst>(P);
+ return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
+ (GEP && GEP->getNumOperands() == 2);
+ })) {
+ Align CommonAlignment = cast<LoadInst>(VL0)->getAlign();
+ for (Value *V : VL)
+ CommonAlignment =
+ std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
+ auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
+ if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) &&
+ !TTI.forceScalarizeMaskedGather(VecTy, CommonAlignment))
+ return LoadsState::ScatterVectorize;
+ }
+ }
+
+ return LoadsState::Gather;
+}
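
In effect the classification above is: consecutive pointers give Vectorize, pointers that are analyzable but not consecutive can give ScatterVectorize when the target supports a masked gather, and everything else falls back to Gather. The sketch below models that decision over plain integer offsets; the GatherLegal flag stands in for the TTI legality query and none of the names come from the real code:

#include <algorithm>
#include <cassert>
#include <vector>

enum class LoadsKind { Gather, Vectorize, ScatterVectorize };

// Classify a bundle of loads by their element offsets from a common base.
// Consecutive offsets -> Vectorize; otherwise, if a masked gather is legal
// (stand-in for the TTI query) -> ScatterVectorize; else -> Gather.
LoadsKind classifyLoads(std::vector<int> Offsets, bool GatherLegal) {
  if (Offsets.size() < 2)
    return LoadsKind::Gather;
  std::sort(Offsets.begin(), Offsets.end());
  bool Consecutive = true;
  for (size_t I = 1; I < Offsets.size(); ++I)
    Consecutive &= Offsets[I] == Offsets[I - 1] + 1;
  if (Consecutive)
    return LoadsKind::Vectorize;
  return GatherLegal ? LoadsKind::ScatterVectorize : LoadsKind::Gather;
}

int main() {
  assert(classifyLoads({3, 1, 2, 0}, false) == LoadsKind::Vectorize);
  assert(classifyLoads({0, 2, 4, 6}, true) == LoadsKind::ScatterVectorize);
  assert(classifyLoads({0, 2, 4, 6}, false) == LoadsKind::Gather);
  return 0;
}
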
+
+bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
+ const DataLayout &DL, ScalarEvolution &SE,
+ SmallVectorImpl<unsigned> &SortedIndices) {
+ assert(llvm::all_of(
+ VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
+ "Expected list of pointer operands.");
+ // Map from bases to a vector of (Ptr, Offset, OrigIdx). We insert each Ptr
+ // into the vector of its base, sort each vector by Offset, and return the
+ // sorted indices so that related values end up next to one another.
+ MapVector<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases;
+ Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
+
+ unsigned Cnt = 1;
+ for (Value *Ptr : VL.drop_front()) {
+ bool Found = any_of(Bases, [&](auto &Base) {
+ Optional<int> Diff =
+ getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
+ /*StrictCheck=*/true);
+ if (!Diff)
+ return false;
+
+ Base.second.emplace_back(Ptr, *Diff, Cnt++);
+ return true;
+ });
+
+ if (!Found) {
+ // If we haven't found enough to usefully cluster, return early.
+ if (Bases.size() > VL.size() / 2 - 1)
+ return false;
+
+ // Not found already - add a new Base
+ Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
+ }
+ }
+
+ // For each of the bases, sort the pointers by Offset and check if any of
+ // the bases become consecutively allocated.
+ bool AnyConsecutive = false;
+ for (auto &Base : Bases) {
+ auto &Vec = Base.second;
+ if (Vec.size() > 1) {
+ llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
+ const std::tuple<Value *, int, unsigned> &Y) {
+ return std::get<1>(X) < std::get<1>(Y);
+ });
+ int InitialOffset = std::get<1>(Vec[0]);
+ AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](auto &P) {
+ return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
+ });
+ }
+ }
+
+ // Fill the SortedIndices array only if it looks worthwhile to sort the ptrs.
+ SortedIndices.clear();
+ if (!AnyConsecutive)
+ return false;
+
+ for (auto &Base : Bases) {
+ for (auto &T : Base.second)
+ SortedIndices.push_back(std::get<2>(T));
+ }
+
+ assert(SortedIndices.size() == VL.size() &&
+ "Expected SortedIndices to be the size of VL");
+ return true;
+}
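
The clustering above can be pictured as: bucket the pointers by base object, sort each bucket by offset, and emit the original indices bucket by bucket so related accesses become adjacent. A simplified model working on (base, offset) pairs instead of IR pointers; Access and clusterSort are illustrative names only:

#include <algorithm>
#include <cassert>
#include <map>
#include <utility>
#include <vector>

struct Access {
  int Base;   // stand-in for the underlying pointer object
  int Offset; // element offset from that base
};

// Returns the original indices reordered so that accesses sharing a base are
// grouped together and sorted by offset within each group.
std::vector<unsigned> clusterSort(const std::vector<Access> &VL) {
  std::map<int, std::vector<std::pair<int, unsigned>>> Bases;
  for (unsigned I = 0; I < VL.size(); ++I)
    Bases[VL[I].Base].push_back({VL[I].Offset, I});

  std::vector<unsigned> Sorted;
  for (auto &Bucket : Bases) {
    std::stable_sort(Bucket.second.begin(), Bucket.second.end());
    for (auto &OffsetAndIdx : Bucket.second)
      Sorted.push_back(OffsetAndIdx.second);
  }
  return Sorted;
}

int main() {
  // Two interleaved bases: {A+1, B+0, A+0, B+1}.
  std::vector<Access> VL = {{0, 1}, {1, 0}, {0, 0}, {1, 1}};
  std::vector<unsigned> Order = clusterSort(VL);
  assert((Order == std::vector<unsigned>{2, 0, 1, 3}));
  return 0;
}
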
+
+Optional<BoUpSLP::OrdersType>
+BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
+ assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
+ Type *ScalarTy = TE.Scalars[0]->getType();
+
+ SmallVector<Value *> Ptrs;
+ Ptrs.reserve(TE.Scalars.size());
+ for (Value *V : TE.Scalars) {
+ auto *L = dyn_cast<LoadInst>(V);
+ if (!L || !L->isSimple())
+ return None;
+ Ptrs.push_back(L->getPointerOperand());
+ }
+
+ BoUpSLP::OrdersType Order;
+ if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
+ return Order;
+ return None;
+}
+
Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE,
bool TopToBottom) {
// No need to reorder if need to shuffle reuses, still need to shuffle the
@@ -3037,6 +3696,9 @@ Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE,
}
if (Optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
return CurrentOrder;
+ if (TE.Scalars.size() >= 4)
+ if (Optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
+ return Order;
}
return None;
}
@@ -3047,13 +3709,55 @@ void BoUpSLP::reorderTopToBottom() {
// ExtractElement gather nodes which can be vectorized and need to handle
// their ordering.
DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
+
+ // AltShuffles can also have a preferred ordering that leads to fewer
+ // instructions, e.g., the addsub instruction in x86.
+ DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
+
+ // Maps a TreeEntry to the reorder indices of external users.
+ DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
+ ExternalUserReorderMap;
+ // FIXME: Workaround for syntax error reported by MSVC buildbots.
+ TargetTransformInfo &TTIRef = *TTI;
// Find all reorderable nodes with the given VF.
// Currently these are vectorized stores, loads, extracts + some gathering of
// extracts.
- for_each(VectorizableTree, [this, &VFToOrderedEntries, &GathersToOrders](
+ for_each(VectorizableTree, [this, &TTIRef, &VFToOrderedEntries,
+ &GathersToOrders, &ExternalUserReorderMap,
+ &AltShufflesToOrders](
const std::unique_ptr<TreeEntry> &TE) {
+ // Look for external users that will probably be vectorized.
+ SmallVector<OrdersType, 1> ExternalUserReorderIndices =
+ findExternalStoreUsersReorderIndices(TE.get());
+ if (!ExternalUserReorderIndices.empty()) {
+ VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
+ ExternalUserReorderMap.try_emplace(TE.get(),
+ std::move(ExternalUserReorderIndices));
+ }
+
+ // Patterns like [fadd,fsub] can be combined into a single instruction in
+ // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
+ // to take into account their order when looking for the most used order.
+ if (TE->isAltShuffle()) {
+ VectorType *VecTy =
+ FixedVectorType::get(TE->Scalars[0]->getType(), TE->Scalars.size());
+ unsigned Opcode0 = TE->getOpcode();
+ unsigned Opcode1 = TE->getAltOpcode();
+ // The opcode mask selects between the two opcodes.
+ SmallBitVector OpcodeMask(TE->Scalars.size(), 0);
+ for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
+ if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
+ OpcodeMask.set(Lane);
+ // If this pattern is supported by the target then we consider the order.
+ if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
+ VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
+ AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
+ }
+ // TODO: Check the reverse order too.
+ }
+
if (Optional<OrdersType> CurrentOrder =
- getReorderingData(*TE.get(), /*TopToBottom=*/true)) {
+ getReorderingData(*TE, /*TopToBottom=*/true)) {
// Do not include ordering for nodes used in the alt opcode vectorization,
// better to reorder them during the bottom-to-top stage. If we follow the order
// here, it causes reordering of the whole graph though actually it is
@@ -3071,10 +3775,7 @@ void BoUpSLP::reorderTopToBottom() {
EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
}))
return;
- if (UserTE->UserTreeIndices.empty())
- UserTE = nullptr;
- else
- UserTE = UserTE->UserTreeIndices.back().UserTE;
+ UserTE = UserTE->UserTreeIndices.back().UserTE;
++Cnt;
}
VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
@@ -3105,11 +3806,30 @@ void BoUpSLP::reorderTopToBottom() {
if (!OpTE->ReuseShuffleIndices.empty())
continue;
// Count number of orders uses.
- const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & {
- if (OpTE->State == TreeEntry::NeedToGather)
- return GathersToOrders.find(OpTE)->second;
+ const auto &Order = [OpTE, &GathersToOrders,
+ &AltShufflesToOrders]() -> const OrdersType & {
+ if (OpTE->State == TreeEntry::NeedToGather) {
+ auto It = GathersToOrders.find(OpTE);
+ if (It != GathersToOrders.end())
+ return It->second;
+ }
+ if (OpTE->isAltShuffle()) {
+ auto It = AltShufflesToOrders.find(OpTE);
+ if (It != AltShufflesToOrders.end())
+ return It->second;
+ }
return OpTE->ReorderIndices;
}();
+ // First consider the order of the external scalar users.
+ auto It = ExternalUserReorderMap.find(OpTE);
+ if (It != ExternalUserReorderMap.end()) {
+ const auto &ExternalUserReorderIndices = It->second;
+ for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
+ ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
+ // No other useful reorder data in this entry.
+ if (Order.empty())
+ continue;
+ }
// Stores actually store the mask, not the order, need to invert.
if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
@@ -3199,6 +3919,57 @@ void BoUpSLP::reorderTopToBottom() {
}
}
+bool BoUpSLP::canReorderOperands(
+ TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
+ ArrayRef<TreeEntry *> ReorderableGathers,
+ SmallVectorImpl<TreeEntry *> &GatherOps) {
+ for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
+ if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
+ return OpData.first == I &&
+ OpData.second->State == TreeEntry::Vectorize;
+ }))
+ continue;
+ if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
+ // Do not reorder if operand node is used by many user nodes.
+ if (any_of(TE->UserTreeIndices,
+ [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
+ return false;
+ // Add the node to the list of the ordered nodes with the identity
+ // order.
+ Edges.emplace_back(I, TE);
+ // Add ScatterVectorize nodes to the list of operands, where just
+ // reordering of the scalars is required. Similar to the gathers, so
+ // simply add to the list of gathered ops.
+ // If there are reused scalars, process this node as a regular vectorize
+ // node, just reorder reuses mask.
+ if (TE->State != TreeEntry::Vectorize && TE->ReuseShuffleIndices.empty())
+ GatherOps.push_back(TE);
+ continue;
+ }
+ TreeEntry *Gather = nullptr;
+ if (count_if(ReorderableGathers,
+ [&Gather, UserTE, I](TreeEntry *TE) {
+ assert(TE->State != TreeEntry::Vectorize &&
+ "Only non-vectorized nodes are expected.");
+ if (any_of(TE->UserTreeIndices,
+ [UserTE, I](const EdgeInfo &EI) {
+ return EI.UserTE == UserTE && EI.EdgeIdx == I;
+ })) {
+ assert(TE->isSame(UserTE->getOperand(I)) &&
+ "Operand entry does not match operands.");
+ Gather = TE;
+ return true;
+ }
+ return false;
+ }) > 1 &&
+ !all_of(UserTE->getOperand(I), isConstant))
+ return false;
+ if (Gather)
+ GatherOps.push_back(Gather);
+ }
+ return true;
+}
+
void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
SetVector<TreeEntry *> OrderedEntries;
DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
@@ -3212,49 +3983,13 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
if (TE->State != TreeEntry::Vectorize)
NonVectorized.push_back(TE.get());
if (Optional<OrdersType> CurrentOrder =
- getReorderingData(*TE.get(), /*TopToBottom=*/false)) {
+ getReorderingData(*TE, /*TopToBottom=*/false)) {
OrderedEntries.insert(TE.get());
if (TE->State != TreeEntry::Vectorize)
GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
}
});
- // Checks if the operands of the users are reordarable and have only single
- // use.
- auto &&CheckOperands =
- [this, &NonVectorized](const auto &Data,
- SmallVectorImpl<TreeEntry *> &GatherOps) {
- for (unsigned I = 0, E = Data.first->getNumOperands(); I < E; ++I) {
- if (any_of(Data.second,
- [I](const std::pair<unsigned, TreeEntry *> &OpData) {
- return OpData.first == I &&
- OpData.second->State == TreeEntry::Vectorize;
- }))
- continue;
- ArrayRef<Value *> VL = Data.first->getOperand(I);
- const TreeEntry *TE = nullptr;
- const auto *It = find_if(VL, [this, &TE](Value *V) {
- TE = getTreeEntry(V);
- return TE;
- });
- if (It != VL.end() && TE->isSame(VL))
- return false;
- TreeEntry *Gather = nullptr;
- if (count_if(NonVectorized, [VL, &Gather](TreeEntry *TE) {
- assert(TE->State != TreeEntry::Vectorize &&
- "Only non-vectorized nodes are expected.");
- if (TE->isSame(VL)) {
- Gather = TE;
- return true;
- }
- return false;
- }) > 1)
- return false;
- if (Gather)
- GatherOps.push_back(Gather);
- }
- return true;
- };
// 1. Propagate order to the graph nodes, which use only reordered nodes.
// I.e., if the node has operands, that are reordered, try to make at least
// one operand order in the natural order and reorder others + reorder the
@@ -3263,7 +3998,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
while (!OrderedEntries.empty()) {
// 1. Filter out only reordered nodes.
// 2. If the entry has multiple uses - skip it and jump to the next node.
- MapVector<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
+ DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
SmallVector<TreeEntry *> Filtered;
for (TreeEntry *TE : OrderedEntries) {
if (!(TE->State == TreeEntry::Vectorize ||
@@ -3291,10 +4026,17 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
// Erase filtered entries.
for_each(Filtered,
[&OrderedEntries](TreeEntry *TE) { OrderedEntries.remove(TE); });
- for (const auto &Data : Users) {
+ SmallVector<
+ std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
+ UsersVec(Users.begin(), Users.end());
+ sort(UsersVec, [](const auto &Data1, const auto &Data2) {
+ return Data1.first->Idx > Data2.first->Idx;
+ });
+ for (auto &Data : UsersVec) {
// Check that operands are used only in the User node.
SmallVector<TreeEntry *> GatherOps;
- if (!CheckOperands(Data, GatherOps)) {
+ if (!canReorderOperands(Data.first, Data.second, NonVectorized,
+ GatherOps)) {
for_each(Data.second,
[&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
OrderedEntries.remove(Op.second);
@@ -3310,18 +4052,22 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
// the same node may be considered several times, though it might not be
// profitable.
SmallPtrSet<const TreeEntry *, 4> VisitedOps;
+ SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
for (const auto &Op : Data.second) {
TreeEntry *OpTE = Op.second;
if (!VisitedOps.insert(OpTE).second)
continue;
- if (!OpTE->ReuseShuffleIndices.empty() ||
- (IgnoreReorder && OpTE == VectorizableTree.front().get()))
+ if (!OpTE->ReuseShuffleIndices.empty())
continue;
const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & {
if (OpTE->State == TreeEntry::NeedToGather)
return GathersToOrders.find(OpTE)->second;
return OpTE->ReorderIndices;
}();
+ unsigned NumOps = count_if(
+ Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
+ return P.second == OpTE;
+ });
// Stores actually store the mask, not the order, need to invert.
if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
@@ -3333,14 +4079,52 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx);
});
fixupOrderingIndices(CurrentOrder);
- ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
+ OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
+ NumOps;
} else {
- ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
+ OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
+ }
+ auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
+ const auto &&AllowsReordering = [IgnoreReorder, &GathersToOrders](
+ const TreeEntry *TE) {
+ if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
+ (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
+ (IgnoreReorder && TE->Idx == 0))
+ return true;
+ if (TE->State == TreeEntry::NeedToGather) {
+ auto It = GathersToOrders.find(TE);
+ if (It != GathersToOrders.end())
+ return !It->second.empty();
+ return true;
+ }
+ return false;
+ };
+ for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
+ TreeEntry *UserTE = EI.UserTE;
+ if (!VisitedUsers.insert(UserTE).second)
+ continue;
+ // May reorder user node if it requires reordering, has reused
+ // scalars, is an alternate op vectorize node or its op nodes require
+ // reordering.
+ if (AllowsReordering(UserTE))
+ continue;
+ // Check if users allow reordering.
+ // Currently look up just 1 level of operands to avoid increase of
+ // the compile time.
+ // Profitable to reorder if definitely more operands allow
+ // reordering rather than those with natural order.
+ ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE];
+ if (static_cast<unsigned>(count_if(
+ Ops, [UserTE, &AllowsReordering](
+ const std::pair<unsigned, TreeEntry *> &Op) {
+ return AllowsReordering(Op.second) &&
+ all_of(Op.second->UserTreeIndices,
+ [UserTE](const EdgeInfo &EI) {
+ return EI.UserTE == UserTE;
+ });
+ })) <= Ops.size() / 2)
+ ++Res.first->second;
}
- OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
- OpTE->UserTreeIndices.size();
- assert(OrdersUses[{}] > 0 && "Counter cannot be less than 0.");
- --OrdersUses[{}];
}
// If no orders - skip current nodes and jump to the next one, if any.
if (OrdersUses.empty()) {
@@ -3381,7 +4165,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
OrderedEntries.remove(TE);
if (!VisitedOps.insert(TE).second)
continue;
- if (!TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty()) {
+ if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
// Just reorder reuses indices.
reorderReuses(TE->ReuseShuffleIndices, Mask);
continue;
@@ -3393,6 +4177,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
TE->ReorderIndices.empty()) &&
"Non-matching sizes of user/operand entries.");
reorderOrder(TE->ReorderIndices, Mask);
+ if (IgnoreReorder && TE == VectorizableTree.front().get())
+ IgnoreReorder = false;
}
// For gathers just need to reorder its scalars.
for (TreeEntry *Gather : GatherOps) {
@@ -3484,7 +4270,7 @@ void BoUpSLP::buildExternalUses(
}
// Ignore users in the user ignore list.
- if (is_contained(UserIgnoreList, UserInst))
+ if (UserIgnoreList && UserIgnoreList->contains(UserInst))
continue;
LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
@@ -3495,78 +4281,270 @@ void BoUpSLP::buildExternalUses(
}
}
+DenseMap<Value *, SmallVector<StoreInst *, 4>>
+BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
+ DenseMap<Value *, SmallVector<StoreInst *, 4>> PtrToStoresMap;
+ for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
+ Value *V = TE->Scalars[Lane];
+ // To save compilation time we don't visit if we have too many users.
+ static constexpr unsigned UsersLimit = 4;
+ if (V->hasNUsesOrMore(UsersLimit))
+ break;
+
+ // Collect stores per pointer object.
+ for (User *U : V->users()) {
+ auto *SI = dyn_cast<StoreInst>(U);
+ if (SI == nullptr || !SI->isSimple() ||
+ !isValidElementType(SI->getValueOperand()->getType()))
+ continue;
+ // Skip entry if already part of the vectorization tree.
+ if (getTreeEntry(U))
+ continue;
+
+ Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
+ auto &StoresVec = PtrToStoresMap[Ptr];
+ // For now just keep one store per pointer object per lane.
+ // TODO: Extend this to support multiple stores per pointer per lane
+ if (StoresVec.size() > Lane)
+ continue;
+ // Skip if in different BBs.
+ if (!StoresVec.empty() &&
+ SI->getParent() != StoresVec.back()->getParent())
+ continue;
+ // Make sure that the stores are of the same type.
+ if (!StoresVec.empty() &&
+ SI->getValueOperand()->getType() !=
+ StoresVec.back()->getValueOperand()->getType())
+ continue;
+ StoresVec.push_back(SI);
+ }
+ }
+ return PtrToStoresMap;
+}
+
+bool BoUpSLP::CanFormVector(const SmallVector<StoreInst *, 4> &StoresVec,
+ OrdersType &ReorderIndices) const {
+ // We check whether the stores in StoresVec can form a vector by sorting them
+ // and checking whether they are consecutive.
+
+ // To avoid calling getPointersDiff() while sorting we create a vector of
+ // pairs {store, offset from first} and sort this instead.
+ SmallVector<std::pair<StoreInst *, int>, 4> StoreOffsetVec(StoresVec.size());
+ StoreInst *S0 = StoresVec[0];
+ StoreOffsetVec[0] = {S0, 0};
+ Type *S0Ty = S0->getValueOperand()->getType();
+ Value *S0Ptr = S0->getPointerOperand();
+ for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
+ StoreInst *SI = StoresVec[Idx];
+ Optional<int> Diff =
+ getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
+ SI->getPointerOperand(), *DL, *SE,
+ /*StrictCheck=*/true);
+ // We failed to compare the pointers so just abandon this StoresVec.
+ if (!Diff)
+ return false;
+ StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
+ }
+
+ // Sort the vector based on the pointers. We create a copy because we may
+ // need the original later for calculating the reorder (shuffle) indices.
+ stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1,
+ const std::pair<StoreInst *, int> &Pair2) {
+ int Offset1 = Pair1.second;
+ int Offset2 = Pair2.second;
+ return Offset1 < Offset2;
+ });
+
+ // Check if the stores are consecutive by checking if their difference is 1.
+ for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
+ if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx-1].second + 1)
+ return false;
+
+ // Calculate the shuffle indices according to their offset against the sorted
+ // StoreOffsetVec.
+ ReorderIndices.reserve(StoresVec.size());
+ for (StoreInst *SI : StoresVec) {
+ unsigned Idx = find_if(StoreOffsetVec,
+ [SI](const std::pair<StoreInst *, int> &Pair) {
+ return Pair.first == SI;
+ }) -
+ StoreOffsetVec.begin();
+ ReorderIndices.push_back(Idx);
+ }
+ // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
+ // reorderTopToBottom() and reorderBottomToTop(), so we are following the
+ // same convention here.
+ auto IsIdentityOrder = [](const OrdersType &Order) {
+ for (unsigned Idx : seq<unsigned>(0, Order.size()))
+ if (Idx != Order[Idx])
+ return false;
+ return true;
+ };
+ if (IsIdentityOrder(ReorderIndices))
+ ReorderIndices.clear();
+
+ return true;
+}
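
Stripped of the IR details, the check is: take each store's offset from the first store, sort the offsets, require them to be consecutive, and use each store's rank in that sorted order as its reorder index, dropping identity orders. A standalone sketch over integer offsets (canFormVector here is a stand-in, not the member function above):

#include <algorithm>
#include <cassert>
#include <vector>

// Given per-store offsets from the first store, decide whether the stores
// are consecutive and, if so, fill ReorderIndices with each store's position
// in the sorted (memory) order. An identity order is returned empty.
bool canFormVector(const std::vector<int> &Offsets,
                   std::vector<unsigned> &ReorderIndices) {
  std::vector<int> Sorted(Offsets);
  std::sort(Sorted.begin(), Sorted.end());
  for (size_t I = 1; I < Sorted.size(); ++I)
    if (Sorted[I] != Sorted[I - 1] + 1)
      return false;

  ReorderIndices.clear();
  for (int Off : Offsets)
    ReorderIndices.push_back(
        std::lower_bound(Sorted.begin(), Sorted.end(), Off) - Sorted.begin());

  bool Identity = true;
  for (unsigned I = 0; I < ReorderIndices.size(); ++I)
    Identity &= ReorderIndices[I] == I;
  if (Identity)
    ReorderIndices.clear(); // same convention as the reordering code above
  return true;
}

int main() {
  std::vector<unsigned> Idx;
  assert(canFormVector({1, 0, 3, 2}, Idx) &&
         (Idx == std::vector<unsigned>{1, 0, 3, 2}));
  assert(canFormVector({0, 1, 2, 3}, Idx) && Idx.empty());
  assert(!canFormVector({0, 2, 4, 6}, Idx));
  return 0;
}
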
+
+#ifndef NDEBUG
+LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
+ for (unsigned Idx : Order)
+ dbgs() << Idx << ", ";
+ dbgs() << "\n";
+}
+#endif
+
+SmallVector<BoUpSLP::OrdersType, 1>
+BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
+ unsigned NumLanes = TE->Scalars.size();
+
+ DenseMap<Value *, SmallVector<StoreInst *, 4>> PtrToStoresMap =
+ collectUserStores(TE);
+
+ // Holds the reorder indices for each candidate store vector that is a user of
+ // the current TreeEntry.
+ SmallVector<OrdersType, 1> ExternalReorderIndices;
+
+ // Now inspect the stores collected per pointer and look for vectorization
+ // candidates. For each candidate calculate the reorder index vector and push
+ // it into `ExternalReorderIndices`
+ for (const auto &Pair : PtrToStoresMap) {
+ auto &StoresVec = Pair.second;
+ // If we have fewer than NumLanes stores, then we can't form a vector.
+ if (StoresVec.size() != NumLanes)
+ continue;
+
+ // If the stores are not consecutive then abandon this StoresVec.
+ OrdersType ReorderIndices;
+ if (!CanFormVector(StoresVec, ReorderIndices))
+ continue;
+
+ // We now know that the scalars in StoresVec can form a vector instruction,
+ // so set the reorder indices.
+ ExternalReorderIndices.push_back(ReorderIndices);
+ }
+ return ExternalReorderIndices;
+}
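
Before the consecutiveness check runs, only store groups that contain exactly one store per lane are kept; partial groups can never form a full vector. A small sketch of that filtering step with stand-in store ids:

#include <cassert>
#include <map>
#include <vector>

// Keep only those store groups that cover every lane of the tree entry;
// partial groups cannot form a vector and are dropped.
std::vector<std::vector<int>>
collectFullGroups(const std::map<int, std::vector<int>> &StoresPerBase,
                  unsigned NumLanes) {
  std::vector<std::vector<int>> Candidates;
  for (const auto &Group : StoresPerBase)
    if (Group.second.size() == NumLanes)
      Candidates.push_back(Group.second);
  return Candidates;
}

int main() {
  // Base 0 has a store for every lane, base 1 only for two of four lanes.
  std::map<int, std::vector<int>> StoresPerBase = {{0, {10, 11, 12, 13}},
                                                   {1, {20, 21}}};
  assert(collectFullGroups(StoresPerBase, 4).size() == 1);
  return 0;
}
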
+
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
- ArrayRef<Value *> UserIgnoreLst) {
+ const SmallDenseSet<Value *> &UserIgnoreLst) {
deleteTree();
- UserIgnoreList = UserIgnoreLst;
+ UserIgnoreList = &UserIgnoreLst;
if (!allSameType(Roots))
return;
buildTree_rec(Roots, 0, EdgeInfo());
}
-namespace {
-/// Tracks the state we can represent the loads in the given sequence.
-enum class LoadsState { Gather, Vectorize, ScatterVectorize };
-} // anonymous namespace
-
-/// Checks if the given array of loads can be represented as a vectorized,
-/// scatter or just simple gather.
-static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
- const TargetTransformInfo &TTI,
- const DataLayout &DL, ScalarEvolution &SE,
- SmallVectorImpl<unsigned> &Order,
- SmallVectorImpl<Value *> &PointerOps) {
- // Check that a vectorized load would load the same memory as a scalar
- // load. For example, we don't want to vectorize loads that are smaller
- // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
- // treats loading/storing it as an i8 struct. If we vectorize loads/stores
- // from such a struct, we read/write packed bits disagreeing with the
- // unvectorized version.
- Type *ScalarTy = VL0->getType();
-
- if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy))
- return LoadsState::Gather;
+void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
+ deleteTree();
+ if (!allSameType(Roots))
+ return;
+ buildTree_rec(Roots, 0, EdgeInfo());
+}
- // Make sure all loads in the bundle are simple - we can't vectorize
- // atomic or volatile loads.
- PointerOps.clear();
- PointerOps.resize(VL.size());
- auto *POIter = PointerOps.begin();
+/// \return true if the specified list of values has only one instruction that
+/// requires scheduling, false otherwise.
+#ifndef NDEBUG
+static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
+ Value *NeedsScheduling = nullptr;
for (Value *V : VL) {
- auto *L = cast<LoadInst>(V);
- if (!L->isSimple())
- return LoadsState::Gather;
- *POIter = L->getPointerOperand();
- ++POIter;
+ if (doesNotNeedToBeScheduled(V))
+ continue;
+ if (!NeedsScheduling) {
+ NeedsScheduling = V;
+ continue;
+ }
+ return false;
}
+ return NeedsScheduling;
+}
+#endif
- Order.clear();
- // Check the order of pointer operands.
- if (llvm::sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order)) {
- Value *Ptr0;
- Value *PtrN;
- if (Order.empty()) {
- Ptr0 = PointerOps.front();
- PtrN = PointerOps.back();
+/// Generates a key/subkey pair for the given value to provide effective
+/// sorting of the values and better detection of vectorizable value
+/// sequences. The keys/subkeys can be used for better sorting of the values
+/// themselves (keys) and within value subgroups (subkeys).
+static std::pair<size_t, size_t> generateKeySubkey(
+ Value *V, const TargetLibraryInfo *TLI,
+ function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
+ bool AllowAlternate) {
+ hash_code Key = hash_value(V->getValueID() + 2);
+ hash_code SubKey = hash_value(0);
+ // Sort the loads by the distance between the pointers.
+ if (auto *LI = dyn_cast<LoadInst>(V)) {
+ Key = hash_combine(hash_value(Instruction::Load), Key);
+ if (LI->isSimple())
+ SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
+ else
+ SubKey = hash_value(LI);
+ } else if (isVectorLikeInstWithConstOps(V)) {
+ // Sort extracts by the vector operands.
+ if (isa<ExtractElementInst, UndefValue>(V))
+ Key = hash_value(Value::UndefValueVal + 1);
+ if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
+ if (!isUndefVector(EI->getVectorOperand()) &&
+ !isa<UndefValue>(EI->getIndexOperand()))
+ SubKey = hash_value(EI->getVectorOperand());
+ }
+ } else if (auto *I = dyn_cast<Instruction>(V)) {
+ // Sort other instructions just by the opcodes except for CMPInst.
+ // For CMP also sort by the predicate kind.
+ if ((isa<BinaryOperator>(I) || isa<CastInst>(I)) &&
+ isValidForAlternation(I->getOpcode())) {
+ if (AllowAlternate)
+ Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
+ else
+ Key = hash_combine(hash_value(I->getOpcode()), Key);
+ SubKey = hash_combine(
+ hash_value(I->getOpcode()), hash_value(I->getType()),
+ hash_value(isa<BinaryOperator>(I)
+ ? I->getType()
+ : cast<CastInst>(I)->getOperand(0)->getType()));
+ // For casts, look through the only operand to improve compile time.
+ if (isa<CastInst>(I)) {
+ std::pair<size_t, size_t> OpVals =
+ generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
+ /*AllowAlternate=*/true);
+ Key = hash_combine(OpVals.first, Key);
+ SubKey = hash_combine(OpVals.first, SubKey);
+ }
+ } else if (auto *CI = dyn_cast<CmpInst>(I)) {
+ CmpInst::Predicate Pred = CI->getPredicate();
+ if (CI->isCommutative())
+ Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
+ CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
+ SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
+ hash_value(SwapPred),
+ hash_value(CI->getOperand(0)->getType()));
+ } else if (auto *Call = dyn_cast<CallInst>(I)) {
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
+ if (isTriviallyVectorizable(ID)) {
+ SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
+ } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
+ SubKey = hash_combine(hash_value(I->getOpcode()),
+ hash_value(Call->getCalledFunction()));
+ } else {
+ Key = hash_combine(hash_value(Call), Key);
+ SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
+ }
+ for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
+ SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
+ hash_value(Op.Tag), SubKey);
+ } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
+ if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
+ SubKey = hash_value(Gep->getPointerOperand());
+ else
+ SubKey = hash_value(Gep);
+ } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
+ !isa<ConstantInt>(I->getOperand(1))) {
+ // Do not try to vectorize instructions with potentially high cost.
+ SubKey = hash_value(I);
} else {
- Ptr0 = PointerOps[Order.front()];
- PtrN = PointerOps[Order.back()];
+ SubKey = hash_value(I->getOpcode());
}
- Optional<int> Diff =
- getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
- // Check that the sorted loads are consecutive.
- if (static_cast<unsigned>(*Diff) == VL.size() - 1)
- return LoadsState::Vectorize;
- Align CommonAlignment = cast<LoadInst>(VL0)->getAlign();
- for (Value *V : VL)
- CommonAlignment =
- commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign());
- if (TTI.isLegalMaskedGather(FixedVectorType::get(ScalarTy, VL.size()),
- CommonAlignment))
- return LoadsState::ScatterVectorize;
+ Key = hash_combine(hash_value(I->getParent()), Key);
}
-
- return LoadsState::Gather;
+ return std::make_pair(Key, SubKey);
}
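
The pair works as a two-level grouping: the key is a coarse bucket (roughly the kind of value), the subkey refines it with details such as opcode, type, or pointer operand, and both are built by hash combining. A toy stand-in using std::hash follows; the real code uses llvm::hash_combine and inspects the IR directly, so the names here are illustrative only:

#include <cassert>
#include <cstddef>
#include <functional>
#include <string>
#include <utility>

// Simple hash combiner in the spirit of llvm::hash_combine (not identical).
static std::size_t combine(std::size_t Seed, std::size_t V) {
  return Seed ^ (V + 0x9e3779b9u + (Seed << 6) + (Seed >> 2));
}

// Coarse key: the "kind" of a candidate; fine subkey: kind plus details.
// Values with equal keys sort next to each other; equal subkeys form
// subgroups that are more likely to vectorize together.
static std::pair<std::size_t, std::size_t> keySubkey(const std::string &Kind,
                                                     const std::string &Detail) {
  std::size_t Key = std::hash<std::string>{}(Kind);
  std::size_t SubKey = combine(Key, std::hash<std::string>{}(Detail));
  return {Key, SubKey};
}

int main() {
  auto A = keySubkey("load", "base_ptr_p");
  auto B = keySubkey("load", "base_ptr_q");
  auto C = keySubkey("add", "i32");
  assert(A.first == B.first); // same coarse group: both are loads
  // A.second and B.second will (almost surely) differ: distinct subgroups,
  // and C lands in a different coarse group entirely.
  (void)C;
  return 0;
}
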
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
@@ -3651,10 +4629,84 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// If all of the operands are identical or constant we have a simple solution.
// If we deal with insert/extract instructions, they all must have constant
// indices, otherwise we should gather them, not try to vectorize.
- if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode() ||
- (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(S.MainOp) &&
- !all_of(VL, isVectorLikeInstWithConstOps))) {
- LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
+ // Do not vectorize an alternate-opcode node with 2 elements whose operands
+ // are gathered.
+ auto &&NotProfitableForVectorization = [&S, this,
+ Depth](ArrayRef<Value *> VL) {
+ if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
+ return false;
+ if (VectorizableTree.size() < MinTreeSize)
+ return false;
+ if (Depth >= RecursionMaxDepth - 1)
+ return true;
+ // Check if all operands are extracts, part of vector node or can build a
+ // regular vectorize node.
+ SmallVector<unsigned, 2> InstsCount(VL.size(), 0);
+ for (Value *V : VL) {
+ auto *I = cast<Instruction>(V);
+ InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
+ return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
+ }));
+ }
+ bool IsCommutative = isCommutative(S.MainOp) || isCommutative(S.AltOp);
+ if ((IsCommutative &&
+ std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
+ (!IsCommutative &&
+ all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
+ return true;
+ assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
+ SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
+ auto *I1 = cast<Instruction>(VL.front());
+ auto *I2 = cast<Instruction>(VL.back());
+ for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
+ Candidates.emplace_back().emplace_back(I1->getOperand(Op),
+ I2->getOperand(Op));
+ if (static_cast<unsigned>(count_if(
+ Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
+ return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
+ })) >= S.MainOp->getNumOperands() / 2)
+ return false;
+ if (S.MainOp->getNumOperands() > 2)
+ return true;
+ if (IsCommutative) {
+ // Check permuted operands.
+ Candidates.clear();
+ for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
+ Candidates.emplace_back().emplace_back(I1->getOperand(Op),
+ I2->getOperand((Op + 1) % E));
+ if (any_of(
+ Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
+ return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
+ }))
+ return false;
+ }
+ return true;
+ };
+ SmallVector<unsigned> SortedIndices;
+ BasicBlock *BB = nullptr;
+ bool AreAllSameInsts =
+ (S.getOpcode() && allSameBlock(VL)) ||
+ (S.OpValue->getType()->isPointerTy() && UserTreeIdx.UserTE &&
+ UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize &&
+ VL.size() > 2 &&
+ all_of(VL,
+ [&BB](Value *V) {
+ auto *I = dyn_cast<GetElementPtrInst>(V);
+ if (!I)
+ return doesNotNeedToBeScheduled(V);
+ if (!BB)
+ BB = I->getParent();
+ return BB == I->getParent() && I->getNumOperands() == 2;
+ }) &&
+ BB &&
+ sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
+ SortedIndices));
+ if (allConstant(VL) || isSplat(VL) || !AreAllSameInsts ||
+ (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
+ S.OpValue) &&
+ !all_of(VL, isVectorLikeInstWithConstOps)) ||
+ NotProfitableForVectorization(VL)) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
if (TryToFindDuplicates(S))
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
@@ -3665,12 +4717,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// the same block.
// Don't vectorize ephemeral values.
- for (Value *V : VL) {
- if (EphValues.count(V)) {
- LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
- << ") is ephemeral.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
- return;
+ if (!EphValues.empty()) {
+ for (Value *V : VL) {
+ if (EphValues.count(V)) {
+ LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
+ << ") is ephemeral.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
}
}
@@ -3708,20 +4762,37 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
// The reduction nodes (stored in UserIgnoreList) also should stay scalar.
- for (Value *V : VL) {
- if (is_contained(UserIgnoreList, V)) {
- LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
- if (TryToFindDuplicates(S))
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- return;
+ if (UserIgnoreList && !UserIgnoreList->empty()) {
+ for (Value *V : VL) {
+ if (UserIgnoreList && UserIgnoreList->contains(V)) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
+ if (TryToFindDuplicates(S))
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ return;
+ }
}
}
+ // Special processing for sorted pointers for ScatterVectorize node with
+ // constant indices only.
+ if (AreAllSameInsts && !(S.getOpcode() && allSameBlock(VL)) &&
+ UserTreeIdx.UserTE &&
+ UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize) {
+ assert(S.OpValue->getType()->isPointerTy() &&
+ count_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); }) >=
+ 2 &&
+ "Expected pointers only.");
+ // Reset S to make it a GetElementPtr kind of node.
+ const auto *It = find_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); });
+ assert(It != VL.end() && "Expected at least one GEP.");
+ S = getSameOpcode(*It);
+ }
+
// Check that all of the users of the scalars that we want to vectorize are
// schedulable.
auto *VL0 = cast<Instruction>(S.OpValue);
- BasicBlock *BB = VL0->getParent();
+ BB = VL0->getParent();
if (!DT->isReachableFromEntry(BB)) {
// Don't go into unreachable blocks. They may contain instructions with
@@ -3739,9 +4810,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (!BSRef)
BSRef = std::make_unique<BlockScheduling>(BB);
- BlockScheduling &BS = *BSRef.get();
+ BlockScheduling &BS = *BSRef;
Optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S);
+#ifdef EXPENSIVE_CHECKS
+ // Make sure we didn't break any internal invariants
+ BS.verify();
+#endif
if (!Bundle) {
LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
assert((!BS.getScheduleData(VL0) ||
@@ -3761,10 +4836,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Check for terminator values (e.g. invoke).
for (Value *V : VL)
- for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
- Instruction *Term = dyn_cast<Instruction>(
- cast<PHINode>(V)->getIncomingValueForBlock(
- PH->getIncomingBlock(I)));
+ for (Value *Incoming : cast<PHINode>(V)->incoming_values()) {
+ Instruction *Term = dyn_cast<Instruction>(Incoming);
if (Term && Term->isTerminator()) {
LLVM_DEBUG(dbgs()
<< "SLP: Need to swizzle PHINodes (terminator use).\n");
@@ -3908,7 +4981,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
SmallVector<Value *> PointerOps;
OrdersType CurrentOrder;
TreeEntry *TE = nullptr;
- switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, CurrentOrder,
+ switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, *LI, CurrentOrder,
PointerOps)) {
case LoadsState::Vectorize:
if (CurrentOrder.empty()) {
@@ -4089,7 +5162,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::GetElementPtr: {
// We don't combine GEPs with complicated (nested) indexing.
for (Value *V : VL) {
- if (cast<Instruction>(V)->getNumOperands() != 2) {
+ auto *I = dyn_cast<GetElementPtrInst>(V);
+ if (!I)
+ continue;
+ if (I->getNumOperands() != 2) {
LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
@@ -4100,9 +5176,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// We can't combine several GEPs into one vector if they operate on
// different types.
- Type *Ty0 = VL0->getOperand(0)->getType();
+ Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
for (Value *V : VL) {
- Type *CurTy = cast<Instruction>(V)->getOperand(0)->getType();
+ auto *GEP = dyn_cast<GEPOperator>(V);
+ if (!GEP)
+ continue;
+ Type *CurTy = GEP->getSourceElementType();
if (Ty0 != CurTy) {
LLVM_DEBUG(dbgs()
<< "SLP: not-vectorizable GEP (different types).\n");
@@ -4113,15 +5192,22 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
}
+ bool IsScatterUser =
+ UserTreeIdx.UserTE &&
+ UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
// We don't combine GEPs with non-constant indexes.
Type *Ty1 = VL0->getOperand(1)->getType();
for (Value *V : VL) {
- auto Op = cast<Instruction>(V)->getOperand(1);
- if (!isa<ConstantInt>(Op) ||
+ auto *I = dyn_cast<GetElementPtrInst>(V);
+ if (!I)
+ continue;
+ auto *Op = I->getOperand(1);
+ if ((!IsScatterUser && !isa<ConstantInt>(Op)) ||
(Op->getType() != Ty1 &&
- Op->getType()->getScalarSizeInBits() >
- DL->getIndexSizeInBits(
- V->getType()->getPointerAddressSpace()))) {
+ ((IsScatterUser && !isa<ConstantInt>(Op)) ||
+ Op->getType()->getScalarSizeInBits() >
+ DL->getIndexSizeInBits(
+ V->getType()->getPointerAddressSpace())))) {
LLVM_DEBUG(dbgs()
<< "SLP: not-vectorizable GEP (non-constant indexes).\n");
BS.cancelScheduling(VL, VL0);
@@ -4136,9 +5222,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
SmallVector<ValueList, 2> Operands(2);
// Prepare the operand vector for pointer operands.
- for (Value *V : VL)
- Operands.front().push_back(
- cast<GetElementPtrInst>(V)->getPointerOperand());
+ for (Value *V : VL) {
+ auto *GEP = dyn_cast<GetElementPtrInst>(V);
+ if (!GEP) {
+ Operands.front().push_back(V);
+ continue;
+ }
+ Operands.front().push_back(GEP->getPointerOperand());
+ }
TE->setOperand(0, Operands.front());
// Need to cast all indices to the same type before vectorization to
// avoid crash.
@@ -4149,9 +5240,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
Type *Ty = all_of(VL,
[VL0Ty, IndexIdx](Value *V) {
- return VL0Ty == cast<GetElementPtrInst>(V)
- ->getOperand(IndexIdx)
- ->getType();
+ auto *GEP = dyn_cast<GetElementPtrInst>(V);
+ if (!GEP)
+ return true;
+ return VL0Ty == GEP->getOperand(IndexIdx)->getType();
})
? VL0Ty
: DL->getIndexType(cast<GetElementPtrInst>(VL0)
@@ -4159,10 +5251,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
->getScalarType());
// Prepare the operand vector.
for (Value *V : VL) {
- auto *Op = cast<Instruction>(V)->getOperand(IndexIdx);
- auto *CI = cast<ConstantInt>(Op);
- Operands.back().push_back(ConstantExpr::getIntegerCast(
- CI, Ty, CI->getValue().isSignBitSet()));
+ auto *I = dyn_cast<GetElementPtrInst>(V);
+ if (!I) {
+ Operands.back().push_back(
+ ConstantInt::get(Ty, 0, /*isSigned=*/false));
+ continue;
+ }
+ auto *Op = I->getOperand(IndexIdx);
+ auto *CI = dyn_cast<ConstantInt>(Op);
+ if (!CI)
+ Operands.back().push_back(Op);
+ else
+ Operands.back().push_back(ConstantExpr::getIntegerCast(
+ CI, Ty, CI->getValue().isSignBitSet()));
}
TE->setOperand(IndexIdx, Operands.back());
@@ -4268,7 +5369,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
unsigned NumArgs = CI->arg_size();
SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr);
for (unsigned j = 0; j != NumArgs; ++j)
- if (hasVectorInstrinsicScalarOpd(ID, j))
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, j))
ScalarArgs[j] = CI->getArgOperand(j);
for (Value *V : VL) {
CallInst *CI2 = dyn_cast<CallInst>(V);
@@ -4287,7 +5388,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Some intrinsics have scalar arguments and should be the same in order for
// them to be vectorized.
for (unsigned j = 0; j != NumArgs; ++j) {
- if (hasVectorInstrinsicScalarOpd(ID, j)) {
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, j)) {
Value *A1J = CI2->getArgOperand(j);
if (ScalarArgs[j] != A1J) {
BS.cancelScheduling(VL, VL0);
@@ -4320,7 +5421,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) {
// For scalar operands, no need to create an entry since there is no need to
// vectorize it.
- if (hasVectorInstrinsicScalarOpd(ID, i))
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, i))
continue;
ValueList Operands;
// Prepare the operand vector.
@@ -4347,9 +5448,42 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
// Reorder operands if reordering would enable vectorization.
- if (isa<BinaryOperator>(VL0)) {
+ auto *CI = dyn_cast<CmpInst>(VL0);
+ if (isa<BinaryOperator>(VL0) || CI) {
ValueList Left, Right;
- reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
+ if (!CI || all_of(VL, [](Value *V) {
+ return cast<CmpInst>(V)->isCommutative();
+ })) {
+ reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
+ } else {
+ CmpInst::Predicate P0 = CI->getPredicate();
+ CmpInst::Predicate AltP0 = cast<CmpInst>(S.AltOp)->getPredicate();
+ assert(P0 != AltP0 &&
+ "Expected different main/alternate predicates.");
+ CmpInst::Predicate AltP0Swapped = CmpInst::getSwappedPredicate(AltP0);
+ Value *BaseOp0 = VL0->getOperand(0);
+ Value *BaseOp1 = VL0->getOperand(1);
+ // Collect operands - commute if it uses the swapped predicate or
+ // alternate operation.
+ for (Value *V : VL) {
+ auto *Cmp = cast<CmpInst>(V);
+ Value *LHS = Cmp->getOperand(0);
+ Value *RHS = Cmp->getOperand(1);
+ CmpInst::Predicate CurrentPred = Cmp->getPredicate();
+ if (P0 == AltP0Swapped) {
+ if (CI != Cmp && S.AltOp != Cmp &&
+ ((P0 == CurrentPred &&
+ !areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)) ||
+ (AltP0 == CurrentPred &&
+ areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS))))
+ std::swap(LHS, RHS);
+ } else if (P0 != CurrentPred && AltP0 != CurrentPred) {
+ std::swap(LHS, RHS);
+ }
+ Left.push_back(LHS);
+ Right.push_back(RHS);
+ }
+ }
TE->setOperand(0, Left);
TE->setOperand(1, Right);
buildTree_rec(Left, Depth + 1, {TE, 0});
@@ -4493,7 +5627,9 @@ bool BoUpSLP::areAllUsersVectorized(Instruction *I,
ArrayRef<Value *> VectorizedVals) const {
return (I->hasOneUse() && is_contained(VectorizedVals, I)) ||
all_of(I->users(), [this](User *U) {
- return ScalarToTreeEntry.count(U) > 0 || MustGather.contains(U);
+ return ScalarToTreeEntry.count(U) > 0 ||
+ isVectorLikeInstWithConstOps(U) ||
+ (isa<ExtractElementInst>(U) && MustGather.contains(U));
});
}
@@ -4550,19 +5686,21 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
// Process extracts in blocks of EltsPerVector to check if the source vector
// operand can be re-used directly. If not, add the cost of creating a shuffle
// to extract the values into a vector register.
+ SmallVector<int> RegMask(EltsPerVector, UndefMaskElem);
for (auto *V : VL) {
++Idx;
- // Need to exclude undefs from analysis.
- if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem)
- continue;
-
     // Reached the start of a new vector register.
if (Idx % EltsPerVector == 0) {
+ RegMask.assign(EltsPerVector, UndefMaskElem);
AllConsecutive = true;
continue;
}
+ // Need to exclude undefs from analysis.
+ if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem)
+ continue;
+
     // Check whether all extracts for a vector register on the target directly
     // extract values in order.
unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V));
@@ -4570,6 +5708,7 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1]));
AllConsecutive &= PrevIdx + 1 == CurrentIdx &&
CurrentIdx % EltsPerVector == Idx % EltsPerVector;
+ RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector;
}
if (AllConsecutive)
@@ -4581,10 +5720,10 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
// If we have a series of extracts which are not consecutive and hence
// cannot re-use the source vector register directly, compute the shuffle
- // cost to extract the a vector with EltsPerVector elements.
+ // cost to extract the vector with EltsPerVector elements.
Cost += TTI.getShuffleCost(
TargetTransformInfo::SK_PermuteSingleSrc,
- FixedVectorType::get(VecTy->getElementType(), EltsPerVector));
+ FixedVectorType::get(VecTy->getElementType(), EltsPerVector), RegMask);
}
return Cost;
}
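To make the new per-register mask concrete, here is a small standalone model of the loop above with an assumed bundle of four extracts whose source indices are {2, 0, 3, 1}. Like the real loop it resets the mask at each register boundary and leaves that first lane undefined; the resulting non-identity mask is what the SK_PermuteSingleSrc cost would then be queried with. This is an editorial sketch in plain C++, not the TTI API.

#include <cstdio>
#include <vector>

int main() {
  const int UndefElem = -1;
  const unsigned EltsPerVector = 4;
  // Assumed extract indices of the scalars, in bundle order.
  std::vector<unsigned> ExtractIdx = {2, 0, 3, 1};
  std::vector<int> RegMask(EltsPerVector, UndefElem);
  bool AllConsecutive = true;
  for (unsigned I = 0; I < ExtractIdx.size(); ++I) {
    if (I % EltsPerVector == 0) {
      RegMask.assign(EltsPerVector, UndefElem); // a new register starts here
      continue;
    }
    RegMask[I % EltsPerVector] = ExtractIdx[I] % EltsPerVector;
    AllConsecutive &= ExtractIdx[I - 1] + 1 == ExtractIdx[I] &&
                      ExtractIdx[I] % EltsPerVector == I % EltsPerVector;
  }
  std::printf("consecutive=%d mask:", AllConsecutive); // consecutive=0
  for (int M : RegMask)
    std::printf(" %d", M);                             // -1 0 3 1
  std::printf("\n");
}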
@@ -4592,12 +5731,12 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
 /// Build a shuffle mask for shuffle graph entries and lists of main and
 /// alternate operation operands.
static void
-buildSuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices,
- ArrayRef<int> ReusesIndices,
- const function_ref<bool(Instruction *)> IsAltOp,
- SmallVectorImpl<int> &Mask,
- SmallVectorImpl<Value *> *OpScalars = nullptr,
- SmallVectorImpl<Value *> *AltScalars = nullptr) {
+buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices,
+ ArrayRef<int> ReusesIndices,
+ const function_ref<bool(Instruction *)> IsAltOp,
+ SmallVectorImpl<int> &Mask,
+ SmallVectorImpl<Value *> *OpScalars = nullptr,
+ SmallVectorImpl<Value *> *AltScalars = nullptr) {
unsigned Sz = VL.size();
Mask.assign(Sz, UndefMaskElem);
SmallVector<int> OrderMask;
@@ -4627,6 +5766,29 @@ buildSuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices,
}
}
+/// Checks if the specified instruction \p I is an alternate operation for the
+/// given \p MainOp and \p AltOp instructions.
+static bool isAlternateInstruction(const Instruction *I,
+ const Instruction *MainOp,
+ const Instruction *AltOp) {
+ if (auto *CI0 = dyn_cast<CmpInst>(MainOp)) {
+ auto *AltCI0 = cast<CmpInst>(AltOp);
+ auto *CI = cast<CmpInst>(I);
+ CmpInst::Predicate P0 = CI0->getPredicate();
+ CmpInst::Predicate AltP0 = AltCI0->getPredicate();
+ assert(P0 != AltP0 && "Expected different main/alternate predicates.");
+ CmpInst::Predicate AltP0Swapped = CmpInst::getSwappedPredicate(AltP0);
+ CmpInst::Predicate CurrentPred = CI->getPredicate();
+ if (P0 == AltP0Swapped)
+ return I == AltCI0 ||
+ (I != MainOp &&
+ !areCompatibleCmpOps(CI0->getOperand(0), CI0->getOperand(1),
+ CI->getOperand(0), CI->getOperand(1)));
+ return AltP0 == CurrentPred || AltP0Swapped == CurrentPred;
+ }
+ return I->getOpcode() == AltOp->getOpcode();
+}
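A standalone model of the predicate classification above, with CmpInst predicates reduced to a small enum. This is a simplified editorial sketch with assumed values: it only covers the common case where the main predicate is not the swapped form of the alternate one, and it ignores the operand-compatibility check used by the real code.

enum class Pred { SLT, SGT, SLE, SGE, EQ, NE };

static Pred swapPred(Pred P) {
  switch (P) {
  case Pred::SLT: return Pred::SGT;
  case Pred::SGT: return Pred::SLT;
  case Pred::SLE: return Pred::SGE;
  case Pred::SGE: return Pred::SLE;
  default:        return P; // EQ/NE are their own swapped forms
  }
}

// A compare lane counts as the alternate operation iff its predicate matches
// the alternate predicate or its swapped form. Example: with main = SLT and
// AltP = SGE, a lane using SLE (the swap of SGE) is still "alternate".
static bool isAlternateLane(Pred AltP, Pred LaneP) {
  return LaneP == AltP || LaneP == swapPred(AltP);
}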
+
InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
ArrayRef<Value *> VectorizedVals) {
ArrayRef<Value*> VL = E->Scalars;
@@ -4740,7 +5902,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
SmallVector<const TreeEntry *> Entries;
Optional<TargetTransformInfo::ShuffleKind> Shuffle =
isGatherShuffledEntry(E, Mask, Entries);
- if (Shuffle.hasValue()) {
+ if (Shuffle) {
InstructionCost GatherCost = 0;
if (ShuffleVectorInst::isIdentityMask(Mask)) {
// Perfect match in the graph, will reuse the previously vectorized
@@ -4776,7 +5938,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
SmallVector<int> Mask;
Optional<TargetTransformInfo::ShuffleKind> ShuffleKind =
isFixedVectorShuffle(VL, Mask);
- if (ShuffleKind.hasValue()) {
+ if (ShuffleKind) {
// Found the bunch of extractelement instructions that must be gathered
// into a vector and can be represented as a permutation elements in a
// single input vector or of 2 input vectors.
@@ -4794,7 +5956,9 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
// broadcast.
assert(VecTy == FinalVecTy &&
"No reused scalars expected for broadcast.");
- return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy);
+ return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy,
+ /*Mask=*/None, /*Index=*/0,
+ /*SubTp=*/nullptr, /*Args=*/VL[0]);
}
InstructionCost ReuseShuffleCost = 0;
if (NeedToShuffleReuses)
@@ -4818,8 +5982,9 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
!VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
SmallVector<Value *> PointerOps;
OrdersType CurrentOrder;
- LoadsState LS = canVectorizeLoads(Slice, Slice.front(), *TTI, *DL,
- *SE, CurrentOrder, PointerOps);
+ LoadsState LS =
+ canVectorizeLoads(Slice, Slice.front(), *TTI, *DL, *SE, *LI,
+ CurrentOrder, PointerOps);
switch (LS) {
case LoadsState::Vectorize:
case LoadsState::ScatterVectorize:
@@ -4909,7 +6074,11 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
assert((E->State == TreeEntry::Vectorize ||
E->State == TreeEntry::ScatterVectorize) &&
"Unhandled state");
- assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
+ assert(E->getOpcode() &&
+ ((allSameType(VL) && allSameBlock(VL)) ||
+ (E->getOpcode() == Instruction::GetElementPtr &&
+ E->getMainOp()->getType()->isPointerTy())) &&
+ "Invalid VL");
Instruction *VL0 = E->getMainOp();
unsigned ShuffleOrOp =
E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
@@ -4981,28 +6150,60 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
assert(E->ReuseShuffleIndices.empty() &&
"Unique insertelements only are expected.");
auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
-
unsigned const NumElts = SrcVecTy->getNumElements();
unsigned const NumScalars = VL.size();
+
+ unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
+
+ unsigned OffsetBeg = *getInsertIndex(VL.front());
+ unsigned OffsetEnd = OffsetBeg;
+ for (Value *V : VL.drop_front()) {
+ unsigned Idx = *getInsertIndex(V);
+ if (OffsetBeg > Idx)
+ OffsetBeg = Idx;
+ else if (OffsetEnd < Idx)
+ OffsetEnd = Idx;
+ }
+ unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
+ if (NumOfParts > 0)
+ VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
+ unsigned VecSz =
+ (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
+ VecScalarsSz;
+ unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
+ unsigned InsertVecSz = std::min<unsigned>(
+ PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
+ ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) *
+ VecScalarsSz);
+ bool IsWholeSubvector =
+ OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
+ // Check if we can safely insert a subvector. If it is not possible, just
+ // generate a whole-sized vector and shuffle the source vector and the new
+ // subvector.
+ if (OffsetBeg + InsertVecSz > VecSz) {
+ // Align OffsetBeg to generate correct mask.
+ OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
+ InsertVecSz = VecSz;
+ }
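The index arithmetic above is easiest to check with numbers. Below is an editorial, standalone sketch with assumed inputs (a 16-element destination vector, 2 target register parts, and inserts covering indices 5..9); std::bit_ceil stands in for LLVM's PowerOf2Ceil.

#include <algorithm>
#include <bit>
#include <cstdio>

int main() {
  unsigned NumElts = 16, NumOfParts = 2;   // assumed example values
  unsigned OffsetBeg = 5, OffsetEnd = 9;   // lowest/highest insert index
  unsigned VecScalarsSz =
      std::bit_ceil((NumElts + NumOfParts - 1) / NumOfParts);            // 8
  unsigned VecSz =
      (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
      VecScalarsSz;                                                      // 16
  unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);           // 0
  unsigned InsertVecSz = std::min<unsigned>(
      std::bit_ceil(OffsetEnd - OffsetBeg + 1),
      ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) *
          VecScalarsSz);                                                 // 8
  // 5 + 8 <= 16, so the subvector can be inserted without widening it.
  std::printf("VecSz=%u Offset=%u InsertVecSz=%u\n", VecSz, Offset,
              InsertVecSz);
}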
+
APInt DemandedElts = APInt::getZero(NumElts);
// TODO: Add support for Instruction::InsertValue.
SmallVector<int> Mask;
if (!E->ReorderIndices.empty()) {
inversePermutation(E->ReorderIndices, Mask);
- Mask.append(NumElts - NumScalars, UndefMaskElem);
+ Mask.append(InsertVecSz - Mask.size(), UndefMaskElem);
} else {
- Mask.assign(NumElts, UndefMaskElem);
- std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
+ Mask.assign(VecSz, UndefMaskElem);
+ std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
}
- unsigned Offset = *getInsertIndex(VL0, 0);
bool IsIdentity = true;
- SmallVector<int> PrevMask(NumElts, UndefMaskElem);
+ SmallVector<int> PrevMask(InsertVecSz, UndefMaskElem);
Mask.swap(PrevMask);
for (unsigned I = 0; I < NumScalars; ++I) {
unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]);
DemandedElts.setBit(InsertIdx);
- IsIdentity &= InsertIdx - Offset == I;
- Mask[InsertIdx - Offset] = I;
+ IsIdentity &= InsertIdx - OffsetBeg == I;
+ Mask[InsertIdx - OffsetBeg] = I;
}
assert(Offset < NumElts && "Failed to find vector index offset");
@@ -5010,32 +6211,41 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
/*Insert*/ true, /*Extract*/ false);
- if (IsIdentity && NumElts != NumScalars && Offset % NumScalars != 0) {
- // FIXME: Replace with SK_InsertSubvector once it is properly supported.
- unsigned Sz = PowerOf2Ceil(Offset + NumScalars);
- Cost += TTI->getShuffleCost(
- TargetTransformInfo::SK_PermuteSingleSrc,
- FixedVectorType::get(SrcVecTy->getElementType(), Sz));
- } else if (!IsIdentity) {
- auto *FirstInsert =
- cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
- return !is_contained(E->Scalars,
- cast<Instruction>(V)->getOperand(0));
- }));
- if (isUndefVector(FirstInsert->getOperand(0))) {
- Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, Mask);
+    // First cost - resize to the actual vector size if this is not an
+    // identity shuffle or we need to shift the vector.
+    // Do not calculate the cost if the actual size is the register size and
+    // we can merge this shuffle with the following SK_Select.
+ auto *InsertVecTy =
+ FixedVectorType::get(SrcVecTy->getElementType(), InsertVecSz);
+ if (!IsIdentity)
+ Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+ InsertVecTy, Mask);
+ auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
+ return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
+ }));
+    // Second cost - permutation with a subvector, if some elements are from
+    // the initial vector or we are inserting a subvector.
+ // TODO: Implement the analysis of the FirstInsert->getOperand(0)
+ // subvector of ActualVecTy.
+ if (!isUndefVector(FirstInsert->getOperand(0)) && NumScalars != NumElts &&
+ !IsWholeSubvector) {
+ if (InsertVecSz != VecSz) {
+ auto *ActualVecTy =
+ FixedVectorType::get(SrcVecTy->getElementType(), VecSz);
+ Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy,
+ None, OffsetBeg - Offset, InsertVecTy);
} else {
- SmallVector<int> InsertMask(NumElts);
- std::iota(InsertMask.begin(), InsertMask.end(), 0);
- for (unsigned I = 0; I < NumElts; I++) {
+ for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
+ Mask[I] = I;
+ for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
+ I <= End; ++I)
if (Mask[I] != UndefMaskElem)
- InsertMask[Offset + I] = NumElts + I;
- }
- Cost +=
- TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVecTy, InsertMask);
+ Mask[I] = I + VecSz;
+ for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
+ Mask[I] = I;
+ Cost += TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
}
}
-
return Cost;
}
case Instruction::ZExt:
@@ -5116,9 +6326,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
// If the selects are the only uses of the compares, they will be dead
// and we can adjust the cost by removing their cost.
if (IntrinsicAndUse.second)
- IntrinsicCost -=
- TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy, MaskTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
+ IntrinsicCost -= TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy,
+ MaskTy, VecPred, CostKind);
VecCost = std::min(VecCost, IntrinsicCost);
}
LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
@@ -5198,7 +6407,14 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
TargetTransformInfo::OperandValueKind Op1VK =
TargetTransformInfo::OK_AnyValue;
TargetTransformInfo::OperandValueKind Op2VK =
- TargetTransformInfo::OK_UniformConstantValue;
+ any_of(VL,
+ [](Value *V) {
+ return isa<GetElementPtrInst>(V) &&
+ !isConstant(
+ cast<GetElementPtrInst>(V)->getOperand(1));
+ })
+ ? TargetTransformInfo::OK_AnyValue
+ : TargetTransformInfo::OK_UniformConstantValue;
InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost(
Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK);
@@ -5229,7 +6445,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
Align CommonAlignment = Alignment;
for (Value *V : VL)
CommonAlignment =
- commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign());
+ std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
VecLdCost = TTI->getGatherScatterOpCost(
Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
/*VariableMask=*/false, CommonAlignment, CostKind, VL0);
@@ -5279,7 +6495,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
((Instruction::isBinaryOp(E->getOpcode()) &&
Instruction::isBinaryOp(E->getAltOpcode())) ||
(Instruction::isCast(E->getOpcode()) &&
- Instruction::isCast(E->getAltOpcode()))) &&
+ Instruction::isCast(E->getAltOpcode())) ||
+ (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
"Invalid Shuffle Vector Operand");
InstructionCost ScalarCost = 0;
if (NeedToShuffleReuses) {
@@ -5327,6 +6544,14 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy,
CostKind);
+ } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
+ VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
+ Builder.getInt1Ty(),
+ CI0->getPredicate(), CostKind, VL0);
+ VecCost += TTI->getCmpSelInstrCost(
+ E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
+ cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
+ E->getAltOp());
} else {
Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType();
Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType();
@@ -5338,16 +6563,21 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
TTI::CastContextHint::None, CostKind);
}
- SmallVector<int> Mask;
- buildSuffleEntryMask(
- E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
- [E](Instruction *I) {
- assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
- return I->getOpcode() == E->getAltOpcode();
- },
- Mask);
- CommonCost =
- TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy, Mask);
+ if (E->ReuseShuffleIndices.empty()) {
+ CommonCost =
+ TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy);
+ } else {
+ SmallVector<int> Mask;
+ buildShuffleEntryMask(
+ E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
+ [E](Instruction *I) {
+ assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
+ return I->getOpcode() == E->getAltOpcode();
+ },
+ Mask);
+ CommonCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
+ FinalVecTy, Mask);
+ }
LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
return CommonCost + VecCost - ScalarCost;
}
@@ -5475,7 +6705,10 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
// No need to vectorize inserts of gathered values.
if (VectorizableTree.size() == 2 &&
isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
- VectorizableTree[1]->State == TreeEntry::NeedToGather)
+ VectorizableTree[1]->State == TreeEntry::NeedToGather &&
+ (VectorizableTree[1]->getVectorFactor() <= 2 ||
+ !(isSplat(VectorizableTree[1]->Scalars) ||
+ allConstant(VectorizableTree[1]->Scalars))))
return true;
// We can vectorize the tree if its size is greater than or equal to the
@@ -5605,20 +6838,26 @@ static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU,
return false;
auto *IE1 = VU;
auto *IE2 = V;
+ unsigned Idx1 = *getInsertIndex(IE1);
+ unsigned Idx2 = *getInsertIndex(IE2);
// Go through the vector operand of insertelement instructions trying to find
// either VU as the original vector for IE2 or V as the original vector for
// IE1.
do {
- if (IE2 == VU || IE1 == V)
- return true;
+ if (IE2 == VU)
+ return VU->hasOneUse();
+ if (IE1 == V)
+ return V->hasOneUse();
if (IE1) {
- if (IE1 != VU && !IE1->hasOneUse())
+ if ((IE1 != VU && !IE1->hasOneUse()) ||
+ getInsertIndex(IE1).value_or(Idx2) == Idx2)
IE1 = nullptr;
else
IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0));
}
if (IE2) {
- if (IE2 != V && !IE2->hasOneUse())
+ if ((IE2 != V && !IE2->hasOneUse()) ||
+ getInsertIndex(IE2).value_or(Idx1) == Idx1)
IE2 = nullptr;
else
IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0));
@@ -5627,6 +6866,153 @@ static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU,
return false;
}
+/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
+/// the buildvector sequence.
+static bool isFirstInsertElement(const InsertElementInst *IE1,
+ const InsertElementInst *IE2) {
+ if (IE1 == IE2)
+ return false;
+ const auto *I1 = IE1;
+ const auto *I2 = IE2;
+ const InsertElementInst *PrevI1;
+ const InsertElementInst *PrevI2;
+ unsigned Idx1 = *getInsertIndex(IE1);
+ unsigned Idx2 = *getInsertIndex(IE2);
+ do {
+ if (I2 == IE1)
+ return true;
+ if (I1 == IE2)
+ return false;
+ PrevI1 = I1;
+ PrevI2 = I2;
+ if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
+ getInsertIndex(I1).value_or(Idx2) != Idx2)
+ I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
+ if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
+ getInsertIndex(I2).value_or(Idx1) != Idx1)
+ I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
+ } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
+ llvm_unreachable("Two different buildvectors not expected.");
+}
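A minimal standalone model of the chain walk above, with assumed shapes: each insertelement links to the vector it builds upon, and the chain of the later insert eventually reaches the earlier one. The real code additionally checks hasOneUse and the insert indices before following a link, and asserts instead of returning when the chains never meet.

struct InsertNode {
  const InsertNode *Src = nullptr; // the vector this insert builds upon
};

// True if IE1 appears earlier in the buildvector chain than IE2, i.e. walking
// IE2 back through its source vectors eventually reaches IE1.
bool isFirstInsertModel(const InsertNode *IE1, const InsertNode *IE2) {
  if (IE1 == IE2)
    return false;
  const InsertNode *I1 = IE1, *I2 = IE2;
  while (I1 || I2) {
    if (I2 == IE1)
      return true;  // IE2's chain reached IE1 first.
    if (I1 == IE2)
      return false; // IE1's chain reached IE2 first.
    if (I1)
      I1 = I1->Src;
    if (I2)
      I2 = I2->Src;
  }
  return false; // Two unrelated buildvectors.
}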
+
+namespace {
+/// Returns the incoming Value * if the requested type is Value * too, or a
+/// default value otherwise.
+struct ValueSelect {
+ template <typename U>
+ static typename std::enable_if<std::is_same<Value *, U>::value, Value *>::type
+ get(Value *V) {
+ return V;
+ }
+ template <typename U>
+ static typename std::enable_if<!std::is_same<Value *, U>::value, U>::type
+ get(Value *) {
+ return U();
+ }
+};
+} // namespace
+
+/// Does the analysis of the provided shuffle masks and performs the requested
+/// actions on the vectors with the given shuffle masks. It tries to do it in
+/// several steps.
+/// 1. If the Base vector is not an undef vector, resize the very first mask to
+/// have a common VF and perform the action for 2 input vectors (including the
+/// non-undef Base). Other shuffle masks are combined with the result of the
+/// first stage and processed as a shuffle of 2 elements.
+/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
+/// the action only for 1 vector with the given mask, if it is not the identity
+/// mask.
+/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
+/// vectors, combining the masks properly between the steps.
+template <typename T>
+static T *performExtractsShuffleAction(
+ MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
+ function_ref<unsigned(T *)> GetVF,
+ function_ref<std::pair<T *, bool>(T *, ArrayRef<int>)> ResizeAction,
+ function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
+ assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
+ SmallVector<int> Mask(ShuffleMask.begin()->second);
+ auto VMIt = std::next(ShuffleMask.begin());
+ T *Prev = nullptr;
+ bool IsBaseNotUndef = !isUndefVector(Base);
+ if (IsBaseNotUndef) {
+    // Base is not undef; we need to combine it with the next subvectors.
+ std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask);
+ for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
+ if (Mask[Idx] == UndefMaskElem)
+ Mask[Idx] = Idx;
+ else
+ Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
+ }
+ auto *V = ValueSelect::get<T *>(Base);
+ (void)V;
+ assert((!V || GetVF(V) == Mask.size()) &&
+ "Expected base vector of VF number of elements.");
+ Prev = Action(Mask, {nullptr, Res.first});
+ } else if (ShuffleMask.size() == 1) {
+    // Base is undef and only 1 vector is shuffled - perform the action only
+    // for a single vector, if the mask is not the identity mask.
+ std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask);
+ if (Res.second)
+ // Identity mask is found.
+ Prev = Res.first;
+ else
+ Prev = Action(Mask, {ShuffleMask.begin()->first});
+ } else {
+    // Base is undef and at least 2 input vectors are shuffled - perform
+    // 2-vector shuffles step by step, combining the shuffles between the
+    // steps.
+ unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
+ unsigned Vec2VF = GetVF(VMIt->first);
+ if (Vec1VF == Vec2VF) {
+      // No need to resize the input vectors since they are of the same size;
+      // we can shuffle them directly.
+ ArrayRef<int> SecMask = VMIt->second;
+ for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
+ if (SecMask[I] != UndefMaskElem) {
+ assert(Mask[I] == UndefMaskElem && "Multiple uses of scalars.");
+ Mask[I] = SecMask[I] + Vec1VF;
+ }
+ }
+ Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
+ } else {
+ // Vectors of different sizes - resize and reshuffle.
+ std::pair<T *, bool> Res1 =
+ ResizeAction(ShuffleMask.begin()->first, Mask);
+ std::pair<T *, bool> Res2 = ResizeAction(VMIt->first, VMIt->second);
+ ArrayRef<int> SecMask = VMIt->second;
+ for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
+ if (Mask[I] != UndefMaskElem) {
+ assert(SecMask[I] == UndefMaskElem && "Multiple uses of scalars.");
+ if (Res1.second)
+ Mask[I] = I;
+ } else if (SecMask[I] != UndefMaskElem) {
+ assert(Mask[I] == UndefMaskElem && "Multiple uses of scalars.");
+ Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
+ }
+ }
+ Prev = Action(Mask, {Res1.first, Res2.first});
+ }
+ VMIt = std::next(VMIt);
+ }
+ // Perform requested actions for the remaining masks/vectors.
+ for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
+ // Shuffle other input vectors, if any.
+ std::pair<T *, bool> Res = ResizeAction(VMIt->first, VMIt->second);
+ ArrayRef<int> SecMask = VMIt->second;
+ for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
+ if (SecMask[I] != UndefMaskElem) {
+ assert((Mask[I] == UndefMaskElem || IsBaseNotUndef) &&
+ "Multiple uses of scalars.");
+ Mask[I] = (Res.second ? I : SecMask[I]) + VF;
+ } else if (Mask[I] != UndefMaskElem) {
+ Mask[I] = I;
+ }
+ }
+ Prev = Action(Mask, {Prev, Res.first});
+ }
+ return Prev;
+}
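A worked example of the two-vector combination step (the Vec1VF == Vec2VF branch), with assumed masks over VF = 4: the first gather provides lanes 0 and 1, the second provides lanes 2 and 3, and indices into the second source are rebased by adding VF. This is an editorial sketch, not part of the patch.

#include <array>
#include <cstdio>

int main() {
  constexpr int Undef = -1, VF = 4;
  std::array<int, VF> Mask    = {0, 1, Undef, Undef}; // elements from Vec1
  std::array<int, VF> SecMask = {Undef, Undef, 2, 3}; // elements from Vec2
  for (int I = 0; I < VF; ++I)
    if (SecMask[I] != Undef)
      Mask[I] = SecMask[I] + VF; // second source lanes get offset by VF
  for (int M : Mask)
    std::printf("%d ", M);       // 0 1 6 7
  std::printf("\n");
}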
+
InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
InstructionCost Cost = 0;
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
@@ -5635,7 +7021,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
- TreeEntry &TE = *VectorizableTree[I].get();
+ TreeEntry &TE = *VectorizableTree[I];
InstructionCost C = getEntryCost(&TE, VectorizedVals);
Cost += C;
@@ -5647,9 +7033,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
SmallPtrSet<Value *, 16> ExtractCostCalculated;
InstructionCost ExtractCost = 0;
- SmallVector<unsigned> VF;
- SmallVector<SmallVector<int>> ShuffleMask;
- SmallVector<Value *> FirstUsers;
+ SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;
+ SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
SmallVector<APInt> DemandedElts;
for (ExternalUser &EU : ExternalUses) {
// We only add extract cost once for the same scalar.
@@ -5678,37 +7063,55 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
Optional<unsigned> InsertIdx = getInsertIndex(VU);
if (InsertIdx) {
- auto *It = find_if(FirstUsers, [VU](Value *V) {
- return areTwoInsertFromSameBuildVector(VU,
- cast<InsertElementInst>(V));
- });
+ const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
+ auto *It =
+ find_if(FirstUsers,
+ [VU](const std::pair<Value *, const TreeEntry *> &Pair) {
+ return areTwoInsertFromSameBuildVector(
+ VU, cast<InsertElementInst>(Pair.first));
+ });
int VecId = -1;
if (It == FirstUsers.end()) {
- VF.push_back(FTy->getNumElements());
- ShuffleMask.emplace_back(VF.back(), UndefMaskElem);
+ (void)ShuffleMasks.emplace_back();
+ SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE];
+ if (Mask.empty())
+ Mask.assign(FTy->getNumElements(), UndefMaskElem);
// Find the insertvector, vectorized in tree, if any.
Value *Base = VU;
- while (isa<InsertElementInst>(Base)) {
+ while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
+ if (IEBase != EU.User &&
+ (!IEBase->hasOneUse() ||
+ getInsertIndex(IEBase).value_or(*InsertIdx) == *InsertIdx))
+ break;
// Build the mask for the vectorized insertelement instructions.
- if (const TreeEntry *E = getTreeEntry(Base)) {
- VU = cast<InsertElementInst>(Base);
+ if (const TreeEntry *E = getTreeEntry(IEBase)) {
+ VU = IEBase;
do {
- int Idx = E->findLaneForValue(Base);
- ShuffleMask.back()[Idx] = Idx;
- Base = cast<InsertElementInst>(Base)->getOperand(0);
+ IEBase = cast<InsertElementInst>(Base);
+ int Idx = *getInsertIndex(IEBase);
+ assert(Mask[Idx] == UndefMaskElem &&
+ "InsertElementInstruction used already.");
+ Mask[Idx] = Idx;
+ Base = IEBase->getOperand(0);
} while (E == getTreeEntry(Base));
break;
}
Base = cast<InsertElementInst>(Base)->getOperand(0);
}
- FirstUsers.push_back(VU);
- DemandedElts.push_back(APInt::getZero(VF.back()));
+ FirstUsers.emplace_back(VU, ScalarTE);
+ DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
VecId = FirstUsers.size() - 1;
} else {
+ if (isFirstInsertElement(VU, cast<InsertElementInst>(It->first)))
+ It->first = VU;
VecId = std::distance(FirstUsers.begin(), It);
}
- ShuffleMask[VecId][*InsertIdx] = EU.Lane;
- DemandedElts[VecId].setBit(*InsertIdx);
+ int InIdx = *InsertIdx;
+ SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE];
+ if (Mask.empty())
+ Mask.assign(FTy->getNumElements(), UndefMaskElem);
+ Mask[InIdx] = EU.Lane;
+ DemandedElts[VecId].setBit(InIdx);
continue;
}
}
@@ -5734,86 +7137,75 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
InstructionCost SpillCost = getSpillCost();
Cost += SpillCost + ExtractCost;
- if (FirstUsers.size() == 1) {
- int Limit = ShuffleMask.front().size() * 2;
- if (all_of(ShuffleMask.front(), [Limit](int Idx) { return Idx < Limit; }) &&
- !ShuffleVectorInst::isIdentityMask(ShuffleMask.front())) {
- InstructionCost C = TTI->getShuffleCost(
+ auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask) {
+ InstructionCost C = 0;
+ unsigned VF = Mask.size();
+ unsigned VecVF = TE->getVectorFactor();
+ if (VF != VecVF &&
+ (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
+ (all_of(Mask,
+ [VF](int Idx) { return Idx < 2 * static_cast<int>(VF); }) &&
+ !ShuffleVectorInst::isIdentityMask(Mask)))) {
+ SmallVector<int> OrigMask(VecVF, UndefMaskElem);
+ std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
+ OrigMask.begin());
+ C = TTI->getShuffleCost(
TTI::SK_PermuteSingleSrc,
- cast<FixedVectorType>(FirstUsers.front()->getType()),
- ShuffleMask.front());
- LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
- << " for final shuffle of insertelement external users "
- << *VectorizableTree.front()->Scalars.front() << ".\n"
- << "SLP: Current total cost = " << Cost << "\n");
+ FixedVectorType::get(TE->getMainOp()->getType(), VecVF), OrigMask);
+ LLVM_DEBUG(
+ dbgs() << "SLP: Adding cost " << C
+ << " for final shuffle of insertelement external users.\n";
+ TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
Cost += C;
+ return std::make_pair(TE, true);
}
+ return std::make_pair(TE, false);
+ };
+ // Calculate the cost of the reshuffled vectors, if any.
+ for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
+ Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0);
+ unsigned VF = ShuffleMasks[I].begin()->second.size();
+ auto *FTy = FixedVectorType::get(
+ cast<VectorType>(FirstUsers[I].first->getType())->getElementType(), VF);
+ auto Vector = ShuffleMasks[I].takeVector();
+ auto &&EstimateShufflesCost = [this, FTy,
+ &Cost](ArrayRef<int> Mask,
+ ArrayRef<const TreeEntry *> TEs) {
+ assert((TEs.size() == 1 || TEs.size() == 2) &&
+ "Expected exactly 1 or 2 tree entries.");
+ if (TEs.size() == 1) {
+ int Limit = 2 * Mask.size();
+ if (!all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) ||
+ !ShuffleVectorInst::isIdentityMask(Mask)) {
+ InstructionCost C =
+ TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FTy, Mask);
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
+ << " for final shuffle of insertelement "
+ "external users.\n";
+ TEs.front()->dump();
+ dbgs() << "SLP: Current total cost = " << Cost << "\n");
+ Cost += C;
+ }
+ } else {
+ InstructionCost C =
+ TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, FTy, Mask);
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
+ << " for final shuffle of vector node and external "
+ "insertelement users.\n";
+ if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
+ dbgs() << "SLP: Current total cost = " << Cost << "\n");
+ Cost += C;
+ }
+ return TEs.back();
+ };
+ (void)performExtractsShuffleAction<const TreeEntry>(
+ makeMutableArrayRef(Vector.data(), Vector.size()), Base,
+ [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
+ EstimateShufflesCost);
InstructionCost InsertCost = TTI->getScalarizationOverhead(
- cast<FixedVectorType>(FirstUsers.front()->getType()),
- DemandedElts.front(), /*Insert*/ true, /*Extract*/ false);
- LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost
- << " for insertelements gather.\n"
- << "SLP: Current total cost = " << Cost << "\n");
- Cost -= InsertCost;
- } else if (FirstUsers.size() >= 2) {
- unsigned MaxVF = *std::max_element(VF.begin(), VF.end());
- // Combined masks of the first 2 vectors.
- SmallVector<int> CombinedMask(MaxVF, UndefMaskElem);
- copy(ShuffleMask.front(), CombinedMask.begin());
- APInt CombinedDemandedElts = DemandedElts.front().zextOrSelf(MaxVF);
- auto *VecTy = FixedVectorType::get(
- cast<VectorType>(FirstUsers.front()->getType())->getElementType(),
- MaxVF);
- for (int I = 0, E = ShuffleMask[1].size(); I < E; ++I) {
- if (ShuffleMask[1][I] != UndefMaskElem) {
- CombinedMask[I] = ShuffleMask[1][I] + MaxVF;
- CombinedDemandedElts.setBit(I);
- }
- }
- InstructionCost C =
- TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask);
- LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
- << " for final shuffle of vector node and external "
- "insertelement users "
- << *VectorizableTree.front()->Scalars.front() << ".\n"
- << "SLP: Current total cost = " << Cost << "\n");
- Cost += C;
- InstructionCost InsertCost = TTI->getScalarizationOverhead(
- VecTy, CombinedDemandedElts, /*Insert*/ true, /*Extract*/ false);
- LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost
- << " for insertelements gather.\n"
- << "SLP: Current total cost = " << Cost << "\n");
+ cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I],
+ /*Insert*/ true, /*Extract*/ false);
Cost -= InsertCost;
- for (int I = 2, E = FirstUsers.size(); I < E; ++I) {
- // Other elements - permutation of 2 vectors (the initial one and the
- // next Ith incoming vector).
- unsigned VF = ShuffleMask[I].size();
- for (unsigned Idx = 0; Idx < VF; ++Idx) {
- int Mask = ShuffleMask[I][Idx];
- if (Mask != UndefMaskElem)
- CombinedMask[Idx] = MaxVF + Mask;
- else if (CombinedMask[Idx] != UndefMaskElem)
- CombinedMask[Idx] = Idx;
- }
- for (unsigned Idx = VF; Idx < MaxVF; ++Idx)
- if (CombinedMask[Idx] != UndefMaskElem)
- CombinedMask[Idx] = Idx;
- InstructionCost C =
- TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask);
- LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
- << " for final shuffle of vector node and external "
- "insertelement users "
- << *VectorizableTree.front()->Scalars.front() << ".\n"
- << "SLP: Current total cost = " << Cost << "\n");
- Cost += C;
- InstructionCost InsertCost = TTI->getScalarizationOverhead(
- cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I],
- /*Insert*/ true, /*Extract*/ false);
- LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost
- << " for insertelements gather.\n"
- << "SLP: Current total cost = " << Cost << "\n");
- Cost -= InsertCost;
- }
}
#ifndef NDEBUG
@@ -5906,6 +7298,12 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
}
}
+ if (UsedTEs.empty()) {
+ assert(all_of(TE->Scalars, UndefValue::classof) &&
+ "Expected vector of undefs only.");
+ return None;
+ }
+
unsigned VF = 0;
if (UsedTEs.size() == 1) {
// Try to find the perfect match in another gather node at first.
@@ -5965,17 +7363,11 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
return None;
}
-InstructionCost
-BoUpSLP::getGatherCost(FixedVectorType *Ty,
- const DenseSet<unsigned> &ShuffledIndices,
- bool NeedToShuffle) const {
- unsigned NumElts = Ty->getNumElements();
- APInt DemandedElts = APInt::getZero(NumElts);
- for (unsigned I = 0; I < NumElts; ++I)
- if (!ShuffledIndices.count(I))
- DemandedElts.setBit(I);
+InstructionCost BoUpSLP::getGatherCost(FixedVectorType *Ty,
+ const APInt &ShuffledIndices,
+ bool NeedToShuffle) const {
InstructionCost Cost =
- TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true,
+ TTI->getScalarizationOverhead(Ty, ~ShuffledIndices, /*Insert*/ true,
/*Extract*/ false);
if (NeedToShuffle)
Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
@@ -5992,19 +7384,19 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
// Find the cost of inserting/extracting values from the vector.
// Check if the same elements are inserted several times and count them as
// shuffle candidates.
- DenseSet<unsigned> ShuffledElements;
+ APInt ShuffledElements = APInt::getZero(VL.size());
DenseSet<Value *> UniqueElements;
// Iterate in reverse order to consider insert elements with the high cost.
for (unsigned I = VL.size(); I > 0; --I) {
unsigned Idx = I - 1;
// No need to shuffle duplicates for constants.
if (isConstant(VL[Idx])) {
- ShuffledElements.insert(Idx);
+ ShuffledElements.setBit(Idx);
continue;
}
if (!UniqueElements.insert(VL[Idx]).second) {
DuplicateNonConst = true;
- ShuffledElements.insert(Idx);
+ ShuffledElements.setBit(Idx);
}
}
return getGatherCost(VecTy, ShuffledElements, DuplicateNonConst);
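The switch from a DenseSet of indices to an APInt lets the demanded-element set be formed with a single bit complement (the ~ShuffledIndices above). A tiny standalone sketch of the same trick, using std::bitset in place of APInt and an assumed 8-lane bundle where lanes 1 and 5 are duplicates or constants:

#include <bitset>
#include <cstdio>

int main() {
  std::bitset<8> Shuffled; // lanes that a shuffle will cover
  Shuffled.set(1);
  Shuffled.set(5);
  std::bitset<8> Demanded = ~Shuffled; // lanes that still need an insert
  std::printf("%s\n", Demanded.to_string().c_str()); // 11011101
}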
@@ -6029,14 +7421,83 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
// Get the basic block this bundle is in. All instructions in the bundle
- // should be in this block.
+ // should be in this block (except for extractelement-like instructions with
+  // constant indices).
auto *Front = E->getMainOp();
auto *BB = Front->getParent();
assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool {
+ if (E->getOpcode() == Instruction::GetElementPtr &&
+ !isa<GetElementPtrInst>(V))
+ return true;
auto *I = cast<Instruction>(V);
- return !E->isOpcodeOrAlt(I) || I->getParent() == BB;
+ return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
+ isVectorLikeInstWithConstOps(I);
}));
+ auto &&FindLastInst = [E, Front, this, &BB]() {
+ Instruction *LastInst = Front;
+ for (Value *V : E->Scalars) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ continue;
+ if (LastInst->getParent() == I->getParent()) {
+ if (LastInst->comesBefore(I))
+ LastInst = I;
+ continue;
+ }
+ assert(isVectorLikeInstWithConstOps(LastInst) &&
+ isVectorLikeInstWithConstOps(I) &&
+ "Expected vector-like insts only.");
+ if (!DT->isReachableFromEntry(LastInst->getParent())) {
+ LastInst = I;
+ continue;
+ }
+ if (!DT->isReachableFromEntry(I->getParent()))
+ continue;
+ auto *NodeA = DT->getNode(LastInst->getParent());
+ auto *NodeB = DT->getNode(I->getParent());
+ assert(NodeA && "Should only process reachable instructions");
+ assert(NodeB && "Should only process reachable instructions");
+ assert((NodeA == NodeB) ==
+ (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
+ "Different nodes should have different DFS numbers");
+ if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
+ LastInst = I;
+ }
+ BB = LastInst->getParent();
+ return LastInst;
+ };
+
+ auto &&FindFirstInst = [E, Front]() {
+ Instruction *FirstInst = Front;
+ for (Value *V : E->Scalars) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ continue;
+ if (I->comesBefore(FirstInst))
+ FirstInst = I;
+ }
+ return FirstInst;
+ };
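The ordering used by FindLastInst can be summarized as a two-level comparison. The following is a simplified, assumed model: the real code uses Instruction::comesBefore within a block and dominator-tree DFS-in numbers across blocks, and only compares across blocks for vector-like instructions with constant operands.

struct InstPos {
  unsigned BlockDfsIn;   // DFS-in number of the containing block
  unsigned IndexInBlock; // position of the instruction inside its block
};

// True if A should be treated as the later ("last") instruction.
bool comesLater(const InstPos &A, const InstPos &B) {
  if (A.BlockDfsIn != B.BlockDfsIn)
    return A.BlockDfsIn > B.BlockDfsIn;
  return A.IndexInBlock > B.IndexInBlock;
}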
+
+ // Set the insert point to the beginning of the basic block if the entry
+ // should not be scheduled.
+ if (E->State != TreeEntry::NeedToGather &&
+ doesNotNeedToSchedule(E->Scalars)) {
+ Instruction *InsertInst;
+ if (all_of(E->Scalars, isUsedOutsideBlock))
+ InsertInst = FindLastInst();
+ else
+ InsertInst = FindFirstInst();
+ // If the instruction is PHI, set the insert point after all the PHIs.
+ if (isa<PHINode>(InsertInst))
+ InsertInst = BB->getFirstNonPHI();
+ BasicBlock::iterator InsertPt = InsertInst->getIterator();
+ Builder.SetInsertPoint(BB, InsertPt);
+ Builder.SetCurrentDebugLocation(Front->getDebugLoc());
+ return;
+ }
+
// The last instruction in the bundle in program order.
Instruction *LastInst = nullptr;
@@ -6045,8 +7506,10 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
// VL.back() and iterate over schedule data until we reach the end of the
// bundle. The end of the bundle is marked by null ScheduleData.
if (BlocksSchedules.count(BB)) {
- auto *Bundle =
- BlocksSchedules[BB]->getScheduleData(E->isOneOf(E->Scalars.back()));
+ Value *V = E->isOneOf(E->Scalars.back());
+ if (doesNotNeedToBeScheduled(V))
+ V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
+ auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
if (Bundle && Bundle->isPartOfBundle())
for (; Bundle; Bundle = Bundle->NextInBundle)
if (Bundle->OpValue == Bundle->Inst)
@@ -6072,19 +7535,16 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
// we both exit early from buildTree_rec and that the bundle be out-of-order
// (causing us to iterate all the way to the end of the block).
if (!LastInst) {
- SmallPtrSet<Value *, 16> Bundle(E->Scalars.begin(), E->Scalars.end());
- for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
- if (Bundle.erase(&I) && E->isOpcodeOrAlt(&I))
- LastInst = &I;
- if (Bundle.empty())
- break;
- }
+ LastInst = FindLastInst();
+ // If the instruction is PHI, set the insert point after all the PHIs.
+ if (isa<PHINode>(LastInst))
+ LastInst = BB->getFirstNonPHI()->getPrevNode();
}
assert(LastInst && "Failed to find last instruction in bundle");
// Set the insertion point after the last instruction in the bundle. Set the
// debug location to Front.
- Builder.SetInsertPoint(BB, ++LastInst->getIterator());
+ Builder.SetInsertPoint(BB, std::next(LastInst->getIterator()));
Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
@@ -6214,8 +7674,15 @@ public:
} // namespace
Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
- unsigned VF = VL.size();
+ const unsigned VF = VL.size();
InstructionsState S = getSameOpcode(VL);
+  // Special processing for a GEP bundle, which may include non-GEP values.
+ if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
+ const auto *It =
+ find_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); });
+ if (It != VL.end())
+ S = getSameOpcode(*It);
+ }
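A small standalone model of the special case above: in a pointer bundle with no common opcode, the state is recovered from the first GEP lane, if there is one. The types and names below are assumed placeholders for illustration.

#include <algorithm>
#include <vector>

enum class LaneKind { Gep, OtherPointer };

// Returns a pointer to the lane whose opcode defines the bundle, or nullptr
// if the bundle contains no GEP at all.
const LaneKind *findOpcodeLane(const std::vector<LaneKind> &Bundle) {
  auto It = std::find_if(Bundle.begin(), Bundle.end(),
                         [](LaneKind K) { return K == LaneKind::Gep; });
  return It == Bundle.end() ? nullptr : &*It;
}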
if (S.getOpcode()) {
if (TreeEntry *E = getTreeEntry(S.OpValue))
if (E->isSame(VL)) {
@@ -6270,7 +7737,18 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
}
}
- // Check that every instruction appears once in this bundle.
+ // Can't vectorize this, so simply build a new vector with each lane
+ // corresponding to the requested value.
+ return createBuildVector(VL);
+}
+Value *BoUpSLP::createBuildVector(ArrayRef<Value *> VL) {
+ assert(any_of(VectorizableTree,
+ [VL](const std::unique_ptr<TreeEntry> &TE) {
+ return TE->State == TreeEntry::NeedToGather && TE->isSame(VL);
+ }) &&
+ "Non-matching gather node.");
+ unsigned VF = VL.size();
+ // Exploit possible reuse of values across lanes.
SmallVector<int> ReuseShuffleIndicies;
SmallVector<Value *> UniqueValues;
if (VL.size() > 2) {
@@ -6303,6 +7781,10 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
ReuseShuffleIndicies.append(VF - ReuseShuffleIndicies.size(),
UndefMaskElem);
} else if (UniqueValues.size() >= VF - 1 || UniqueValues.size() <= 1) {
+ if (UniqueValues.empty()) {
+ assert(all_of(VL, UndefValue::classof) && "Expected list of undefs.");
+ NumValues = VF;
+ }
ReuseShuffleIndicies.clear();
UniqueValues.clear();
UniqueValues.append(VL.begin(), std::next(VL.begin(), NumValues));
@@ -6342,7 +7824,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
SmallVector<const TreeEntry *> Entries;
Optional<TargetTransformInfo::ShuffleKind> Shuffle =
isGatherShuffledEntry(E, Mask, Entries);
- if (Shuffle.hasValue()) {
+ if (Shuffle) {
assert((Entries.size() == 1 || Entries.size() == 2) &&
"Expected shuffle of 1 or 2 entries.");
Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue,
@@ -6376,14 +7858,20 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
switch (ShuffleOrOp) {
case Instruction::PHI: {
- assert(
- (E->ReorderIndices.empty() || E != VectorizableTree.front().get()) &&
- "PHI reordering is free.");
+ assert((E->ReorderIndices.empty() ||
+ E != VectorizableTree.front().get() ||
+ !E->UserTreeIndices.empty()) &&
+ "PHI reordering is free.");
auto *PH = cast<PHINode>(VL0);
Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
Value *V = NewPhi;
+
+      // Adjust the insertion point once all PHIs have been generated.
+ Builder.SetInsertPoint(&*PH->getParent()->getFirstInsertionPt());
+ Builder.SetCurrentDebugLocation(PH->getDebugLoc());
+
ShuffleBuilder.addInversedMask(E->ReorderIndices);
ShuffleBuilder.addMask(E->ReuseShuffleIndices);
V = ShuffleBuilder.finalize(V);
@@ -6449,7 +7937,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
const unsigned NumScalars = E->Scalars.size();
- unsigned Offset = *getInsertIndex(VL0, 0);
+ unsigned Offset = *getInsertIndex(VL0);
assert(Offset < NumElts && "Failed to find vector index offset");
// Create shuffle to resize vector
@@ -6656,19 +8144,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
unsigned AS = LI->getPointerAddressSpace();
Value *PO = LI->getPointerOperand();
if (E->State == TreeEntry::Vectorize) {
-
Value *VecPtr = Builder.CreateBitCast(PO, VecTy->getPointerTo(AS));
+ NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign());
// The pointer operand uses an in-tree scalar so we add the new BitCast
- // to ExternalUses list to make sure that an extract will be generated
- // in the future.
+ // or LoadInst to ExternalUses list to make sure that an extract will
+ // be generated in the future.
if (TreeEntry *Entry = getTreeEntry(PO)) {
// Find which lane we need to extract.
unsigned FoundLane = Entry->findLaneForValue(PO);
- ExternalUses.emplace_back(PO, cast<User>(VecPtr), FoundLane);
+ ExternalUses.emplace_back(
+ PO, PO != VecPtr ? cast<User>(VecPtr) : NewLI, FoundLane);
}
-
- NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign());
} else {
assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
Value *VecPtr = vectorizeTree(E->getOperand(0));
@@ -6676,7 +8163,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Align CommonAlignment = LI->getAlign();
for (Value *V : E->Scalars)
CommonAlignment =
- commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign());
+ std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
}
Value *V = propagateMetadata(NewLI, E->Scalars);
@@ -6701,17 +8188,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *ScalarPtr = SI->getPointerOperand();
Value *VecPtr = Builder.CreateBitCast(
ScalarPtr, VecValue->getType()->getPointerTo(AS));
- StoreInst *ST = Builder.CreateAlignedStore(VecValue, VecPtr,
- SI->getAlign());
+ StoreInst *ST =
+ Builder.CreateAlignedStore(VecValue, VecPtr, SI->getAlign());
- // The pointer operand uses an in-tree scalar, so add the new BitCast to
- // ExternalUses to make sure that an extract will be generated in the
- // future.
+ // The pointer operand uses an in-tree scalar, so add the new BitCast or
+ // StoreInst to ExternalUses to make sure that an extract will be
+ // generated in the future.
if (TreeEntry *Entry = getTreeEntry(ScalarPtr)) {
// Find which lane we need to extract.
unsigned FoundLane = Entry->findLaneForValue(ScalarPtr);
- ExternalUses.push_back(
- ExternalUser(ScalarPtr, cast<User>(VecPtr), FoundLane));
+ ExternalUses.push_back(ExternalUser(
+ ScalarPtr, ScalarPtr != VecPtr ? cast<User>(VecPtr) : ST,
+ FoundLane));
}
Value *V = propagateMetadata(ST, E->Scalars);
@@ -6733,8 +8221,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
- if (Instruction *I = dyn_cast<Instruction>(V))
- V = propagateMetadata(I, E->Scalars);
+ if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
+ SmallVector<Value *> GEPs;
+ for (Value *V : E->Scalars) {
+ if (isa<GetElementPtrInst>(V))
+ GEPs.push_back(V);
+ }
+ V = propagateMetadata(I, GEPs);
+ }
ShuffleBuilder.addInversedMask(E->ReorderIndices);
ShuffleBuilder.addMask(E->ReuseShuffleIndices);
@@ -6767,11 +8261,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
ValueList OpVL;
// Some intrinsics have scalar arguments. This argument should not be
// vectorized.
- if (UseIntrinsic && hasVectorInstrinsicScalarOpd(IID, j)) {
+ if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(IID, j)) {
CallInst *CEI = cast<CallInst>(VL0);
ScalarArg = CEI->getArgOperand(j);
OpVecs.push_back(CEI->getArgOperand(j));
- if (hasVectorInstrinsicOverloadedScalarOpd(IID, j))
+ if (isVectorIntrinsicWithOverloadTypeAtArg(IID, j))
TysForDecl.push_back(ScalarArg->getType());
continue;
}
@@ -6779,6 +8273,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *OpVec = vectorizeTree(E->getOperand(j));
LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
OpVecs.push_back(OpVec);
+ if (isVectorIntrinsicWithOverloadTypeAtArg(IID, j))
+ TysForDecl.push_back(OpVec->getType());
}
Function *CF;
@@ -6822,11 +8318,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
((Instruction::isBinaryOp(E->getOpcode()) &&
Instruction::isBinaryOp(E->getAltOpcode())) ||
(Instruction::isCast(E->getOpcode()) &&
- Instruction::isCast(E->getAltOpcode()))) &&
+ Instruction::isCast(E->getAltOpcode())) ||
+ (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
"Invalid Shuffle Vector Operand");
Value *LHS = nullptr, *RHS = nullptr;
- if (Instruction::isBinaryOp(E->getOpcode())) {
+ if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
setInsertPointAfterBundle(E);
LHS = vectorizeTree(E->getOperand(0));
RHS = vectorizeTree(E->getOperand(1));
@@ -6846,6 +8343,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
V1 = Builder.CreateBinOp(
static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
+ } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
+ V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
+ auto *AltCI = cast<CmpInst>(E->getAltOp());
+ CmpInst::Predicate AltPred = AltCI->getPredicate();
+ V1 = Builder.CreateCmp(AltPred, LHS, RHS);
} else {
V0 = Builder.CreateCast(
static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
@@ -6866,11 +8368,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
// each vector operation.
ValueList OpScalars, AltScalars;
SmallVector<int> Mask;
- buildSuffleEntryMask(
+ buildShuffleEntryMask(
E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
[E](Instruction *I) {
assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
- return I->getOpcode() == E->getAltOpcode();
+ return isAlternateInstruction(I, E->getMainOp(), E->getAltOp());
},
Mask, &OpScalars, &AltScalars);
@@ -6901,6 +8403,17 @@ Value *BoUpSLP::vectorizeTree() {
return vectorizeTree(ExternallyUsedValues);
}
+namespace {
+/// Data type for handling buildvector sequences with the reused scalars from
+/// other tree entries.
+struct ShuffledInsertData {
+ /// List of insertelements to be replaced by shuffles.
+ SmallVector<InsertElementInst *> InsertElements;
+ /// The parent vectors and shuffle mask for the given list of inserts.
+ MapVector<Value *, SmallVector<int>> ValueMasks;
+};
+} // namespace
+
Value *
BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
// All blocks must be scheduled before any instructions are inserted.
@@ -6934,6 +8447,9 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
<< " values .\n");
+ SmallVector<ShuffledInsertData> ShuffledInserts;
+  // Maps a vector instruction to the original insertelement instruction.
+ DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
// Extract all of the elements with the external uses.
for (const auto &ExternalUse : ExternalUses) {
Value *Scalar = ExternalUse.Scalar;
@@ -6947,6 +8463,10 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
assert(E && "Invalid scalar");
assert(E->State != TreeEntry::NeedToGather &&
"Extracting from a gather list");
+ // Non-instruction pointers are not deleted, just skip them.
+ if (E->getOpcode() == Instruction::GetElementPtr &&
+ !isa<GetElementPtrInst>(Scalar))
+ continue;
Value *Vec = E->VectorizedValue;
assert(Vec && "Can't find vectorizable value");
@@ -6973,6 +8493,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
assert(isa<FixedVectorType>(Scalar->getType()) &&
isa<InsertElementInst>(Scalar) &&
"In-tree scalar of vector type is not insertelement?");
+ auto *IE = cast<InsertElementInst>(Scalar);
+ VectorToInsertElement.try_emplace(Vec, IE);
return Vec;
};
// If User == nullptr, the Scalar is used as extra arg. Generate
@@ -7001,6 +8523,69 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
continue;
}
+ if (auto *VU = dyn_cast<InsertElementInst>(User)) {
+ // Skip if the scalar is another vector op or Vec is not an instruction.
+ if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
+ if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
+ Optional<unsigned> InsertIdx = getInsertIndex(VU);
+ if (InsertIdx) {
+ // Need to use original vector, if the root is truncated.
+ if (MinBWs.count(Scalar) &&
+ VectorizableTree[0]->VectorizedValue == Vec)
+ Vec = VectorRoot;
+ auto *It =
+ find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) {
+ // Checks if 2 insertelements are from the same buildvector.
+ InsertElementInst *VecInsert = Data.InsertElements.front();
+ return areTwoInsertFromSameBuildVector(VU, VecInsert);
+ });
+ unsigned Idx = *InsertIdx;
+ if (It == ShuffledInserts.end()) {
+ (void)ShuffledInserts.emplace_back();
+ It = std::next(ShuffledInserts.begin(),
+ ShuffledInserts.size() - 1);
+ SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
+ if (Mask.empty())
+ Mask.assign(FTy->getNumElements(), UndefMaskElem);
+ // Find the insertvector, vectorized in tree, if any.
+ Value *Base = VU;
+ while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
+ if (IEBase != User &&
+ (!IEBase->hasOneUse() ||
+ getInsertIndex(IEBase).value_or(Idx) == Idx))
+ break;
+ // Build the mask for the vectorized insertelement instructions.
+ if (const TreeEntry *E = getTreeEntry(IEBase)) {
+ do {
+ IEBase = cast<InsertElementInst>(Base);
+ int IEIdx = *getInsertIndex(IEBase);
+ assert(Mask[Idx] == UndefMaskElem &&
+ "InsertElementInstruction used already.");
+ Mask[IEIdx] = IEIdx;
+ Base = IEBase->getOperand(0);
+ } while (E == getTreeEntry(Base));
+ break;
+ }
+ Base = cast<InsertElementInst>(Base)->getOperand(0);
+              // After the vectorization the def-use chain has changed, so we
+              // need to look through the original insertelement instructions
+              // if they were replaced by vector instructions.
+ auto It = VectorToInsertElement.find(Base);
+ if (It != VectorToInsertElement.end())
+ Base = It->second;
+ }
+ }
+ SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
+ if (Mask.empty())
+ Mask.assign(FTy->getNumElements(), UndefMaskElem);
+ Mask[Idx] = ExternalUse.Lane;
+ It->InsertElements.push_back(cast<InsertElementInst>(User));
+ continue;
+ }
+ }
+ }
+ }
+
// Generate extracts for out-of-tree users.
// Find the insertion point for the extractelement lane.
if (auto *VecI = dyn_cast<Instruction>(Vec)) {
@@ -7036,6 +8621,221 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
}
+ // Checks if the mask is an identity mask.
+ auto &&IsIdentityMask = [](ArrayRef<int> Mask, FixedVectorType *VecTy) {
+ int Limit = Mask.size();
+ return VecTy->getNumElements() == Mask.size() &&
+ all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) &&
+ ShuffleVectorInst::isIdentityMask(Mask);
+ };
+  // Tries to combine 2 different masks into a single one.
+ auto &&CombineMasks = [](SmallVectorImpl<int> &Mask, ArrayRef<int> ExtMask) {
+ SmallVector<int> NewMask(ExtMask.size(), UndefMaskElem);
+ for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
+ if (ExtMask[I] == UndefMaskElem)
+ continue;
+ NewMask[I] = Mask[ExtMask[I]];
+ }
+ Mask.swap(NewMask);
+ };
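A worked example of the mask composition performed by CombineMasks, with assumed masks: the inner shuffle reverses a 4-element vector, and the outer mask then selects lanes 0, 2 and 3 of that result, so the composed mask indexes the original vector directly. Editorial sketch only.

#include <cstdio>
#include <vector>

int main() {
  const int Undef = -1;
  std::vector<int> Mask    = {3, 2, 1, 0};     // inner shuffle: reverse
  std::vector<int> ExtMask = {0, Undef, 2, 3}; // outer selection
  std::vector<int> NewMask(ExtMask.size(), Undef);
  for (int I = 0, Sz = (int)ExtMask.size(); I < Sz; ++I) {
    if (ExtMask[I] == Undef)
      continue;
    NewMask[I] = Mask[ExtMask[I]]; // compose: outer indexes the inner result
  }
  for (int M : NewMask)
    std::printf("%d ", M);         // 3 -1 1 0
  std::printf("\n");
}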
+ // Peek through shuffles, trying to simplify the final shuffle code.
+ auto &&PeekThroughShuffles =
+ [&IsIdentityMask, &CombineMasks](Value *&V, SmallVectorImpl<int> &Mask,
+ bool CheckForLengthChange = false) {
+ while (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
+          // Exit if this is not a fixed vector type or a length-changing
+          // shuffle.
+ if (!isa<FixedVectorType>(SV->getType()) ||
+ (CheckForLengthChange && SV->changesLength()))
+ break;
+ // Exit if the identity or broadcast mask is found.
+ if (IsIdentityMask(Mask, cast<FixedVectorType>(SV->getType())) ||
+ SV->isZeroEltSplat())
+ break;
+ bool IsOp1Undef = isUndefVector(SV->getOperand(0));
+ bool IsOp2Undef = isUndefVector(SV->getOperand(1));
+ if (!IsOp1Undef && !IsOp2Undef)
+ break;
+ SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(),
+ SV->getShuffleMask().end());
+ CombineMasks(ShuffleMask, Mask);
+ Mask.swap(ShuffleMask);
+ if (IsOp2Undef)
+ V = SV->getOperand(0);
+ else
+ V = SV->getOperand(1);
+ }
+ };
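// Illustrative sketch (not from the LLVM sources): the peek-through step
// relies on shuffle(shuffle(X, undef, Inner), Outer) selecting the same lanes
// as shuffle(X, Inner composed with Outer), up to undefined lanes. A model
// over plain ints (0 is an arbitrary placeholder for undefined lanes):
#include <cstddef>
#include <vector>
std::vector<int> applyMask(const std::vector<int> &Src,
                           const std::vector<int> &Mask) {
  std::vector<int> Out(Mask.size(), 0);
  for (std::size_t I = 0; I < Mask.size(); ++I)
    if (Mask[I] != -1)
      Out[I] = Src[Mask[I]];
  return Out;
}
// applyMask(applyMask(X, Inner), Outer) equals
// applyMask(X, composeMasks(Inner, Outer)) on every defined lane, which is
// why the loop above may replace V by the shuffle's live operand and keep a
// single composed mask.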
+ // Smart shuffle instruction emission: walks through shuffle trees and
+ // tries to find the best matching vector for the actual shuffle
+ // instruction.
+ auto &&CreateShuffle = [this, &IsIdentityMask, &PeekThroughShuffles,
+ &CombineMasks](Value *V1, Value *V2,
+ ArrayRef<int> Mask) -> Value * {
+ assert(V1 && "Expected at least one vector value.");
+ if (V2 && !isUndefVector(V2)) {
+ // Peek through shuffles.
+ Value *Op1 = V1;
+ Value *Op2 = V2;
+ int VF =
+ cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
+ SmallVector<int> CombinedMask1(Mask.size(), UndefMaskElem);
+ SmallVector<int> CombinedMask2(Mask.size(), UndefMaskElem);
+ for (int I = 0, E = Mask.size(); I < E; ++I) {
+ if (Mask[I] < VF)
+ CombinedMask1[I] = Mask[I];
+ else
+ CombinedMask2[I] = Mask[I] - VF;
+ }
+ Value *PrevOp1;
+ Value *PrevOp2;
+ do {
+ PrevOp1 = Op1;
+ PrevOp2 = Op2;
+ PeekThroughShuffles(Op1, CombinedMask1, /*CheckForLengthChange=*/true);
+ PeekThroughShuffles(Op2, CombinedMask2, /*CheckForLengthChange=*/true);
+ // Check if we have 2 resizing shuffles - need to peek through operands
+ // again.
+ if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
+ if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2))
+ if (SV1->getOperand(0)->getType() ==
+ SV2->getOperand(0)->getType() &&
+ SV1->getOperand(0)->getType() != SV1->getType() &&
+ isUndefVector(SV1->getOperand(1)) &&
+ isUndefVector(SV2->getOperand(1))) {
+ Op1 = SV1->getOperand(0);
+ Op2 = SV2->getOperand(0);
+ SmallVector<int> ShuffleMask1(SV1->getShuffleMask().begin(),
+ SV1->getShuffleMask().end());
+ CombineMasks(ShuffleMask1, CombinedMask1);
+ CombinedMask1.swap(ShuffleMask1);
+ SmallVector<int> ShuffleMask2(SV2->getShuffleMask().begin(),
+ SV2->getShuffleMask().end());
+ CombineMasks(ShuffleMask2, CombinedMask2);
+ CombinedMask2.swap(ShuffleMask2);
+ }
+ } while (PrevOp1 != Op1 || PrevOp2 != Op2);
+ VF = cast<VectorType>(Op1->getType())
+ ->getElementCount()
+ .getKnownMinValue();
+ for (int I = 0, E = Mask.size(); I < E; ++I) {
+ if (CombinedMask2[I] != UndefMaskElem) {
+ assert(CombinedMask1[I] == UndefMaskElem &&
+ "Expected undefined mask element");
+ CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
+ }
+ }
+ Value *Vec = Builder.CreateShuffleVector(
+ Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
+ CombinedMask1);
+ if (auto *I = dyn_cast<Instruction>(Vec)) {
+ GatherShuffleSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ return Vec;
+ }
+ if (isa<PoisonValue>(V1))
+ return PoisonValue::get(FixedVectorType::get(
+ cast<VectorType>(V1->getType())->getElementType(), Mask.size()));
+ Value *Op = V1;
+ SmallVector<int> CombinedMask(Mask.begin(), Mask.end());
+ PeekThroughShuffles(Op, CombinedMask);
+ if (!isa<FixedVectorType>(Op->getType()) ||
+ !IsIdentityMask(CombinedMask, cast<FixedVectorType>(Op->getType()))) {
+ Value *Vec = Builder.CreateShuffleVector(Op, CombinedMask);
+ if (auto *I = dyn_cast<Instruction>(Vec)) {
+ GatherShuffleSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ return Vec;
+ }
+ return Op;
+ };
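// Illustrative sketch (not from the LLVM sources): before peeking through the
// operands, CreateShuffle splits a two-source mask (indices in [0, 2*VF))
// into one mask per operand; after simplification the second half is merged
// back with a +VF offset. A minimal model of the split step:
#include <cstddef>
#include <utility>
#include <vector>
std::pair<std::vector<int>, std::vector<int>>
splitTwoSourceMask(const std::vector<int> &Mask, int VF) {
  std::vector<int> M1(Mask.size(), -1), M2(Mask.size(), -1);
  for (std::size_t I = 0; I < Mask.size(); ++I) {
    if (Mask[I] < 0)
      continue;             // undefined lane stays undefined in both halves
    if (Mask[I] < VF)
      M1[I] = Mask[I];      // lane taken from the first operand
    else
      M2[I] = Mask[I] - VF; // lane taken from the second operand
  }
  return {M1, M2};
}
// For VF = 4 and Mask = {0, 5, -1, 7}: M1 = {0, -1, -1, -1} and
// M2 = {-1, 1, -1, 3}; the final mask re-adds VF to the defined M2 lanes.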
+
+ auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask) {
+ unsigned VF = Mask.size();
+ unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
+ if (VF != VecVF) {
+ if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
+ Vec = CreateShuffle(Vec, nullptr, Mask);
+ return std::make_pair(Vec, true);
+ }
+ SmallVector<int> ResizeMask(VF, UndefMaskElem);
+ for (unsigned I = 0; I < VF; ++I) {
+ if (Mask[I] != UndefMaskElem)
+ ResizeMask[Mask[I]] = Mask[I];
+ }
+ Vec = CreateShuffle(Vec, nullptr, ResizeMask);
+ }
+
+ return std::make_pair(Vec, false);
+ };
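// Illustrative sketch (not from the LLVM sources): when every lane index fits
// the destination width, ResizeToVF only widens the vector in place. The mask
// it builds keeps each used lane at its original position:
#include <vector>
std::vector<int> buildResizeMask(const std::vector<int> &Mask) {
  std::vector<int> Resize(Mask.size(), -1); // -1 models UndefMaskElem
  for (int Idx : Mask)
    if (Idx != -1)
      Resize[Idx] = Idx;                    // used lanes stay in place
  return Resize;
}
// E.g. a 3-wide vector feeding a 4-wide insert with Mask = {2, -1, 0, -1}
// gets Resize = {0, -1, 2, -1}: the vector is widened to 4 lanes without
// permuting any element; otherwise the full Mask is used as-is.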
+ // Perform shuffling of the vectorized tree entries for better handling of
+ // external extracts.
+ for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
+ // Find the first and the last instruction in the list of insertelements.
+ sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
+ InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
+ InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
+ Builder.SetInsertPoint(LastInsert);
+ auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
+ Value *NewInst = performExtractsShuffleAction<Value>(
+ makeMutableArrayRef(Vector.data(), Vector.size()),
+ FirstInsert->getOperand(0),
+ [](Value *Vec) {
+ return cast<VectorType>(Vec->getType())
+ ->getElementCount()
+ .getKnownMinValue();
+ },
+ ResizeToVF,
+ [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
+ ArrayRef<Value *> Vals) {
+ assert((Vals.size() == 1 || Vals.size() == 2) &&
+ "Expected exactly 1 or 2 input values.");
+ if (Vals.size() == 1) {
+ // Do not create shuffle if the mask is a simple identity
+ // non-resizing mask.
+ if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
+ ->getNumElements() ||
+ !ShuffleVectorInst::isIdentityMask(Mask))
+ return CreateShuffle(Vals.front(), nullptr, Mask);
+ return Vals.front();
+ }
+ return CreateShuffle(Vals.front() ? Vals.front()
+ : FirstInsert->getOperand(0),
+ Vals.back(), Mask);
+ });
+ auto It = ShuffledInserts[I].InsertElements.rbegin();
+ // Rebuild buildvector chain.
+ InsertElementInst *II = nullptr;
+ if (It != ShuffledInserts[I].InsertElements.rend())
+ II = *It;
+ SmallVector<Instruction *> Inserts;
+ while (It != ShuffledInserts[I].InsertElements.rend()) {
+ assert(II && "Must be an insertelement instruction.");
+ if (*It == II)
+ ++It;
+ else
+ Inserts.push_back(cast<Instruction>(II));
+ II = dyn_cast<InsertElementInst>(II->getOperand(0));
+ }
+ for (Instruction *II : reverse(Inserts)) {
+ II->replaceUsesOfWith(II->getOperand(0), NewInst);
+ if (auto *NewI = dyn_cast<Instruction>(NewInst))
+ if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
+ II->moveAfter(NewI);
+ NewInst = II;
+ }
+ LastInsert->replaceAllUsesWith(NewInst);
+ for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
+ IE->replaceUsesOfWith(IE->getOperand(0),
+ PoisonValue::get(IE->getOperand(0)->getType()));
+ IE->replaceUsesOfWith(IE->getOperand(1),
+ PoisonValue::get(IE->getOperand(1)->getType()));
+ eraseInstruction(IE);
+ }
+ CSEBlocks.insert(LastInsert->getParent());
+ }
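// Illustrative sketch (not from the LLVM sources): conceptually, the rebuild
// above turns a partially vectorized buildvector chain
//   %v0 = insertelement <4 x i32> poison, i32 %s0, i32 0
//   %v1 = insertelement <4 x i32> %v0,    i32 %s1, i32 1
//   %v2 = insertelement <4 x i32> %v1,    i32 %s2, i32 2
//   %v3 = insertelement <4 x i32> %v2,    i32 %x,  i32 3   ; %x stays scalar
// (with %s0..%s2 produced by the vectorized tree) into
//   %nv = shufflevector ...                                ; lanes 0..2
//   %v3 = insertelement <4 x i32> %nv,    i32 %x,  i32 3
// and erases %v0..%v2, whose lanes now come from the emitted shuffle.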
+
// For each vectorized value:
for (auto &TEPtr : VectorizableTree) {
TreeEntry *Entry = TEPtr.get();
@@ -7050,6 +8850,9 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
Value *Scalar = Entry->Scalars[Lane];
+ if (Entry->getOpcode() == Instruction::GetElementPtr &&
+ !isa<GetElementPtrInst>(Scalar))
+ continue;
#ifndef NDEBUG
Type *Ty = Scalar->getType();
if (!Ty->isVoidTy()) {
@@ -7057,7 +8860,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
// It is legal to delete users in the ignorelist.
- assert((getTreeEntry(U) || is_contained(UserIgnoreList, U) ||
+ assert((getTreeEntry(U) ||
+ (UserIgnoreList && UserIgnoreList->contains(U)) ||
(isa_and_nonnull<Instruction>(U) &&
isDeleted(cast<Instruction>(U)))) &&
"Deleting out-of-tree value");
@@ -7225,9 +9029,11 @@ void BoUpSLP::optimizeGatherSequence() {
BoUpSLP::ScheduleData *
BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
- ScheduleData *Bundle = nullptr;
+ ScheduleData *Bundle = nullptr;
ScheduleData *PrevInBundle = nullptr;
for (Value *V : VL) {
+ if (doesNotNeedToBeScheduled(V))
+ continue;
ScheduleData *BundleMember = getScheduleData(V);
assert(BundleMember &&
"no ScheduleData for bundle member "
@@ -7239,8 +9045,6 @@ BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
} else {
Bundle = BundleMember;
}
- BundleMember->UnscheduledDepsInBundle = 0;
- Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
// Group the instructions to a bundle.
BundleMember->FirstInBundle = Bundle;
@@ -7257,7 +9061,8 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
const InstructionsState &S) {
// No need to schedule PHIs, insertelement, extractelement and extractvalue
// instructions.
- if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue))
+ if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) ||
+ doesNotNeedToSchedule(VL))
return nullptr;
// Initialize the instruction bundle.
@@ -7276,16 +9081,17 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
ReSchedule = true;
}
- if (ReSchedule) {
- resetSchedule();
- initialFillReadyList(ReadyInsts);
- }
if (Bundle) {
LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
<< " in block " << BB->getName() << "\n");
calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
}
+ if (ReSchedule) {
+ resetSchedule();
+ initialFillReadyList(ReadyInsts);
+ }
+
// Now try to schedule the new bundle or (if no bundle) just calculate
// dependencies. As soon as the bundle is "ready" it means that there are no
// cyclic dependencies and we can schedule it. Note that's important that we
@@ -7293,14 +9099,17 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
!ReadyInsts.empty()) {
ScheduleData *Picked = ReadyInsts.pop_back_val();
- if (Picked->isSchedulingEntity() && Picked->isReady())
- schedule(Picked, ReadyInsts);
+ assert(Picked->isSchedulingEntity() && Picked->isReady() &&
+ "must be ready to schedule");
+ schedule(Picked, ReadyInsts);
}
};
// Make sure that the scheduling region contains all
// instructions of the bundle.
for (Value *V : VL) {
+ if (doesNotNeedToBeScheduled(V))
+ continue;
if (!extendSchedulingRegion(V, S)) {
// If the scheduling region got new instructions at the lower end (or it
// is a new region for the first bundle). This makes it necessary to
@@ -7315,9 +9124,16 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
bool ReSchedule = false;
for (Value *V : VL) {
+ if (doesNotNeedToBeScheduled(V))
+ continue;
ScheduleData *BundleMember = getScheduleData(V);
assert(BundleMember &&
"no ScheduleData for bundle member (maybe not in same basic block)");
+
+ // Make sure we don't leave the pieces of the bundle in the ready list when
+ // the whole bundle might not be ready.
+ ReadyInsts.remove(BundleMember);
+
if (!BundleMember->IsScheduled)
continue;
// A bundle member was scheduled as single instruction before and now
@@ -7339,16 +9155,24 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
Value *OpValue) {
- if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue))
+ if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
+ doesNotNeedToSchedule(VL))
return;
+ if (doesNotNeedToBeScheduled(OpValue))
+ OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);
ScheduleData *Bundle = getScheduleData(OpValue);
LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
assert(!Bundle->IsScheduled &&
"Can't cancel bundle which is already scheduled");
- assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
+ assert(Bundle->isSchedulingEntity() &&
+ (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
"tried to unbundle something which is not a bundle");
+ // Remove the bundle from the ready list.
+ if (Bundle->isReady())
+ ReadyInsts.remove(Bundle);
+
// Un-bundle: make single instructions out of the bundle.
ScheduleData *BundleMember = Bundle;
while (BundleMember) {
@@ -7356,8 +9180,8 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
BundleMember->FirstInBundle = BundleMember;
ScheduleData *Next = BundleMember->NextInBundle;
BundleMember->NextInBundle = nullptr;
- BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
- if (BundleMember->UnscheduledDepsInBundle == 0) {
+ BundleMember->TE = nullptr;
+ if (BundleMember->unscheduledDepsInBundle() == 0) {
ReadyInsts.insert(BundleMember);
}
BundleMember = Next;
@@ -7380,9 +9204,10 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
Instruction *I = dyn_cast<Instruction>(V);
assert(I && "bundle member must be an instruction");
assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
+ !doesNotNeedToBeScheduled(I) &&
"phi nodes/insertelements/extractelements/extractvalues don't need to "
"be scheduled");
- auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool {
+ auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool {
ScheduleData *ISD = getScheduleData(I);
if (!ISD)
return false;
@@ -7394,7 +9219,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
ExtraScheduleDataMap[I][S.OpValue] = SD;
return true;
};
- if (CheckSheduleForI(I))
+ if (CheckScheduleForI(I))
return true;
if (!ScheduleStart) {
// It's the first instruction in the new region.
@@ -7402,7 +9227,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
ScheduleStart = I;
ScheduleEnd = I->getNextNode();
if (isOneOf(S, I) != I)
- CheckSheduleForI(I);
+ CheckScheduleForI(I);
assert(ScheduleEnd && "tried to vectorize a terminator?");
LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
return true;
@@ -7430,7 +9255,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
ScheduleStart = I;
if (isOneOf(S, I) != I)
- CheckSheduleForI(I);
+ CheckScheduleForI(I);
LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
<< "\n");
return true;
@@ -7444,7 +9269,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
nullptr);
ScheduleEnd = I->getNextNode();
if (isOneOf(S, I) != I)
- CheckSheduleForI(I);
+ CheckScheduleForI(I);
assert(ScheduleEnd && "tried to vectorize a terminator?");
LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
return true;
@@ -7456,7 +9281,10 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
ScheduleData *NextLoadStore) {
ScheduleData *CurrentLoadStore = PrevLoadStore;
for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
- ScheduleData *SD = ScheduleDataMap[I];
+ // No need to allocate data for non-schedulable instructions.
+ if (doesNotNeedToBeScheduled(I))
+ continue;
+ ScheduleData *SD = ScheduleDataMap.lookup(I);
if (!SD) {
SD = allocateScheduleDataChunks();
ScheduleDataMap[I] = SD;
@@ -7479,6 +9307,10 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
}
CurrentLoadStore = SD;
}
+
+ if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
+ match(I, m_Intrinsic<Intrinsic::stackrestore>()))
+ RegionHasStackSave = true;
}
if (NextLoadStore) {
if (CurrentLoadStore)
@@ -7511,8 +9343,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
// Handle def-use chain dependencies.
if (BundleMember->OpValue != BundleMember->Inst) {
- ScheduleData *UseSD = getScheduleData(BundleMember->Inst);
- if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+ if (ScheduleData *UseSD = getScheduleData(BundleMember->Inst)) {
BundleMember->Dependencies++;
ScheduleData *DestBundle = UseSD->FirstInBundle;
if (!DestBundle->IsScheduled)
@@ -7522,10 +9353,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
}
} else {
for (User *U : BundleMember->Inst->users()) {
- assert(isa<Instruction>(U) &&
- "user of instruction must be instruction");
- ScheduleData *UseSD = getScheduleData(U);
- if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+ if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
BundleMember->Dependencies++;
ScheduleData *DestBundle = UseSD->FirstInBundle;
if (!DestBundle->IsScheduled)
@@ -7536,6 +9364,75 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
}
}
+ auto makeControlDependent = [&](Instruction *I) {
+ auto *DepDest = getScheduleData(I);
+ assert(DepDest && "must be in schedule window");
+ DepDest->ControlDependencies.push_back(BundleMember);
+ BundleMember->Dependencies++;
+ ScheduleData *DestBundle = DepDest->FirstInBundle;
+ if (!DestBundle->IsScheduled)
+ BundleMember->incrementUnscheduledDeps(1);
+ if (!DestBundle->hasValidDependencies())
+ WorkList.push_back(DestBundle);
+ };
+
+ // Any instruction which isn't safe to speculate at the beginning of the
+ // block is control dependent on any early exit or non-willreturn call
+ // which precedes it.
+ if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
+ for (Instruction *I = BundleMember->Inst->getNextNode();
+ I != ScheduleEnd; I = I->getNextNode()) {
+ if (isSafeToSpeculativelyExecute(I, &*BB->begin()))
+ continue;
+
+ // Add the dependency
+ makeControlDependent(I);
+
+ if (!isGuaranteedToTransferExecutionToSuccessor(I))
+ // Everything past here must be control dependent on I.
+ break;
+ }
+ }
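// Illustrative sketch (not from the LLVM sources): the control-dependence
// edges added above model hazards like the one below - the division is only
// reachable if the call returns, so a bundle containing it must not be
// scheduled above the call. mayExit() is a hypothetical function that may
// throw or call exit() and therefore does not always transfer execution to
// its successor.
void mayExit(int Divisor);
int controlDependentUse(int Numerator, int Divisor) {
  mayExit(Divisor);           // e.g. throws when Divisor == 0
  return Numerator / Divisor; // must stay below the call
}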
+
+ if (RegionHasStackSave) {
+ // If we have an inalloca alloca instruction, it needs to be scheduled
+ // after any preceding stacksave. We also need to prevent any alloca
+ // from reordering above a preceding stackrestore.
+ if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
+ match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
+ for (Instruction *I = BundleMember->Inst->getNextNode();
+ I != ScheduleEnd; I = I->getNextNode()) {
+ if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
+ match(I, m_Intrinsic<Intrinsic::stackrestore>()))
+ // Any allocas past here must be control dependent on I, and I
+ // must be memory dependent on BundleMember->Inst.
+ break;
+
+ if (!isa<AllocaInst>(I))
+ continue;
+
+ // Add the dependency
+ makeControlDependent(I);
+ }
+ }
+
+ // In addition to the cases handled just above, we need to prevent
+ // allocas from moving below a stacksave. The stackrestore case
+ // is currently thought to be overly conservative.
+ if (isa<AllocaInst>(BundleMember->Inst)) {
+ for (Instruction *I = BundleMember->Inst->getNextNode();
+ I != ScheduleEnd; I = I->getNextNode()) {
+ if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
+ !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
+ continue;
+
+ // Add the dependency
+ makeControlDependent(I);
+ break;
+ }
+ }
+ }
+
// Handle the memory dependencies (if any).
ScheduleData *DepDest = BundleMember->NextLoadStore;
if (!DepDest)
@@ -7598,7 +9495,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
}
}
if (InsertInReadyList && SD->isReady()) {
- ReadyInsts.push_back(SD);
+ ReadyInsts.insert(SD);
LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
<< "\n");
}
@@ -7625,11 +9522,18 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
+ // A key point - if we got here, pre-scheduling was able to find a valid
+ // scheduling of the sub-graph of the scheduling window which consists
+ // of all vector bundles and their transitive users. As such, we do not
+ // need to reschedule anything *outside of* that subgraph.
+
BS->resetSchedule();
// For the real scheduling we use a more sophisticated ready-list: it is
// sorted by the original instruction location. This lets the final schedule
// be as close as possible to the original instruction order.
+ // WARNING: If changing this order causes a correctness issue, that means
+ // there is some missing dependence edge in the schedule data graph.
struct ScheduleDataCompare {
bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
return SD2->SchedulingPriority < SD1->SchedulingPriority;
@@ -7637,21 +9541,22 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
};
std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
- // Ensure that all dependency data is updated and fill the ready-list with
- // initial instructions.
+ // Ensure that all dependency data is updated (for nodes in the sub-graph)
+ // and fill the ready-list with initial instructions.
int Idx = 0;
- int NumToSchedule = 0;
for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
I = I->getNextNode()) {
- BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
+ BS->doForAllOpcodes(I, [this, &Idx, BS](ScheduleData *SD) {
+ TreeEntry *SDTE = getTreeEntry(SD->Inst);
+ (void)SDTE;
assert((isVectorLikeInstWithConstOps(SD->Inst) ||
- SD->isPartOfBundle() == (getTreeEntry(SD->Inst) != nullptr)) &&
+ SD->isPartOfBundle() ==
+ (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
"scheduler and vectorizer bundle mismatch");
SD->FirstInBundle->SchedulingPriority = Idx++;
- if (SD->isSchedulingEntity()) {
+
+ if (SD->isSchedulingEntity() && SD->isPartOfBundle())
BS->calculateDependencies(SD, false, this);
- NumToSchedule++;
- }
});
}
BS->initialFillReadyList(ReadyInsts);
@@ -7674,9 +9579,23 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
}
BS->schedule(picked, ReadyInsts);
- NumToSchedule--;
}
- assert(NumToSchedule == 0 && "could not schedule all instructions");
+
+ // Check that we didn't break any of our invariants.
+#ifdef EXPENSIVE_CHECKS
+ BS->verify();
+#endif
+
+#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
+ // Check that all schedulable entities got scheduled
+ for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
+ BS->doForAllOpcodes(I, [&](ScheduleData *SD) {
+ if (SD->isSchedulingEntity() && SD->hasValidDependencies()) {
+ assert(SD->IsScheduled && "must be scheduled at this point");
+ }
+ });
+ }
+#endif
// Avoid duplicate scheduling of the block.
BS->ScheduleStart = nullptr;
@@ -7686,11 +9605,8 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) {
// If V is a store, just return the width of the stored value (or value
// truncated just before storing) without traversing the expression tree.
// This is the common case.
- if (auto *Store = dyn_cast<StoreInst>(V)) {
- if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
- return DL->getTypeSizeInBits(Trunc->getSrcTy());
+ if (auto *Store = dyn_cast<StoreInst>(V))
return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
- }
if (auto *IEI = dyn_cast<InsertElementInst>(V))
return getVectorElementSize(IEI->getOperand(1));
@@ -8092,6 +10008,8 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
// Scan the blocks in the function in post order.
for (auto BB : post_order(&F.getEntryBlock())) {
+ // Starting a new block - clear the list of reduction roots.
+ R.clearReductionData();
collectSeedInstructions(BB);
// Vectorize trees that end at stores.
@@ -8122,11 +10040,10 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
}
bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
- unsigned Idx) {
+ unsigned Idx, unsigned MinVF) {
LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
<< "\n");
const unsigned Sz = R.getVectorElementSize(Chain[0]);
- const unsigned MinVF = R.getMinVecRegSize() / Sz;
unsigned VF = Chain.size();
if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF)
@@ -8265,9 +10182,15 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
unsigned EltSize = R.getVectorElementSize(Operands[0]);
unsigned MaxElts = llvm::PowerOf2Floor(MaxVecRegSize / EltSize);
- unsigned MinVF = R.getMinVF(EltSize);
unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store),
MaxElts);
+ auto *Store = cast<StoreInst>(Operands[0]);
+ Type *StoreTy = Store->getValueOperand()->getType();
+ Type *ValueTy = StoreTy;
+ if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
+ ValueTy = Trunc->getSrcTy();
+ unsigned MinVF = TTI->getStoreMinimumVF(
+ R.getMinVF(DL->getTypeSizeInBits(ValueTy)), StoreTy, ValueTy);
// FIXME: Is division-by-2 the correct step? Should we assert that the
// register size is a power-of-2?
@@ -8277,7 +10200,7 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
ArrayRef<Value *> Slice = makeArrayRef(Operands).slice(Cnt, Size);
if (!VectorizedStores.count(Slice.front()) &&
!VectorizedStores.count(Slice.back()) &&
- vectorizeStoreChain(Slice, R, Cnt)) {
+ vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
// Mark the vectorized stores so that we don't vectorize them again.
VectorizedStores.insert(Slice.begin(), Slice.end());
Changed = true;
@@ -8481,7 +10404,8 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
if (!I)
return false;
- if (!isa<BinaryOperator>(I) && !isa<CmpInst>(I))
+ if ((!isa<BinaryOperator>(I) && !isa<CmpInst>(I)) ||
+ isa<VectorType>(I->getType()))
return false;
Value *P = I->getParent();
@@ -8492,32 +10416,40 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
return false;
- // Try to vectorize V.
- if (tryToVectorizePair(Op0, Op1, R))
- return true;
+ // First collect all possible candidates
+ SmallVector<std::pair<Value *, Value *>, 4> Candidates;
+ Candidates.emplace_back(Op0, Op1);
auto *A = dyn_cast<BinaryOperator>(Op0);
auto *B = dyn_cast<BinaryOperator>(Op1);
// Try to skip B.
- if (B && B->hasOneUse()) {
+ if (A && B && B->hasOneUse()) {
auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
- if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R))
- return true;
- if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R))
- return true;
+ if (B0 && B0->getParent() == P)
+ Candidates.emplace_back(A, B0);
+ if (B1 && B1->getParent() == P)
+ Candidates.emplace_back(A, B1);
}
-
// Try to skip A.
- if (A && A->hasOneUse()) {
+ if (B && A && A->hasOneUse()) {
auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
- if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R))
- return true;
- if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R))
- return true;
+ if (A0 && A0->getParent() == P)
+ Candidates.emplace_back(A0, B);
+ if (A1 && A1->getParent() == P)
+ Candidates.emplace_back(A1, B);
}
- return false;
+
+ if (Candidates.size() == 1)
+ return tryToVectorizePair(Op0, Op1, R);
+
+ // We have multiple options. Try to pick the single best.
+ Optional<int> BestCandidate = R.findBestRootPair(Candidates);
+ if (!BestCandidate)
+ return false;
+ return tryToVectorizePair(Candidates[*BestCandidate].first,
+ Candidates[*BestCandidate].second, R);
}
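// Illustrative sketch (not from the LLVM sources): the rewrite above turns an
// eager "first pair that vectorizes wins" search into "collect all candidate
// pairs, then pick the single best by score". A generic model of that
// pattern, with scorePair() standing in for R.findBestRootPair() (which may
// also decide that no pair is worth vectorizing); assumes a non-empty list:
#include <cstddef>
#include <utility>
#include <vector>
template <typename T, typename ScoreFn>
std::size_t pickBestPair(const std::vector<std::pair<T, T>> &Candidates,
                         ScoreFn scorePair) {
  std::size_t Best = 0;
  for (std::size_t I = 1; I < Candidates.size(); ++I)
    if (scorePair(Candidates[I]) > scorePair(Candidates[Best]))
      Best = I;
  return Best; // index of the highest-scoring pair; the caller vectorizes it
}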
namespace {
@@ -8552,15 +10484,16 @@ class HorizontalReduction {
using ReductionOpsType = SmallVector<Value *, 16>;
using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
ReductionOpsListType ReductionOps;
- SmallVector<Value *, 32> ReducedVals;
+ /// List of possibly reduced values.
+ SmallVector<SmallVector<Value *>> ReducedVals;
+ /// Maps reduced value to the corresponding reduction operation.
+ DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps;
// Use map vector to make stable output.
MapVector<Instruction *, Value *> ExtraArgs;
WeakTrackingVH ReductionRoot;
/// The type of reduction operation.
RecurKind RdxKind;
- const unsigned INVALID_OPERAND_INDEX = std::numeric_limits<unsigned>::max();
-
static bool isCmpSelMinMax(Instruction *I) {
return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
@@ -8604,26 +10537,6 @@ class HorizontalReduction {
return I->getOperand(Index);
}
- /// Checks if the ParentStackElem.first should be marked as a reduction
- /// operation with an extra argument or as extra argument itself.
- void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem,
- Value *ExtraArg) {
- if (ExtraArgs.count(ParentStackElem.first)) {
- ExtraArgs[ParentStackElem.first] = nullptr;
- // We ran into something like:
- // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg.
- // The whole ParentStackElem.first should be considered as an extra value
- // in this case.
- // Do not perform analysis of remaining operands of ParentStackElem.first
- // instruction, this whole instruction is an extra argument.
- ParentStackElem.second = INVALID_OPERAND_INDEX;
- } else {
- // We ran into something like:
- // ParentStackElem.first += ... + ExtraArg + ...
- ExtraArgs[ParentStackElem.first] = ExtraArg;
- }
- }
-
/// Creates reduction operation with the current opcode.
static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS,
Value *RHS, const Twine &Name, bool UseSelect) {
@@ -8682,7 +10595,7 @@ class HorizontalReduction {
}
/// Creates reduction operation with the current opcode with the IR flags
- /// from \p ReductionOps.
+ /// from \p ReductionOps, dropping nuw/nsw flags.
static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS,
Value *RHS, const Twine &Name,
const ReductionOpsListType &ReductionOps) {
@@ -8696,31 +10609,21 @@ class HorizontalReduction {
Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
if (auto *Sel = dyn_cast<SelectInst>(Op)) {
- propagateIRFlags(Sel->getCondition(), ReductionOps[0]);
- propagateIRFlags(Op, ReductionOps[1]);
+ propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
+ /*IncludeWrapFlags=*/false);
+ propagateIRFlags(Op, ReductionOps[1], nullptr,
+ /*IncludeWrapFlags=*/false);
return Op;
}
}
- propagateIRFlags(Op, ReductionOps[0]);
- return Op;
- }
-
- /// Creates reduction operation with the current opcode with the IR flags
- /// from \p I.
- static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS,
- Value *RHS, const Twine &Name, Instruction *I) {
- auto *SelI = dyn_cast<SelectInst>(I);
- Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, SelI != nullptr);
- if (SelI && RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
- if (auto *Sel = dyn_cast<SelectInst>(Op))
- propagateIRFlags(Sel->getCondition(), SelI->getCondition());
- }
- propagateIRFlags(Op, I);
+ propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
return Op;
}
- static RecurKind getRdxKind(Instruction *I) {
- assert(I && "Expected instruction for reduction matching");
+ static RecurKind getRdxKind(Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return RecurKind::None;
if (match(I, m_Add(m_Value(), m_Value())))
return RecurKind::Add;
if (match(I, m_Mul(m_Value(), m_Value())))
@@ -8882,7 +10785,9 @@ public:
HorizontalReduction() = default;
/// Try to find a reduction tree.
- bool matchAssociativeReduction(PHINode *Phi, Instruction *Inst) {
+ bool matchAssociativeReduction(PHINode *Phi, Instruction *Inst,
+ ScalarEvolution &SE, const DataLayout &DL,
+ const TargetLibraryInfo &TLI) {
assert((!Phi || is_contained(Phi->operands(), Inst)) &&
"Phi needs to use the binary operator");
assert((isa<BinaryOperator>(Inst) || isa<SelectInst>(Inst) ||
@@ -8926,124 +10831,178 @@ public:
ReductionRoot = Inst;
- // The opcode for leaf values that we perform a reduction on.
- // For example: load(x) + load(y) + load(z) + fptoui(w)
- // The leaf opcode for 'w' does not match, so we don't include it as a
- // potential candidate for the reduction.
- unsigned LeafOpcode = 0;
-
- // Post-order traverse the reduction tree starting at Inst. We only handle
- // true trees containing binary operators or selects.
- SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
- Stack.push_back(std::make_pair(Inst, getFirstOperandIndex(Inst)));
- initReductionOps(Inst);
- while (!Stack.empty()) {
- Instruction *TreeN = Stack.back().first;
- unsigned EdgeToVisit = Stack.back().second++;
- const RecurKind TreeRdxKind = getRdxKind(TreeN);
- bool IsReducedValue = TreeRdxKind != RdxKind;
-
- // Postorder visit.
- if (IsReducedValue || EdgeToVisit >= getNumberOfOperands(TreeN)) {
- if (IsReducedValue)
- ReducedVals.push_back(TreeN);
- else {
- auto ExtraArgsIter = ExtraArgs.find(TreeN);
- if (ExtraArgsIter != ExtraArgs.end() && !ExtraArgsIter->second) {
- // Check if TreeN is an extra argument of its parent operation.
- if (Stack.size() <= 1) {
- // TreeN can't be an extra argument as it is a root reduction
- // operation.
- return false;
- }
- // Yes, TreeN is an extra argument, do not add it to a list of
- // reduction operations.
- // Stack[Stack.size() - 2] always points to the parent operation.
- markExtraArg(Stack[Stack.size() - 2], TreeN);
- ExtraArgs.erase(TreeN);
- } else
- addReductionOps(TreeN);
- }
- // Retract.
- Stack.pop_back();
- continue;
- }
-
- // Visit operands.
- Value *EdgeVal = getRdxOperand(TreeN, EdgeToVisit);
- auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
- if (!EdgeInst) {
- // Edge value is not a reduction instruction or a leaf instruction.
- // (It may be a constant, function argument, or something else.)
- markExtraArg(Stack.back(), EdgeVal);
- continue;
+ // Iterate through all the operands of the possible reduction tree and
+ // gather all the reduced values, sorting them by their value id.
+ BasicBlock *BB = Inst->getParent();
+ bool IsCmpSelMinMax = isCmpSelMinMax(Inst);
+ SmallVector<Instruction *> Worklist(1, Inst);
+ // Checks if the operands of the \p TreeN instruction are also reduction
+ // operations or should be treated as reduced values or as extra arguments,
+ // which are not part of the reduction.
+ auto &&CheckOperands = [this, IsCmpSelMinMax,
+ BB](Instruction *TreeN,
+ SmallVectorImpl<Value *> &ExtraArgs,
+ SmallVectorImpl<Value *> &PossibleReducedVals,
+ SmallVectorImpl<Instruction *> &ReductionOps) {
+ for (int I = getFirstOperandIndex(TreeN),
+ End = getNumberOfOperands(TreeN);
+ I < End; ++I) {
+ Value *EdgeVal = getRdxOperand(TreeN, I);
+ ReducedValsToOps[EdgeVal].push_back(TreeN);
+ auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
+ // Edge has wrong parent - mark as an extra argument.
+ if (EdgeInst && !isVectorLikeInstWithConstOps(EdgeInst) &&
+ !hasSameParent(EdgeInst, BB)) {
+ ExtraArgs.push_back(EdgeVal);
+ continue;
+ }
+ // If the edge is not an instruction, differs from the main reduction
+ // opcode, or has too many uses, treat it as a possible reduced value.
+ if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||
+ IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
+ !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
+ !isVectorizable(getRdxKind(EdgeInst), EdgeInst)) {
+ PossibleReducedVals.push_back(EdgeVal);
+ continue;
+ }
+ ReductionOps.push_back(EdgeInst);
}
- RecurKind EdgeRdxKind = getRdxKind(EdgeInst);
- // Continue analysis if the next operand is a reduction operation or
- // (possibly) a leaf value. If the leaf value opcode is not set,
- // the first met operation != reduction operation is considered as the
- // leaf opcode.
- // Only handle trees in the current basic block.
- // Each tree node needs to have minimal number of users except for the
- // ultimate reduction.
- const bool IsRdxInst = EdgeRdxKind == RdxKind;
- if (EdgeInst != Phi && EdgeInst != Inst &&
- hasSameParent(EdgeInst, Inst->getParent()) &&
- hasRequiredNumberOfUses(isCmpSelMinMax(Inst), EdgeInst) &&
- (!LeafOpcode || LeafOpcode == EdgeInst->getOpcode() || IsRdxInst)) {
- if (IsRdxInst) {
- // We need to be able to reassociate the reduction operations.
- if (!isVectorizable(EdgeRdxKind, EdgeInst)) {
- // I is an extra argument for TreeN (its parent operation).
- markExtraArg(Stack.back(), EdgeInst);
- continue;
- }
- } else if (!LeafOpcode) {
- LeafOpcode = EdgeInst->getOpcode();
+ };
+ // Try to regroup the reduced values so that reducing them becomes more
+ // profitable. Values are grouped by their value ids, instructions by their
+ // opcode and/or alternate opcode, with extra analysis for loads (grouped by
+ // the distance between pointers) and compare instructions (grouped by the
+ // predicate).
+ MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
+ PossibleReducedVals;
+ initReductionOps(Inst);
+ while (!Worklist.empty()) {
+ Instruction *TreeN = Worklist.pop_back_val();
+ SmallVector<Value *> Args;
+ SmallVector<Value *> PossibleRedVals;
+ SmallVector<Instruction *> PossibleReductionOps;
+ CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);
+ // If there are too many extra args, mark the instruction itself as a
+ // reduction value, not as a reduction operation.
+ if (Args.size() < 2) {
+ addReductionOps(TreeN);
+ // Add extra args.
+ if (!Args.empty()) {
+ assert(Args.size() == 1 && "Expected only single argument.");
+ ExtraArgs[TreeN] = Args.front();
}
- Stack.push_back(
- std::make_pair(EdgeInst, getFirstOperandIndex(EdgeInst)));
- continue;
+ // Add reduction values. The values are sorted for better vectorization
+ // results.
+ for (Value *V : PossibleRedVals) {
+ size_t Key, Idx;
+ std::tie(Key, Idx) = generateKeySubkey(
+ V, &TLI,
+ [&PossibleReducedVals, &DL, &SE](size_t Key, LoadInst *LI) {
+ auto It = PossibleReducedVals.find(Key);
+ if (It != PossibleReducedVals.end()) {
+ for (const auto &LoadData : It->second) {
+ auto *RLI = cast<LoadInst>(LoadData.second.front().first);
+ if (getPointersDiff(RLI->getType(),
+ RLI->getPointerOperand(), LI->getType(),
+ LI->getPointerOperand(), DL, SE,
+ /*StrictCheck=*/true))
+ return hash_value(RLI->getPointerOperand());
+ }
+ }
+ return hash_value(LI->getPointerOperand());
+ },
+ /*AllowAlternate=*/false);
+ ++PossibleReducedVals[Key][Idx]
+ .insert(std::make_pair(V, 0))
+ .first->second;
+ }
+ Worklist.append(PossibleReductionOps.rbegin(),
+ PossibleReductionOps.rend());
+ } else {
+ size_t Key, Idx;
+ std::tie(Key, Idx) = generateKeySubkey(
+ TreeN, &TLI,
+ [&PossibleReducedVals, &DL, &SE](size_t Key, LoadInst *LI) {
+ auto It = PossibleReducedVals.find(Key);
+ if (It != PossibleReducedVals.end()) {
+ for (const auto &LoadData : It->second) {
+ auto *RLI = cast<LoadInst>(LoadData.second.front().first);
+ if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
+ LI->getType(), LI->getPointerOperand(),
+ DL, SE, /*StrictCheck=*/true))
+ return hash_value(RLI->getPointerOperand());
+ }
+ }
+ return hash_value(LI->getPointerOperand());
+ },
+ /*AllowAlternate=*/false);
+ ++PossibleReducedVals[Key][Idx]
+ .insert(std::make_pair(TreeN, 0))
+ .first->second;
+ }
+ }
+ auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
+ // Sort values by the total number of value kinds so that the reduction
+ // starts from the longest possible sequences of reduced values.
+ for (auto &PossibleReducedVals : PossibleReducedValsVect) {
+ auto PossibleRedVals = PossibleReducedVals.second.takeVector();
+ SmallVector<SmallVector<Value *>> PossibleRedValsVect;
+ for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
+ It != E; ++It) {
+ PossibleRedValsVect.emplace_back();
+ auto RedValsVect = It->second.takeVector();
+ stable_sort(RedValsVect, [](const auto &P1, const auto &P2) {
+ return P1.second < P2.second;
+ });
+ for (const std::pair<Value *, unsigned> &Data : RedValsVect)
+ PossibleRedValsVect.back().append(Data.second, Data.first);
}
- // I is an extra argument for TreeN (its parent operation).
- markExtraArg(Stack.back(), EdgeInst);
- }
+ stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
+ return P1.size() > P2.size();
+ });
+ ReducedVals.emplace_back();
+ for (ArrayRef<Value *> Data : PossibleRedValsVect)
+ ReducedVals.back().append(Data.rbegin(), Data.rend());
+ }
+ // Sort the reduced values by the number of same/alternate opcodes and/or
+ // pointer operands.
+ stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
+ return P1.size() > P2.size();
+ });
return true;
}
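// Illustrative sketch (not from the LLVM sources): the regrouping above is a
// two-level bucketing - a coarse key per "kind" of value plus a sub-key from
// opcode / load pointer / cmp predicate - after which buckets are consumed
// longest-first so the most homogeneous candidate list is tried first. A
// standalone model of that ordering (the real key generation lives in
// generateKeySubkey and is instruction-specific):
#include <algorithm>
#include <cstddef>
#include <map>
#include <vector>
template <typename ValueT>
std::vector<std::vector<ValueT>> flattenLongestFirst(
    const std::map<std::size_t, std::map<std::size_t, std::vector<ValueT>>>
        &Buckets) {
  std::vector<std::vector<ValueT>> Groups;
  for (const auto &ByKey : Buckets)
    for (const auto &BySubKey : ByKey.second)
      Groups.push_back(BySubKey.second);
  // Longest group first, mirroring the stable_sort on ReducedVals above.
  std::stable_sort(Groups.begin(), Groups.end(),
                   [](const auto &A, const auto &B) {
                     return A.size() > B.size();
                   });
  return Groups;
}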
/// Attempt to vectorize the tree found by matchAssociativeReduction.
Value *tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
+ constexpr int ReductionLimit = 4;
+ constexpr unsigned RegMaxNumber = 4;
+ constexpr unsigned RedValsMaxNumber = 128;
// If there are a sufficient number of reduction values, reduce
// to a nearby power-of-2. We can safely generate oversized
// vectors and rely on the backend to split them to legal sizes.
- unsigned NumReducedVals = ReducedVals.size();
- if (NumReducedVals < 4)
+ unsigned NumReducedVals = std::accumulate(
+ ReducedVals.begin(), ReducedVals.end(), 0,
+ [](int Num, ArrayRef<Value *> Vals) { return Num + Vals.size(); });
+ if (NumReducedVals < ReductionLimit)
return nullptr;
- // Intersect the fast-math-flags from all reduction operations.
- FastMathFlags RdxFMF;
- RdxFMF.set();
- for (ReductionOpsType &RdxOp : ReductionOps) {
- for (Value *RdxVal : RdxOp) {
- if (auto *FPMO = dyn_cast<FPMathOperator>(RdxVal))
- RdxFMF &= FPMO->getFastMathFlags();
- }
- }
-
IRBuilder<> Builder(cast<Instruction>(ReductionRoot));
- Builder.setFastMathFlags(RdxFMF);
+ // Track the reduced values in case they are replaced by extractelement
+ // instructions because of the vectorization.
+ DenseMap<Value *, WeakTrackingVH> TrackedVals;
BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
// The same extra argument may be used several times, so log each attempt
// to use it.
for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
assert(Pair.first && "DebugLoc must be set.");
ExternallyUsedValues[Pair.second].push_back(Pair.first);
+ TrackedVals.try_emplace(Pair.second, Pair.second);
}
// The compare instruction of a min/max is the insertion point for new
// instructions and may be replaced with a new compare instruction.
- auto getCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
+ auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
assert(isa<SelectInst>(RdxRootInst) &&
"Expected min/max reduction to have select root instruction");
Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
@@ -9055,164 +11014,390 @@ public:
// The reduction root is used as the insertion point for new instructions,
// so set it as externally used to prevent it from being deleted.
ExternallyUsedValues[ReductionRoot];
- SmallVector<Value *, 16> IgnoreList;
- for (ReductionOpsType &RdxOp : ReductionOps)
- IgnoreList.append(RdxOp.begin(), RdxOp.end());
-
- unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
- if (NumReducedVals > ReduxWidth) {
- // In the loop below, we are building a tree based on a window of
- // 'ReduxWidth' values.
- // If the operands of those values have common traits (compare predicate,
- // constant operand, etc), then we want to group those together to
- // minimize the cost of the reduction.
-
- // TODO: This should be extended to count common operands for
- // compares and binops.
-
- // Step 1: Count the number of times each compare predicate occurs.
- SmallDenseMap<unsigned, unsigned> PredCountMap;
- for (Value *RdxVal : ReducedVals) {
- CmpInst::Predicate Pred;
- if (match(RdxVal, m_Cmp(Pred, m_Value(), m_Value())))
- ++PredCountMap[Pred];
- }
- // Step 2: Sort the values so the most common predicates come first.
- stable_sort(ReducedVals, [&PredCountMap](Value *A, Value *B) {
- CmpInst::Predicate PredA, PredB;
- if (match(A, m_Cmp(PredA, m_Value(), m_Value())) &&
- match(B, m_Cmp(PredB, m_Value(), m_Value()))) {
- return PredCountMap[PredA] > PredCountMap[PredB];
- }
- return false;
- });
- }
+ SmallDenseSet<Value *> IgnoreList;
+ for (ReductionOpsType &RdxOps : ReductionOps)
+ for (Value *RdxOp : RdxOps) {
+ if (!RdxOp)
+ continue;
+ IgnoreList.insert(RdxOp);
+ }
+ bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
+
+ // Need to track the reduced values; they may be changed during
+ // vectorization of subvectors.
+ for (ArrayRef<Value *> Candidates : ReducedVals)
+ for (Value *V : Candidates)
+ TrackedVals.try_emplace(V, V);
+ DenseMap<Value *, unsigned> VectorizedVals;
Value *VectorizedTree = nullptr;
- unsigned i = 0;
- while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
- ArrayRef<Value *> VL(&ReducedVals[i], ReduxWidth);
- V.buildTree(VL, IgnoreList);
- if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true))
- break;
- if (V.isLoadCombineReductionCandidate(RdxKind))
- break;
- V.reorderTopToBottom();
- V.reorderBottomToTop(/*IgnoreReorder=*/true);
- V.buildExternalUses(ExternallyUsedValues);
-
- // For a poison-safe boolean logic reduction, do not replace select
- // instructions with logic ops. All reduced values will be frozen (see
- // below) to prevent leaking poison.
- if (isa<SelectInst>(ReductionRoot) &&
- isBoolLogicOp(cast<Instruction>(ReductionRoot)) &&
- NumReducedVals != ReduxWidth)
- break;
+ bool CheckForReusedReductionOps = false;
+ // Try to vectorize elements based on their type.
+ for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
+ ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
+ InstructionsState S = getSameOpcode(OrigReducedVals);
+ SmallVector<Value *> Candidates;
+ DenseMap<Value *, Value *> TrackedToOrig;
+ for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
+ Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;
+ // Check if the reduction value was not overridden by an extractelement
+ // instruction because of the vectorization, and exclude it if it is not
+ // compatible with the other values.
+ if (auto *Inst = dyn_cast<Instruction>(RdxVal))
+ if (isVectorLikeInstWithConstOps(Inst) &&
+ (!S.getOpcode() || !S.isOpcodeOrAlt(Inst)))
+ continue;
+ Candidates.push_back(RdxVal);
+ TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
+ }
+ bool ShuffledExtracts = false;
+ // Try to handle shuffled extractelements.
+ if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
+ I + 1 < E) {
+ InstructionsState NextS = getSameOpcode(ReducedVals[I + 1]);
+ if (NextS.getOpcode() == Instruction::ExtractElement &&
+ !NextS.isAltShuffle()) {
+ SmallVector<Value *> CommonCandidates(Candidates);
+ for (Value *RV : ReducedVals[I + 1]) {
+ Value *RdxVal = TrackedVals.find(RV)->second;
+ // Check if the reduction value was not overridden by an
+ // extractelement instruction because of the vectorization, and
+ // exclude it if it is not compatible with the other values.
+ if (auto *Inst = dyn_cast<Instruction>(RdxVal))
+ if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(Inst))
+ continue;
+ CommonCandidates.push_back(RdxVal);
+ TrackedToOrig.try_emplace(RdxVal, RV);
+ }
+ SmallVector<int> Mask;
+ if (isFixedVectorShuffle(CommonCandidates, Mask)) {
+ ++I;
+ Candidates.swap(CommonCandidates);
+ ShuffledExtracts = true;
+ }
+ }
+ }
+ unsigned NumReducedVals = Candidates.size();
+ if (NumReducedVals < ReductionLimit)
+ continue;
- V.computeMinimumValueSizes();
+ unsigned MaxVecRegSize = V.getMaxVecRegSize();
+ unsigned EltSize = V.getVectorElementSize(Candidates[0]);
+ unsigned MaxElts = RegMaxNumber * PowerOf2Floor(MaxVecRegSize / EltSize);
+
+ unsigned ReduxWidth = std::min<unsigned>(
+ PowerOf2Floor(NumReducedVals), std::max(RedValsMaxNumber, MaxElts));
+ unsigned Start = 0;
+ unsigned Pos = Start;
+ // Restarts the vectorization attempt with a lower vector factor.
+ unsigned PrevReduxWidth = ReduxWidth;
+ bool CheckForReusedReductionOpsLocal = false;
+ auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
+ &CheckForReusedReductionOpsLocal,
+ &PrevReduxWidth, &V,
+ &IgnoreList](bool IgnoreVL = false) {
+ bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
+ if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
+ // Check if any of the reduction ops are gathered. If so, it is worth
+ // trying again with fewer reduction ops.
+ CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
+ }
+ ++Pos;
+ if (Pos < NumReducedVals - ReduxWidth + 1)
+ return IsAnyRedOpGathered;
+ Pos = Start;
+ ReduxWidth /= 2;
+ return IsAnyRedOpGathered;
+ };
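// Illustrative sketch (not from the LLVM sources): assuming every attempt is
// rejected, the loop below slides a ReduxWidth-wide window across the
// candidates and halves the width once no start position is left, stopping at
// the ReductionLimit. The iteration order, numbers only (the MaxElts cap and
// the early exits are ignored here):
#include <cstdio>
int main() {
  unsigned NumReducedVals = 10, ReductionLimit = 4;
  unsigned ReduxWidth = 8; // PowerOf2Floor(10)
  unsigned Pos = 0;
  while (Pos < NumReducedVals - ReduxWidth + 1 && ReduxWidth >= ReductionLimit) {
    std::printf("try Candidates[%u..%u)\n", Pos, Pos + ReduxWidth);
    ++Pos;                                   // next start position...
    if (Pos >= NumReducedVals - ReduxWidth + 1) {
      Pos = 0;                               // ...or restart at half width
      ReduxWidth /= 2;
    }
  }
  return 0;
}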
+ while (Pos < NumReducedVals - ReduxWidth + 1 &&
+ ReduxWidth >= ReductionLimit) {
+ // Dependency in the tree of the reduction ops - drop this attempt and try
+ // again later.
+ if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
+ Start == 0) {
+ CheckForReusedReductionOps = true;
+ break;
+ }
+ PrevReduxWidth = ReduxWidth;
+ ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
+ // Already analyzed - skip.
+ if (V.areAnalyzedReductionVals(VL)) {
+ (void)AdjustReducedVals(/*IgnoreVL=*/true);
+ continue;
+ }
+ // Early exit if any of the reduction values were deleted during
+ // previous vectorization attempts.
+ if (any_of(VL, [&V](Value *RedVal) {
+ auto *RedValI = dyn_cast<Instruction>(RedVal);
+ if (!RedValI)
+ return false;
+ return V.isDeleted(RedValI);
+ }))
+ break;
+ V.buildTree(VL, IgnoreList);
+ if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
+ if (!AdjustReducedVals())
+ V.analyzedReductionVals(VL);
+ continue;
+ }
+ if (V.isLoadCombineReductionCandidate(RdxKind)) {
+ if (!AdjustReducedVals())
+ V.analyzedReductionVals(VL);
+ continue;
+ }
+ V.reorderTopToBottom();
+ // No need to reorder the root node at all.
+ V.reorderBottomToTop(/*IgnoreReorder=*/true);
+ // Keep the other extracted reduction values if they are used in the
+ // vectorization trees.
+ BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
+ ExternallyUsedValues);
+ for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
+ if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
+ continue;
+ for_each(ReducedVals[Cnt],
+ [&LocalExternallyUsedValues, &TrackedVals](Value *V) {
+ if (isa<Instruction>(V))
+ LocalExternallyUsedValues[TrackedVals[V]];
+ });
+ }
+ // Number of uses of the candidates in the vector of values.
+ SmallDenseMap<Value *, unsigned> NumUses;
+ for (unsigned Cnt = 0; Cnt < Pos; ++Cnt) {
+ Value *V = Candidates[Cnt];
+ if (NumUses.count(V) > 0)
+ continue;
+ NumUses[V] = std::count(VL.begin(), VL.end(), V);
+ }
+ for (unsigned Cnt = Pos + ReduxWidth; Cnt < NumReducedVals; ++Cnt) {
+ Value *V = Candidates[Cnt];
+ if (NumUses.count(V) > 0)
+ continue;
+ NumUses[V] = std::count(VL.begin(), VL.end(), V);
+ }
+ // Gather externally used values.
+ SmallPtrSet<Value *, 4> Visited;
+ for (unsigned Cnt = 0; Cnt < Pos; ++Cnt) {
+ Value *V = Candidates[Cnt];
+ if (!Visited.insert(V).second)
+ continue;
+ unsigned NumOps = VectorizedVals.lookup(V) + NumUses[V];
+ if (NumOps != ReducedValsToOps.find(V)->second.size())
+ LocalExternallyUsedValues[V];
+ }
+ for (unsigned Cnt = Pos + ReduxWidth; Cnt < NumReducedVals; ++Cnt) {
+ Value *V = Candidates[Cnt];
+ if (!Visited.insert(V).second)
+ continue;
+ unsigned NumOps = VectorizedVals.lookup(V) + NumUses[V];
+ if (NumOps != ReducedValsToOps.find(V)->second.size())
+ LocalExternallyUsedValues[V];
+ }
+ V.buildExternalUses(LocalExternallyUsedValues);
+
+ V.computeMinimumValueSizes();
+
+ // Intersect the fast-math-flags from all reduction operations.
+ FastMathFlags RdxFMF;
+ RdxFMF.set();
+ for (Value *U : IgnoreList)
+ if (auto *FPMO = dyn_cast<FPMathOperator>(U))
+ RdxFMF &= FPMO->getFastMathFlags();
+ // Estimate cost.
+ InstructionCost TreeCost = V.getTreeCost(VL);
+ InstructionCost ReductionCost =
+ getReductionCost(TTI, VL, ReduxWidth, RdxFMF);
+ InstructionCost Cost = TreeCost + ReductionCost;
+ if (!Cost.isValid()) {
+ LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n");
+ return nullptr;
+ }
+ if (Cost >= -SLPCostThreshold) {
+ V.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(
+ SV_NAME, "HorSLPNotBeneficial",
+ ReducedValsToOps.find(VL[0])->second.front())
+ << "Vectorizing horizontal reduction is possible"
+ << "but not beneficial with cost " << ore::NV("Cost", Cost)
+ << " and threshold "
+ << ore::NV("Threshold", -SLPCostThreshold);
+ });
+ if (!AdjustReducedVals())
+ V.analyzedReductionVals(VL);
+ continue;
+ }
- // Estimate cost.
- InstructionCost TreeCost =
- V.getTreeCost(makeArrayRef(&ReducedVals[i], ReduxWidth));
- InstructionCost ReductionCost =
- getReductionCost(TTI, ReducedVals[i], ReduxWidth, RdxFMF);
- InstructionCost Cost = TreeCost + ReductionCost;
- if (!Cost.isValid()) {
- LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n");
- return nullptr;
- }
- if (Cost >= -SLPCostThreshold) {
+ LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
+ << Cost << ". (HorRdx)\n");
V.getORE()->emit([&]() {
- return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
- cast<Instruction>(VL[0]))
- << "Vectorizing horizontal reduction is possible"
- << "but not beneficial with cost " << ore::NV("Cost", Cost)
- << " and threshold "
- << ore::NV("Threshold", -SLPCostThreshold);
+ return OptimizationRemark(
+ SV_NAME, "VectorizedHorizontalReduction",
+ ReducedValsToOps.find(VL[0])->second.front())
+ << "Vectorized horizontal reduction with cost "
+ << ore::NV("Cost", Cost) << " and with tree size "
+ << ore::NV("TreeSize", V.getTreeSize());
});
- break;
- }
- LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
- << Cost << ". (HorRdx)\n");
- V.getORE()->emit([&]() {
- return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
- cast<Instruction>(VL[0]))
- << "Vectorized horizontal reduction with cost "
- << ore::NV("Cost", Cost) << " and with tree size "
- << ore::NV("TreeSize", V.getTreeSize());
- });
+ Builder.setFastMathFlags(RdxFMF);
- // Vectorize a tree.
- DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
- Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues);
+ // Vectorize a tree.
+ Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues);
- // Emit a reduction. If the root is a select (min/max idiom), the insert
- // point is the compare condition of that select.
- Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
- if (isCmpSelMinMax(RdxRootInst))
- Builder.SetInsertPoint(getCmpForMinMaxReduction(RdxRootInst));
- else
- Builder.SetInsertPoint(RdxRootInst);
+ // Emit a reduction. If the root is a select (min/max idiom), the insert
+ // point is the compare condition of that select.
+ Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
+ if (IsCmpSelMinMax)
+ Builder.SetInsertPoint(GetCmpForMinMaxReduction(RdxRootInst));
+ else
+ Builder.SetInsertPoint(RdxRootInst);
- // To prevent poison from leaking across what used to be sequential, safe,
- // scalar boolean logic operations, the reduction operand must be frozen.
- if (isa<SelectInst>(RdxRootInst) && isBoolLogicOp(RdxRootInst))
- VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
+ // To prevent poison from leaking across what used to be sequential,
+ // safe, scalar boolean logic operations, the reduction operand must be
+ // frozen.
+ if (isa<SelectInst>(RdxRootInst) && isBoolLogicOp(RdxRootInst))
+ VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
- Value *ReducedSubTree =
- emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
+ Value *ReducedSubTree =
+ emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
- if (!VectorizedTree) {
- // Initialize the final value in the reduction.
- VectorizedTree = ReducedSubTree;
- } else {
- // Update the final value in the reduction.
- Builder.SetCurrentDebugLocation(Loc);
- VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
- ReducedSubTree, "op.rdx", ReductionOps);
+ if (!VectorizedTree) {
+ // Initialize the final value in the reduction.
+ VectorizedTree = ReducedSubTree;
+ } else {
+ // Update the final value in the reduction.
+ Builder.SetCurrentDebugLocation(
+ cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
+ VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
+ ReducedSubTree, "op.rdx", ReductionOps);
+ }
+ // Count vectorized reduced values to exclude them from final reduction.
+ for (Value *V : VL)
+ ++VectorizedVals.try_emplace(TrackedToOrig.find(V)->second, 0)
+ .first->getSecond();
+ Pos += ReduxWidth;
+ Start = Pos;
+ ReduxWidth = PowerOf2Floor(NumReducedVals - Pos);
}
- i += ReduxWidth;
- ReduxWidth = PowerOf2Floor(NumReducedVals - i);
}
-
if (VectorizedTree) {
// Finish the reduction.
- for (; i < NumReducedVals; ++i) {
- auto *I = cast<Instruction>(ReducedVals[i]);
- Builder.SetCurrentDebugLocation(I->getDebugLoc());
- VectorizedTree =
- createOp(Builder, RdxKind, VectorizedTree, I, "", ReductionOps);
+ // Need to add the extra arguments and the possible reduction values that
+ // were not vectorized.
+ // Try to avoid dependencies between the scalar remainders after
+ // reductions.
+ auto &&FinalGen =
+ [this, &Builder,
+ &TrackedVals](ArrayRef<std::pair<Instruction *, Value *>> InstVals) {
+ unsigned Sz = InstVals.size();
+ SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
+ Sz % 2);
+ for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
+ Instruction *RedOp = InstVals[I + 1].first;
+ Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
+ Value *RdxVal1 = InstVals[I].second;
+ Value *StableRdxVal1 = RdxVal1;
+ auto It1 = TrackedVals.find(RdxVal1);
+ if (It1 != TrackedVals.end())
+ StableRdxVal1 = It1->second;
+ Value *RdxVal2 = InstVals[I + 1].second;
+ Value *StableRdxVal2 = RdxVal2;
+ auto It2 = TrackedVals.find(RdxVal2);
+ if (It2 != TrackedVals.end())
+ StableRdxVal2 = It2->second;
+ Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
+ StableRdxVal2, "op.rdx", ReductionOps);
+ ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
+ }
+ if (Sz % 2 == 1)
+ ExtraReds[Sz / 2] = InstVals.back();
+ return ExtraReds;
+ };
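A hedged, self-contained sketch of what FinalGen does with the scalar remainders: each round combines values in pairs and carries an odd element over unchanged, so the leftover work forms a log-depth tree rather than one long dependency chain (plain integers stand in for the IR values).

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // One pairing round, mirroring the Sz / 2 + Sz % 2 sizing above.
    static std::vector<int> foldOnce(const std::vector<int> &Vals) {
      std::vector<int> Out;
      for (std::size_t I = 0, E = (Vals.size() / 2) * 2; I < E; I += 2)
        Out.push_back(Vals[I] + Vals[I + 1]); // stand-in for createOp(RdxKind)
      if (Vals.size() % 2)
        Out.push_back(Vals.back()); // odd element carried to the next round
      return Out;
    }

    int main() {
      std::vector<int> Leftovers{1, 2, 3, 4, 5};
      while (Leftovers.size() > 1)
        Leftovers = foldOnce(Leftovers);
      std::printf("%d\n", Leftovers.front()); // 15, after ceil(log2(5)) rounds
      return 0;
    }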
+ SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
+ SmallPtrSet<Value *, 8> Visited;
+ for (ArrayRef<Value *> Candidates : ReducedVals) {
+ for (Value *RdxVal : Candidates) {
+ if (!Visited.insert(RdxVal).second)
+ continue;
+ unsigned NumOps = VectorizedVals.lookup(RdxVal);
+ for (Instruction *RedOp :
+ makeArrayRef(ReducedValsToOps.find(RdxVal)->second)
+ .drop_back(NumOps))
+ ExtraReductions.emplace_back(RedOp, RdxVal);
+ }
}
for (auto &Pair : ExternallyUsedValues) {
// Add each externally used value to the final reduction.
- for (auto *I : Pair.second) {
- Builder.SetCurrentDebugLocation(I->getDebugLoc());
- VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
- Pair.first, "op.extra", I);
- }
+ for (auto *I : Pair.second)
+ ExtraReductions.emplace_back(I, Pair.first);
+ }
+      // Iterate through all non-vectorized reduction values and extra arguments.
+ while (ExtraReductions.size() > 1) {
+ SmallVector<std::pair<Instruction *, Value *>> NewReds =
+ FinalGen(ExtraReductions);
+ ExtraReductions.swap(NewReds);
+ }
+ // Final reduction.
+ if (ExtraReductions.size() == 1) {
+ Instruction *RedOp = ExtraReductions.back().first;
+ Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
+ Value *RdxVal = ExtraReductions.back().second;
+ Value *StableRdxVal = RdxVal;
+ auto It = TrackedVals.find(RdxVal);
+ if (It != TrackedVals.end())
+ StableRdxVal = It->second;
+ VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
+ StableRdxVal, "op.rdx", ReductionOps);
}
ReductionRoot->replaceAllUsesWith(VectorizedTree);
- // Mark all scalar reduction ops for deletion, they are replaced by the
- // vector reductions.
- V.eraseInstructions(IgnoreList);
+ // The original scalar reduction is expected to have no remaining
+ // uses outside the reduction tree itself. Assert that we got this
+ // correct, replace internal uses with undef, and mark for eventual
+ // deletion.
+#ifndef NDEBUG
+ SmallSet<Value *, 4> IgnoreSet;
+ for (ArrayRef<Value *> RdxOps : ReductionOps)
+ IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
+#endif
+ for (ArrayRef<Value *> RdxOps : ReductionOps) {
+ for (Value *Ignore : RdxOps) {
+ if (!Ignore)
+ continue;
+#ifndef NDEBUG
+ for (auto *U : Ignore->users()) {
+ assert(IgnoreSet.count(U) &&
+ "All users must be either in the reduction ops list.");
+ }
+#endif
+ if (!Ignore->use_empty()) {
+ Value *Undef = UndefValue::get(Ignore->getType());
+ Ignore->replaceAllUsesWith(Undef);
+ }
+ V.eraseInstruction(cast<Instruction>(Ignore));
+ }
+ }
+ } else if (!CheckForReusedReductionOps) {
+ for (ReductionOpsType &RdxOps : ReductionOps)
+ for (Value *RdxOp : RdxOps)
+ V.analyzedReductionRoot(cast<Instruction>(RdxOp));
}
return VectorizedTree;
}
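A hedged note on the bookkeeping just above: recording reduction ops with analyzedReductionRoot when no vectorization happened, and checking isAnalyzedReductionRoot before retrying (see the TryToReduce lambda further down), reads like plain memoization of roots that already failed. A minimal sketch with a generic set and illustrative names:

    #include <unordered_set>

    struct AnalyzedRootCache {
      std::unordered_set<const void *> Analyzed;
      // Returns true if Root was already recorded, recording it otherwise.
      bool alreadyTried(const void *Root) {
        return !Analyzed.insert(Root).second;
      }
    };
    // Usage sketch: if (Cache.alreadyTried(Inst)) skip re-analysis of Inst.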
- unsigned numReductionValues() const { return ReducedVals.size(); }
-
private:
/// Calculate the cost of a reduction.
InstructionCost getReductionCost(TargetTransformInfo *TTI,
- Value *FirstReducedVal, unsigned ReduxWidth,
- FastMathFlags FMF) {
+ ArrayRef<Value *> ReducedVals,
+ unsigned ReduxWidth, FastMathFlags FMF) {
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ Value *FirstReducedVal = ReducedVals.front();
Type *ScalarTy = FirstReducedVal->getType();
FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth);
- InstructionCost VectorCost, ScalarCost;
+ InstructionCost VectorCost = 0, ScalarCost;
+ // If all of the reduced values are constant, the vector cost is 0, since
+    // the reduction value can be calculated at compile time.
+ bool AllConsts = all_of(ReducedVals, isConstant);
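A hedged worked example for the AllConsts shortcut above: when every reduced value is a constant, the whole reduction folds during compilation, so there is no vector code to cost and VectorCost can legitimately stay at 0.

    // All-constant "reduction": the result is known at compile time.
    constexpr int Vals[] = {1, 2, 3, 4};
    constexpr int Folded = Vals[0] + Vals[1] + Vals[2] + Vals[3];
    static_assert(Folded == 10, "reduction value folded during compilation");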
switch (RdxKind) {
case RecurKind::Add:
case RecurKind::Mul:
@@ -9222,17 +11407,22 @@ private:
case RecurKind::FAdd:
case RecurKind::FMul: {
unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
- VectorCost =
- TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
+ if (!AllConsts)
+ VectorCost =
+ TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
break;
}
case RecurKind::FMax:
case RecurKind::FMin: {
auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy);
- auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
- VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
- /*IsUnsigned=*/false, CostKind);
+ if (!AllConsts) {
+ auto *VecCondTy =
+ cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
+ VectorCost =
+ TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
+ /*IsUnsigned=*/false, CostKind);
+ }
CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
ScalarCost = TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy,
SclCondTy, RdxPred, CostKind) +
@@ -9245,11 +11435,14 @@ private:
case RecurKind::UMax:
case RecurKind::UMin: {
auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy);
- auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
- bool IsUnsigned =
- RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin;
- VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, IsUnsigned,
- CostKind);
+ if (!AllConsts) {
+ auto *VecCondTy =
+ cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
+ bool IsUnsigned =
+ RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin;
+ VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
+ IsUnsigned, CostKind);
+ }
CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
ScalarCost = TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
SclCondTy, RdxPred, CostKind) +
@@ -9463,7 +11656,8 @@ static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
/// performed.
static bool tryToVectorizeHorReductionOrInstOperands(
PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
- TargetTransformInfo *TTI,
+ TargetTransformInfo *TTI, ScalarEvolution &SE, const DataLayout &DL,
+ const TargetLibraryInfo &TLI,
const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) {
if (!ShouldVectorizeHor)
return false;
@@ -9482,7 +11676,7 @@ static bool tryToVectorizeHorReductionOrInstOperands(
// horizontal reduction.
// Interrupt the process if the Root instruction itself was vectorized or all
  // sub-trees not higher than RecursionMaxDepth were analyzed/vectorized.
- // Skip the analysis of CmpInsts.Compiler implements postanalysis of the
+ // Skip the analysis of CmpInsts. Compiler implements postanalysis of the
// CmpInsts so we can skip extra attempts in
// tryToVectorizeHorReductionOrInstOperands and save compile time.
std::queue<std::pair<Instruction *, unsigned>> Stack;
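A standalone sketch (illustrative, not the pass's code) of the bounded-depth worklist walk this Stack drives: each entry pairs a node with its depth, and operands are only enqueued while the depth stays below the recursion limit.

    #include <cstdio>
    #include <queue>
    #include <utility>

    int main() {
      const unsigned MaxDepth = 2; // stand-in for RecursionMaxDepth
      std::queue<std::pair<int, unsigned>> Work;
      Work.emplace(0, 0u); // (node, depth); integers stand in for instructions
      while (!Work.empty()) {
        auto [Node, Level] = Work.front();
        Work.pop();
        std::printf("visit node %d at depth %u\n", Node, Level);
        if (Level < MaxDepth)
          for (int Op : {2 * Node + 1, 2 * Node + 2}) // pretend operands
            Work.emplace(Op, Level + 1);
      }
      return 0;
    }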
@@ -9490,13 +11684,16 @@ static bool tryToVectorizeHorReductionOrInstOperands(
SmallPtrSet<Value *, 8> VisitedInstrs;
SmallVector<WeakTrackingVH> PostponedInsts;
bool Res = false;
- auto &&TryToReduce = [TTI, &P, &R](Instruction *Inst, Value *&B0,
- Value *&B1) -> Value * {
+ auto &&TryToReduce = [TTI, &SE, &DL, &P, &R, &TLI](Instruction *Inst,
+ Value *&B0,
+ Value *&B1) -> Value * {
+ if (R.isAnalyzedReductionRoot(Inst))
+ return nullptr;
bool IsBinop = matchRdxBop(Inst, B0, B1);
bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value()));
if (IsBinop || IsSelect) {
HorizontalReduction HorRdx;
- if (HorRdx.matchAssociativeReduction(P, Inst))
+ if (HorRdx.matchAssociativeReduction(P, Inst, SE, DL, TLI))
return HorRdx.tryToReduce(R, TTI);
}
return nullptr;
@@ -9541,7 +11738,7 @@ static bool tryToVectorizeHorReductionOrInstOperands(
// Do not try to vectorize CmpInst operands, this is done separately.
// Final attempt for binop args vectorization should happen after the loop
// to try to find reductions.
- if (!isa<CmpInst>(Inst))
+ if (!isa<CmpInst, InsertElementInst, InsertValueInst>(Inst))
PostponedInsts.push_back(Inst);
}
@@ -9554,8 +11751,8 @@ static bool tryToVectorizeHorReductionOrInstOperands(
if (auto *I = dyn_cast<Instruction>(Op))
// Do not try to vectorize CmpInst operands, this is done
// separately.
- if (!isa<PHINode>(I) && !isa<CmpInst>(I) && !R.isDeleted(I) &&
- I->getParent() == BB)
+ if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
+ !R.isDeleted(I) && I->getParent() == BB)
Stack.emplace(I, Level);
}
  // Try to vectorize binops where reductions were not found.
@@ -9579,8 +11776,8 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool {
return tryToVectorize(I, R);
};
- return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI,
- ExtraVectorization);
+ return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI, *SE, *DL,
+ *TLI, ExtraVectorization);
}
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
@@ -9748,12 +11945,16 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions(
for (auto *I : reverse(Instructions)) {
if (R.isDeleted(I))
continue;
- if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I))
+ if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
- else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I))
+ } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
- else if (isa<CmpInst>(I))
+ } else if (isa<CmpInst>(I)) {
PostponedCmps.push_back(I);
+ continue;
+ }
+      // Try to find reductions in buildvector sequences.
+ OpsChanged |= vectorizeRootInstruction(nullptr, I, BB, R, TTI);
}
if (AtTerminator) {
// Try to find reductions first.
@@ -10171,7 +12372,7 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
DT->getNode(I2->getParent());
assert(NodeI1 && "Should only process reachable instructions");
- assert(NodeI1 && "Should only process reachable instructions");
+ assert(NodeI2 && "Should only process reachable instructions");
assert((NodeI1 == NodeI2) ==
(NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
"Different nodes should have different DFS numbers");