| author | Dimitry Andric <dim@FreeBSD.org> | 2022-07-04 19:20:19 +0000 |
| --- | --- | --- |
| committer | Dimitry Andric <dim@FreeBSD.org> | 2023-02-08 19:02:26 +0000 |
| commit | 81ad626541db97eb356e2c1d4a20eb2a26a766ab (patch) | |
| tree | 311b6a8987c32b1e1dcbab65c54cfac3fdb56175 /contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | |
| parent | 5fff09660e06a66bed6482da9c70df328e16bbb6 (diff) | |
| parent | 145449b1e420787bb99721a429341fa6be3adfb6 (diff) | |
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 4331 |
1 file changed, 3266 insertions, 1065 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 644372483edd..019a09665a67 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -53,7 +53,6 @@
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
@@ -64,7 +63,6 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Module.h"
-#include "llvm/IR/NoFolder.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Type.h"
@@ -72,8 +70,9 @@
 #include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
 #include "llvm/IR/ValueHandle.h"
+#ifdef EXPENSIVE_CHECKS
 #include "llvm/IR/Verifier.h"
-#include "llvm/InitializePasses.h"
+#endif
 #include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
@@ -87,6 +86,7 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Vectorize.h"
 #include <algorithm>
@@ -164,13 +164,14 @@ static cl::opt<int> LookAheadMaxDepth(
     "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
     cl::desc("The maximum look-ahead depth for operand reordering scores"));

-// The Look-ahead heuristic goes through the users of the bundle to calculate
-// the users cost in getExternalUsesCost(). To avoid compilation time increase
-// we limit the number of users visited to this value.
-static cl::opt<unsigned> LookAheadUsersBudget(
-    "slp-look-ahead-users-budget", cl::init(2), cl::Hidden,
-    cl::desc("The maximum number of users to visit while visiting the "
-             "predecessors. This prevents compilation time increase."));
+// The maximum depth that the look-ahead score heuristic will explore
+// when it probing among candidates for vectorization tree roots.
+// The higher this value, the higher the compilation time overhead but unlike
+// similar limit for operands ordering this is less frequently used, hence
+// impact of higher value is less noticeable.
+static cl::opt<int> RootLookAheadMaxDepth(
+    "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
+    cl::desc("The maximum look-ahead depth for searching best rooting option"));

 static cl::opt<bool>
     ViewSLPTree("view-slp-tree", cl::Hidden,
@@ -471,17 +472,36 @@ static bool isValidForAlternation(unsigned Opcode) {
   return true;
 }

+static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
+                                       unsigned BaseIndex = 0);
+
+/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
+/// compatible instructions or constants, or just some other regular values.
+static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, + Value *Op1) { + return (isConstant(BaseOp0) && isConstant(Op0)) || + (isConstant(BaseOp1) && isConstant(Op1)) || + (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) && + !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) || + getSameOpcode({BaseOp0, Op0}).getOpcode() || + getSameOpcode({BaseOp1, Op1}).getOpcode(); +} + /// \returns analysis of the Instructions in \p VL described in /// InstructionsState, the Opcode that we suppose the whole list /// could be vectorized even if its structure is diverse. static InstructionsState getSameOpcode(ArrayRef<Value *> VL, - unsigned BaseIndex = 0) { + unsigned BaseIndex) { // Make sure these are all Instructions. if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); })) return InstructionsState(VL[BaseIndex], nullptr, nullptr); bool IsCastOp = isa<CastInst>(VL[BaseIndex]); bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]); + bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]); + CmpInst::Predicate BasePred = + IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate() + : CmpInst::BAD_ICMP_PREDICATE; unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode(); unsigned AltOpcode = Opcode; unsigned AltIndex = BaseIndex; @@ -514,6 +534,57 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL, continue; } } + } else if (IsCmpOp && isa<CmpInst>(VL[Cnt])) { + auto *BaseInst = cast<Instruction>(VL[BaseIndex]); + auto *Inst = cast<Instruction>(VL[Cnt]); + Type *Ty0 = BaseInst->getOperand(0)->getType(); + Type *Ty1 = Inst->getOperand(0)->getType(); + if (Ty0 == Ty1) { + Value *BaseOp0 = BaseInst->getOperand(0); + Value *BaseOp1 = BaseInst->getOperand(1); + Value *Op0 = Inst->getOperand(0); + Value *Op1 = Inst->getOperand(1); + CmpInst::Predicate CurrentPred = + cast<CmpInst>(VL[Cnt])->getPredicate(); + CmpInst::Predicate SwappedCurrentPred = + CmpInst::getSwappedPredicate(CurrentPred); + // Check for compatible operands. If the corresponding operands are not + // compatible - need to perform alternate vectorization. + if (InstOpcode == Opcode) { + if (BasePred == CurrentPred && + areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1)) + continue; + if (BasePred == SwappedCurrentPred && + areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0)) + continue; + if (E == 2 && + (BasePred == CurrentPred || BasePred == SwappedCurrentPred)) + continue; + auto *AltInst = cast<CmpInst>(VL[AltIndex]); + CmpInst::Predicate AltPred = AltInst->getPredicate(); + Value *AltOp0 = AltInst->getOperand(0); + Value *AltOp1 = AltInst->getOperand(1); + // Check if operands are compatible with alternate operands. 
+ if (AltPred == CurrentPred && + areCompatibleCmpOps(AltOp0, AltOp1, Op0, Op1)) + continue; + if (AltPred == SwappedCurrentPred && + areCompatibleCmpOps(AltOp0, AltOp1, Op1, Op0)) + continue; + } + if (BaseIndex == AltIndex && BasePred != CurrentPred) { + assert(isValidForAlternation(Opcode) && + isValidForAlternation(InstOpcode) && + "Cast isn't safe for alternation, logic needs to be updated!"); + AltIndex = Cnt; + continue; + } + auto *AltInst = cast<CmpInst>(VL[AltIndex]); + CmpInst::Predicate AltPred = AltInst->getPredicate(); + if (BasePred == CurrentPred || BasePred == SwappedCurrentPred || + AltPred == CurrentPred || AltPred == SwappedCurrentPred) + continue; + } } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) continue; return InstructionsState(VL[BaseIndex], nullptr, nullptr); @@ -570,7 +641,7 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, CallInst *CI = cast<CallInst>(UserInst); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) { - if (hasVectorInstrinsicScalarOpd(ID, i)) + if (isVectorIntrinsicWithScalarOpAtArg(ID, i)) return (CI->getArgOperand(i) == Scalar); } LLVM_FALLTHROUGH; @@ -666,11 +737,11 @@ static void inversePermutation(ArrayRef<unsigned> Indices, /// \returns inserting index of InsertElement or InsertValue instruction, /// using Offset as base offset for index. -static Optional<unsigned> getInsertIndex(Value *InsertInst, +static Optional<unsigned> getInsertIndex(const Value *InsertInst, unsigned Offset = 0) { int Index = Offset; - if (auto *IE = dyn_cast<InsertElementInst>(InsertInst)) { - if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) { + if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) { + if (const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) { auto *VT = cast<FixedVectorType>(IE->getType()); if (CI->getValue().uge(VT->getNumElements())) return None; @@ -681,13 +752,13 @@ static Optional<unsigned> getInsertIndex(Value *InsertInst, return None; } - auto *IV = cast<InsertValueInst>(InsertInst); + const auto *IV = cast<InsertValueInst>(InsertInst); Type *CurrentType = IV->getType(); for (unsigned I : IV->indices()) { - if (auto *ST = dyn_cast<StructType>(CurrentType)) { + if (const auto *ST = dyn_cast<StructType>(CurrentType)) { Index *= ST->getNumElements(); CurrentType = ST->getElementType(I); - } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) { + } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) { Index *= AT->getNumElements(); CurrentType = AT->getElementType(); } else { @@ -698,11 +769,7 @@ static Optional<unsigned> getInsertIndex(Value *InsertInst, return Index; } -/// Reorders the list of scalars in accordance with the given \p Order and then -/// the \p Mask. \p Order - is the original order of the scalars, need to -/// reorder scalars into an unordered state at first according to the given -/// order. Then the ordered scalars are shuffled once again in accordance with -/// the provided mask. +/// Reorders the list of scalars in accordance with the given \p Mask. static void reorderScalars(SmallVectorImpl<Value *> &Scalars, ArrayRef<int> Mask) { assert(!Mask.empty() && "Expected non-empty mask."); @@ -714,6 +781,58 @@ static void reorderScalars(SmallVectorImpl<Value *> &Scalars, Scalars[Mask[I]] = Prev[I]; } +/// Checks if the provided value does not require scheduling. 
It does not +/// require scheduling if this is not an instruction or it is an instruction +/// that does not read/write memory and all operands are either not instructions +/// or phi nodes or instructions from different blocks. +static bool areAllOperandsNonInsts(Value *V) { + auto *I = dyn_cast<Instruction>(V); + if (!I) + return true; + return !mayHaveNonDefUseDependency(*I) && + all_of(I->operands(), [I](Value *V) { + auto *IO = dyn_cast<Instruction>(V); + if (!IO) + return true; + return isa<PHINode>(IO) || IO->getParent() != I->getParent(); + }); +} + +/// Checks if the provided value does not require scheduling. It does not +/// require scheduling if this is not an instruction or it is an instruction +/// that does not read/write memory and all users are phi nodes or instructions +/// from the different blocks. +static bool isUsedOutsideBlock(Value *V) { + auto *I = dyn_cast<Instruction>(V); + if (!I) + return true; + // Limits the number of uses to save compile time. + constexpr int UsesLimit = 8; + return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) && + all_of(I->users(), [I](User *U) { + auto *IU = dyn_cast<Instruction>(U); + if (!IU) + return true; + return IU->getParent() != I->getParent() || isa<PHINode>(IU); + }); +} + +/// Checks if the specified value does not require scheduling. It does not +/// require scheduling if all operands and all users do not need to be scheduled +/// in the current basic block. +static bool doesNotNeedToBeScheduled(Value *V) { + return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V); +} + +/// Checks if the specified array of instructions does not require scheduling. +/// It is so if all either instructions have operands that do not require +/// scheduling or their users do not require scheduling since they are phis or +/// in other basic blocks. +static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) { + return !VL.empty() && + (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts)); +} + namespace slpvectorizer { /// Bottom Up SLP Vectorizer. @@ -734,8 +853,8 @@ public: TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE) - : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), - DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) { + : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), + DT(Dt), AC(AC), DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) { CodeMetrics::collectEphemeralValues(F, AC, EphValues); // Use the vector register size specified by the target unless overridden // by a command-line option. @@ -776,7 +895,10 @@ public: /// Construct a vectorizable tree that starts at \p Roots, ignoring users for /// the purpose of scheduling and extraction in the \p UserIgnoreLst. void buildTree(ArrayRef<Value *> Roots, - ArrayRef<Value *> UserIgnoreLst = None); + const SmallDenseSet<Value *> &UserIgnoreLst); + + /// Construct a vectorizable tree that starts at \p Roots. + void buildTree(ArrayRef<Value *> Roots); /// Builds external uses of the vectorized scalars, i.e. the list of /// vectorized scalars to be extracted, their lanes and their scalar users. \p @@ -797,6 +919,7 @@ public: } MinBWs.clear(); InstrElementSize.clear(); + UserIgnoreList = nullptr; } unsigned getTreeSize() const { return VectorizableTree.size(); } @@ -810,6 +933,9 @@ public: /// ExtractElement, ExtractValue), which can be part of the graph. 
Optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE); + /// Sort loads into increasing pointers offsets to allow greater clustering. + Optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE); + /// Gets reordering data for the given tree entry. If the entry is vectorized /// - just return ReorderIndices, otherwise check if the scalars can be /// reordered and return the most optimal order. @@ -924,96 +1050,18 @@ public: #endif }; - /// A helper data structure to hold the operands of a vector of instructions. - /// This supports a fixed vector length for all operand vectors. - class VLOperands { - /// For each operand we need (i) the value, and (ii) the opcode that it - /// would be attached to if the expression was in a left-linearized form. - /// This is required to avoid illegal operand reordering. - /// For example: - /// \verbatim - /// 0 Op1 - /// |/ - /// Op1 Op2 Linearized + Op2 - /// \ / ----------> |/ - /// - - - /// - /// Op1 - Op2 (0 + Op1) - Op2 - /// \endverbatim - /// - /// Value Op1 is attached to a '+' operation, and Op2 to a '-'. - /// - /// Another way to think of this is to track all the operations across the - /// path from the operand all the way to the root of the tree and to - /// calculate the operation that corresponds to this path. For example, the - /// path from Op2 to the root crosses the RHS of the '-', therefore the - /// corresponding operation is a '-' (which matches the one in the - /// linearized tree, as shown above). - /// - /// For lack of a better term, we refer to this operation as Accumulated - /// Path Operation (APO). - struct OperandData { - OperandData() = default; - OperandData(Value *V, bool APO, bool IsUsed) - : V(V), APO(APO), IsUsed(IsUsed) {} - /// The operand value. - Value *V = nullptr; - /// TreeEntries only allow a single opcode, or an alternate sequence of - /// them (e.g, +, -). Therefore, we can safely use a boolean value for the - /// APO. It is set to 'true' if 'V' is attached to an inverse operation - /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise - /// (e.g., Add/Mul) - bool APO = false; - /// Helper data for the reordering function. - bool IsUsed = false; - }; - - /// During operand reordering, we are trying to select the operand at lane - /// that matches best with the operand at the neighboring lane. Our - /// selection is based on the type of value we are looking for. For example, - /// if the neighboring lane has a load, we need to look for a load that is - /// accessing a consecutive address. These strategies are summarized in the - /// 'ReorderingMode' enumerator. - enum class ReorderingMode { - Load, ///< Matching loads to consecutive memory addresses - Opcode, ///< Matching instructions based on opcode (same or alternate) - Constant, ///< Matching constants - Splat, ///< Matching the same instruction multiple times (broadcast) - Failed, ///< We failed to create a vectorizable group - }; - - using OperandDataVec = SmallVector<OperandData, 2>; - - /// A vector of operand vectors. - SmallVector<OperandDataVec, 4> OpsVec; - + /// A helper class used for scoring candidates for two consecutive lanes. + class LookAheadHeuristics { const DataLayout &DL; ScalarEvolution &SE; const BoUpSLP &R; + int NumLanes; // Total number of lanes (aka vectorization factor). + int MaxLevel; // The maximum recursion depth for accumulating score. - /// \returns the operand data at \p OpIdx and \p Lane. 
- OperandData &getData(unsigned OpIdx, unsigned Lane) { - return OpsVec[OpIdx][Lane]; - } - - /// \returns the operand data at \p OpIdx and \p Lane. Const version. - const OperandData &getData(unsigned OpIdx, unsigned Lane) const { - return OpsVec[OpIdx][Lane]; - } - - /// Clears the used flag for all entries. - void clearUsed() { - for (unsigned OpIdx = 0, NumOperands = getNumOperands(); - OpIdx != NumOperands; ++OpIdx) - for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes; - ++Lane) - OpsVec[OpIdx][Lane].IsUsed = false; - } - - /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2. - void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) { - std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]); - } + public: + LookAheadHeuristics(const DataLayout &DL, ScalarEvolution &SE, + const BoUpSLP &R, int NumLanes, int MaxLevel) + : DL(DL), SE(SE), R(R), NumLanes(NumLanes), MaxLevel(MaxLevel) {} // The hard-coded scores listed here are not very important, though it shall // be higher for better matches to improve the resulting cost. When @@ -1028,6 +1076,11 @@ public: /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]). static const int ScoreConsecutiveLoads = 4; + /// The same load multiple times. This should have a better score than + /// `ScoreSplat` because it in x86 for a 2-lane vector we can represent it + /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for + /// a vector load and 1.0 for a broadcast. + static const int ScoreSplatLoads = 3; /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]). static const int ScoreReversedLoads = 3; /// ExtractElementInst from same vector and consecutive indexes. @@ -1046,43 +1099,67 @@ public: static const int ScoreUndef = 1; /// Score for failing to find a decent match. static const int ScoreFail = 0; - /// User exteranl to the vectorized code. - static const int ExternalUseCost = 1; - /// The user is internal but in a different lane. - static const int UserInDiffLaneCost = ExternalUseCost; + /// Score if all users are vectorized. + static const int ScoreAllUserVectorized = 1; /// \returns the score of placing \p V1 and \p V2 in consecutive lanes. - static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL, - ScalarEvolution &SE, int NumLanes) { - if (V1 == V2) - return VLOperands::ScoreSplat; + /// \p U1 and \p U2 are the users of \p V1 and \p V2. + /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p + /// MainAltOps. + int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, + ArrayRef<Value *> MainAltOps) const { + if (V1 == V2) { + if (isa<LoadInst>(V1)) { + // Retruns true if the users of V1 and V2 won't need to be extracted. + auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) { + // Bail out if we have too many uses to save compilation time. + static constexpr unsigned Limit = 8; + if (V1->hasNUsesOrMore(Limit) || V2->hasNUsesOrMore(Limit)) + return false; + + auto AllUsersVectorized = [U1, U2, this](Value *V) { + return llvm::all_of(V->users(), [U1, U2, this](Value *U) { + return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr; + }); + }; + return AllUsersVectorized(V1) && AllUsersVectorized(V2); + }; + // A broadcast of a load can be cheaper on some targets. 
+ if (R.TTI->isLegalBroadcastLoad(V1->getType(), + ElementCount::getFixed(NumLanes)) && + ((int)V1->getNumUses() == NumLanes || + AllUsersAreInternal(V1, V2))) + return LookAheadHeuristics::ScoreSplatLoads; + } + return LookAheadHeuristics::ScoreSplat; + } auto *LI1 = dyn_cast<LoadInst>(V1); auto *LI2 = dyn_cast<LoadInst>(V2); if (LI1 && LI2) { if (LI1->getParent() != LI2->getParent()) - return VLOperands::ScoreFail; + return LookAheadHeuristics::ScoreFail; Optional<int> Dist = getPointersDiff( LI1->getType(), LI1->getPointerOperand(), LI2->getType(), LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true); - if (!Dist) - return VLOperands::ScoreFail; + if (!Dist || *Dist == 0) + return LookAheadHeuristics::ScoreFail; // The distance is too large - still may be profitable to use masked // loads/gathers. if (std::abs(*Dist) > NumLanes / 2) - return VLOperands::ScoreAltOpcodes; + return LookAheadHeuristics::ScoreAltOpcodes; // This still will detect consecutive loads, but we might have "holes" // in some cases. It is ok for non-power-2 vectorization and may produce // better results. It should not affect current vectorization. - return (*Dist > 0) ? VLOperands::ScoreConsecutiveLoads - : VLOperands::ScoreReversedLoads; + return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads + : LookAheadHeuristics::ScoreReversedLoads; } auto *C1 = dyn_cast<Constant>(V1); auto *C2 = dyn_cast<Constant>(V2); if (C1 && C2) - return VLOperands::ScoreConstants; + return LookAheadHeuristics::ScoreConstants; // Extracts from consecutive indexes of the same vector better score as // the extracts could be optimized away. @@ -1091,7 +1168,7 @@ public: if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) { // Undefs are always profitable for extractelements. if (isa<UndefValue>(V2)) - return VLOperands::ScoreConsecutiveExtracts; + return LookAheadHeuristics::ScoreConsecutiveExtracts; Value *EV2 = nullptr; ConstantInt *Ex2Idx = nullptr; if (match(V2, @@ -1099,108 +1176,62 @@ public: m_Undef())))) { // Undefs are always profitable for extractelements. if (!Ex2Idx) - return VLOperands::ScoreConsecutiveExtracts; + return LookAheadHeuristics::ScoreConsecutiveExtracts; if (isUndefVector(EV2) && EV2->getType() == EV1->getType()) - return VLOperands::ScoreConsecutiveExtracts; + return LookAheadHeuristics::ScoreConsecutiveExtracts; if (EV2 == EV1) { int Idx1 = Ex1Idx->getZExtValue(); int Idx2 = Ex2Idx->getZExtValue(); int Dist = Idx2 - Idx1; // The distance is too large - still may be profitable to use // shuffles. + if (std::abs(Dist) == 0) + return LookAheadHeuristics::ScoreSplat; if (std::abs(Dist) > NumLanes / 2) - return VLOperands::ScoreAltOpcodes; - return (Dist > 0) ? VLOperands::ScoreConsecutiveExtracts - : VLOperands::ScoreReversedExtracts; + return LookAheadHeuristics::ScoreSameOpcode; + return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts + : LookAheadHeuristics::ScoreReversedExtracts; } + return LookAheadHeuristics::ScoreAltOpcodes; } + return LookAheadHeuristics::ScoreFail; } auto *I1 = dyn_cast<Instruction>(V1); auto *I2 = dyn_cast<Instruction>(V2); if (I1 && I2) { if (I1->getParent() != I2->getParent()) - return VLOperands::ScoreFail; - InstructionsState S = getSameOpcode({I1, I2}); + return LookAheadHeuristics::ScoreFail; + SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end()); + Ops.push_back(I1); + Ops.push_back(I2); + InstructionsState S = getSameOpcode(Ops); // Note: Only consider instructions with <= 2 operands to avoid // complexity explosion. 
- if (S.getOpcode() && S.MainOp->getNumOperands() <= 2) - return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes - : VLOperands::ScoreSameOpcode; + if (S.getOpcode() && + (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() || + !S.isAltShuffle()) && + all_of(Ops, [&S](Value *V) { + return cast<Instruction>(V)->getNumOperands() == + S.MainOp->getNumOperands(); + })) + return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes + : LookAheadHeuristics::ScoreSameOpcode; } if (isa<UndefValue>(V2)) - return VLOperands::ScoreUndef; - - return VLOperands::ScoreFail; - } - - /// Holds the values and their lanes that are taking part in the look-ahead - /// score calculation. This is used in the external uses cost calculation. - /// Need to hold all the lanes in case of splat/broadcast at least to - /// correctly check for the use in the different lane. - SmallDenseMap<Value *, SmallSet<int, 4>> InLookAheadValues; - - /// \returns the additional cost due to uses of \p LHS and \p RHS that are - /// either external to the vectorized code, or require shuffling. - int getExternalUsesCost(const std::pair<Value *, int> &LHS, - const std::pair<Value *, int> &RHS) { - int Cost = 0; - std::array<std::pair<Value *, int>, 2> Values = {{LHS, RHS}}; - for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) { - Value *V = Values[Idx].first; - if (isa<Constant>(V)) { - // Since this is a function pass, it doesn't make semantic sense to - // walk the users of a subclass of Constant. The users could be in - // another function, or even another module that happens to be in - // the same LLVMContext. - continue; - } + return LookAheadHeuristics::ScoreUndef; - // Calculate the absolute lane, using the minimum relative lane of LHS - // and RHS as base and Idx as the offset. - int Ln = std::min(LHS.second, RHS.second) + Idx; - assert(Ln >= 0 && "Bad lane calculation"); - unsigned UsersBudget = LookAheadUsersBudget; - for (User *U : V->users()) { - if (const TreeEntry *UserTE = R.getTreeEntry(U)) { - // The user is in the VectorizableTree. Check if we need to insert. - int UserLn = UserTE->findLaneForValue(U); - assert(UserLn >= 0 && "Bad lane"); - // If the values are different, check just the line of the current - // value. If the values are the same, need to add UserInDiffLaneCost - // only if UserLn does not match both line numbers. - if ((LHS.first != RHS.first && UserLn != Ln) || - (LHS.first == RHS.first && UserLn != LHS.second && - UserLn != RHS.second)) { - Cost += UserInDiffLaneCost; - break; - } - } else { - // Check if the user is in the look-ahead code. - auto It2 = InLookAheadValues.find(U); - if (It2 != InLookAheadValues.end()) { - // The user is in the look-ahead code. Check the lane. - if (!It2->getSecond().contains(Ln)) { - Cost += UserInDiffLaneCost; - break; - } - } else { - // The user is neither in SLP tree nor in the look-ahead code. - Cost += ExternalUseCost; - break; - } - } - // Limit the number of visited uses to cap compilation time. - if (--UsersBudget == 0) - break; - } - } - return Cost; + return LookAheadHeuristics::ScoreFail; } - /// Go through the operands of \p LHS and \p RHS recursively until \p - /// MaxLevel, and return the cummulative score. For example: + /// Go through the operands of \p LHS and \p RHS recursively until + /// MaxLevel, and return the cummulative score. \p U1 and \p U2 are + /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands + /// of \p U1 and \p U2), except at the beginning of the recursion where + /// these are set to nullptr. 
+ /// + /// For example: /// \verbatim /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1] /// \ / \ / \ / \ / @@ -1211,8 +1242,8 @@ public: /// each level recursively, accumulating the score. It starts from matching /// the additions at level 0, then moves on to the loads (level 1). The /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and - /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while - /// {A[0],C[0]} has a score of VLOperands::ScoreFail. + /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while + /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail. /// Please note that the order of the operands does not matter, as we /// evaluate the score of all profitable combinations of operands. In /// other words the score of G1 and G4 is the same as G1 and G2. This @@ -1220,18 +1251,13 @@ public: /// Look-ahead SLP: Auto-vectorization in the presence of commutative /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha, /// LuÃs F. W. Góes - int getScoreAtLevelRec(const std::pair<Value *, int> &LHS, - const std::pair<Value *, int> &RHS, int CurrLevel, - int MaxLevel) { + int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, + Instruction *U2, int CurrLevel, + ArrayRef<Value *> MainAltOps) const { - Value *V1 = LHS.first; - Value *V2 = RHS.first; // Get the shallow score of V1 and V2. - int ShallowScoreAtThisLevel = std::max( - (int)ScoreFail, getShallowScore(V1, V2, DL, SE, getNumLanes()) - - getExternalUsesCost(LHS, RHS)); - int Lane1 = LHS.second; - int Lane2 = RHS.second; + int ShallowScoreAtThisLevel = + getShallowScore(LHS, RHS, U1, U2, MainAltOps); // If reached MaxLevel, // or if V1 and V2 are not instructions, @@ -1239,20 +1265,17 @@ public: // or if they are not consecutive, // or if profitable to vectorize loads or extractelements, early return // the current cost. - auto *I1 = dyn_cast<Instruction>(V1); - auto *I2 = dyn_cast<Instruction>(V2); + auto *I1 = dyn_cast<Instruction>(LHS); + auto *I2 = dyn_cast<Instruction>(RHS); if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 || - ShallowScoreAtThisLevel == VLOperands::ScoreFail || + ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail || (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) || + (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) || (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) && ShallowScoreAtThisLevel)) return ShallowScoreAtThisLevel; assert(I1 && I2 && "Should have early exited."); - // Keep track of in-tree values for determining the external-use cost. - InLookAheadValues[V1].insert(Lane1); - InLookAheadValues[V2].insert(Lane2); - // Contains the I2 operand indexes that got matched with I1 operands. SmallSet<unsigned, 4> Op2Used; @@ -1275,11 +1298,12 @@ public: if (Op2Used.count(OpIdx2)) continue; // Recursively calculate the cost at each level - int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1}, - {I2->getOperand(OpIdx2), Lane2}, - CurrLevel + 1, MaxLevel); + int TmpScore = + getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2), + I1, I2, CurrLevel + 1, None); // Look for the best score. - if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) { + if (TmpScore > LookAheadHeuristics::ScoreFail && + TmpScore > MaxTmpScore) { MaxTmpScore = TmpScore; MaxOpIdx2 = OpIdx2; FoundBest = true; @@ -1293,24 +1317,213 @@ public: } return ShallowScoreAtThisLevel; } + }; + /// A helper data structure to hold the operands of a vector of instructions. 
+ /// This supports a fixed vector length for all operand vectors. + class VLOperands { + /// For each operand we need (i) the value, and (ii) the opcode that it + /// would be attached to if the expression was in a left-linearized form. + /// This is required to avoid illegal operand reordering. + /// For example: + /// \verbatim + /// 0 Op1 + /// |/ + /// Op1 Op2 Linearized + Op2 + /// \ / ----------> |/ + /// - - + /// + /// Op1 - Op2 (0 + Op1) - Op2 + /// \endverbatim + /// + /// Value Op1 is attached to a '+' operation, and Op2 to a '-'. + /// + /// Another way to think of this is to track all the operations across the + /// path from the operand all the way to the root of the tree and to + /// calculate the operation that corresponds to this path. For example, the + /// path from Op2 to the root crosses the RHS of the '-', therefore the + /// corresponding operation is a '-' (which matches the one in the + /// linearized tree, as shown above). + /// + /// For lack of a better term, we refer to this operation as Accumulated + /// Path Operation (APO). + struct OperandData { + OperandData() = default; + OperandData(Value *V, bool APO, bool IsUsed) + : V(V), APO(APO), IsUsed(IsUsed) {} + /// The operand value. + Value *V = nullptr; + /// TreeEntries only allow a single opcode, or an alternate sequence of + /// them (e.g, +, -). Therefore, we can safely use a boolean value for the + /// APO. It is set to 'true' if 'V' is attached to an inverse operation + /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise + /// (e.g., Add/Mul) + bool APO = false; + /// Helper data for the reordering function. + bool IsUsed = false; + }; + + /// During operand reordering, we are trying to select the operand at lane + /// that matches best with the operand at the neighboring lane. Our + /// selection is based on the type of value we are looking for. For example, + /// if the neighboring lane has a load, we need to look for a load that is + /// accessing a consecutive address. These strategies are summarized in the + /// 'ReorderingMode' enumerator. + enum class ReorderingMode { + Load, ///< Matching loads to consecutive memory addresses + Opcode, ///< Matching instructions based on opcode (same or alternate) + Constant, ///< Matching constants + Splat, ///< Matching the same instruction multiple times (broadcast) + Failed, ///< We failed to create a vectorizable group + }; + + using OperandDataVec = SmallVector<OperandData, 2>; + + /// A vector of operand vectors. + SmallVector<OperandDataVec, 4> OpsVec; + + const DataLayout &DL; + ScalarEvolution &SE; + const BoUpSLP &R; + + /// \returns the operand data at \p OpIdx and \p Lane. + OperandData &getData(unsigned OpIdx, unsigned Lane) { + return OpsVec[OpIdx][Lane]; + } + + /// \returns the operand data at \p OpIdx and \p Lane. Const version. + const OperandData &getData(unsigned OpIdx, unsigned Lane) const { + return OpsVec[OpIdx][Lane]; + } + + /// Clears the used flag for all entries. + void clearUsed() { + for (unsigned OpIdx = 0, NumOperands = getNumOperands(); + OpIdx != NumOperands; ++OpIdx) + for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes; + ++Lane) + OpsVec[OpIdx][Lane].IsUsed = false; + } + + /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2. + void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) { + std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]); + } + + /// \param Lane lane of the operands under analysis. 
+ /// \param OpIdx operand index in \p Lane lane we're looking the best + /// candidate for. + /// \param Idx operand index of the current candidate value. + /// \returns The additional score due to possible broadcasting of the + /// elements in the lane. It is more profitable to have power-of-2 unique + /// elements in the lane, it will be vectorized with higher probability + /// after removing duplicates. Currently the SLP vectorizer supports only + /// vectorization of the power-of-2 number of unique scalars. + int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const { + Value *IdxLaneV = getData(Idx, Lane).V; + if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V) + return 0; + SmallPtrSet<Value *, 4> Uniques; + for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) { + if (Ln == Lane) + continue; + Value *OpIdxLnV = getData(OpIdx, Ln).V; + if (!isa<Instruction>(OpIdxLnV)) + return 0; + Uniques.insert(OpIdxLnV); + } + int UniquesCount = Uniques.size(); + int UniquesCntWithIdxLaneV = + Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1; + Value *OpIdxLaneV = getData(OpIdx, Lane).V; + int UniquesCntWithOpIdxLaneV = + Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1; + if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV) + return 0; + return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) - + UniquesCntWithOpIdxLaneV) - + (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV); + } + + /// \param Lane lane of the operands under analysis. + /// \param OpIdx operand index in \p Lane lane we're looking the best + /// candidate for. + /// \param Idx operand index of the current candidate value. + /// \returns The additional score for the scalar which users are all + /// vectorized. + int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const { + Value *IdxLaneV = getData(Idx, Lane).V; + Value *OpIdxLaneV = getData(OpIdx, Lane).V; + // Do not care about number of uses for vector-like instructions + // (extractelement/extractvalue with constant indices), they are extracts + // themselves and already externally used. Vectorization of such + // instructions does not add extra extractelement instruction, just may + // remove it. + if (isVectorLikeInstWithConstOps(IdxLaneV) && + isVectorLikeInstWithConstOps(OpIdxLaneV)) + return LookAheadHeuristics::ScoreAllUserVectorized; + auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV); + if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV)) + return 0; + return R.areAllUsersVectorized(IdxLaneI, None) + ? LookAheadHeuristics::ScoreAllUserVectorized + : 0; + } + + /// Score scaling factor for fully compatible instructions but with + /// different number of external uses. Allows better selection of the + /// instructions with less external uses. + static const int ScoreScaleFactor = 10; /// \Returns the look-ahead score, which tells us how much the sub-trees /// rooted at \p LHS and \p RHS match, the more they match the higher the /// score. This helps break ties in an informed way when we cannot decide on /// the order of the operands by just considering the immediate /// predecessors. 
- int getLookAheadScore(const std::pair<Value *, int> &LHS, - const std::pair<Value *, int> &RHS) { - InLookAheadValues.clear(); - return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth); + int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps, + int Lane, unsigned OpIdx, unsigned Idx, + bool &IsUsed) { + LookAheadHeuristics LookAhead(DL, SE, R, getNumLanes(), + LookAheadMaxDepth); + // Keep track of the instruction stack as we recurse into the operands + // during the look-ahead score exploration. + int Score = + LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr, + /*CurrLevel=*/1, MainAltOps); + if (Score) { + int SplatScore = getSplatScore(Lane, OpIdx, Idx); + if (Score <= -SplatScore) { + // Set the minimum score for splat-like sequence to avoid setting + // failed state. + Score = 1; + } else { + Score += SplatScore; + // Scale score to see the difference between different operands + // and similar operands but all vectorized/not all vectorized + // uses. It does not affect actual selection of the best + // compatible operand in general, just allows to select the + // operand with all vectorized uses. + Score *= ScoreScaleFactor; + Score += getExternalUseScore(Lane, OpIdx, Idx); + IsUsed = true; + } + } + return Score; } + /// Best defined scores per lanes between the passes. Used to choose the + /// best operand (with the highest score) between the passes. + /// The key - {Operand Index, Lane}. + /// The value - the best score between the passes for the lane and the + /// operand. + SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8> + BestScoresPerLanes; + // Search all operands in Ops[*][Lane] for the one that matches best // Ops[OpIdx][LastLane] and return its opreand index. // If no good match can be found, return None. - Optional<unsigned> - getBestOperand(unsigned OpIdx, int Lane, int LastLane, - ArrayRef<ReorderingMode> ReorderingModes) { + Optional<unsigned> getBestOperand(unsigned OpIdx, int Lane, int LastLane, + ArrayRef<ReorderingMode> ReorderingModes, + ArrayRef<Value *> MainAltOps) { unsigned NumOperands = getNumOperands(); // The operand of the previous lane at OpIdx. @@ -1318,6 +1531,8 @@ public: // Our strategy mode for OpIdx. ReorderingMode RMode = ReorderingModes[OpIdx]; + if (RMode == ReorderingMode::Failed) + return None; // The linearized opcode of the operand at OpIdx, Lane. bool OpIdxAPO = getData(OpIdx, Lane).APO; @@ -1329,7 +1544,15 @@ public: Optional<unsigned> Idx = None; unsigned Score = 0; } BestOp; - + BestOp.Score = + BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0) + .first->second; + + // Track if the operand must be marked as used. If the operand is set to + // Score 1 explicitly (because of non power-of-2 unique scalars, we may + // want to reestimate the operands again on the following iterations). + bool IsUsed = + RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant; // Iterate through all unused operands and look for the best. for (unsigned Idx = 0; Idx != NumOperands; ++Idx) { // Get the operand at Idx and Lane. @@ -1355,11 +1578,12 @@ public: bool LeftToRight = Lane > LastLane; Value *OpLeft = (LeftToRight) ? OpLastLane : Op; Value *OpRight = (LeftToRight) ? 
Op : OpLastLane; - unsigned Score = - getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane}); - if (Score > BestOp.Score) { + int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane, + OpIdx, Idx, IsUsed); + if (Score > static_cast<int>(BestOp.Score)) { BestOp.Idx = Idx; BestOp.Score = Score; + BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score; } break; } @@ -1368,12 +1592,12 @@ public: BestOp.Idx = Idx; break; case ReorderingMode::Failed: - return None; + llvm_unreachable("Not expected Failed reordering mode."); } } if (BestOp.Idx) { - getData(BestOp.Idx.getValue(), Lane).IsUsed = true; + getData(*BestOp.Idx, Lane).IsUsed = IsUsed; return BestOp.Idx; } // If we could not find a good match return None. @@ -1690,6 +1914,10 @@ public: // rest of the lanes. We are visiting the nodes in a circular fashion, // using FirstLane as the center point and increasing the radius // distance. + SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands); + for (unsigned I = 0; I < NumOperands; ++I) + MainAltOps[I].push_back(getData(I, FirstLane).V); + for (unsigned Distance = 1; Distance != NumLanes; ++Distance) { // Visit the lane on the right and then the lane on the left. for (int Direction : {+1, -1}) { @@ -1702,21 +1930,29 @@ public: // Look for a good match for each operand. for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { // Search for the operand that matches SortedOps[OpIdx][Lane-1]. - Optional<unsigned> BestIdx = - getBestOperand(OpIdx, Lane, LastLane, ReorderingModes); + Optional<unsigned> BestIdx = getBestOperand( + OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]); // By not selecting a value, we allow the operands that follow to // select a better matching value. We will get a non-null value in // the next run of getBestOperand(). if (BestIdx) { // Swap the current operand with the one returned by // getBestOperand(). - swap(OpIdx, BestIdx.getValue(), Lane); + swap(OpIdx, *BestIdx, Lane); } else { // We failed to find a best operand, set mode to 'Failed'. ReorderingModes[OpIdx] = ReorderingMode::Failed; // Enable the second pass. StrategyFailed = true; } + // Try to get the alternate opcode and follow it during analysis. + if (MainAltOps[OpIdx].size() != 2) { + OperandData &AltOp = getData(OpIdx, Lane); + InstructionsState OpS = + getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}); + if (OpS.getOpcode() && OpS.isAltShuffle()) + MainAltOps[OpIdx].push_back(AltOp.V); + } } } } @@ -1780,15 +2016,109 @@ public: #endif }; + /// Evaluate each pair in \p Candidates and return index into \p Candidates + /// for a pair which have highest score deemed to have best chance to form + /// root of profitable tree to vectorize. Return None if no candidate scored + /// above the LookAheadHeuristics::ScoreFail. + /// \param Limit Lower limit of the cost, considered to be good enough score. + Optional<int> + findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates, + int Limit = LookAheadHeuristics::ScoreFail) { + LookAheadHeuristics LookAhead(*DL, *SE, *this, /*NumLanes=*/2, + RootLookAheadMaxDepth); + int BestScore = Limit; + Optional<int> Index = None; + for (int I : seq<int>(0, Candidates.size())) { + int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first, + Candidates[I].second, + /*U1=*/nullptr, /*U2=*/nullptr, + /*Level=*/1, None); + if (Score > BestScore) { + BestScore = Score; + Index = I; + } + } + return Index; + } + /// Checks if the instruction is marked for deletion. 
bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); } - /// Marks values operands for later deletion by replacing them with Undefs. - void eraseInstructions(ArrayRef<Value *> AV); + /// Removes an instruction from its block and eventually deletes it. + /// It's like Instruction::eraseFromParent() except that the actual deletion + /// is delayed until BoUpSLP is destructed. + void eraseInstruction(Instruction *I) { + DeletedInstructions.insert(I); + } + + /// Checks if the instruction was already analyzed for being possible + /// reduction root. + bool isAnalyzedReductionRoot(Instruction *I) const { + return AnalyzedReductionsRoots.count(I); + } + /// Register given instruction as already analyzed for being possible + /// reduction root. + void analyzedReductionRoot(Instruction *I) { + AnalyzedReductionsRoots.insert(I); + } + /// Checks if the provided list of reduced values was checked already for + /// vectorization. + bool areAnalyzedReductionVals(ArrayRef<Value *> VL) { + return AnalyzedReductionVals.contains(hash_value(VL)); + } + /// Adds the list of reduced values to list of already checked values for the + /// vectorization. + void analyzedReductionVals(ArrayRef<Value *> VL) { + AnalyzedReductionVals.insert(hash_value(VL)); + } + /// Clear the list of the analyzed reduction root instructions. + void clearReductionData() { + AnalyzedReductionsRoots.clear(); + AnalyzedReductionVals.clear(); + } + /// Checks if the given value is gathered in one of the nodes. + bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const { + return any_of(MustGather, [&](Value *V) { return Vals.contains(V); }); + } ~BoUpSLP(); private: + /// Check if the operands on the edges \p Edges of the \p UserTE allows + /// reordering (i.e. the operands can be reordered because they have only one + /// user and reordarable). + /// \param ReorderableGathers List of all gather nodes that require reordering + /// (e.g., gather of extractlements or partially vectorizable loads). + /// \param GatherOps List of gather operand nodes for \p UserTE that require + /// reordering, subset of \p NonVectorized. + bool + canReorderOperands(TreeEntry *UserTE, + SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges, + ArrayRef<TreeEntry *> ReorderableGathers, + SmallVectorImpl<TreeEntry *> &GatherOps); + + /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph, + /// if any. If it is not vectorized (gather node), returns nullptr. + TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) { + ArrayRef<Value *> VL = UserTE->getOperand(OpIdx); + TreeEntry *TE = nullptr; + const auto *It = find_if(VL, [this, &TE](Value *V) { + TE = getTreeEntry(V); + return TE; + }); + if (It != VL.end() && TE->isSame(VL)) + return TE; + return nullptr; + } + + /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph, + /// if any. If it is not vectorized (gather node), returns nullptr. + const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE, + unsigned OpIdx) const { + return const_cast<BoUpSLP *>(this)->getVectorizedOperand( + const_cast<TreeEntry *>(UserTE), OpIdx); + } + /// Checks if all users of \p I are the part of the vectorization tree. bool areAllUsersVectorized(Instruction *I, ArrayRef<Value *> VectorizedVals) const; @@ -1815,12 +2145,17 @@ private: /// Vectorize a single entry in the tree, starting in \p VL. Value *vectorizeTree(ArrayRef<Value *> VL); + /// Create a new vector from a list of scalar values. 
Produces a sequence + /// which exploits values reused across lanes, and arranges the inserts + /// for ease of later optimization. + Value *createBuildVector(ArrayRef<Value *> VL); + /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. If \p /// NeedToShuffle is true, need to add a cost of reshuffling some of the /// vector elements. InstructionCost getGatherCost(FixedVectorType *Ty, - const DenseSet<unsigned> &ShuffledIndices, + const APInt &ShuffledIndices, bool NeedToShuffle) const; /// Checks if the gathered \p VL can be represented as shuffle(s) of previous @@ -1855,6 +2190,29 @@ private: const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R); + + /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the + /// users of \p TE and collects the stores. It returns the map from the store + /// pointers to the collected stores. + DenseMap<Value *, SmallVector<StoreInst *, 4>> + collectUserStores(const BoUpSLP::TreeEntry *TE) const; + + /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the + /// stores in \p StoresVec can for a vector instruction. If so it returns true + /// and populates \p ReorderIndices with the shuffle indices of the the stores + /// when compared to the sorted vector. + bool CanFormVector(const SmallVector<StoreInst *, 4> &StoresVec, + OrdersType &ReorderIndices) const; + + /// Iterates through the users of \p TE, looking for scalar stores that can be + /// potentially vectorized in a future SLP-tree. If found, it keeps track of + /// their order and builds an order index vector for each store bundle. It + /// returns all these order vectors found. + /// We run this after the tree has formed, otherwise we may come across user + /// instructions that are not yet in the tree. + SmallVector<OrdersType, 1> + findExternalStoreUsersReorderIndices(TreeEntry *TE) const; + struct TreeEntry { using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>; TreeEntry(VecTreeTy &Container) : Container(Container) {} @@ -2199,15 +2557,21 @@ private: ScalarToTreeEntry[V] = Last; } // Update the scheduler bundle to point to this TreeEntry. - unsigned Lane = 0; - for (ScheduleData *BundleMember = Bundle.getValue(); BundleMember; - BundleMember = BundleMember->NextInBundle) { - BundleMember->TE = Last; - BundleMember->Lane = Lane; - ++Lane; - } - assert((!Bundle.getValue() || Lane == VL.size()) && + ScheduleData *BundleMember = *Bundle; + assert((BundleMember || isa<PHINode>(S.MainOp) || + isVectorLikeInstWithConstOps(S.MainOp) || + doesNotNeedToSchedule(VL)) && "Bundle and VL out of sync"); + if (BundleMember) { + for (Value *V : VL) { + if (doesNotNeedToBeScheduled(V)) + continue; + assert(BundleMember && "Unexpected end of bundle."); + BundleMember->TE = Last; + BundleMember = BundleMember->NextInBundle; + } + } + assert(!BundleMember && "Bundle and VL out of sync"); } else { MustGather.insert(VL.begin(), VL.end()); } @@ -2241,7 +2605,7 @@ private: /// Maps a specific scalar to its tree entry. SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry; - /// Maps a value to the proposed vectorizable size. + /// Maps a value to the proposed vectorizable size. SmallDenseMap<Value *, unsigned> InstrElementSize; /// A list of scalars that we found that we need to keep as scalars. @@ -2272,12 +2636,12 @@ private: // First check if the result is already in the cache. 
AliasCacheKey key = std::make_pair(Inst1, Inst2); Optional<bool> &result = AliasCache[key]; - if (result.hasValue()) { + if (result) { return result.getValue(); } bool aliased = true; if (Loc1.Ptr && isSimple(Inst1)) - aliased = isModOrRefSet(AA->getModRefInfo(Inst2, Loc1)); + aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1)); // Store the result in the cache. result = aliased; return aliased; @@ -2289,20 +2653,23 @@ private: /// TODO: consider moving this to the AliasAnalysis itself. DenseMap<AliasCacheKey, Optional<bool>> AliasCache; - /// Removes an instruction from its block and eventually deletes it. - /// It's like Instruction::eraseFromParent() except that the actual deletion - /// is delayed until BoUpSLP is destructed. - /// This is required to ensure that there are no incorrect collisions in the - /// AliasCache, which can happen if a new instruction is allocated at the - /// same address as a previously deleted instruction. - void eraseInstruction(Instruction *I, bool ReplaceOpsWithUndef = false) { - auto It = DeletedInstructions.try_emplace(I, ReplaceOpsWithUndef).first; - It->getSecond() = It->getSecond() && ReplaceOpsWithUndef; - } + // Cache for pointerMayBeCaptured calls inside AA. This is preserved + // globally through SLP because we don't perform any action which + // invalidates capture results. + BatchAAResults BatchAA; /// Temporary store for deleted instructions. Instructions will be deleted - /// eventually when the BoUpSLP is destructed. - DenseMap<Instruction *, bool> DeletedInstructions; + /// eventually when the BoUpSLP is destructed. The deferral is required to + /// ensure that there are no incorrect collisions in the AliasCache, which + /// can happen if a new instruction is allocated at the same address as a + /// previously deleted instruction. + DenseSet<Instruction *> DeletedInstructions; + + /// Set of the instruction, being analyzed already for reductions. + SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots; + + /// Set of hashes for the list of reduction values already being analyzed. + DenseSet<size_t> AnalyzedReductionVals; /// A list of values that need to extracted out of the tree. /// This list holds pairs of (Internal Scalar : External User). External User @@ -2336,14 +2703,39 @@ private: NextLoadStore = nullptr; IsScheduled = false; SchedulingRegionID = BlockSchedulingRegionID; - UnscheduledDepsInBundle = UnscheduledDeps; clearDependencies(); OpValue = OpVal; TE = nullptr; - Lane = -1; + } + + /// Verify basic self consistency properties + void verify() { + if (hasValidDependencies()) { + assert(UnscheduledDeps <= Dependencies && "invariant"); + } else { + assert(UnscheduledDeps == Dependencies && "invariant"); + } + + if (IsScheduled) { + assert(isSchedulingEntity() && + "unexpected scheduled state"); + for (const ScheduleData *BundleMember = this; BundleMember; + BundleMember = BundleMember->NextInBundle) { + assert(BundleMember->hasValidDependencies() && + BundleMember->UnscheduledDeps == 0 && + "unexpected scheduled state"); + assert((BundleMember == this || !BundleMember->IsScheduled) && + "only bundle is marked scheduled"); + } + } + + assert(Inst->getParent() == FirstInBundle->Inst->getParent() && + "all bundle members must be in same basic block"); } /// Returns true if the dependency information has been calculated. + /// Note that depenendency validity can vary between instructions within + /// a single bundle. 
bool hasValidDependencies() const { return Dependencies != InvalidDeps; } /// Returns true for single instructions and for bundle representatives @@ -2353,7 +2745,7 @@ private: /// Returns true if it represents an instruction bundle and not only a /// single instruction. bool isPartOfBundle() const { - return NextInBundle != nullptr || FirstInBundle != this; + return NextInBundle != nullptr || FirstInBundle != this || TE; } /// Returns true if it is ready for scheduling, i.e. it has no more @@ -2361,20 +2753,23 @@ private: bool isReady() const { assert(isSchedulingEntity() && "can't consider non-scheduling entity for ready list"); - return UnscheduledDepsInBundle == 0 && !IsScheduled; + return unscheduledDepsInBundle() == 0 && !IsScheduled; } - /// Modifies the number of unscheduled dependencies, also updating it for - /// the whole bundle. + /// Modifies the number of unscheduled dependencies for this instruction, + /// and returns the number of remaining dependencies for the containing + /// bundle. int incrementUnscheduledDeps(int Incr) { + assert(hasValidDependencies() && + "increment of unscheduled deps would be meaningless"); UnscheduledDeps += Incr; - return FirstInBundle->UnscheduledDepsInBundle += Incr; + return FirstInBundle->unscheduledDepsInBundle(); } /// Sets the number of unscheduled dependencies to the number of /// dependencies. void resetUnscheduledDeps() { - incrementUnscheduledDeps(Dependencies - UnscheduledDeps); + UnscheduledDeps = Dependencies; } /// Clears all dependency information. @@ -2382,6 +2777,19 @@ private: Dependencies = InvalidDeps; resetUnscheduledDeps(); MemoryDependencies.clear(); + ControlDependencies.clear(); + } + + int unscheduledDepsInBundle() const { + assert(isSchedulingEntity() && "only meaningful on the bundle"); + int Sum = 0; + for (const ScheduleData *BundleMember = this; BundleMember; + BundleMember = BundleMember->NextInBundle) { + if (BundleMember->UnscheduledDeps == InvalidDeps) + return InvalidDeps; + Sum += BundleMember->UnscheduledDeps; + } + return Sum; } void dump(raw_ostream &os) const { @@ -2402,6 +2810,12 @@ private: Instruction *Inst = nullptr; + /// Opcode of the current instruction in the schedule data. + Value *OpValue = nullptr; + + /// The TreeEntry that this instruction corresponds to. + TreeEntry *TE = nullptr; + /// Points to the head in an instruction bundle (and always to this for /// single instructions). ScheduleData *FirstInBundle = nullptr; @@ -2418,6 +2832,12 @@ private: /// This list is derived on demand in calculateDependencies(). SmallVector<ScheduleData *, 4> MemoryDependencies; + /// List of instructions which this instruction could be control dependent + /// on. Allowing such nodes to be scheduled below this one could introduce + /// a runtime fault which didn't exist in the original program. + /// ex: this is a load or udiv following a readonly call which inf loops + SmallVector<ScheduleData *, 4> ControlDependencies; + /// This ScheduleData is in the current scheduling region if this matches /// the current SchedulingRegionID of BlockScheduling. int SchedulingRegionID = 0; @@ -2437,22 +2857,9 @@ private: /// Note that this is negative as long as Dependencies is not calculated. int UnscheduledDeps = InvalidDeps; - /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for - /// single instructions. - int UnscheduledDepsInBundle = InvalidDeps; - /// True if this instruction is scheduled (or considered as scheduled in the /// dry-run). 
bool IsScheduled = false; - - /// Opcode of the current instruction in the schedule data. - Value *OpValue = nullptr; - - /// The TreeEntry that this instruction corresponds to. - TreeEntry *TE = nullptr; - - /// The lane of this node in the TreeEntry. - int Lane = -1; }; #ifndef NDEBUG @@ -2467,6 +2874,21 @@ private: friend struct DOTGraphTraits<BoUpSLP *>; /// Contains all scheduling data for a basic block. + /// It does not schedules instructions, which are not memory read/write + /// instructions and their operands are either constants, or arguments, or + /// phis, or instructions from others blocks, or their users are phis or from + /// the other blocks. The resulting vector instructions can be placed at the + /// beginning of the basic block without scheduling (if operands does not need + /// to be scheduled) or at the end of the block (if users are outside of the + /// block). It allows to save some compile time and memory used by the + /// compiler. + /// ScheduleData is assigned for each instruction in between the boundaries of + /// the tree entry, even for those, which are not part of the graph. It is + /// required to correctly follow the dependencies between the instructions and + /// their correct scheduling. The ScheduleData is not allocated for the + /// instructions, which do not require scheduling, like phis, nodes with + /// extractelements/insertelements only or nodes with instructions, with + /// uses/operands outside of the block. struct BlockScheduling { BlockScheduling(BasicBlock *BB) : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {} @@ -2477,6 +2899,7 @@ private: ScheduleEnd = nullptr; FirstLoadStoreInRegion = nullptr; LastLoadStoreInRegion = nullptr; + RegionHasStackSave = false; // Reduce the maximum schedule region size by the size of the // previous scheduling run. @@ -2490,20 +2913,29 @@ private: ++SchedulingRegionID; } - ScheduleData *getScheduleData(Value *V) { - ScheduleData *SD = ScheduleDataMap[V]; - if (SD && SD->SchedulingRegionID == SchedulingRegionID) + ScheduleData *getScheduleData(Instruction *I) { + if (BB != I->getParent()) + // Avoid lookup if can't possibly be in map. + return nullptr; + ScheduleData *SD = ScheduleDataMap.lookup(I); + if (SD && isInSchedulingRegion(SD)) return SD; return nullptr; } + ScheduleData *getScheduleData(Value *V) { + if (auto *I = dyn_cast<Instruction>(V)) + return getScheduleData(I); + return nullptr; + } + ScheduleData *getScheduleData(Value *V, Value *Key) { if (V == Key) return getScheduleData(V); auto I = ExtraScheduleDataMap.find(V); if (I != ExtraScheduleDataMap.end()) { - ScheduleData *SD = I->second[Key]; - if (SD && SD->SchedulingRegionID == SchedulingRegionID) + ScheduleData *SD = I->second.lookup(Key); + if (SD && isInSchedulingRegion(SD)) return SD; } return nullptr; @@ -2524,7 +2956,7 @@ private: BundleMember = BundleMember->NextInBundle) { if (BundleMember->Inst != BundleMember->OpValue) continue; - + // Handle the def-use chain dependencies. // Decrement the unscheduled counter and insert to ready list if ready. @@ -2546,10 +2978,12 @@ private: }; // If BundleMember is a vector bundle, its operands may have been - // reordered duiring buildTree(). We therefore need to get its operands + // reordered during buildTree(). We therefore need to get its operands // through the TreeEntry. if (TreeEntry *TE = BundleMember->TE) { - int Lane = BundleMember->Lane; + // Need to search for the lane since the tree entry can be reordered. 
+ int Lane = std::distance(TE->Scalars.begin(), + find(TE->Scalars, BundleMember->Inst)); assert(Lane >= 0 && "Lane not set"); // Since vectorization tree is being built recursively this assertion @@ -2558,7 +2992,7 @@ private: // where their second (immediate) operand is not added. Since // immediates do not affect scheduler behavior this is considered // okay. - auto *In = TE->getMainOp(); + auto *In = BundleMember->Inst; assert(In && (isa<ExtractValueInst>(In) || isa<ExtractElementInst>(In) || In->getNumOperands() == TE->getNumOperands()) && @@ -2578,7 +3012,8 @@ private: } // Handle the memory dependencies. for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) { - if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) { + if (MemoryDepSD->hasValidDependencies() && + MemoryDepSD->incrementUnscheduledDeps(-1) == 0) { // There are no more unscheduled dependencies after decrementing, // so we can put the dependent instruction into the ready list. ScheduleData *DepBundle = MemoryDepSD->FirstInBundle; @@ -2589,6 +3024,48 @@ private: << "SLP: gets ready (mem): " << *DepBundle << "\n"); } } + // Handle the control dependencies. + for (ScheduleData *DepSD : BundleMember->ControlDependencies) { + if (DepSD->incrementUnscheduledDeps(-1) == 0) { + // There are no more unscheduled dependencies after decrementing, + // so we can put the dependent instruction into the ready list. + ScheduleData *DepBundle = DepSD->FirstInBundle; + assert(!DepBundle->IsScheduled && + "already scheduled bundle gets ready"); + ReadyList.insert(DepBundle); + LLVM_DEBUG(dbgs() + << "SLP: gets ready (ctl): " << *DepBundle << "\n"); + } + } + + } + } + + /// Verify basic self consistency properties of the data structure. + void verify() { + if (!ScheduleStart) + return; + + assert(ScheduleStart->getParent() == ScheduleEnd->getParent() && + ScheduleStart->comesBefore(ScheduleEnd) && + "Not a valid scheduling region?"); + + for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { + auto *SD = getScheduleData(I); + if (!SD) + continue; + assert(isInSchedulingRegion(SD) && + "primary schedule data not in window?"); + assert(isInSchedulingRegion(SD->FirstInBundle) && + "entire bundle in window!"); + (void)SD; + doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); }); + } + + for (auto *SD : ReadyInsts) { + assert(SD->isSchedulingEntity() && SD->isReady() && + "item in ready list not ready?"); + (void)SD; } } @@ -2599,7 +3076,7 @@ private: auto I = ExtraScheduleDataMap.find(V); if (I != ExtraScheduleDataMap.end()) for (auto &P : I->second) - if (P.second->SchedulingRegionID == SchedulingRegionID) + if (isInSchedulingRegion(P.second)) Action(P.second); } @@ -2608,10 +3085,11 @@ private: void initialFillReadyList(ReadyListType &ReadyList) { for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { doForAllOpcodes(I, [&](ScheduleData *SD) { - if (SD->isSchedulingEntity() && SD->isReady()) { + if (SD->isSchedulingEntity() && SD->hasValidDependencies() && + SD->isReady()) { ReadyList.insert(SD); LLVM_DEBUG(dbgs() - << "SLP: initially in ready list: " << *I << "\n"); + << "SLP: initially in ready list: " << *SD << "\n"); } }); } @@ -2669,18 +3147,14 @@ private: /// Attaches ScheduleData to Instruction. /// Note that the mapping survives during all vectorization iterations, i.e. /// ScheduleData structures are recycled. - DenseMap<Value *, ScheduleData *> ScheduleDataMap; + DenseMap<Instruction *, ScheduleData *> ScheduleDataMap; /// Attaches ScheduleData to Instruction with the leading key. 
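// --- Editor's sketch (not part of this patch): the counter-decrement pattern
// applied above when a bundle is scheduled. Every dependent node (def-use,
// memory, or control dependency) loses one unscheduled dependency, and nodes
// that reach zero move to the ready list. Illustrative standalone code.
#include <vector>

struct DepNode {
  int UnscheduledDeps = 0;
  std::vector<DepNode *> Dependents; // Nodes that depend on this one.
};

void markScheduled(DepNode *N, std::vector<DepNode *> &ReadyList) {
  for (DepNode *Dep : N->Dependents)
    if (--Dep->UnscheduledDeps == 0) // No remaining dependencies...
      ReadyList.push_back(Dep);      // ...so the dependent becomes ready.
}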
DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>> ExtraScheduleDataMap; - struct ReadyList : SmallVector<ScheduleData *, 8> { - void insert(ScheduleData *SD) { push_back(SD); } - }; - /// The ready-list for scheduling (only used for the dry-run). - ReadyList ReadyInsts; + SetVector<ScheduleData *> ReadyInsts; /// The first instruction of the scheduling region. Instruction *ScheduleStart = nullptr; @@ -2696,6 +3170,11 @@ private: /// (can be null). ScheduleData *LastLoadStoreInRegion = nullptr; + /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling + /// region? Used to optimize the dependence calculation for the + /// common case where there isn't. + bool RegionHasStackSave = false; + /// The current size of the scheduling region. int ScheduleRegionSize = 0; @@ -2704,8 +3183,8 @@ private: /// The ID of the scheduling region. For a new vectorization iteration this /// is incremented which "removes" all ScheduleData from the region. - // Make sure that the initial SchedulingRegionID is greater than the - // initial SchedulingRegionID in ScheduleData (which is 0). + /// Make sure that the initial SchedulingRegionID is greater than the + /// initial SchedulingRegionID in ScheduleData (which is 0). int SchedulingRegionID = 1; }; @@ -2717,7 +3196,7 @@ private: void scheduleBlock(BlockScheduling *BS); /// List of users to ignore during scheduling and that don't need extracting. - ArrayRef<Value *> UserIgnoreList; + const SmallDenseSet<Value *> *UserIgnoreList = nullptr; /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of /// sorted SmallVectors of unsigned. @@ -2748,7 +3227,6 @@ private: ScalarEvolution *SE; TargetTransformInfo *TTI; TargetLibraryInfo *TLI; - AAResults *AA; LoopInfo *LI; DominatorTree *DT; AssumptionCache *AC; @@ -2865,20 +3343,25 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { } // end namespace llvm BoUpSLP::~BoUpSLP() { - for (const auto &Pair : DeletedInstructions) { - // Replace operands of ignored instructions with Undefs in case if they were - // marked for deletion. - if (Pair.getSecond()) { - Value *Undef = UndefValue::get(Pair.getFirst()->getType()); - Pair.getFirst()->replaceAllUsesWith(Undef); - } - Pair.getFirst()->dropAllReferences(); - } - for (const auto &Pair : DeletedInstructions) { - assert(Pair.getFirst()->use_empty() && + SmallVector<WeakTrackingVH> DeadInsts; + for (auto *I : DeletedInstructions) { + for (Use &U : I->operands()) { + auto *Op = dyn_cast<Instruction>(U.get()); + if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() && + wouldInstructionBeTriviallyDead(Op, TLI)) + DeadInsts.emplace_back(Op); + } + I->dropAllReferences(); + } + for (auto *I : DeletedInstructions) { + assert(I->use_empty() && "trying to erase instruction with users."); - Pair.getFirst()->eraseFromParent(); + I->eraseFromParent(); } + + // Cleanup any dead scalar code feeding the vectorized instructions + RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI); + #ifdef EXPENSIVE_CHECKS // If we could guarantee that this call is not extremely slow, we could // remove the ifdef limitation (see PR47712). @@ -2886,13 +3369,6 @@ BoUpSLP::~BoUpSLP() { #endif } -void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) { - for (auto *V : AV) { - if (auto *I = dyn_cast<Instruction>(V)) - eraseInstruction(I, /*ReplaceOpsWithUndef=*/true); - }; -} - /// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses /// contains original mask for the scalars reused in the node. 
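// --- Editor's sketch (not part of this patch): the generation-ID trick noted
// above, where bumping SchedulingRegionID logically "removes" all cached
// ScheduleData without clearing the map. Entries tagged with an older region
// ID are simply treated as absent. Standalone illustrative names.
#include <unordered_map>

struct RegionCacheSketch {
  struct Entry { int Value = 0; int RegionID = 0; };
  std::unordered_map<const void *, Entry> Map;
  int CurrentRegionID = 1; // Strictly greater than the default Entry ID (0).

  Entry *lookup(const void *Key) {
    auto It = Map.find(Key);
    if (It == Map.end() || It->second.RegionID != CurrentRegionID)
      return nullptr;            // Stale entries from earlier regions are ignored.
    return &It->second;
  }

  void startNewRegion() { ++CurrentRegionID; } // "Removes" every entry at once.
};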
Procedure /// transform this mask in accordance with the given \p Mask. @@ -2997,6 +3473,189 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { return None; } +namespace { +/// Tracks the state we can represent the loads in the given sequence. +enum class LoadsState { Gather, Vectorize, ScatterVectorize }; +} // anonymous namespace + +/// Checks if the given array of loads can be represented as a vectorized, +/// scatter or just simple gather. +static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0, + const TargetTransformInfo &TTI, + const DataLayout &DL, ScalarEvolution &SE, + LoopInfo &LI, + SmallVectorImpl<unsigned> &Order, + SmallVectorImpl<Value *> &PointerOps) { + // Check that a vectorized load would load the same memory as a scalar + // load. For example, we don't want to vectorize loads that are smaller + // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM + // treats loading/storing it as an i8 struct. If we vectorize loads/stores + // from such a struct, we read/write packed bits disagreeing with the + // unvectorized version. + Type *ScalarTy = VL0->getType(); + + if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy)) + return LoadsState::Gather; + + // Make sure all loads in the bundle are simple - we can't vectorize + // atomic or volatile loads. + PointerOps.clear(); + PointerOps.resize(VL.size()); + auto *POIter = PointerOps.begin(); + for (Value *V : VL) { + auto *L = cast<LoadInst>(V); + if (!L->isSimple()) + return LoadsState::Gather; + *POIter = L->getPointerOperand(); + ++POIter; + } + + Order.clear(); + // Check the order of pointer operands or that all pointers are the same. + bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order); + if (IsSorted || all_of(PointerOps, [&PointerOps](Value *P) { + if (getUnderlyingObject(P) != getUnderlyingObject(PointerOps.front())) + return false; + auto *GEP = dyn_cast<GetElementPtrInst>(P); + if (!GEP) + return false; + auto *GEP0 = cast<GetElementPtrInst>(PointerOps.front()); + return GEP->getNumOperands() == 2 && + ((isConstant(GEP->getOperand(1)) && + isConstant(GEP0->getOperand(1))) || + getSameOpcode({GEP->getOperand(1), GEP0->getOperand(1)}) + .getOpcode()); + })) { + if (IsSorted) { + Value *Ptr0; + Value *PtrN; + if (Order.empty()) { + Ptr0 = PointerOps.front(); + PtrN = PointerOps.back(); + } else { + Ptr0 = PointerOps[Order.front()]; + PtrN = PointerOps[Order.back()]; + } + Optional<int> Diff = + getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE); + // Check that the sorted loads are consecutive. + if (static_cast<unsigned>(*Diff) == VL.size() - 1) + return LoadsState::Vectorize; + } + // TODO: need to improve analysis of the pointers, if not all of them are + // GEPs or have > 2 operands, we end up with a gather node, which just + // increases the cost. 
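// --- Editor's sketch (not part of this patch): deciding that a group of
// loads is consecutive by looking only at their element offsets from a common
// base, the same idea as the getPointersDiff()-based check above. Offsets
// here are already expressed in elements; standalone illustrative code.
#include <algorithm>
#include <vector>

bool areConsecutiveOffsets(std::vector<int> Offsets) {
  if (Offsets.empty())
    return false;
  std::sort(Offsets.begin(), Offsets.end());
  // After sorting, every neighbour must be exactly one element apart.
  for (size_t I = 1; I < Offsets.size(); ++I)
    if (Offsets[I] != Offsets[I - 1] + 1)
      return false;
  return true;
}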
+ Loop *L = LI.getLoopFor(cast<LoadInst>(VL0)->getParent()); + bool ProfitableGatherPointers = + static_cast<unsigned>(count_if(PointerOps, [L](Value *V) { + return L && L->isLoopInvariant(V); + })) <= VL.size() / 2 && VL.size() > 2; + if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) { + auto *GEP = dyn_cast<GetElementPtrInst>(P); + return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) || + (GEP && GEP->getNumOperands() == 2); + })) { + Align CommonAlignment = cast<LoadInst>(VL0)->getAlign(); + for (Value *V : VL) + CommonAlignment = + std::min(CommonAlignment, cast<LoadInst>(V)->getAlign()); + auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); + if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) && + !TTI.forceScalarizeMaskedGather(VecTy, CommonAlignment)) + return LoadsState::ScatterVectorize; + } + } + + return LoadsState::Gather; +} + +bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy, + const DataLayout &DL, ScalarEvolution &SE, + SmallVectorImpl<unsigned> &SortedIndices) { + assert(llvm::all_of( + VL, [](const Value *V) { return V->getType()->isPointerTy(); }) && + "Expected list of pointer operands."); + // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each + // Ptr into, sort and return the sorted indices with values next to one + // another. + MapVector<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases; + Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U)); + + unsigned Cnt = 1; + for (Value *Ptr : VL.drop_front()) { + bool Found = any_of(Bases, [&](auto &Base) { + Optional<int> Diff = + getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE, + /*StrictCheck=*/true); + if (!Diff) + return false; + + Base.second.emplace_back(Ptr, *Diff, Cnt++); + return true; + }); + + if (!Found) { + // If we haven't found enough to usefully cluster, return early. + if (Bases.size() > VL.size() / 2 - 1) + return false; + + // Not found already - add a new Base + Bases[Ptr].emplace_back(Ptr, 0, Cnt++); + } + } + + // For each of the bases sort the pointers by Offset and check if any of the + // base become consecutively allocated. + bool AnyConsecutive = false; + for (auto &Base : Bases) { + auto &Vec = Base.second; + if (Vec.size() > 1) { + llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X, + const std::tuple<Value *, int, unsigned> &Y) { + return std::get<1>(X) < std::get<1>(Y); + }); + int InitialOffset = std::get<1>(Vec[0]); + AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](auto &P) { + return std::get<1>(P.value()) == int(P.index()) + InitialOffset; + }); + } + } + + // Fill SortedIndices array only if it looks worth-while to sort the ptrs. 
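// --- Editor's sketch (not part of this patch): the clustering idea behind
// clusterSortPtrAccesses() above. Accesses are grouped by base object, each
// group is sorted by offset, and the original indices are emitted so accesses
// to the same base end up next to one another. The "is it actually worth it"
// consecutiveness check from the patch is omitted here; names are illustrative.
#include <algorithm>
#include <map>
#include <utility>
#include <vector>

std::vector<unsigned>
clusterByBase(const std::vector<std::pair<int /*BaseId*/, int /*Offset*/>> &Accesses) {
  // BaseId -> list of (Offset, OriginalIndex).
  std::map<int, std::vector<std::pair<int, unsigned>>> Bases;
  for (unsigned I = 0; I < Accesses.size(); ++I)
    Bases[Accesses[I].first].push_back({Accesses[I].second, I});

  std::vector<unsigned> SortedIndices;
  for (auto &Base : Bases) {
    std::stable_sort(Base.second.begin(), Base.second.end());
    for (const auto &OffsetAndIdx : Base.second)
      SortedIndices.push_back(OffsetAndIdx.second); // Indices into Accesses, clustered per base.
  }
  return SortedIndices;
}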
+ SortedIndices.clear(); + if (!AnyConsecutive) + return false; + + for (auto &Base : Bases) { + for (auto &T : Base.second) + SortedIndices.push_back(std::get<2>(T)); + } + + assert(SortedIndices.size() == VL.size() && + "Expected SortedIndices to be the size of VL"); + return true; +} + +Optional<BoUpSLP::OrdersType> +BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) { + assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only."); + Type *ScalarTy = TE.Scalars[0]->getType(); + + SmallVector<Value *> Ptrs; + Ptrs.reserve(TE.Scalars.size()); + for (Value *V : TE.Scalars) { + auto *L = dyn_cast<LoadInst>(V); + if (!L || !L->isSimple()) + return None; + Ptrs.push_back(L->getPointerOperand()); + } + + BoUpSLP::OrdersType Order; + if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order)) + return Order; + return None; +} + Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { // No need to reorder if need to shuffle reuses, still need to shuffle the @@ -3037,6 +3696,9 @@ Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE, } if (Optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE)) return CurrentOrder; + if (TE.Scalars.size() >= 4) + if (Optional<OrdersType> Order = findPartiallyOrderedLoads(TE)) + return Order; } return None; } @@ -3047,13 +3709,55 @@ void BoUpSLP::reorderTopToBottom() { // ExtractElement gather nodes which can be vectorized and need to handle // their ordering. DenseMap<const TreeEntry *, OrdersType> GathersToOrders; + + // AltShuffles can also have a preferred ordering that leads to fewer + // instructions, e.g., the addsub instruction in x86. + DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders; + + // Maps a TreeEntry to the reorder indices of external users. + DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>> + ExternalUserReorderMap; + // FIXME: Workaround for syntax error reported by MSVC buildbots. + TargetTransformInfo &TTIRef = *TTI; // Find all reorderable nodes with the given VF. // Currently the are vectorized stores,loads,extracts + some gathering of // extracts. - for_each(VectorizableTree, [this, &VFToOrderedEntries, &GathersToOrders]( + for_each(VectorizableTree, [this, &TTIRef, &VFToOrderedEntries, + &GathersToOrders, &ExternalUserReorderMap, + &AltShufflesToOrders]( const std::unique_ptr<TreeEntry> &TE) { + // Look for external users that will probably be vectorized. + SmallVector<OrdersType, 1> ExternalUserReorderIndices = + findExternalStoreUsersReorderIndices(TE.get()); + if (!ExternalUserReorderIndices.empty()) { + VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); + ExternalUserReorderMap.try_emplace(TE.get(), + std::move(ExternalUserReorderIndices)); + } + + // Patterns like [fadd,fsub] can be combined into a single instruction in + // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need + // to take into account their order when looking for the most used order. + if (TE->isAltShuffle()) { + VectorType *VecTy = + FixedVectorType::get(TE->Scalars[0]->getType(), TE->Scalars.size()); + unsigned Opcode0 = TE->getOpcode(); + unsigned Opcode1 = TE->getAltOpcode(); + // The opcode mask selects between the two opcodes. + SmallBitVector OpcodeMask(TE->Scalars.size(), 0); + for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) + if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1) + OpcodeMask.set(Lane); + // If this pattern is supported by the target then we consider the order. 
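// --- Editor's sketch (not part of this patch): building the per-lane opcode
// mask used above to describe an alternating pattern such as
// [fadd, fsub, fadd, fsub]; set lanes mark the alternate opcode. Standalone
// illustrative code, not the SmallBitVector/TTI query from the patch.
#include <vector>

std::vector<bool> buildOpcodeMask(const std::vector<unsigned> &LaneOpcodes,
                                  unsigned AltOpcode) {
  std::vector<bool> OpcodeMask(LaneOpcodes.size(), false);
  for (unsigned Lane = 0; Lane < LaneOpcodes.size(); ++Lane)
    if (LaneOpcodes[Lane] == AltOpcode)
      OpcodeMask[Lane] = true; // This lane executes the alternate operation.
  return OpcodeMask;
}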
+ if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) { + VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); + AltShufflesToOrders.try_emplace(TE.get(), OrdersType()); + } + // TODO: Check the reverse order too. + } + if (Optional<OrdersType> CurrentOrder = - getReorderingData(*TE.get(), /*TopToBottom=*/true)) { + getReorderingData(*TE, /*TopToBottom=*/true)) { // Do not include ordering for nodes used in the alt opcode vectorization, // better to reorder them during bottom-to-top stage. If follow the order // here, it causes reordering of the whole graph though actually it is @@ -3071,10 +3775,7 @@ void BoUpSLP::reorderTopToBottom() { EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0; })) return; - if (UserTE->UserTreeIndices.empty()) - UserTE = nullptr; - else - UserTE = UserTE->UserTreeIndices.back().UserTE; + UserTE = UserTE->UserTreeIndices.back().UserTE; ++Cnt; } VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); @@ -3105,11 +3806,30 @@ void BoUpSLP::reorderTopToBottom() { if (!OpTE->ReuseShuffleIndices.empty()) continue; // Count number of orders uses. - const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & { - if (OpTE->State == TreeEntry::NeedToGather) - return GathersToOrders.find(OpTE)->second; + const auto &Order = [OpTE, &GathersToOrders, + &AltShufflesToOrders]() -> const OrdersType & { + if (OpTE->State == TreeEntry::NeedToGather) { + auto It = GathersToOrders.find(OpTE); + if (It != GathersToOrders.end()) + return It->second; + } + if (OpTE->isAltShuffle()) { + auto It = AltShufflesToOrders.find(OpTE); + if (It != AltShufflesToOrders.end()) + return It->second; + } return OpTE->ReorderIndices; }(); + // First consider the order of the external scalar users. + auto It = ExternalUserReorderMap.find(OpTE); + if (It != ExternalUserReorderMap.end()) { + const auto &ExternalUserReorderIndices = It->second; + for (const OrdersType &ExtOrder : ExternalUserReorderIndices) + ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second; + // No other useful reorder data in this entry. + if (Order.empty()) + continue; + } // Stores actually store the mask, not the order, need to invert. if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() && OpTE->getOpcode() == Instruction::Store && !Order.empty()) { @@ -3199,6 +3919,57 @@ void BoUpSLP::reorderTopToBottom() { } } +bool BoUpSLP::canReorderOperands( + TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges, + ArrayRef<TreeEntry *> ReorderableGathers, + SmallVectorImpl<TreeEntry *> &GatherOps) { + for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) { + if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) { + return OpData.first == I && + OpData.second->State == TreeEntry::Vectorize; + })) + continue; + if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) { + // Do not reorder if operand node is used by many user nodes. + if (any_of(TE->UserTreeIndices, + [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; })) + return false; + // Add the node to the list of the ordered nodes with the identity + // order. + Edges.emplace_back(I, TE); + // Add ScatterVectorize nodes to the list of operands, where just + // reordering of the scalars is required. Similar to the gathers, so + // simply add to the list of gathered ops. + // If there are reused scalars, process this node as a regular vectorize + // node, just reorder reuses mask. 
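// --- Editor's sketch (not part of this patch): counting how often each
// candidate order is requested and then choosing the most frequently used
// one, which is the role OrdersUses plays above (an empty order models the
// natural/identity order). Standalone illustrative code.
#include <map>
#include <vector>

using OrderTy = std::vector<unsigned>;

OrderTy pickMostUsedOrder(const std::vector<OrderTy> &Candidates) {
  std::map<OrderTy, unsigned> OrdersUses;
  for (const OrderTy &Order : Candidates)
    ++OrdersUses[Order]; // One vote per node that wants this order.

  OrderTy Best;          // Empty == keep the natural order.
  unsigned BestCount = 0;
  for (const auto &P : OrdersUses)
    if (P.second > BestCount) {
      Best = P.first;
      BestCount = P.second;
    }
  return Best;
}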
+ if (TE->State != TreeEntry::Vectorize && TE->ReuseShuffleIndices.empty()) + GatherOps.push_back(TE); + continue; + } + TreeEntry *Gather = nullptr; + if (count_if(ReorderableGathers, + [&Gather, UserTE, I](TreeEntry *TE) { + assert(TE->State != TreeEntry::Vectorize && + "Only non-vectorized nodes are expected."); + if (any_of(TE->UserTreeIndices, + [UserTE, I](const EdgeInfo &EI) { + return EI.UserTE == UserTE && EI.EdgeIdx == I; + })) { + assert(TE->isSame(UserTE->getOperand(I)) && + "Operand entry does not match operands."); + Gather = TE; + return true; + } + return false; + }) > 1 && + !all_of(UserTE->getOperand(I), isConstant)) + return false; + if (Gather) + GatherOps.push_back(Gather); + } + return true; +} + void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { SetVector<TreeEntry *> OrderedEntries; DenseMap<const TreeEntry *, OrdersType> GathersToOrders; @@ -3212,49 +3983,13 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { if (TE->State != TreeEntry::Vectorize) NonVectorized.push_back(TE.get()); if (Optional<OrdersType> CurrentOrder = - getReorderingData(*TE.get(), /*TopToBottom=*/false)) { + getReorderingData(*TE, /*TopToBottom=*/false)) { OrderedEntries.insert(TE.get()); if (TE->State != TreeEntry::Vectorize) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); } }); - // Checks if the operands of the users are reordarable and have only single - // use. - auto &&CheckOperands = - [this, &NonVectorized](const auto &Data, - SmallVectorImpl<TreeEntry *> &GatherOps) { - for (unsigned I = 0, E = Data.first->getNumOperands(); I < E; ++I) { - if (any_of(Data.second, - [I](const std::pair<unsigned, TreeEntry *> &OpData) { - return OpData.first == I && - OpData.second->State == TreeEntry::Vectorize; - })) - continue; - ArrayRef<Value *> VL = Data.first->getOperand(I); - const TreeEntry *TE = nullptr; - const auto *It = find_if(VL, [this, &TE](Value *V) { - TE = getTreeEntry(V); - return TE; - }); - if (It != VL.end() && TE->isSame(VL)) - return false; - TreeEntry *Gather = nullptr; - if (count_if(NonVectorized, [VL, &Gather](TreeEntry *TE) { - assert(TE->State != TreeEntry::Vectorize && - "Only non-vectorized nodes are expected."); - if (TE->isSame(VL)) { - Gather = TE; - return true; - } - return false; - }) > 1) - return false; - if (Gather) - GatherOps.push_back(Gather); - } - return true; - }; // 1. Propagate order to the graph nodes, which use only reordered nodes. // I.e., if the node has operands, that are reordered, try to make at least // one operand order in the natural order and reorder others + reorder the @@ -3263,7 +3998,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { while (!OrderedEntries.empty()) { // 1. Filter out only reordered nodes. // 2. If the entry has multiple uses - skip it and jump to the next node. - MapVector<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users; + DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users; SmallVector<TreeEntry *> Filtered; for (TreeEntry *TE : OrderedEntries) { if (!(TE->State == TreeEntry::Vectorize || @@ -3291,10 +4026,17 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { // Erase filtered entries. 
for_each(Filtered, [&OrderedEntries](TreeEntry *TE) { OrderedEntries.remove(TE); }); - for (const auto &Data : Users) { + SmallVector< + std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>> + UsersVec(Users.begin(), Users.end()); + sort(UsersVec, [](const auto &Data1, const auto &Data2) { + return Data1.first->Idx > Data2.first->Idx; + }); + for (auto &Data : UsersVec) { // Check that operands are used only in the User node. SmallVector<TreeEntry *> GatherOps; - if (!CheckOperands(Data, GatherOps)) { + if (!canReorderOperands(Data.first, Data.second, NonVectorized, + GatherOps)) { for_each(Data.second, [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) { OrderedEntries.remove(Op.second); @@ -3310,18 +4052,22 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { // the same node my be considered several times, though might be not // profitable. SmallPtrSet<const TreeEntry *, 4> VisitedOps; + SmallPtrSet<const TreeEntry *, 4> VisitedUsers; for (const auto &Op : Data.second) { TreeEntry *OpTE = Op.second; if (!VisitedOps.insert(OpTE).second) continue; - if (!OpTE->ReuseShuffleIndices.empty() || - (IgnoreReorder && OpTE == VectorizableTree.front().get())) + if (!OpTE->ReuseShuffleIndices.empty()) continue; const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & { if (OpTE->State == TreeEntry::NeedToGather) return GathersToOrders.find(OpTE)->second; return OpTE->ReorderIndices; }(); + unsigned NumOps = count_if( + Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) { + return P.second == OpTE; + }); // Stores actually store the mask, not the order, need to invert. if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() && OpTE->getOpcode() == Instruction::Store && !Order.empty()) { @@ -3333,14 +4079,52 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx); }); fixupOrderingIndices(CurrentOrder); - ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second; + OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second += + NumOps; } else { - ++OrdersUses.insert(std::make_pair(Order, 0)).first->second; + OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps; + } + auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0)); + const auto &&AllowsReordering = [IgnoreReorder, &GathersToOrders]( + const TreeEntry *TE) { + if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() || + (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) || + (IgnoreReorder && TE->Idx == 0)) + return true; + if (TE->State == TreeEntry::NeedToGather) { + auto It = GathersToOrders.find(TE); + if (It != GathersToOrders.end()) + return !It->second.empty(); + return true; + } + return false; + }; + for (const EdgeInfo &EI : OpTE->UserTreeIndices) { + TreeEntry *UserTE = EI.UserTE; + if (!VisitedUsers.insert(UserTE).second) + continue; + // May reorder user node if it requires reordering, has reused + // scalars, is an alternate op vectorize node or its op nodes require + // reordering. + if (AllowsReordering(UserTE)) + continue; + // Check if users allow reordering. + // Currently look up just 1 level of operands to avoid increase of + // the compile time. + // Profitable to reorder if definitely more operands allow + // reordering rather than those with natural order. 
+ ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE]; + if (static_cast<unsigned>(count_if( + Ops, [UserTE, &AllowsReordering]( + const std::pair<unsigned, TreeEntry *> &Op) { + return AllowsReordering(Op.second) && + all_of(Op.second->UserTreeIndices, + [UserTE](const EdgeInfo &EI) { + return EI.UserTE == UserTE; + }); + })) <= Ops.size() / 2) + ++Res.first->second; } - OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second += - OpTE->UserTreeIndices.size(); - assert(OrdersUses[{}] > 0 && "Counter cannot be less than 0."); - --OrdersUses[{}]; } // If no orders - skip current nodes and jump to the next one, if any. if (OrdersUses.empty()) { @@ -3381,7 +4165,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { OrderedEntries.remove(TE); if (!VisitedOps.insert(TE).second) continue; - if (!TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty()) { + if (TE->ReuseShuffleIndices.size() == BestOrder.size()) { // Just reorder reuses indices. reorderReuses(TE->ReuseShuffleIndices, Mask); continue; @@ -3393,6 +4177,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { TE->ReorderIndices.empty()) && "Non-matching sizes of user/operand entries."); reorderOrder(TE->ReorderIndices, Mask); + if (IgnoreReorder && TE == VectorizableTree.front().get()) + IgnoreReorder = false; } // For gathers just need to reorder its scalars. for (TreeEntry *Gather : GatherOps) { @@ -3484,7 +4270,7 @@ void BoUpSLP::buildExternalUses( } // Ignore users in the user ignore list. - if (is_contained(UserIgnoreList, UserInst)) + if (UserIgnoreList && UserIgnoreList->contains(UserInst)) continue; LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " @@ -3495,78 +4281,270 @@ void BoUpSLP::buildExternalUses( } } +DenseMap<Value *, SmallVector<StoreInst *, 4>> +BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const { + DenseMap<Value *, SmallVector<StoreInst *, 4>> PtrToStoresMap; + for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) { + Value *V = TE->Scalars[Lane]; + // To save compilation time we don't visit if we have too many users. + static constexpr unsigned UsersLimit = 4; + if (V->hasNUsesOrMore(UsersLimit)) + break; + + // Collect stores per pointer object. + for (User *U : V->users()) { + auto *SI = dyn_cast<StoreInst>(U); + if (SI == nullptr || !SI->isSimple() || + !isValidElementType(SI->getValueOperand()->getType())) + continue; + // Skip entry if already + if (getTreeEntry(U)) + continue; + + Value *Ptr = getUnderlyingObject(SI->getPointerOperand()); + auto &StoresVec = PtrToStoresMap[Ptr]; + // For now just keep one store per pointer object per lane. + // TODO: Extend this to support multiple stores per pointer per lane + if (StoresVec.size() > Lane) + continue; + // Skip if in different BBs. + if (!StoresVec.empty() && + SI->getParent() != StoresVec.back()->getParent()) + continue; + // Make sure that the stores are of the same type. + if (!StoresVec.empty() && + SI->getValueOperand()->getType() != + StoresVec.back()->getValueOperand()->getType()) + continue; + StoresVec.push_back(SI); + } + } + return PtrToStoresMap; +} + +bool BoUpSLP::CanFormVector(const SmallVector<StoreInst *, 4> &StoresVec, + OrdersType &ReorderIndices) const { + // We check whether the stores in StoreVec can form a vector by sorting them + // and checking whether they are consecutive. + + // To avoid calling getPointersDiff() while sorting we create a vector of + // pairs {store, offset from first} and sort this instead. 
+ SmallVector<std::pair<StoreInst *, int>, 4> StoreOffsetVec(StoresVec.size()); + StoreInst *S0 = StoresVec[0]; + StoreOffsetVec[0] = {S0, 0}; + Type *S0Ty = S0->getValueOperand()->getType(); + Value *S0Ptr = S0->getPointerOperand(); + for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) { + StoreInst *SI = StoresVec[Idx]; + Optional<int> Diff = + getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(), + SI->getPointerOperand(), *DL, *SE, + /*StrictCheck=*/true); + // We failed to compare the pointers so just abandon this StoresVec. + if (!Diff) + return false; + StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff}; + } + + // Sort the vector based on the pointers. We create a copy because we may + // need the original later for calculating the reorder (shuffle) indices. + stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1, + const std::pair<StoreInst *, int> &Pair2) { + int Offset1 = Pair1.second; + int Offset2 = Pair2.second; + return Offset1 < Offset2; + }); + + // Check if the stores are consecutive by checking if their difference is 1. + for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size())) + if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx-1].second + 1) + return false; + + // Calculate the shuffle indices according to their offset against the sorted + // StoreOffsetVec. + ReorderIndices.reserve(StoresVec.size()); + for (StoreInst *SI : StoresVec) { + unsigned Idx = find_if(StoreOffsetVec, + [SI](const std::pair<StoreInst *, int> &Pair) { + return Pair.first == SI; + }) - + StoreOffsetVec.begin(); + ReorderIndices.push_back(Idx); + } + // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in + // reorderTopToBottom() and reorderBottomToTop(), so we are following the + // same convention here. + auto IsIdentityOrder = [](const OrdersType &Order) { + for (unsigned Idx : seq<unsigned>(0, Order.size())) + if (Idx != Order[Idx]) + return false; + return true; + }; + if (IsIdentityOrder(ReorderIndices)) + ReorderIndices.clear(); + + return true; +} + +#ifndef NDEBUG +LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) { + for (unsigned Idx : Order) + dbgs() << Idx << ", "; + dbgs() << "\n"; +} +#endif + +SmallVector<BoUpSLP::OrdersType, 1> +BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const { + unsigned NumLanes = TE->Scalars.size(); + + DenseMap<Value *, SmallVector<StoreInst *, 4>> PtrToStoresMap = + collectUserStores(TE); + + // Holds the reorder indices for each candidate store vector that is a user of + // the current TreeEntry. + SmallVector<OrdersType, 1> ExternalReorderIndices; + + // Now inspect the stores collected per pointer and look for vectorization + // candidates. For each candidate calculate the reorder index vector and push + // it into `ExternalReorderIndices` + for (const auto &Pair : PtrToStoresMap) { + auto &StoresVec = Pair.second; + // If we have fewer than NumLanes stores, then we can't form a vector. + if (StoresVec.size() != NumLanes) + continue; + + // If the stores are not consecutive then abandon this StoresVec. + OrdersType ReorderIndices; + if (!CanFormVector(StoresVec, ReorderIndices)) + continue; + + // We now know that the scalars in StoresVec can form a vector instruction, + // so set the reorder indices. 
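// --- Editor's sketch (not part of this patch): deriving the reorder
// (shuffle) indices of a store group from their offsets, as CanFormVector()
// does above: each store's index is its position in the offset-sorted
// sequence, and the identity order is reported as an empty list. Assumes
// distinct offsets (consecutive slots); standalone illustrative code.
#include <algorithm>
#include <vector>

std::vector<unsigned> reorderFromOffsets(const std::vector<int> &Offsets) {
  std::vector<int> Sorted(Offsets);
  std::sort(Sorted.begin(), Sorted.end());

  std::vector<unsigned> ReorderIndices;
  ReorderIndices.reserve(Offsets.size());
  for (int Off : Offsets) {
    // Position of this element inside the sorted sequence.
    unsigned Idx = static_cast<unsigned>(
        std::lower_bound(Sorted.begin(), Sorted.end(), Off) - Sorted.begin());
    ReorderIndices.push_back(Idx);
  }

  // Identity order is conventionally modeled as an empty index list.
  bool IsIdentity = true;
  for (unsigned I = 0; I < ReorderIndices.size(); ++I)
    IsIdentity &= (ReorderIndices[I] == I);
  if (IsIdentity)
    ReorderIndices.clear();
  return ReorderIndices;
}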
+ ExternalReorderIndices.push_back(ReorderIndices); + } + return ExternalReorderIndices; +} + void BoUpSLP::buildTree(ArrayRef<Value *> Roots, - ArrayRef<Value *> UserIgnoreLst) { + const SmallDenseSet<Value *> &UserIgnoreLst) { deleteTree(); - UserIgnoreList = UserIgnoreLst; + UserIgnoreList = &UserIgnoreLst; if (!allSameType(Roots)) return; buildTree_rec(Roots, 0, EdgeInfo()); } -namespace { -/// Tracks the state we can represent the loads in the given sequence. -enum class LoadsState { Gather, Vectorize, ScatterVectorize }; -} // anonymous namespace - -/// Checks if the given array of loads can be represented as a vectorized, -/// scatter or just simple gather. -static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0, - const TargetTransformInfo &TTI, - const DataLayout &DL, ScalarEvolution &SE, - SmallVectorImpl<unsigned> &Order, - SmallVectorImpl<Value *> &PointerOps) { - // Check that a vectorized load would load the same memory as a scalar - // load. For example, we don't want to vectorize loads that are smaller - // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM - // treats loading/storing it as an i8 struct. If we vectorize loads/stores - // from such a struct, we read/write packed bits disagreeing with the - // unvectorized version. - Type *ScalarTy = VL0->getType(); - - if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy)) - return LoadsState::Gather; +void BoUpSLP::buildTree(ArrayRef<Value *> Roots) { + deleteTree(); + if (!allSameType(Roots)) + return; + buildTree_rec(Roots, 0, EdgeInfo()); +} - // Make sure all loads in the bundle are simple - we can't vectorize - // atomic or volatile loads. - PointerOps.clear(); - PointerOps.resize(VL.size()); - auto *POIter = PointerOps.begin(); +/// \return true if the specified list of values has only one instruction that +/// requires scheduling, false otherwise. +#ifndef NDEBUG +static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) { + Value *NeedsScheduling = nullptr; for (Value *V : VL) { - auto *L = cast<LoadInst>(V); - if (!L->isSimple()) - return LoadsState::Gather; - *POIter = L->getPointerOperand(); - ++POIter; + if (doesNotNeedToBeScheduled(V)) + continue; + if (!NeedsScheduling) { + NeedsScheduling = V; + continue; + } + return false; } + return NeedsScheduling; +} +#endif - Order.clear(); - // Check the order of pointer operands. - if (llvm::sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order)) { - Value *Ptr0; - Value *PtrN; - if (Order.empty()) { - Ptr0 = PointerOps.front(); - PtrN = PointerOps.back(); +/// Generates key/subkey pair for the given value to provide effective sorting +/// of the values and better detection of the vectorizable values sequences. The +/// keys/subkeys can be used for better sorting of the values themselves (keys) +/// and in values subgroups (subkeys). +static std::pair<size_t, size_t> generateKeySubkey( + Value *V, const TargetLibraryInfo *TLI, + function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, + bool AllowAlternate) { + hash_code Key = hash_value(V->getValueID() + 2); + hash_code SubKey = hash_value(0); + // Sort the loads by the distance between the pointers. + if (auto *LI = dyn_cast<LoadInst>(V)) { + Key = hash_combine(hash_value(Instruction::Load), Key); + if (LI->isSimple()) + SubKey = hash_value(LoadsSubkeyGenerator(Key, LI)); + else + SubKey = hash_value(LI); + } else if (isVectorLikeInstWithConstOps(V)) { + // Sort extracts by the vector operands. 
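// --- Editor's sketch (not part of this patch): the two-level key/subkey
// bucketing that generateKeySubkey() above enables. Values are grouped first
// by a coarse key and then by a finer subkey so that candidates which may
// vectorize together land in the same bucket. Standalone illustrative code;
// the hash functions themselves are supplied by the caller.
#include <cstddef>
#include <map>
#include <utility>
#include <vector>

template <typename ValueT, typename KeyFn>
std::map<std::pair<std::size_t, std::size_t>, std::vector<ValueT>>
groupByKeySubkey(const std::vector<ValueT> &Values, KeyFn GetKeySubkey) {
  std::map<std::pair<std::size_t, std::size_t>, std::vector<ValueT>> Groups;
  for (const ValueT &V : Values)
    Groups[GetKeySubkey(V)].push_back(V); // GetKeySubkey returns {key, subkey}.
  return Groups;
}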
+ if (isa<ExtractElementInst, UndefValue>(V)) + Key = hash_value(Value::UndefValueVal + 1); + if (auto *EI = dyn_cast<ExtractElementInst>(V)) { + if (!isUndefVector(EI->getVectorOperand()) && + !isa<UndefValue>(EI->getIndexOperand())) + SubKey = hash_value(EI->getVectorOperand()); + } + } else if (auto *I = dyn_cast<Instruction>(V)) { + // Sort other instructions just by the opcodes except for CMPInst. + // For CMP also sort by the predicate kind. + if ((isa<BinaryOperator>(I) || isa<CastInst>(I)) && + isValidForAlternation(I->getOpcode())) { + if (AllowAlternate) + Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0); + else + Key = hash_combine(hash_value(I->getOpcode()), Key); + SubKey = hash_combine( + hash_value(I->getOpcode()), hash_value(I->getType()), + hash_value(isa<BinaryOperator>(I) + ? I->getType() + : cast<CastInst>(I)->getOperand(0)->getType())); + // For casts, look through the only operand to improve compile time. + if (isa<CastInst>(I)) { + std::pair<size_t, size_t> OpVals = + generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator, + /*=AllowAlternate*/ true); + Key = hash_combine(OpVals.first, Key); + SubKey = hash_combine(OpVals.first, SubKey); + } + } else if (auto *CI = dyn_cast<CmpInst>(I)) { + CmpInst::Predicate Pred = CI->getPredicate(); + if (CI->isCommutative()) + Pred = std::min(Pred, CmpInst::getInversePredicate(Pred)); + CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred); + SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred), + hash_value(SwapPred), + hash_value(CI->getOperand(0)->getType())); + } else if (auto *Call = dyn_cast<CallInst>(I)) { + Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI); + if (isTriviallyVectorizable(ID)) { + SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID)); + } else if (!VFDatabase(*Call).getMappings(*Call).empty()) { + SubKey = hash_combine(hash_value(I->getOpcode()), + hash_value(Call->getCalledFunction())); + } else { + Key = hash_combine(hash_value(Call), Key); + SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call)); + } + for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos()) + SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End), + hash_value(Op.Tag), SubKey); + } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) { + if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1))) + SubKey = hash_value(Gep->getPointerOperand()); + else + SubKey = hash_value(Gep); + } else if (BinaryOperator::isIntDivRem(I->getOpcode()) && + !isa<ConstantInt>(I->getOperand(1))) { + // Do not try to vectorize instructions with potentially high cost. + SubKey = hash_value(I); } else { - Ptr0 = PointerOps[Order.front()]; - PtrN = PointerOps[Order.back()]; + SubKey = hash_value(I->getOpcode()); } - Optional<int> Diff = - getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE); - // Check that the sorted loads are consecutive. 
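// --- Editor's sketch (not part of this patch): a simplified model of the
// swapped-predicate handling hashed into the subkey above. A comparison and
// its operand-swapped twin ("a < b" vs "b > a") express the same relation,
// so the canonical predicate is chosen independently of operand order. The
// patch additionally folds in the inverse predicate for commutative compares;
// the enum and helpers below are illustrative only.
enum class PredSketch { LT, GT, LE, GE, EQ, NE };

constexpr PredSketch swappedPred(PredSketch P) {
  switch (P) {
  case PredSketch::LT: return PredSketch::GT;
  case PredSketch::GT: return PredSketch::LT;
  case PredSketch::LE: return PredSketch::GE;
  case PredSketch::GE: return PredSketch::LE;
  case PredSketch::EQ: return PredSketch::EQ; // Symmetric predicates are their own swap.
  case PredSketch::NE: return PredSketch::NE;
  }
  return P;
}

// Canonical bucket: the smaller of {P, swapped(P)}, so a compare and its
// swapped form hash to the same predicate (operands must be swapped to match).
constexpr PredSketch canonicalPred(PredSketch P) {
  PredSketch S = swappedPred(P);
  return static_cast<int>(S) < static_cast<int>(P) ? S : P;
}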
- if (static_cast<unsigned>(*Diff) == VL.size() - 1) - return LoadsState::Vectorize; - Align CommonAlignment = cast<LoadInst>(VL0)->getAlign(); - for (Value *V : VL) - CommonAlignment = - commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign()); - if (TTI.isLegalMaskedGather(FixedVectorType::get(ScalarTy, VL.size()), - CommonAlignment)) - return LoadsState::ScatterVectorize; + Key = hash_combine(hash_value(I->getParent()), Key); } - - return LoadsState::Gather; + return std::make_pair(Key, SubKey); } void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, @@ -3651,10 +4629,84 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // If all of the operands are identical or constant we have a simple solution. // If we deal with insert/extract instructions, they all must have constant // indices, otherwise we should gather them, not try to vectorize. - if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode() || - (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(S.MainOp) && - !all_of(VL, isVectorLikeInstWithConstOps))) { - LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); + // If alternate op node with 2 elements with gathered operands - do not + // vectorize. + auto &&NotProfitableForVectorization = [&S, this, + Depth](ArrayRef<Value *> VL) { + if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2) + return false; + if (VectorizableTree.size() < MinTreeSize) + return false; + if (Depth >= RecursionMaxDepth - 1) + return true; + // Check if all operands are extracts, part of vector node or can build a + // regular vectorize node. + SmallVector<unsigned, 2> InstsCount(VL.size(), 0); + for (Value *V : VL) { + auto *I = cast<Instruction>(V); + InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) { + return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op); + })); + } + bool IsCommutative = isCommutative(S.MainOp) || isCommutative(S.AltOp); + if ((IsCommutative && + std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) || + (!IsCommutative && + all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; }))) + return true; + assert(VL.size() == 2 && "Expected only 2 alternate op instructions."); + SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates; + auto *I1 = cast<Instruction>(VL.front()); + auto *I2 = cast<Instruction>(VL.back()); + for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op) + Candidates.emplace_back().emplace_back(I1->getOperand(Op), + I2->getOperand(Op)); + if (static_cast<unsigned>(count_if( + Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) { + return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat); + })) >= S.MainOp->getNumOperands() / 2) + return false; + if (S.MainOp->getNumOperands() > 2) + return true; + if (IsCommutative) { + // Check permuted operands. 
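// --- Editor's sketch (not part of this patch): the "enough operand pairs
// look like splats" test applied above to two-element alternate-opcode
// bundles. The real check scores pairs with the look-ahead heuristic
// (findBestRootPair / ScoreSplat); plain value equality below is a
// deliberately simplified stand-in, and all names are illustrative.
#include <utility>
#include <vector>

bool enoughSplatLikeOperandPairs(
    const std::vector<std::pair<const void *, const void *>> &OperandPairs) {
  unsigned SplatLike = 0;
  for (const auto &P : OperandPairs)
    if (P.first == P.second) // Same value would feed both lanes of this operand.
      ++SplatLike;
  // Require at least half of the operand positions to be splat-like.
  return !OperandPairs.empty() && SplatLike >= OperandPairs.size() / 2;
}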
+ Candidates.clear(); + for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op) + Candidates.emplace_back().emplace_back(I1->getOperand(Op), + I2->getOperand((Op + 1) % E)); + if (any_of( + Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) { + return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat); + })) + return false; + } + return true; + }; + SmallVector<unsigned> SortedIndices; + BasicBlock *BB = nullptr; + bool AreAllSameInsts = + (S.getOpcode() && allSameBlock(VL)) || + (S.OpValue->getType()->isPointerTy() && UserTreeIdx.UserTE && + UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize && + VL.size() > 2 && + all_of(VL, + [&BB](Value *V) { + auto *I = dyn_cast<GetElementPtrInst>(V); + if (!I) + return doesNotNeedToBeScheduled(V); + if (!BB) + BB = I->getParent(); + return BB == I->getParent() && I->getNumOperands() == 2; + }) && + BB && + sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE, + SortedIndices)); + if (allConstant(VL) || isSplat(VL) || !AreAllSameInsts || + (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>( + S.OpValue) && + !all_of(VL, isVectorLikeInstWithConstOps)) || + NotProfitableForVectorization(VL)) { + LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n"); if (TryToFindDuplicates(S)) newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); @@ -3665,12 +4717,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // the same block. // Don't vectorize ephemeral values. - for (Value *V : VL) { - if (EphValues.count(V)) { - LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V - << ") is ephemeral.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); - return; + if (!EphValues.empty()) { + for (Value *V : VL) { + if (EphValues.count(V)) { + LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V + << ") is ephemeral.\n"); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + return; + } } } @@ -3708,20 +4762,37 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } // The reduction nodes (stored in UserIgnoreList) also should stay scalar. - for (Value *V : VL) { - if (is_contained(UserIgnoreList, V)) { - LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - if (TryToFindDuplicates(S)) - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); - return; + if (UserIgnoreList && !UserIgnoreList->empty()) { + for (Value *V : VL) { + if (UserIgnoreList && UserIgnoreList->contains(V)) { + LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); + if (TryToFindDuplicates(S)) + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + return; + } } } + // Special processing for sorted pointers for ScatterVectorize node with + // constant indeces only. + if (AreAllSameInsts && !(S.getOpcode() && allSameBlock(VL)) && + UserTreeIdx.UserTE && + UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize) { + assert(S.OpValue->getType()->isPointerTy() && + count_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); }) >= + 2 && + "Expected pointers only."); + // Reset S to make it GetElementPtr kind of node. + const auto *It = find_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); }); + assert(It != VL.end() && "Expected at least one GEP."); + S = getSameOpcode(*It); + } + // Check that all of the users of the scalars that we want to vectorize are // schedulable. 
auto *VL0 = cast<Instruction>(S.OpValue); - BasicBlock *BB = VL0->getParent(); + BB = VL0->getParent(); if (!DT->isReachableFromEntry(BB)) { // Don't go into unreachable blocks. They may contain instructions with @@ -3739,9 +4810,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (!BSRef) BSRef = std::make_unique<BlockScheduling>(BB); - BlockScheduling &BS = *BSRef.get(); + BlockScheduling &BS = *BSRef; Optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S); +#ifdef EXPENSIVE_CHECKS + // Make sure we didn't break any internal invariants + BS.verify(); +#endif if (!Bundle) { LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); assert((!BS.getScheduleData(VL0) || @@ -3761,10 +4836,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Check for terminator values (e.g. invoke). for (Value *V : VL) - for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) { - Instruction *Term = dyn_cast<Instruction>( - cast<PHINode>(V)->getIncomingValueForBlock( - PH->getIncomingBlock(I))); + for (Value *Incoming : cast<PHINode>(V)->incoming_values()) { + Instruction *Term = dyn_cast<Instruction>(Incoming); if (Term && Term->isTerminator()) { LLVM_DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (terminator use).\n"); @@ -3908,7 +4981,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, SmallVector<Value *> PointerOps; OrdersType CurrentOrder; TreeEntry *TE = nullptr; - switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, CurrentOrder, + switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, *LI, CurrentOrder, PointerOps)) { case LoadsState::Vectorize: if (CurrentOrder.empty()) { @@ -4089,7 +5162,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, case Instruction::GetElementPtr: { // We don't combine GEPs with complicated (nested) indexing. for (Value *V : VL) { - if (cast<Instruction>(V)->getNumOperands() != 2) { + auto *I = dyn_cast<GetElementPtrInst>(V); + if (!I) + continue; + if (I->getNumOperands() != 2) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); BS.cancelScheduling(VL, VL0); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, @@ -4100,9 +5176,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // We can't combine several GEPs into one vector if they operate on // different types. - Type *Ty0 = VL0->getOperand(0)->getType(); + Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType(); for (Value *V : VL) { - Type *CurTy = cast<Instruction>(V)->getOperand(0)->getType(); + auto *GEP = dyn_cast<GEPOperator>(V); + if (!GEP) + continue; + Type *CurTy = GEP->getSourceElementType(); if (Ty0 != CurTy) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); @@ -4113,15 +5192,22 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } } + bool IsScatterUser = + UserTreeIdx.UserTE && + UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize; // We don't combine GEPs with non-constant indexes. 
Type *Ty1 = VL0->getOperand(1)->getType(); for (Value *V : VL) { - auto Op = cast<Instruction>(V)->getOperand(1); - if (!isa<ConstantInt>(Op) || + auto *I = dyn_cast<GetElementPtrInst>(V); + if (!I) + continue; + auto *Op = I->getOperand(1); + if ((!IsScatterUser && !isa<ConstantInt>(Op)) || (Op->getType() != Ty1 && - Op->getType()->getScalarSizeInBits() > - DL->getIndexSizeInBits( - V->getType()->getPointerAddressSpace()))) { + ((IsScatterUser && !isa<ConstantInt>(Op)) || + Op->getType()->getScalarSizeInBits() > + DL->getIndexSizeInBits( + V->getType()->getPointerAddressSpace())))) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); BS.cancelScheduling(VL, VL0); @@ -4136,9 +5222,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); SmallVector<ValueList, 2> Operands(2); // Prepare the operand vector for pointer operands. - for (Value *V : VL) - Operands.front().push_back( - cast<GetElementPtrInst>(V)->getPointerOperand()); + for (Value *V : VL) { + auto *GEP = dyn_cast<GetElementPtrInst>(V); + if (!GEP) { + Operands.front().push_back(V); + continue; + } + Operands.front().push_back(GEP->getPointerOperand()); + } TE->setOperand(0, Operands.front()); // Need to cast all indices to the same type before vectorization to // avoid crash. @@ -4149,9 +5240,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, Type *VL0Ty = VL0->getOperand(IndexIdx)->getType(); Type *Ty = all_of(VL, [VL0Ty, IndexIdx](Value *V) { - return VL0Ty == cast<GetElementPtrInst>(V) - ->getOperand(IndexIdx) - ->getType(); + auto *GEP = dyn_cast<GetElementPtrInst>(V); + if (!GEP) + return true; + return VL0Ty == GEP->getOperand(IndexIdx)->getType(); }) ? VL0Ty : DL->getIndexType(cast<GetElementPtrInst>(VL0) @@ -4159,10 +5251,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, ->getScalarType()); // Prepare the operand vector. for (Value *V : VL) { - auto *Op = cast<Instruction>(V)->getOperand(IndexIdx); - auto *CI = cast<ConstantInt>(Op); - Operands.back().push_back(ConstantExpr::getIntegerCast( - CI, Ty, CI->getValue().isSignBitSet())); + auto *I = dyn_cast<GetElementPtrInst>(V); + if (!I) { + Operands.back().push_back( + ConstantInt::get(Ty, 0, /*isSigned=*/false)); + continue; + } + auto *Op = I->getOperand(IndexIdx); + auto *CI = dyn_cast<ConstantInt>(Op); + if (!CI) + Operands.back().push_back(Op); + else + Operands.back().push_back(ConstantExpr::getIntegerCast( + CI, Ty, CI->getValue().isSignBitSet())); } TE->setOperand(IndexIdx, Operands.back()); @@ -4268,7 +5369,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, unsigned NumArgs = CI->arg_size(); SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr); for (unsigned j = 0; j != NumArgs; ++j) - if (hasVectorInstrinsicScalarOpd(ID, j)) + if (isVectorIntrinsicWithScalarOpAtArg(ID, j)) ScalarArgs[j] = CI->getArgOperand(j); for (Value *V : VL) { CallInst *CI2 = dyn_cast<CallInst>(V); @@ -4287,7 +5388,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Some intrinsics have scalar arguments and should be same in order for // them to be vectorized. 
for (unsigned j = 0; j != NumArgs; ++j) { - if (hasVectorInstrinsicScalarOpd(ID, j)) { + if (isVectorIntrinsicWithScalarOpAtArg(ID, j)) { Value *A1J = CI2->getArgOperand(j); if (ScalarArgs[j] != A1J) { BS.cancelScheduling(VL, VL0); @@ -4320,7 +5421,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) { // For scalar operands no need to to create an entry since no need to // vectorize it. - if (hasVectorInstrinsicScalarOpd(ID, i)) + if (isVectorIntrinsicWithScalarOpAtArg(ID, i)) continue; ValueList Operands; // Prepare the operand vector. @@ -4347,9 +5448,42 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. - if (isa<BinaryOperator>(VL0)) { + auto *CI = dyn_cast<CmpInst>(VL0); + if (isa<BinaryOperator>(VL0) || CI) { ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); + if (!CI || all_of(VL, [](Value *V) { + return cast<CmpInst>(V)->isCommutative(); + })) { + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); + } else { + CmpInst::Predicate P0 = CI->getPredicate(); + CmpInst::Predicate AltP0 = cast<CmpInst>(S.AltOp)->getPredicate(); + assert(P0 != AltP0 && + "Expected different main/alternate predicates."); + CmpInst::Predicate AltP0Swapped = CmpInst::getSwappedPredicate(AltP0); + Value *BaseOp0 = VL0->getOperand(0); + Value *BaseOp1 = VL0->getOperand(1); + // Collect operands - commute if it uses the swapped predicate or + // alternate operation. + for (Value *V : VL) { + auto *Cmp = cast<CmpInst>(V); + Value *LHS = Cmp->getOperand(0); + Value *RHS = Cmp->getOperand(1); + CmpInst::Predicate CurrentPred = Cmp->getPredicate(); + if (P0 == AltP0Swapped) { + if (CI != Cmp && S.AltOp != Cmp && + ((P0 == CurrentPred && + !areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)) || + (AltP0 == CurrentPred && + areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)))) + std::swap(LHS, RHS); + } else if (P0 != CurrentPred && AltP0 != CurrentPred) { + std::swap(LHS, RHS); + } + Left.push_back(LHS); + Right.push_back(RHS); + } + } TE->setOperand(0, Left); TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); @@ -4493,7 +5627,9 @@ bool BoUpSLP::areAllUsersVectorized(Instruction *I, ArrayRef<Value *> VectorizedVals) const { return (I->hasOneUse() && is_contained(VectorizedVals, I)) || all_of(I->users(), [this](User *U) { - return ScalarToTreeEntry.count(U) > 0 || MustGather.contains(U); + return ScalarToTreeEntry.count(U) > 0 || + isVectorLikeInstWithConstOps(U) || + (isa<ExtractElementInst>(U) && MustGather.contains(U)); }); } @@ -4550,19 +5686,21 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy, // Process extracts in blocks of EltsPerVector to check if the source vector // operand can be re-used directly. If not, add the cost of creating a shuffle // to extract the values into a vector register. + SmallVector<int> RegMask(EltsPerVector, UndefMaskElem); for (auto *V : VL) { ++Idx; - // Need to exclude undefs from analysis. - if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem) - continue; - // Reached the start of a new vector registers. if (Idx % EltsPerVector == 0) { + RegMask.assign(EltsPerVector, UndefMaskElem); AllConsecutive = true; continue; } + // Need to exclude undefs from analysis. 
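// --- Editor's sketch (not part of this patch): commuting the operands of a
// comparison so that a whole group uses one predicate direction, the idea
// behind the "commute if it uses the swapped predicate" loop above:
// "b > a" is rewritten as "a < b". Standalone illustrative types.
#include <utility>

enum class CmpDir { LT, GT };

struct CmpSketch {
  CmpDir Dir;
  const void *LHS;
  const void *RHS;
};

void normalizeToLessThan(CmpSketch &C) {
  if (C.Dir == CmpDir::GT) {
    std::swap(C.LHS, C.RHS); // Exchange the operands...
    C.Dir = CmpDir::LT;      // ...and use the direction the group expects.
  }
}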
+ if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem) + continue; + // Check all extracts for a vector register on the target directly // extract values in order. unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V)); @@ -4570,6 +5708,7 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy, unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1])); AllConsecutive &= PrevIdx + 1 == CurrentIdx && CurrentIdx % EltsPerVector == Idx % EltsPerVector; + RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector; } if (AllConsecutive) @@ -4581,10 +5720,10 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy, // If we have a series of extracts which are not consecutive and hence // cannot re-use the source vector register directly, compute the shuffle - // cost to extract the a vector with EltsPerVector elements. + // cost to extract the vector with EltsPerVector elements. Cost += TTI.getShuffleCost( TargetTransformInfo::SK_PermuteSingleSrc, - FixedVectorType::get(VecTy->getElementType(), EltsPerVector)); + FixedVectorType::get(VecTy->getElementType(), EltsPerVector), RegMask); } return Cost; } @@ -4592,12 +5731,12 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy, /// Build shuffle mask for shuffle graph entries and lists of main and alternate /// operations operands. static void -buildSuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices, - ArrayRef<int> ReusesIndices, - const function_ref<bool(Instruction *)> IsAltOp, - SmallVectorImpl<int> &Mask, - SmallVectorImpl<Value *> *OpScalars = nullptr, - SmallVectorImpl<Value *> *AltScalars = nullptr) { +buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices, + ArrayRef<int> ReusesIndices, + const function_ref<bool(Instruction *)> IsAltOp, + SmallVectorImpl<int> &Mask, + SmallVectorImpl<Value *> *OpScalars = nullptr, + SmallVectorImpl<Value *> *AltScalars = nullptr) { unsigned Sz = VL.size(); Mask.assign(Sz, UndefMaskElem); SmallVector<int> OrderMask; @@ -4627,6 +5766,29 @@ buildSuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices, } } +/// Checks if the specified instruction \p I is an alternate operation for the +/// given \p MainOp and \p AltOp instructions. 
+static bool isAlternateInstruction(const Instruction *I, + const Instruction *MainOp, + const Instruction *AltOp) { + if (auto *CI0 = dyn_cast<CmpInst>(MainOp)) { + auto *AltCI0 = cast<CmpInst>(AltOp); + auto *CI = cast<CmpInst>(I); + CmpInst::Predicate P0 = CI0->getPredicate(); + CmpInst::Predicate AltP0 = AltCI0->getPredicate(); + assert(P0 != AltP0 && "Expected different main/alternate predicates."); + CmpInst::Predicate AltP0Swapped = CmpInst::getSwappedPredicate(AltP0); + CmpInst::Predicate CurrentPred = CI->getPredicate(); + if (P0 == AltP0Swapped) + return I == AltCI0 || + (I != MainOp && + !areCompatibleCmpOps(CI0->getOperand(0), CI0->getOperand(1), + CI->getOperand(0), CI->getOperand(1))); + return AltP0 == CurrentPred || AltP0Swapped == CurrentPred; + } + return I->getOpcode() == AltOp->getOpcode(); +} + InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals) { ArrayRef<Value*> VL = E->Scalars; @@ -4740,7 +5902,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, SmallVector<const TreeEntry *> Entries; Optional<TargetTransformInfo::ShuffleKind> Shuffle = isGatherShuffledEntry(E, Mask, Entries); - if (Shuffle.hasValue()) { + if (Shuffle) { InstructionCost GatherCost = 0; if (ShuffleVectorInst::isIdentityMask(Mask)) { // Perfect match in the graph, will reuse the previously vectorized @@ -4776,7 +5938,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, SmallVector<int> Mask; Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isFixedVectorShuffle(VL, Mask); - if (ShuffleKind.hasValue()) { + if (ShuffleKind) { // Found the bunch of extractelement instructions that must be gathered // into a vector and can be represented as a permutation elements in a // single input vector or of 2 input vectors. @@ -4794,7 +5956,9 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // broadcast. assert(VecTy == FinalVecTy && "No reused scalars expected for broadcast."); - return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy); + return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, + /*Mask=*/None, /*Index=*/0, + /*SubTp=*/nullptr, /*Args=*/VL[0]); } InstructionCost ReuseShuffleCost = 0; if (NeedToShuffleReuses) @@ -4818,8 +5982,9 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) { SmallVector<Value *> PointerOps; OrdersType CurrentOrder; - LoadsState LS = canVectorizeLoads(Slice, Slice.front(), *TTI, *DL, - *SE, CurrentOrder, PointerOps); + LoadsState LS = + canVectorizeLoads(Slice, Slice.front(), *TTI, *DL, *SE, *LI, + CurrentOrder, PointerOps); switch (LS) { case LoadsState::Vectorize: case LoadsState::ScatterVectorize: @@ -4909,7 +6074,11 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, assert((E->State == TreeEntry::Vectorize || E->State == TreeEntry::ScatterVectorize) && "Unhandled state"); - assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); + assert(E->getOpcode() && + ((allSameType(VL) && allSameBlock(VL)) || + (E->getOpcode() == Instruction::GetElementPtr && + E->getMainOp()->getType()->isPointerTy())) && + "Invalid VL"); Instruction *VL0 = E->getMainOp(); unsigned ShuffleOrOp = E->isAltShuffle() ? 
(unsigned)Instruction::ShuffleVector : E->getOpcode(); @@ -4981,28 +6150,60 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, assert(E->ReuseShuffleIndices.empty() && "Unique insertelements only are expected."); auto *SrcVecTy = cast<FixedVectorType>(VL0->getType()); - unsigned const NumElts = SrcVecTy->getNumElements(); unsigned const NumScalars = VL.size(); + + unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy); + + unsigned OffsetBeg = *getInsertIndex(VL.front()); + unsigned OffsetEnd = OffsetBeg; + for (Value *V : VL.drop_front()) { + unsigned Idx = *getInsertIndex(V); + if (OffsetBeg > Idx) + OffsetBeg = Idx; + else if (OffsetEnd < Idx) + OffsetEnd = Idx; + } + unsigned VecScalarsSz = PowerOf2Ceil(NumElts); + if (NumOfParts > 0) + VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts); + unsigned VecSz = + (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) * + VecScalarsSz; + unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz); + unsigned InsertVecSz = std::min<unsigned>( + PowerOf2Ceil(OffsetEnd - OffsetBeg + 1), + ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * + VecScalarsSz); + bool IsWholeSubvector = + OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0); + // Check if we can safely insert a subvector. If it is not possible, just + // generate a whole-sized vector and shuffle the source vector and the new + // subvector. + if (OffsetBeg + InsertVecSz > VecSz) { + // Align OffsetBeg to generate correct mask. + OffsetBeg = alignDown(OffsetBeg, VecSz, Offset); + InsertVecSz = VecSz; + } + APInt DemandedElts = APInt::getZero(NumElts); // TODO: Add support for Instruction::InsertValue. SmallVector<int> Mask; if (!E->ReorderIndices.empty()) { inversePermutation(E->ReorderIndices, Mask); - Mask.append(NumElts - NumScalars, UndefMaskElem); + Mask.append(InsertVecSz - Mask.size(), UndefMaskElem); } else { - Mask.assign(NumElts, UndefMaskElem); - std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0); + Mask.assign(VecSz, UndefMaskElem); + std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0); } - unsigned Offset = *getInsertIndex(VL0, 0); bool IsIdentity = true; - SmallVector<int> PrevMask(NumElts, UndefMaskElem); + SmallVector<int> PrevMask(InsertVecSz, UndefMaskElem); Mask.swap(PrevMask); for (unsigned I = 0; I < NumScalars; ++I) { unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]); DemandedElts.setBit(InsertIdx); - IsIdentity &= InsertIdx - Offset == I; - Mask[InsertIdx - Offset] = I; + IsIdentity &= InsertIdx - OffsetBeg == I; + Mask[InsertIdx - OffsetBeg] = I; } assert(Offset < NumElts && "Failed to find vector index offset"); @@ -5010,32 +6211,41 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts, /*Insert*/ true, /*Extract*/ false); - if (IsIdentity && NumElts != NumScalars && Offset % NumScalars != 0) { - // FIXME: Replace with SK_InsertSubvector once it is properly supported. 
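The insertelement cost hunk above sizes the subvector that will actually be built from the span of insert positions rather than from the full destination vector. A small standalone recomputation of those quantities for a concrete, illustrative case (NumElts = 16 split across two target registers, inserts at indices 5..9), mirroring the formulas in the diff:

#include <algorithm>
#include <cstdio>

// Round up to the next power of two (stand-in for llvm::PowerOf2Ceil).
static unsigned powerOf2Ceil(unsigned X) {
  unsigned P = 1;
  while (P < X)
    P <<= 1;
  return P;
}

int main() {
  const unsigned NumElts = 16;   // destination vector, e.g. <16 x i32>
  const unsigned NumOfParts = 2; // vector split across 2 registers
  const unsigned OffsetBeg = 5, OffsetEnd = 9; // min/max insert indices

  unsigned VecScalarsSz = powerOf2Ceil(NumElts);
  if (NumOfParts > 0)
    VecScalarsSz = powerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
  unsigned VecSz =
      (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) * VecScalarsSz;
  unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
  unsigned InsertVecSz = std::min(
      powerOf2Ceil(OffsetEnd - OffsetBeg + 1),
      ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);

  // Prints: VecScalarsSz=8 VecSz=16 Offset=0 InsertVecSz=8
  std::printf("VecScalarsSz=%u VecSz=%u Offset=%u InsertVecSz=%u\n",
              VecScalarsSz, VecSz, Offset, InsertVecSz);
}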
- unsigned Sz = PowerOf2Ceil(Offset + NumScalars); - Cost += TTI->getShuffleCost( - TargetTransformInfo::SK_PermuteSingleSrc, - FixedVectorType::get(SrcVecTy->getElementType(), Sz)); - } else if (!IsIdentity) { - auto *FirstInsert = - cast<Instruction>(*find_if(E->Scalars, [E](Value *V) { - return !is_contained(E->Scalars, - cast<Instruction>(V)->getOperand(0)); - })); - if (isUndefVector(FirstInsert->getOperand(0))) { - Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, Mask); + // First cost - resize to actual vector size if not identity shuffle or + // need to shift the vector. + // Do not calculate the cost if the actual size is the register size and + // we can merge this shuffle with the following SK_Select. + auto *InsertVecTy = + FixedVectorType::get(SrcVecTy->getElementType(), InsertVecSz); + if (!IsIdentity) + Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, + InsertVecTy, Mask); + auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) { + return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0)); + })); + // Second cost - permutation with subvector, if some elements are from the + // initial vector or inserting a subvector. + // TODO: Implement the analysis of the FirstInsert->getOperand(0) + // subvector of ActualVecTy. + if (!isUndefVector(FirstInsert->getOperand(0)) && NumScalars != NumElts && + !IsWholeSubvector) { + if (InsertVecSz != VecSz) { + auto *ActualVecTy = + FixedVectorType::get(SrcVecTy->getElementType(), VecSz); + Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy, + None, OffsetBeg - Offset, InsertVecTy); } else { - SmallVector<int> InsertMask(NumElts); - std::iota(InsertMask.begin(), InsertMask.end(), 0); - for (unsigned I = 0; I < NumElts; I++) { + for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I) + Mask[I] = I; + for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset; + I <= End; ++I) if (Mask[I] != UndefMaskElem) - InsertMask[Offset + I] = NumElts + I; - } - Cost += - TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVecTy, InsertMask); + Mask[I] = I + VecSz; + for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I) + Mask[I] = I; + Cost += TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, InsertVecTy, Mask); } } - return Cost; } case Instruction::ZExt: @@ -5116,9 +6326,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // If the selects are the only uses of the compares, they will be dead // and we can adjust the cost by removing their cost. if (IntrinsicAndUse.second) - IntrinsicCost -= - TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy, MaskTy, - CmpInst::BAD_ICMP_PREDICATE, CostKind); + IntrinsicCost -= TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy, + MaskTy, VecPred, CostKind); VecCost = std::min(VecCost, IntrinsicCost); } LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); @@ -5198,7 +6407,14 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, TargetTransformInfo::OperandValueKind Op1VK = TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueKind Op2VK = - TargetTransformInfo::OK_UniformConstantValue; + any_of(VL, + [](Value *V) { + return isa<GetElementPtrInst>(V) && + !isConstant( + cast<GetElementPtrInst>(V)->getOperand(1)); + }) + ? 
TargetTransformInfo::OK_AnyValue + : TargetTransformInfo::OK_UniformConstantValue; InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost( Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK); @@ -5229,7 +6445,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, Align CommonAlignment = Alignment; for (Value *V : VL) CommonAlignment = - commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign()); + std::min(CommonAlignment, cast<LoadInst>(V)->getAlign()); VecLdCost = TTI->getGatherScatterOpCost( Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(), /*VariableMask=*/false, CommonAlignment, CostKind, VL0); @@ -5279,7 +6495,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, ((Instruction::isBinaryOp(E->getOpcode()) && Instruction::isBinaryOp(E->getAltOpcode())) || (Instruction::isCast(E->getOpcode()) && - Instruction::isCast(E->getAltOpcode()))) && + Instruction::isCast(E->getAltOpcode())) || + (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) && "Invalid Shuffle Vector Operand"); InstructionCost ScalarCost = 0; if (NeedToShuffleReuses) { @@ -5327,6 +6544,14 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind); VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind); + } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) { + VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, + Builder.getInt1Ty(), + CI0->getPredicate(), CostKind, VL0); + VecCost += TTI->getCmpSelInstrCost( + E->getOpcode(), ScalarTy, Builder.getInt1Ty(), + cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind, + E->getAltOp()); } else { Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType(); Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType(); @@ -5338,16 +6563,21 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, TTI::CastContextHint::None, CostKind); } - SmallVector<int> Mask; - buildSuffleEntryMask( - E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, - [E](Instruction *I) { - assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); - return I->getOpcode() == E->getAltOpcode(); - }, - Mask); - CommonCost = - TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy, Mask); + if (E->ReuseShuffleIndices.empty()) { + CommonCost = + TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy); + } else { + SmallVector<int> Mask; + buildShuffleEntryMask( + E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, + [E](Instruction *I) { + assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); + return I->getOpcode() == E->getAltOpcode(); + }, + Mask); + CommonCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, + FinalVecTy, Mask); + } LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); return CommonCost + VecCost - ScalarCost; } @@ -5475,7 +6705,10 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { // No need to vectorize inserts of gathered values. 
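For an alternate-opcode node the cost computed above is two full-width vector operations plus one blend; the blend mask picks lane i from the main-opcode result and lane VF + i from the alternate result. A tiny standalone illustration of that mask construction (the lane classification is hypothetical, this is not buildShuffleEntryMask itself):

#include <cstdio>
#include <vector>

int main() {
  // Lane opcodes of an alternate-shuffle bundle, e.g. {add, sub, add, sub}.
  const bool IsAltLane[] = {false, true, false, true};
  const unsigned VF = 4;

  // V0 holds the vectorized main opcode, V1 the alternate one; the final
  // shufflevector blends them lane by lane.
  std::vector<int> Mask(VF);
  for (unsigned I = 0; I < VF; ++I)
    Mask[I] = IsAltLane[I] ? static_cast<int>(VF + I) : static_cast<int>(I);

  // Prints: 0 5 2 7  -> lanes 0,2 from V0 (add), lanes 1,3 from V1 (sub).
  for (int M : Mask)
    std::printf("%d ", M);
  std::printf("\n");
}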
if (VectorizableTree.size() == 2 && isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) && - VectorizableTree[1]->State == TreeEntry::NeedToGather) + VectorizableTree[1]->State == TreeEntry::NeedToGather && + (VectorizableTree[1]->getVectorFactor() <= 2 || + !(isSplat(VectorizableTree[1]->Scalars) || + allConstant(VectorizableTree[1]->Scalars)))) return true; // We can vectorize the tree if its size is greater than or equal to the @@ -5605,20 +6838,26 @@ static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, return false; auto *IE1 = VU; auto *IE2 = V; + unsigned Idx1 = *getInsertIndex(IE1); + unsigned Idx2 = *getInsertIndex(IE2); // Go through the vector operand of insertelement instructions trying to find // either VU as the original vector for IE2 or V as the original vector for // IE1. do { - if (IE2 == VU || IE1 == V) - return true; + if (IE2 == VU) + return VU->hasOneUse(); + if (IE1 == V) + return V->hasOneUse(); if (IE1) { - if (IE1 != VU && !IE1->hasOneUse()) + if ((IE1 != VU && !IE1->hasOneUse()) || + getInsertIndex(IE1).value_or(Idx2) == Idx2) IE1 = nullptr; else IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0)); } if (IE2) { - if (IE2 != V && !IE2->hasOneUse()) + if ((IE2 != V && !IE2->hasOneUse()) || + getInsertIndex(IE2).value_or(Idx1) == Idx1) IE2 = nullptr; else IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0)); @@ -5627,6 +6866,153 @@ static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, return false; } +/// Checks if the \p IE1 instructions is followed by \p IE2 instruction in the +/// buildvector sequence. +static bool isFirstInsertElement(const InsertElementInst *IE1, + const InsertElementInst *IE2) { + if (IE1 == IE2) + return false; + const auto *I1 = IE1; + const auto *I2 = IE2; + const InsertElementInst *PrevI1; + const InsertElementInst *PrevI2; + unsigned Idx1 = *getInsertIndex(IE1); + unsigned Idx2 = *getInsertIndex(IE2); + do { + if (I2 == IE1) + return true; + if (I1 == IE2) + return false; + PrevI1 = I1; + PrevI2 = I2; + if (I1 && (I1 == IE1 || I1->hasOneUse()) && + getInsertIndex(I1).value_or(Idx2) != Idx2) + I1 = dyn_cast<InsertElementInst>(I1->getOperand(0)); + if (I2 && ((I2 == IE2 || I2->hasOneUse())) && + getInsertIndex(I2).value_or(Idx1) != Idx1) + I2 = dyn_cast<InsertElementInst>(I2->getOperand(0)); + } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2)); + llvm_unreachable("Two different buildvectors not expected."); +} + +namespace { +/// Returns incoming Value *, if the requested type is Value * too, or a default +/// value, otherwise. +struct ValueSelect { + template <typename U> + static typename std::enable_if<std::is_same<Value *, U>::value, Value *>::type + get(Value *V) { + return V; + } + template <typename U> + static typename std::enable_if<!std::is_same<Value *, U>::value, U>::type + get(Value *) { + return U(); + } +}; +} // namespace + +/// Does the analysis of the provided shuffle masks and performs the requested +/// actions on the vectors with the given shuffle masks. It tries to do it in +/// several steps. +/// 1. If the Base vector is not undef vector, resizing the very first mask to +/// have common VF and perform action for 2 input vectors (including non-undef +/// Base). Other shuffle masks are combined with the resulting after the 1 stage +/// and processed as a shuffle of 2 elements. +/// 2. If the Base is undef vector and have only 1 shuffle mask, perform the +/// action only for 1 vector with the given mask, if it is not the identity +/// mask. +/// 3. 
If > 2 masks are used, perform the remaining shuffle actions for 2 +/// vectors, combing the masks properly between the steps. +template <typename T> +static T *performExtractsShuffleAction( + MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base, + function_ref<unsigned(T *)> GetVF, + function_ref<std::pair<T *, bool>(T *, ArrayRef<int>)> ResizeAction, + function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) { + assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts."); + SmallVector<int> Mask(ShuffleMask.begin()->second); + auto VMIt = std::next(ShuffleMask.begin()); + T *Prev = nullptr; + bool IsBaseNotUndef = !isUndefVector(Base); + if (IsBaseNotUndef) { + // Base is not undef, need to combine it with the next subvectors. + std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask); + for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) { + if (Mask[Idx] == UndefMaskElem) + Mask[Idx] = Idx; + else + Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF; + } + auto *V = ValueSelect::get<T *>(Base); + (void)V; + assert((!V || GetVF(V) == Mask.size()) && + "Expected base vector of VF number of elements."); + Prev = Action(Mask, {nullptr, Res.first}); + } else if (ShuffleMask.size() == 1) { + // Base is undef and only 1 vector is shuffled - perform the action only for + // single vector, if the mask is not the identity mask. + std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask); + if (Res.second) + // Identity mask is found. + Prev = Res.first; + else + Prev = Action(Mask, {ShuffleMask.begin()->first}); + } else { + // Base is undef and at least 2 input vectors shuffled - perform 2 vectors + // shuffles step by step, combining shuffle between the steps. + unsigned Vec1VF = GetVF(ShuffleMask.begin()->first); + unsigned Vec2VF = GetVF(VMIt->first); + if (Vec1VF == Vec2VF) { + // No need to resize the input vectors since they are of the same size, we + // can shuffle them directly. + ArrayRef<int> SecMask = VMIt->second; + for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) { + if (SecMask[I] != UndefMaskElem) { + assert(Mask[I] == UndefMaskElem && "Multiple uses of scalars."); + Mask[I] = SecMask[I] + Vec1VF; + } + } + Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first}); + } else { + // Vectors of different sizes - resize and reshuffle. + std::pair<T *, bool> Res1 = + ResizeAction(ShuffleMask.begin()->first, Mask); + std::pair<T *, bool> Res2 = ResizeAction(VMIt->first, VMIt->second); + ArrayRef<int> SecMask = VMIt->second; + for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) { + if (Mask[I] != UndefMaskElem) { + assert(SecMask[I] == UndefMaskElem && "Multiple uses of scalars."); + if (Res1.second) + Mask[I] = I; + } else if (SecMask[I] != UndefMaskElem) { + assert(Mask[I] == UndefMaskElem && "Multiple uses of scalars."); + Mask[I] = (Res2.second ? I : SecMask[I]) + VF; + } + } + Prev = Action(Mask, {Res1.first, Res2.first}); + } + VMIt = std::next(VMIt); + } + // Perform requested actions for the remaining masks/vectors. + for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) { + // Shuffle other input vectors, if any. + std::pair<T *, bool> Res = ResizeAction(VMIt->first, VMIt->second); + ArrayRef<int> SecMask = VMIt->second; + for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) { + if (SecMask[I] != UndefMaskElem) { + assert((Mask[I] == UndefMaskElem || IsBaseNotUndef) && + "Multiple uses of scalars."); + Mask[I] = (Res.second ? 
I : SecMask[I]) + VF; + } else if (Mask[I] != UndefMaskElem) { + Mask[I] = I; + } + } + Prev = Action(Mask, {Prev, Res.first}); + } + return Prev; +} + InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { InstructionCost Cost = 0; LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " @@ -5635,7 +7021,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { unsigned BundleWidth = VectorizableTree[0]->Scalars.size(); for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { - TreeEntry &TE = *VectorizableTree[I].get(); + TreeEntry &TE = *VectorizableTree[I]; InstructionCost C = getEntryCost(&TE, VectorizedVals); Cost += C; @@ -5647,9 +7033,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { SmallPtrSet<Value *, 16> ExtractCostCalculated; InstructionCost ExtractCost = 0; - SmallVector<unsigned> VF; - SmallVector<SmallVector<int>> ShuffleMask; - SmallVector<Value *> FirstUsers; + SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks; + SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers; SmallVector<APInt> DemandedElts; for (ExternalUser &EU : ExternalUses) { // We only add extract cost once for the same scalar. @@ -5678,37 +7063,55 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) { Optional<unsigned> InsertIdx = getInsertIndex(VU); if (InsertIdx) { - auto *It = find_if(FirstUsers, [VU](Value *V) { - return areTwoInsertFromSameBuildVector(VU, - cast<InsertElementInst>(V)); - }); + const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar); + auto *It = + find_if(FirstUsers, + [VU](const std::pair<Value *, const TreeEntry *> &Pair) { + return areTwoInsertFromSameBuildVector( + VU, cast<InsertElementInst>(Pair.first)); + }); int VecId = -1; if (It == FirstUsers.end()) { - VF.push_back(FTy->getNumElements()); - ShuffleMask.emplace_back(VF.back(), UndefMaskElem); + (void)ShuffleMasks.emplace_back(); + SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE]; + if (Mask.empty()) + Mask.assign(FTy->getNumElements(), UndefMaskElem); // Find the insertvector, vectorized in tree, if any. Value *Base = VU; - while (isa<InsertElementInst>(Base)) { + while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) { + if (IEBase != EU.User && + (!IEBase->hasOneUse() || + getInsertIndex(IEBase).value_or(*InsertIdx) == *InsertIdx)) + break; // Build the mask for the vectorized insertelement instructions. 
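performExtractsShuffleAction above folds a sequence of per-entry masks into successive two-source shuffles: lanes owned by the second input are re-indexed by adding the first input's width so one shuffle can read from both sources. A minimal standalone sketch of that combining step, assuming both inputs already have the same element count (the differing-VF case goes through the resize action in the diff):

#include <cassert>
#include <cstdio>
#include <vector>

static const int UndefElem = -1;

// Combine the mask of a second vector into an accumulated two-source mask.
// Lanes the second mask defines are offset by the first vector's width.
static void combineTwoSourceMask(std::vector<int> &Mask,
                                 const std::vector<int> &SecMask,
                                 unsigned Vec1VF) {
  assert(Mask.size() == SecMask.size() && "masks must have equal width");
  for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
    if (SecMask[I] == UndefElem)
      continue;
    assert(Mask[I] == UndefElem && "each lane comes from exactly one source");
    Mask[I] = SecMask[I] + static_cast<int>(Vec1VF);
  }
}

int main() {
  std::vector<int> Mask    = {0, UndefElem, 2, UndefElem};
  std::vector<int> SecMask = {UndefElem, 1, UndefElem, 3};
  combineTwoSourceMask(Mask, SecMask, /*Vec1VF=*/4);
  for (int M : Mask)
    std::printf("%d ", M); // prints: 0 5 2 7
  std::printf("\n");
}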
- if (const TreeEntry *E = getTreeEntry(Base)) { - VU = cast<InsertElementInst>(Base); + if (const TreeEntry *E = getTreeEntry(IEBase)) { + VU = IEBase; do { - int Idx = E->findLaneForValue(Base); - ShuffleMask.back()[Idx] = Idx; - Base = cast<InsertElementInst>(Base)->getOperand(0); + IEBase = cast<InsertElementInst>(Base); + int Idx = *getInsertIndex(IEBase); + assert(Mask[Idx] == UndefMaskElem && + "InsertElementInstruction used already."); + Mask[Idx] = Idx; + Base = IEBase->getOperand(0); } while (E == getTreeEntry(Base)); break; } Base = cast<InsertElementInst>(Base)->getOperand(0); } - FirstUsers.push_back(VU); - DemandedElts.push_back(APInt::getZero(VF.back())); + FirstUsers.emplace_back(VU, ScalarTE); + DemandedElts.push_back(APInt::getZero(FTy->getNumElements())); VecId = FirstUsers.size() - 1; } else { + if (isFirstInsertElement(VU, cast<InsertElementInst>(It->first))) + It->first = VU; VecId = std::distance(FirstUsers.begin(), It); } - ShuffleMask[VecId][*InsertIdx] = EU.Lane; - DemandedElts[VecId].setBit(*InsertIdx); + int InIdx = *InsertIdx; + SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE]; + if (Mask.empty()) + Mask.assign(FTy->getNumElements(), UndefMaskElem); + Mask[InIdx] = EU.Lane; + DemandedElts[VecId].setBit(InIdx); continue; } } @@ -5734,86 +7137,75 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { InstructionCost SpillCost = getSpillCost(); Cost += SpillCost + ExtractCost; - if (FirstUsers.size() == 1) { - int Limit = ShuffleMask.front().size() * 2; - if (all_of(ShuffleMask.front(), [Limit](int Idx) { return Idx < Limit; }) && - !ShuffleVectorInst::isIdentityMask(ShuffleMask.front())) { - InstructionCost C = TTI->getShuffleCost( + auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask) { + InstructionCost C = 0; + unsigned VF = Mask.size(); + unsigned VecVF = TE->getVectorFactor(); + if (VF != VecVF && + (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) || + (all_of(Mask, + [VF](int Idx) { return Idx < 2 * static_cast<int>(VF); }) && + !ShuffleVectorInst::isIdentityMask(Mask)))) { + SmallVector<int> OrigMask(VecVF, UndefMaskElem); + std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)), + OrigMask.begin()); + C = TTI->getShuffleCost( TTI::SK_PermuteSingleSrc, - cast<FixedVectorType>(FirstUsers.front()->getType()), - ShuffleMask.front()); - LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C - << " for final shuffle of insertelement external users " - << *VectorizableTree.front()->Scalars.front() << ".\n" - << "SLP: Current total cost = " << Cost << "\n"); + FixedVectorType::get(TE->getMainOp()->getType(), VecVF), OrigMask); + LLVM_DEBUG( + dbgs() << "SLP: Adding cost " << C + << " for final shuffle of insertelement external users.\n"; + TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n"); Cost += C; + return std::make_pair(TE, true); } + return std::make_pair(TE, false); + }; + // Calculate the cost of the reshuffled vectors, if any. 
+ for (int I = 0, E = FirstUsers.size(); I < E; ++I) { + Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0); + unsigned VF = ShuffleMasks[I].begin()->second.size(); + auto *FTy = FixedVectorType::get( + cast<VectorType>(FirstUsers[I].first->getType())->getElementType(), VF); + auto Vector = ShuffleMasks[I].takeVector(); + auto &&EstimateShufflesCost = [this, FTy, + &Cost](ArrayRef<int> Mask, + ArrayRef<const TreeEntry *> TEs) { + assert((TEs.size() == 1 || TEs.size() == 2) && + "Expected exactly 1 or 2 tree entries."); + if (TEs.size() == 1) { + int Limit = 2 * Mask.size(); + if (!all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) || + !ShuffleVectorInst::isIdentityMask(Mask)) { + InstructionCost C = + TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FTy, Mask); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + << " for final shuffle of insertelement " + "external users.\n"; + TEs.front()->dump(); + dbgs() << "SLP: Current total cost = " << Cost << "\n"); + Cost += C; + } + } else { + InstructionCost C = + TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, FTy, Mask); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + << " for final shuffle of vector node and external " + "insertelement users.\n"; + if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump(); + dbgs() << "SLP: Current total cost = " << Cost << "\n"); + Cost += C; + } + return TEs.back(); + }; + (void)performExtractsShuffleAction<const TreeEntry>( + makeMutableArrayRef(Vector.data(), Vector.size()), Base, + [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF, + EstimateShufflesCost); InstructionCost InsertCost = TTI->getScalarizationOverhead( - cast<FixedVectorType>(FirstUsers.front()->getType()), - DemandedElts.front(), /*Insert*/ true, /*Extract*/ false); - LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost - << " for insertelements gather.\n" - << "SLP: Current total cost = " << Cost << "\n"); - Cost -= InsertCost; - } else if (FirstUsers.size() >= 2) { - unsigned MaxVF = *std::max_element(VF.begin(), VF.end()); - // Combined masks of the first 2 vectors. - SmallVector<int> CombinedMask(MaxVF, UndefMaskElem); - copy(ShuffleMask.front(), CombinedMask.begin()); - APInt CombinedDemandedElts = DemandedElts.front().zextOrSelf(MaxVF); - auto *VecTy = FixedVectorType::get( - cast<VectorType>(FirstUsers.front()->getType())->getElementType(), - MaxVF); - for (int I = 0, E = ShuffleMask[1].size(); I < E; ++I) { - if (ShuffleMask[1][I] != UndefMaskElem) { - CombinedMask[I] = ShuffleMask[1][I] + MaxVF; - CombinedDemandedElts.setBit(I); - } - } - InstructionCost C = - TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask); - LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C - << " for final shuffle of vector node and external " - "insertelement users " - << *VectorizableTree.front()->Scalars.front() << ".\n" - << "SLP: Current total cost = " << Cost << "\n"); - Cost += C; - InstructionCost InsertCost = TTI->getScalarizationOverhead( - VecTy, CombinedDemandedElts, /*Insert*/ true, /*Extract*/ false); - LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost - << " for insertelements gather.\n" - << "SLP: Current total cost = " << Cost << "\n"); + cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I], + /*Insert*/ true, /*Extract*/ false); Cost -= InsertCost; - for (int I = 2, E = FirstUsers.size(); I < E; ++I) { - // Other elements - permutation of 2 vectors (the initial one and the - // next Ith incoming vector). 
- unsigned VF = ShuffleMask[I].size(); - for (unsigned Idx = 0; Idx < VF; ++Idx) { - int Mask = ShuffleMask[I][Idx]; - if (Mask != UndefMaskElem) - CombinedMask[Idx] = MaxVF + Mask; - else if (CombinedMask[Idx] != UndefMaskElem) - CombinedMask[Idx] = Idx; - } - for (unsigned Idx = VF; Idx < MaxVF; ++Idx) - if (CombinedMask[Idx] != UndefMaskElem) - CombinedMask[Idx] = Idx; - InstructionCost C = - TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask); - LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C - << " for final shuffle of vector node and external " - "insertelement users " - << *VectorizableTree.front()->Scalars.front() << ".\n" - << "SLP: Current total cost = " << Cost << "\n"); - Cost += C; - InstructionCost InsertCost = TTI->getScalarizationOverhead( - cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I], - /*Insert*/ true, /*Extract*/ false); - LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost - << " for insertelements gather.\n" - << "SLP: Current total cost = " << Cost << "\n"); - Cost -= InsertCost; - } } #ifndef NDEBUG @@ -5906,6 +7298,12 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask, } } + if (UsedTEs.empty()) { + assert(all_of(TE->Scalars, UndefValue::classof) && + "Expected vector of undefs only."); + return None; + } + unsigned VF = 0; if (UsedTEs.size() == 1) { // Try to find the perfect match in another gather node at first. @@ -5965,17 +7363,11 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask, return None; } -InstructionCost -BoUpSLP::getGatherCost(FixedVectorType *Ty, - const DenseSet<unsigned> &ShuffledIndices, - bool NeedToShuffle) const { - unsigned NumElts = Ty->getNumElements(); - APInt DemandedElts = APInt::getZero(NumElts); - for (unsigned I = 0; I < NumElts; ++I) - if (!ShuffledIndices.count(I)) - DemandedElts.setBit(I); +InstructionCost BoUpSLP::getGatherCost(FixedVectorType *Ty, + const APInt &ShuffledIndices, + bool NeedToShuffle) const { InstructionCost Cost = - TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true, + TTI->getScalarizationOverhead(Ty, ~ShuffledIndices, /*Insert*/ true, /*Extract*/ false); if (NeedToShuffle) Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty); @@ -5992,19 +7384,19 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const { // Find the cost of inserting/extracting values from the vector. // Check if the same elements are inserted several times and count them as // shuffle candidates. - DenseSet<unsigned> ShuffledElements; + APInt ShuffledElements = APInt::getZero(VL.size()); DenseSet<Value *> UniqueElements; // Iterate in reverse order to consider insert elements with the high cost. for (unsigned I = VL.size(); I > 0; --I) { unsigned Idx = I - 1; // No need to shuffle duplicates for constants. if (isConstant(VL[Idx])) { - ShuffledElements.insert(Idx); + ShuffledElements.setBit(Idx); continue; } if (!UniqueElements.insert(VL[Idx]).second) { DuplicateNonConst = true; - ShuffledElements.insert(Idx); + ShuffledElements.setBit(Idx); } } return getGatherCost(VecTy, ShuffledElements, DuplicateNonConst); @@ -6029,14 +7421,83 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { // Get the basic block this bundle is in. All instructions in the bundle - // should be in this block. + // should be in this block (except for extractelement-like instructions with + // constant indeces). 
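getGatherCost above now tracks shuffled lanes in a bit mask (an APInt in the diff): constant and repeated lanes are excluded from the insertion overhead, and a repeated non-constant lane additionally flags a follow-up shuffle. A standalone sketch of that lane classification using a plain bitset in place of the APInt:

#include <cstddef>
#include <cstdio>
#include <set>
#include <string>
#include <vector>

struct Lane { std::string Name; bool IsConstant; };

int main() {
  // Gather bundle with a constant lane and a repeated scalar.
  std::vector<Lane> VL = {{"%a", false}, {"7", true}, {"%b", false}, {"%a", false}};

  std::vector<bool> Shuffled(VL.size(), false); // stands in for the APInt
  std::set<std::string> Unique;
  bool DuplicateNonConst = false;

  // Walk in reverse so the highest-index occurrence is the one kept as an
  // insert ("consider insert elements with the high cost" in the diff);
  // earlier duplicates are served by the shuffle instead.
  for (std::size_t I = VL.size(); I > 0; --I) {
    std::size_t Idx = I - 1;
    if (VL[Idx].IsConstant) {
      Shuffled[Idx] = true; // constants never need an insertelement
      continue;
    }
    if (!Unique.insert(VL[Idx].Name).second) {
      DuplicateNonConst = true;
      Shuffled[Idx] = true; // duplicate scalar: reuse its lane via a shuffle
    }
  }

  for (std::size_t Idx = 0; Idx < VL.size(); ++Idx)
    std::printf("lane %zu: %s\n", Idx, Shuffled[Idx] ? "shuffled" : "inserted");
  std::printf("needs shuffle: %s\n", DuplicateNonConst ? "yes" : "no");
}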
auto *Front = E->getMainOp(); auto *BB = Front->getParent(); assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool { + if (E->getOpcode() == Instruction::GetElementPtr && + !isa<GetElementPtrInst>(V)) + return true; auto *I = cast<Instruction>(V); - return !E->isOpcodeOrAlt(I) || I->getParent() == BB; + return !E->isOpcodeOrAlt(I) || I->getParent() == BB || + isVectorLikeInstWithConstOps(I); })); + auto &&FindLastInst = [E, Front, this, &BB]() { + Instruction *LastInst = Front; + for (Value *V : E->Scalars) { + auto *I = dyn_cast<Instruction>(V); + if (!I) + continue; + if (LastInst->getParent() == I->getParent()) { + if (LastInst->comesBefore(I)) + LastInst = I; + continue; + } + assert(isVectorLikeInstWithConstOps(LastInst) && + isVectorLikeInstWithConstOps(I) && + "Expected vector-like insts only."); + if (!DT->isReachableFromEntry(LastInst->getParent())) { + LastInst = I; + continue; + } + if (!DT->isReachableFromEntry(I->getParent())) + continue; + auto *NodeA = DT->getNode(LastInst->getParent()); + auto *NodeB = DT->getNode(I->getParent()); + assert(NodeA && "Should only process reachable instructions"); + assert(NodeB && "Should only process reachable instructions"); + assert((NodeA == NodeB) == + (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) && + "Different nodes should have different DFS numbers"); + if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) + LastInst = I; + } + BB = LastInst->getParent(); + return LastInst; + }; + + auto &&FindFirstInst = [E, Front]() { + Instruction *FirstInst = Front; + for (Value *V : E->Scalars) { + auto *I = dyn_cast<Instruction>(V); + if (!I) + continue; + if (I->comesBefore(FirstInst)) + FirstInst = I; + } + return FirstInst; + }; + + // Set the insert point to the beginning of the basic block if the entry + // should not be scheduled. + if (E->State != TreeEntry::NeedToGather && + doesNotNeedToSchedule(E->Scalars)) { + Instruction *InsertInst; + if (all_of(E->Scalars, isUsedOutsideBlock)) + InsertInst = FindLastInst(); + else + InsertInst = FindFirstInst(); + // If the instruction is PHI, set the insert point after all the PHIs. + if (isa<PHINode>(InsertInst)) + InsertInst = BB->getFirstNonPHI(); + BasicBlock::iterator InsertPt = InsertInst->getIterator(); + Builder.SetInsertPoint(BB, InsertPt); + Builder.SetCurrentDebugLocation(Front->getDebugLoc()); + return; + } + // The last instruction in the bundle in program order. Instruction *LastInst = nullptr; @@ -6045,8 +7506,10 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { // VL.back() and iterate over schedule data until we reach the end of the // bundle. The end of the bundle is marked by null ScheduleData. if (BlocksSchedules.count(BB)) { - auto *Bundle = - BlocksSchedules[BB]->getScheduleData(E->isOneOf(E->Scalars.back())); + Value *V = E->isOneOf(E->Scalars.back()); + if (doesNotNeedToBeScheduled(V)) + V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled); + auto *Bundle = BlocksSchedules[BB]->getScheduleData(V); if (Bundle && Bundle->isPartOfBundle()) for (; Bundle; Bundle = Bundle->NextInBundle) if (Bundle->OpValue == Bundle->Inst) @@ -6072,19 +7535,16 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { // we both exit early from buildTree_rec and that the bundle be out-of-order // (causing us to iterate all the way to the end of the block). 
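For bundles that do not need scheduling, the hunk above picks the builder's insertion point by scanning the bundle for its first or last member in program order (Instruction::comesBefore within a block, dominator-tree DFS numbers across blocks). A single-block standalone sketch of that selection, using plain position indices as a stand-in for comesBefore:

#include <cstdio>
#include <vector>

// Minimal model: each bundle member is identified by its position in the
// basic block (what comesBefore() compares for real instructions).
struct Member { const char *Name; unsigned PosInBlock; };

int main() {
  std::vector<Member> Bundle = {{"%i2", 7}, {"%i0", 3}, {"%i1", 5}};

  const Member *First = &Bundle.front();
  const Member *Last = &Bundle.front();
  for (const Member &M : Bundle) {
    if (M.PosInBlock < First->PosInBlock)
      First = &M; // earliest member in program order
    if (M.PosInBlock > Last->PosInBlock)
      Last = &M;  // latest member in program order
  }

  // In the diff, the "last" member is chosen when every scalar is only used
  // outside its block, the "first" member otherwise, and the insertion point
  // is moved past the PHIs if the chosen instruction is a PHI.
  std::printf("first: %s, last: %s\n", First->Name, Last->Name);
}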
if (!LastInst) { - SmallPtrSet<Value *, 16> Bundle(E->Scalars.begin(), E->Scalars.end()); - for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) { - if (Bundle.erase(&I) && E->isOpcodeOrAlt(&I)) - LastInst = &I; - if (Bundle.empty()) - break; - } + LastInst = FindLastInst(); + // If the instruction is PHI, set the insert point after all the PHIs. + if (isa<PHINode>(LastInst)) + LastInst = BB->getFirstNonPHI()->getPrevNode(); } assert(LastInst && "Failed to find last instruction in bundle"); // Set the insertion point after the last instruction in the bundle. Set the // debug location to Front. - Builder.SetInsertPoint(BB, ++LastInst->getIterator()); + Builder.SetInsertPoint(BB, std::next(LastInst->getIterator())); Builder.SetCurrentDebugLocation(Front->getDebugLoc()); } @@ -6214,8 +7674,15 @@ public: } // namespace Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { - unsigned VF = VL.size(); + const unsigned VF = VL.size(); InstructionsState S = getSameOpcode(VL); + // Special processing for GEPs bundle, which may include non-gep values. + if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) { + const auto *It = + find_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); }); + if (It != VL.end()) + S = getSameOpcode(*It); + } if (S.getOpcode()) { if (TreeEntry *E = getTreeEntry(S.OpValue)) if (E->isSame(VL)) { @@ -6270,7 +7737,18 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { } } - // Check that every instruction appears once in this bundle. + // Can't vectorize this, so simply build a new vector with each lane + // corresponding to the requested value. + return createBuildVector(VL); +} +Value *BoUpSLP::createBuildVector(ArrayRef<Value *> VL) { + assert(any_of(VectorizableTree, + [VL](const std::unique_ptr<TreeEntry> &TE) { + return TE->State == TreeEntry::NeedToGather && TE->isSame(VL); + }) && + "Non-matching gather node."); + unsigned VF = VL.size(); + // Exploit possible reuse of values across lanes. 
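createBuildVector, whose body continues below, first collapses repeated scalars: only unique lanes are materialized and a reuse-shuffle index vector re-expands them to the requested width. A standalone sketch of that deduplication idea (illustrative only; it ignores the VF thresholds and all-undef special cases handled in the diff):

#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
  // Requested bundle with repeated lanes.
  std::vector<std::string> VL = {"%a", "%b", "%a", "%c", "%b", "%a"};

  std::vector<std::string> UniqueValues;
  std::vector<int> ReuseShuffleIndices; // lane -> index into UniqueValues
  std::map<std::string, int> Seen;

  for (const std::string &V : VL) {
    auto It = Seen.find(V);
    if (It == Seen.end()) {
      It = Seen.emplace(V, static_cast<int>(UniqueValues.size())).first;
      UniqueValues.push_back(V);
    }
    ReuseShuffleIndices.push_back(It->second);
  }

  // Only UniqueValues is gathered into a vector; one shufflevector with
  // ReuseShuffleIndices rebuilds the six requested lanes from three elements.
  std::printf("unique: %zu of %zu lanes; mask:", UniqueValues.size(), VL.size());
  for (int I : ReuseShuffleIndices)
    std::printf(" %d", I);
  std::printf("\n");
}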
SmallVector<int> ReuseShuffleIndicies; SmallVector<Value *> UniqueValues; if (VL.size() > 2) { @@ -6303,6 +7781,10 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { ReuseShuffleIndicies.append(VF - ReuseShuffleIndicies.size(), UndefMaskElem); } else if (UniqueValues.size() >= VF - 1 || UniqueValues.size() <= 1) { + if (UniqueValues.empty()) { + assert(all_of(VL, UndefValue::classof) && "Expected list of undefs."); + NumValues = VF; + } ReuseShuffleIndicies.clear(); UniqueValues.clear(); UniqueValues.append(VL.begin(), std::next(VL.begin(), NumValues)); @@ -6342,7 +7824,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { SmallVector<const TreeEntry *> Entries; Optional<TargetTransformInfo::ShuffleKind> Shuffle = isGatherShuffledEntry(E, Mask, Entries); - if (Shuffle.hasValue()) { + if (Shuffle) { assert((Entries.size() == 1 || Entries.size() == 2) && "Expected shuffle of 1 or 2 entries."); Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue, @@ -6376,14 +7858,20 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size()); switch (ShuffleOrOp) { case Instruction::PHI: { - assert( - (E->ReorderIndices.empty() || E != VectorizableTree.front().get()) && - "PHI reordering is free."); + assert((E->ReorderIndices.empty() || + E != VectorizableTree.front().get() || + !E->UserTreeIndices.empty()) && + "PHI reordering is free."); auto *PH = cast<PHINode>(VL0); Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); Value *V = NewPhi; + + // Adjust insertion point once all PHI's have been generated. + Builder.SetInsertPoint(&*PH->getParent()->getFirstInsertionPt()); + Builder.SetCurrentDebugLocation(PH->getDebugLoc()); + ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -6449,7 +7937,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { cast<FixedVectorType>(FirstInsert->getType())->getNumElements(); const unsigned NumScalars = E->Scalars.size(); - unsigned Offset = *getInsertIndex(VL0, 0); + unsigned Offset = *getInsertIndex(VL0); assert(Offset < NumElts && "Failed to find vector index offset"); // Create shuffle to resize vector @@ -6656,19 +8144,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { unsigned AS = LI->getPointerAddressSpace(); Value *PO = LI->getPointerOperand(); if (E->State == TreeEntry::Vectorize) { - Value *VecPtr = Builder.CreateBitCast(PO, VecTy->getPointerTo(AS)); + NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign()); // The pointer operand uses an in-tree scalar so we add the new BitCast - // to ExternalUses list to make sure that an extract will be generated - // in the future. + // or LoadInst to ExternalUses list to make sure that an extract will + // be generated in the future. if (TreeEntry *Entry = getTreeEntry(PO)) { // Find which lane we need to extract. unsigned FoundLane = Entry->findLaneForValue(PO); - ExternalUses.emplace_back(PO, cast<User>(VecPtr), FoundLane); + ExternalUses.emplace_back( + PO, PO != VecPtr ? 
cast<User>(VecPtr) : NewLI, FoundLane); } - - NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign()); } else { assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state"); Value *VecPtr = vectorizeTree(E->getOperand(0)); @@ -6676,7 +8163,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Align CommonAlignment = LI->getAlign(); for (Value *V : E->Scalars) CommonAlignment = - commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign()); + std::min(CommonAlignment, cast<LoadInst>(V)->getAlign()); NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment); } Value *V = propagateMetadata(NewLI, E->Scalars); @@ -6701,17 +8188,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *ScalarPtr = SI->getPointerOperand(); Value *VecPtr = Builder.CreateBitCast( ScalarPtr, VecValue->getType()->getPointerTo(AS)); - StoreInst *ST = Builder.CreateAlignedStore(VecValue, VecPtr, - SI->getAlign()); + StoreInst *ST = + Builder.CreateAlignedStore(VecValue, VecPtr, SI->getAlign()); - // The pointer operand uses an in-tree scalar, so add the new BitCast to - // ExternalUses to make sure that an extract will be generated in the - // future. + // The pointer operand uses an in-tree scalar, so add the new BitCast or + // StoreInst to ExternalUses to make sure that an extract will be + // generated in the future. if (TreeEntry *Entry = getTreeEntry(ScalarPtr)) { // Find which lane we need to extract. unsigned FoundLane = Entry->findLaneForValue(ScalarPtr); - ExternalUses.push_back( - ExternalUser(ScalarPtr, cast<User>(VecPtr), FoundLane)); + ExternalUses.push_back(ExternalUser( + ScalarPtr, ScalarPtr != VecPtr ? cast<User>(VecPtr) : ST, + FoundLane)); } Value *V = propagateMetadata(ST, E->Scalars); @@ -6733,8 +8221,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs); - if (Instruction *I = dyn_cast<Instruction>(V)) - V = propagateMetadata(I, E->Scalars); + if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) { + SmallVector<Value *> GEPs; + for (Value *V : E->Scalars) { + if (isa<GetElementPtrInst>(V)) + GEPs.push_back(V); + } + V = propagateMetadata(I, GEPs); + } ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); @@ -6767,11 +8261,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ValueList OpVL; // Some intrinsics have scalar arguments. This argument should not be // vectorized. 
- if (UseIntrinsic && hasVectorInstrinsicScalarOpd(IID, j)) { + if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(IID, j)) { CallInst *CEI = cast<CallInst>(VL0); ScalarArg = CEI->getArgOperand(j); OpVecs.push_back(CEI->getArgOperand(j)); - if (hasVectorInstrinsicOverloadedScalarOpd(IID, j)) + if (isVectorIntrinsicWithOverloadTypeAtArg(IID, j)) TysForDecl.push_back(ScalarArg->getType()); continue; } @@ -6779,6 +8273,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *OpVec = vectorizeTree(E->getOperand(j)); LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); OpVecs.push_back(OpVec); + if (isVectorIntrinsicWithOverloadTypeAtArg(IID, j)) + TysForDecl.push_back(OpVec->getType()); } Function *CF; @@ -6822,11 +8318,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ((Instruction::isBinaryOp(E->getOpcode()) && Instruction::isBinaryOp(E->getAltOpcode())) || (Instruction::isCast(E->getOpcode()) && - Instruction::isCast(E->getAltOpcode()))) && + Instruction::isCast(E->getAltOpcode())) || + (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) && "Invalid Shuffle Vector Operand"); Value *LHS = nullptr, *RHS = nullptr; - if (Instruction::isBinaryOp(E->getOpcode())) { + if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) { setInsertPointAfterBundle(E); LHS = vectorizeTree(E->getOperand(0)); RHS = vectorizeTree(E->getOperand(1)); @@ -6846,6 +8343,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS); V1 = Builder.CreateBinOp( static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS); + } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) { + V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS); + auto *AltCI = cast<CmpInst>(E->getAltOp()); + CmpInst::Predicate AltPred = AltCI->getPredicate(); + V1 = Builder.CreateCmp(AltPred, LHS, RHS); } else { V0 = Builder.CreateCast( static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy); @@ -6866,11 +8368,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { // each vector operation. ValueList OpScalars, AltScalars; SmallVector<int> Mask; - buildSuffleEntryMask( + buildShuffleEntryMask( E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, [E](Instruction *I) { assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); - return I->getOpcode() == E->getAltOpcode(); + return isAlternateInstruction(I, E->getMainOp(), E->getAltOp()); }, Mask, &OpScalars, &AltScalars); @@ -6901,6 +8403,17 @@ Value *BoUpSLP::vectorizeTree() { return vectorizeTree(ExternallyUsedValues); } +namespace { +/// Data type for handling buildvector sequences with the reused scalars from +/// other tree entries. +struct ShuffledInsertData { + /// List of insertelements to be replaced by shuffles. + SmallVector<InsertElementInst *> InsertElements; + /// The parent vectors and shuffle mask for the given list of inserts. + MapVector<Value *, SmallVector<int>> ValueMasks; +}; +} // namespace + Value * BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { // All blocks must be scheduled before any instructions are inserted. @@ -6934,6 +8447,9 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n"); + SmallVector<ShuffledInsertData> ShuffledInserts; + // Maps vector instruction to original insertelement instruction + DenseMap<Value *, InsertElementInst *> VectorToInsertElement; // Extract all of the elements with the external uses. 
for (const auto &ExternalUse : ExternalUses) { Value *Scalar = ExternalUse.Scalar; @@ -6947,6 +8463,10 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { assert(E && "Invalid scalar"); assert(E->State != TreeEntry::NeedToGather && "Extracting from a gather list"); + // Non-instruction pointers are not deleted, just skip them. + if (E->getOpcode() == Instruction::GetElementPtr && + !isa<GetElementPtrInst>(Scalar)) + continue; Value *Vec = E->VectorizedValue; assert(Vec && "Can't find vectorizable value"); @@ -6973,6 +8493,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { assert(isa<FixedVectorType>(Scalar->getType()) && isa<InsertElementInst>(Scalar) && "In-tree scalar of vector type is not insertelement?"); + auto *IE = cast<InsertElementInst>(Scalar); + VectorToInsertElement.try_emplace(Vec, IE); return Vec; }; // If User == nullptr, the Scalar is used as extra arg. Generate @@ -7001,6 +8523,69 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { continue; } + if (auto *VU = dyn_cast<InsertElementInst>(User)) { + // Skip if the scalar is another vector op or Vec is not an instruction. + if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) { + if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) { + Optional<unsigned> InsertIdx = getInsertIndex(VU); + if (InsertIdx) { + // Need to use original vector, if the root is truncated. + if (MinBWs.count(Scalar) && + VectorizableTree[0]->VectorizedValue == Vec) + Vec = VectorRoot; + auto *It = + find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) { + // Checks if 2 insertelements are from the same buildvector. + InsertElementInst *VecInsert = Data.InsertElements.front(); + return areTwoInsertFromSameBuildVector(VU, VecInsert); + }); + unsigned Idx = *InsertIdx; + if (It == ShuffledInserts.end()) { + (void)ShuffledInserts.emplace_back(); + It = std::next(ShuffledInserts.begin(), + ShuffledInserts.size() - 1); + SmallVectorImpl<int> &Mask = It->ValueMasks[Vec]; + if (Mask.empty()) + Mask.assign(FTy->getNumElements(), UndefMaskElem); + // Find the insertvector, vectorized in tree, if any. + Value *Base = VU; + while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) { + if (IEBase != User && + (!IEBase->hasOneUse() || + getInsertIndex(IEBase).value_or(Idx) == Idx)) + break; + // Build the mask for the vectorized insertelement instructions. + if (const TreeEntry *E = getTreeEntry(IEBase)) { + do { + IEBase = cast<InsertElementInst>(Base); + int IEIdx = *getInsertIndex(IEBase); + assert(Mask[Idx] == UndefMaskElem && + "InsertElementInstruction used already."); + Mask[IEIdx] = IEIdx; + Base = IEBase->getOperand(0); + } while (E == getTreeEntry(Base)); + break; + } + Base = cast<InsertElementInst>(Base)->getOperand(0); + // After the vectorization the def-use chain has changed, need + // to look through original insertelement instructions, if they + // get replaced by vector instructions. + auto It = VectorToInsertElement.find(Base); + if (It != VectorToInsertElement.end()) + Base = It->second; + } + } + SmallVectorImpl<int> &Mask = It->ValueMasks[Vec]; + if (Mask.empty()) + Mask.assign(FTy->getNumElements(), UndefMaskElem); + Mask[Idx] = ExternalUse.Lane; + It->InsertElements.push_back(cast<InsertElementInst>(User)); + continue; + } + } + } + } + // Generate extracts for out-of-tree users. // Find the insertion point for the extractelement lane. 
if (auto *VecI = dyn_cast<Instruction>(Vec)) { @@ -7036,6 +8621,221 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n"); } + // Checks if the mask is an identity mask. + auto &&IsIdentityMask = [](ArrayRef<int> Mask, FixedVectorType *VecTy) { + int Limit = Mask.size(); + return VecTy->getNumElements() == Mask.size() && + all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) && + ShuffleVectorInst::isIdentityMask(Mask); + }; + // Tries to combine 2 different masks into single one. + auto &&CombineMasks = [](SmallVectorImpl<int> &Mask, ArrayRef<int> ExtMask) { + SmallVector<int> NewMask(ExtMask.size(), UndefMaskElem); + for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) { + if (ExtMask[I] == UndefMaskElem) + continue; + NewMask[I] = Mask[ExtMask[I]]; + } + Mask.swap(NewMask); + }; + // Peek through shuffles, trying to simplify the final shuffle code. + auto &&PeekThroughShuffles = + [&IsIdentityMask, &CombineMasks](Value *&V, SmallVectorImpl<int> &Mask, + bool CheckForLengthChange = false) { + while (auto *SV = dyn_cast<ShuffleVectorInst>(V)) { + // Exit if not a fixed vector type or changing size shuffle. + if (!isa<FixedVectorType>(SV->getType()) || + (CheckForLengthChange && SV->changesLength())) + break; + // Exit if the identity or broadcast mask is found. + if (IsIdentityMask(Mask, cast<FixedVectorType>(SV->getType())) || + SV->isZeroEltSplat()) + break; + bool IsOp1Undef = isUndefVector(SV->getOperand(0)); + bool IsOp2Undef = isUndefVector(SV->getOperand(1)); + if (!IsOp1Undef && !IsOp2Undef) + break; + SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(), + SV->getShuffleMask().end()); + CombineMasks(ShuffleMask, Mask); + Mask.swap(ShuffleMask); + if (IsOp2Undef) + V = SV->getOperand(0); + else + V = SV->getOperand(1); + } + }; + // Smart shuffle instruction emission, walks through shuffles trees and + // tries to find the best matching vector for the actual shuffle + // instruction. + auto &&CreateShuffle = [this, &IsIdentityMask, &PeekThroughShuffles, + &CombineMasks](Value *V1, Value *V2, + ArrayRef<int> Mask) -> Value * { + assert(V1 && "Expected at least one vector value."); + if (V2 && !isUndefVector(V2)) { + // Peek through shuffles. + Value *Op1 = V1; + Value *Op2 = V2; + int VF = + cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue(); + SmallVector<int> CombinedMask1(Mask.size(), UndefMaskElem); + SmallVector<int> CombinedMask2(Mask.size(), UndefMaskElem); + for (int I = 0, E = Mask.size(); I < E; ++I) { + if (Mask[I] < VF) + CombinedMask1[I] = Mask[I]; + else + CombinedMask2[I] = Mask[I] - VF; + } + Value *PrevOp1; + Value *PrevOp2; + do { + PrevOp1 = Op1; + PrevOp2 = Op2; + PeekThroughShuffles(Op1, CombinedMask1, /*CheckForLengthChange=*/true); + PeekThroughShuffles(Op2, CombinedMask2, /*CheckForLengthChange=*/true); + // Check if we have 2 resizing shuffles - need to peek through operands + // again. 
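PeekThroughShuffles above repeatedly folds the current mask through the mask of an inner shufflevector so the final shuffle can read from the inner shuffle's source directly; the core operation is composing two masks, as in the CombineMasks lambda. A standalone sketch of that composition (undef lanes stay undef; out-of-range indices are not handled here):

#include <cstdio>
#include <vector>

static const int UndefElem = -1;

// Compose an outer shuffle mask with the mask of the shuffle it reads from:
// lane I of the result comes from element InnerMask[OuterMask[I]] of the
// inner shuffle's source operand.
static std::vector<int> composeMasks(const std::vector<int> &InnerMask,
                                     const std::vector<int> &OuterMask) {
  std::vector<int> NewMask(OuterMask.size(), UndefElem);
  for (unsigned I = 0, Sz = OuterMask.size(); I < Sz; ++I) {
    if (OuterMask[I] == UndefElem)
      continue;
    NewMask[I] = InnerMask[OuterMask[I]];
  }
  return NewMask;
}

int main() {
  // Inner shuffle reverses a 4-wide vector, outer shuffle keeps lanes 0 and 2.
  std::vector<int> Inner = {3, 2, 1, 0};
  std::vector<int> Outer = {0, UndefElem, 2, UndefElem};
  std::vector<int> Combined = composeMasks(Inner, Outer);
  for (int M : Combined)
    std::printf("%d ", M); // prints: 3 -1 1 -1
  std::printf("\n");
}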
+ if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1)) + if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) + if (SV1->getOperand(0)->getType() == + SV2->getOperand(0)->getType() && + SV1->getOperand(0)->getType() != SV1->getType() && + isUndefVector(SV1->getOperand(1)) && + isUndefVector(SV2->getOperand(1))) { + Op1 = SV1->getOperand(0); + Op2 = SV2->getOperand(0); + SmallVector<int> ShuffleMask1(SV1->getShuffleMask().begin(), + SV1->getShuffleMask().end()); + CombineMasks(ShuffleMask1, CombinedMask1); + CombinedMask1.swap(ShuffleMask1); + SmallVector<int> ShuffleMask2(SV2->getShuffleMask().begin(), + SV2->getShuffleMask().end()); + CombineMasks(ShuffleMask2, CombinedMask2); + CombinedMask2.swap(ShuffleMask2); + } + } while (PrevOp1 != Op1 || PrevOp2 != Op2); + VF = cast<VectorType>(Op1->getType()) + ->getElementCount() + .getKnownMinValue(); + for (int I = 0, E = Mask.size(); I < E; ++I) { + if (CombinedMask2[I] != UndefMaskElem) { + assert(CombinedMask1[I] == UndefMaskElem && + "Expected undefined mask element"); + CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF); + } + } + Value *Vec = Builder.CreateShuffleVector( + Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2, + CombinedMask1); + if (auto *I = dyn_cast<Instruction>(Vec)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + return Vec; + } + if (isa<PoisonValue>(V1)) + return PoisonValue::get(FixedVectorType::get( + cast<VectorType>(V1->getType())->getElementType(), Mask.size())); + Value *Op = V1; + SmallVector<int> CombinedMask(Mask.begin(), Mask.end()); + PeekThroughShuffles(Op, CombinedMask); + if (!isa<FixedVectorType>(Op->getType()) || + !IsIdentityMask(CombinedMask, cast<FixedVectorType>(Op->getType()))) { + Value *Vec = Builder.CreateShuffleVector(Op, CombinedMask); + if (auto *I = dyn_cast<Instruction>(Vec)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + return Vec; + } + return Op; + }; + + auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask) { + unsigned VF = Mask.size(); + unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements(); + if (VF != VecVF) { + if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) { + Vec = CreateShuffle(Vec, nullptr, Mask); + return std::make_pair(Vec, true); + } + SmallVector<int> ResizeMask(VF, UndefMaskElem); + for (unsigned I = 0; I < VF; ++I) { + if (Mask[I] != UndefMaskElem) + ResizeMask[Mask[I]] = Mask[I]; + } + Vec = CreateShuffle(Vec, nullptr, ResizeMask); + } + + return std::make_pair(Vec, false); + }; + // Perform shuffling of the vectorize tree entries for better handling of + // external extracts. + for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) { + // Find the first and the last instruction in the list of insertelements. 
+ sort(ShuffledInserts[I].InsertElements, isFirstInsertElement); + InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front(); + InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back(); + Builder.SetInsertPoint(LastInsert); + auto Vector = ShuffledInserts[I].ValueMasks.takeVector(); + Value *NewInst = performExtractsShuffleAction<Value>( + makeMutableArrayRef(Vector.data(), Vector.size()), + FirstInsert->getOperand(0), + [](Value *Vec) { + return cast<VectorType>(Vec->getType()) + ->getElementCount() + .getKnownMinValue(); + }, + ResizeToVF, + [FirstInsert, &CreateShuffle](ArrayRef<int> Mask, + ArrayRef<Value *> Vals) { + assert((Vals.size() == 1 || Vals.size() == 2) && + "Expected exactly 1 or 2 input values."); + if (Vals.size() == 1) { + // Do not create shuffle if the mask is a simple identity + // non-resizing mask. + if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType()) + ->getNumElements() || + !ShuffleVectorInst::isIdentityMask(Mask)) + return CreateShuffle(Vals.front(), nullptr, Mask); + return Vals.front(); + } + return CreateShuffle(Vals.front() ? Vals.front() + : FirstInsert->getOperand(0), + Vals.back(), Mask); + }); + auto It = ShuffledInserts[I].InsertElements.rbegin(); + // Rebuild buildvector chain. + InsertElementInst *II = nullptr; + if (It != ShuffledInserts[I].InsertElements.rend()) + II = *It; + SmallVector<Instruction *> Inserts; + while (It != ShuffledInserts[I].InsertElements.rend()) { + assert(II && "Must be an insertelement instruction."); + if (*It == II) + ++It; + else + Inserts.push_back(cast<Instruction>(II)); + II = dyn_cast<InsertElementInst>(II->getOperand(0)); + } + for (Instruction *II : reverse(Inserts)) { + II->replaceUsesOfWith(II->getOperand(0), NewInst); + if (auto *NewI = dyn_cast<Instruction>(NewInst)) + if (II->getParent() == NewI->getParent() && II->comesBefore(NewI)) + II->moveAfter(NewI); + NewInst = II; + } + LastInsert->replaceAllUsesWith(NewInst); + for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) { + IE->replaceUsesOfWith(IE->getOperand(0), + PoisonValue::get(IE->getOperand(0)->getType())); + IE->replaceUsesOfWith(IE->getOperand(1), + PoisonValue::get(IE->getOperand(1)->getType())); + eraseInstruction(IE); + } + CSEBlocks.insert(LastInsert->getParent()); + } + // For each vectorized value: for (auto &TEPtr : VectorizableTree) { TreeEntry *Entry = TEPtr.get(); @@ -7050,6 +8850,9 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; + if (Entry->getOpcode() == Instruction::GetElementPtr && + !isa<GetElementPtrInst>(Scalar)) + continue; #ifndef NDEBUG Type *Ty = Scalar->getType(); if (!Ty->isVoidTy()) { @@ -7057,7 +8860,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); // It is legal to delete users in the ignorelist. 
- assert((getTreeEntry(U) || is_contained(UserIgnoreList, U) || + assert((getTreeEntry(U) || + (UserIgnoreList && UserIgnoreList->contains(U)) || (isa_and_nonnull<Instruction>(U) && isDeleted(cast<Instruction>(U)))) && "Deleting out-of-tree value"); @@ -7225,9 +9029,11 @@ void BoUpSLP::optimizeGatherSequence() { BoUpSLP::ScheduleData * BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) { - ScheduleData *Bundle = nullptr; + ScheduleData *Bundle = nullptr; ScheduleData *PrevInBundle = nullptr; for (Value *V : VL) { + if (doesNotNeedToBeScheduled(V)) + continue; ScheduleData *BundleMember = getScheduleData(V); assert(BundleMember && "no ScheduleData for bundle member " @@ -7239,8 +9045,6 @@ BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) { } else { Bundle = BundleMember; } - BundleMember->UnscheduledDepsInBundle = 0; - Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps; // Group the instructions to a bundle. BundleMember->FirstInBundle = Bundle; @@ -7257,7 +9061,8 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, const InstructionsState &S) { // No need to schedule PHIs, insertelement, extractelement and extractvalue // instructions. - if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue)) + if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) || + doesNotNeedToSchedule(VL)) return nullptr; // Initialize the instruction bundle. @@ -7276,16 +9081,17 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); }); ReSchedule = true; } - if (ReSchedule) { - resetSchedule(); - initialFillReadyList(ReadyInsts); - } if (Bundle) { LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block " << BB->getName() << "\n"); calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP); } + if (ReSchedule) { + resetSchedule(); + initialFillReadyList(ReadyInsts); + } + // Now try to schedule the new bundle or (if no bundle) just calculate // dependencies. As soon as the bundle is "ready" it means that there are no // cyclic dependencies and we can schedule it. Note that's important that we @@ -7293,14 +9099,17 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) && !ReadyInsts.empty()) { ScheduleData *Picked = ReadyInsts.pop_back_val(); - if (Picked->isSchedulingEntity() && Picked->isReady()) - schedule(Picked, ReadyInsts); + assert(Picked->isSchedulingEntity() && Picked->isReady() && + "must be ready to schedule"); + schedule(Picked, ReadyInsts); } }; // Make sure that the scheduling region contains all // instructions of the bundle. for (Value *V : VL) { + if (doesNotNeedToBeScheduled(V)) + continue; if (!extendSchedulingRegion(V, S)) { // If the scheduling region got new instructions at the lower end (or it // is a new region for the first bundle). This makes it necessary to @@ -7315,9 +9124,16 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, bool ReSchedule = false; for (Value *V : VL) { + if (doesNotNeedToBeScheduled(V)) + continue; ScheduleData *BundleMember = getScheduleData(V); assert(BundleMember && "no ScheduleData for bundle member (maybe not in same basic block)"); + + // Make sure we don't leave the pieces of the bundle in the ready list when + // whole bundle might not be ready. 
+ ReadyInsts.remove(BundleMember); + if (!BundleMember->IsScheduled) continue; // A bundle member was scheduled as single instruction before and now @@ -7339,16 +9155,24 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL, Value *OpValue) { - if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue)) + if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) || + doesNotNeedToSchedule(VL)) return; + if (doesNotNeedToBeScheduled(OpValue)) + OpValue = *find_if_not(VL, doesNotNeedToBeScheduled); ScheduleData *Bundle = getScheduleData(OpValue); LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n"); assert(!Bundle->IsScheduled && "Can't cancel bundle which is already scheduled"); - assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() && + assert(Bundle->isSchedulingEntity() && + (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) && "tried to unbundle something which is not a bundle"); + // Remove the bundle from the ready list. + if (Bundle->isReady()) + ReadyInsts.remove(Bundle); + // Un-bundle: make single instructions out of the bundle. ScheduleData *BundleMember = Bundle; while (BundleMember) { @@ -7356,8 +9180,8 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL, BundleMember->FirstInBundle = BundleMember; ScheduleData *Next = BundleMember->NextInBundle; BundleMember->NextInBundle = nullptr; - BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps; - if (BundleMember->UnscheduledDepsInBundle == 0) { + BundleMember->TE = nullptr; + if (BundleMember->unscheduledDepsInBundle() == 0) { ReadyInsts.insert(BundleMember); } BundleMember = Next; @@ -7380,9 +9204,10 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, Instruction *I = dyn_cast<Instruction>(V); assert(I && "bundle member must be an instruction"); assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) && + !doesNotNeedToBeScheduled(I) && "phi nodes/insertelements/extractelements/extractvalues don't need to " "be scheduled"); - auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool { + auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool { ScheduleData *ISD = getScheduleData(I); if (!ISD) return false; @@ -7394,7 +9219,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, ExtraScheduleDataMap[I][S.OpValue] = SD; return true; }; - if (CheckSheduleForI(I)) + if (CheckScheduleForI(I)) return true; if (!ScheduleStart) { // It's the first instruction in the new region. 
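The scheduling-region bookkeeping in the hunks above and below maintains a half-open window over a single basic block that only ever grows to cover new bundle members. A minimal standalone sketch of that invariant (illustrative names, not the patch's API):

#include <cassert>

// Conceptual model of the scheduling region: a half-open index range over
// one basic block that is extended, never shrunk, to cover each new bundle
// member. Growing in place keeps previously initialized per-instruction
// scheduling data valid.
struct SchedulingRegionSketch {
  int Start = -1, End = -1; // instruction positions within the block

  bool contains(int Pos) const { return Start <= Pos && Pos < End; }

  void extendTo(int Pos) {
    if (Start < 0) {          // first instruction seeds the region
      Start = Pos;
      End = Pos + 1;
    } else if (Pos < Start) { // grow upward toward the block entry
      Start = Pos;
    } else if (Pos >= End) {  // grow downward toward the terminator
      End = Pos + 1;
    }
    assert(contains(Pos) && "region must cover the requested instruction");
  }
};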
@@ -7402,7 +9227,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
     ScheduleStart = I;
     ScheduleEnd = I->getNextNode();
     if (isOneOf(S, I) != I)
-      CheckSheduleForI(I);
+      CheckScheduleForI(I);
     assert(ScheduleEnd && "tried to vectorize a terminator?");
     LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
     return true;
@@ -7430,7 +9255,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
     initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
     ScheduleStart = I;
     if (isOneOf(S, I) != I)
-      CheckSheduleForI(I);
+      CheckScheduleForI(I);
     LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n");
     return true;
@@ -7444,7 +9269,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
                      nullptr);
     ScheduleEnd = I->getNextNode();
     if (isOneOf(S, I) != I)
-      CheckSheduleForI(I);
+      CheckScheduleForI(I);
     assert(ScheduleEnd && "tried to vectorize a terminator?");
     LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
     return true;
@@ -7456,7 +9281,10 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                 ScheduleData *NextLoadStore) {
   ScheduleData *CurrentLoadStore = PrevLoadStore;
   for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
-    ScheduleData *SD = ScheduleDataMap[I];
+    // No need to allocate data for non-schedulable instructions.
+    if (doesNotNeedToBeScheduled(I))
+      continue;
+    ScheduleData *SD = ScheduleDataMap.lookup(I);
     if (!SD) {
       SD = allocateScheduleDataChunks();
       ScheduleDataMap[I] = SD;
@@ -7479,6 +9307,10 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
       }
       CurrentLoadStore = SD;
     }
+
+    if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
+        match(I, m_Intrinsic<Intrinsic::stackrestore>()))
+      RegionHasStackSave = true;
   }
   if (NextLoadStore) {
     if (CurrentLoadStore)
@@ -7511,8 +9343,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
       // Handle def-use chain dependencies.
       if (BundleMember->OpValue != BundleMember->Inst) {
-        ScheduleData *UseSD = getScheduleData(BundleMember->Inst);
-        if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+        if (ScheduleData *UseSD = getScheduleData(BundleMember->Inst)) {
           BundleMember->Dependencies++;
           ScheduleData *DestBundle = UseSD->FirstInBundle;
           if (!DestBundle->IsScheduled)
@@ -7522,10 +9353,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
         }
       } else {
         for (User *U : BundleMember->Inst->users()) {
-          assert(isa<Instruction>(U) &&
-                 "user of instruction must be instruction");
-          ScheduleData *UseSD = getScheduleData(U);
-          if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+          if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
             BundleMember->Dependencies++;
             ScheduleData *DestBundle = UseSD->FirstInBundle;
             if (!DestBundle->IsScheduled)
@@ -7536,6 +9364,75 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
         }
       }
+      auto makeControlDependent = [&](Instruction *I) {
+        auto *DepDest = getScheduleData(I);
+        assert(DepDest && "must be in schedule window");
+        DepDest->ControlDependencies.push_back(BundleMember);
+        BundleMember->Dependencies++;
+        ScheduleData *DestBundle = DepDest->FirstInBundle;
+        if (!DestBundle->IsScheduled)
+          BundleMember->incrementUnscheduledDeps(1);
+        if (!DestBundle->hasValidDependencies())
+          WorkList.push_back(DestBundle);
+      };
+
+      // Any instruction which isn't safe to speculate at the beginning of the
+      // block is control dependent on any early exit or non-willreturn call
+      // which precedes it.
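      // (Editorial illustration, not from the source: if a preceding call may
      //  throw, abort, or loop forever, a later division or load must not be
      //  hoisted above it; doing so could introduce a trap or fault on a path
      //  where the original program never reached that instruction. The walks
      //  below record such orderings as explicit control dependences.)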
+      if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
+        for (Instruction *I = BundleMember->Inst->getNextNode();
+             I != ScheduleEnd; I = I->getNextNode()) {
+          if (isSafeToSpeculativelyExecute(I, &*BB->begin()))
+            continue;
+
+          // Add the dependency
+          makeControlDependent(I);
+
+          if (!isGuaranteedToTransferExecutionToSuccessor(I))
+            // Everything past here must be control dependent on I.
+            break;
+        }
+      }
+
+      if (RegionHasStackSave) {
+        // If we have an inalloca alloca instruction, it needs to be scheduled
+        // after any preceding stacksave. We also need to prevent any alloca
+        // from reordering above a preceding stackrestore.
+        if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
+            match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
+          for (Instruction *I = BundleMember->Inst->getNextNode();
+               I != ScheduleEnd; I = I->getNextNode()) {
+            if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
+                match(I, m_Intrinsic<Intrinsic::stackrestore>()))
+              // Any allocas past here must be control dependent on I, and I
+              // must be memory dependent on BundleMember->Inst.
+              break;
+
+            if (!isa<AllocaInst>(I))
+              continue;
+
+            // Add the dependency
+            makeControlDependent(I);
+          }
+        }
+
+        // In addition to the cases handled just above, we need to prevent
+        // allocas from moving below a stacksave. The stackrestore case
+        // is currently thought to be conservative.
+        if (isa<AllocaInst>(BundleMember->Inst)) {
+          for (Instruction *I = BundleMember->Inst->getNextNode();
+               I != ScheduleEnd; I = I->getNextNode()) {
+            if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
+                !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
+              continue;
+
+            // Add the dependency
+            makeControlDependent(I);
+            break;
+          }
+        }
+      }
+
       // Handle the memory dependencies (if any).
       ScheduleData *DepDest = BundleMember->NextLoadStore;
       if (!DepDest)
@@ -7598,7 +9495,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
     }
   }
   if (InsertInReadyList && SD->isReady()) {
-    ReadyInsts.push_back(SD);
+    ReadyInsts.insert(SD);
    LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
                      << "\n");
  }
@@ -7625,11 +9522,18 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
+  // A key point - if we got here, pre-scheduling was able to find a valid
+  // scheduling of the sub-graph of the scheduling window which consists
+  // of all vector bundles and their transitive users. As such, we do not
+  // need to reschedule anything *outside of* that subgraph.
+
   BS->resetSchedule();
   // For the real scheduling we use a more sophisticated ready-list: it is
   // sorted by the original instruction location. This lets the final schedule
   // be as close as possible to the original instruction order.
+  // WARNING: If changing this order causes a correctness issue, that means
+  // there is some missing dependence edge in the schedule data graph.
   struct ScheduleDataCompare {
     bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
       return SD2->SchedulingPriority < SD1->SchedulingPriority;
@@ -7637,21 +9541,22 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
   };
   std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
-  // Ensure that all dependency data is updated and fill the ready-list with
-  // initial instructions.
+  // Ensure that all dependency data is updated (for nodes in the sub-graph)
+  // and fill the ready-list with initial instructions.
int Idx = 0; - int NumToSchedule = 0; for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) { - BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) { + BS->doForAllOpcodes(I, [this, &Idx, BS](ScheduleData *SD) { + TreeEntry *SDTE = getTreeEntry(SD->Inst); + (void)SDTE; assert((isVectorLikeInstWithConstOps(SD->Inst) || - SD->isPartOfBundle() == (getTreeEntry(SD->Inst) != nullptr)) && + SD->isPartOfBundle() == + (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) && "scheduler and vectorizer bundle mismatch"); SD->FirstInBundle->SchedulingPriority = Idx++; - if (SD->isSchedulingEntity()) { + + if (SD->isSchedulingEntity() && SD->isPartOfBundle()) BS->calculateDependencies(SD, false, this); - NumToSchedule++; - } }); } BS->initialFillReadyList(ReadyInsts); @@ -7674,9 +9579,23 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { } BS->schedule(picked, ReadyInsts); - NumToSchedule--; } - assert(NumToSchedule == 0 && "could not schedule all instructions"); + + // Check that we didn't break any of our invariants. +#ifdef EXPENSIVE_CHECKS + BS->verify(); +#endif + +#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS) + // Check that all schedulable entities got scheduled + for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) { + BS->doForAllOpcodes(I, [&](ScheduleData *SD) { + if (SD->isSchedulingEntity() && SD->hasValidDependencies()) { + assert(SD->IsScheduled && "must be scheduled at this point"); + } + }); + } +#endif // Avoid duplicate scheduling of the block. BS->ScheduleStart = nullptr; @@ -7686,11 +9605,8 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) { // If V is a store, just return the width of the stored value (or value // truncated just before storing) without traversing the expression tree. // This is the common case. - if (auto *Store = dyn_cast<StoreInst>(V)) { - if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand())) - return DL->getTypeSizeInBits(Trunc->getSrcTy()); + if (auto *Store = dyn_cast<StoreInst>(V)) return DL->getTypeSizeInBits(Store->getValueOperand()->getType()); - } if (auto *IEI = dyn_cast<InsertElementInst>(V)) return getVectorElementSize(IEI->getOperand(1)); @@ -8092,6 +10008,8 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, // Scan the blocks in the function in post order. for (auto BB : post_order(&F.getEntryBlock())) { + // Start new block - clear the list of reduction roots. + R.clearReductionData(); collectSeedInstructions(BB); // Vectorize trees that end at stores. 
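The scheduleBlock loop earlier in this hunk drains a ready list kept in original instruction order; a bundle becomes ready once all of its dependencies are scheduled, and scheduling it may release its dependents. A rough standalone sketch of that general shape (illustrative types only; the real comparator and bookkeeping differ):

#include <set>
#include <vector>

struct BundleSketch {
  int SchedulingPriority = 0;            // original position in the block
  int UnscheduledDeps = 0;               // dependencies not yet scheduled
  std::vector<BundleSketch *> Dependents;
};

struct ByOriginalOrder {
  bool operator()(const BundleSketch *A, const BundleSketch *B) const {
    return A->SchedulingPriority < B->SchedulingPriority;
  }
};

// Drain the ready set; emitting a bundle may make its dependents ready.
void drain(std::set<BundleSketch *, ByOriginalOrder> &Ready) {
  while (!Ready.empty()) {
    BundleSketch *Picked = *Ready.begin();
    Ready.erase(Ready.begin());
    // "Emit" Picked here, then release anything that was waiting on it.
    for (BundleSketch *Dep : Picked->Dependents)
      if (--Dep->UnscheduledDeps == 0)
        Ready.insert(Dep);
  }
}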
@@ -8122,11 +10040,10 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, } bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, - unsigned Idx) { + unsigned Idx, unsigned MinVF) { LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size() << "\n"); const unsigned Sz = R.getVectorElementSize(Chain[0]); - const unsigned MinVF = R.getMinVecRegSize() / Sz; unsigned VF = Chain.size(); if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) @@ -8265,9 +10182,15 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, unsigned EltSize = R.getVectorElementSize(Operands[0]); unsigned MaxElts = llvm::PowerOf2Floor(MaxVecRegSize / EltSize); - unsigned MinVF = R.getMinVF(EltSize); unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts); + auto *Store = cast<StoreInst>(Operands[0]); + Type *StoreTy = Store->getValueOperand()->getType(); + Type *ValueTy = StoreTy; + if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand())) + ValueTy = Trunc->getSrcTy(); + unsigned MinVF = TTI->getStoreMinimumVF( + R.getMinVF(DL->getTypeSizeInBits(ValueTy)), StoreTy, ValueTy); // FIXME: Is division-by-2 the correct step? Should we assert that the // register size is a power-of-2? @@ -8277,7 +10200,7 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, ArrayRef<Value *> Slice = makeArrayRef(Operands).slice(Cnt, Size); if (!VectorizedStores.count(Slice.front()) && !VectorizedStores.count(Slice.back()) && - vectorizeStoreChain(Slice, R, Cnt)) { + vectorizeStoreChain(Slice, R, Cnt, MinVF)) { // Mark the vectorized stores so that we don't vectorize them again. VectorizedStores.insert(Slice.begin(), Slice.end()); Changed = true; @@ -8481,7 +10404,8 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) { if (!I) return false; - if (!isa<BinaryOperator>(I) && !isa<CmpInst>(I)) + if ((!isa<BinaryOperator>(I) && !isa<CmpInst>(I)) || + isa<VectorType>(I->getType())) return false; Value *P = I->getParent(); @@ -8492,32 +10416,40 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) { if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P) return false; - // Try to vectorize V. - if (tryToVectorizePair(Op0, Op1, R)) - return true; + // First collect all possible candidates + SmallVector<std::pair<Value *, Value *>, 4> Candidates; + Candidates.emplace_back(Op0, Op1); auto *A = dyn_cast<BinaryOperator>(Op0); auto *B = dyn_cast<BinaryOperator>(Op1); // Try to skip B. - if (B && B->hasOneUse()) { + if (A && B && B->hasOneUse()) { auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0)); auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1)); - if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R)) - return true; - if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R)) - return true; + if (B0 && B0->getParent() == P) + Candidates.emplace_back(A, B0); + if (B1 && B1->getParent() == P) + Candidates.emplace_back(A, B1); } - // Try to skip A. 
- if (A && A->hasOneUse()) { + if (B && A && A->hasOneUse()) { auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0)); auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1)); - if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R)) - return true; - if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R)) - return true; + if (A0 && A0->getParent() == P) + Candidates.emplace_back(A0, B); + if (A1 && A1->getParent() == P) + Candidates.emplace_back(A1, B); } - return false; + + if (Candidates.size() == 1) + return tryToVectorizePair(Op0, Op1, R); + + // We have multiple options. Try to pick the single best. + Optional<int> BestCandidate = R.findBestRootPair(Candidates); + if (!BestCandidate) + return false; + return tryToVectorizePair(Candidates[*BestCandidate].first, + Candidates[*BestCandidate].second, R); } namespace { @@ -8552,15 +10484,16 @@ class HorizontalReduction { using ReductionOpsType = SmallVector<Value *, 16>; using ReductionOpsListType = SmallVector<ReductionOpsType, 2>; ReductionOpsListType ReductionOps; - SmallVector<Value *, 32> ReducedVals; + /// List of possibly reduced values. + SmallVector<SmallVector<Value *>> ReducedVals; + /// Maps reduced value to the corresponding reduction operation. + DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps; // Use map vector to make stable output. MapVector<Instruction *, Value *> ExtraArgs; WeakTrackingVH ReductionRoot; /// The type of reduction operation. RecurKind RdxKind; - const unsigned INVALID_OPERAND_INDEX = std::numeric_limits<unsigned>::max(); - static bool isCmpSelMinMax(Instruction *I) { return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) && RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I)); @@ -8604,26 +10537,6 @@ class HorizontalReduction { return I->getOperand(Index); } - /// Checks if the ParentStackElem.first should be marked as a reduction - /// operation with an extra argument or as extra argument itself. - void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem, - Value *ExtraArg) { - if (ExtraArgs.count(ParentStackElem.first)) { - ExtraArgs[ParentStackElem.first] = nullptr; - // We ran into something like: - // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg. - // The whole ParentStackElem.first should be considered as an extra value - // in this case. - // Do not perform analysis of remaining operands of ParentStackElem.first - // instruction, this whole instruction is an extra argument. - ParentStackElem.second = INVALID_OPERAND_INDEX; - } else { - // We ran into something like: - // ParentStackElem.first += ... + ExtraArg + ... - ExtraArgs[ParentStackElem.first] = ExtraArg; - } - } - /// Creates reduction operation with the current opcode. static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS, Value *RHS, const Twine &Name, bool UseSelect) { @@ -8682,7 +10595,7 @@ class HorizontalReduction { } /// Creates reduction operation with the current opcode with the IR flags - /// from \p ReductionOps. + /// from \p ReductionOps, dropping nuw/nsw flags. 
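  // (Editorial note: the wrap flags are dropped because vectorizing the
  // reduction reassociates it; a different association can overflow in an
  // intermediate sum even when the original left-to-right evaluation never
  // did, so copying nuw/nsw from the scalar operations would be unsound.)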
static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS, Value *RHS, const Twine &Name, const ReductionOpsListType &ReductionOps) { @@ -8696,31 +10609,21 @@ class HorizontalReduction { Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect); if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) { if (auto *Sel = dyn_cast<SelectInst>(Op)) { - propagateIRFlags(Sel->getCondition(), ReductionOps[0]); - propagateIRFlags(Op, ReductionOps[1]); + propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr, + /*IncludeWrapFlags=*/false); + propagateIRFlags(Op, ReductionOps[1], nullptr, + /*IncludeWrapFlags=*/false); return Op; } } - propagateIRFlags(Op, ReductionOps[0]); - return Op; - } - - /// Creates reduction operation with the current opcode with the IR flags - /// from \p I. - static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS, - Value *RHS, const Twine &Name, Instruction *I) { - auto *SelI = dyn_cast<SelectInst>(I); - Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, SelI != nullptr); - if (SelI && RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) { - if (auto *Sel = dyn_cast<SelectInst>(Op)) - propagateIRFlags(Sel->getCondition(), SelI->getCondition()); - } - propagateIRFlags(Op, I); + propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false); return Op; } - static RecurKind getRdxKind(Instruction *I) { - assert(I && "Expected instruction for reduction matching"); + static RecurKind getRdxKind(Value *V) { + auto *I = dyn_cast<Instruction>(V); + if (!I) + return RecurKind::None; if (match(I, m_Add(m_Value(), m_Value()))) return RecurKind::Add; if (match(I, m_Mul(m_Value(), m_Value()))) @@ -8882,7 +10785,9 @@ public: HorizontalReduction() = default; /// Try to find a reduction tree. - bool matchAssociativeReduction(PHINode *Phi, Instruction *Inst) { + bool matchAssociativeReduction(PHINode *Phi, Instruction *Inst, + ScalarEvolution &SE, const DataLayout &DL, + const TargetLibraryInfo &TLI) { assert((!Phi || is_contained(Phi->operands(), Inst)) && "Phi needs to use the binary operator"); assert((isa<BinaryOperator>(Inst) || isa<SelectInst>(Inst) || @@ -8926,124 +10831,178 @@ public: ReductionRoot = Inst; - // The opcode for leaf values that we perform a reduction on. - // For example: load(x) + load(y) + load(z) + fptoui(w) - // The leaf opcode for 'w' does not match, so we don't include it as a - // potential candidate for the reduction. - unsigned LeafOpcode = 0; - - // Post-order traverse the reduction tree starting at Inst. We only handle - // true trees containing binary operators or selects. - SmallVector<std::pair<Instruction *, unsigned>, 32> Stack; - Stack.push_back(std::make_pair(Inst, getFirstOperandIndex(Inst))); - initReductionOps(Inst); - while (!Stack.empty()) { - Instruction *TreeN = Stack.back().first; - unsigned EdgeToVisit = Stack.back().second++; - const RecurKind TreeRdxKind = getRdxKind(TreeN); - bool IsReducedValue = TreeRdxKind != RdxKind; - - // Postorder visit. - if (IsReducedValue || EdgeToVisit >= getNumberOfOperands(TreeN)) { - if (IsReducedValue) - ReducedVals.push_back(TreeN); - else { - auto ExtraArgsIter = ExtraArgs.find(TreeN); - if (ExtraArgsIter != ExtraArgs.end() && !ExtraArgsIter->second) { - // Check if TreeN is an extra argument of its parent operation. - if (Stack.size() <= 1) { - // TreeN can't be an extra argument as it is a root reduction - // operation. 
- return false; - } - // Yes, TreeN is an extra argument, do not add it to a list of - // reduction operations. - // Stack[Stack.size() - 2] always points to the parent operation. - markExtraArg(Stack[Stack.size() - 2], TreeN); - ExtraArgs.erase(TreeN); - } else - addReductionOps(TreeN); - } - // Retract. - Stack.pop_back(); - continue; - } - - // Visit operands. - Value *EdgeVal = getRdxOperand(TreeN, EdgeToVisit); - auto *EdgeInst = dyn_cast<Instruction>(EdgeVal); - if (!EdgeInst) { - // Edge value is not a reduction instruction or a leaf instruction. - // (It may be a constant, function argument, or something else.) - markExtraArg(Stack.back(), EdgeVal); - continue; + // Iterate through all the operands of the possible reduction tree and + // gather all the reduced values, sorting them by their value id. + BasicBlock *BB = Inst->getParent(); + bool IsCmpSelMinMax = isCmpSelMinMax(Inst); + SmallVector<Instruction *> Worklist(1, Inst); + // Checks if the operands of the \p TreeN instruction are also reduction + // operations or should be treated as reduced values or an extra argument, + // which is not part of the reduction. + auto &&CheckOperands = [this, IsCmpSelMinMax, + BB](Instruction *TreeN, + SmallVectorImpl<Value *> &ExtraArgs, + SmallVectorImpl<Value *> &PossibleReducedVals, + SmallVectorImpl<Instruction *> &ReductionOps) { + for (int I = getFirstOperandIndex(TreeN), + End = getNumberOfOperands(TreeN); + I < End; ++I) { + Value *EdgeVal = getRdxOperand(TreeN, I); + ReducedValsToOps[EdgeVal].push_back(TreeN); + auto *EdgeInst = dyn_cast<Instruction>(EdgeVal); + // Edge has wrong parent - mark as an extra argument. + if (EdgeInst && !isVectorLikeInstWithConstOps(EdgeInst) && + !hasSameParent(EdgeInst, BB)) { + ExtraArgs.push_back(EdgeVal); + continue; + } + // If the edge is not an instruction, or it is different from the main + // reduction opcode or has too many uses - possible reduced value. + if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind || + IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) || + !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) || + !isVectorizable(getRdxKind(EdgeInst), EdgeInst)) { + PossibleReducedVals.push_back(EdgeVal); + continue; + } + ReductionOps.push_back(EdgeInst); } - RecurKind EdgeRdxKind = getRdxKind(EdgeInst); - // Continue analysis if the next operand is a reduction operation or - // (possibly) a leaf value. If the leaf value opcode is not set, - // the first met operation != reduction operation is considered as the - // leaf opcode. - // Only handle trees in the current basic block. - // Each tree node needs to have minimal number of users except for the - // ultimate reduction. - const bool IsRdxInst = EdgeRdxKind == RdxKind; - if (EdgeInst != Phi && EdgeInst != Inst && - hasSameParent(EdgeInst, Inst->getParent()) && - hasRequiredNumberOfUses(isCmpSelMinMax(Inst), EdgeInst) && - (!LeafOpcode || LeafOpcode == EdgeInst->getOpcode() || IsRdxInst)) { - if (IsRdxInst) { - // We need to be able to reassociate the reduction operations. - if (!isVectorizable(EdgeRdxKind, EdgeInst)) { - // I is an extra argument for TreeN (its parent operation). - markExtraArg(Stack.back(), EdgeInst); - continue; - } - } else if (!LeafOpcode) { - LeafOpcode = EdgeInst->getOpcode(); + }; + // Try to regroup reduced values so that it gets more profitable to try to + // reduce them. 
Values are grouped by their value ids, instructions - by + // instruction op id and/or alternate op id, plus do extra analysis for + // loads (grouping them by the distabce between pointers) and cmp + // instructions (grouping them by the predicate). + MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>> + PossibleReducedVals; + initReductionOps(Inst); + while (!Worklist.empty()) { + Instruction *TreeN = Worklist.pop_back_val(); + SmallVector<Value *> Args; + SmallVector<Value *> PossibleRedVals; + SmallVector<Instruction *> PossibleReductionOps; + CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps); + // If too many extra args - mark the instruction itself as a reduction + // value, not a reduction operation. + if (Args.size() < 2) { + addReductionOps(TreeN); + // Add extra args. + if (!Args.empty()) { + assert(Args.size() == 1 && "Expected only single argument."); + ExtraArgs[TreeN] = Args.front(); } - Stack.push_back( - std::make_pair(EdgeInst, getFirstOperandIndex(EdgeInst))); - continue; + // Add reduction values. The values are sorted for better vectorization + // results. + for (Value *V : PossibleRedVals) { + size_t Key, Idx; + std::tie(Key, Idx) = generateKeySubkey( + V, &TLI, + [&PossibleReducedVals, &DL, &SE](size_t Key, LoadInst *LI) { + auto It = PossibleReducedVals.find(Key); + if (It != PossibleReducedVals.end()) { + for (const auto &LoadData : It->second) { + auto *RLI = cast<LoadInst>(LoadData.second.front().first); + if (getPointersDiff(RLI->getType(), + RLI->getPointerOperand(), LI->getType(), + LI->getPointerOperand(), DL, SE, + /*StrictCheck=*/true)) + return hash_value(RLI->getPointerOperand()); + } + } + return hash_value(LI->getPointerOperand()); + }, + /*AllowAlternate=*/false); + ++PossibleReducedVals[Key][Idx] + .insert(std::make_pair(V, 0)) + .first->second; + } + Worklist.append(PossibleReductionOps.rbegin(), + PossibleReductionOps.rend()); + } else { + size_t Key, Idx; + std::tie(Key, Idx) = generateKeySubkey( + TreeN, &TLI, + [&PossibleReducedVals, &DL, &SE](size_t Key, LoadInst *LI) { + auto It = PossibleReducedVals.find(Key); + if (It != PossibleReducedVals.end()) { + for (const auto &LoadData : It->second) { + auto *RLI = cast<LoadInst>(LoadData.second.front().first); + if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(), + LI->getType(), LI->getPointerOperand(), + DL, SE, /*StrictCheck=*/true)) + return hash_value(RLI->getPointerOperand()); + } + } + return hash_value(LI->getPointerOperand()); + }, + /*AllowAlternate=*/false); + ++PossibleReducedVals[Key][Idx] + .insert(std::make_pair(TreeN, 0)) + .first->second; + } + } + auto PossibleReducedValsVect = PossibleReducedVals.takeVector(); + // Sort values by the total number of values kinds to start the reduction + // from the longest possible reduced values sequences. + for (auto &PossibleReducedVals : PossibleReducedValsVect) { + auto PossibleRedVals = PossibleReducedVals.second.takeVector(); + SmallVector<SmallVector<Value *>> PossibleRedValsVect; + for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end(); + It != E; ++It) { + PossibleRedValsVect.emplace_back(); + auto RedValsVect = It->second.takeVector(); + stable_sort(RedValsVect, [](const auto &P1, const auto &P2) { + return P1.second < P2.second; + }); + for (const std::pair<Value *, unsigned> &Data : RedValsVect) + PossibleRedValsVect.back().append(Data.second, Data.first); } - // I is an extra argument for TreeN (its parent operation). 
- markExtraArg(Stack.back(), EdgeInst); - } + stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) { + return P1.size() > P2.size(); + }); + ReducedVals.emplace_back(); + for (ArrayRef<Value *> Data : PossibleRedValsVect) + ReducedVals.back().append(Data.rbegin(), Data.rend()); + } + // Sort the reduced values by number of same/alternate opcode and/or pointer + // operand. + stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) { + return P1.size() > P2.size(); + }); return true; } /// Attempt to vectorize the tree found by matchAssociativeReduction. Value *tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) { + constexpr int ReductionLimit = 4; + constexpr unsigned RegMaxNumber = 4; + constexpr unsigned RedValsMaxNumber = 128; // If there are a sufficient number of reduction values, reduce // to a nearby power-of-2. We can safely generate oversized // vectors and rely on the backend to split them to legal sizes. - unsigned NumReducedVals = ReducedVals.size(); - if (NumReducedVals < 4) + unsigned NumReducedVals = std::accumulate( + ReducedVals.begin(), ReducedVals.end(), 0, + [](int Num, ArrayRef<Value *> Vals) { return Num + Vals.size(); }); + if (NumReducedVals < ReductionLimit) return nullptr; - // Intersect the fast-math-flags from all reduction operations. - FastMathFlags RdxFMF; - RdxFMF.set(); - for (ReductionOpsType &RdxOp : ReductionOps) { - for (Value *RdxVal : RdxOp) { - if (auto *FPMO = dyn_cast<FPMathOperator>(RdxVal)) - RdxFMF &= FPMO->getFastMathFlags(); - } - } - IRBuilder<> Builder(cast<Instruction>(ReductionRoot)); - Builder.setFastMathFlags(RdxFMF); + // Track the reduced values in case if they are replaced by extractelement + // because of the vectorization. + DenseMap<Value *, WeakTrackingVH> TrackedVals; BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues; // The same extra argument may be used several times, so log each attempt // to use it. for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) { assert(Pair.first && "DebugLoc must be set."); ExternallyUsedValues[Pair.second].push_back(Pair.first); + TrackedVals.try_emplace(Pair.second, Pair.second); } // The compare instruction of a min/max is the insertion point for new // instructions and may be replaced with a new compare instruction. - auto getCmpForMinMaxReduction = [](Instruction *RdxRootInst) { + auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) { assert(isa<SelectInst>(RdxRootInst) && "Expected min/max reduction to have select root instruction"); Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition(); @@ -9055,164 +11014,390 @@ public: // The reduction root is used as the insertion point for new instructions, // so set it as externally used to prevent it from being deleted. ExternallyUsedValues[ReductionRoot]; - SmallVector<Value *, 16> IgnoreList; - for (ReductionOpsType &RdxOp : ReductionOps) - IgnoreList.append(RdxOp.begin(), RdxOp.end()); - - unsigned ReduxWidth = PowerOf2Floor(NumReducedVals); - if (NumReducedVals > ReduxWidth) { - // In the loop below, we are building a tree based on a window of - // 'ReduxWidth' values. - // If the operands of those values have common traits (compare predicate, - // constant operand, etc), then we want to group those together to - // minimize the cost of the reduction. - - // TODO: This should be extended to count common operands for - // compares and binops. - - // Step 1: Count the number of times each compare predicate occurs. 
- SmallDenseMap<unsigned, unsigned> PredCountMap; - for (Value *RdxVal : ReducedVals) { - CmpInst::Predicate Pred; - if (match(RdxVal, m_Cmp(Pred, m_Value(), m_Value()))) - ++PredCountMap[Pred]; - } - // Step 2: Sort the values so the most common predicates come first. - stable_sort(ReducedVals, [&PredCountMap](Value *A, Value *B) { - CmpInst::Predicate PredA, PredB; - if (match(A, m_Cmp(PredA, m_Value(), m_Value())) && - match(B, m_Cmp(PredB, m_Value(), m_Value()))) { - return PredCountMap[PredA] > PredCountMap[PredB]; - } - return false; - }); - } + SmallDenseSet<Value *> IgnoreList; + for (ReductionOpsType &RdxOps : ReductionOps) + for (Value *RdxOp : RdxOps) { + if (!RdxOp) + continue; + IgnoreList.insert(RdxOp); + } + bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot)); + + // Need to track reduced vals, they may be changed during vectorization of + // subvectors. + for (ArrayRef<Value *> Candidates : ReducedVals) + for (Value *V : Candidates) + TrackedVals.try_emplace(V, V); + DenseMap<Value *, unsigned> VectorizedVals; Value *VectorizedTree = nullptr; - unsigned i = 0; - while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) { - ArrayRef<Value *> VL(&ReducedVals[i], ReduxWidth); - V.buildTree(VL, IgnoreList); - if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) - break; - if (V.isLoadCombineReductionCandidate(RdxKind)) - break; - V.reorderTopToBottom(); - V.reorderBottomToTop(/*IgnoreReorder=*/true); - V.buildExternalUses(ExternallyUsedValues); - - // For a poison-safe boolean logic reduction, do not replace select - // instructions with logic ops. All reduced values will be frozen (see - // below) to prevent leaking poison. - if (isa<SelectInst>(ReductionRoot) && - isBoolLogicOp(cast<Instruction>(ReductionRoot)) && - NumReducedVals != ReduxWidth) - break; + bool CheckForReusedReductionOps = false; + // Try to vectorize elements based on their type. + for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) { + ArrayRef<Value *> OrigReducedVals = ReducedVals[I]; + InstructionsState S = getSameOpcode(OrigReducedVals); + SmallVector<Value *> Candidates; + DenseMap<Value *, Value *> TrackedToOrig; + for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) { + Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second; + // Check if the reduction value was not overriden by the extractelement + // instruction because of the vectorization and exclude it, if it is not + // compatible with other values. + if (auto *Inst = dyn_cast<Instruction>(RdxVal)) + if (isVectorLikeInstWithConstOps(Inst) && + (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) + continue; + Candidates.push_back(RdxVal); + TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]); + } + bool ShuffledExtracts = false; + // Try to handle shuffled extractelements. + if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() && + I + 1 < E) { + InstructionsState NextS = getSameOpcode(ReducedVals[I + 1]); + if (NextS.getOpcode() == Instruction::ExtractElement && + !NextS.isAltShuffle()) { + SmallVector<Value *> CommonCandidates(Candidates); + for (Value *RV : ReducedVals[I + 1]) { + Value *RdxVal = TrackedVals.find(RV)->second; + // Check if the reduction value was not overriden by the + // extractelement instruction because of the vectorization and + // exclude it, if it is not compatible with other values. 
+ if (auto *Inst = dyn_cast<Instruction>(RdxVal)) + if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(Inst)) + continue; + CommonCandidates.push_back(RdxVal); + TrackedToOrig.try_emplace(RdxVal, RV); + } + SmallVector<int> Mask; + if (isFixedVectorShuffle(CommonCandidates, Mask)) { + ++I; + Candidates.swap(CommonCandidates); + ShuffledExtracts = true; + } + } + } + unsigned NumReducedVals = Candidates.size(); + if (NumReducedVals < ReductionLimit) + continue; - V.computeMinimumValueSizes(); + unsigned MaxVecRegSize = V.getMaxVecRegSize(); + unsigned EltSize = V.getVectorElementSize(Candidates[0]); + unsigned MaxElts = RegMaxNumber * PowerOf2Floor(MaxVecRegSize / EltSize); + + unsigned ReduxWidth = std::min<unsigned>( + PowerOf2Floor(NumReducedVals), std::max(RedValsMaxNumber, MaxElts)); + unsigned Start = 0; + unsigned Pos = Start; + // Restarts vectorization attempt with lower vector factor. + unsigned PrevReduxWidth = ReduxWidth; + bool CheckForReusedReductionOpsLocal = false; + auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals, + &CheckForReusedReductionOpsLocal, + &PrevReduxWidth, &V, + &IgnoreList](bool IgnoreVL = false) { + bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList); + if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) { + // Check if any of the reduction ops are gathered. If so, worth + // trying again with less number of reduction ops. + CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered; + } + ++Pos; + if (Pos < NumReducedVals - ReduxWidth + 1) + return IsAnyRedOpGathered; + Pos = Start; + ReduxWidth /= 2; + return IsAnyRedOpGathered; + }; + while (Pos < NumReducedVals - ReduxWidth + 1 && + ReduxWidth >= ReductionLimit) { + // Dependency in tree of the reduction ops - drop this attempt, try + // later. + if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth && + Start == 0) { + CheckForReusedReductionOps = true; + break; + } + PrevReduxWidth = ReduxWidth; + ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth); + // Beeing analyzed already - skip. + if (V.areAnalyzedReductionVals(VL)) { + (void)AdjustReducedVals(/*IgnoreVL=*/true); + continue; + } + // Early exit if any of the reduction values were deleted during + // previous vectorization attempts. + if (any_of(VL, [&V](Value *RedVal) { + auto *RedValI = dyn_cast<Instruction>(RedVal); + if (!RedValI) + return false; + return V.isDeleted(RedValI); + })) + break; + V.buildTree(VL, IgnoreList); + if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) { + if (!AdjustReducedVals()) + V.analyzedReductionVals(VL); + continue; + } + if (V.isLoadCombineReductionCandidate(RdxKind)) { + if (!AdjustReducedVals()) + V.analyzedReductionVals(VL); + continue; + } + V.reorderTopToBottom(); + // No need to reorder the root node at all. + V.reorderBottomToTop(/*IgnoreReorder=*/true); + // Keep extracted other reduction values, if they are used in the + // vectorization trees. + BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues( + ExternallyUsedValues); + for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) { + if (Cnt == I || (ShuffledExtracts && Cnt == I - 1)) + continue; + for_each(ReducedVals[Cnt], + [&LocalExternallyUsedValues, &TrackedVals](Value *V) { + if (isa<Instruction>(V)) + LocalExternallyUsedValues[TrackedVals[V]]; + }); + } + // Number of uses of the candidates in the vector of values. 
+ SmallDenseMap<Value *, unsigned> NumUses; + for (unsigned Cnt = 0; Cnt < Pos; ++Cnt) { + Value *V = Candidates[Cnt]; + if (NumUses.count(V) > 0) + continue; + NumUses[V] = std::count(VL.begin(), VL.end(), V); + } + for (unsigned Cnt = Pos + ReduxWidth; Cnt < NumReducedVals; ++Cnt) { + Value *V = Candidates[Cnt]; + if (NumUses.count(V) > 0) + continue; + NumUses[V] = std::count(VL.begin(), VL.end(), V); + } + // Gather externally used values. + SmallPtrSet<Value *, 4> Visited; + for (unsigned Cnt = 0; Cnt < Pos; ++Cnt) { + Value *V = Candidates[Cnt]; + if (!Visited.insert(V).second) + continue; + unsigned NumOps = VectorizedVals.lookup(V) + NumUses[V]; + if (NumOps != ReducedValsToOps.find(V)->second.size()) + LocalExternallyUsedValues[V]; + } + for (unsigned Cnt = Pos + ReduxWidth; Cnt < NumReducedVals; ++Cnt) { + Value *V = Candidates[Cnt]; + if (!Visited.insert(V).second) + continue; + unsigned NumOps = VectorizedVals.lookup(V) + NumUses[V]; + if (NumOps != ReducedValsToOps.find(V)->second.size()) + LocalExternallyUsedValues[V]; + } + V.buildExternalUses(LocalExternallyUsedValues); + + V.computeMinimumValueSizes(); + + // Intersect the fast-math-flags from all reduction operations. + FastMathFlags RdxFMF; + RdxFMF.set(); + for (Value *U : IgnoreList) + if (auto *FPMO = dyn_cast<FPMathOperator>(U)) + RdxFMF &= FPMO->getFastMathFlags(); + // Estimate cost. + InstructionCost TreeCost = V.getTreeCost(VL); + InstructionCost ReductionCost = + getReductionCost(TTI, VL, ReduxWidth, RdxFMF); + InstructionCost Cost = TreeCost + ReductionCost; + if (!Cost.isValid()) { + LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n"); + return nullptr; + } + if (Cost >= -SLPCostThreshold) { + V.getORE()->emit([&]() { + return OptimizationRemarkMissed( + SV_NAME, "HorSLPNotBeneficial", + ReducedValsToOps.find(VL[0])->second.front()) + << "Vectorizing horizontal reduction is possible" + << "but not beneficial with cost " << ore::NV("Cost", Cost) + << " and threshold " + << ore::NV("Threshold", -SLPCostThreshold); + }); + if (!AdjustReducedVals()) + V.analyzedReductionVals(VL); + continue; + } - // Estimate cost. - InstructionCost TreeCost = - V.getTreeCost(makeArrayRef(&ReducedVals[i], ReduxWidth)); - InstructionCost ReductionCost = - getReductionCost(TTI, ReducedVals[i], ReduxWidth, RdxFMF); - InstructionCost Cost = TreeCost + ReductionCost; - if (!Cost.isValid()) { - LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n"); - return nullptr; - } - if (Cost >= -SLPCostThreshold) { + LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" + << Cost << ". (HorRdx)\n"); V.getORE()->emit([&]() { - return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial", - cast<Instruction>(VL[0])) - << "Vectorizing horizontal reduction is possible" - << "but not beneficial with cost " << ore::NV("Cost", Cost) - << " and threshold " - << ore::NV("Threshold", -SLPCostThreshold); + return OptimizationRemark( + SV_NAME, "VectorizedHorizontalReduction", + ReducedValsToOps.find(VL[0])->second.front()) + << "Vectorized horizontal reduction with cost " + << ore::NV("Cost", Cost) << " and with tree size " + << ore::NV("TreeSize", V.getTreeSize()); }); - break; - } - LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" - << Cost << ". 
(HorRdx)\n"); - V.getORE()->emit([&]() { - return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", - cast<Instruction>(VL[0])) - << "Vectorized horizontal reduction with cost " - << ore::NV("Cost", Cost) << " and with tree size " - << ore::NV("TreeSize", V.getTreeSize()); - }); + Builder.setFastMathFlags(RdxFMF); - // Vectorize a tree. - DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc(); - Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues); + // Vectorize a tree. + Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues); - // Emit a reduction. If the root is a select (min/max idiom), the insert - // point is the compare condition of that select. - Instruction *RdxRootInst = cast<Instruction>(ReductionRoot); - if (isCmpSelMinMax(RdxRootInst)) - Builder.SetInsertPoint(getCmpForMinMaxReduction(RdxRootInst)); - else - Builder.SetInsertPoint(RdxRootInst); + // Emit a reduction. If the root is a select (min/max idiom), the insert + // point is the compare condition of that select. + Instruction *RdxRootInst = cast<Instruction>(ReductionRoot); + if (IsCmpSelMinMax) + Builder.SetInsertPoint(GetCmpForMinMaxReduction(RdxRootInst)); + else + Builder.SetInsertPoint(RdxRootInst); - // To prevent poison from leaking across what used to be sequential, safe, - // scalar boolean logic operations, the reduction operand must be frozen. - if (isa<SelectInst>(RdxRootInst) && isBoolLogicOp(RdxRootInst)) - VectorizedRoot = Builder.CreateFreeze(VectorizedRoot); + // To prevent poison from leaking across what used to be sequential, + // safe, scalar boolean logic operations, the reduction operand must be + // frozen. + if (isa<SelectInst>(RdxRootInst) && isBoolLogicOp(RdxRootInst)) + VectorizedRoot = Builder.CreateFreeze(VectorizedRoot); - Value *ReducedSubTree = - emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI); + Value *ReducedSubTree = + emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI); - if (!VectorizedTree) { - // Initialize the final value in the reduction. - VectorizedTree = ReducedSubTree; - } else { - // Update the final value in the reduction. - Builder.SetCurrentDebugLocation(Loc); - VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, - ReducedSubTree, "op.rdx", ReductionOps); + if (!VectorizedTree) { + // Initialize the final value in the reduction. + VectorizedTree = ReducedSubTree; + } else { + // Update the final value in the reduction. + Builder.SetCurrentDebugLocation( + cast<Instruction>(ReductionOps.front().front())->getDebugLoc()); + VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, + ReducedSubTree, "op.rdx", ReductionOps); + } + // Count vectorized reduced values to exclude them from final reduction. + for (Value *V : VL) + ++VectorizedVals.try_emplace(TrackedToOrig.find(V)->second, 0) + .first->getSecond(); + Pos += ReduxWidth; + Start = Pos; + ReduxWidth = PowerOf2Floor(NumReducedVals - Pos); } - i += ReduxWidth; - ReduxWidth = PowerOf2Floor(NumReducedVals - i); } - if (VectorizedTree) { // Finish the reduction. - for (; i < NumReducedVals; ++i) { - auto *I = cast<Instruction>(ReducedVals[i]); - Builder.SetCurrentDebugLocation(I->getDebugLoc()); - VectorizedTree = - createOp(Builder, RdxKind, VectorizedTree, I, "", ReductionOps); + // Need to add extra arguments and not vectorized possible reduction + // values. + // Try to avoid dependencies between the scalar remainders after + // reductions. 
+ auto &&FinalGen = + [this, &Builder, + &TrackedVals](ArrayRef<std::pair<Instruction *, Value *>> InstVals) { + unsigned Sz = InstVals.size(); + SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 + + Sz % 2); + for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) { + Instruction *RedOp = InstVals[I + 1].first; + Builder.SetCurrentDebugLocation(RedOp->getDebugLoc()); + Value *RdxVal1 = InstVals[I].second; + Value *StableRdxVal1 = RdxVal1; + auto It1 = TrackedVals.find(RdxVal1); + if (It1 != TrackedVals.end()) + StableRdxVal1 = It1->second; + Value *RdxVal2 = InstVals[I + 1].second; + Value *StableRdxVal2 = RdxVal2; + auto It2 = TrackedVals.find(RdxVal2); + if (It2 != TrackedVals.end()) + StableRdxVal2 = It2->second; + Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1, + StableRdxVal2, "op.rdx", ReductionOps); + ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed); + } + if (Sz % 2 == 1) + ExtraReds[Sz / 2] = InstVals.back(); + return ExtraReds; + }; + SmallVector<std::pair<Instruction *, Value *>> ExtraReductions; + SmallPtrSet<Value *, 8> Visited; + for (ArrayRef<Value *> Candidates : ReducedVals) { + for (Value *RdxVal : Candidates) { + if (!Visited.insert(RdxVal).second) + continue; + unsigned NumOps = VectorizedVals.lookup(RdxVal); + for (Instruction *RedOp : + makeArrayRef(ReducedValsToOps.find(RdxVal)->second) + .drop_back(NumOps)) + ExtraReductions.emplace_back(RedOp, RdxVal); + } } for (auto &Pair : ExternallyUsedValues) { // Add each externally used value to the final reduction. - for (auto *I : Pair.second) { - Builder.SetCurrentDebugLocation(I->getDebugLoc()); - VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, - Pair.first, "op.extra", I); - } + for (auto *I : Pair.second) + ExtraReductions.emplace_back(I, Pair.first); + } + // Iterate through all not-vectorized reduction values/extra arguments. + while (ExtraReductions.size() > 1) { + SmallVector<std::pair<Instruction *, Value *>> NewReds = + FinalGen(ExtraReductions); + ExtraReductions.swap(NewReds); + } + // Final reduction. + if (ExtraReductions.size() == 1) { + Instruction *RedOp = ExtraReductions.back().first; + Builder.SetCurrentDebugLocation(RedOp->getDebugLoc()); + Value *RdxVal = ExtraReductions.back().second; + Value *StableRdxVal = RdxVal; + auto It = TrackedVals.find(RdxVal); + if (It != TrackedVals.end()) + StableRdxVal = It->second; + VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, + StableRdxVal, "op.rdx", ReductionOps); } ReductionRoot->replaceAllUsesWith(VectorizedTree); - // Mark all scalar reduction ops for deletion, they are replaced by the - // vector reductions. - V.eraseInstructions(IgnoreList); + // The original scalar reduction is expected to have no remaining + // uses outside the reduction tree itself. Assert that we got this + // correct, replace internal uses with undef, and mark for eventual + // deletion. 
+#ifndef NDEBUG
+      SmallSet<Value *, 4> IgnoreSet;
+      for (ArrayRef<Value *> RdxOps : ReductionOps)
+        IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
+#endif
+      for (ArrayRef<Value *> RdxOps : ReductionOps) {
+        for (Value *Ignore : RdxOps) {
+          if (!Ignore)
+            continue;
+#ifndef NDEBUG
+          for (auto *U : Ignore->users()) {
+            assert(IgnoreSet.count(U) &&
+                   "All users must be either in the reduction ops list.");
+          }
+#endif
+          if (!Ignore->use_empty()) {
+            Value *Undef = UndefValue::get(Ignore->getType());
+            Ignore->replaceAllUsesWith(Undef);
+          }
+          V.eraseInstruction(cast<Instruction>(Ignore));
+        }
+      }
+    } else if (!CheckForReusedReductionOps) {
+      for (ReductionOpsType &RdxOps : ReductionOps)
+        for (Value *RdxOp : RdxOps)
+          V.analyzedReductionRoot(cast<Instruction>(RdxOp));
     }
     return VectorizedTree;
   }
 
-  unsigned numReductionValues() const { return ReducedVals.size(); }
-
 private:
   /// Calculate the cost of a reduction.
   InstructionCost getReductionCost(TargetTransformInfo *TTI,
-                                   Value *FirstReducedVal, unsigned ReduxWidth,
-                                   FastMathFlags FMF) {
+                                   ArrayRef<Value *> ReducedVals,
+                                   unsigned ReduxWidth, FastMathFlags FMF) {
     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+    Value *FirstReducedVal = ReducedVals.front();
     Type *ScalarTy = FirstReducedVal->getType();
     FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth);
-    InstructionCost VectorCost, ScalarCost;
+    InstructionCost VectorCost = 0, ScalarCost;
+    // If all of the reduced values are constant, the vector cost is 0, since
+    // the reduction value can be calculated at the compile time.
+    bool AllConsts = all_of(ReducedVals, isConstant);
     switch (RdxKind) {
     case RecurKind::Add:
     case RecurKind::Mul:
@@ -9222,17 +11407,22 @@ private:
     case RecurKind::FAdd:
     case RecurKind::FMul: {
       unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
-      VectorCost =
-          TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
+      if (!AllConsts)
+        VectorCost =
+            TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
       ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
       break;
     }
     case RecurKind::FMax:
     case RecurKind::FMin: {
       auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy);
-      auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
-      VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
-                                               /*IsUnsigned=*/false, CostKind);
+      if (!AllConsts) {
+        auto *VecCondTy =
+            cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
+        VectorCost =
+            TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
+                                        /*IsUnsigned=*/false, CostKind);
+      }
       CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
       ScalarCost = TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy,
                                            SclCondTy, RdxPred, CostKind) +
@@ -9245,11 +11435,14 @@ private:
     case RecurKind::UMax:
     case RecurKind::UMin: {
       auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy);
-      auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
-      bool IsUnsigned =
-          RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin;
-      VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, IsUnsigned,
-                                               CostKind);
+      if (!AllConsts) {
+        auto *VecCondTy =
+            cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
+        bool IsUnsigned =
+            RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin;
+        VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
+                                                 IsUnsigned, CostKind);
+      }
       CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
       ScalarCost = TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
                                            SclCondTy, RdxPred, CostKind) +
@@ -9463,7 +11656,8 @@ static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
 /// performed.
 static bool tryToVectorizeHorReductionOrInstOperands(
     PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
-    TargetTransformInfo *TTI,
+    TargetTransformInfo *TTI, ScalarEvolution &SE, const DataLayout &DL,
+    const TargetLibraryInfo &TLI,
     const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) {
   if (!ShouldVectorizeHor)
     return false;
@@ -9482,7 +11676,7 @@ static bool tryToVectorizeHorReductionOrInstOperands(
   // horizontal reduction.
   // Interrupt the process if the Root instruction itself was vectorized or all
   // sub-trees not higher that RecursionMaxDepth were analyzed/vectorized.
-  // Skip the analysis of CmpInsts.Compiler implements postanalysis of the
+  // Skip the analysis of CmpInsts. Compiler implements postanalysis of the
   // CmpInsts so we can skip extra attempts in
   // tryToVectorizeHorReductionOrInstOperands and save compile time.
   std::queue<std::pair<Instruction *, unsigned>> Stack;
@@ -9490,13 +11684,16 @@ static bool tryToVectorizeHorReductionOrInstOperands(
   SmallPtrSet<Value *, 8> VisitedInstrs;
   SmallVector<WeakTrackingVH> PostponedInsts;
   bool Res = false;
-  auto &&TryToReduce = [TTI, &P, &R](Instruction *Inst, Value *&B0,
-                                     Value *&B1) -> Value * {
+  auto &&TryToReduce = [TTI, &SE, &DL, &P, &R, &TLI](Instruction *Inst,
+                                                     Value *&B0,
+                                                     Value *&B1) -> Value * {
+    if (R.isAnalyzedReductionRoot(Inst))
+      return nullptr;
     bool IsBinop = matchRdxBop(Inst, B0, B1);
     bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value()));
     if (IsBinop || IsSelect) {
       HorizontalReduction HorRdx;
-      if (HorRdx.matchAssociativeReduction(P, Inst))
+      if (HorRdx.matchAssociativeReduction(P, Inst, SE, DL, TLI))
         return HorRdx.tryToReduce(R, TTI);
     }
     return nullptr;
@@ -9541,7 +11738,7 @@ static bool tryToVectorizeHorReductionOrInstOperands(
     // Do not try to vectorize CmpInst operands, this is done separately.
     // Final attempt for binop args vectorization should happen after the loop
     // to try to find reductions.
-    if (!isa<CmpInst>(Inst))
+    if (!isa<CmpInst, InsertElementInst, InsertValueInst>(Inst))
       PostponedInsts.push_back(Inst);
   }
 
@@ -9554,8 +11751,8 @@ static bool tryToVectorizeHorReductionOrInstOperands(
         if (auto *I = dyn_cast<Instruction>(Op))
           // Do not try to vectorize CmpInst operands, this is done
           // separately.
-          if (!isa<PHINode>(I) && !isa<CmpInst>(I) && !R.isDeleted(I) &&
-              I->getParent() == BB)
+          if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
+              !R.isDeleted(I) && I->getParent() == BB)
            Stack.emplace(I, Level);
   }
   // Try to vectorized binops where reductions were not found.
@@ -9579,8 +11776,8 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
   auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool {
     return tryToVectorize(I, R);
   };
-  return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI,
-                                                  ExtraVectorization);
+  return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI, *SE, *DL,
+                                                  *TLI, ExtraVectorization);
 }
 
 bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
@@ -9748,12 +11945,16 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions(
   for (auto *I : reverse(Instructions)) {
     if (R.isDeleted(I))
       continue;
-    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I))
+    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
       OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
-    else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I))
+    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
       OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
-    else if (isa<CmpInst>(I))
+    } else if (isa<CmpInst>(I)) {
       PostponedCmps.push_back(I);
+      continue;
+    }
+    // Try to find reductions in buildvector sequnces.
+    OpsChanged |= vectorizeRootInstruction(nullptr, I, BB, R, TTI);
   }
   if (AtTerminator) {
     // Try to find reductions first.
@@ -10171,7 +12372,7 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
       DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
           DT->getNode(I2->getParent());
       assert(NodeI1 && "Should only process reachable instructions");
-      assert(NodeI1 && "Should only process reachable instructions");
+      assert(NodeI2 && "Should only process reachable instructions");
       assert((NodeI1 == NodeI2) ==
                  (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
              "Different nodes should have different DFS numbers");