author    | Dimitry Andric <dim@FreeBSD.org> | 2021-12-25 22:36:56 +0000
committer | Dimitry Andric <dim@FreeBSD.org> | 2022-05-14 11:44:01 +0000
commit    | 0eae32dcef82f6f06de6419a0d623d7def0cc8f6 (patch)
tree      | 55b7e05be47b835fd137915bee1e64026c35e71c /contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
parent    | 4824e7fd18a1223177218d4aec1b3c6c5c4a444e (diff)
parent    | 77fc4c146f0870ffb09c1afb823ccbe742c5e6ff (diff)
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 633
1 file changed, 432 insertions(+), 201 deletions(-)
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 95061e9053fa..37ae13666f7a 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -631,27 +631,26 @@ static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) {
 /// after:  6 3 5 4 7 2 1 0
 static void fixupOrderingIndices(SmallVectorImpl<unsigned> &Order) {
   const unsigned Sz = Order.size();
-  SmallBitVector UsedIndices(Sz);
-  SmallVector<int> MaskedIndices;
+  SmallBitVector UnusedIndices(Sz, /*t=*/true);
+  SmallBitVector MaskedIndices(Sz);
   for (unsigned I = 0; I < Sz; ++I) {
     if (Order[I] < Sz)
-      UsedIndices.set(Order[I]);
+      UnusedIndices.reset(Order[I]);
     else
-      MaskedIndices.push_back(I);
+      MaskedIndices.set(I);
   }
-  if (MaskedIndices.empty())
+  if (MaskedIndices.none())
     return;
-  SmallVector<int> AvailableIndices(MaskedIndices.size());
-  unsigned Cnt = 0;
-  int Idx = UsedIndices.find_first();
-  do {
-    AvailableIndices[Cnt] = Idx;
-    Idx = UsedIndices.find_next(Idx);
-    ++Cnt;
-  } while (Idx > 0);
-  assert(Cnt == MaskedIndices.size() && "Non-synced masked/available indices.");
-  for (int I = 0, E = MaskedIndices.size(); I < E; ++I)
-    Order[MaskedIndices[I]] = AvailableIndices[I];
+  assert(UnusedIndices.count() == MaskedIndices.count() &&
+         "Non-synced masked/available indices.");
+  int Idx = UnusedIndices.find_first();
+  int MIdx = MaskedIndices.find_first();
+  while (MIdx >= 0) {
+    assert(Idx >= 0 && "Indices must be synced.");
+    Order[MIdx] = Idx;
+    Idx = UnusedIndices.find_next(Idx);
+    MIdx = MaskedIndices.find_next(MIdx);
+  }
 }
 
 namespace llvm {
@@ -812,6 +811,13 @@ public:
   /// ExtractElement, ExtractValue), which can be part of the graph.
   Optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
 
+  /// Gets reordering data for the given tree entry. If the entry is vectorized
+  /// - just return ReorderIndices, otherwise check if the scalars can be
+  /// reordered and return the most optimal order.
+  /// \param TopToBottom If true, include the order of vectorized stores and
+  /// insertelement nodes, otherwise skip them.
+  Optional<OrdersType> getReorderingData(const TreeEntry &TE, bool TopToBottom);
+
   /// Reorders the current graph to the most profitable order starting from the
   /// root node to the leaf nodes. The best order is chosen only from the nodes
   /// of the same size (vectorization factor). Smaller nodes are considered
@@ -1010,18 +1016,25 @@ public:
       std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
     }
 
-    // The hard-coded scores listed here are not very important. When computing
-    // the scores of matching one sub-tree with another, we are basically
-    // counting the number of values that are matching. So even if all scores
-    // are set to 1, we would still get a decent matching result.
+    // The hard-coded scores listed here are not very important, though it shall
+    // be higher for better matches to improve the resulting cost. When
+    // computing the scores of matching one sub-tree with another, we are
+    // basically counting the number of values that are matching. So even if all
+    // scores are set to 1, we would still get a decent matching result.
     // However, sometimes we have to break ties. For example we may have to
     // choose between matching loads vs matching opcodes. This is what these
-    // scores are helping us with: they provide the order of preference.
+    // scores are helping us with: they provide the order of preference. Also,
+    // this is important if the scalar is externally used or used in another
+    // tree entry node in the different lane.
 
     /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
-    static const int ScoreConsecutiveLoads = 3;
+    static const int ScoreConsecutiveLoads = 4;
+    /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
+    static const int ScoreReversedLoads = 3;
     /// ExtractElementInst from same vector and consecutive indexes.
-    static const int ScoreConsecutiveExtracts = 3;
+    static const int ScoreConsecutiveExtracts = 4;
+    /// ExtractElementInst from same vector and reversed indices.
+    static const int ScoreReversedExtracts = 3;
     /// Constants.
     static const int ScoreConstants = 2;
     /// Instructions with the same opcode.
@@ -1041,7 +1054,10 @@ public:
     /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
     static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL,
-                               ScalarEvolution &SE) {
+                               ScalarEvolution &SE, int NumLanes) {
+      if (V1 == V2)
+        return VLOperands::ScoreSplat;
+
       auto *LI1 = dyn_cast<LoadInst>(V1);
       auto *LI2 = dyn_cast<LoadInst>(V2);
       if (LI1 && LI2) {
@@ -1051,8 +1067,17 @@ public:
         Optional<int> Dist = getPointersDiff(
             LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
             LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
-        return (Dist && *Dist == 1) ? VLOperands::ScoreConsecutiveLoads
-                                    : VLOperands::ScoreFail;
+        if (!Dist)
+          return VLOperands::ScoreFail;
+        // The distance is too large - still may be profitable to use masked
+        // loads/gathers.
+        if (std::abs(*Dist) > NumLanes / 2)
+          return VLOperands::ScoreAltOpcodes;
+        // This still will detect consecutive loads, but we might have "holes"
+        // in some cases. It is ok for non-power-2 vectorization and may produce
+        // better results. It should not affect current vectorization.
+        return (*Dist > 0) ? VLOperands::ScoreConsecutiveLoads
+                           : VLOperands::ScoreReversedLoads;
       }
 
       auto *C1 = dyn_cast<Constant>(V1);
@@ -1062,18 +1087,41 @@ public:
       // Extracts from consecutive indexes of the same vector better score as
       // the extracts could be optimized away.
-      Value *EV;
-      ConstantInt *Ex1Idx, *Ex2Idx;
-      if (match(V1, m_ExtractElt(m_Value(EV), m_ConstantInt(Ex1Idx))) &&
-          match(V2, m_ExtractElt(m_Deferred(EV), m_ConstantInt(Ex2Idx))) &&
-          Ex1Idx->getZExtValue() + 1 == Ex2Idx->getZExtValue())
-        return VLOperands::ScoreConsecutiveExtracts;
+      Value *EV1;
+      ConstantInt *Ex1Idx;
+      if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
+        // Undefs are always profitable for extractelements.
+        if (isa<UndefValue>(V2))
+          return VLOperands::ScoreConsecutiveExtracts;
+        Value *EV2 = nullptr;
+        ConstantInt *Ex2Idx = nullptr;
+        if (match(V2,
+                  m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
+                                                         m_Undef())))) {
+          // Undefs are always profitable for extractelements.
+          if (!Ex2Idx)
+            return VLOperands::ScoreConsecutiveExtracts;
+          if (isUndefVector(EV2) && EV2->getType() == EV1->getType())
+            return VLOperands::ScoreConsecutiveExtracts;
+          if (EV2 == EV1) {
+            int Idx1 = Ex1Idx->getZExtValue();
+            int Idx2 = Ex2Idx->getZExtValue();
+            int Dist = Idx2 - Idx1;
+            // The distance is too large - still may be profitable to use
+            // shuffles.
+            if (std::abs(Dist) > NumLanes / 2)
+              return VLOperands::ScoreAltOpcodes;
+            return (Dist > 0) ? VLOperands::ScoreConsecutiveExtracts
+                              : VLOperands::ScoreReversedExtracts;
+          }
+        }
+      }
 
       auto *I1 = dyn_cast<Instruction>(V1);
       auto *I2 = dyn_cast<Instruction>(V2);
       if (I1 && I2) {
-        if (I1 == I2)
-          return VLOperands::ScoreSplat;
+        if (I1->getParent() != I2->getParent())
+          return VLOperands::ScoreFail;
         InstructionsState S = getSameOpcode({I1, I2});
         // Note: Only consider instructions with <= 2 operands to avoid
         // complexity explosion.
@@ -1088,11 +1136,13 @@ public:
       return VLOperands::ScoreFail;
     }
 
-    /// Holds the values and their lane that are taking part in the look-ahead
+    /// Holds the values and their lanes that are taking part in the look-ahead
     /// score calculation. This is used in the external uses cost calculation.
-    SmallDenseMap<Value *, int> InLookAheadValues;
+    /// Need to hold all the lanes in case of splat/broadcast at least to
+    /// correctly check for the use in the different lane.
+    SmallDenseMap<Value *, SmallSet<int, 4>> InLookAheadValues;
 
-    /// \Returns the additinal cost due to uses of \p LHS and \p RHS that are
+    /// \returns the additional cost due to uses of \p LHS and \p RHS that are
     /// either external to the vectorized code, or require shuffling.
     int getExternalUsesCost(const std::pair<Value *, int> &LHS,
                             const std::pair<Value *, int> &RHS) {
@@ -1116,22 +1166,30 @@ public:
         for (User *U : V->users()) {
           if (const TreeEntry *UserTE = R.getTreeEntry(U)) {
             // The user is in the VectorizableTree. Check if we need to insert.
-            auto It = llvm::find(UserTE->Scalars, U);
-            assert(It != UserTE->Scalars.end() && "U is in UserTE");
-            int UserLn = std::distance(UserTE->Scalars.begin(), It);
+            int UserLn = UserTE->findLaneForValue(U);
             assert(UserLn >= 0 && "Bad lane");
-            if (UserLn != Ln)
+            // If the values are different, check just the line of the current
+            // value. If the values are the same, need to add UserInDiffLaneCost
+            // only if UserLn does not match both line numbers.
+            if ((LHS.first != RHS.first && UserLn != Ln) ||
+                (LHS.first == RHS.first && UserLn != LHS.second &&
+                 UserLn != RHS.second)) {
               Cost += UserInDiffLaneCost;
+              break;
+            }
           } else {
             // Check if the user is in the look-ahead code.
             auto It2 = InLookAheadValues.find(U);
             if (It2 != InLookAheadValues.end()) {
               // The user is in the look-ahead code. Check the lane.
-              if (It2->second != Ln)
+              if (!It2->getSecond().contains(Ln)) {
                 Cost += UserInDiffLaneCost;
+                break;
+              }
             } else {
               // The user is neither in SLP tree nor in the look-ahead code.
               Cost += ExternalUseCost;
+              break;
             }
           }
           // Limit the number of visited uses to cap compilation time.
@@ -1170,32 +1228,36 @@ public:
       Value *V1 = LHS.first;
       Value *V2 = RHS.first;
       // Get the shallow score of V1 and V2.
-      int ShallowScoreAtThisLevel =
-          std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) -
-                                       getExternalUsesCost(LHS, RHS));
+      int ShallowScoreAtThisLevel = std::max(
+          (int)ScoreFail, getShallowScore(V1, V2, DL, SE, getNumLanes()) -
+                              getExternalUsesCost(LHS, RHS));
       int Lane1 = LHS.second;
       int Lane2 = RHS.second;
 
       // If reached MaxLevel,
       // or if V1 and V2 are not instructions,
       // or if they are SPLAT,
-      // or if they are not consecutive, early return the current cost.
+      // or if they are not consecutive,
+      // or if profitable to vectorize loads or extractelements, early return
+      // the current cost.
       auto *I1 = dyn_cast<Instruction>(V1);
       auto *I2 = dyn_cast<Instruction>(V2);
       if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
          ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
-          (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel))
+          (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
+            (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
+           ShallowScoreAtThisLevel))
        return ShallowScoreAtThisLevel;
       assert(I1 && I2 && "Should have early exited.");
 
       // Keep track of in-tree values for determining the external-use cost.
-      InLookAheadValues[V1] = Lane1;
-      InLookAheadValues[V2] = Lane2;
+      InLookAheadValues[V1].insert(Lane1);
+      InLookAheadValues[V2].insert(Lane2);
 
       // Contains the I2 operand indexes that got matched with I1 operands.
       SmallSet<unsigned, 4> Op2Used;
 
-      // Recursion towards the operands of I1 and I2. We are trying all possbile
+      // Recursion towards the operands of I1 and I2. We are trying all possible
       // operand pairs, and keeping track of the best score.
       for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
            OpIdx1 != NumOperands1; ++OpIdx1) {
@@ -1319,27 +1381,79 @@ public:
       return None;
     }
 
-    /// Helper for reorderOperandVecs. \Returns the lane that we should start
-    /// reordering from. This is the one which has the least number of operands
-    /// that can freely move about.
+    /// Helper for reorderOperandVecs.
+    /// \returns the lane that we should start reordering from. This is the one
+    /// which has the least number of operands that can freely move about or
+    /// less profitable because it already has the most optimal set of operands.
     unsigned getBestLaneToStartReordering() const {
-      unsigned BestLane = 0;
       unsigned Min = UINT_MAX;
-      for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
-           ++Lane) {
-        unsigned NumFreeOps = getMaxNumOperandsThatCanBeReordered(Lane);
-        if (NumFreeOps < Min) {
-          Min = NumFreeOps;
-          BestLane = Lane;
+      unsigned SameOpNumber = 0;
+      // std::pair<unsigned, unsigned> is used to implement a simple voting
+      // algorithm and choose the lane with the least number of operands that
+      // can freely move about or less profitable because it already has the
+      // most optimal set of operands. The first unsigned is a counter for
+      // voting, the second unsigned is the counter of lanes with instructions
+      // with same/alternate opcodes and same parent basic block.
+      MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap;
+      // Try to be closer to the original results, if we have multiple lanes
+      // with same cost. If 2 lanes have the same cost, use the one with the
+      // lowest index.
+      for (int I = getNumLanes(); I > 0; --I) {
+        unsigned Lane = I - 1;
+        OperandsOrderData NumFreeOpsHash =
+            getMaxNumOperandsThatCanBeReordered(Lane);
+        // Compare the number of operands that can move and choose the one with
+        // the least number.
+        if (NumFreeOpsHash.NumOfAPOs < Min) {
+          Min = NumFreeOpsHash.NumOfAPOs;
+          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
+          HashMap.clear();
+          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
+        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
+                   NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
+          // Select the most optimal lane in terms of number of operands that
+          // should be moved around.
+          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
+          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
+        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
+                   NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
+          ++HashMap[NumFreeOpsHash.Hash].first;
+        }
+      }
+      // Select the lane with the minimum counter.
+      unsigned BestLane = 0;
+      unsigned CntMin = UINT_MAX;
+      for (const auto &Data : reverse(HashMap)) {
+        if (Data.second.first < CntMin) {
+          CntMin = Data.second.first;
+          BestLane = Data.second.second;
         }
       }
       return BestLane;
     }
 
-    /// \Returns the maximum number of operands that are allowed to be reordered
-    /// for \p Lane. This is used as a heuristic for selecting the first lane to
-    /// start operand reordering.
-    unsigned getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
+    /// Data structure that helps to reorder operands.
+    struct OperandsOrderData {
+      /// The best number of operands with the same APOs, which can be
+      /// reordered.
+      unsigned NumOfAPOs = UINT_MAX;
+      /// Number of operands with the same/alternate instruction opcode and
+      /// parent.
+      unsigned NumOpsWithSameOpcodeParent = 0;
+      /// Hash for the actual operands ordering.
+      /// Used to count operands, actually their position id and opcode
+      /// value. It is used in the voting mechanism to find the lane with the
+      /// least number of operands that can freely move about or less profitable
+      /// because it already has the most optimal set of operands. Can be
+      /// replaced with SmallVector<unsigned> instead but hash code is faster
+      /// and requires less memory.
+      unsigned Hash = 0;
+    };
+    /// \returns the maximum number of operands that are allowed to be reordered
+    /// for \p Lane and the number of compatible instructions(with the same
+    /// parent/opcode). This is used as a heuristic for selecting the first lane
+    /// to start operand reordering.
+    OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
       unsigned CntTrue = 0;
       unsigned NumOperands = getNumOperands();
       // Operands with the same APO can be reordered. We therefore need to count
@@ -1348,11 +1462,45 @@ public:
      // a map. Instead we can simply count the number of operands that
      // correspond to one of them (in this case the 'true' APO), and calculate
      // the other by subtracting it from the total number of operands.
-      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx)
-        if (getData(OpIdx, Lane).APO)
+      // Operands with the same instruction opcode and parent are more
+      // profitable since we don't need to move them in many cases, with a high
+      // probability such lane already can be vectorized effectively.
+      bool AllUndefs = true;
+      unsigned NumOpsWithSameOpcodeParent = 0;
+      Instruction *OpcodeI = nullptr;
+      BasicBlock *Parent = nullptr;
+      unsigned Hash = 0;
+      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
+        const OperandData &OpData = getData(OpIdx, Lane);
+        if (OpData.APO)
          ++CntTrue;
-      unsigned CntFalse = NumOperands - CntTrue;
-      return std::max(CntTrue, CntFalse);
+        // Use Boyer-Moore majority voting for finding the majority opcode and
+        // the number of times it occurs.
+        if (auto *I = dyn_cast<Instruction>(OpData.V)) {
+          if (!OpcodeI || !getSameOpcode({OpcodeI, I}).getOpcode() ||
+              I->getParent() != Parent) {
+            if (NumOpsWithSameOpcodeParent == 0) {
+              NumOpsWithSameOpcodeParent = 1;
+              OpcodeI = I;
+              Parent = I->getParent();
+            } else {
+              --NumOpsWithSameOpcodeParent;
+            }
+          } else {
+            ++NumOpsWithSameOpcodeParent;
+          }
+        }
+        Hash = hash_combine(
+            Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
+        AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
+      }
+      if (AllUndefs)
+        return {};
+      OperandsOrderData Data;
+      Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
+      Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
+      Data.Hash = Hash;
+      return Data;
    }
 
    /// Go through the instructions in VL and append their operands.
@@ -1500,11 +1648,37 @@ public:
          ReorderingModes[OpIdx] = ReorderingMode::Failed;
      }
 
+      // Check that we don't have same operands. No need to reorder if operands
+      // are just perfect diamond or shuffled diamond match. Do not do it only
+      // for possible broadcasts or non-power of 2 number of scalars (just for
+      // now).
+      auto &&SkipReordering = [this]() {
+        SmallPtrSet<Value *, 4> UniqueValues;
+        ArrayRef<OperandData> Op0 = OpsVec.front();
+        for (const OperandData &Data : Op0)
+          UniqueValues.insert(Data.V);
+        for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
+          if (any_of(Op, [&UniqueValues](const OperandData &Data) {
+                return !UniqueValues.contains(Data.V);
+              }))
+            return false;
+        }
+        // TODO: Check if we can remove a check for non-power-2 number of
+        // scalars after full support of non-power-2 vectorization.
+        return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size());
+      };
+
      // If the initial strategy fails for any of the operand indexes, then we
      // perform reordering again in a second pass. This helps avoid assigning
      // high priority to the failed strategy, and should improve reordering for
      // the non-failed operand indexes.
      for (int Pass = 0; Pass != 2; ++Pass) {
+        // Check if no need to reorder operands since they're are perfect or
+        // shuffled diamond match.
+        // Need to to do it to avoid extra external use cost counting for
+        // shuffled matches, which may cause regressions.
+        if (SkipReordering())
+          break;
        // Skip the second pass if the first pass did not fail.
        bool StrategyFailed = false;
        // Mark all operand data as free to use.
@@ -1792,9 +1966,10 @@ private:
      if (Operands.size() < OpIdx + 1)
        Operands.resize(OpIdx + 1);
      assert(Operands[OpIdx].empty() && "Already resized?");
-      Operands[OpIdx].resize(Scalars.size());
-      for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane)
-        Operands[OpIdx][Lane] = OpVL[Lane];
+      assert(OpVL.size() <= Scalars.size() &&
+             "Number of operands is greater than the number of scalars.");
+      Operands[OpIdx].resize(OpVL.size());
+      copy(OpVL, Operands[OpIdx].begin());
    }
 
    /// Set the operands of this bundle in their original order.
@@ -1944,7 +2119,7 @@ private:
      if (ReuseShuffleIndices.empty())
        dbgs() << "Empty";
      else
-        for (unsigned ReuseIdx : ReuseShuffleIndices)
+        for (int ReuseIdx : ReuseShuffleIndices)
          dbgs() << ReuseIdx << ", ";
      dbgs() << "\n";
      dbgs() << "ReorderIndices: ";
@@ -2819,6 +2994,50 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
  return None;
 }
 
+Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE,
+                                                         bool TopToBottom) {
+  // No need to reorder if need to shuffle reuses, still need to shuffle the
+  // node.
+  if (!TE.ReuseShuffleIndices.empty())
+    return None;
+  if (TE.State == TreeEntry::Vectorize &&
+      (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
+       (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
+      !TE.isAltShuffle())
+    return TE.ReorderIndices;
+  if (TE.State == TreeEntry::NeedToGather) {
+    // TODO: add analysis of other gather nodes with extractelement
+    // instructions and other values/instructions, not only undefs.
+    if (((TE.getOpcode() == Instruction::ExtractElement &&
+          !TE.isAltShuffle()) ||
+         (all_of(TE.Scalars,
+                 [](Value *V) {
+                   return isa<UndefValue, ExtractElementInst>(V);
+                 }) &&
+          any_of(TE.Scalars,
+                 [](Value *V) { return isa<ExtractElementInst>(V); }))) &&
+        all_of(TE.Scalars,
+               [](Value *V) {
+                 auto *EE = dyn_cast<ExtractElementInst>(V);
+                 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
+               }) &&
+        allSameType(TE.Scalars)) {
+      // Check that gather of extractelements can be represented as
+      // just a shuffle of a single vector.
+      OrdersType CurrentOrder;
+      bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder);
+      if (Reuse || !CurrentOrder.empty()) {
+        if (!CurrentOrder.empty())
+          fixupOrderingIndices(CurrentOrder);
+        return CurrentOrder;
+      }
+    }
+    if (Optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
+      return CurrentOrder;
+  }
+  return None;
+}
+
 void BoUpSLP::reorderTopToBottom() {
  // Maps VF to the graph nodes.
  DenseMap<unsigned, SmallPtrSet<TreeEntry *, 4>> VFToOrderedEntries;
@@ -2826,42 +3045,15 @@ void BoUpSLP::reorderTopToBottom() {
  // their ordering.
  DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
  // Find all reorderable nodes with the given VF.
-  // Currently the are vectorized loads,extracts + some gathering of extracts.
+  // Currently the are vectorized stores,loads,extracts + some gathering of
+  // extracts.
  for_each(VectorizableTree, [this, &VFToOrderedEntries, &GathersToOrders](
                                 const std::unique_ptr<TreeEntry> &TE) {
-    // No need to reorder if need to shuffle reuses, still need to shuffle the
-    // node.
-    if (!TE->ReuseShuffleIndices.empty())
-      return;
-    if (TE->State == TreeEntry::Vectorize &&
-        isa<LoadInst, ExtractElementInst, ExtractValueInst, StoreInst,
-            InsertElementInst>(TE->getMainOp()) &&
-        !TE->isAltShuffle()) {
+    if (Optional<OrdersType> CurrentOrder =
+            getReorderingData(*TE.get(), /*TopToBottom=*/true)) {
      VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
-      return;
-    }
-    if (TE->State == TreeEntry::NeedToGather) {
-      if (TE->getOpcode() == Instruction::ExtractElement &&
-          !TE->isAltShuffle() &&
-          isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp())
-                                   ->getVectorOperandType()) &&
-          allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) {
-        // Check that gather of extractelements can be represented as
-        // just a shuffle of a single vector.
-        OrdersType CurrentOrder;
-        bool Reuse =
-            canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder);
-        if (Reuse || !CurrentOrder.empty()) {
-          VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
-          GathersToOrders.try_emplace(TE.get(), CurrentOrder);
-          return;
-        }
-      }
-      if (Optional<OrdersType> CurrentOrder =
-              findReusedOrderedScalars(*TE.get())) {
-        VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
+      if (TE->State != TreeEntry::Vectorize)
        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
-      }
    }
  });
 
@@ -2993,44 +3185,11 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
                                 const std::unique_ptr<TreeEntry> &TE) {
    if (TE->State != TreeEntry::Vectorize)
      NonVectorized.push_back(TE.get());
-    // No need to reorder if need to shuffle reuses, still need to shuffle the
-    // node.
-    if (!TE->ReuseShuffleIndices.empty())
-      return;
-    if (TE->State == TreeEntry::Vectorize &&
-        isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE->getMainOp()) &&
-        !TE->isAltShuffle()) {
+    if (Optional<OrdersType> CurrentOrder =
+            getReorderingData(*TE.get(), /*TopToBottom=*/false)) {
      OrderedEntries.insert(TE.get());
-      return;
-    }
-    if (TE->State == TreeEntry::NeedToGather) {
-      if (TE->getOpcode() == Instruction::ExtractElement &&
-          !TE->isAltShuffle() &&
-          isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp())
-                                   ->getVectorOperandType()) &&
-          allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) {
-        // Check that gather of extractelements can be represented as
-        // just a shuffle of a single vector with a single user only.
-        OrdersType CurrentOrder;
-        bool Reuse =
-            canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder);
-        if ((Reuse || !CurrentOrder.empty()) &&
-            !any_of(VectorizableTree,
-                    [&TE](const std::unique_ptr<TreeEntry> &Entry) {
-                      return Entry->State == TreeEntry::NeedToGather &&
-                             Entry.get() != TE.get() &&
-                             Entry->isSame(TE->Scalars);
-                    })) {
-          OrderedEntries.insert(TE.get());
-          GathersToOrders.try_emplace(TE.get(), CurrentOrder);
-          return;
-        }
-      }
-      if (Optional<OrdersType> CurrentOrder =
-              findReusedOrderedScalars(*TE.get())) {
-        OrderedEntries.insert(TE.get());
+      if (TE->State != TreeEntry::Vectorize)
        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
-      }
    }
  });
 
@@ -3392,9 +3551,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
    // Check that every instruction appears once in this bundle.
    DenseMap<Value *, unsigned> UniquePositions;
    for (Value *V : VL) {
+      if (isConstant(V)) {
+        ReuseShuffleIndicies.emplace_back(
+            isa<UndefValue>(V) ? UndefMaskElem : UniqueValues.size());
+        UniqueValues.emplace_back(V);
+        continue;
+      }
      auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
-      ReuseShuffleIndicies.emplace_back(isa<UndefValue>(V) ? -1
-                                                           : Res.first->second);
+      ReuseShuffleIndicies.emplace_back(Res.first->second);
      if (Res.second)
        UniqueValues.emplace_back(V);
    }
@@ -3404,6 +3568,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
    } else {
      LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
      if (NumUniqueScalarValues <= 1 ||
+          (UniquePositions.size() == 1 && all_of(UniqueValues,
+                                                 [](Value *V) {
+                                                   return isa<UndefValue>(V) ||
+                                                          !isConstant(V);
+                                                 })) ||
          !llvm::isPowerOf2_32(NumUniqueScalarValues)) {
        LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
        newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
@@ -3508,11 +3677,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
    }
  }
 
-  // If any of the scalars is marked as a value that needs to stay scalar, then
-  // we need to gather the scalars.
  // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
  for (Value *V : VL) {
-    if (MustGather.count(V) || is_contained(UserIgnoreList, V)) {
+    if (is_contained(UserIgnoreList, V)) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
      if (TryToFindDuplicates(S))
        newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
@@ -4219,10 +4386,17 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
 
 bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
                              SmallVectorImpl<unsigned> &CurrentOrder) const {
-  Instruction *E0 = cast<Instruction>(OpValue);
-  assert(E0->getOpcode() == Instruction::ExtractElement ||
-         E0->getOpcode() == Instruction::ExtractValue);
-  assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode");
+  const auto *It = find_if(VL, [](Value *V) {
+    return isa<ExtractElementInst, ExtractValueInst>(V);
+  });
+  assert(It != VL.end() && "Expected at least one extract instruction.");
+  auto *E0 = cast<Instruction>(*It);
+  assert(all_of(VL,
+                [](Value *V) {
+                  return isa<UndefValue, ExtractElementInst, ExtractValueInst>(
+                      V);
+                }) &&
+         "Invalid opcode");
  // Check if all of the extracts come from the same vector and from the
  // correct offset.
  Value *Vec = E0->getOperand(0);
@@ -4255,23 +4429,28 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
  // Also, later we can check that all the indices are used and we have a
  // consecutive access in the extract instructions, by checking that no
  // element of CurrentOrder still has value E + 1.
-  CurrentOrder.assign(E, E + 1);
+  CurrentOrder.assign(E, E);
  unsigned I = 0;
  for (; I < E; ++I) {
-    auto *Inst = cast<Instruction>(VL[I]);
+    auto *Inst = dyn_cast<Instruction>(VL[I]);
+    if (!Inst)
+      continue;
    if (Inst->getOperand(0) != Vec)
      break;
+    if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
+      if (isa<UndefValue>(EE->getIndexOperand()))
+        continue;
    Optional<unsigned> Idx = getExtractIndex(Inst);
    if (!Idx)
      break;
    const unsigned ExtIdx = *Idx;
    if (ExtIdx != I) {
-      if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1)
+      if (ExtIdx >= E || CurrentOrder[ExtIdx] != E)
        break;
      ShouldKeepOrder = false;
      CurrentOrder[ExtIdx] = I;
    } else {
-      if (CurrentOrder[I] != E + 1)
+      if (CurrentOrder[I] != E)
        break;
      CurrentOrder[I] = I;
    }
@@ -4287,8 +4466,8 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
 bool BoUpSLP::areAllUsersVectorized(Instruction *I,
                                    ArrayRef<Value *> VectorizedVals) const {
  return (I->hasOneUse() && is_contained(VectorizedVals, I)) ||
-         llvm::all_of(I->users(), [this](User *U) {
-           return ScalarToTreeEntry.count(U) > 0;
+         all_of(I->users(), [this](User *U) {
+           return ScalarToTreeEntry.count(U) > 0 || MustGather.contains(U);
         });
 }
 
@@ -4348,6 +4527,10 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
  for (auto *V : VL) {
    ++Idx;
 
+    // Need to exclude undefs from analysis.
+    if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem)
+      continue;
+
    // Reached the start of a new vector registers.
    if (Idx % EltsPerVector == 0) {
      AllConsecutive = true;
@@ -4357,9 +4540,11 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
    // Check all extracts for a vector register on the target directly
    // extract values in order.
    unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V));
-    unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1]));
-    AllConsecutive &= PrevIdx + 1 == CurrentIdx &&
-                      CurrentIdx % EltsPerVector == Idx % EltsPerVector;
+    if (!isa<UndefValue>(VL[Idx - 1]) && Mask[Idx - 1] != UndefMaskElem) {
+      unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1]));
+      AllConsecutive &= PrevIdx + 1 == CurrentIdx &&
+                        CurrentIdx % EltsPerVector == Idx % EltsPerVector;
+    }
 
    if (AllConsecutive)
      continue;
@@ -4442,9 +4627,9 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
    // FIXME: it tries to fix a problem with MSVC buildbots.
    TargetTransformInfo &TTIRef = *TTI;
    auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL, VecTy,
-                                 VectorizedVals](InstructionCost &Cost,
-                                                 bool IsGather) {
+                                 VectorizedVals, E](InstructionCost &Cost) {
      DenseMap<Value *, int> ExtractVectorsTys;
+      SmallPtrSet<Value *, 4> CheckedExtracts;
      for (auto *V : VL) {
        if (isa<UndefValue>(V))
          continue;
        // instruction itself is not going to be vectorized, consider this
        // instruction as dead and remove its cost from the final cost of the
        // vectorized tree.
-        if (!areAllUsersVectorized(cast<Instruction>(V), VectorizedVals))
+        // Also, avoid adjusting the cost for extractelements with multiple uses
+        // in different graph entries.
+        const TreeEntry *VE = getTreeEntry(V);
+        if (!CheckedExtracts.insert(V).second ||
+            !areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) ||
+            (VE && VE != E))
          continue;
        auto *EE = cast<ExtractElementInst>(V);
        Optional<unsigned> EEIdx = getExtractIndex(EE);
@@ -4549,11 +4739,6 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
        }
        return GatherCost;
      }
-      if (isSplat(VL)) {
-        // Found the broadcasting of the single scalar, calculate the cost as the
-        // broadcast.
-        return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy);
-      }
      if ((E->getOpcode() == Instruction::ExtractElement ||
           all_of(E->Scalars,
                  [](Value *V) {
@@ -4571,13 +4756,20 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
        // single input vector or of 2 input vectors.
        InstructionCost Cost =
            computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI);
-        AdjustExtractsCost(Cost, /*IsGather=*/true);
+        AdjustExtractsCost(Cost);
        if (NeedToShuffleReuses)
          Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
                                      FinalVecTy, E->ReuseShuffleIndices);
        return Cost;
      }
    }
+    if (isSplat(VL)) {
+      // Found the broadcasting of the single scalar, calculate the cost as the
+      // broadcast.
+      assert(VecTy == FinalVecTy &&
+             "No reused scalars expected for broadcast.");
+      return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy);
+    }
    InstructionCost ReuseShuffleCost = 0;
    if (NeedToShuffleReuses)
      ReuseShuffleCost = TTI->getShuffleCost(
@@ -4755,7 +4947,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
            TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I);
      }
    } else {
-      AdjustExtractsCost(CommonCost, /*IsGather=*/false);
+      AdjustExtractsCost(CommonCost);
    }
    return CommonCost;
  }
@@ -5211,15 +5403,15 @@ static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
      FoundOr = true;
  }
  // Check if the input is an extended load of the required or/shift expression.
-  Value *LoadPtr;
+  Value *Load;
  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
-      !match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
+      !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
    return false;
 
  // Require that the total load bit width is a legal integer type.
  // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
  // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
-  Type *SrcTy = LoadPtr->getType()->getPointerElementType();
+  Type *SrcTy = Load->getType();
  unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
  if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
    return false;
@@ -9061,8 +9253,7 @@ private:
           "A call to the llvm.fmuladd intrinsic is not handled yet");
 
    ++NumVectorInstructions;
-    return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind,
-                                       ReductionOps.back());
+    return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind);
  }
 };
 
@@ -9473,6 +9664,59 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming,
  return Changed;
 }
 
+/// Compare two cmp instructions. If IsCompatibility is true, function returns
+/// true if 2 cmps have same/swapped predicates and mos compatible corresponding
+/// operands. If IsCompatibility is false, function implements strict weak
+/// ordering relation between two cmp instructions, returning true if the first
+/// instruction is "less" than the second, i.e. its predicate is less than the
+/// predicate of the second or the operands IDs are less than the operands IDs
+/// of the second cmp instruction.
+template <bool IsCompatibility>
+static bool compareCmp(Value *V, Value *V2,
+                       function_ref<bool(Instruction *)> IsDeleted) {
+  auto *CI1 = cast<CmpInst>(V);
+  auto *CI2 = cast<CmpInst>(V2);
+  if (IsDeleted(CI2) || !isValidElementType(CI2->getType()))
+    return false;
+  if (CI1->getOperand(0)->getType()->getTypeID() <
+      CI2->getOperand(0)->getType()->getTypeID())
+    return !IsCompatibility;
+  if (CI1->getOperand(0)->getType()->getTypeID() >
+      CI2->getOperand(0)->getType()->getTypeID())
+    return false;
+  CmpInst::Predicate Pred1 = CI1->getPredicate();
+  CmpInst::Predicate Pred2 = CI2->getPredicate();
+  CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
+  CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
+  CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
+  CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
+  if (BasePred1 < BasePred2)
+    return !IsCompatibility;
+  if (BasePred1 > BasePred2)
+    return false;
+  // Compare operands.
+  bool LEPreds = Pred1 <= Pred2;
+  bool GEPreds = Pred1 >= Pred2;
+  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
+    auto *Op1 = CI1->getOperand(LEPreds ? I : E - I - 1);
+    auto *Op2 = CI2->getOperand(GEPreds ? I : E - I - 1);
+    if (Op1->getValueID() < Op2->getValueID())
+      return !IsCompatibility;
+    if (Op1->getValueID() > Op2->getValueID())
+      return false;
+    if (auto *I1 = dyn_cast<Instruction>(Op1))
+      if (auto *I2 = dyn_cast<Instruction>(Op2)) {
+        if (I1->getParent() != I2->getParent())
+          return false;
+        InstructionsState S = getSameOpcode({I1, I2});
+        if (S.getOpcode())
+          continue;
+        return false;
+      }
+  }
+  return IsCompatibility;
+}
+
 bool SLPVectorizerPass::vectorizeSimpleInstructions(
    SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R,
    bool AtTerminator) {
@@ -9504,37 +9748,16 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions(
  }
  // Try to vectorize list of compares.
  // Sort by type, compare predicate, etc.
-  // TODO: Add analysis on the operand opcodes (profitable to vectorize
-  // instructions with same/alternate opcodes/const values).
  auto &&CompareSorter = [&R](Value *V, Value *V2) {
-    auto *CI1 = cast<CmpInst>(V);
-    auto *CI2 = cast<CmpInst>(V2);
-    if (R.isDeleted(CI2) || !isValidElementType(CI2->getType()))
-      return false;
-    if (CI1->getOperand(0)->getType()->getTypeID() <
-        CI2->getOperand(0)->getType()->getTypeID())
-      return true;
-    if (CI1->getOperand(0)->getType()->getTypeID() >
-        CI2->getOperand(0)->getType()->getTypeID())
-      return false;
-    return CI1->getPredicate() < CI2->getPredicate() ||
-           (CI1->getPredicate() > CI2->getPredicate() &&
-            CI1->getPredicate() <
-                CmpInst::getSwappedPredicate(CI2->getPredicate()));
+    return compareCmp<false>(V, V2,
+                             [&R](Instruction *I) { return R.isDeleted(I); });
  };
 
  auto &&AreCompatibleCompares = [&R](Value *V1, Value *V2) {
    if (V1 == V2)
      return true;
-    auto *CI1 = cast<CmpInst>(V1);
-    auto *CI2 = cast<CmpInst>(V2);
-    if (R.isDeleted(CI2) || !isValidElementType(CI2->getType()))
-      return false;
-    if (CI1->getOperand(0)->getType() != CI2->getOperand(0)->getType())
-      return false;
-    return CI1->getPredicate() == CI2->getPredicate() ||
-           CI1->getPredicate() ==
-               CmpInst::getSwappedPredicate(CI2->getPredicate());
+    return compareCmp<true>(V1, V2,
+                            [&R](Instruction *I) { return R.isDeleted(I); });
  };
  auto Limit = [&R](Value *V) {
    unsigned EltSize = R.getVectorElementSize(V);
@@ -9592,10 +9815,15 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
      return true;
    if (Opcodes1.size() > Opcodes2.size())
      return false;
+    Optional<bool> ConstOrder;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      // Undefs are compatible with any other value.
-      if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
+      if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I])) {
+        if (!ConstOrder)
+          ConstOrder =
+              !isa<UndefValue>(Opcodes1[I]) && isa<UndefValue>(Opcodes2[I]);
        continue;
+      }
      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
@@ -9614,14 +9842,17 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
            continue;
          return I1->getOpcode() < I2->getOpcode();
        }
-      if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
+      if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I])) {
+        if (!ConstOrder)
+          ConstOrder = Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID();
        continue;
+      }
      if (Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID())
        return true;
      if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID())
        return false;
    }
-    return false;
+    return ConstOrder && *ConstOrder;
  };
  auto AreCompatiblePHIs = [&PHIToOpcodes](Value *V1, Value *V2) {
    if (V1 == V2)
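The first hunk (around line 631) is easier to follow outside the diff: positions of Order that hold an out-of-range marker are filled with whichever indices are not already used, walking the "masked" and "unused" sets in lockstep. The sketch below is a minimal standalone rendition of that idea, not the LLVM code itself; it substitutes std::vector<bool> for SmallBitVector, and the input values are made up for illustration.

```cpp
// Standalone sketch of the reworked fixupOrderingIndices logic: out-of-range
// ("masked") slots are filled with the indices that no in-range entry uses,
// scanning both sets in lockstep. Illustration only, with std::vector<bool>
// standing in for SmallBitVector.
#include <cassert>
#include <cstdio>
#include <vector>

static void fixupOrderingIndicesSketch(std::vector<unsigned> &Order) {
  const unsigned Sz = Order.size();
  std::vector<bool> Unused(Sz, true);  // indices not yet claimed by Order
  std::vector<bool> Masked(Sz, false); // positions holding an out-of-range value
  for (unsigned I = 0; I < Sz; ++I) {
    if (Order[I] < Sz)
      Unused[Order[I]] = false;
    else
      Masked[I] = true;
  }
  unsigned Idx = 0; // next unused index to hand out
  for (unsigned MIdx = 0; MIdx < Sz; ++MIdx) {
    if (!Masked[MIdx])
      continue;
    while (Idx < Sz && !Unused[Idx])
      ++Idx;
    assert(Idx < Sz && "Indices must be synced.");
    Order[MIdx] = Idx++;
  }
}

int main() {
  // Hypothetical input: positions 1, 3 and 5 carry out-of-range markers (99).
  std::vector<unsigned> Order = {2, 99, 0, 99, 4, 99};
  fixupOrderingIndicesSketch(Order);
  for (unsigned V : Order)
    std::printf("%u ", V); // prints: 2 1 0 3 4 5
  std::printf("\n");
}
```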
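The getShallowScore hunks replace the old all-or-nothing load check (distance must be exactly +1) with a graded score based on the distance between two loads. Below is a hedged sketch of just that grading; the plain integer distance stands in for the real getPointersDiff result, the consecutive/reversed values mirror the hunk, and the remaining constants are illustrative assumptions.

```cpp
// Graded load scoring, as in the new getShallowScore: consecutive forward,
// reversed, or "far apart but maybe still worth a masked load/gather".
// Sketch only; not the LLVM implementation.
#include <cstdio>
#include <cstdlib>
#include <optional>

enum Score { Fail = 0, AltOpcodes = 1, ReversedLoads = 3, ConsecutiveLoads = 4 };

static int scoreLoadPair(std::optional<int> Dist, int NumLanes) {
  if (!Dist)
    return Fail; // the two pointers do not relate to each other
  if (std::abs(*Dist) > NumLanes / 2)
    return AltOpcodes; // too far apart, but possibly a masked load/gather
  return (*Dist > 0) ? ConsecutiveLoads : ReversedLoads;
}

int main() {
  std::printf("%d %d %d %d\n",
              scoreLoadPair(1, 8),             // consecutive: 4
              scoreLoadPair(-1, 8),            // reversed: 3
              scoreLoadPair(7, 8),             // far apart: 1
              scoreLoadPair(std::nullopt, 8)); // unrelated: 0
}
```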
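The new getMaxNumOperandsThatCanBeReordered counts, per lane, how many operands share one opcode and parent block using Boyer-Moore majority voting: a single pass keeps one candidate and a counter, incrementing on a match and otherwise decrementing or replacing the candidate. The sketch below shows only the voting scheme; plain integers stand in for the (opcode, parent) pairs the real code compares, and the sample lane is invented.

```cpp
// Boyer-Moore majority vote in isolation, as used by the hunk to find the
// dominant opcode/parent in a lane. Illustration only.
#include <cstdio>
#include <vector>

static int majorityCandidate(const std::vector<int> &Ops, unsigned &Count) {
  int Candidate = 0;
  Count = 0;
  for (int Op : Ops) {
    if (Count == 0) { // adopt a new candidate
      Candidate = Op;
      Count = 1;
    } else if (Op == Candidate) {
      ++Count; // vote for the current candidate
    } else {
      --Count; // vote against it
    }
  }
  return Candidate;
}

int main() {
  // Hypothetical lane where "opcode" 7 dominates.
  std::vector<int> Lane = {7, 3, 7, 7, 5, 7};
  unsigned Count = 0;
  int Winner = majorityCandidate(Lane, Count);
  std::printf("candidate %d, residual count %u\n", Winner, Count);
}
```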
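The added compareCmp<IsCompatibility> template folds the old CompareSorter and AreCompatibleCompares lambdas into one comparator: instantiated with false it is a strict weak ordering used for sorting candidates next to each other, with true it answers whether two candidates are compatible. A simplified analogue over a hypothetical key struct (stand-ins for type ID and canonical predicate) shows the pattern and how both instantiations are used together.

```cpp
// Dual-purpose comparator sketch in the spirit of compareCmp<IsCompatibility>.
// CmpKey and its fields are hypothetical; this is not the LLVM code.
#include <algorithm>
#include <cstdio>
#include <vector>

struct CmpKey {
  int TypeID;   // stand-in for the operand type's TypeID
  int BasePred; // stand-in for min(Pred, SwappedPred)
};

template <bool IsCompatibility>
static bool compareKeys(const CmpKey &A, const CmpKey &B) {
  if (A.TypeID < B.TypeID)
    return !IsCompatibility; // "less" when sorting, "incompatible" when grouping
  if (A.TypeID > B.TypeID)
    return false;
  if (A.BasePred < B.BasePred)
    return !IsCompatibility;
  if (A.BasePred > B.BasePred)
    return false;
  // Equal keys: not "less" for sorting purposes, compatible for grouping.
  return IsCompatibility;
}

int main() {
  std::vector<CmpKey> Keys = {{2, 40}, {1, 33}, {2, 32}, {1, 33}};
  // Strict weak ordering: sort so that compatible candidates end up adjacent.
  std::stable_sort(Keys.begin(), Keys.end(), compareKeys<false>);
  for (size_t I = 0; I + 1 < Keys.size(); ++I)
    std::printf("keys %zu and %zu compatible: %d\n", I, I + 1,
                compareKeys<true>(Keys[I], Keys[I + 1]));
}
```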