author     Dimitry Andric <dim@FreeBSD.org>    2020-01-24 22:00:03 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2020-01-24 22:00:03 +0000
commit     480093f4440d54b30b3025afeac24b48f2ba7a2e (patch)
tree       162e72994062888647caf0d875428db9445491a8 /contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
parent     489b1cf2ecf5b9b4a394857987014bfb09067726 (diff)
parent     706b4fc47bbc608932d3b491ae19a3b9cde9497b (diff)
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 965 |
1 file changed, 649 insertions, 316 deletions
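Much of this merge implements a look-ahead operand-reordering heuristic for the SLP vectorizer. As a rough standalone sketch of the idea (the toy expression model and every name below are invented for illustration; this is not LLVM code), a one-level look-ahead prefers pairing expressions whose operands are loads from consecutive addresses:

    #include <algorithm>
    #include <cstdio>

    // Toy expression node: either a leaf load (Base array, element Index)
    // or an addition of two sub-expressions.
    struct Expr {
      enum Kind { Load, Add } K;
      int Base = 0, Index = 0;                   // valid when K == Load
      const Expr *Op0 = nullptr, *Op1 = nullptr; // valid when K == Add
    };

    // Shallow score, mirroring the patch's preference order:
    // consecutive loads (3) > same opcode (2) > fail (0).
    static int shallowScore(const Expr &A, const Expr &B) {
      if (A.K == Expr::Load && B.K == Expr::Load)
        return (A.Base == B.Base && A.Index + 1 == B.Index) ? 3 : 0;
      return A.K == B.K ? 2 : 0;
    }

    // One level of look-ahead: score the roots, then keep the better of the
    // two operand pairings (addition is commutative, so both are legal).
    static int lookAheadScore(const Expr &A, const Expr &B) {
      int Score = shallowScore(A, B);
      if (A.K == Expr::Add && B.K == Expr::Add) {
        int Straight = shallowScore(*A.Op0, *B.Op0) + shallowScore(*A.Op1, *B.Op1);
        int Swapped  = shallowScore(*A.Op0, *B.Op1) + shallowScore(*A.Op1, *B.Op0);
        Score += std::max(Straight, Swapped);
      }
      return Score;
    }

    int main() {
      // G1 = A[0] + B[0], G2 = A[1] + B[1], G3 = C[0] + D[0].
      Expr A0{Expr::Load, 0, 0}, A1{Expr::Load, 0, 1};
      Expr B0{Expr::Load, 1, 0}, B1{Expr::Load, 1, 1};
      Expr C0{Expr::Load, 2, 0}, D0{Expr::Load, 3, 0};
      Expr G1{Expr::Add, 0, 0, &A0, &B0};
      Expr G2{Expr::Add, 0, 0, &A1, &B1};
      Expr G3{Expr::Add, 0, 0, &C0, &D0};
      std::printf("score(G1,G2)=%d score(G1,G3)=%d\n",
                  lookAheadScore(G1, G2), lookAheadScore(G1, G3)); // 8 and 2
      return 0;
    }

This prints score(G1,G2)=8 and score(G1,G3)=2, the same preference the real heuristic encodes with deeper recursion and a richer score table.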
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 974eff9974d9..aabd974cd73e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -26,6 +26,7 @@ #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" @@ -72,6 +73,7 @@ #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -127,6 +129,10 @@ static cl::opt<int> MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits")); +static cl::opt<int> +MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden, + cl::desc("Maximum depth of the lookup for consecutive stores.")); + /// Limits the size of scheduling regions in a block. /// It avoid long compile times for _very_ large blocks where vector /// instructions are spread over a wide range. @@ -147,6 +153,20 @@ static cl::opt<unsigned> MinTreeSize( "slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable")); +// The maximum depth that the look-ahead score heuristic will explore. +// The higher this value, the higher the compilation time overhead. +static cl::opt<int> LookAheadMaxDepth( + "slp-max-look-ahead-depth", cl::init(2), cl::Hidden, + cl::desc("The maximum look-ahead depth for operand reordering scores")); + +// The Look-ahead heuristic goes through the users of the bundle to calculate +// the users cost in getExternalUsesCost(). To avoid compilation time increase +// we limit the number of users visited to this value. +static cl::opt<unsigned> LookAheadUsersBudget( + "slp-look-ahead-users-budget", cl::init(2), cl::Hidden, + cl::desc("The maximum number of users to visit while visiting the " + "predecessors. This prevents compilation time increase.")); + static cl::opt<bool> ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz")); @@ -547,7 +567,7 @@ public: /// Construct a vectorizable tree that starts at \p Roots, ignoring users for /// the purpose of scheduling and extraction in the \p UserIgnoreLst taking - /// into account (anf updating it, if required) list of externally used + /// into account (and updating it, if required) list of externally used /// values stored in \p ExternallyUsedValues. void buildTree(ArrayRef<Value *> Roots, ExtraValueToDebugLocsMap &ExternallyUsedValues, @@ -609,7 +629,10 @@ public: return MinVecRegSize; } - /// Check if ArrayType or StructType is isomorphic to some VectorType. + /// Check if homogeneous aggregate is isomorphic to some VectorType. + /// Accepts homogeneous multidimensional aggregate of scalars/vectors like + /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> }, + /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on. /// /// \returns number of elements in vector if isomorphism exists, 0 otherwise. 
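A worked reading of the relaxed canMapToVector rule, subject to the register-size checks in the implementation (the concrete sizes here are assumed for illustration):

    // { <2 x float>, <2 x float> }  -> homogeneous, flattens to 4 x float,
    //                                  so N = 4 if <4 x float> fits the
    //                                  min/max vector register bounds
    // { [4 x i16], [4 x i16] }      -> flattens to 8 x i16, N = 8
    // { i32, float }                -> heterogeneous struct, returns 0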
 unsigned canMapToVector(Type *T, const DataLayout &DL) const;
@@ -721,6 +744,7 @@ public:
   const DataLayout &DL;
   ScalarEvolution &SE;
+  const BoUpSLP &R;

   /// \returns the operand data at \p OpIdx and \p Lane.
   OperandData &getData(unsigned OpIdx, unsigned Lane) {
@@ -746,6 +770,227 @@ public:
     std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
   }

+  // The hard-coded scores listed here are not very important. When computing
+  // the scores of matching one sub-tree with another, we are basically
+  // counting the number of values that are matching. So even if all scores
+  // are set to 1, we would still get a decent matching result.
+  // However, sometimes we have to break ties. For example we may have to
+  // choose between matching loads vs matching opcodes. This is what these
+  // scores are helping us with: they provide the order of preference.
+
+  /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
+  static const int ScoreConsecutiveLoads = 3;
+  /// ExtractElementInst from same vector and consecutive indexes.
+  static const int ScoreConsecutiveExtracts = 3;
+  /// Constants.
+  static const int ScoreConstants = 2;
+  /// Instructions with the same opcode.
+  static const int ScoreSameOpcode = 2;
+  /// Instructions with alt opcodes (e.g., add + sub).
+  static const int ScoreAltOpcodes = 1;
+  /// Identical instructions (a.k.a. splat or broadcast).
+  static const int ScoreSplat = 1;
+  /// Matching with an undef is preferable to failing.
+  static const int ScoreUndef = 1;
+  /// Score for failing to find a decent match.
+  static const int ScoreFail = 0;
+  /// User external to the vectorized code.
+  static const int ExternalUseCost = 1;
+  /// The user is internal but in a different lane.
+  static const int UserInDiffLaneCost = ExternalUseCost;
+
+  /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
+  static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL,
+                             ScalarEvolution &SE) {
+    auto *LI1 = dyn_cast<LoadInst>(V1);
+    auto *LI2 = dyn_cast<LoadInst>(V2);
+    if (LI1 && LI2)
+      return isConsecutiveAccess(LI1, LI2, DL, SE)
+                 ? VLOperands::ScoreConsecutiveLoads
+                 : VLOperands::ScoreFail;
+
+    auto *C1 = dyn_cast<Constant>(V1);
+    auto *C2 = dyn_cast<Constant>(V2);
+    if (C1 && C2)
+      return VLOperands::ScoreConstants;
+
+    // Extracts from consecutive indexes of the same vector better score as
+    // the extracts could be optimized away.
+    auto *Ex1 = dyn_cast<ExtractElementInst>(V1);
+    auto *Ex2 = dyn_cast<ExtractElementInst>(V2);
+    if (Ex1 && Ex2 && Ex1->getVectorOperand() == Ex2->getVectorOperand() &&
+        cast<ConstantInt>(Ex1->getIndexOperand())->getZExtValue() + 1 ==
+            cast<ConstantInt>(Ex2->getIndexOperand())->getZExtValue()) {
+      return VLOperands::ScoreConsecutiveExtracts;
+    }
+
+    auto *I1 = dyn_cast<Instruction>(V1);
+    auto *I2 = dyn_cast<Instruction>(V2);
+    if (I1 && I2) {
+      if (I1 == I2)
+        return VLOperands::ScoreSplat;
+      InstructionsState S = getSameOpcode({I1, I2});
+      // Note: Only consider instructions with <= 2 operands to avoid
+      // complexity explosion.
+      if (S.getOpcode() && S.MainOp->getNumOperands() <= 2)
+        return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes
+                                : VLOperands::ScoreSameOpcode;
+    }
+
+    if (isa<UndefValue>(V2))
+      return VLOperands::ScoreUndef;
+
+    return VLOperands::ScoreFail;
+  }
+
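Concretely, the score table above plays out like this (a hypothetical bundle, not taken from the patch):

    // Given %a = load A[i] and %b = load A[i+1]:  score 3 (consecutive loads)
    // Given %x = add p, q  and %y = add r, s:     score 2 (same opcode)
    // Given %x = add p, q  and undef:             score 1 (undef beats fail)
    // Given %x = add p, q  and %a = load A[i]:    score 0 (fail)
    // One user of %a outside the tree would then subtract ExternalUseCost,
    // clamped at ScoreFail by the caller: max(0, 3 - 1) == 2.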
+  /// Holds the values and their lane that are taking part in the look-ahead
+  /// score calculation. This is used in the external uses cost calculation.
+  SmallDenseMap<Value *, int> InLookAheadValues;
+
+  /// \Returns the additional cost due to uses of \p LHS and \p RHS that are
+  /// either external to the vectorized code, or require shuffling.
+  int getExternalUsesCost(const std::pair<Value *, int> &LHS,
+                          const std::pair<Value *, int> &RHS) {
+    int Cost = 0;
+    SmallVector<std::pair<Value *, int>, 2> Values = {LHS, RHS};
+    for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) {
+      Value *V = Values[Idx].first;
+      // Calculate the absolute lane, using the minimum relative lane of LHS
+      // and RHS as base and Idx as the offset.
+      int Ln = std::min(LHS.second, RHS.second) + Idx;
+      assert(Ln >= 0 && "Bad lane calculation");
+      unsigned UsersBudget = LookAheadUsersBudget;
+      for (User *U : V->users()) {
+        if (const TreeEntry *UserTE = R.getTreeEntry(U)) {
+          // The user is in the VectorizableTree. Check if we need to insert.
+          auto It = llvm::find(UserTE->Scalars, U);
+          assert(It != UserTE->Scalars.end() && "U is in UserTE");
+          int UserLn = std::distance(UserTE->Scalars.begin(), It);
+          assert(UserLn >= 0 && "Bad lane");
+          if (UserLn != Ln)
+            Cost += UserInDiffLaneCost;
+        } else {
+          // Check if the user is in the look-ahead code.
+          auto It2 = InLookAheadValues.find(U);
+          if (It2 != InLookAheadValues.end()) {
+            // The user is in the look-ahead code. Check the lane.
+            if (It2->second != Ln)
+              Cost += UserInDiffLaneCost;
+          } else {
+            // The user is neither in SLP tree nor in the look-ahead code.
+            Cost += ExternalUseCost;
+          }
+        }
+        // Limit the number of visited uses to cap compilation time.
+        if (--UsersBudget == 0)
+          break;
+      }
+    }
+    return Cost;
+  }
+
+  /// Go through the operands of \p LHS and \p RHS recursively until \p
+  /// MaxLevel, and return the cumulative score. For example:
+  /// \verbatim
+  ///  A[0]  B[0]  A[1]  B[1]  C[0] D[0]  B[1] A[1]
+  ///     \ /       \ /        \ /        \ /
+  ///      +         +          +          +
+  ///      G1        G2         G3         G4
+  /// \endverbatim
+  /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
+  /// each level recursively, accumulating the score. It starts from matching
+  /// the additions at level 0, then moves on to the loads (level 1). The
+  /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
+  /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while
+  /// {A[0],C[0]} has a score of VLOperands::ScoreFail.
+  /// Please note that the order of the operands does not matter, as we
+  /// evaluate the score of all profitable combinations of operands. In
+  /// other words the score of G1 and G4 is the same as G1 and G2. This
+  /// heuristic is based on ideas described in:
+  ///   Look-ahead SLP: Auto-vectorization in the presence of commutative
+  ///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
+  ///   Luís F. W. Góes
+  int getScoreAtLevelRec(const std::pair<Value *, int> &LHS,
+                         const std::pair<Value *, int> &RHS, int CurrLevel,
+                         int MaxLevel) {
+
+    Value *V1 = LHS.first;
+    Value *V2 = RHS.first;
+    // Get the shallow score of V1 and V2.
+    int ShallowScoreAtThisLevel =
+        std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) -
+                                     getExternalUsesCost(LHS, RHS));
+    int Lane1 = LHS.second;
+    int Lane2 = RHS.second;
+
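    // Working the doc-comment example above with the default
    // slp-max-look-ahead-depth of 2 and no external users (assumed):
    //   getScoreAtLevelRec(G1, G2, /*CurrLevel=*/1, /*MaxLevel=*/2):
    //     level 1: the two adds match        -> ScoreSameOpcode       = 2
    //     level 2: {A[0], A[1]} consecutive  -> ScoreConsecutiveLoads = 3
    //              {B[0], B[1]} consecutive  -> ScoreConsecutiveLoads = 3
    //     total = 8
    //   getScoreAtLevelRec(G1, G3, 1, 2):
    //     level 1: adds match                -> 2
    //     level 2: {A[0], C[0]}, {B[0], D[0]} -> ScoreFail = 0 each
    //     total = 2, so G2 is preferred next to G1.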
+    // If reached MaxLevel,
+    // or if V1 and V2 are not instructions,
+    // or if they are SPLAT,
+    // or if they are not consecutive, early return the current cost.
+    auto *I1 = dyn_cast<Instruction>(V1);
+    auto *I2 = dyn_cast<Instruction>(V2);
+    if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
+        ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
+        (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel))
+      return ShallowScoreAtThisLevel;
+    assert(I1 && I2 && "Should have early exited.");
+
+    // Keep track of in-tree values for determining the external-use cost.
+    InLookAheadValues[V1] = Lane1;
+    InLookAheadValues[V2] = Lane2;
+
+    // Contains the I2 operand indexes that got matched with I1 operands.
+    SmallSet<unsigned, 4> Op2Used;
+
+    // Recursion towards the operands of I1 and I2. We are trying all possible
+    // operand pairs, and keeping track of the best score.
+    for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
+         OpIdx1 != NumOperands1; ++OpIdx1) {
+      // Try to pair op1I with the best operand of I2.
+      int MaxTmpScore = 0;
+      unsigned MaxOpIdx2 = 0;
+      bool FoundBest = false;
+      // If I2 is commutative try all combinations.
+      unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
+      unsigned ToIdx = isCommutative(I2)
+                           ? I2->getNumOperands()
+                           : std::min(I2->getNumOperands(), OpIdx1 + 1);
+      assert(FromIdx <= ToIdx && "Bad index");
+      for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
+        // Skip operands already paired with OpIdx1.
+        if (Op2Used.count(OpIdx2))
+          continue;
+        // Recursively calculate the cost at each level.
+        int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1},
+                                          {I2->getOperand(OpIdx2), Lane2},
+                                          CurrLevel + 1, MaxLevel);
+        // Look for the best score.
+        if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) {
+          MaxTmpScore = TmpScore;
+          MaxOpIdx2 = OpIdx2;
+          FoundBest = true;
+        }
+      }
+      if (FoundBest) {
+        // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
+        Op2Used.insert(MaxOpIdx2);
+        ShallowScoreAtThisLevel += MaxTmpScore;
+      }
+    }
+    return ShallowScoreAtThisLevel;
+  }
+
+  /// \Returns the look-ahead score, which tells us how much the sub-trees
+  /// rooted at \p LHS and \p RHS match, the more they match the higher the
+  /// score. This helps break ties in an informed way when we cannot decide on
+  /// the order of the operands by just considering the immediate
+  /// predecessors.
+  int getLookAheadScore(const std::pair<Value *, int> &LHS,
+                        const std::pair<Value *, int> &RHS) {
+    InLookAheadValues.clear();
+    return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth);
+  }
+
   // Search all operands in Ops[*][Lane] for the one that matches best
   // Ops[OpIdx][LastLane] and return its operand index.
   // If no good match can be found, return None.
@@ -763,9 +1008,6 @@ public:
     // The linearized opcode of the operand at OpIdx, Lane.
     bool OpIdxAPO = getData(OpIdx, Lane).APO;

-    const unsigned BestScore = 2;
-    const unsigned GoodScore = 1;
-
     // The best operand index and its score.
     // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
     // are using the score to differentiate between the two.
@@ -794,41 +1036,19 @@ public:
       // Look for an operand that matches the current mode.
       switch (RMode) {
       case ReorderingMode::Load:
-        if (isa<LoadInst>(Op)) {
-          // Figure out which is left and right, so that we can check for
-          // consecutive loads
-          bool LeftToRight = Lane > LastLane;
-          Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
-          Value *OpRight = (LeftToRight) ?
Op : OpLastLane; - if (isConsecutiveAccess(cast<LoadInst>(OpLeft), - cast<LoadInst>(OpRight), DL, SE)) - BestOp.Idx = Idx; - } - break; - case ReorderingMode::Opcode: - // We accept both Instructions and Undefs, but with different scores. - if ((isa<Instruction>(Op) && isa<Instruction>(OpLastLane) && - cast<Instruction>(Op)->getOpcode() == - cast<Instruction>(OpLastLane)->getOpcode()) || - (isa<UndefValue>(OpLastLane) && isa<Instruction>(Op)) || - isa<UndefValue>(Op)) { - // An instruction has a higher score than an undef. - unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore; - if (Score > BestOp.Score) { - BestOp.Idx = Idx; - BestOp.Score = Score; - } - } - break; case ReorderingMode::Constant: - if (isa<Constant>(Op)) { - unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore; - if (Score > BestOp.Score) { - BestOp.Idx = Idx; - BestOp.Score = Score; - } + case ReorderingMode::Opcode: { + bool LeftToRight = Lane > LastLane; + Value *OpLeft = (LeftToRight) ? OpLastLane : Op; + Value *OpRight = (LeftToRight) ? Op : OpLastLane; + unsigned Score = + getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane}); + if (Score > BestOp.Score) { + BestOp.Idx = Idx; + BestOp.Score = Score; } break; + } case ReorderingMode::Splat: if (Op == OpLastLane) BestOp.Idx = Idx; @@ -959,8 +1179,8 @@ public: public: /// Initialize with all the operands of the instruction vector \p RootVL. VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL, - ScalarEvolution &SE) - : DL(DL), SE(SE) { + ScalarEvolution &SE, const BoUpSLP &R) + : DL(DL), SE(SE), R(R) { // Append all the operands of RootVL. appendOperandsOfVL(RootVL); } @@ -1189,7 +1409,8 @@ private: SmallVectorImpl<Value *> &Left, SmallVectorImpl<Value *> &Right, const DataLayout &DL, - ScalarEvolution &SE); + ScalarEvolution &SE, + const BoUpSLP &R); struct TreeEntry { using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>; TreeEntry(VecTreeTy &Container) : Container(Container) {} @@ -1211,7 +1432,8 @@ private: Value *VectorizedValue = nullptr; /// Do we need to gather this sequence ? - bool NeedToGather = false; + enum EntryState { Vectorize, NeedToGather }; + EntryState State; /// Does this sequence require some shuffling? SmallVector<unsigned, 4> ReuseShuffleIndices; @@ -1353,15 +1575,30 @@ private: dbgs() << "Scalars: \n"; for (Value *V : Scalars) dbgs().indent(2) << *V << "\n"; - dbgs() << "NeedToGather: " << NeedToGather << "\n"; - dbgs() << "MainOp: " << *MainOp << "\n"; - dbgs() << "AltOp: " << *AltOp << "\n"; + dbgs() << "State: "; + switch (State) { + case Vectorize: + dbgs() << "Vectorize\n"; + break; + case NeedToGather: + dbgs() << "NeedToGather\n"; + break; + } + dbgs() << "MainOp: "; + if (MainOp) + dbgs() << *MainOp << "\n"; + else + dbgs() << "NULL\n"; + dbgs() << "AltOp: "; + if (AltOp) + dbgs() << *AltOp << "\n"; + else + dbgs() << "NULL\n"; dbgs() << "VectorizedValue: "; if (VectorizedValue) - dbgs() << *VectorizedValue; + dbgs() << *VectorizedValue << "\n"; else - dbgs() << "NULL"; - dbgs() << "\n"; + dbgs() << "NULL\n"; dbgs() << "ReuseShuffleIndices: "; if (ReuseShuffleIndices.empty()) dbgs() << "Emtpy"; @@ -1392,7 +1629,7 @@ private: TreeEntry *Last = VectorizableTree.back().get(); Last->Idx = VectorizableTree.size() - 1; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); - Last->NeedToGather = !Vectorized; + Last->State = Vectorized ? 
TreeEntry::Vectorize : TreeEntry::NeedToGather; Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end()); Last->ReorderIndices = ReorderIndices; @@ -1721,7 +1958,7 @@ private: return nullptr; } - bool isInSchedulingRegion(ScheduleData *SD) { + bool isInSchedulingRegion(ScheduleData *SD) const { return SD->SchedulingRegionID == SchedulingRegionID; } @@ -2063,7 +2300,7 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *) { - if (Entry->NeedToGather) + if (Entry->State == TreeEntry::NeedToGather) return "color=red"; return ""; } @@ -2115,7 +2352,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. - if (Entry->NeedToGather) + if (Entry->State == TreeEntry::NeedToGather) continue; // For each lane: @@ -2152,7 +2389,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U << ".\n"); - assert(!UseEntry->NeedToGather && "Bad state"); + assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state"); continue; } } @@ -2448,7 +2685,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0)); uint64_t Size = DL->getTypeAllocSize(ScalarTy); // Check that the sorted loads are consecutive. - if (Diff && Diff->getAPInt().getZExtValue() == (VL.size() - 1) * Size) { + if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) { if (CurrentOrder.empty()) { // Original loads are consecutive and does not require reordering. ++NumOpsWantToKeepOriginalOrder; @@ -2543,7 +2780,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Commutative predicate - collect + sort operands of the instructions // so that each side is more likely to have the same opcode. assert(P0 == SwapP0 && "Commutative Predicate mismatch"); - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); } else { // Collect operands - commute if it uses the swapped predicate. for (Value *V : VL) { @@ -2590,7 +2827,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // have the same opcode. if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) { ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); TE->setOperand(0, Left); TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); @@ -2637,9 +2874,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } // We don't combine GEPs with non-constant indexes. + Type *Ty1 = VL0->getOperand(1)->getType(); for (Value *V : VL) { auto Op = cast<Instruction>(V)->getOperand(1); - if (!isa<ConstantInt>(Op)) { + if (!isa<ConstantInt>(Op) || + (Op->getType() != Ty1 && + Op->getType()->getScalarSizeInBits() > + DL->getIndexSizeInBits( + V->getType()->getPointerAddressSpace()))) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); BS.cancelScheduling(VL, VL0); @@ -2665,24 +2907,74 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } case Instruction::Store: { // Check if the stores are consecutive or if we need to swizzle them. 
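The store handling below replaces the strict in-order scan with a sort-then-check scheme. A plain-integer sketch of that check (the function name and the adjacent-difference formulation are mine; the real code sorts with sortPtrAccesses and compares the first and last sorted pointers via SCEV):

    #include <algorithm>
    #include <cstdint>
    #include <numeric>
    #include <vector>

    // Returns true if the byte offsets can be permuted into a run of
    // consecutive ElemSize-byte accesses; Order receives that permutation.
    static bool sortedAndConsecutive(const std::vector<int64_t> &Offsets,
                                     int64_t ElemSize,
                                     std::vector<unsigned> &Order) {
      Order.resize(Offsets.size());
      std::iota(Order.begin(), Order.end(), 0u);
      std::sort(Order.begin(), Order.end(), [&](unsigned L, unsigned R) {
        return Offsets[L] < Offsets[R];
      });
      for (size_t I = 1; I < Order.size(); ++I)
        if (Offsets[Order[I]] - Offsets[Order[I - 1]] != ElemSize)
          return false;
      return true;
    }

Checking every adjacent pair, as here, is equivalent to the first-to-last distance test once sortability is established; the diff gets the same answer with a single SCEV subtraction.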
- for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) - if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { + llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType(); + // Make sure all stores in the bundle are simple - we can't vectorize + // atomic or volatile stores. + SmallVector<Value *, 4> PointerOps(VL.size()); + ValueList Operands(VL.size()); + auto POIter = PointerOps.begin(); + auto OIter = Operands.begin(); + for (Value *V : VL) { + auto *SI = cast<StoreInst>(V); + if (!SI->isSimple()) { BS.cancelScheduling(VL, VL0); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); + LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n"); return; } + *POIter = SI->getPointerOperand(); + *OIter = SI->getValueOperand(); + ++POIter; + ++OIter; + } - TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); + OrdersType CurrentOrder; + // Check the order of pointer operands. + if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) { + Value *Ptr0; + Value *PtrN; + if (CurrentOrder.empty()) { + Ptr0 = PointerOps.front(); + PtrN = PointerOps.back(); + } else { + Ptr0 = PointerOps[CurrentOrder.front()]; + PtrN = PointerOps[CurrentOrder.back()]; + } + const SCEV *Scev0 = SE->getSCEV(Ptr0); + const SCEV *ScevN = SE->getSCEV(PtrN); + const auto *Diff = + dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0)); + uint64_t Size = DL->getTypeAllocSize(ScalarTy); + // Check that the sorted pointer operands are consecutive. + if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) { + if (CurrentOrder.empty()) { + // Original stores are consecutive and does not require reordering. + ++NumOpsWantToKeepOriginalOrder; + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, + UserTreeIdx, ReuseShuffleIndicies); + TE->setOperandsInOrder(); + buildTree_rec(Operands, Depth + 1, {TE, 0}); + LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); + } else { + // Need to reorder. + auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; + ++(I->getSecond()); + TreeEntry *TE = + newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies, I->getFirst()); + TE->setOperandsInOrder(); + buildTree_rec(Operands, Depth + 1, {TE, 0}); + LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n"); + } + return; + } + } - ValueList Operands; - for (Value *V : VL) - Operands.push_back(cast<Instruction>(V)->getOperand(0)); - TE->setOperandsInOrder(); - buildTree_rec(Operands, Depth + 1, {TE, 0}); + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } case Instruction::Call: { @@ -2777,7 +3069,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Reorder operands if reordering would enable vectorization. 
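To see why operand reordering helps the commutative cases below (a minimal sketch with invented values):

    // Lanes:   a0 = x0 + (y0 * z0)
    //          a1 = (y1 * z1) + x1
    // As written, operand column 0 is {x0, y1*z1} and column 1 is
    // {y0*z0, x1}: mixed kinds in both columns.
    // Swapping lane 1's operands (legal, '+' is commutative) gives
    // column 0 = {x0, x1} and column 1 = {y0*z0, y1*z1}; the look-ahead
    // score prefers this because each column now bundles matching sub-trees.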
if (isa<BinaryOperator>(VL0)) { ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); TE->setOperand(0, Left); TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); @@ -2806,27 +3098,29 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { - unsigned N; - Type *EltTy; - auto *ST = dyn_cast<StructType>(T); - if (ST) { - N = ST->getNumElements(); - EltTy = *ST->element_begin(); - } else { - N = cast<ArrayType>(T)->getNumElements(); - EltTy = cast<ArrayType>(T)->getElementType(); + unsigned N = 1; + Type *EltTy = T; + + while (isa<CompositeType>(EltTy)) { + if (auto *ST = dyn_cast<StructType>(EltTy)) { + // Check that struct is homogeneous. + for (const auto *Ty : ST->elements()) + if (Ty != *ST->element_begin()) + return 0; + N *= ST->getNumElements(); + EltTy = *ST->element_begin(); + } else { + auto *SeqT = cast<SequentialType>(EltTy); + N *= SeqT->getNumElements(); + EltTy = SeqT->getElementType(); + } } + if (!isValidElementType(EltTy)) return 0; uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N)); if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T)) return 0; - if (ST) { - // Check that struct is homogeneous. - for (const auto *Ty : ST->elements()) - if (Ty != EltTy) - return 0; - } return N; } @@ -2927,7 +3221,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { ReuseShuffleCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy); } - if (E->NeedToGather) { + if (E->State == TreeEntry::NeedToGather) { if (allConstant(VL)) return 0; if (isSplat(VL)) { @@ -2995,7 +3289,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx); } } - if (!E->NeedToGather) { + if (E->State == TreeEntry::Vectorize) { int DeadCost = ReuseShuffleCost; if (!E->ReorderIndices.empty()) { // TODO: Merge this shuffle with the ReuseShuffleCost. @@ -3135,13 +3429,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { SmallVector<const Value *, 4> Operands(VL0->operand_values()); int ScalarEltCost = TTI->getArithmeticInstrCost( - E->getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); + E->getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, Op1VK, - Op2VK, Op1VP, Op2VP, Operands); + int VecCost = TTI->getArithmeticInstrCost( + E->getOpcode(), VecTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::GetElementPtr: { @@ -3162,7 +3456,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } case Instruction::Load: { // Cost of wide load - cost of scalar loads. - unsigned alignment = cast<LoadInst>(VL0)->getAlignment(); + MaybeAlign alignment(cast<LoadInst>(VL0)->getAlignment()); int ScalarEltCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0); if (NeedToShuffleReuses) { @@ -3180,15 +3474,22 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } case Instruction::Store: { // We know that we can merge the stores. Calculate the cost. - unsigned alignment = cast<StoreInst>(VL0)->getAlignment(); + bool IsReorder = !E->ReorderIndices.empty(); + auto *SI = + cast<StoreInst>(IsReorder ? 
VL[E->ReorderIndices.front()] : VL0); + MaybeAlign Alignment(SI->getAlignment()); int ScalarEltCost = - TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0); - if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; - } + TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0, VL0); + if (NeedToShuffleReuses) + ReuseShuffleCost = -(ReuseShuffleNumbers - VL.size()) * ScalarEltCost; int ScalarStCost = VecTy->getNumElements() * ScalarEltCost; - int VecStCost = - TTI->getMemoryOpCost(Instruction::Store, VecTy, alignment, 0, VL0); + int VecStCost = TTI->getMemoryOpCost(Instruction::Store, + VecTy, Alignment, 0, VL0); + if (IsReorder) { + // TODO: Merge this shuffle with the ReuseShuffleCost. + VecStCost += TTI->getShuffleCost( + TargetTransformInfo::SK_PermuteSingleSrc, VecTy); + } return ReuseShuffleCost + VecStCost - ScalarStCost; } case Instruction::Call: { @@ -3274,20 +3575,22 @@ bool BoUpSLP::isFullyVectorizableTinyTree() const { << VectorizableTree.size() << " is fully vectorizable .\n"); // We only handle trees of heights 1 and 2. - if (VectorizableTree.size() == 1 && !VectorizableTree[0]->NeedToGather) + if (VectorizableTree.size() == 1 && + VectorizableTree[0]->State == TreeEntry::Vectorize) return true; if (VectorizableTree.size() != 2) return false; // Handle splat and all-constants stores. - if (!VectorizableTree[0]->NeedToGather && + if (VectorizableTree[0]->State == TreeEntry::Vectorize && (allConstant(VectorizableTree[1]->Scalars) || isSplat(VectorizableTree[1]->Scalars))) return true; // Gathering cost would be too much for tiny trees. - if (VectorizableTree[0]->NeedToGather || VectorizableTree[1]->NeedToGather) + if (VectorizableTree[0]->State == TreeEntry::NeedToGather || + VectorizableTree[1]->State == TreeEntry::NeedToGather) return false; return true; @@ -3397,7 +3700,7 @@ int BoUpSLP::getSpillCost() const { continue; } - // Debug informations don't impact spill cost. + // Debug information does not impact spill cost. if ((isa<CallInst>(&*PrevInstIt) && !isa<DbgInfoIntrinsic>(&*PrevInstIt)) && &*PrevInstIt != PrevInst) @@ -3441,12 +3744,13 @@ int BoUpSLP::getTreeCost() { // their uses. Since such an approach results in fewer total entries, // existing heuristics based on tree size may yield different results. // - if (TE.NeedToGather && - std::any_of( - std::next(VectorizableTree.begin(), I + 1), VectorizableTree.end(), - [TE](const std::unique_ptr<TreeEntry> &EntryPtr) { - return EntryPtr->NeedToGather && EntryPtr->isSame(TE.Scalars); - })) + if (TE.State == TreeEntry::NeedToGather && + std::any_of(std::next(VectorizableTree.begin(), I + 1), + VectorizableTree.end(), + [TE](const std::unique_ptr<TreeEntry> &EntryPtr) { + return EntryPtr->State == TreeEntry::NeedToGather && + EntryPtr->isSame(TE.Scalars); + })) continue; int C = getEntryCost(&TE); @@ -3538,13 +3842,15 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const { // Perform operand reordering on the instructions in VL and return the reordered // operands in Left and Right. 
-void BoUpSLP::reorderInputsAccordingToOpcode( - ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left, - SmallVectorImpl<Value *> &Right, const DataLayout &DL, - ScalarEvolution &SE) { +void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, + SmallVectorImpl<Value *> &Left, + SmallVectorImpl<Value *> &Right, + const DataLayout &DL, + ScalarEvolution &SE, + const BoUpSLP &R) { if (VL.empty()) return; - VLOperands Ops(VL, DL, SE); + VLOperands Ops(VL, DL, SE, R); // Reorder the operands in place. Ops.reorder(); Left = Ops.getVL(0); @@ -3735,7 +4041,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); - if (E->NeedToGather) { + if (E->State == TreeEntry::NeedToGather) { setInsertPointAfterBundle(E); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { @@ -3790,7 +4096,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::ExtractElement: { - if (!E->NeedToGather) { + if (E->State == TreeEntry::Vectorize) { Value *V = E->getSingleOperand(0); if (!E->ReorderIndices.empty()) { OrdersType Mask; @@ -3823,7 +4129,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::ExtractValue: { - if (!E->NeedToGather) { + if (E->State == TreeEntry::Vectorize) { LoadInst *LI = cast<LoadInst>(E->getSingleOperand(0)); Builder.SetInsertPoint(LI); PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); @@ -4050,15 +4356,25 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::Store: { - StoreInst *SI = cast<StoreInst>(VL0); + bool IsReorder = !E->ReorderIndices.empty(); + auto *SI = cast<StoreInst>( + IsReorder ? E->Scalars[E->ReorderIndices.front()] : VL0); unsigned Alignment = SI->getAlignment(); unsigned AS = SI->getPointerAddressSpace(); setInsertPointAfterBundle(E); Value *VecValue = vectorizeTree(E->getOperand(0)); + if (IsReorder) { + OrdersType Mask; + inversePermutation(E->ReorderIndices, Mask); + VecValue = Builder.CreateShuffleVector( + VecValue, UndefValue::get(VecValue->getType()), E->ReorderIndices, + "reorder_shuffle"); + } Value *ScalarPtr = SI->getPointerOperand(); - Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS)); + Value *VecPtr = Builder.CreateBitCast( + ScalarPtr, VecValue->getType()->getPointerTo(AS)); StoreInst *ST = Builder.CreateStore(VecValue, VecPtr); // The pointer operand uses an in-tree scalar, so add the new BitCast to @@ -4088,7 +4404,22 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { std::vector<Value *> OpVecs; for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e; ++j) { - Value *OpVec = vectorizeTree(E->getOperand(j)); + ValueList &VL = E->getOperand(j); + // Need to cast all elements to the same type before vectorization to + // avoid crash. + Type *VL0Ty = VL0->getOperand(j)->getType(); + Type *Ty = llvm::all_of( + VL, [VL0Ty](Value *V) { return VL0Ty == V->getType(); }) + ? 
VL0Ty + : DL->getIndexType(cast<GetElementPtrInst>(VL0) + ->getPointerOperandType() + ->getScalarType()); + for (Value *&V : VL) { + auto *CI = cast<ConstantInt>(V); + V = ConstantExpr::getIntegerCast(CI, Ty, + CI->getValue().isSignBitSet()); + } + Value *OpVec = vectorizeTree(VL); OpVecs.push_back(OpVec); } @@ -4284,7 +4615,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { continue; TreeEntry *E = getTreeEntry(Scalar); assert(E && "Invalid scalar"); - assert(!E->NeedToGather && "Extracting from a gather list"); + assert(E->State == TreeEntry::Vectorize && "Extracting from a gather list"); Value *Vec = E->VectorizedValue; assert(Vec && "Can't find vectorizable value"); @@ -4357,7 +4688,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. - if (Entry->NeedToGather) + if (Entry->State == TreeEntry::NeedToGather) continue; assert(Entry->VectorizedValue && "Can't find vectorizable value"); @@ -5332,125 +5663,140 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, } bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, - unsigned VecRegSize) { - const unsigned ChainLen = Chain.size(); - LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen + unsigned Idx) { + LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size() << "\n"); const unsigned Sz = R.getVectorElementSize(Chain[0]); - const unsigned VF = VecRegSize / Sz; + const unsigned MinVF = R.getMinVecRegSize() / Sz; + unsigned VF = Chain.size(); - if (!isPowerOf2_32(Sz) || VF < 2) + if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) return false; - bool Changed = false; - // Look for profitable vectorizable trees at all offsets, starting at zero. - for (unsigned i = 0, e = ChainLen; i + VF <= e; ++i) { - - ArrayRef<Value *> Operands = Chain.slice(i, VF); - // Check that a previous iteration of this loop did not delete the Value. - if (llvm::any_of(Operands, [&R](Value *V) { - auto *I = dyn_cast<Instruction>(V); - return I && R.isDeleted(I); - })) - continue; - - LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i - << "\n"); - - R.buildTree(Operands); - if (R.isTreeTinyAndNotFullyVectorizable()) - continue; + LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx + << "\n"); - R.computeMinimumValueSizes(); + R.buildTree(Chain); + Optional<ArrayRef<unsigned>> Order = R.bestOrder(); + // TODO: Handle orders of size less than number of elements in the vector. + if (Order && Order->size() == Chain.size()) { + // TODO: reorder tree nodes without tree rebuilding. 
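The reorder-and-rebuild step above can be read as applying a permutation: Order[i] names which element of Chain must land in slot i. A minimal generic sketch of the semantics (matching how llvm::transform is used here):

    #include <vector>

    // Apply a pointer-order permutation to a store chain.
    template <typename T>
    std::vector<T> applyOrder(const std::vector<T> &Chain,
                              const std::vector<unsigned> &Order) {
      std::vector<T> Out;
      Out.reserve(Chain.size());
      for (unsigned Idx : Order)
        Out.push_back(Chain[Idx]);
      return Out;
    }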
+ SmallVector<Value *, 4> ReorderedOps(Chain.rbegin(), Chain.rend()); + llvm::transform(*Order, ReorderedOps.begin(), + [Chain](const unsigned Idx) { return Chain[Idx]; }); + R.buildTree(ReorderedOps); + } + if (R.isTreeTinyAndNotFullyVectorizable()) + return false; - int Cost = R.getTreeCost(); + R.computeMinimumValueSizes(); - LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF - << "\n"); - if (Cost < -SLPCostThreshold) { - LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); + int Cost = R.getTreeCost(); - using namespace ore; + LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); + if (Cost < -SLPCostThreshold) { + LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); - R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized", - cast<StoreInst>(Chain[i])) - << "Stores SLP vectorized with cost " << NV("Cost", Cost) - << " and with tree size " - << NV("TreeSize", R.getTreeSize())); + using namespace ore; - R.vectorizeTree(); + R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized", + cast<StoreInst>(Chain[0])) + << "Stores SLP vectorized with cost " << NV("Cost", Cost) + << " and with tree size " + << NV("TreeSize", R.getTreeSize())); - // Move to the next bundle. - i += VF - 1; - Changed = true; - } + R.vectorizeTree(); + return true; } - return Changed; + return false; } bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, BoUpSLP &R) { - SetVector<StoreInst *> Heads; - SmallDenseSet<StoreInst *> Tails; - SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain; - // We may run into multiple chains that merge into a single chain. We mark the // stores that we vectorized so that we don't visit the same store twice. BoUpSLP::ValueSet VectorizedStores; bool Changed = false; - auto &&FindConsecutiveAccess = - [this, &Stores, &Heads, &Tails, &ConsecutiveChain] (int K, int Idx) { - if (!isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE)) - return false; - - Tails.insert(Stores[Idx]); - Heads.insert(Stores[K]); - ConsecutiveChain[Stores[K]] = Stores[Idx]; - return true; - }; + int E = Stores.size(); + SmallBitVector Tails(E, false); + SmallVector<int, 16> ConsecutiveChain(E, E + 1); + int MaxIter = MaxStoreLookup.getValue(); + int IterCnt; + auto &&FindConsecutiveAccess = [this, &Stores, &Tails, &IterCnt, MaxIter, + &ConsecutiveChain](int K, int Idx) { + if (IterCnt >= MaxIter) + return true; + ++IterCnt; + if (!isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE)) + return false; + Tails.set(Idx); + ConsecutiveChain[K] = Idx; + return true; + }; // Do a quadratic search on all of the given stores in reverse order and find // all of the pairs of stores that follow each other. - int E = Stores.size(); for (int Idx = E - 1; Idx >= 0; --Idx) { // If a store has multiple consecutive store candidates, search according // to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ... // This is because usually pairing with immediate succeeding or preceding // candidate create the best chance to find slp vectorization opportunity. 
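A sketch of the alternating probe order used below, Idx-1, Idx+1, Idx-2, Idx+2, and so on, with an overall probe budget per store (cf. the new slp-max-store-lookup option; findPartner and IsConsecutive are invented stand-ins, and the real code keeps scanning state differently):

    #include <algorithm>

    template <typename Pred>
    int findPartner(int Idx, int E, int MaxIter, Pred IsConsecutive) {
      int Probes = 0;
      for (int Offset = 1, F = std::max(E - Idx, Idx + 1); Offset < F; ++Offset) {
        if (Idx >= Offset && Probes++ < MaxIter &&
            IsConsecutive(Idx - Offset, Idx))
          return Idx - Offset;
        if (Idx + Offset < E && Probes++ < MaxIter &&
            IsConsecutive(Idx + Offset, Idx))
          return Idx + Offset;
      }
      return -1; // no partner found within the budget
    }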
- for (int Offset = 1, F = std::max(E - Idx, Idx + 1); Offset < F; ++Offset) + const int MaxLookDepth = std::max(E - Idx, Idx + 1); + IterCnt = 0; + for (int Offset = 1, F = MaxLookDepth; Offset < F; ++Offset) if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) || (Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx))) break; } // For stores that start but don't end a link in the chain: - for (auto *SI : llvm::reverse(Heads)) { - if (Tails.count(SI)) + for (int Cnt = E; Cnt > 0; --Cnt) { + int I = Cnt - 1; + if (ConsecutiveChain[I] == E + 1 || Tails.test(I)) continue; - // We found a store instr that starts a chain. Now follow the chain and try // to vectorize it. BoUpSLP::ValueList Operands; - StoreInst *I = SI; // Collect the chain into a list. - while ((Tails.count(I) || Heads.count(I)) && !VectorizedStores.count(I)) { - Operands.push_back(I); + while (I != E + 1 && !VectorizedStores.count(Stores[I])) { + Operands.push_back(Stores[I]); // Move to the next value in the chain. I = ConsecutiveChain[I]; } + // If a vector register can't hold 1 element, we are done. + unsigned MaxVecRegSize = R.getMaxVecRegSize(); + unsigned EltSize = R.getVectorElementSize(Stores[0]); + if (MaxVecRegSize % EltSize != 0) + continue; + + unsigned MaxElts = MaxVecRegSize / EltSize; // FIXME: Is division-by-2 the correct step? Should we assert that the // register size is a power-of-2? - for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize(); - Size /= 2) { - if (vectorizeStoreChain(Operands, R, Size)) { - // Mark the vectorized stores so that we don't vectorize them again. - VectorizedStores.insert(Operands.begin(), Operands.end()); - Changed = true; - break; + unsigned StartIdx = 0; + for (unsigned Size = llvm::PowerOf2Ceil(MaxElts); Size >= 2; Size /= 2) { + for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) { + ArrayRef<Value *> Slice = makeArrayRef(Operands).slice(Cnt, Size); + if (!VectorizedStores.count(Slice.front()) && + !VectorizedStores.count(Slice.back()) && + vectorizeStoreChain(Slice, R, Cnt)) { + // Mark the vectorized stores so that we don't vectorize them again. + VectorizedStores.insert(Slice.begin(), Slice.end()); + Changed = true; + // If we vectorized initial block, no need to try to vectorize it + // again. + if (Cnt == StartIdx) + StartIdx += Size; + Cnt += Size; + continue; + } + ++Cnt; } + // Check if the whole array was vectorized already - exit. + if (StartIdx >= Operands.size()) + break; } } @@ -5835,38 +6181,36 @@ class HorizontalReduction { explicit operator bool() const { return Opcode; } - /// Get the index of the first operand. - unsigned getFirstOperandIndex() const { - assert(!!*this && "The opcode is not set."); + /// Return true if this operation is any kind of minimum or maximum. + bool isMinMax() const { switch (Kind) { + case RK_Arithmetic: + return false; case RK_Min: - case RK_UMin: case RK_Max: + case RK_UMin: case RK_UMax: - return 1; - case RK_Arithmetic: + return true; case RK_None: break; } - return 0; + llvm_unreachable("Reduction kind is not set"); + } + + /// Get the index of the first operand. + unsigned getFirstOperandIndex() const { + assert(!!*this && "The opcode is not set."); + // We allow calling this before 'Kind' is set, so handle that specially. + if (Kind == RK_None) + return 0; + return isMinMax() ? 1 : 0; } /// Total number of operands in the reduction operation. 
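The refactoring below hinges on one fact about the two reduction flavors (IR shown for a signed-min step; the value names are invented):

    //   %c = icmp slt i32 %a, %b
    //   %m = select i1 %c, i32 %a, i32 %b
    // A min/max step is the select: 3 operands, with the reduced values at
    // indexes 1..2, and both the select and its compare belong to the
    // reduction (hence two reduction-ops lists). A plain 'add' step has 2
    // operands at indexes 0..1 and a single list.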
unsigned getNumberOfOperands() const { assert(Kind != RK_None && !!*this && LHS && RHS && "Expected reduction operation."); - switch (Kind) { - case RK_Arithmetic: - return 2; - case RK_Min: - case RK_UMin: - case RK_Max: - case RK_UMax: - return 3; - case RK_None: - break; - } - llvm_unreachable("Reduction kind is not set"); + return isMinMax() ? 3 : 2; } /// Checks if the operation has the same parent as \p P. @@ -5875,79 +6219,46 @@ class HorizontalReduction { "Expected reduction operation."); if (!IsRedOp) return I->getParent() == P; - switch (Kind) { - case RK_Arithmetic: - // Arithmetic reduction operation must be used once only. - return I->getParent() == P; - case RK_Min: - case RK_UMin: - case RK_Max: - case RK_UMax: { + if (isMinMax()) { // SelectInst must be used twice while the condition op must have single // use only. auto *Cmp = cast<Instruction>(cast<SelectInst>(I)->getCondition()); return I->getParent() == P && Cmp && Cmp->getParent() == P; } - case RK_None: - break; - } - llvm_unreachable("Reduction kind is not set"); + // Arithmetic reduction operation must be used once only. + return I->getParent() == P; } + /// Expected number of uses for reduction operations/reduced values. bool hasRequiredNumberOfUses(Instruction *I, bool IsReductionOp) const { assert(Kind != RK_None && !!*this && LHS && RHS && "Expected reduction operation."); - switch (Kind) { - case RK_Arithmetic: - return I->hasOneUse(); - case RK_Min: - case RK_UMin: - case RK_Max: - case RK_UMax: + if (isMinMax()) return I->hasNUses(2) && (!IsReductionOp || cast<SelectInst>(I)->getCondition()->hasOneUse()); - case RK_None: - break; - } - llvm_unreachable("Reduction kind is not set"); + return I->hasOneUse(); } /// Initializes the list of reduction operations. void initReductionOps(ReductionOpsListType &ReductionOps) { assert(Kind != RK_None && !!*this && LHS && RHS && "Expected reduction operation."); - switch (Kind) { - case RK_Arithmetic: - ReductionOps.assign(1, ReductionOpsType()); - break; - case RK_Min: - case RK_UMin: - case RK_Max: - case RK_UMax: + if (isMinMax()) ReductionOps.assign(2, ReductionOpsType()); - break; - case RK_None: - llvm_unreachable("Reduction kind is not set"); - } + else + ReductionOps.assign(1, ReductionOpsType()); } + /// Add all reduction operations for the reduction instruction \p I. void addReductionOps(Instruction *I, ReductionOpsListType &ReductionOps) { assert(Kind != RK_None && !!*this && LHS && RHS && "Expected reduction operation."); - switch (Kind) { - case RK_Arithmetic: - ReductionOps[0].emplace_back(I); - break; - case RK_Min: - case RK_UMin: - case RK_Max: - case RK_UMax: + if (isMinMax()) { ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition()); ReductionOps[1].emplace_back(I); - break; - case RK_None: - llvm_unreachable("Reduction kind is not set"); + } else { + ReductionOps[0].emplace_back(I); } } @@ -5980,12 +6291,12 @@ class HorizontalReduction { /// Checks if two operation data are both a reduction op or both a reduced /// value. 
- bool operator==(const OperationData &OD) { + bool operator==(const OperationData &OD) const { assert(((Kind != OD.Kind) || ((!LHS == !OD.LHS) && (!RHS == !OD.RHS))) && "One of the comparing operations is incorrect."); return this == &OD || (Kind == OD.Kind && Opcode == OD.Opcode); } - bool operator!=(const OperationData &OD) { return !(*this == OD); } + bool operator!=(const OperationData &OD) const { return !(*this == OD); } void clear() { Opcode = 0; LHS = nullptr; @@ -6005,18 +6316,7 @@ class HorizontalReduction { Value *getLHS() const { return LHS; } Value *getRHS() const { return RHS; } Type *getConditionType() const { - switch (Kind) { - case RK_Arithmetic: - return nullptr; - case RK_Min: - case RK_Max: - case RK_UMin: - case RK_UMax: - return CmpInst::makeCmpResultType(LHS->getType()); - case RK_None: - break; - } - llvm_unreachable("Reduction kind is not set"); + return isMinMax() ? CmpInst::makeCmpResultType(LHS->getType()) : nullptr; } /// Creates reduction operation with the current opcode with the IR flags @@ -6400,6 +6700,18 @@ public: assert(Pair.first && "DebugLoc must be set."); ExternallyUsedValues[Pair.second].push_back(Pair.first); } + + // The compare instruction of a min/max is the insertion point for new + // instructions and may be replaced with a new compare instruction. + auto getCmpForMinMaxReduction = [](Instruction *RdxRootInst) { + assert(isa<SelectInst>(RdxRootInst) && + "Expected min/max reduction to have select root instruction"); + Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition(); + assert(isa<Instruction>(ScalarCond) && + "Expected min/max reduction to have compare condition"); + return cast<Instruction>(ScalarCond); + }; + // The reduction root is used as the insertion point for new instructions, // so set it as externally used to prevent it from being deleted. ExternallyUsedValues[ReductionRoot]; @@ -6455,8 +6767,14 @@ public: DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc(); Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues); - // Emit a reduction. - Builder.SetInsertPoint(cast<Instruction>(ReductionRoot)); + // Emit a reduction. For min/max, the root is a select, but the insertion + // point is the compare condition of that select. + Instruction *RdxRootInst = cast<Instruction>(ReductionRoot); + if (ReductionData.isMinMax()) + Builder.SetInsertPoint(getCmpForMinMaxReduction(RdxRootInst)); + else + Builder.SetInsertPoint(RdxRootInst); + Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI); if (VectorizedTree) { @@ -6492,8 +6810,20 @@ public: VectorizedTree = VectReductionData.createOp(Builder, "op.extra", I); } } - // Update users. + + // Update users. For a min/max reduction that ends with a compare and + // select, we also have to RAUW for the compare instruction feeding the + // reduction root. That's because the original compare may have extra uses + // besides the final select of the reduction. + if (ReductionData.isMinMax()) { + if (auto *VecSelect = dyn_cast<SelectInst>(VectorizedTree)) { + Instruction *ScalarCmp = + getCmpForMinMaxReduction(cast<Instruction>(ReductionRoot)); + ScalarCmp->replaceAllUsesWith(VecSelect->getCondition()); + } + } ReductionRoot->replaceAllUsesWith(VectorizedTree); + // Mark all scalar reduction ops for deletion, they are replaced by the // vector reductions. 
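emitReduction (unchanged by this diff) builds the usual log2(width) shuffle-and-op ladder that the code above feeds. A scalar model of that shape, assuming a power-of-two width (names invented):

    #include <vector>

    // Each round folds the upper half into the lower half until one
    // element remains; V.size() must be a power of two.
    static int reduceAdd(std::vector<int> V) {
      for (size_t Width = V.size() / 2; Width >= 1; Width /= 2)
        for (size_t I = 0; I < Width; ++I)
          V[I] += V[I + Width];
      return V[0];
    }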
V.eraseInstructions(IgnoreList); @@ -6619,45 +6949,54 @@ private: /// %rb = insertelement <4 x float> %ra, float %s1, i32 1 /// %rc = insertelement <4 x float> %rb, float %s2, i32 2 /// %rd = insertelement <4 x float> %rc, float %s3, i32 3 -/// starting from the last insertelement instruction. +/// starting from the last insertelement or insertvalue instruction. /// -/// Returns true if it matches -static bool findBuildVector(InsertElementInst *LastInsertElem, - TargetTransformInfo *TTI, - SmallVectorImpl<Value *> &BuildVectorOpds, - int &UserCost) { - UserCost = 0; - Value *V = nullptr; - do { - if (auto *CI = dyn_cast<ConstantInt>(LastInsertElem->getOperand(2))) { - UserCost += TTI->getVectorInstrCost(Instruction::InsertElement, - LastInsertElem->getType(), - CI->getZExtValue()); - } - BuildVectorOpds.push_back(LastInsertElem->getOperand(1)); - V = LastInsertElem->getOperand(0); - if (isa<UndefValue>(V)) - break; - LastInsertElem = dyn_cast<InsertElementInst>(V); - if (!LastInsertElem || !LastInsertElem->hasOneUse()) - return false; - } while (true); - std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end()); - return true; -} - -/// Like findBuildVector, but looks for construction of aggregate. +/// Also recognize aggregates like {<2 x float>, <2 x float>}, +/// {{float, float}, {float, float}}, [2 x {float, float}] and so on. +/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples. +/// +/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type. /// /// \return true if it matches. -static bool findBuildAggregate(InsertValueInst *IV, - SmallVectorImpl<Value *> &BuildVectorOpds) { +static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI, + SmallVectorImpl<Value *> &BuildVectorOpds, + int &UserCost) { + assert((isa<InsertElementInst>(LastInsertInst) || + isa<InsertValueInst>(LastInsertInst)) && + "Expected insertelement or insertvalue instruction!"); + UserCost = 0; do { - BuildVectorOpds.push_back(IV->getInsertedValueOperand()); - Value *V = IV->getAggregateOperand(); - if (isa<UndefValue>(V)) + Value *InsertedOperand; + if (auto *IE = dyn_cast<InsertElementInst>(LastInsertInst)) { + InsertedOperand = IE->getOperand(1); + LastInsertInst = IE->getOperand(0); + if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) { + UserCost += TTI->getVectorInstrCost(Instruction::InsertElement, + IE->getType(), CI->getZExtValue()); + } + } else { + auto *IV = cast<InsertValueInst>(LastInsertInst); + InsertedOperand = IV->getInsertedValueOperand(); + LastInsertInst = IV->getAggregateOperand(); + } + if (isa<InsertElementInst>(InsertedOperand) || + isa<InsertValueInst>(InsertedOperand)) { + int TmpUserCost; + SmallVector<Value *, 8> TmpBuildVectorOpds; + if (!findBuildAggregate(InsertedOperand, TTI, TmpBuildVectorOpds, + TmpUserCost)) + return false; + BuildVectorOpds.append(TmpBuildVectorOpds.rbegin(), + TmpBuildVectorOpds.rend()); + UserCost += TmpUserCost; + } else { + BuildVectorOpds.push_back(InsertedOperand); + } + if (isa<UndefValue>(LastInsertInst)) break; - IV = dyn_cast<InsertValueInst>(V); - if (!IV || !IV->hasOneUse()) + if ((!isa<InsertValueInst>(LastInsertInst) && + !isa<InsertElementInst>(LastInsertInst)) || + !LastInsertInst->hasOneUse()) return false; } while (true); std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end()); @@ -6825,25 +7164,26 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V, bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, BasicBlock *BB, BoUpSLP &R) { + int 
UserCost = 0; const DataLayout &DL = BB->getModule()->getDataLayout(); if (!R.canMapToVector(IVI->getType(), DL)) return false; SmallVector<Value *, 16> BuildVectorOpds; - if (!findBuildAggregate(IVI, BuildVectorOpds)) + if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, UserCost)) return false; LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n"); // Aggregate value is unlikely to be processed in vector register, we need to // extract scalars into scalar registers, so NeedExtraction is set true. - return tryToVectorizeList(BuildVectorOpds, R); + return tryToVectorizeList(BuildVectorOpds, R, UserCost); } bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, BasicBlock *BB, BoUpSLP &R) { int UserCost; SmallVector<Value *, 16> BuildVectorOpds; - if (!findBuildVector(IEI, TTI, BuildVectorOpds, UserCost) || + if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, UserCost) || (llvm::all_of(BuildVectorOpds, [](Value *V) { return isa<ExtractElementInst>(V); }) && isShuffle(BuildVectorOpds))) @@ -7118,14 +7458,7 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << it->second.size() << ".\n"); - // Process the stores in chunks of 16. - // TODO: The limit of 16 inhibits greater vectorization factors. - // For example, AVX2 supports v32i8. Increasing this limit, however, - // may cause a significant compile-time increase. - for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI += 16) { - unsigned Len = std::min<unsigned>(CE - CI, 16); - Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), R); - } + Changed |= vectorizeStores(it->second, R); } return Changed; } |
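The rewritten findBuildAggregate recurses into inserted sub-aggregates and reverses once at the end. A data-only model of that traversal (Node and flatten are invented stand-ins for the insertvalue/insertelement chain; the one-use and undef-base checks of the real function are elided):

    #include <vector>

    struct Node {
      const Node *Prev = nullptr;     // aggregate being inserted into
      const Node *Inserted = nullptr; // sub-aggregate, if not a scalar
      int Scalar = 0;                 // leaf payload when Inserted == nullptr
    };

    // Walk from the last insert backwards, recursing into sub-aggregates,
    // collecting scalars reversed and flipping to source order at the end.
    static bool flatten(const Node *Last, std::vector<int> &Out) {
      std::vector<int> Rev;
      for (; Last; Last = Last->Prev) {
        if (Last->Inserted) {
          std::vector<int> Sub;
          if (!flatten(Last->Inserted, Sub))
            return false;
          Rev.insert(Rev.end(), Sub.rbegin(), Sub.rend());
        } else {
          Rev.push_back(Last->Scalar);
        }
      }
      Out.assign(Rev.rbegin(), Rev.rend());
      return true;
    }

For a chain inserting 1, then a sub-aggregate {2, 3}, then 4, this yields {1, 2, 3, 4}, mirroring how the diff appends TmpBuildVectorOpds in reverse before the final std::reverse.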