author    Dimitry Andric <dim@FreeBSD.org>    2020-01-24 22:00:03 +0000
committer Dimitry Andric <dim@FreeBSD.org>    2020-01-24 22:00:03 +0000
commit    480093f4440d54b30b3025afeac24b48f2ba7a2e (patch)
tree      162e72994062888647caf0d875428db9445491a8 /contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
parent    489b1cf2ecf5b9b4a394857987014bfb09067726 (diff)
parent    706b4fc47bbc608932d3b491ae19a3b9cde9497b (diff)
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp  965
1 file changed, 649 insertions(+), 316 deletions(-)
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 974eff9974d9..aabd974cd73e 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -26,6 +26,7 @@
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
@@ -72,6 +73,7 @@
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
+#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
@@ -127,6 +129,10 @@ static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
cl::desc("Attempt to vectorize for this register size in bits"));
+static cl::opt<int>
+MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden,
+ cl::desc("Maximum depth of the lookup for consecutive stores."));
+
/// Limits the size of scheduling regions in a block.
/// It avoids long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
@@ -147,6 +153,20 @@ static cl::opt<unsigned> MinTreeSize(
"slp-min-tree-size", cl::init(3), cl::Hidden,
cl::desc("Only vectorize small trees if they are fully vectorizable"));
+// The maximum depth that the look-ahead score heuristic will explore.
+// The higher this value, the higher the compilation time overhead.
+static cl::opt<int> LookAheadMaxDepth(
+ "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
+ cl::desc("The maximum look-ahead depth for operand reordering scores"));
+
+// The look-ahead heuristic goes through the users of the bundle to calculate
+// the users' cost in getExternalUsesCost(). To avoid a compile-time increase,
+// we limit the number of users visited to this value.
+static cl::opt<unsigned> LookAheadUsersBudget(
+ "slp-look-ahead-users-budget", cl::init(2), cl::Hidden,
+ cl::desc("The maximum number of users to visit while visiting the "
+ "predecessors. This prevents compilation time increase."));
+
static cl::opt<bool>
ViewSLPTree("view-slp-tree", cl::Hidden,
cl::desc("Display the SLP trees with Graphviz"));
@@ -547,7 +567,7 @@ public:
/// Construct a vectorizable tree that starts at \p Roots, ignoring users for
/// the purpose of scheduling and extraction in the \p UserIgnoreLst taking
- /// into account (anf updating it, if required) list of externally used
+ /// into account (and updating it, if required) list of externally used
/// values stored in \p ExternallyUsedValues.
void buildTree(ArrayRef<Value *> Roots,
ExtraValueToDebugLocsMap &ExternallyUsedValues,
@@ -609,7 +629,10 @@ public:
return MinVecRegSize;
}
- /// Check if ArrayType or StructType is isomorphic to some VectorType.
+ /// Check if a homogeneous aggregate is isomorphic to some VectorType.
+ /// Accepts homogeneous multidimensional aggregates of scalars/vectors like
+ /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
+ /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
///
/// \returns number of elements in vector if isomorphism exists, 0 otherwise.
unsigned canMapToVector(Type *T, const DataLayout &DL) const;
@@ -721,6 +744,7 @@ public:
const DataLayout &DL;
ScalarEvolution &SE;
+ const BoUpSLP &R;
/// \returns the operand data at \p OpIdx and \p Lane.
OperandData &getData(unsigned OpIdx, unsigned Lane) {
@@ -746,6 +770,227 @@ public:
std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
}
+ // The hard-coded scores listed here are not very important. When computing
+ // the scores of matching one sub-tree with another, we are basically
+ // counting the number of values that are matching. So even if all scores
+ // are set to 1, we would still get a decent matching result.
+ // However, sometimes we have to break ties. For example we may have to
+ // choose between matching loads vs matching opcodes. This is what these
+ // scores are helping us with: they provide the order of preference.
+
+ /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
+ static const int ScoreConsecutiveLoads = 3;
+ /// ExtractElementInst from the same vector and consecutive indexes.
+ static const int ScoreConsecutiveExtracts = 3;
+ /// Constants.
+ static const int ScoreConstants = 2;
+ /// Instructions with the same opcode.
+ static const int ScoreSameOpcode = 2;
+ /// Instructions with alt opcodes (e.g., add + sub).
+ static const int ScoreAltOpcodes = 1;
+ /// Identical instructions (a.k.a. splat or broadcast).
+ static const int ScoreSplat = 1;
+ /// Matching with an undef is preferable to failing.
+ static const int ScoreUndef = 1;
+ /// Score for failing to find a decent match.
+ static const int ScoreFail = 0;
+ /// User external to the vectorized code.
+ static const int ExternalUseCost = 1;
+ /// The user is internal but in a different lane.
+ static const int UserInDiffLaneCost = ExternalUseCost;
+
+ /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
+ static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL,
+ ScalarEvolution &SE) {
+ auto *LI1 = dyn_cast<LoadInst>(V1);
+ auto *LI2 = dyn_cast<LoadInst>(V2);
+ if (LI1 && LI2)
+ return isConsecutiveAccess(LI1, LI2, DL, SE)
+ ? VLOperands::ScoreConsecutiveLoads
+ : VLOperands::ScoreFail;
+
+ auto *C1 = dyn_cast<Constant>(V1);
+ auto *C2 = dyn_cast<Constant>(V2);
+ if (C1 && C2)
+ return VLOperands::ScoreConstants;
+
+ // Extracts from consecutive indexes of the same vector get a better score,
+ // as the extracts could be optimized away.
+ auto *Ex1 = dyn_cast<ExtractElementInst>(V1);
+ auto *Ex2 = dyn_cast<ExtractElementInst>(V2);
+ if (Ex1 && Ex2 && Ex1->getVectorOperand() == Ex2->getVectorOperand() &&
+ cast<ConstantInt>(Ex1->getIndexOperand())->getZExtValue() + 1 ==
+ cast<ConstantInt>(Ex2->getIndexOperand())->getZExtValue()) {
+ return VLOperands::ScoreConsecutiveExtracts;
+ }
+
+ auto *I1 = dyn_cast<Instruction>(V1);
+ auto *I2 = dyn_cast<Instruction>(V2);
+ if (I1 && I2) {
+ if (I1 == I2)
+ return VLOperands::ScoreSplat;
+ InstructionsState S = getSameOpcode({I1, I2});
+ // Note: Only consider instructions with <= 2 operands to avoid
+ // complexity explosion.
+ if (S.getOpcode() && S.MainOp->getNumOperands() <= 2)
+ return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes
+ : VLOperands::ScoreSameOpcode;
+ }
+
+ if (isa<UndefValue>(V2))
+ return VLOperands::ScoreUndef;
+
+ return VLOperands::ScoreFail;
+ }
+
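[Editor's note: as a rough standalone illustration (plain C++, not the LLVM API), the constants above only establish a preference order; a greedy choice keeps whichever candidate pairing scores highest:

    #include <algorithm>
    #include <cstdio>
    #include <iterator>

    constexpr int ScoreConsecutiveLoads = 3; // mirrors the values above
    constexpr int ScoreSameOpcode = 2;
    constexpr int ScoreUndef = 1;

    int main() {
      // Scores for pairing one operand against three alternative candidates.
      const int Scores[] = {ScoreSameOpcode, ScoreConsecutiveLoads, ScoreUndef};
      const int *Best = std::max_element(std::begin(Scores), std::end(Scores));
      std::printf("best candidate: %td\n", Best - Scores); // prints 1: the loads
      return 0;
    }
]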
+ /// Holds the values and their lanes that are taking part in the look-ahead
+ /// score calculation. This is used in the external uses cost calculation.
+ SmallDenseMap<Value *, int> InLookAheadValues;
+
+ /// \returns the additional cost due to uses of \p LHS and \p RHS that are
+ /// either external to the vectorized code, or require shuffling.
+ int getExternalUsesCost(const std::pair<Value *, int> &LHS,
+ const std::pair<Value *, int> &RHS) {
+ int Cost = 0;
+ SmallVector<std::pair<Value *, int>, 2> Values = {LHS, RHS};
+ for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) {
+ Value *V = Values[Idx].first;
+ // Calculate the absolute lane, using the minimum relative lane of LHS
+ // and RHS as base and Idx as the offset.
+ int Ln = std::min(LHS.second, RHS.second) + Idx;
+ assert(Ln >= 0 && "Bad lane calculation");
+ unsigned UsersBudget = LookAheadUsersBudget;
+ for (User *U : V->users()) {
+ if (const TreeEntry *UserTE = R.getTreeEntry(U)) {
+ // The user is in the VectorizableTree. Check if we need to insert.
+ auto It = llvm::find(UserTE->Scalars, U);
+ assert(It != UserTE->Scalars.end() && "U is in UserTE");
+ int UserLn = std::distance(UserTE->Scalars.begin(), It);
+ assert(UserLn >= 0 && "Bad lane");
+ if (UserLn != Ln)
+ Cost += UserInDiffLaneCost;
+ } else {
+ // Check if the user is in the look-ahead code.
+ auto It2 = InLookAheadValues.find(U);
+ if (It2 != InLookAheadValues.end()) {
+ // The user is in the look-ahead code. Check the lane.
+ if (It2->second != Ln)
+ Cost += UserInDiffLaneCost;
+ } else {
+ // The user is neither in SLP tree nor in the look-ahead code.
+ Cost += ExternalUseCost;
+ }
+ }
+ // Limit the number of visited uses to cap compilation time.
+ if (--UsersBudget == 0)
+ break;
+ }
+ }
+ return Cost;
+ }
+
+ /// Go through the operands of \p LHS and \p RHS recursively until \p
+ /// MaxLevel, and return the cumulative score. For example:
+ /// \verbatim
+ /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
+ /// \ / \ / \ / \ /
+ /// + + + +
+ /// G1 G2 G3 G4
+ /// \endverbatim
+ /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
+ /// each level recursively, accumulating the score. It starts from matching
+ /// the additions at level 0, then moves on to the loads (level 1). The
+ /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
+ /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while
+ /// {A[0],C[0]} has a score of VLOperands::ScoreFail.
+ /// Please note that the order of the operands does not matter, as we
+ /// evaluate the score of all profitable combinations of operands. In
+ /// other words the score of G1 and G4 is the same as G1 and G2. This
+ /// heuristic is based on ideas described in:
+ /// Look-ahead SLP: Auto-vectorization in the presence of commutative
+ /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
+ /// Luís F. W. Góes
+ int getScoreAtLevelRec(const std::pair<Value *, int> &LHS,
+ const std::pair<Value *, int> &RHS, int CurrLevel,
+ int MaxLevel) {
+
+ Value *V1 = LHS.first;
+ Value *V2 = RHS.first;
+ // Get the shallow score of V1 and V2.
+ int ShallowScoreAtThisLevel =
+ std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) -
+ getExternalUsesCost(LHS, RHS));
+ int Lane1 = LHS.second;
+ int Lane2 = RHS.second;
+
+ // If reached MaxLevel,
+ // or if V1 and V2 are not instructions,
+ // or if they are SPLAT,
+ // or if they are not consecutive, early return the current cost.
+ auto *I1 = dyn_cast<Instruction>(V1);
+ auto *I2 = dyn_cast<Instruction>(V2);
+ if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
+ ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
+ (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel))
+ return ShallowScoreAtThisLevel;
+ assert(I1 && I2 && "Should have early exited.");
+
+ // Keep track of in-tree values for determining the external-use cost.
+ InLookAheadValues[V1] = Lane1;
+ InLookAheadValues[V2] = Lane2;
+
+ // Contains the I2 operand indexes that got matched with I1 operands.
+ SmallSet<unsigned, 4> Op2Used;
+
+ // Recursion towards the operands of I1 and I2. We are trying all possible
+ // operand pairs, and keeping track of the best score.
+ for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
+ OpIdx1 != NumOperands1; ++OpIdx1) {
+ // Try to pair operand OpIdx1 of I1 with the best operand of I2.
+ int MaxTmpScore = 0;
+ unsigned MaxOpIdx2 = 0;
+ bool FoundBest = false;
+ // If I2 is commutative try all combinations.
+ unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
+ unsigned ToIdx = isCommutative(I2)
+ ? I2->getNumOperands()
+ : std::min(I2->getNumOperands(), OpIdx1 + 1);
+ assert(FromIdx <= ToIdx && "Bad index");
+ for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
+ // Skip operands already paired with OpIdx1.
+ if (Op2Used.count(OpIdx2))
+ continue;
+ // Recursively calculate the cost at each level
+ int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1},
+ {I2->getOperand(OpIdx2), Lane2},
+ CurrLevel + 1, MaxLevel);
+ // Look for the best score.
+ if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) {
+ MaxTmpScore = TmpScore;
+ MaxOpIdx2 = OpIdx2;
+ FoundBest = true;
+ }
+ }
+ if (FoundBest) {
+ // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
+ Op2Used.insert(MaxOpIdx2);
+ ShallowScoreAtThisLevel += MaxTmpScore;
+ }
+ }
+ return ShallowScoreAtThisLevel;
+ }
+
+ /// \returns the look-ahead score, which tells us how much the sub-trees
+ /// rooted at \p LHS and \p RHS match; the more they match, the higher the
+ /// score. This helps break ties in an informed way when we cannot decide on
+ /// the order of the operands by just considering the immediate
+ /// predecessors.
+ int getLookAheadScore(const std::pair<Value *, int> &LHS,
+ const std::pair<Value *, int> &RHS) {
+ InLookAheadValues.clear();
+ return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth);
+ }
+
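[Editor's note: to make the recursion concrete, here is a condensed standalone model (toy node type and assumed semantics, not the LLVM classes) that reproduces the G1/G2-versus-G1/G3 example from the comment above; the real implementation also subtracts external-use costs, which this sketch omits:

    #include <cstdio>
    #include <vector>

    struct Node {
      char Op;                 // '+' for an add, 'L' for a load
      int Addr;                // toy "address" for loads
      std::vector<Node *> Ops; // operands
    };

    int shallowScore(const Node *A, const Node *B) {
      if (A->Op == 'L' && B->Op == 'L')
        return B->Addr == A->Addr + 1 ? 3 : 0; // consecutive loads score best
      return A->Op == B->Op ? 2 : 0;           // same opcode scores 2
    }

    int scoreAtLevel(const Node *A, const Node *B, int Level, int MaxLevel) {
      int Score = shallowScore(A, B);
      if (Level == MaxLevel || Score == 0)
        return Score;
      // Greedily pair each operand of A with its best-matching operand of B.
      std::vector<bool> Used(B->Ops.size(), false);
      for (const Node *OpA : A->Ops) {
        int BestScore = 0, BestIdx = -1;
        for (size_t J = 0; J < B->Ops.size(); ++J) {
          if (Used[J])
            continue;
          int S = scoreAtLevel(OpA, B->Ops[J], Level + 1, MaxLevel);
          if (S > BestScore) {
            BestScore = S;
            BestIdx = static_cast<int>(J);
          }
        }
        if (BestIdx >= 0) {
          Used[BestIdx] = true;
          Score += BestScore;
        }
      }
      return Score;
    }

    int main() {
      Node A0{'L', 0, {}}, A1{'L', 1, {}}, B0{'L', 10, {}}, B1{'L', 11, {}};
      Node C0{'L', 20, {}};
      Node G1{'+', 0, {&A0, &B0}}, G2{'+', 0, {&A1, &B1}}, G3{'+', 0, {&C0, &A1}};
      // G1/G2 pair consecutive loads on both operands, so they outscore G1/G3.
      std::printf("score(G1,G2)=%d score(G1,G3)=%d\n",
                  scoreAtLevel(&G1, &G2, 1, 2), scoreAtLevel(&G1, &G3, 1, 2));
      return 0;
    }
]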
// Search all operands in Ops[*][Lane] for the one that matches best
// Ops[OpIdx][LastLane] and return its operand index.
// If no good match can be found, return None.
@@ -763,9 +1008,6 @@ public:
// The linearized opcode of the operand at OpIdx, Lane.
bool OpIdxAPO = getData(OpIdx, Lane).APO;
- const unsigned BestScore = 2;
- const unsigned GoodScore = 1;
-
// The best operand index and its score.
// Sometimes we have more than one option (e.g., Opcode and Undefs), so we
// are using the score to differentiate between the two.
@@ -794,41 +1036,19 @@ public:
// Look for an operand that matches the current mode.
switch (RMode) {
case ReorderingMode::Load:
- if (isa<LoadInst>(Op)) {
- // Figure out which is left and right, so that we can check for
- // consecutive loads
- bool LeftToRight = Lane > LastLane;
- Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
- Value *OpRight = (LeftToRight) ? Op : OpLastLane;
- if (isConsecutiveAccess(cast<LoadInst>(OpLeft),
- cast<LoadInst>(OpRight), DL, SE))
- BestOp.Idx = Idx;
- }
- break;
- case ReorderingMode::Opcode:
- // We accept both Instructions and Undefs, but with different scores.
- if ((isa<Instruction>(Op) && isa<Instruction>(OpLastLane) &&
- cast<Instruction>(Op)->getOpcode() ==
- cast<Instruction>(OpLastLane)->getOpcode()) ||
- (isa<UndefValue>(OpLastLane) && isa<Instruction>(Op)) ||
- isa<UndefValue>(Op)) {
- // An instruction has a higher score than an undef.
- unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
- if (Score > BestOp.Score) {
- BestOp.Idx = Idx;
- BestOp.Score = Score;
- }
- }
- break;
case ReorderingMode::Constant:
- if (isa<Constant>(Op)) {
- unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
- if (Score > BestOp.Score) {
- BestOp.Idx = Idx;
- BestOp.Score = Score;
- }
+ case ReorderingMode::Opcode: {
+ bool LeftToRight = Lane > LastLane;
+ Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
+ Value *OpRight = (LeftToRight) ? Op : OpLastLane;
+ unsigned Score =
+ getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane});
+ if (Score > BestOp.Score) {
+ BestOp.Idx = Idx;
+ BestOp.Score = Score;
}
break;
+ }
case ReorderingMode::Splat:
if (Op == OpLastLane)
BestOp.Idx = Idx;
@@ -959,8 +1179,8 @@ public:
public:
/// Initialize with all the operands of the instruction vector \p RootVL.
VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL,
- ScalarEvolution &SE)
- : DL(DL), SE(SE) {
+ ScalarEvolution &SE, const BoUpSLP &R)
+ : DL(DL), SE(SE), R(R) {
// Append all the operands of RootVL.
appendOperandsOfVL(RootVL);
}
@@ -1189,7 +1409,8 @@ private:
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right,
const DataLayout &DL,
- ScalarEvolution &SE);
+ ScalarEvolution &SE,
+ const BoUpSLP &R);
struct TreeEntry {
using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
TreeEntry(VecTreeTy &Container) : Container(Container) {}
@@ -1211,7 +1432,8 @@ private:
Value *VectorizedValue = nullptr;
/// Do we need to gather this sequence ?
- bool NeedToGather = false;
+ enum EntryState { Vectorize, NeedToGather };
+ EntryState State;
/// Does this sequence require some shuffling?
SmallVector<unsigned, 4> ReuseShuffleIndices;
@@ -1353,15 +1575,30 @@ private:
dbgs() << "Scalars: \n";
for (Value *V : Scalars)
dbgs().indent(2) << *V << "\n";
- dbgs() << "NeedToGather: " << NeedToGather << "\n";
- dbgs() << "MainOp: " << *MainOp << "\n";
- dbgs() << "AltOp: " << *AltOp << "\n";
+ dbgs() << "State: ";
+ switch (State) {
+ case Vectorize:
+ dbgs() << "Vectorize\n";
+ break;
+ case NeedToGather:
+ dbgs() << "NeedToGather\n";
+ break;
+ }
+ dbgs() << "MainOp: ";
+ if (MainOp)
+ dbgs() << *MainOp << "\n";
+ else
+ dbgs() << "NULL\n";
+ dbgs() << "AltOp: ";
+ if (AltOp)
+ dbgs() << *AltOp << "\n";
+ else
+ dbgs() << "NULL\n";
dbgs() << "VectorizedValue: ";
if (VectorizedValue)
- dbgs() << *VectorizedValue;
+ dbgs() << *VectorizedValue << "\n";
else
- dbgs() << "NULL";
- dbgs() << "\n";
+ dbgs() << "NULL\n";
dbgs() << "ReuseShuffleIndices: ";
if (ReuseShuffleIndices.empty())
dbgs() << "Emtpy";
@@ -1392,7 +1629,7 @@ private:
TreeEntry *Last = VectorizableTree.back().get();
Last->Idx = VectorizableTree.size() - 1;
Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
- Last->NeedToGather = !Vectorized;
+ Last->State = Vectorized ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
ReuseShuffleIndices.end());
Last->ReorderIndices = ReorderIndices;
@@ -1721,7 +1958,7 @@ private:
return nullptr;
}
- bool isInSchedulingRegion(ScheduleData *SD) {
+ bool isInSchedulingRegion(ScheduleData *SD) const {
return SD->SchedulingRegionID == SchedulingRegionID;
}
@@ -2063,7 +2300,7 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
static std::string getNodeAttributes(const TreeEntry *Entry,
const BoUpSLP *) {
- if (Entry->NeedToGather)
+ if (Entry->State == TreeEntry::NeedToGather)
return "color=red";
return "";
}
@@ -2115,7 +2352,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
TreeEntry *Entry = TEPtr.get();
// No need to handle users of gathered values.
- if (Entry->NeedToGather)
+ if (Entry->State == TreeEntry::NeedToGather)
continue;
// For each lane:
@@ -2152,7 +2389,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
!InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
<< ".\n");
- assert(!UseEntry->NeedToGather && "Bad state");
+ assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
continue;
}
}
@@ -2448,7 +2685,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
uint64_t Size = DL->getTypeAllocSize(ScalarTy);
// Check that the sorted loads are consecutive.
- if (Diff && Diff->getAPInt().getZExtValue() == (VL.size() - 1) * Size) {
+ if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) {
if (CurrentOrder.empty()) {
// Original loads are consecutive and do not require reordering.
++NumOpsWantToKeepOriginalOrder;
@@ -2543,7 +2780,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Commutative predicate - collect + sort operands of the instructions
// so that each side is more likely to have the same opcode.
assert(P0 == SwapP0 && "Commutative Predicate mismatch");
- reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
+ reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
} else {
// Collect operands - commute if it uses the swapped predicate.
for (Value *V : VL) {
@@ -2590,7 +2827,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// have the same opcode.
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
ValueList Left, Right;
- reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
+ reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
TE->setOperand(0, Left);
TE->setOperand(1, Right);
buildTree_rec(Left, Depth + 1, {TE, 0});
@@ -2637,9 +2874,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
// We don't combine GEPs with non-constant indexes.
+ Type *Ty1 = VL0->getOperand(1)->getType();
for (Value *V : VL) {
auto Op = cast<Instruction>(V)->getOperand(1);
- if (!isa<ConstantInt>(Op)) {
+ if (!isa<ConstantInt>(Op) ||
+ (Op->getType() != Ty1 &&
+ Op->getType()->getScalarSizeInBits() >
+ DL->getIndexSizeInBits(
+ V->getType()->getPointerAddressSpace()))) {
LLVM_DEBUG(dbgs()
<< "SLP: not-vectorizable GEP (non-constant indexes).\n");
BS.cancelScheduling(VL, VL0);
@@ -2665,24 +2907,74 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
case Instruction::Store: {
// Check if the stores are consecutive or if we need to swizzle them.
- for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
- if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
+ llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
+ // Make sure all stores in the bundle are simple - we can't vectorize
+ // atomic or volatile stores.
+ SmallVector<Value *, 4> PointerOps(VL.size());
+ ValueList Operands(VL.size());
+ auto POIter = PointerOps.begin();
+ auto OIter = Operands.begin();
+ for (Value *V : VL) {
+ auto *SI = cast<StoreInst>(V);
+ if (!SI->isSimple()) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
+ LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
return;
}
+ *POIter = SI->getPointerOperand();
+ *OIter = SI->getValueOperand();
+ ++POIter;
+ ++OIter;
+ }
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
+ OrdersType CurrentOrder;
+ // Check the order of pointer operands.
+ if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
+ Value *Ptr0;
+ Value *PtrN;
+ if (CurrentOrder.empty()) {
+ Ptr0 = PointerOps.front();
+ PtrN = PointerOps.back();
+ } else {
+ Ptr0 = PointerOps[CurrentOrder.front()];
+ PtrN = PointerOps[CurrentOrder.back()];
+ }
+ const SCEV *Scev0 = SE->getSCEV(Ptr0);
+ const SCEV *ScevN = SE->getSCEV(PtrN);
+ const auto *Diff =
+ dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
+ uint64_t Size = DL->getTypeAllocSize(ScalarTy);
+ // Check that the sorted pointer operands are consecutive.
+ if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) {
+ if (CurrentOrder.empty()) {
+ // Original stores are consecutive and do not require reordering.
+ ++NumOpsWantToKeepOriginalOrder;
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
+ UserTreeIdx, ReuseShuffleIndicies);
+ TE->setOperandsInOrder();
+ buildTree_rec(Operands, Depth + 1, {TE, 0});
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
+ } else {
+ // Need to reorder.
+ auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
+ ++(I->getSecond());
+ TreeEntry *TE =
+ newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies, I->getFirst());
+ TE->setOperandsInOrder();
+ buildTree_rec(Operands, Depth + 1, {TE, 0});
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
+ }
+ return;
+ }
+ }
- ValueList Operands;
- for (Value *V : VL)
- Operands.push_back(cast<Instruction>(V)->getOperand(0));
- TE->setOperandsInOrder();
- buildTree_rec(Operands, Depth + 1, {TE, 0});
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
return;
}
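[Editor's note: the consecutiveness test mirrors the load case earlier in this function: once sortPtrAccesses has ordered the pointer operands, the chain is accepted when the first-to-last distance equals (VL.size() - 1) * Size. A standalone sketch of the underlying address arithmetic (plain C++, no SCEV):

    #include <cassert>
    #include <cstdint>

    // Sorted, distinct addresses form a consecutive chain when each neighbor
    // differs by exactly Size, which is what the cheap first-to-last span
    // check above establishes for the sorted pointer operands.
    bool areConsecutive(const uint64_t *Addrs, unsigned N, uint64_t Size) {
      for (unsigned I = 1; I < N; ++I)
        if (Addrs[I] - Addrs[I - 1] != Size)
          return false;
      return true;
    }

    int main() {
      uint64_t A[] = {0x1000, 0x1004, 0x1008, 0x100c}; // four i32 stores
      assert(areConsecutive(A, 4, 4));
      uint64_t B[] = {0x1000, 0x1004, 0x100c, 0x1010}; // gap after the second
      assert(!areConsecutive(B, 4, 4));
      return 0;
    }
]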
case Instruction::Call: {
@@ -2777,7 +3069,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Reorder operands if reordering would enable vectorization.
if (isa<BinaryOperator>(VL0)) {
ValueList Left, Right;
- reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
+ reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
TE->setOperand(0, Left);
TE->setOperand(1, Right);
buildTree_rec(Left, Depth + 1, {TE, 0});
@@ -2806,27 +3098,29 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
- unsigned N;
- Type *EltTy;
- auto *ST = dyn_cast<StructType>(T);
- if (ST) {
- N = ST->getNumElements();
- EltTy = *ST->element_begin();
- } else {
- N = cast<ArrayType>(T)->getNumElements();
- EltTy = cast<ArrayType>(T)->getElementType();
+ unsigned N = 1;
+ Type *EltTy = T;
+
+ while (isa<CompositeType>(EltTy)) {
+ if (auto *ST = dyn_cast<StructType>(EltTy)) {
+ // Check that struct is homogeneous.
+ for (const auto *Ty : ST->elements())
+ if (Ty != *ST->element_begin())
+ return 0;
+ N *= ST->getNumElements();
+ EltTy = *ST->element_begin();
+ } else {
+ auto *SeqT = cast<SequentialType>(EltTy);
+ N *= SeqT->getNumElements();
+ EltTy = SeqT->getElementType();
+ }
}
+
if (!isValidElementType(EltTy))
return 0;
uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N));
if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T))
return 0;
- if (ST) {
- // Check that struct is homogeneous.
- for (const auto *Ty : ST->elements())
- if (Ty != EltTy)
- return 0;
- }
return N;
}
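[Editor's note: as a worked example of the rewritten loop, for T = {[4 x i16], [4 x i16]} the two levels multiply out to N = 2 * 4 = 8 with EltTy = i16, i.e. the aggregate maps to <8 x i16>. A standalone sketch with a toy type model (not llvm::Type):

    #include <cstdio>
    #include <vector>

    struct Ty {
      const char *Name;        // scalar name, e.g. "i16"; null for aggregates
      std::vector<Ty *> Elems; // aggregate members (arrays repeat one member)
    };

    // Returns the flattened element count, or 0 if a level is not homogeneous.
    unsigned flatten(Ty *T, const char *&EltName) {
      unsigned N = 1;
      while (!T->Elems.empty()) {
        for (Ty *E : T->Elems)
          if (E != T->Elems.front())
            return 0; // non-homogeneous aggregate
        N *= T->Elems.size();
        T = T->Elems.front();
      }
      EltName = T->Name;
      return N;
    }

    int main() {
      Ty I16{"i16", {}};
      Ty Arr4{nullptr, {&I16, &I16, &I16, &I16}}; // [4 x i16]
      Ty Pair{nullptr, {&Arr4, &Arr4}};           // {[4 x i16], [4 x i16]}
      const char *Elt = nullptr;
      unsigned N = flatten(&Pair, Elt);
      std::printf("maps to <%u x %s>\n", N, Elt); // maps to <8 x i16>
      return 0;
    }
]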
@@ -2927,7 +3221,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
ReuseShuffleCost =
TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
}
- if (E->NeedToGather) {
+ if (E->State == TreeEntry::NeedToGather) {
if (allConstant(VL))
return 0;
if (isSplat(VL)) {
@@ -2995,7 +3289,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
}
}
- if (!E->NeedToGather) {
+ if (E->State == TreeEntry::Vectorize) {
int DeadCost = ReuseShuffleCost;
if (!E->ReorderIndices.empty()) {
// TODO: Merge this shuffle with the ReuseShuffleCost.
@@ -3135,13 +3429,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
SmallVector<const Value *, 4> Operands(VL0->operand_values());
int ScalarEltCost = TTI->getArithmeticInstrCost(
- E->getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands);
+ E->getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
- int VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, Op1VK,
- Op2VK, Op1VP, Op2VP, Operands);
+ int VecCost = TTI->getArithmeticInstrCost(
+ E->getOpcode(), VecTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0);
return ReuseShuffleCost + VecCost - ScalarCost;
}
case Instruction::GetElementPtr: {
@@ -3162,7 +3456,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
}
case Instruction::Load: {
// Cost of wide load - cost of scalar loads.
- unsigned alignment = cast<LoadInst>(VL0)->getAlignment();
+ MaybeAlign alignment(cast<LoadInst>(VL0)->getAlignment());
int ScalarEltCost =
TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
if (NeedToShuffleReuses) {
@@ -3180,15 +3474,22 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
}
case Instruction::Store: {
// We know that we can merge the stores. Calculate the cost.
- unsigned alignment = cast<StoreInst>(VL0)->getAlignment();
+ bool IsReorder = !E->ReorderIndices.empty();
+ auto *SI =
+ cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
+ MaybeAlign Alignment(SI->getAlignment());
int ScalarEltCost =
- TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0);
- if (NeedToShuffleReuses) {
- ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
- }
+ TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0, VL0);
+ if (NeedToShuffleReuses)
+ ReuseShuffleCost = -(ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
int ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
- int VecStCost =
- TTI->getMemoryOpCost(Instruction::Store, VecTy, alignment, 0, VL0);
+ int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
+ VecTy, Alignment, 0, VL0);
+ if (IsReorder) {
+ // TODO: Merge this shuffle with the ReuseShuffleCost.
+ VecStCost += TTI->getShuffleCost(
+ TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+ }
return ReuseShuffleCost + VecStCost - ScalarStCost;
}
case Instruction::Call: {
@@ -3274,20 +3575,22 @@ bool BoUpSLP::isFullyVectorizableTinyTree() const {
<< VectorizableTree.size() << " is fully vectorizable .\n");
// We only handle trees of heights 1 and 2.
- if (VectorizableTree.size() == 1 && !VectorizableTree[0]->NeedToGather)
+ if (VectorizableTree.size() == 1 &&
+ VectorizableTree[0]->State == TreeEntry::Vectorize)
return true;
if (VectorizableTree.size() != 2)
return false;
// Handle splat and all-constants stores.
- if (!VectorizableTree[0]->NeedToGather &&
+ if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
(allConstant(VectorizableTree[1]->Scalars) ||
isSplat(VectorizableTree[1]->Scalars)))
return true;
// Gathering cost would be too much for tiny trees.
- if (VectorizableTree[0]->NeedToGather || VectorizableTree[1]->NeedToGather)
+ if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
+ VectorizableTree[1]->State == TreeEntry::NeedToGather)
return false;
return true;
@@ -3397,7 +3700,7 @@ int BoUpSLP::getSpillCost() const {
continue;
}
- // Debug informations don't impact spill cost.
+ // Debug information does not impact spill cost.
if ((isa<CallInst>(&*PrevInstIt) &&
!isa<DbgInfoIntrinsic>(&*PrevInstIt)) &&
&*PrevInstIt != PrevInst)
@@ -3441,12 +3744,13 @@ int BoUpSLP::getTreeCost() {
// their uses. Since such an approach results in fewer total entries,
// existing heuristics based on tree size may yield different results.
//
- if (TE.NeedToGather &&
- std::any_of(
- std::next(VectorizableTree.begin(), I + 1), VectorizableTree.end(),
- [TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
- return EntryPtr->NeedToGather && EntryPtr->isSame(TE.Scalars);
- }))
+ if (TE.State == TreeEntry::NeedToGather &&
+ std::any_of(std::next(VectorizableTree.begin(), I + 1),
+ VectorizableTree.end(),
+ [TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
+ return EntryPtr->State == TreeEntry::NeedToGather &&
+ EntryPtr->isSame(TE.Scalars);
+ }))
continue;
int C = getEntryCost(&TE);
@@ -3538,13 +3842,15 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
// Perform operand reordering on the instructions in VL and return the reordered
// operands in Left and Right.
-void BoUpSLP::reorderInputsAccordingToOpcode(
- ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left,
- SmallVectorImpl<Value *> &Right, const DataLayout &DL,
- ScalarEvolution &SE) {
+void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
+ SmallVectorImpl<Value *> &Left,
+ SmallVectorImpl<Value *> &Right,
+ const DataLayout &DL,
+ ScalarEvolution &SE,
+ const BoUpSLP &R) {
if (VL.empty())
return;
- VLOperands Ops(VL, DL, SE);
+ VLOperands Ops(VL, DL, SE, R);
// Reorder the operands in place.
Ops.reorder();
Left = Ops.getVL(0);
@@ -3735,7 +4041,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
- if (E->NeedToGather) {
+ if (E->State == TreeEntry::NeedToGather) {
setInsertPointAfterBundle(E);
auto *V = Gather(E->Scalars, VecTy);
if (NeedToShuffleReuses) {
@@ -3790,7 +4096,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
case Instruction::ExtractElement: {
- if (!E->NeedToGather) {
+ if (E->State == TreeEntry::Vectorize) {
Value *V = E->getSingleOperand(0);
if (!E->ReorderIndices.empty()) {
OrdersType Mask;
@@ -3823,7 +4129,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return V;
}
case Instruction::ExtractValue: {
- if (!E->NeedToGather) {
+ if (E->State == TreeEntry::Vectorize) {
LoadInst *LI = cast<LoadInst>(E->getSingleOperand(0));
Builder.SetInsertPoint(LI);
PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
@@ -4050,15 +4356,25 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return V;
}
case Instruction::Store: {
- StoreInst *SI = cast<StoreInst>(VL0);
+ bool IsReorder = !E->ReorderIndices.empty();
+ auto *SI = cast<StoreInst>(
+ IsReorder ? E->Scalars[E->ReorderIndices.front()] : VL0);
unsigned Alignment = SI->getAlignment();
unsigned AS = SI->getPointerAddressSpace();
setInsertPointAfterBundle(E);
Value *VecValue = vectorizeTree(E->getOperand(0));
+ if (IsReorder) {
+ OrdersType Mask;
+ inversePermutation(E->ReorderIndices, Mask);
+ VecValue = Builder.CreateShuffleVector(
+ VecValue, UndefValue::get(VecValue->getType()), E->ReorderIndices,
+ "reorder_shuffle");
+ }
Value *ScalarPtr = SI->getPointerOperand();
- Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS));
+ Value *VecPtr = Builder.CreateBitCast(
+ ScalarPtr, VecValue->getType()->getPointerTo(AS));
StoreInst *ST = Builder.CreateStore(VecValue, VecPtr);
// The pointer operand uses an in-tree scalar, so add the new BitCast to
@@ -4088,7 +4404,22 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
std::vector<Value *> OpVecs;
for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
++j) {
- Value *OpVec = vectorizeTree(E->getOperand(j));
+ ValueList &VL = E->getOperand(j);
+ // Need to cast all elements to the same type before vectorization to
+ // avoid crash.
+ Type *VL0Ty = VL0->getOperand(j)->getType();
+ Type *Ty = llvm::all_of(
+ VL, [VL0Ty](Value *V) { return VL0Ty == V->getType(); })
+ ? VL0Ty
+ : DL->getIndexType(cast<GetElementPtrInst>(VL0)
+ ->getPointerOperandType()
+ ->getScalarType());
+ for (Value *&V : VL) {
+ auto *CI = cast<ConstantInt>(V);
+ V = ConstantExpr::getIntegerCast(CI, Ty,
+ CI->getValue().isSignBitSet());
+ }
+ Value *OpVec = vectorizeTree(VL);
OpVecs.push_back(OpVec);
}
@@ -4284,7 +4615,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
continue;
TreeEntry *E = getTreeEntry(Scalar);
assert(E && "Invalid scalar");
- assert(!E->NeedToGather && "Extracting from a gather list");
+ assert(E->State == TreeEntry::Vectorize && "Extracting from a gather list");
Value *Vec = E->VectorizedValue;
assert(Vec && "Can't find vectorizable value");
@@ -4357,7 +4688,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
TreeEntry *Entry = TEPtr.get();
// No need to handle users of gathered values.
- if (Entry->NeedToGather)
+ if (Entry->State == TreeEntry::NeedToGather)
continue;
assert(Entry->VectorizedValue && "Can't find vectorizable value");
@@ -5332,125 +5663,140 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
}
bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
- unsigned VecRegSize) {
- const unsigned ChainLen = Chain.size();
- LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
+ unsigned Idx) {
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
<< "\n");
const unsigned Sz = R.getVectorElementSize(Chain[0]);
- const unsigned VF = VecRegSize / Sz;
+ const unsigned MinVF = R.getMinVecRegSize() / Sz;
+ unsigned VF = Chain.size();
- if (!isPowerOf2_32(Sz) || VF < 2)
+ if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF)
return false;
- bool Changed = false;
- // Look for profitable vectorizable trees at all offsets, starting at zero.
- for (unsigned i = 0, e = ChainLen; i + VF <= e; ++i) {
-
- ArrayRef<Value *> Operands = Chain.slice(i, VF);
- // Check that a previous iteration of this loop did not delete the Value.
- if (llvm::any_of(Operands, [&R](Value *V) {
- auto *I = dyn_cast<Instruction>(V);
- return I && R.isDeleted(I);
- }))
- continue;
-
- LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
- << "\n");
-
- R.buildTree(Operands);
- if (R.isTreeTinyAndNotFullyVectorizable())
- continue;
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
+ << "\n");
- R.computeMinimumValueSizes();
+ R.buildTree(Chain);
+ Optional<ArrayRef<unsigned>> Order = R.bestOrder();
+ // TODO: Handle orders of size less than number of elements in the vector.
+ if (Order && Order->size() == Chain.size()) {
+ // TODO: reorder tree nodes without tree rebuilding.
+ SmallVector<Value *, 4> ReorderedOps(Chain.rbegin(), Chain.rend());
+ llvm::transform(*Order, ReorderedOps.begin(),
+ [Chain](const unsigned Idx) { return Chain[Idx]; });
+ R.buildTree(ReorderedOps);
+ }
+ if (R.isTreeTinyAndNotFullyVectorizable())
+ return false;
- int Cost = R.getTreeCost();
+ R.computeMinimumValueSizes();
- LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF
- << "\n");
- if (Cost < -SLPCostThreshold) {
- LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
+ int Cost = R.getTreeCost();
- using namespace ore;
+ LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
+ if (Cost < -SLPCostThreshold) {
+ LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
- R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
- cast<StoreInst>(Chain[i]))
- << "Stores SLP vectorized with cost " << NV("Cost", Cost)
- << " and with tree size "
- << NV("TreeSize", R.getTreeSize()));
+ using namespace ore;
- R.vectorizeTree();
+ R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
+ cast<StoreInst>(Chain[0]))
+ << "Stores SLP vectorized with cost " << NV("Cost", Cost)
+ << " and with tree size "
+ << NV("TreeSize", R.getTreeSize()));
- // Move to the next bundle.
- i += VF - 1;
- Changed = true;
- }
+ R.vectorizeTree();
+ return true;
}
- return Changed;
+ return false;
}
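[Editor's note: the reordering step above is just a permutation applied through llvm::transform; a minimal standalone model (hypothetical Order values) of ReorderedOps[i] = Chain[Order[i]]:

    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<const char *> Chain = {"S2", "S0", "S3", "S1"}; // jumbled
      std::vector<unsigned> Order = {1, 3, 0, 2}; // pretend bestOrder() result
      std::vector<const char *> Reordered(Chain.size());
      for (size_t I = 0; I < Order.size(); ++I)
        Reordered[I] = Chain[Order[I]]; // ReorderedOps[i] = Chain[Order[i]]
      for (const char *S : Reordered)
        std::printf("%s ", S); // prints: S0 S1 S2 S3
      std::printf("\n");
      return 0;
    }
]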
bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
BoUpSLP &R) {
- SetVector<StoreInst *> Heads;
- SmallDenseSet<StoreInst *> Tails;
- SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
-
// We may run into multiple chains that merge into a single chain. We mark the
// stores that we vectorized so that we don't visit the same store twice.
BoUpSLP::ValueSet VectorizedStores;
bool Changed = false;
- auto &&FindConsecutiveAccess =
- [this, &Stores, &Heads, &Tails, &ConsecutiveChain] (int K, int Idx) {
- if (!isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE))
- return false;
-
- Tails.insert(Stores[Idx]);
- Heads.insert(Stores[K]);
- ConsecutiveChain[Stores[K]] = Stores[Idx];
- return true;
- };
+ int E = Stores.size();
+ SmallBitVector Tails(E, false);
+ SmallVector<int, 16> ConsecutiveChain(E, E + 1);
+ int MaxIter = MaxStoreLookup.getValue();
+ int IterCnt;
+ auto &&FindConsecutiveAccess = [this, &Stores, &Tails, &IterCnt, MaxIter,
+ &ConsecutiveChain](int K, int Idx) {
+ if (IterCnt >= MaxIter)
+ return true;
+ ++IterCnt;
+ if (!isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE))
+ return false;
+ Tails.set(Idx);
+ ConsecutiveChain[K] = Idx;
+ return true;
+ };
// Do a quadratic search on all of the given stores in reverse order and find
// all of the pairs of stores that follow each other.
- int E = Stores.size();
for (int Idx = E - 1; Idx >= 0; --Idx) {
// If a store has multiple consecutive store candidates, search according
// to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ...
// This is because pairing with the immediately succeeding or preceding
// candidate usually creates the best chance to find an SLP vectorization
// opportunity.
- for (int Offset = 1, F = std::max(E - Idx, Idx + 1); Offset < F; ++Offset)
+ const int MaxLookDepth = std::max(E - Idx, Idx + 1);
+ IterCnt = 0;
+ for (int Offset = 1, F = MaxLookDepth; Offset < F; ++Offset)
if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) ||
(Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx)))
break;
}
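[Editor's note: a standalone illustration (plain C++) of this alternating nearest-first probe order, with the slp-max-store-lookup budget cutting the scan short:

    #include <cstdio>

    int main() {
      const int E = 8, Idx = 4, MaxIter = 5; // MaxIter plays slp-max-store-lookup
      int IterCnt = 0;
      const int F = (E - Idx > Idx + 1) ? E - Idx : Idx + 1;
      for (int Offset = 1; Offset < F; ++Offset) {
        const int Candidates[2] = {Idx - Offset, Idx + Offset};
        for (int C : Candidates) {
          if (C < 0 || C >= E)
            continue;
          if (IterCnt++ >= MaxIter)
            return 0;                   // lookup budget exhausted
          std::printf("probe %d\n", C); // prints 3, 5, 2, 6, 1, then stops
        }
      }
      return 0;
    }
]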
// For stores that start but don't end a link in the chain:
- for (auto *SI : llvm::reverse(Heads)) {
- if (Tails.count(SI))
+ for (int Cnt = E; Cnt > 0; --Cnt) {
+ int I = Cnt - 1;
+ if (ConsecutiveChain[I] == E + 1 || Tails.test(I))
continue;
-
// We found a store instr that starts a chain. Now follow the chain and try
// to vectorize it.
BoUpSLP::ValueList Operands;
- StoreInst *I = SI;
// Collect the chain into a list.
- while ((Tails.count(I) || Heads.count(I)) && !VectorizedStores.count(I)) {
- Operands.push_back(I);
+ while (I != E + 1 && !VectorizedStores.count(Stores[I])) {
+ Operands.push_back(Stores[I]);
// Move to the next value in the chain.
I = ConsecutiveChain[I];
}
+ // If a vector register can't hold 1 element, we are done.
+ unsigned MaxVecRegSize = R.getMaxVecRegSize();
+ unsigned EltSize = R.getVectorElementSize(Stores[0]);
+ if (MaxVecRegSize % EltSize != 0)
+ continue;
+
+ unsigned MaxElts = MaxVecRegSize / EltSize;
// FIXME: Is division-by-2 the correct step? Should we assert that the
// register size is a power-of-2?
- for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize();
- Size /= 2) {
- if (vectorizeStoreChain(Operands, R, Size)) {
- // Mark the vectorized stores so that we don't vectorize them again.
- VectorizedStores.insert(Operands.begin(), Operands.end());
- Changed = true;
- break;
+ unsigned StartIdx = 0;
+ for (unsigned Size = llvm::PowerOf2Ceil(MaxElts); Size >= 2; Size /= 2) {
+ for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
+ ArrayRef<Value *> Slice = makeArrayRef(Operands).slice(Cnt, Size);
+ if (!VectorizedStores.count(Slice.front()) &&
+ !VectorizedStores.count(Slice.back()) &&
+ vectorizeStoreChain(Slice, R, Cnt)) {
+ // Mark the vectorized stores so that we don't vectorize them again.
+ VectorizedStores.insert(Slice.begin(), Slice.end());
+ Changed = true;
+ // If we vectorized the initial block, no need to try to vectorize it
+ // again.
+ if (Cnt == StartIdx)
+ StartIdx += Size;
+ Cnt += Size;
+ continue;
+ }
+ ++Cnt;
}
+ // Check if the whole array was vectorized already - exit.
+ if (StartIdx >= Operands.size())
+ break;
}
}
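[Editor's note: a standalone sketch (assumed semantics) of the new slicing strategy: start from the largest power-of-two element count that fits the register, halve on failure, and never retry a leading block that already vectorized:

    #include <cstdio>

    int main() {
      // e.g. 128-bit registers with i32 elements; 4 is already a power of two,
      // so PowerOf2Ceil(MaxElts) in the real code would return the same value.
      const unsigned ChainLen = 7, MaxElts = 4;
      unsigned StartIdx = 0;
      for (unsigned Size = MaxElts; Size >= 2; Size /= 2) {
        for (unsigned Cnt = StartIdx; Cnt + Size <= ChainLen;) {
          // Pretend every attempted slice vectorizes successfully.
          std::printf("vectorize stores [%u, %u)\n", Cnt, Cnt + Size);
          if (Cnt == StartIdx)
            StartIdx += Size; // never retry a fully vectorized leading block
          Cnt += Size;
        }
        if (StartIdx >= ChainLen)
          break; // the whole chain is covered
      }
      // Prints [0,4) then [4,6): a 7-store chain becomes one 4-wide and one
      // 2-wide vector store, with a single scalar store left over.
      return 0;
    }
]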
@@ -5835,38 +6181,36 @@ class HorizontalReduction {
explicit operator bool() const { return Opcode; }
- /// Get the index of the first operand.
- unsigned getFirstOperandIndex() const {
- assert(!!*this && "The opcode is not set.");
+ /// Return true if this operation is any kind of minimum or maximum.
+ bool isMinMax() const {
switch (Kind) {
+ case RK_Arithmetic:
+ return false;
case RK_Min:
- case RK_UMin:
case RK_Max:
+ case RK_UMin:
case RK_UMax:
- return 1;
- case RK_Arithmetic:
+ return true;
case RK_None:
break;
}
- return 0;
+ llvm_unreachable("Reduction kind is not set");
+ }
+
+ /// Get the index of the first operand.
+ unsigned getFirstOperandIndex() const {
+ assert(!!*this && "The opcode is not set.");
+ // We allow calling this before 'Kind' is set, so handle that specially.
+ if (Kind == RK_None)
+ return 0;
+ return isMinMax() ? 1 : 0;
}
/// Total number of operands in the reduction operation.
unsigned getNumberOfOperands() const {
assert(Kind != RK_None && !!*this && LHS && RHS &&
"Expected reduction operation.");
- switch (Kind) {
- case RK_Arithmetic:
- return 2;
- case RK_Min:
- case RK_UMin:
- case RK_Max:
- case RK_UMax:
- return 3;
- case RK_None:
- break;
- }
- llvm_unreachable("Reduction kind is not set");
+ return isMinMax() ? 3 : 2;
}
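[Editor's note: the 3-versus-2 operand count reflects the IR shape: a min/max reduction is a compare feeding a select (condition, true value, false value), while an arithmetic reduction op is a plain two-operand instruction. A C-level picture of the select idiom:

    #include <cstdio>

    // smax lowers to a compare plus a select; the select's three operands
    // are what getNumberOfOperands() counts for min/max reductions.
    int smax(int a, int b) {
      bool cond = a > b;   // the single-use compare condition
      return cond ? a : b; // the select: cond, true value, false value
    }

    int main() { std::printf("%d\n", smax(3, 7)); } // prints 7
]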
/// Checks if the operation has the same parent as \p P.
@@ -5875,79 +6219,46 @@ class HorizontalReduction {
"Expected reduction operation.");
if (!IsRedOp)
return I->getParent() == P;
- switch (Kind) {
- case RK_Arithmetic:
- // Arithmetic reduction operation must be used once only.
- return I->getParent() == P;
- case RK_Min:
- case RK_UMin:
- case RK_Max:
- case RK_UMax: {
+ if (isMinMax()) {
// SelectInst must be used twice while the condition op must have a single
// use only.
auto *Cmp = cast<Instruction>(cast<SelectInst>(I)->getCondition());
return I->getParent() == P && Cmp && Cmp->getParent() == P;
}
- case RK_None:
- break;
- }
- llvm_unreachable("Reduction kind is not set");
+ // Arithmetic reduction operation must be used once only.
+ return I->getParent() == P;
}
+
/// Expected number of uses for reduction operations/reduced values.
bool hasRequiredNumberOfUses(Instruction *I, bool IsReductionOp) const {
assert(Kind != RK_None && !!*this && LHS && RHS &&
"Expected reduction operation.");
- switch (Kind) {
- case RK_Arithmetic:
- return I->hasOneUse();
- case RK_Min:
- case RK_UMin:
- case RK_Max:
- case RK_UMax:
+ if (isMinMax())
return I->hasNUses(2) &&
(!IsReductionOp ||
cast<SelectInst>(I)->getCondition()->hasOneUse());
- case RK_None:
- break;
- }
- llvm_unreachable("Reduction kind is not set");
+ return I->hasOneUse();
}
/// Initializes the list of reduction operations.
void initReductionOps(ReductionOpsListType &ReductionOps) {
assert(Kind != RK_None && !!*this && LHS && RHS &&
"Expected reduction operation.");
- switch (Kind) {
- case RK_Arithmetic:
- ReductionOps.assign(1, ReductionOpsType());
- break;
- case RK_Min:
- case RK_UMin:
- case RK_Max:
- case RK_UMax:
+ if (isMinMax())
ReductionOps.assign(2, ReductionOpsType());
- break;
- case RK_None:
- llvm_unreachable("Reduction kind is not set");
- }
+ else
+ ReductionOps.assign(1, ReductionOpsType());
}
+
/// Add all reduction operations for the reduction instruction \p I.
void addReductionOps(Instruction *I, ReductionOpsListType &ReductionOps) {
assert(Kind != RK_None && !!*this && LHS && RHS &&
"Expected reduction operation.");
- switch (Kind) {
- case RK_Arithmetic:
- ReductionOps[0].emplace_back(I);
- break;
- case RK_Min:
- case RK_UMin:
- case RK_Max:
- case RK_UMax:
+ if (isMinMax()) {
ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
ReductionOps[1].emplace_back(I);
- break;
- case RK_None:
- llvm_unreachable("Reduction kind is not set");
+ } else {
+ ReductionOps[0].emplace_back(I);
}
}
@@ -5980,12 +6291,12 @@ class HorizontalReduction {
/// Checks if two operation data are both a reduction op or both a reduced
/// value.
- bool operator==(const OperationData &OD) {
+ bool operator==(const OperationData &OD) const {
assert(((Kind != OD.Kind) || ((!LHS == !OD.LHS) && (!RHS == !OD.RHS))) &&
"One of the comparing operations is incorrect.");
return this == &OD || (Kind == OD.Kind && Opcode == OD.Opcode);
}
- bool operator!=(const OperationData &OD) { return !(*this == OD); }
+ bool operator!=(const OperationData &OD) const { return !(*this == OD); }
void clear() {
Opcode = 0;
LHS = nullptr;
@@ -6005,18 +6316,7 @@ class HorizontalReduction {
Value *getLHS() const { return LHS; }
Value *getRHS() const { return RHS; }
Type *getConditionType() const {
- switch (Kind) {
- case RK_Arithmetic:
- return nullptr;
- case RK_Min:
- case RK_Max:
- case RK_UMin:
- case RK_UMax:
- return CmpInst::makeCmpResultType(LHS->getType());
- case RK_None:
- break;
- }
- llvm_unreachable("Reduction kind is not set");
+ return isMinMax() ? CmpInst::makeCmpResultType(LHS->getType()) : nullptr;
}
/// Creates reduction operation with the current opcode with the IR flags
@@ -6400,6 +6700,18 @@ public:
assert(Pair.first && "DebugLoc must be set.");
ExternallyUsedValues[Pair.second].push_back(Pair.first);
}
+
+ // The compare instruction of a min/max is the insertion point for new
+ // instructions and may be replaced with a new compare instruction.
+ auto getCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
+ assert(isa<SelectInst>(RdxRootInst) &&
+ "Expected min/max reduction to have select root instruction");
+ Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
+ assert(isa<Instruction>(ScalarCond) &&
+ "Expected min/max reduction to have compare condition");
+ return cast<Instruction>(ScalarCond);
+ };
+
// The reduction root is used as the insertion point for new instructions,
// so set it as externally used to prevent it from being deleted.
ExternallyUsedValues[ReductionRoot];
@@ -6455,8 +6767,14 @@ public:
DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues);
- // Emit a reduction.
- Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
+ // Emit a reduction. For min/max, the root is a select, but the insertion
+ // point is the compare condition of that select.
+ Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
+ if (ReductionData.isMinMax())
+ Builder.SetInsertPoint(getCmpForMinMaxReduction(RdxRootInst));
+ else
+ Builder.SetInsertPoint(RdxRootInst);
+
Value *ReducedSubTree =
emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
if (VectorizedTree) {
@@ -6492,8 +6810,20 @@ public:
VectorizedTree = VectReductionData.createOp(Builder, "op.extra", I);
}
}
- // Update users.
+
+ // Update users. For a min/max reduction that ends with a compare and
+ // select, we also have to RAUW for the compare instruction feeding the
+ // reduction root. That's because the original compare may have extra uses
+ // besides the final select of the reduction.
+ if (ReductionData.isMinMax()) {
+ if (auto *VecSelect = dyn_cast<SelectInst>(VectorizedTree)) {
+ Instruction *ScalarCmp =
+ getCmpForMinMaxReduction(cast<Instruction>(ReductionRoot));
+ ScalarCmp->replaceAllUsesWith(VecSelect->getCondition());
+ }
+ }
ReductionRoot->replaceAllUsesWith(VectorizedTree);
+
// Mark all scalar reduction ops for deletion, they are replaced by the
// vector reductions.
V.eraseInstructions(IgnoreList);
@@ -6619,45 +6949,54 @@ private:
/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
-/// starting from the last insertelement instruction.
+/// starting from the last insertelement or insertvalue instruction.
///
-/// Returns true if it matches
-static bool findBuildVector(InsertElementInst *LastInsertElem,
- TargetTransformInfo *TTI,
- SmallVectorImpl<Value *> &BuildVectorOpds,
- int &UserCost) {
- UserCost = 0;
- Value *V = nullptr;
- do {
- if (auto *CI = dyn_cast<ConstantInt>(LastInsertElem->getOperand(2))) {
- UserCost += TTI->getVectorInstrCost(Instruction::InsertElement,
- LastInsertElem->getType(),
- CI->getZExtValue());
- }
- BuildVectorOpds.push_back(LastInsertElem->getOperand(1));
- V = LastInsertElem->getOperand(0);
- if (isa<UndefValue>(V))
- break;
- LastInsertElem = dyn_cast<InsertElementInst>(V);
- if (!LastInsertElem || !LastInsertElem->hasOneUse())
- return false;
- } while (true);
- std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
- return true;
-}
-
-/// Like findBuildVector, but looks for construction of aggregate.
+/// Also recognize aggregates like {<2 x float>, <2 x float>},
+/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
+/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
+///
+/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
///
/// \return true if it matches.
-static bool findBuildAggregate(InsertValueInst *IV,
- SmallVectorImpl<Value *> &BuildVectorOpds) {
+static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI,
+ SmallVectorImpl<Value *> &BuildVectorOpds,
+ int &UserCost) {
+ assert((isa<InsertElementInst>(LastInsertInst) ||
+ isa<InsertValueInst>(LastInsertInst)) &&
+ "Expected insertelement or insertvalue instruction!");
+ UserCost = 0;
do {
- BuildVectorOpds.push_back(IV->getInsertedValueOperand());
- Value *V = IV->getAggregateOperand();
- if (isa<UndefValue>(V))
+ Value *InsertedOperand;
+ if (auto *IE = dyn_cast<InsertElementInst>(LastInsertInst)) {
+ InsertedOperand = IE->getOperand(1);
+ LastInsertInst = IE->getOperand(0);
+ if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {
+ UserCost += TTI->getVectorInstrCost(Instruction::InsertElement,
+ IE->getType(), CI->getZExtValue());
+ }
+ } else {
+ auto *IV = cast<InsertValueInst>(LastInsertInst);
+ InsertedOperand = IV->getInsertedValueOperand();
+ LastInsertInst = IV->getAggregateOperand();
+ }
+ if (isa<InsertElementInst>(InsertedOperand) ||
+ isa<InsertValueInst>(InsertedOperand)) {
+ int TmpUserCost;
+ SmallVector<Value *, 8> TmpBuildVectorOpds;
+ if (!findBuildAggregate(InsertedOperand, TTI, TmpBuildVectorOpds,
+ TmpUserCost))
+ return false;
+ BuildVectorOpds.append(TmpBuildVectorOpds.rbegin(),
+ TmpBuildVectorOpds.rend());
+ UserCost += TmpUserCost;
+ } else {
+ BuildVectorOpds.push_back(InsertedOperand);
+ }
+ if (isa<UndefValue>(LastInsertInst))
break;
- IV = dyn_cast<InsertValueInst>(V);
- if (!IV || !IV->hasOneUse())
+ if ((!isa<InsertValueInst>(LastInsertInst) &&
+ !isa<InsertElementInst>(LastInsertInst)) ||
+ !LastInsertInst->hasOneUse())
return false;
} while (true);
std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
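[Editor's note: a standalone model (toy insert chain, not the LLVM instructions) of the walk above: follow operand 0 backwards from the last insert, collect each inserted value, then reverse into program order:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    struct Insert {
      Insert *Prev;    // the aggregate/vector being inserted into (operand 0)
      const char *Val; // the scalar being inserted
    };

    bool findBuildAggregate(Insert *Last, std::vector<const char *> &Opds) {
      do {
        Opds.push_back(Last->Val);
        if (!Last->Prev)   // reached the undef base of the chain
          break;
        Last = Last->Prev; // hasOneUse() checks elided in this sketch
      } while (true);
      std::reverse(Opds.begin(), Opds.end());
      return true;
    }

    int main() {
      //   %ra = insert undef, s0 ; %rb = insert %ra, s1 ; %rc = insert %rb, s2
      Insert Ra{nullptr, "s0"}, Rb{&Ra, "s1"}, Rc{&Rb, "s2"};
      std::vector<const char *> Opds;
      findBuildAggregate(&Rc, Opds);
      for (const char *V : Opds)
        std::printf("%s ", V); // prints: s0 s1 s2
      std::printf("\n");
      return 0;
    }
]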
@@ -6825,25 +7164,26 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
BasicBlock *BB, BoUpSLP &R) {
+ int UserCost = 0;
const DataLayout &DL = BB->getModule()->getDataLayout();
if (!R.canMapToVector(IVI->getType(), DL))
return false;
SmallVector<Value *, 16> BuildVectorOpds;
- if (!findBuildAggregate(IVI, BuildVectorOpds))
+ if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, UserCost))
return false;
LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
// An aggregate value is unlikely to be processed in a vector register; we need
// to extract scalars into scalar registers, so NeedExtraction is set true.
- return tryToVectorizeList(BuildVectorOpds, R);
+ return tryToVectorizeList(BuildVectorOpds, R, UserCost);
}
bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
BasicBlock *BB, BoUpSLP &R) {
int UserCost;
SmallVector<Value *, 16> BuildVectorOpds;
- if (!findBuildVector(IEI, TTI, BuildVectorOpds, UserCost) ||
+ if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, UserCost) ||
(llvm::all_of(BuildVectorOpds,
[](Value *V) { return isa<ExtractElementInst>(V); }) &&
isShuffle(BuildVectorOpds)))
@@ -7118,14 +7458,7 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
<< it->second.size() << ".\n");
- // Process the stores in chunks of 16.
- // TODO: The limit of 16 inhibits greater vectorization factors.
- // For example, AVX2 supports v32i8. Increasing this limit, however,
- // may cause a significant compile-time increase.
- for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI += 16) {
- unsigned Len = std::min<unsigned>(CE - CI, 16);
- Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), R);
- }
+ Changed |= vectorizeStores(it->second, R);
}
return Changed;
}