summaryrefslogtreecommitdiff
path: root/lib/Transforms/Vectorize/SLPVectorizer.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r--lib/Transforms/Vectorize/SLPVectorizer.cpp846
1 files changed, 576 insertions, 270 deletions
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 328f270029604..da3ac06ab464e 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -39,6 +39,7 @@
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
@@ -90,6 +91,10 @@ static cl::opt<unsigned> MinTreeSize(
"slp-min-tree-size", cl::init(3), cl::Hidden,
cl::desc("Only vectorize small trees if they are fully vectorizable"));
+static cl::opt<bool>
+ ViewSLPTree("view-slp-tree", cl::Hidden,
+ cl::desc("Display the SLP trees with Graphviz"));
+
// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;
@@ -212,14 +217,14 @@ static unsigned getSameOpcode(ArrayRef<Value *> VL) {
/// Flag set: NSW, NUW, exact, and all of fast-math.
static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
if (auto *VecOp = dyn_cast<Instruction>(I)) {
- if (auto *Intersection = dyn_cast<Instruction>(VL[0])) {
- // Intersection is initialized to the 0th scalar,
- // so start counting from index '1'.
+ if (auto *I0 = dyn_cast<Instruction>(VL[0])) {
+ // VecOVp is initialized to the 0th scalar, so start counting from index
+ // '1'.
+ VecOp->copyIRFlags(I0);
for (int i = 1, e = VL.size(); i < e; ++i) {
if (auto *Scalar = dyn_cast<Instruction>(VL[i]))
- Intersection->andIRFlags(Scalar);
+ VecOp->andIRFlags(Scalar);
}
- VecOp->copyIRFlags(Intersection);
}
}
}
@@ -304,6 +309,8 @@ public:
typedef SmallVector<Instruction *, 16> InstrList;
typedef SmallPtrSet<Value *, 16> ValueSet;
typedef SmallVector<StoreInst *, 8> StoreList;
+ typedef MapVector<Value *, SmallVector<Instruction *, 2>>
+ ExtraValueToDebugLocsMap;
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
@@ -330,6 +337,10 @@ public:
/// \brief Vectorize the tree that starts with the elements in \p VL.
/// Returns the vectorized root.
Value *vectorizeTree();
+ /// Vectorize the tree but with the list of externally used values \p
+ /// ExternallyUsedValues. Values in this MapVector can be replaced but the
+ /// generated extractvalue instructions.
+ Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues);
/// \returns the cost incurred by unwanted spills and fills, caused by
/// holding live values over call sites.
@@ -343,6 +354,13 @@ public:
/// the purpose of scheduling and extraction in the \p UserIgnoreLst.
void buildTree(ArrayRef<Value *> Roots,
ArrayRef<Value *> UserIgnoreLst = None);
+ /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
+ /// the purpose of scheduling and extraction in the \p UserIgnoreLst taking
+ /// into account (anf updating it, if required) list of externally used
+ /// values stored in \p ExternallyUsedValues.
+ void buildTree(ArrayRef<Value *> Roots,
+ ExtraValueToDebugLocsMap &ExternallyUsedValues,
+ ArrayRef<Value *> UserIgnoreLst = None);
/// Clear the internal data structures that are created by 'buildTree'.
void deleteTree() {
@@ -404,7 +422,7 @@ private:
int getEntryCost(TreeEntry *E);
/// This is the recursive part of buildTree.
- void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);
+ void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int);
/// \returns True if the ExtractElement/ExtractValue instructions in VL can
/// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
@@ -451,8 +469,9 @@ private:
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right);
struct TreeEntry {
- TreeEntry() : Scalars(), VectorizedValue(nullptr),
- NeedToGather(0) {}
+ TreeEntry(std::vector<TreeEntry> &Container)
+ : Scalars(), VectorizedValue(nullptr), NeedToGather(0),
+ Container(Container) {}
/// \returns true if the scalars in VL are equal to this entry.
bool isSame(ArrayRef<Value *> VL) const {
@@ -468,11 +487,24 @@ private:
/// Do we need to gather this sequence ?
bool NeedToGather;
+
+ /// Points back to the VectorizableTree.
+ ///
+ /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
+ /// to be a pointer and needs to be able to initialize the child iterator.
+ /// Thus we need a reference back to the container to translate the indices
+ /// to entries.
+ std::vector<TreeEntry> &Container;
+
+ /// The TreeEntry index containing the user of this entry. We can actually
+ /// have multiple users so the data structure is not truly a tree.
+ SmallVector<int, 1> UserTreeIndices;
};
/// Create a new VectorizableTree entry.
- TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
- VectorizableTree.emplace_back();
+ TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
+ int &UserTreeIdx) {
+ VectorizableTree.emplace_back(VectorizableTree);
int idx = VectorizableTree.size() - 1;
TreeEntry *Last = &VectorizableTree[idx];
Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
@@ -485,6 +517,10 @@ private:
} else {
MustGather.insert(VL.begin(), VL.end());
}
+
+ if (UserTreeIdx >= 0)
+ Last->UserTreeIndices.push_back(UserTreeIdx);
+ UserTreeIdx = idx;
return Last;
}
@@ -558,7 +594,9 @@ private:
SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions;
/// A list of values that need to extracted out of the tree.
- /// This list holds pairs of (Internal Scalar : External User).
+ /// This list holds pairs of (Internal Scalar : External User). External User
+ /// can be nullptr, it means that this Internal Scalar will be used later,
+ /// after vectorization.
UserList ExternalUses;
/// Values used only by @llvm.assume calls.
@@ -706,6 +744,8 @@ private:
return os;
}
#endif
+ friend struct GraphTraits<BoUpSLP *>;
+ friend struct DOTGraphTraits<BoUpSLP *>;
/// Contains all scheduling data for a basic block.
///
@@ -916,17 +956,98 @@ private:
/// original width.
MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
};
+} // end namespace slpvectorizer
+
+template <> struct GraphTraits<BoUpSLP *> {
+ typedef BoUpSLP::TreeEntry TreeEntry;
+
+ /// NodeRef has to be a pointer per the GraphWriter.
+ typedef TreeEntry *NodeRef;
+
+ /// \brief Add the VectorizableTree to the index iterator to be able to return
+ /// TreeEntry pointers.
+ struct ChildIteratorType
+ : public iterator_adaptor_base<ChildIteratorType,
+ SmallVector<int, 1>::iterator> {
+
+ std::vector<TreeEntry> &VectorizableTree;
+
+ ChildIteratorType(SmallVector<int, 1>::iterator W,
+ std::vector<TreeEntry> &VT)
+ : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
+
+ NodeRef operator*() { return &VectorizableTree[*I]; }
+ };
+
+ static NodeRef getEntryNode(BoUpSLP &R) { return &R.VectorizableTree[0]; }
+
+ static ChildIteratorType child_begin(NodeRef N) {
+ return {N->UserTreeIndices.begin(), N->Container};
+ }
+ static ChildIteratorType child_end(NodeRef N) {
+ return {N->UserTreeIndices.end(), N->Container};
+ }
+
+ /// For the node iterator we just need to turn the TreeEntry iterator into a
+ /// TreeEntry* iterator so that it dereferences to NodeRef.
+ typedef pointer_iterator<std::vector<TreeEntry>::iterator> nodes_iterator;
+
+ static nodes_iterator nodes_begin(BoUpSLP *R) {
+ return nodes_iterator(R->VectorizableTree.begin());
+ }
+ static nodes_iterator nodes_end(BoUpSLP *R) {
+ return nodes_iterator(R->VectorizableTree.end());
+ }
+
+ static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
+};
+
+template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
+ typedef BoUpSLP::TreeEntry TreeEntry;
+
+ DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
+
+ std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
+ std::string Str;
+ raw_string_ostream OS(Str);
+ if (isSplat(Entry->Scalars)) {
+ OS << "<splat> " << *Entry->Scalars[0];
+ return Str;
+ }
+ for (auto V : Entry->Scalars) {
+ OS << *V;
+ if (std::any_of(
+ R->ExternalUses.begin(), R->ExternalUses.end(),
+ [&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; }))
+ OS << " <extract>";
+ OS << "\n";
+ }
+ return Str;
+ }
+
+ static std::string getNodeAttributes(const TreeEntry *Entry,
+ const BoUpSLP *) {
+ if (Entry->NeedToGather)
+ return "color=red";
+ return "";
+ }
+};
} // end namespace llvm
-} // end namespace slpvectorizer
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
ArrayRef<Value *> UserIgnoreLst) {
+ ExtraValueToDebugLocsMap ExternallyUsedValues;
+ buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
+}
+void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
+ ExtraValueToDebugLocsMap &ExternallyUsedValues,
+ ArrayRef<Value *> UserIgnoreLst) {
deleteTree();
UserIgnoreList = UserIgnoreLst;
if (!allSameType(Roots))
return;
- buildTree_rec(Roots, 0);
+ buildTree_rec(Roots, 0, -1);
// Collect the values that we need to extract from the tree.
for (TreeEntry &EIdx : VectorizableTree) {
@@ -940,6 +1061,14 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
if (Entry->NeedToGather)
continue;
+ // Check if the scalar is externally used as an extra arg.
+ auto ExtI = ExternallyUsedValues.find(Scalar);
+ if (ExtI != ExternallyUsedValues.end()) {
+ DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " <<
+ Lane << " from " << *Scalar << ".\n");
+ ExternalUses.emplace_back(Scalar, nullptr, Lane);
+ continue;
+ }
for (User *U : Scalar->users()) {
DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
@@ -976,28 +1105,28 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
}
}
-
-void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
+void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
+ int UserTreeIdx) {
bool isAltShuffle = false;
assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
if (Depth == RecursionMaxDepth) {
DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
// Don't handle vectors.
if (VL[0]->getType()->isVectorTy()) {
DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
if (SI->getValueOperand()->getType()->isVectorTy()) {
DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
unsigned Opcode = getSameOpcode(VL);
@@ -1014,7 +1143,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
// If all of the operands are identical or constant we have a simple solution.
if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !Opcode) {
DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
@@ -1026,7 +1155,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (EphValues.count(VL[i])) {
DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
") is ephemeral.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
@@ -1039,10 +1168,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
if (E->Scalars[i] != VL[i]) {
DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
+ // Record the reuse of the tree node. FIXME, currently this is only used to
+ // properly draw the graph rather than for the actual vectorization.
+ E->UserTreeIndices.push_back(UserTreeIdx);
DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
return;
}
@@ -1052,7 +1184,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (ScalarToTreeEntry.count(VL[i])) {
DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
") is already in tree.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
@@ -1062,7 +1194,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
if (MustGather.count(VL[i])) {
DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
@@ -1076,7 +1208,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
// Don't go into unreachable blocks. They may contain instructions with
// dependency cycles which confuse the final scheduling.
DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
@@ -1085,7 +1217,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (unsigned j = i+1; j < e; ++j)
if (VL[i] == VL[j]) {
DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
@@ -1100,7 +1232,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
assert((!BS.getScheduleData(VL[0]) ||
!BS.getScheduleData(VL[0])->isPartOfBundle()) &&
"tryScheduleBundle should cancelScheduling on failure");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
@@ -1117,12 +1249,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (Term) {
DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
@@ -1132,7 +1264,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
PH->getIncomingBlock(i)));
- buildTree_rec(Operands, Depth + 1);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
@@ -1144,7 +1276,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
} else {
BS.cancelScheduling(VL);
}
- newTreeEntry(VL, Reuse);
+ newTreeEntry(VL, Reuse, UserTreeIdx);
return;
}
case Instruction::Load: {
@@ -1160,7 +1292,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (DL->getTypeSizeInBits(ScalarTy) !=
DL->getTypeAllocSizeInBits(ScalarTy)) {
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
return;
}
@@ -1171,7 +1303,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
LoadInst *L = cast<LoadInst>(VL[i]);
if (!L->isSimple()) {
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
return;
}
@@ -1193,7 +1325,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (Consecutive) {
++NumLoadsWantToKeepOrder;
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of loads.\n");
return;
}
@@ -1208,7 +1340,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
}
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
if (ReverseConsecutive) {
++NumLoadsWantToChangeOrder;
@@ -1235,12 +1367,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
if (Ty != SrcTy || !isValidElementType(Ty)) {
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
return;
}
}
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of casts.\n");
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
@@ -1249,7 +1381,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth+1);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
@@ -1263,13 +1395,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (Cmp->getPredicate() != P0 ||
Cmp->getOperand(0)->getType() != ComparedTy) {
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
return;
}
}
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of compares.\n");
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
@@ -1278,7 +1410,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth+1);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
@@ -1301,7 +1433,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
// Sort operands of the instructions so that each side is more likely to
@@ -1309,8 +1441,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
ValueList Left, Right;
reorderInputsAccordingToOpcode(VL, Left, Right);
- buildTree_rec(Left, Depth + 1);
- buildTree_rec(Right, Depth + 1);
+ buildTree_rec(Left, Depth + 1, UserTreeIdx);
+ buildTree_rec(Right, Depth + 1, UserTreeIdx);
return;
}
@@ -1320,7 +1452,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth+1);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
@@ -1330,7 +1462,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
@@ -1343,7 +1475,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (Ty0 != CurTy) {
DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
@@ -1355,12 +1487,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
DEBUG(
dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
for (unsigned i = 0, e = 2; i < e; ++i) {
ValueList Operands;
@@ -1368,7 +1500,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth + 1);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
@@ -1377,19 +1509,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
return;
}
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of stores.\n");
ValueList Operands;
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(0));
- buildTree_rec(Operands, Depth + 1);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
return;
}
case Instruction::Call: {
@@ -1400,7 +1532,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
if (!isTriviallyVectorizable(ID)) {
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
return;
}
@@ -1414,7 +1546,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
!CI->hasIdenticalOperandBundleSchema(*CI2)) {
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
<< "\n");
return;
@@ -1425,7 +1557,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
Value *A1J = CI2->getArgOperand(1);
if (A1I != A1J) {
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
<< " argument "<< A1I<<"!=" << A1J
<< "\n");
@@ -1438,14 +1570,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
CI->op_begin() + CI->getBundleOperandsEndIndex(),
CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="
<< *VL[i] << '\n');
return;
}
}
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
ValueList Operands;
// Prepare the operand vector.
@@ -1453,7 +1585,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
CallInst *CI2 = dyn_cast<CallInst>(j);
Operands.push_back(CI2->getArgOperand(i));
}
- buildTree_rec(Operands, Depth + 1);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
@@ -1462,19 +1594,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
// then do not vectorize this instruction.
if (!isAltShuffle) {
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
return;
}
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
// Reorder operands if reordering would enable vectorization.
if (isa<BinaryOperator>(VL0)) {
ValueList Left, Right;
reorderAltShuffleOperands(VL, Left, Right);
- buildTree_rec(Left, Depth + 1);
- buildTree_rec(Right, Depth + 1);
+ buildTree_rec(Left, Depth + 1, UserTreeIdx);
+ buildTree_rec(Right, Depth + 1, UserTreeIdx);
return;
}
@@ -1484,13 +1616,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth + 1);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
default:
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
return;
}
@@ -1570,6 +1702,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
ScalarTy = SI->getValueOperand()->getType();
+ else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
+ ScalarTy = CI->getOperand(0)->getType();
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
// If we have computed a smaller type for the expression, update VecTy so
@@ -1599,7 +1733,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
int DeadCost = 0;
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
Instruction *E = cast<Instruction>(VL[i]);
- if (E->hasOneUse())
+ // If all users are going to be vectorized, instruction can be
+ // considered as dead.
+ // The same, if have only one user, it will be vectorized for sure.
+ if (E->hasOneUse() ||
+ std::all_of(E->user_begin(), E->user_end(), [this](User *U) {
+ return ScalarToTreeEntry.count(U) > 0;
+ }))
// Take credit for instruction that will become dead.
DeadCost +=
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
@@ -1624,10 +1764,10 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
// Calculate the cost of this instruction.
int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
- VL0->getType(), SrcTy);
+ VL0->getType(), SrcTy, VL0);
VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
- int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
+ int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0);
return VecCost - ScalarCost;
}
case Instruction::FCmp:
@@ -1636,8 +1776,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
// Calculate the cost of this instruction.
VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
int ScalarCost = VecTy->getNumElements() *
- TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
- int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
+ TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty(), VL0);
+ int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy, VL0);
return VecCost - ScalarCost;
}
case Instruction::Add:
@@ -1720,18 +1860,18 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
// Cost of wide load - cost of scalar loads.
unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment();
int ScalarLdCost = VecTy->getNumElements() *
- TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0);
+ TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
- VecTy, alignment, 0);
+ VecTy, alignment, 0, VL0);
return VecLdCost - ScalarLdCost;
}
case Instruction::Store: {
// We know that we can merge the stores. Calculate the cost.
unsigned alignment = dyn_cast<StoreInst>(VL0)->getAlignment();
int ScalarStCost = VecTy->getNumElements() *
- TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0);
+ TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0);
int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
- VecTy, alignment, 0);
+ VecTy, alignment, 0, VL0);
return VecStCost - ScalarStCost;
}
case Instruction::Call: {
@@ -1739,12 +1879,9 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// Calculate the cost of the scalar and vector calls.
- SmallVector<Type*, 4> ScalarTys, VecTys;
- for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) {
+ SmallVector<Type*, 4> ScalarTys;
+ for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op)
ScalarTys.push_back(CI->getArgOperand(op)->getType());
- VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
- VecTy->getNumElements()));
- }
FastMathFlags FMF;
if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
@@ -1753,7 +1890,9 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
int ScalarCallCost = VecTy->getNumElements() *
TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
- int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys, FMF);
+ SmallVector<Value *, 4> Args(CI->arg_operands());
+ int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF,
+ VecTy->getNumElements());
DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
<< " (" << VecCallCost << "-" << ScalarCallCost << ")"
@@ -1947,9 +2086,18 @@ int BoUpSLP::getTreeCost() {
int SpillCost = getSpillCost();
Cost += SpillCost + ExtractCost;
- DEBUG(dbgs() << "SLP: Spill Cost = " << SpillCost << ".\n"
- << "SLP: Extract Cost = " << ExtractCost << ".\n"
- << "SLP: Total Cost = " << Cost << ".\n");
+ std::string Str;
+ {
+ raw_string_ostream OS(Str);
+ OS << "SLP: Spill Cost = " << SpillCost << ".\n"
+ << "SLP: Extract Cost = " << ExtractCost << ".\n"
+ << "SLP: Total Cost = " << Cost << ".\n";
+ }
+ DEBUG(dbgs() << Str);
+
+ if (ViewSLPTree)
+ ViewGraph(this, "SLP" + F->getName(), false, Str);
+
return Cost;
}
@@ -2702,6 +2850,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
Value *BoUpSLP::vectorizeTree() {
+ ExtraValueToDebugLocsMap ExternallyUsedValues;
+ return vectorizeTree(ExternallyUsedValues);
+}
+
+Value *
+BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
// All blocks must be scheduled before any instructions are inserted.
for (auto &BSIter : BlocksSchedules) {
@@ -2744,7 +2898,7 @@ Value *BoUpSLP::vectorizeTree() {
// Skip users that we already RAUW. This happens when one instruction
// has multiple uses of the same value.
- if (!is_contained(Scalar->users(), User))
+ if (User && !is_contained(Scalar->users(), User))
continue;
assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");
@@ -2756,6 +2910,28 @@ Value *BoUpSLP::vectorizeTree() {
assert(Vec && "Can't find vectorizable value");
Value *Lane = Builder.getInt32(ExternalUse.Lane);
+ // If User == nullptr, the Scalar is used as extra arg. Generate
+ // ExtractElement instruction and update the record for this scalar in
+ // ExternallyUsedValues.
+ if (!User) {
+ assert(ExternallyUsedValues.count(Scalar) &&
+ "Scalar with nullptr as an external user must be registered in "
+ "ExternallyUsedValues map");
+ if (auto *VecI = dyn_cast<Instruction>(Vec)) {
+ Builder.SetInsertPoint(VecI->getParent(),
+ std::next(VecI->getIterator()));
+ } else {
+ Builder.SetInsertPoint(&F->getEntryBlock().front());
+ }
+ Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ Ex = extend(ScalarRoot, Ex, Scalar->getType());
+ CSEBlocks.insert(cast<Instruction>(Scalar)->getParent());
+ auto &Locs = ExternallyUsedValues[Scalar];
+ ExternallyUsedValues.insert({Ex, Locs});
+ ExternallyUsedValues.erase(Scalar);
+ continue;
+ }
+
// Generate extracts for out-of-tree users.
// Find the insertion point for the extractelement lane.
if (auto *VecI = dyn_cast<Instruction>(Vec)) {
@@ -3264,7 +3440,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
// sorted by the original instruction location. This lets the final schedule
// be as close as possible to the original instruction order.
struct ScheduleDataCompare {
- bool operator()(ScheduleData *SD1, ScheduleData *SD2) {
+ bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
return SD2->SchedulingPriority < SD1->SchedulingPriority;
}
};
@@ -3645,9 +3821,9 @@ PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &A
bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
if (!Changed)
return PreservedAnalyses::all();
+
PreservedAnalyses PA;
- PA.preserve<LoopAnalysis>();
- PA.preserve<DominatorTreeAnalysis>();
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<AAManager>();
PA.preserve<GlobalsAA>();
return PA;
@@ -4026,36 +4202,40 @@ bool SLPVectorizerPass::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
if (!V)
return false;
+ Value *P = V->getParent();
+
+ // Vectorize in current basic block only.
+ auto *Op0 = dyn_cast<Instruction>(V->getOperand(0));
+ auto *Op1 = dyn_cast<Instruction>(V->getOperand(1));
+ if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
+ return false;
+
// Try to vectorize V.
- if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
+ if (tryToVectorizePair(Op0, Op1, R))
return true;
- BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
- BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
+ auto *A = dyn_cast<BinaryOperator>(Op0);
+ auto *B = dyn_cast<BinaryOperator>(Op1);
// Try to skip B.
if (B && B->hasOneUse()) {
- BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
- BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
- if (tryToVectorizePair(A, B0, R)) {
+ auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
+ auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
+ if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R))
return true;
- }
- if (tryToVectorizePair(A, B1, R)) {
+ if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R))
return true;
- }
}
// Try to skip A.
if (A && A->hasOneUse()) {
- BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
- BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
- if (tryToVectorizePair(A0, B, R)) {
+ auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
+ auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
+ if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R))
return true;
- }
- if (tryToVectorizePair(A1, B, R)) {
+ if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R))
return true;
- }
}
- return 0;
+ return false;
}
/// \brief Generate a shuffle mask to be used in a reduction tree.
@@ -4119,37 +4299,41 @@ namespace {
class HorizontalReduction {
SmallVector<Value *, 16> ReductionOps;
SmallVector<Value *, 32> ReducedVals;
+ // Use map vector to make stable output.
+ MapVector<Instruction *, Value *> ExtraArgs;
- BinaryOperator *ReductionRoot;
- // After successfull horizontal reduction vectorization attempt for PHI node
- // vectorizer tries to update root binary op by combining vectorized tree and
- // the ReductionPHI node. But during vectorization this ReductionPHI can be
- // vectorized itself and replaced by the undef value, while the instruction
- // itself is marked for deletion. This 'marked for deletion' PHI node then can
- // be used in new binary operation, causing "Use still stuck around after Def
- // is destroyed" crash upon PHI node deletion.
- WeakVH ReductionPHI;
+ BinaryOperator *ReductionRoot = nullptr;
/// The opcode of the reduction.
- unsigned ReductionOpcode;
+ Instruction::BinaryOps ReductionOpcode = Instruction::BinaryOpsEnd;
/// The opcode of the values we perform a reduction on.
- unsigned ReducedValueOpcode;
+ unsigned ReducedValueOpcode = 0;
/// Should we model this reduction as a pairwise reduction tree or a tree that
/// splits the vector in halves and adds those halves.
- bool IsPairwiseReduction;
+ bool IsPairwiseReduction = false;
+
+ /// Checks if the ParentStackElem.first should be marked as a reduction
+ /// operation with an extra argument or as extra argument itself.
+ void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem,
+ Value *ExtraArg) {
+ if (ExtraArgs.count(ParentStackElem.first)) {
+ ExtraArgs[ParentStackElem.first] = nullptr;
+ // We ran into something like:
+ // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg.
+ // The whole ParentStackElem.first should be considered as an extra value
+ // in this case.
+ // Do not perform analysis of remaining operands of ParentStackElem.first
+ // instruction, this whole instruction is an extra argument.
+ ParentStackElem.second = ParentStackElem.first->getNumOperands();
+ } else {
+ // We ran into something like:
+ // ParentStackElem.first += ... + ExtraArg + ...
+ ExtraArgs[ParentStackElem.first] = ExtraArg;
+ }
+ }
public:
- /// The width of one full horizontal reduction operation.
- unsigned ReduxWidth;
-
- /// Minimal width of available vector registers. It's used to determine
- /// ReduxWidth.
- unsigned MinVecRegSize;
-
- HorizontalReduction(unsigned MinVecRegSize)
- : ReductionRoot(nullptr), ReductionOpcode(0), ReducedValueOpcode(0),
- IsPairwiseReduction(false), ReduxWidth(0),
- MinVecRegSize(MinVecRegSize) {}
+ HorizontalReduction() = default;
/// \brief Try to find a reduction tree.
bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
@@ -4176,21 +4360,14 @@ public:
if (!isValidElementType(Ty))
return false;
- const DataLayout &DL = B->getModule()->getDataLayout();
ReductionOpcode = B->getOpcode();
ReducedValueOpcode = 0;
- // FIXME: Register size should be a parameter to this function, so we can
- // try different vectorization factors.
- ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
ReductionRoot = B;
- ReductionPHI = Phi;
-
- if (ReduxWidth < 4)
- return false;
// We currently only support adds.
- if (ReductionOpcode != Instruction::Add &&
- ReductionOpcode != Instruction::FAdd)
+ if ((ReductionOpcode != Instruction::Add &&
+ ReductionOpcode != Instruction::FAdd) ||
+ !B->isAssociative())
return false;
// Post order traverse the reduction tree starting at B. We only handle true
@@ -4202,30 +4379,26 @@ public:
unsigned EdgeToVist = Stack.back().second++;
bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
- // Only handle trees in the current basic block.
- if (TreeN->getParent() != B->getParent())
- return false;
-
- // Each tree node needs to have one user except for the ultimate
- // reduction.
- if (!TreeN->hasOneUse() && TreeN != B)
- return false;
-
// Postorder vist.
if (EdgeToVist == 2 || IsReducedValue) {
- if (IsReducedValue) {
- // Make sure that the opcodes of the operations that we are going to
- // reduce match.
- if (!ReducedValueOpcode)
- ReducedValueOpcode = TreeN->getOpcode();
- else if (ReducedValueOpcode != TreeN->getOpcode())
- return false;
+ if (IsReducedValue)
ReducedVals.push_back(TreeN);
- } else {
- // We need to be able to reassociate the adds.
- if (!TreeN->isAssociative())
- return false;
- ReductionOps.push_back(TreeN);
+ else {
+ auto I = ExtraArgs.find(TreeN);
+ if (I != ExtraArgs.end() && !I->second) {
+ // Check if TreeN is an extra argument of its parent operation.
+ if (Stack.size() <= 1) {
+ // TreeN can't be an extra argument as it is a root reduction
+ // operation.
+ return false;
+ }
+ // Yes, TreeN is an extra argument, do not add it to a list of
+ // reduction operations.
+ // Stack[Stack.size() - 2] always points to the parent operation.
+ markExtraArg(Stack[Stack.size() - 2], TreeN);
+ ExtraArgs.erase(TreeN);
+ } else
+ ReductionOps.push_back(TreeN);
}
// Retract.
Stack.pop_back();
@@ -4242,13 +4415,44 @@ public:
// reduced value class.
if (I && (!ReducedValueOpcode || I->getOpcode() == ReducedValueOpcode ||
I->getOpcode() == ReductionOpcode)) {
- if (!ReducedValueOpcode && I->getOpcode() != ReductionOpcode)
+ // Only handle trees in the current basic block.
+ if (I->getParent() != B->getParent()) {
+ // I is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), I);
+ continue;
+ }
+
+ // Each tree node needs to have one user except for the ultimate
+ // reduction.
+ if (!I->hasOneUse() && I != B) {
+ // I is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), I);
+ continue;
+ }
+
+ if (I->getOpcode() == ReductionOpcode) {
+ // We need to be able to reassociate the reduction operations.
+ if (!I->isAssociative()) {
+ // I is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), I);
+ continue;
+ }
+ } else if (ReducedValueOpcode &&
+ ReducedValueOpcode != I->getOpcode()) {
+ // Make sure that the opcodes of the operations that we are going to
+ // reduce match.
+ // I is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), I);
+ continue;
+ } else if (!ReducedValueOpcode)
ReducedValueOpcode = I->getOpcode();
+
Stack.push_back(std::make_pair(I, 0));
continue;
}
- return false;
}
+ // NextV is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), NextV);
}
return true;
}
@@ -4259,10 +4463,15 @@ public:
if (ReducedVals.empty())
return false;
+ // If there is a sufficient number of reduction values, reduce
+ // to a nearby power-of-2. Can safely generate oversized
+ // vectors and rely on the backend to split them to legal sizes.
unsigned NumReducedVals = ReducedVals.size();
- if (NumReducedVals < ReduxWidth)
+ if (NumReducedVals < 4)
return false;
+ unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
+
Value *VectorizedTree = nullptr;
IRBuilder<> Builder(ReductionRoot);
FastMathFlags Unsafe;
@@ -4270,20 +4479,26 @@ public:
Builder.setFastMathFlags(Unsafe);
unsigned i = 0;
- for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
+ BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
+ // The same extra argument may be used several time, so log each attempt
+ // to use it.
+ for (auto &Pair : ExtraArgs)
+ ExternallyUsedValues[Pair.second].push_back(Pair.first);
+ while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
- V.buildTree(VL, ReductionOps);
+ V.buildTree(VL, ExternallyUsedValues, ReductionOps);
if (V.shouldReorder()) {
SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend());
- V.buildTree(Reversed, ReductionOps);
+ V.buildTree(Reversed, ExternallyUsedValues, ReductionOps);
}
if (V.isTreeTinyAndNotFullyVectorizable())
- continue;
+ break;
V.computeMinimumValueSizes();
// Estimate cost.
- int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
+ int Cost =
+ V.getTreeCost() + getReductionCost(TTI, ReducedVals[i], ReduxWidth);
if (Cost >= -SLPCostThreshold)
break;
@@ -4292,33 +4507,44 @@ public:
// Vectorize a tree.
DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
- Value *VectorizedRoot = V.vectorizeTree();
+ Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues);
// Emit a reduction.
- Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
+ Value *ReducedSubTree =
+ emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps);
if (VectorizedTree) {
Builder.SetCurrentDebugLocation(Loc);
- VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
- ReducedSubTree, "bin.rdx");
+ VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
+ ReducedSubTree, "bin.rdx");
+ propagateIRFlags(VectorizedTree, ReductionOps);
} else
VectorizedTree = ReducedSubTree;
+ i += ReduxWidth;
+ ReduxWidth = PowerOf2Floor(NumReducedVals - i);
}
if (VectorizedTree) {
// Finish the reduction.
for (; i < NumReducedVals; ++i) {
- Builder.SetCurrentDebugLocation(
- cast<Instruction>(ReducedVals[i])->getDebugLoc());
- VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
- ReducedVals[i]);
+ auto *I = cast<Instruction>(ReducedVals[i]);
+ Builder.SetCurrentDebugLocation(I->getDebugLoc());
+ VectorizedTree =
+ Builder.CreateBinOp(ReductionOpcode, VectorizedTree, I);
+ propagateIRFlags(VectorizedTree, ReductionOps);
+ }
+ for (auto &Pair : ExternallyUsedValues) {
+ assert(!Pair.second.empty() &&
+ "At least one DebugLoc must be inserted");
+ // Add each externally used value to the final reduction.
+ for (auto *I : Pair.second) {
+ Builder.SetCurrentDebugLocation(I->getDebugLoc());
+ VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
+ Pair.first, "bin.extra");
+ propagateIRFlags(VectorizedTree, I);
+ }
}
// Update users.
- if (ReductionPHI && !isa<UndefValue>(ReductionPHI)) {
- assert(ReductionRoot && "Need a reduction operation");
- ReductionRoot->setOperand(0, VectorizedTree);
- ReductionRoot->setOperand(1, ReductionPHI);
- } else
- ReductionRoot->replaceAllUsesWith(VectorizedTree);
+ ReductionRoot->replaceAllUsesWith(VectorizedTree);
}
return VectorizedTree != nullptr;
}
@@ -4329,7 +4555,8 @@ public:
private:
/// \brief Calculate the cost of a reduction.
- int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
+ int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal,
+ unsigned ReduxWidth) {
Type *ScalarTy = FirstReducedVal->getType();
Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
@@ -4352,15 +4579,9 @@ private:
return VecReduxCost - ScalarReduxCost;
}
- static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
- Value *R, const Twine &Name = "") {
- if (Opcode == Instruction::FAdd)
- return Builder.CreateFAdd(L, R, Name);
- return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
- }
-
/// \brief Emit a horizontal reduction of the vectorized value.
- Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
+ Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
+ unsigned ReduxWidth, ArrayRef<Value *> RedOps) {
assert(VectorizedValue && "Need to have a vectorized tree node");
assert(isPowerOf2_32(ReduxWidth) &&
"We only handle power-of-two reductions for now");
@@ -4378,15 +4599,16 @@ private:
Value *RightShuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
"rdx.shuf.r");
- TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
- "bin.rdx");
+ TmpVec = Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf,
+ "bin.rdx");
} else {
Value *UpperHalf =
createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
Value *Shuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
- TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
+ TmpVec = Builder.CreateBinOp(ReductionOpcode, TmpVec, Shuf, "bin.rdx");
}
+ propagateIRFlags(TmpVec, RedOps);
}
// The result is in the first element of the vector.
@@ -4438,16 +4660,19 @@ static bool findBuildVector(InsertElementInst *FirstInsertElem,
static bool findBuildAggregate(InsertValueInst *IV,
SmallVectorImpl<Value *> &BuildVector,
SmallVectorImpl<Value *> &BuildVectorOpds) {
- if (!IV->hasOneUse())
- return false;
- Value *V = IV->getAggregateOperand();
- if (!isa<UndefValue>(V)) {
- InsertValueInst *I = dyn_cast<InsertValueInst>(V);
- if (!I || !findBuildAggregate(I, BuildVector, BuildVectorOpds))
+ Value *V;
+ do {
+ BuildVector.push_back(IV);
+ BuildVectorOpds.push_back(IV->getInsertedValueOperand());
+ V = IV->getAggregateOperand();
+ if (isa<UndefValue>(V))
+ break;
+ IV = dyn_cast<InsertValueInst>(V);
+ if (!IV || !IV->hasOneUse())
return false;
- }
- BuildVector.push_back(IV);
- BuildVectorOpds.push_back(IV->getInsertedValueOperand());
+ } while (true);
+ std::reverse(BuildVector.begin(), BuildVector.end());
+ std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
return true;
}
@@ -4507,29 +4732,137 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
return nullptr;
}
+namespace {
+/// Tracks instructons and its children.
+class WeakVHWithLevel final : public CallbackVH {
+ /// Operand index of the instruction currently beeing analized.
+ unsigned Level = 0;
+ /// Is this the instruction that should be vectorized, or are we now
+ /// processing children (i.e. operands of this instruction) for potential
+ /// vectorization?
+ bool IsInitial = true;
+
+public:
+ explicit WeakVHWithLevel() = default;
+ WeakVHWithLevel(Value *V) : CallbackVH(V){};
+ /// Restart children analysis each time it is repaced by the new instruction.
+ void allUsesReplacedWith(Value *New) override {
+ setValPtr(New);
+ Level = 0;
+ IsInitial = true;
+ }
+ /// Check if the instruction was not deleted during vectorization.
+ bool isValid() const { return !getValPtr(); }
+ /// Is the istruction itself must be vectorized?
+ bool isInitial() const { return IsInitial; }
+ /// Try to vectorize children.
+ void clearInitial() { IsInitial = false; }
+ /// Are all children processed already?
+ bool isFinal() const {
+ assert(getValPtr() &&
+ (isa<Instruction>(getValPtr()) &&
+ cast<Instruction>(getValPtr())->getNumOperands() >= Level));
+ return getValPtr() &&
+ cast<Instruction>(getValPtr())->getNumOperands() == Level;
+ }
+ /// Get next child operation.
+ Value *nextOperand() {
+ assert(getValPtr() && isa<Instruction>(getValPtr()) &&
+ cast<Instruction>(getValPtr())->getNumOperands() > Level);
+ return cast<Instruction>(getValPtr())->getOperand(Level++);
+ }
+ virtual ~WeakVHWithLevel() = default;
+};
+} // namespace
+
/// \brief Attempt to reduce a horizontal reduction.
/// If it is legal to match a horizontal reduction feeding
-/// the phi node P with reduction operators BI, then check if it
-/// can be done.
+/// the phi node P with reduction operators Root in a basic block BB, then check
+/// if it can be done.
/// \returns true if a horizontal reduction was matched and reduced.
/// \returns false if a horizontal reduction was not matched.
-static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI,
- BoUpSLP &R, TargetTransformInfo *TTI,
- unsigned MinRegSize) {
+static bool canBeVectorized(
+ PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
+ TargetTransformInfo *TTI,
+ const function_ref<bool(BinaryOperator *, BoUpSLP &)> Vectorize) {
if (!ShouldVectorizeHor)
return false;
- HorizontalReduction HorRdx(MinRegSize);
- if (!HorRdx.matchAssociativeReduction(P, BI))
+ if (!Root)
return false;
- // If there is a sufficient number of reduction values, reduce
- // to a nearby power-of-2. Can safely generate oversized
- // vectors and rely on the backend to split them to legal sizes.
- HorRdx.ReduxWidth =
- std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues()));
+ if (Root->getParent() != BB)
+ return false;
+ SmallVector<WeakVHWithLevel, 8> Stack(1, Root);
+ SmallSet<Value *, 8> VisitedInstrs;
+ bool Res = false;
+ while (!Stack.empty()) {
+ Value *V = Stack.back();
+ if (!V) {
+ Stack.pop_back();
+ continue;
+ }
+ auto *Inst = dyn_cast<Instruction>(V);
+ if (!Inst || isa<PHINode>(Inst)) {
+ Stack.pop_back();
+ continue;
+ }
+ if (Stack.back().isInitial()) {
+ Stack.back().clearInitial();
+ if (auto *BI = dyn_cast<BinaryOperator>(Inst)) {
+ HorizontalReduction HorRdx;
+ if (HorRdx.matchAssociativeReduction(P, BI)) {
+ if (HorRdx.tryToReduce(R, TTI)) {
+ Res = true;
+ P = nullptr;
+ continue;
+ }
+ }
+ if (P) {
+ Inst = dyn_cast<Instruction>(BI->getOperand(0));
+ if (Inst == P)
+ Inst = dyn_cast<Instruction>(BI->getOperand(1));
+ if (!Inst) {
+ P = nullptr;
+ continue;
+ }
+ }
+ }
+ P = nullptr;
+ if (Vectorize(dyn_cast<BinaryOperator>(Inst), R)) {
+ Res = true;
+ continue;
+ }
+ }
+ if (Stack.back().isFinal()) {
+ Stack.pop_back();
+ continue;
+ }
- return HorRdx.tryToReduce(R, TTI);
+ if (auto *NextV = dyn_cast<Instruction>(Stack.back().nextOperand()))
+ if (NextV->getParent() == BB && VisitedInstrs.insert(NextV).second &&
+ Stack.size() < RecursionMaxDepth)
+ Stack.push_back(NextV);
+ }
+ return Res;
+}
+
+bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
+ BasicBlock *BB, BoUpSLP &R,
+ TargetTransformInfo *TTI) {
+ if (!V)
+ return false;
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return false;
+
+ if (!isa<BinaryOperator>(I))
+ P = nullptr;
+ // Try to match and vectorize a horizontal reduction.
+ return canBeVectorized(P, I, BB, R, TTI,
+ [this](BinaryOperator *BI, BoUpSLP &R) -> bool {
+ return tryToVectorize(BI, R);
+ });
}
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
@@ -4599,67 +4932,42 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
if (P->getNumIncomingValues() != 2)
return Changed;
- Value *Rdx = getReductionValue(DT, P, BB, LI);
-
- // Check if this is a Binary Operator.
- BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
- if (!BI)
- continue;
-
// Try to match and vectorize a horizontal reduction.
- if (canMatchHorizontalReduction(P, BI, R, TTI, R.getMinVecRegSize())) {
+ if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
+ TTI)) {
Changed = true;
it = BB->begin();
e = BB->end();
continue;
}
-
- Value *Inst = BI->getOperand(0);
- if (Inst == P)
- Inst = BI->getOperand(1);
-
- if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
- // We would like to start over since some instructions are deleted
- // and the iterator may become invalid value.
- Changed = true;
- it = BB->begin();
- e = BB->end();
- continue;
- }
-
continue;
}
- if (ShouldStartVectorizeHorAtStore)
- if (StoreInst *SI = dyn_cast<StoreInst>(it))
- if (BinaryOperator *BinOp =
- dyn_cast<BinaryOperator>(SI->getValueOperand())) {
- if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
- R.getMinVecRegSize()) ||
- tryToVectorize(BinOp, R)) {
- Changed = true;
- it = BB->begin();
- e = BB->end();
- continue;
- }
+ if (ShouldStartVectorizeHorAtStore) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
+ // Try to match and vectorize a horizontal reduction.
+ if (vectorizeRootInstruction(nullptr, SI->getValueOperand(), BB, R,
+ TTI)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
}
+ }
+ }
// Try to vectorize horizontal reductions feeding into a return.
- if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
- if (RI->getNumOperands() != 0)
- if (BinaryOperator *BinOp =
- dyn_cast<BinaryOperator>(RI->getOperand(0))) {
- DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
- if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
- R.getMinVecRegSize()) ||
- tryToVectorizePair(BinOp->getOperand(0), BinOp->getOperand(1),
- R)) {
- Changed = true;
- it = BB->begin();
- e = BB->end();
- continue;
- }
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(it)) {
+ if (RI->getNumOperands() != 0) {
+ // Try to match and vectorize a horizontal reduction.
+ if (vectorizeRootInstruction(nullptr, RI->getOperand(0), BB, R, TTI)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
}
+ }
+ }
// Try to vectorize trees that start at compare instructions.
if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
@@ -4672,16 +4980,14 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
continue;
}
- for (int i = 0; i < 2; ++i) {
- if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
- if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
- Changed = true;
- // We would like to start over since some instructions are deleted
- // and the iterator may become invalid value.
- it = BB->begin();
- e = BB->end();
- break;
- }
+ for (int I = 0; I < 2; ++I) {
+ if (vectorizeRootInstruction(nullptr, CI->getOperand(I), BB, R, TTI)) {
+ Changed = true;
+ // We would like to start over since some instructions are deleted
+ // and the iterator may become invalid value.
+ it = BB->begin();
+ e = BB->end();
+ break;
}
}
continue;