Diffstat (limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r--  lib/Transforms/Vectorize/SLPVectorizer.cpp | 846
1 file changed, 576 insertions(+), 270 deletions(-)
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 328f270029604..da3ac06ab464e 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -39,6 +39,7 @@ #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/GraphWriter.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Vectorize.h" #include <algorithm> @@ -90,6 +91,10 @@ static cl::opt<unsigned> MinTreeSize( "slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable")); +static cl::opt<bool> + ViewSLPTree("view-slp-tree", cl::Hidden, + cl::desc("Display the SLP trees with Graphviz")); + // Limit the number of alias checks. The limit is chosen so that // it has no negative effect on the llvm benchmarks. static const unsigned AliasedCheckLimit = 10; @@ -212,14 +217,14 @@ static unsigned getSameOpcode(ArrayRef<Value *> VL) { /// Flag set: NSW, NUW, exact, and all of fast-math. static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) { if (auto *VecOp = dyn_cast<Instruction>(I)) { - if (auto *Intersection = dyn_cast<Instruction>(VL[0])) { - // Intersection is initialized to the 0th scalar, - // so start counting from index '1'. + if (auto *I0 = dyn_cast<Instruction>(VL[0])) { + // VecOp is initialized to the 0th scalar, so start counting from index + // '1'. + VecOp->copyIRFlags(I0); for (int i = 1, e = VL.size(); i < e; ++i) { if (auto *Scalar = dyn_cast<Instruction>(VL[i])) - Intersection->andIRFlags(Scalar); + VecOp->andIRFlags(Scalar); } - VecOp->copyIRFlags(Intersection); } } } @@ -304,6 +309,8 @@ public: typedef SmallVector<Instruction *, 16> InstrList; typedef SmallPtrSet<Value *, 16> ValueSet; typedef SmallVector<StoreInst *, 8> StoreList; + typedef MapVector<Value *, SmallVector<Instruction *, 2>> + ExtraValueToDebugLocsMap; BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li, @@ -330,6 +337,10 @@ public: /// \brief Vectorize the tree that starts with the elements in \p VL. /// Returns the vectorized root. Value *vectorizeTree(); + /// Vectorize the tree, but with the list of externally used values \p + /// ExternallyUsedValues. Values in this MapVector can be replaced by the + /// generated extractelement instructions. + Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues); /// \returns the cost incurred by unwanted spills and fills, caused by /// holding live values over call sites. @@ -343,6 +354,13 @@ public: /// the purpose of scheduling and extraction in the \p UserIgnoreLst. void buildTree(ArrayRef<Value *> Roots, ArrayRef<Value *> UserIgnoreLst = None); + /// Construct a vectorizable tree that starts at \p Roots, ignoring users for + /// the purpose of scheduling and extraction in the \p UserIgnoreLst, taking + /// into account (and updating it, if required) the list of externally used + /// values stored in \p ExternallyUsedValues. + void buildTree(ArrayRef<Value *> Roots, + ExtraValueToDebugLocsMap &ExternallyUsedValues, + ArrayRef<Value *> UserIgnoreLst = None); /// Clear the internal data structures that are created by 'buildTree'. void deleteTree() { @@ -404,7 +422,7 @@ private: int getEntryCost(TreeEntry *E); /// This is the recursive part of buildTree.
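For context on the propagateIRFlags hunk above: the old code intersected flags into the first scalar itself (Intersection->andIRFlags(Scalar)), i.e. it mutated lane 0's instruction in place and only copied the accumulated flags onto the vector operation at the end. The rewritten version computes the same intersection but writes only to the freshly created vector op. A minimal hand-worked illustration (not part of the patch), assuming LLVM's Instruction flag API:

    // Lanes being widened; lane 1 carries fewer fast-math flags than lane 0:
    //   %a = fadd fast float %x, %y       ; lane 0: all fast-math flags
    //   %b = fadd nnan ninf float %z, %w  ; lane 1: only nnan and ninf
    VecOp->copyIRFlags(I0);    // seed the new vector op with lane 0's 'fast'
    VecOp->andIRFlags(Scalar); // intersect lane 1: only 'nnan ninf' survive
    // The widened op becomes %v = fadd nnan ninf <2 x float> %p, %q, and the
    // original scalar %a keeps its own flags, which the old order clobbered.

The unnamed int parameter added to buildTree_rec in the next hunk is the user tree index (UserTreeIdx in the definition); it feeds the new UserTreeIndices bookkeeping so the Graphviz output can draw diamond merges.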
- void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth); + void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int); /// \returns True if the ExtractElement/ExtractValue instructions in VL can /// be vectorized to use the original vector (or aggregate "bitcast" to a vector). @@ -451,8 +469,9 @@ private: SmallVectorImpl<Value *> &Left, SmallVectorImpl<Value *> &Right); struct TreeEntry { - TreeEntry() : Scalars(), VectorizedValue(nullptr), - NeedToGather(0) {} + TreeEntry(std::vector<TreeEntry> &Container) + : Scalars(), VectorizedValue(nullptr), NeedToGather(0), + Container(Container) {} /// \returns true if the scalars in VL are equal to this entry. bool isSame(ArrayRef<Value *> VL) const { @@ -468,11 +487,24 @@ private: /// Do we need to gather this sequence ? bool NeedToGather; + + /// Points back to the VectorizableTree. + /// + /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has + /// to be a pointer and needs to be able to initialize the child iterator. + /// Thus we need a reference back to the container to translate the indices + /// to entries. + std::vector<TreeEntry> &Container; + + /// The TreeEntry index containing the user of this entry. We can actually + /// have multiple users so the data structure is not truly a tree. + SmallVector<int, 1> UserTreeIndices; }; /// Create a new VectorizableTree entry. - TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) { - VectorizableTree.emplace_back(); + TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, + int &UserTreeIdx) { + VectorizableTree.emplace_back(VectorizableTree); int idx = VectorizableTree.size() - 1; TreeEntry *Last = &VectorizableTree[idx]; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); @@ -485,6 +517,10 @@ private: } else { MustGather.insert(VL.begin(), VL.end()); } + + if (UserTreeIdx >= 0) + Last->UserTreeIndices.push_back(UserTreeIdx); + UserTreeIdx = idx; return Last; } @@ -558,7 +594,9 @@ private: SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions; /// A list of values that need to extracted out of the tree. - /// This list holds pairs of (Internal Scalar : External User). + /// This list holds pairs of (Internal Scalar : External User). External User + /// can be nullptr, it means that this Internal Scalar will be used later, + /// after vectorization. UserList ExternalUses; /// Values used only by @llvm.assume calls. @@ -706,6 +744,8 @@ private: return os; } #endif + friend struct GraphTraits<BoUpSLP *>; + friend struct DOTGraphTraits<BoUpSLP *>; /// Contains all scheduling data for a basic block. /// @@ -916,17 +956,98 @@ private: /// original width. MapVector<Value *, std::pair<uint64_t, bool>> MinBWs; }; +} // end namespace slpvectorizer + +template <> struct GraphTraits<BoUpSLP *> { + typedef BoUpSLP::TreeEntry TreeEntry; + + /// NodeRef has to be a pointer per the GraphWriter. + typedef TreeEntry *NodeRef; + + /// \brief Add the VectorizableTree to the index iterator to be able to return + /// TreeEntry pointers. 
+ struct ChildIteratorType + : public iterator_adaptor_base<ChildIteratorType, + SmallVector<int, 1>::iterator> { + + std::vector<TreeEntry> &VectorizableTree; + + ChildIteratorType(SmallVector<int, 1>::iterator W, + std::vector<TreeEntry> &VT) + : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {} + + NodeRef operator*() { return &VectorizableTree[*I]; } + }; + + static NodeRef getEntryNode(BoUpSLP &R) { return &R.VectorizableTree[0]; } + + static ChildIteratorType child_begin(NodeRef N) { + return {N->UserTreeIndices.begin(), N->Container}; + } + static ChildIteratorType child_end(NodeRef N) { + return {N->UserTreeIndices.end(), N->Container}; + } + + /// For the node iterator we just need to turn the TreeEntry iterator into a + /// TreeEntry* iterator so that it dereferences to NodeRef. + typedef pointer_iterator<std::vector<TreeEntry>::iterator> nodes_iterator; + + static nodes_iterator nodes_begin(BoUpSLP *R) { + return nodes_iterator(R->VectorizableTree.begin()); + } + static nodes_iterator nodes_end(BoUpSLP *R) { + return nodes_iterator(R->VectorizableTree.end()); + } + + static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); } +}; + +template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { + typedef BoUpSLP::TreeEntry TreeEntry; + + DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} + + std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) { + std::string Str; + raw_string_ostream OS(Str); + if (isSplat(Entry->Scalars)) { + OS << "<splat> " << *Entry->Scalars[0]; + return Str; + } + for (auto V : Entry->Scalars) { + OS << *V; + if (std::any_of( + R->ExternalUses.begin(), R->ExternalUses.end(), + [&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; })) + OS << " <extract>"; + OS << "\n"; + } + return Str; + } + + static std::string getNodeAttributes(const TreeEntry *Entry, + const BoUpSLP *) { + if (Entry->NeedToGather) + return "color=red"; + return ""; + } +}; } // end namespace llvm -} // end namespace slpvectorizer void BoUpSLP::buildTree(ArrayRef<Value *> Roots, ArrayRef<Value *> UserIgnoreLst) { + ExtraValueToDebugLocsMap ExternallyUsedValues; + buildTree(Roots, ExternallyUsedValues, UserIgnoreLst); +} +void BoUpSLP::buildTree(ArrayRef<Value *> Roots, + ExtraValueToDebugLocsMap &ExternallyUsedValues, + ArrayRef<Value *> UserIgnoreLst) { deleteTree(); UserIgnoreList = UserIgnoreLst; if (!allSameType(Roots)) return; - buildTree_rec(Roots, 0); + buildTree_rec(Roots, 0, -1); // Collect the values that we need to extract from the tree. for (TreeEntry &EIdx : VectorizableTree) { @@ -940,6 +1061,14 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, if (Entry->NeedToGather) continue; + // Check if the scalar is externally used as an extra arg. 
+ auto ExtI = ExternallyUsedValues.find(Scalar); + if (ExtI != ExternallyUsedValues.end()) { + DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " << + Lane << " from " << *Scalar << ".\n"); + ExternalUses.emplace_back(Scalar, nullptr, Lane); + continue; + } for (User *U : Scalar->users()) { DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n"); @@ -976,28 +1105,28 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, } } - -void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { +void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, + int UserTreeIdx) { bool isAltShuffle = false; assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); if (Depth == RecursionMaxDepth) { DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); return; } // Don't handle vectors. if (VL[0]->getType()->isVectorTy()) { DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); return; } if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) if (SI->getValueOperand()->getType()->isVectorTy()) { DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); return; } unsigned Opcode = getSameOpcode(VL); @@ -1014,7 +1143,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { // If all of the operands are identical or constant we have a simple solution. if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !Opcode) { DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); return; } @@ -1026,7 +1155,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { if (EphValues.count(VL[i])) { DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is ephemeral.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); return; } } @@ -1039,10 +1168,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n"); if (E->Scalars[i] != VL[i]) { DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); return; } } + // Record the reuse of the tree node. FIXME, currently this is only used to + // properly draw the graph rather than for the actual vectorization. + E->UserTreeIndices.push_back(UserTreeIdx); DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n"); return; } @@ -1052,7 +1184,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { if (ScalarToTreeEntry.count(VL[i])) { DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is already in tree.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); return; } } @@ -1062,7 +1194,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { for (unsigned i = 0, e = VL.size(); i != e; ++i) { if (MustGather.count(VL[i])) { DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); return; } } @@ -1076,7 +1208,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { // Don't go into unreachable blocks. They may contain instructions with // dependency cycles which confuse the final scheduling. 
DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); return; } @@ -1085,7 +1217,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { for (unsigned j = i+1; j < e; ++j) if (VL[i] == VL[j]) { DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); return; } @@ -1100,7 +1232,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { assert((!BS.getScheduleData(VL[0]) || !BS.getScheduleData(VL[0])->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); return; } DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); @@ -1117,12 +1249,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { if (Term) { DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n"); BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); return; } } - newTreeEntry(VL, true); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { @@ -1132,7 +1264,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock( PH->getIncomingBlock(i))); - buildTree_rec(Operands, Depth + 1); + buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; } @@ -1144,7 +1276,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { } else { BS.cancelScheduling(VL); } - newTreeEntry(VL, Reuse); + newTreeEntry(VL, Reuse, UserTreeIdx); return; } case Instruction::Load: { @@ -1160,7 +1292,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } @@ -1171,7 +1303,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { LoadInst *L = cast<LoadInst>(VL[i]); if (!L->isSimple()) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } @@ -1193,7 +1325,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { if (Consecutive) { ++NumLoadsWantToKeepOrder; - newTreeEntry(VL, true); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a vector of loads.\n"); return; } @@ -1208,7 +1340,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { } BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); if (ReverseConsecutive) { ++NumLoadsWantToChangeOrder; @@ -1235,12 +1367,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); return; } } - newTreeEntry(VL, true); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a vector of casts.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1249,7 +1381,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { 
for (Value *j : VL) Operands.push_back(cast<Instruction>(j)->getOperand(i)); - buildTree_rec(Operands, Depth+1); + buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; } @@ -1263,13 +1395,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { if (Cmp->getPredicate() != P0 || Cmp->getOperand(0)->getType() != ComparedTy) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); return; } } - newTreeEntry(VL, true); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a vector of compares.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1278,7 +1410,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { for (Value *j : VL) Operands.push_back(cast<Instruction>(j)->getOperand(i)); - buildTree_rec(Operands, Depth+1); + buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; } @@ -1301,7 +1433,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { case Instruction::And: case Instruction::Or: case Instruction::Xor: { - newTreeEntry(VL, true); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); // Sort operands of the instructions so that each side is more likely to @@ -1309,8 +1441,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) { ValueList Left, Right; reorderInputsAccordingToOpcode(VL, Left, Right); - buildTree_rec(Left, Depth + 1); - buildTree_rec(Right, Depth + 1); + buildTree_rec(Left, Depth + 1, UserTreeIdx); + buildTree_rec(Right, Depth + 1, UserTreeIdx); return; } @@ -1320,7 +1452,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { for (Value *j : VL) Operands.push_back(cast<Instruction>(j)->getOperand(i)); - buildTree_rec(Operands, Depth+1); + buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; } @@ -1330,7 +1462,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { if (cast<Instruction>(VL[j])->getNumOperands() != 2) { DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); return; } } @@ -1343,7 +1475,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { if (Ty0 != CurTy) { DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); return; } } @@ -1355,12 +1487,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { DEBUG( dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); return; } } - newTreeEntry(VL, true); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; @@ -1368,7 +1500,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { for (Value *j : VL) Operands.push_back(cast<Instruction>(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1); + buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; } @@ -1377,19 +1509,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + 
newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } - newTreeEntry(VL, true); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; for (Value *j : VL) Operands.push_back(cast<Instruction>(j)->getOperand(0)); - buildTree_rec(Operands, Depth + 1); + buildTree_rec(Operands, Depth + 1, UserTreeIdx); return; } case Instruction::Call: { @@ -1400,7 +1532,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (!isTriviallyVectorizable(ID)) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } @@ -1414,7 +1546,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] << "\n"); return; @@ -1425,7 +1557,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { Value *A1J = CI2->getArgOperand(1); if (A1I != A1J) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI << " argument "<< A1I<<"!=" << A1J << "\n"); @@ -1438,14 +1570,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!=" << *VL[i] << '\n'); return; } } - newTreeEntry(VL, true); + newTreeEntry(VL, true, UserTreeIdx); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. @@ -1453,7 +1585,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { CallInst *CI2 = dyn_cast<CallInst>(j); Operands.push_back(CI2->getArgOperand(i)); } - buildTree_rec(Operands, Depth + 1); + buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; } @@ -1462,19 +1594,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { // then do not vectorize this instruction. if (!isAltShuffle) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - newTreeEntry(VL, true); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. 
if (isa<BinaryOperator>(VL0)) { ValueList Left, Right; reorderAltShuffleOperands(VL, Left, Right); - buildTree_rec(Left, Depth + 1); - buildTree_rec(Right, Depth + 1); + buildTree_rec(Left, Depth + 1, UserTreeIdx); + buildTree_rec(Right, Depth + 1, UserTreeIdx); return; } @@ -1484,13 +1616,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { for (Value *j : VL) Operands.push_back(cast<Instruction>(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1); + buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; } default: BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; } @@ -1570,6 +1702,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { Type *ScalarTy = VL[0]->getType(); if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) ScalarTy = SI->getValueOperand()->getType(); + else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0])) + ScalarTy = CI->getOperand(0)->getType(); VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); // If we have computed a smaller type for the expression, update VecTy so @@ -1599,7 +1733,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { int DeadCost = 0; for (unsigned i = 0, e = VL.size(); i < e; ++i) { Instruction *E = cast<Instruction>(VL[i]); - if (E->hasOneUse()) + // If all users are going to be vectorized, instruction can be + // considered as dead. + // The same, if have only one user, it will be vectorized for sure. + if (E->hasOneUse() || + std::all_of(E->user_begin(), E->user_end(), [this](User *U) { + return ScalarToTreeEntry.count(U) > 0; + })) // Take credit for instruction that will become dead. DeadCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); @@ -1624,10 +1764,10 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { // Calculate the cost of this instruction. int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(), - VL0->getType(), SrcTy); + VL0->getType(), SrcTy, VL0); VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); - int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy); + int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0); return VecCost - ScalarCost; } case Instruction::FCmp: @@ -1636,8 +1776,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { // Calculate the cost of this instruction. VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); int ScalarCost = VecTy->getNumElements() * - TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty()); - int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy); + TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty(), VL0); + int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy, VL0); return VecCost - ScalarCost; } case Instruction::Add: @@ -1720,18 +1860,18 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { // Cost of wide load - cost of scalar loads. unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment(); int ScalarLdCost = VecTy->getNumElements() * - TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0); + TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0); int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, - VecTy, alignment, 0); + VecTy, alignment, 0, VL0); return VecLdCost - ScalarLdCost; } case Instruction::Store: { // We know that we can merge the stores. Calculate the cost. 
unsigned alignment = dyn_cast<StoreInst>(VL0)->getAlignment(); int ScalarStCost = VecTy->getNumElements() * - TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0); + TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0); int VecStCost = TTI->getMemoryOpCost(Instruction::Store, - VecTy, alignment, 0); + VecTy, alignment, 0, VL0); return VecStCost - ScalarStCost; } case Instruction::Call: { @@ -1739,12 +1879,9 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); // Calculate the cost of the scalar and vector calls. - SmallVector<Type*, 4> ScalarTys, VecTys; - for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) { + SmallVector<Type*, 4> ScalarTys; + for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) ScalarTys.push_back(CI->getArgOperand(op)->getType()); - VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(), - VecTy->getNumElements())); - } FastMathFlags FMF; if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) @@ -1753,7 +1890,9 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { int ScalarCallCost = VecTy->getNumElements() * TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF); - int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys, FMF); + SmallVector<Value *, 4> Args(CI->arg_operands()); + int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF, + VecTy->getNumElements()); DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost << " (" << VecCallCost << "-" << ScalarCallCost << ")" @@ -1947,9 +2086,18 @@ int BoUpSLP::getTreeCost() { int SpillCost = getSpillCost(); Cost += SpillCost + ExtractCost; - DEBUG(dbgs() << "SLP: Spill Cost = " << SpillCost << ".\n" - << "SLP: Extract Cost = " << ExtractCost << ".\n" - << "SLP: Total Cost = " << Cost << ".\n"); + std::string Str; + { + raw_string_ostream OS(Str); + OS << "SLP: Spill Cost = " << SpillCost << ".\n" + << "SLP: Extract Cost = " << ExtractCost << ".\n" + << "SLP: Total Cost = " << Cost << ".\n"; + } + DEBUG(dbgs() << Str); + + if (ViewSLPTree) + ViewGraph(this, "SLP" + F->getName(), false, Str); + return Cost; } @@ -2702,6 +2850,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *BoUpSLP::vectorizeTree() { + ExtraValueToDebugLocsMap ExternallyUsedValues; + return vectorizeTree(ExternallyUsedValues); +} + +Value * +BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { // All blocks must be scheduled before any instructions are inserted. for (auto &BSIter : BlocksSchedules) { @@ -2744,7 +2898,7 @@ Value *BoUpSLP::vectorizeTree() { // Skip users that we already RAUW. This happens when one instruction // has multiple uses of the same value. - if (!is_contained(Scalar->users(), User)) + if (User && !is_contained(Scalar->users(), User)) continue; assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar"); @@ -2756,6 +2910,28 @@ Value *BoUpSLP::vectorizeTree() { assert(Vec && "Can't find vectorizable value"); Value *Lane = Builder.getInt32(ExternalUse.Lane); + // If User == nullptr, the Scalar is used as extra arg. Generate + // ExtractElement instruction and update the record for this scalar in + // ExternallyUsedValues. 
+ if (!User) { + assert(ExternallyUsedValues.count(Scalar) && + "Scalar with nullptr as an external user must be registered in " + "ExternallyUsedValues map"); + if (auto *VecI = dyn_cast<Instruction>(Vec)) { + Builder.SetInsertPoint(VecI->getParent(), + std::next(VecI->getIterator())); + } else { + Builder.SetInsertPoint(&F->getEntryBlock().front()); + } + Value *Ex = Builder.CreateExtractElement(Vec, Lane); + Ex = extend(ScalarRoot, Ex, Scalar->getType()); + CSEBlocks.insert(cast<Instruction>(Scalar)->getParent()); + auto &Locs = ExternallyUsedValues[Scalar]; + ExternallyUsedValues.insert({Ex, Locs}); + ExternallyUsedValues.erase(Scalar); + continue; + } + // Generate extracts for out-of-tree users. // Find the insertion point for the extractelement lane. if (auto *VecI = dyn_cast<Instruction>(Vec)) { @@ -3264,7 +3440,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { // sorted by the original instruction location. This lets the final schedule // be as close as possible to the original instruction order. struct ScheduleDataCompare { - bool operator()(ScheduleData *SD1, ScheduleData *SD2) { + bool operator()(ScheduleData *SD1, ScheduleData *SD2) const { return SD2->SchedulingPriority < SD1->SchedulingPriority; } }; @@ -3645,9 +3821,9 @@ PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &A bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB); if (!Changed) return PreservedAnalyses::all(); + PreservedAnalyses PA; - PA.preserve<LoopAnalysis>(); - PA.preserve<DominatorTreeAnalysis>(); + PA.preserveSet<CFGAnalyses>(); PA.preserve<AAManager>(); PA.preserve<GlobalsAA>(); return PA; @@ -4026,36 +4202,40 @@ bool SLPVectorizerPass::tryToVectorize(BinaryOperator *V, BoUpSLP &R) { if (!V) return false; + Value *P = V->getParent(); + + // Vectorize in current basic block only. + auto *Op0 = dyn_cast<Instruction>(V->getOperand(0)); + auto *Op1 = dyn_cast<Instruction>(V->getOperand(1)); + if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P) + return false; + // Try to vectorize V. - if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R)) + if (tryToVectorizePair(Op0, Op1, R)) return true; - BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0)); - BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1)); + auto *A = dyn_cast<BinaryOperator>(Op0); + auto *B = dyn_cast<BinaryOperator>(Op1); // Try to skip B. if (B && B->hasOneUse()) { - BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0)); - BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1)); - if (tryToVectorizePair(A, B0, R)) { + auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0)); + auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1)); + if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R)) return true; - } - if (tryToVectorizePair(A, B1, R)) { + if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R)) return true; - } } // Try to skip A. 
if (A && A->hasOneUse()) { - BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0)); - BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1)); - if (tryToVectorizePair(A0, B, R)) { + auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0)); + auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1)); + if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R)) return true; - } - if (tryToVectorizePair(A1, B, R)) { + if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R)) return true; - } } - return 0; + return false; } /// \brief Generate a shuffle mask to be used in a reduction tree. @@ -4119,37 +4299,41 @@ namespace { class HorizontalReduction { SmallVector<Value *, 16> ReductionOps; SmallVector<Value *, 32> ReducedVals; + // Use map vector to make stable output. + MapVector<Instruction *, Value *> ExtraArgs; - BinaryOperator *ReductionRoot; - // After successfull horizontal reduction vectorization attempt for PHI node - // vectorizer tries to update root binary op by combining vectorized tree and - // the ReductionPHI node. But during vectorization this ReductionPHI can be - // vectorized itself and replaced by the undef value, while the instruction - // itself is marked for deletion. This 'marked for deletion' PHI node then can - // be used in new binary operation, causing "Use still stuck around after Def - // is destroyed" crash upon PHI node deletion. - WeakVH ReductionPHI; + BinaryOperator *ReductionRoot = nullptr; /// The opcode of the reduction. - unsigned ReductionOpcode; + Instruction::BinaryOps ReductionOpcode = Instruction::BinaryOpsEnd; /// The opcode of the values we perform a reduction on. - unsigned ReducedValueOpcode; + unsigned ReducedValueOpcode = 0; /// Should we model this reduction as a pairwise reduction tree or a tree that /// splits the vector in halves and adds those halves. - bool IsPairwiseReduction; + bool IsPairwiseReduction = false; + + /// Checks if the ParentStackElem.first should be marked as a reduction + /// operation with an extra argument or as extra argument itself. + void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem, + Value *ExtraArg) { + if (ExtraArgs.count(ParentStackElem.first)) { + ExtraArgs[ParentStackElem.first] = nullptr; + // We ran into something like: + // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg. + // The whole ParentStackElem.first should be considered as an extra value + // in this case. + // Do not perform analysis of remaining operands of ParentStackElem.first + // instruction, this whole instruction is an extra argument. + ParentStackElem.second = ParentStackElem.first->getNumOperands(); + } else { + // We ran into something like: + // ParentStackElem.first += ... + ExtraArg + ... + ExtraArgs[ParentStackElem.first] = ExtraArg; + } + } public: - /// The width of one full horizontal reduction operation. - unsigned ReduxWidth; - - /// Minimal width of available vector registers. It's used to determine - /// ReduxWidth. - unsigned MinVecRegSize; - - HorizontalReduction(unsigned MinVecRegSize) - : ReductionRoot(nullptr), ReductionOpcode(0), ReducedValueOpcode(0), - IsPairwiseReduction(false), ReduxWidth(0), - MinVecRegSize(MinVecRegSize) {} + HorizontalReduction() = default; /// \brief Try to find a reduction tree. 
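For context on the ExtraArgs machinery introduced above: markExtraArg records reduction-tree operands that cannot become part of the vectorized reduction. A hand-worked example of the two documented cases, where x and y stand for hypothetical values defined outside the reduction's basic block (not taken from the patch):

    // Scanning  s = (a0 + a1) + (x + y)  and hitting x first:
    //   ExtraArgs[(x + y)] = x;       // one extra operand under this node
    // Hitting y under the same node afterwards:
    //   ExtraArgs[(x + y)] = nullptr; // the whole (x + y) node is demoted to
    //                                 // an extra argument and its remaining
    //                                 // operands are skipped
    // Extra arguments stay scalar and are added back after vectorization as
    // 'bin.extra' operations (see tryToReduce below).

With that bookkeeping in place, the matcher below can keep going where the old code simply returned false.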
bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) { @@ -4176,21 +4360,14 @@ public: if (!isValidElementType(Ty)) return false; - const DataLayout &DL = B->getModule()->getDataLayout(); ReductionOpcode = B->getOpcode(); ReducedValueOpcode = 0; - // FIXME: Register size should be a parameter to this function, so we can - // try different vectorization factors. - ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty); ReductionRoot = B; - ReductionPHI = Phi; - - if (ReduxWidth < 4) - return false; // We currently only support adds. - if (ReductionOpcode != Instruction::Add && - ReductionOpcode != Instruction::FAdd) + if ((ReductionOpcode != Instruction::Add && + ReductionOpcode != Instruction::FAdd) || + !B->isAssociative()) return false; // Post order traverse the reduction tree starting at B. We only handle true @@ -4202,30 +4379,26 @@ public: unsigned EdgeToVist = Stack.back().second++; bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode; - // Only handle trees in the current basic block. - if (TreeN->getParent() != B->getParent()) - return false; - - // Each tree node needs to have one user except for the ultimate - // reduction. - if (!TreeN->hasOneUse() && TreeN != B) - return false; - // Postorder vist. if (EdgeToVist == 2 || IsReducedValue) { - if (IsReducedValue) { - // Make sure that the opcodes of the operations that we are going to - // reduce match. - if (!ReducedValueOpcode) - ReducedValueOpcode = TreeN->getOpcode(); - else if (ReducedValueOpcode != TreeN->getOpcode()) - return false; + if (IsReducedValue) ReducedVals.push_back(TreeN); - } else { - // We need to be able to reassociate the adds. - if (!TreeN->isAssociative()) - return false; - ReductionOps.push_back(TreeN); + else { + auto I = ExtraArgs.find(TreeN); + if (I != ExtraArgs.end() && !I->second) { + // Check if TreeN is an extra argument of its parent operation. + if (Stack.size() <= 1) { + // TreeN can't be an extra argument as it is a root reduction + // operation. + return false; + } + // Yes, TreeN is an extra argument, do not add it to a list of + // reduction operations. + // Stack[Stack.size() - 2] always points to the parent operation. + markExtraArg(Stack[Stack.size() - 2], TreeN); + ExtraArgs.erase(TreeN); + } else + ReductionOps.push_back(TreeN); } // Retract. Stack.pop_back(); @@ -4242,13 +4415,44 @@ public: // reduced value class. if (I && (!ReducedValueOpcode || I->getOpcode() == ReducedValueOpcode || I->getOpcode() == ReductionOpcode)) { - if (!ReducedValueOpcode && I->getOpcode() != ReductionOpcode) + // Only handle trees in the current basic block. + if (I->getParent() != B->getParent()) { + // I is an extra argument for TreeN (its parent operation). + markExtraArg(Stack.back(), I); + continue; + } + + // Each tree node needs to have one user except for the ultimate + // reduction. + if (!I->hasOneUse() && I != B) { + // I is an extra argument for TreeN (its parent operation). + markExtraArg(Stack.back(), I); + continue; + } + + if (I->getOpcode() == ReductionOpcode) { + // We need to be able to reassociate the reduction operations. + if (!I->isAssociative()) { + // I is an extra argument for TreeN (its parent operation). + markExtraArg(Stack.back(), I); + continue; + } + } else if (ReducedValueOpcode && + ReducedValueOpcode != I->getOpcode()) { + // Make sure that the opcodes of the operations that we are going to + // reduce match. + // I is an extra argument for TreeN (its parent operation). 
+ markExtraArg(Stack.back(), I); + continue; + } else if (!ReducedValueOpcode) ReducedValueOpcode = I->getOpcode(); + Stack.push_back(std::make_pair(I, 0)); continue; } - return false; } + // NextV is an extra argument for TreeN (its parent operation). + markExtraArg(Stack.back(), NextV); } return true; } @@ -4259,10 +4463,15 @@ public: if (ReducedVals.empty()) return false; + // If there is a sufficient number of reduction values, reduce + // to a nearby power-of-2. Can safely generate oversized + // vectors and rely on the backend to split them to legal sizes. unsigned NumReducedVals = ReducedVals.size(); - if (NumReducedVals < ReduxWidth) + if (NumReducedVals < 4) return false; + unsigned ReduxWidth = PowerOf2Floor(NumReducedVals); + Value *VectorizedTree = nullptr; IRBuilder<> Builder(ReductionRoot); FastMathFlags Unsafe; @@ -4270,20 +4479,26 @@ public: Builder.setFastMathFlags(Unsafe); unsigned i = 0; - for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) { + BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues; + // The same extra argument may be used several time, so log each attempt + // to use it. + for (auto &Pair : ExtraArgs) + ExternallyUsedValues[Pair.second].push_back(Pair.first); + while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) { auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth); - V.buildTree(VL, ReductionOps); + V.buildTree(VL, ExternallyUsedValues, ReductionOps); if (V.shouldReorder()) { SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend()); - V.buildTree(Reversed, ReductionOps); + V.buildTree(Reversed, ExternallyUsedValues, ReductionOps); } if (V.isTreeTinyAndNotFullyVectorizable()) - continue; + break; V.computeMinimumValueSizes(); // Estimate cost. - int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]); + int Cost = + V.getTreeCost() + getReductionCost(TTI, ReducedVals[i], ReduxWidth); if (Cost >= -SLPCostThreshold) break; @@ -4292,33 +4507,44 @@ public: // Vectorize a tree. DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc(); - Value *VectorizedRoot = V.vectorizeTree(); + Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues); // Emit a reduction. - Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder); + Value *ReducedSubTree = + emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps); if (VectorizedTree) { Builder.SetCurrentDebugLocation(Loc); - VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree, - ReducedSubTree, "bin.rdx"); + VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree, + ReducedSubTree, "bin.rdx"); + propagateIRFlags(VectorizedTree, ReductionOps); } else VectorizedTree = ReducedSubTree; + i += ReduxWidth; + ReduxWidth = PowerOf2Floor(NumReducedVals - i); } if (VectorizedTree) { // Finish the reduction. for (; i < NumReducedVals; ++i) { - Builder.SetCurrentDebugLocation( - cast<Instruction>(ReducedVals[i])->getDebugLoc()); - VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree, - ReducedVals[i]); + auto *I = cast<Instruction>(ReducedVals[i]); + Builder.SetCurrentDebugLocation(I->getDebugLoc()); + VectorizedTree = + Builder.CreateBinOp(ReductionOpcode, VectorizedTree, I); + propagateIRFlags(VectorizedTree, ReductionOps); + } + for (auto &Pair : ExternallyUsedValues) { + assert(!Pair.second.empty() && + "At least one DebugLoc must be inserted"); + // Add each externally used value to the final reduction. 
+ for (auto *I : Pair.second) { + Builder.SetCurrentDebugLocation(I->getDebugLoc()); + VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree, + Pair.first, "bin.extra"); + propagateIRFlags(VectorizedTree, I); + } } // Update users. - if (ReductionPHI && !isa<UndefValue>(ReductionPHI)) { - assert(ReductionRoot && "Need a reduction operation"); - ReductionRoot->setOperand(0, VectorizedTree); - ReductionRoot->setOperand(1, ReductionPHI); - } else - ReductionRoot->replaceAllUsesWith(VectorizedTree); + ReductionRoot->replaceAllUsesWith(VectorizedTree); } return VectorizedTree != nullptr; } @@ -4329,7 +4555,8 @@ public: private: /// \brief Calculate the cost of a reduction. - int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) { + int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal, + unsigned ReduxWidth) { Type *ScalarTy = FirstReducedVal->getType(); Type *VecTy = VectorType::get(ScalarTy, ReduxWidth); @@ -4352,15 +4579,9 @@ private: return VecReduxCost - ScalarReduxCost; } - static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L, - Value *R, const Twine &Name = "") { - if (Opcode == Instruction::FAdd) - return Builder.CreateFAdd(L, R, Name); - return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name); - } - /// \brief Emit a horizontal reduction of the vectorized value. - Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) { + Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder, + unsigned ReduxWidth, ArrayRef<Value *> RedOps) { assert(VectorizedValue && "Need to have a vectorized tree node"); assert(isPowerOf2_32(ReduxWidth) && "We only handle power-of-two reductions for now"); @@ -4378,15 +4599,16 @@ private: Value *RightShuf = Builder.CreateShuffleVector( TmpVec, UndefValue::get(TmpVec->getType()), (RightMask), "rdx.shuf.r"); - TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf, - "bin.rdx"); + TmpVec = Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf, + "bin.rdx"); } else { Value *UpperHalf = createRdxShuffleMask(ReduxWidth, i, false, false, Builder); Value *Shuf = Builder.CreateShuffleVector( TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf"); - TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx"); + TmpVec = Builder.CreateBinOp(ReductionOpcode, TmpVec, Shuf, "bin.rdx"); } + propagateIRFlags(TmpVec, RedOps); } // The result is in the first element of the vector. 
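A hand-worked trace of the new width stepping in tryToReduce together with emitReduction (derived from the code above, not output from the patch), assuming NumReducedVals = 11 and no extra arguments:

    // i = 0: ReduxWidth = PowerOf2Floor(11) = 8 -> vectorize ReducedVals[0..7]
    // i = 8: ReduxWidth = PowerOf2Floor(3)  = 2 -> fails 'ReduxWidth > 2', stop
    // Tail loop: ReducedVals[8..10] are folded in one at a time as scalar
    // 'bin.rdx' operations, followed by one 'bin.extra' op per extra argument.
    // emitReduction reduces the 8-wide vector in log2(8) = 3 shuffle-plus-
    // binop steps: 8 -> 4 -> 2 -> 1, leaving the result in element 0.

Note that the old fixed ReduxWidth = MinVecRegSize / element-size computation is gone: the pass now deliberately builds possibly oversized vectors and relies on the backend to split them into legal sizes.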
@@ -4438,16 +4660,19 @@ static bool findBuildVector(InsertElementInst *FirstInsertElem, static bool findBuildAggregate(InsertValueInst *IV, SmallVectorImpl<Value *> &BuildVector, SmallVectorImpl<Value *> &BuildVectorOpds) { - if (!IV->hasOneUse()) - return false; - Value *V = IV->getAggregateOperand(); - if (!isa<UndefValue>(V)) { - InsertValueInst *I = dyn_cast<InsertValueInst>(V); - if (!I || !findBuildAggregate(I, BuildVector, BuildVectorOpds)) + Value *V; + do { + BuildVector.push_back(IV); + BuildVectorOpds.push_back(IV->getInsertedValueOperand()); + V = IV->getAggregateOperand(); + if (isa<UndefValue>(V)) + break; + IV = dyn_cast<InsertValueInst>(V); + if (!IV || !IV->hasOneUse()) return false; - } - BuildVector.push_back(IV); - BuildVectorOpds.push_back(IV->getInsertedValueOperand()); + } while (true); + std::reverse(BuildVector.begin(), BuildVector.end()); + std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end()); return true; } @@ -4507,29 +4732,137 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P, return nullptr; } +namespace { +/// Tracks instructions and their children. +class WeakVHWithLevel final : public CallbackVH { + /// Operand index of the instruction currently being analyzed. + unsigned Level = 0; + /// Is this the instruction that should be vectorized, or are we now + /// processing children (i.e. operands of this instruction) for potential + /// vectorization? + bool IsInitial = true; + +public: + explicit WeakVHWithLevel() = default; + WeakVHWithLevel(Value *V) : CallbackVH(V){}; + /// Restart children analysis each time it is replaced by a new instruction. + void allUsesReplacedWith(Value *New) override { + setValPtr(New); + Level = 0; + IsInitial = true; + } + /// Check if the instruction was not deleted during vectorization. + bool isValid() const { return !getValPtr(); } + /// Must the instruction itself be vectorized? + bool isInitial() const { return IsInitial; } + /// Try to vectorize children. + void clearInitial() { IsInitial = false; } + /// Are all children processed already? + bool isFinal() const { + assert(getValPtr() && + (isa<Instruction>(getValPtr()) && + cast<Instruction>(getValPtr())->getNumOperands() >= Level)); + return getValPtr() && + cast<Instruction>(getValPtr())->getNumOperands() == Level; + } + /// Get next child operation. + Value *nextOperand() { + assert(getValPtr() && isa<Instruction>(getValPtr()) && + cast<Instruction>(getValPtr())->getNumOperands() > Level); + return cast<Instruction>(getValPtr())->getOperand(Level++); + } + virtual ~WeakVHWithLevel() = default; +}; +} // namespace + /// \brief Attempt to reduce a horizontal reduction. /// If it is legal to match a horizontal reduction feeding -/// the phi node P with reduction operators BI, then check if it -/// can be done. +/// the phi node P with reduction operators Root in a basic block BB, then check +/// if it can be done. /// \returns true if a horizontal reduction was matched and reduced. /// \returns false if a horizontal reduction was not matched.
-static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI, - BoUpSLP &R, TargetTransformInfo *TTI, - unsigned MinRegSize) { +static bool canBeVectorized( + PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, + TargetTransformInfo *TTI, + const function_ref<bool(BinaryOperator *, BoUpSLP &)> Vectorize) { if (!ShouldVectorizeHor) return false; - HorizontalReduction HorRdx(MinRegSize); - if (!HorRdx.matchAssociativeReduction(P, BI)) + if (!Root) return false; - // If there is a sufficient number of reduction values, reduce - // to a nearby power-of-2. Can safely generate oversized - // vectors and rely on the backend to split them to legal sizes. - HorRdx.ReduxWidth = - std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues())); + if (Root->getParent() != BB) + return false; + SmallVector<WeakVHWithLevel, 8> Stack(1, Root); + SmallSet<Value *, 8> VisitedInstrs; + bool Res = false; + while (!Stack.empty()) { + Value *V = Stack.back(); + if (!V) { + Stack.pop_back(); + continue; + } + auto *Inst = dyn_cast<Instruction>(V); + if (!Inst || isa<PHINode>(Inst)) { + Stack.pop_back(); + continue; + } + if (Stack.back().isInitial()) { + Stack.back().clearInitial(); + if (auto *BI = dyn_cast<BinaryOperator>(Inst)) { + HorizontalReduction HorRdx; + if (HorRdx.matchAssociativeReduction(P, BI)) { + if (HorRdx.tryToReduce(R, TTI)) { + Res = true; + P = nullptr; + continue; + } + } + if (P) { + Inst = dyn_cast<Instruction>(BI->getOperand(0)); + if (Inst == P) + Inst = dyn_cast<Instruction>(BI->getOperand(1)); + if (!Inst) { + P = nullptr; + continue; + } + } + } + P = nullptr; + if (Vectorize(dyn_cast<BinaryOperator>(Inst), R)) { + Res = true; + continue; + } + } + if (Stack.back().isFinal()) { + Stack.pop_back(); + continue; + } - return HorRdx.tryToReduce(R, TTI); + if (auto *NextV = dyn_cast<Instruction>(Stack.back().nextOperand())) + if (NextV->getParent() == BB && VisitedInstrs.insert(NextV).second && + Stack.size() < RecursionMaxDepth) + Stack.push_back(NextV); + } + return Res; +} + +bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V, + BasicBlock *BB, BoUpSLP &R, + TargetTransformInfo *TTI) { + if (!V) + return false; + auto *I = dyn_cast<Instruction>(V); + if (!I) + return false; + + if (!isa<BinaryOperator>(I)) + P = nullptr; + // Try to match and vectorize a horizontal reduction. + return canBeVectorized(P, I, BB, R, TTI, + [this](BinaryOperator *BI, BoUpSLP &R) -> bool { + return tryToVectorize(BI, R); + }); } bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { @@ -4599,67 +4932,42 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { if (P->getNumIncomingValues() != 2) return Changed; - Value *Rdx = getReductionValue(DT, P, BB, LI); - - // Check if this is a Binary Operator. - BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx); - if (!BI) - continue; - // Try to match and vectorize a horizontal reduction. - if (canMatchHorizontalReduction(P, BI, R, TTI, R.getMinVecRegSize())) { + if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R, + TTI)) { Changed = true; it = BB->begin(); e = BB->end(); continue; } - - Value *Inst = BI->getOperand(0); - if (Inst == P) - Inst = BI->getOperand(1); - - if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) { - // We would like to start over since some instructions are deleted - // and the iterator may become invalid value. 
- Changed = true; - it = BB->begin(); - e = BB->end(); - continue; - } - continue; } - if (ShouldStartVectorizeHorAtStore) - if (StoreInst *SI = dyn_cast<StoreInst>(it)) - if (BinaryOperator *BinOp = - dyn_cast<BinaryOperator>(SI->getValueOperand())) { - if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI, - R.getMinVecRegSize()) || - tryToVectorize(BinOp, R)) { - Changed = true; - it = BB->begin(); - e = BB->end(); - continue; - } + if (ShouldStartVectorizeHorAtStore) { + if (StoreInst *SI = dyn_cast<StoreInst>(it)) { + // Try to match and vectorize a horizontal reduction. + if (vectorizeRootInstruction(nullptr, SI->getValueOperand(), BB, R, + TTI)) { + Changed = true; + it = BB->begin(); + e = BB->end(); + continue; } + } + } // Try to vectorize horizontal reductions feeding into a return. - if (ReturnInst *RI = dyn_cast<ReturnInst>(it)) - if (RI->getNumOperands() != 0) - if (BinaryOperator *BinOp = - dyn_cast<BinaryOperator>(RI->getOperand(0))) { - DEBUG(dbgs() << "SLP: Found a return to vectorize.\n"); - if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI, - R.getMinVecRegSize()) || - tryToVectorizePair(BinOp->getOperand(0), BinOp->getOperand(1), - R)) { - Changed = true; - it = BB->begin(); - e = BB->end(); - continue; - } + if (ReturnInst *RI = dyn_cast<ReturnInst>(it)) { + if (RI->getNumOperands() != 0) { + // Try to match and vectorize a horizontal reduction. + if (vectorizeRootInstruction(nullptr, RI->getOperand(0), BB, R, TTI)) { + Changed = true; + it = BB->begin(); + e = BB->end(); + continue; } + } + } // Try to vectorize trees that start at compare instructions. if (CmpInst *CI = dyn_cast<CmpInst>(it)) { @@ -4672,16 +4980,14 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { continue; } - for (int i = 0; i < 2; ++i) { - if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) { - if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) { - Changed = true; - // We would like to start over since some instructions are deleted - // and the iterator may become invalid value. - it = BB->begin(); - e = BB->end(); - break; - } + for (int I = 0; I < 2; ++I) { + if (vectorizeRootInstruction(nullptr, CI->getOperand(I), BB, R, TTI)) { + Changed = true; + // We would like to start over since some instructions are deleted + // and the iterator may become invalid value. + it = BB->begin(); + e = BB->end(); + break; } } continue; |
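Finally, a usage sketch for the -view-slp-tree flag defined at the top of this patch (a hypothetical invocation: it assumes a Graphviz-enabled developer build and uses the pre-existing -slp-vectorizer pass flag):

    opt -slp-vectorizer -view-slp-tree -S input.ll -o /dev/null

With the flag set, getTreeCost hands the current VectorizableTree to ViewGraph with the cost summary string as the graph title; gather entries are drawn in red (getNodeAttributes), splat bundles are labeled '<splat>', and scalars with out-of-tree users get an '<extract>' suffix (getNodeLabel).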