Diffstat (limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r--  lib/Transforms/Vectorize/SLPVectorizer.cpp  1362
1 file changed, 898 insertions, 464 deletions
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 2e856a7e6802..27a86c0bca91 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1,9 +1,8 @@
//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -106,6 +105,10 @@ using namespace slpvectorizer;
STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
+cl::opt<bool>
+ llvm::RunSLPVectorization("vectorize-slp", cl::init(false), cl::Hidden,
+ cl::desc("Run the SLP vectorization passes"));
+
static cl::opt<int>
SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
cl::desc("Only vectorize if you gain more than this "
@@ -207,6 +210,13 @@ static bool isSplat(ArrayRef<Value *> VL) {
return true;
}
+/// \returns True if \p I is commutative; handles CmpInst as well as Instruction.
+static bool isCommutative(Instruction *I) {
+ if (auto *IC = dyn_cast<CmpInst>(I))
+ return IC->isCommutative();
+ return I->isCommutative();
+}
+
/// Checks if the vector of instructions can be represented as a shuffle, like:
/// %x0 = extractelement <4 x i8> %x, i32 0
/// %x3 = extractelement <4 x i8> %x, i32 3
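The isCommutative() wrapper added above dispatches to CmpInst::isCommutative() because for compares the predicate, not the opcode, decides whether the operands commute: icmp eq commutes while icmp slt does not. A minimal standalone sketch of that distinction (MockCmp and Pred are illustrative names, not LLVM types):

    #include <cassert>

    enum class Pred { EQ, SLT };

    struct MockCmp {
      Pred P;
      // Only equality-style predicates commute: (x == y) <=> (y == x),
      // whereas (x < y) is not the same as (y < x).
      bool isCommutative() const { return P == Pred::EQ; }
    };

    int main() {
      assert(MockCmp{Pred::EQ}.isCommutative());
      assert(!MockCmp{Pred::SLT}.isCommutative());
      return 0;
    }
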
@@ -438,8 +448,9 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
case Instruction::Call: {
CallInst *CI = cast<CallInst>(UserInst);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- if (hasVectorInstrinsicScalarOpd(ID, 1)) {
- return (CI->getArgOperand(1) == Scalar);
+ for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
+ if (hasVectorInstrinsicScalarOpd(ID, i))
+ return (CI->getArgOperand(i) == Scalar);
}
LLVM_FALLTHROUGH;
}
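This hunk, together with the matching change in the Call handling of buildTree_rec() further down, generalizes the old argument-1-only special case: every argument flagged by hasVectorInstrinsicScalarOpd() stays scalar, so it must be the same value in every lane of a call bundle. A small self-contained sketch of that per-lane check, with made-up names (MockCall, isScalarOperand) standing in for the LLVM API:

    #include <cassert>
    #include <vector>

    struct MockCall { std::vector<int> Args; };

    // Pretend operand index 1 is the scalar operand of this intrinsic
    // (e.g. the exponent of powi or the i1 flag of ctlz/cttz).
    static bool isScalarOperand(unsigned OpIdx) { return OpIdx == 1; }

    // A bundle of calls is only vectorizable if every scalar operand holds the
    // same value in all lanes; vector operands may differ per lane.
    static bool scalarArgsMatch(const std::vector<MockCall> &Bundle) {
      for (unsigned Op = 0, E = Bundle[0].Args.size(); Op != E; ++Op)
        if (isScalarOperand(Op))
          for (const MockCall &C : Bundle)
            if (C.Args[Op] != Bundle[0].Args[Op])
              return false;
      return true;
    }

    int main() {
      std::vector<MockCall> Ok  = {{{1, 5}}, {{2, 5}}, {{3, 5}}, {{4, 5}}};
      std::vector<MockCall> Bad = {{{1, 5}}, {{2, 6}}, {{3, 5}}, {{4, 5}}};
      assert(scalarArgsMatch(Ok));
      assert(!scalarArgsMatch(Bad));
      return 0;
    }
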
@@ -474,6 +485,8 @@ namespace slpvectorizer {
/// Bottom Up SLP Vectorizer.
class BoUpSLP {
+ struct TreeEntry;
+
public:
using ValueList = SmallVector<Value *, 8>;
using InstrList = SmallVector<Instruction *, 16>;
@@ -517,7 +530,7 @@ public:
/// \returns the cost incurred by unwanted spills and fills, caused by
/// holding live values over call sites.
- int getSpillCost();
+ int getSpillCost() const;
/// \returns the vectorization cost of the subtree that starts at \p VL.
/// A negative number means that this is profitable.
@@ -576,7 +589,7 @@ public:
/// the stored value. Otherwise, the size is the width of the largest loaded
/// value reaching V. This method is used by the vectorizer to calculate
/// vectorization factors.
- unsigned getVectorElementSize(Value *V);
+ unsigned getVectorElementSize(Value *V) const;
/// Compute the minimum type sizes required to represent the entries in a
/// vectorizable tree.
@@ -599,13 +612,512 @@ public:
/// \returns True if the VectorizableTree is both tiny and not fully
/// vectorizable. We do not vectorize such trees.
- bool isTreeTinyAndNotFullyVectorizable();
+ bool isTreeTinyAndNotFullyVectorizable() const;
OptimizationRemarkEmitter *getORE() { return ORE; }
-private:
- struct TreeEntry;
+ /// This structure holds any data we need about the edges being traversed
+ /// during buildTree_rec(). We keep track of:
+ /// (i) the user TreeEntry index, and
+ /// (ii) the index of the edge.
+ struct EdgeInfo {
+ EdgeInfo() = default;
+ EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
+ : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
+ /// The user TreeEntry.
+ TreeEntry *UserTE = nullptr;
+ /// The operand index of the use.
+ unsigned EdgeIdx = UINT_MAX;
+#ifndef NDEBUG
+ friend inline raw_ostream &operator<<(raw_ostream &OS,
+ const BoUpSLP::EdgeInfo &EI) {
+ EI.dump(OS);
+ return OS;
+ }
+ /// Debug print.
+ void dump(raw_ostream &OS) const {
+ OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
+ << " EdgeIdx:" << EdgeIdx << "}";
+ }
+ LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
+#endif
+ };
+
+ /// A helper data structure to hold the operands of a vector of instructions.
+ /// This supports a fixed vector length for all operand vectors.
+ class VLOperands {
+ /// For each operand we need (i) the value, and (ii) the opcode that it
+ /// would be attached to if the expression was in a left-linearized form.
+ /// This is required to avoid illegal operand reordering.
+ /// For example:
+ /// \verbatim
+ /// 0 Op1
+ /// |/
+ /// Op1 Op2 Linearized + Op2
+ /// \ / ----------> |/
+ /// - -
+ ///
+ /// Op1 - Op2 (0 + Op1) - Op2
+ /// \endverbatim
+ ///
+ /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
+ ///
+ /// Another way to think of this is to track all the operations across the
+ /// path from the operand all the way to the root of the tree and to
+ /// calculate the operation that corresponds to this path. For example, the
+ /// path from Op2 to the root crosses the RHS of the '-', therefore the
+ /// corresponding operation is a '-' (which matches the one in the
+ /// linearized tree, as shown above).
+ ///
+ /// For lack of a better term, we refer to this operation as Accumulated
+ /// Path Operation (APO).
+ struct OperandData {
+ OperandData() = default;
+ OperandData(Value *V, bool APO, bool IsUsed)
+ : V(V), APO(APO), IsUsed(IsUsed) {}
+ /// The operand value.
+ Value *V = nullptr;
+ /// TreeEntries only allow a single opcode, or an alternate sequence of
+ /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
+ /// APO. It is set to 'true' if 'V' is attached to an inverse operation
+ /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
+ /// (e.g., Add/Mul).
+ bool APO = false;
+ /// Helper data for the reordering function.
+ bool IsUsed = false;
+ };
+
+ /// During operand reordering, we are trying to select the operand at lane
+ /// that matches best with the operand at the neighboring lane. Our
+ /// selection is based on the type of value we are looking for. For example,
+ /// if the neighboring lane has a load, we need to look for a load that is
+ /// accessing a consecutive address. These strategies are summarized in the
+ /// 'ReorderingMode' enumerator.
+ enum class ReorderingMode {
+ Load, ///< Matching loads to consecutive memory addresses
+ Opcode, ///< Matching instructions based on opcode (same or alternate)
+ Constant, ///< Matching constants
+ Splat, ///< Matching the same instruction multiple times (broadcast)
+ Failed, ///< We failed to create a vectorizable group
+ };
+
+ using OperandDataVec = SmallVector<OperandData, 2>;
+
+ /// A vector of operand vectors.
+ SmallVector<OperandDataVec, 4> OpsVec;
+
+ const DataLayout &DL;
+ ScalarEvolution &SE;
+
+ /// \returns the operand data at \p OpIdx and \p Lane.
+ OperandData &getData(unsigned OpIdx, unsigned Lane) {
+ return OpsVec[OpIdx][Lane];
+ }
+
+ /// \returns the operand data at \p OpIdx and \p Lane. Const version.
+ const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
+ return OpsVec[OpIdx][Lane];
+ }
+
+ /// Clears the used flag for all entries.
+ void clearUsed() {
+ for (unsigned OpIdx = 0, NumOperands = getNumOperands();
+ OpIdx != NumOperands; ++OpIdx)
+ for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
+ ++Lane)
+ OpsVec[OpIdx][Lane].IsUsed = false;
+ }
+
+ /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
+ void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
+ std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
+ }
+
+ // Search all operands in Ops[*][Lane] for the one that matches best
+ // Ops[OpIdx][LastLane] and return its operand index.
+ // If no good match can be found, return None.
+ Optional<unsigned>
+ getBestOperand(unsigned OpIdx, int Lane, int LastLane,
+ ArrayRef<ReorderingMode> ReorderingModes) {
+ unsigned NumOperands = getNumOperands();
+
+ // The operand of the previous lane at OpIdx.
+ Value *OpLastLane = getData(OpIdx, LastLane).V;
+
+ // Our strategy mode for OpIdx.
+ ReorderingMode RMode = ReorderingModes[OpIdx];
+
+ // The linearized opcode of the operand at OpIdx, Lane.
+ bool OpIdxAPO = getData(OpIdx, Lane).APO;
+
+ const unsigned BestScore = 2;
+ const unsigned GoodScore = 1;
+
+ // The best operand index and its score.
+ // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
+ // are using the score to differentiate between the two.
+ struct BestOpData {
+ Optional<unsigned> Idx = None;
+ unsigned Score = 0;
+ } BestOp;
+
+ // Iterate through all unused operands and look for the best.
+ for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
+ // Get the operand at Idx and Lane.
+ OperandData &OpData = getData(Idx, Lane);
+ Value *Op = OpData.V;
+ bool OpAPO = OpData.APO;
+
+ // Skip already selected operands.
+ if (OpData.IsUsed)
+ continue;
+
+ // Skip if we are trying to move the operand to a position with a
+ // different opcode in the linearized tree form. This would break the
+ // semantics.
+ if (OpAPO != OpIdxAPO)
+ continue;
+
+ // Look for an operand that matches the current mode.
+ switch (RMode) {
+ case ReorderingMode::Load:
+ if (isa<LoadInst>(Op)) {
+ // Figure out which is left and right, so that we can check for
+ // consecutive loads
+ bool LeftToRight = Lane > LastLane;
+ Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
+ Value *OpRight = (LeftToRight) ? Op : OpLastLane;
+ if (isConsecutiveAccess(cast<LoadInst>(OpLeft),
+ cast<LoadInst>(OpRight), DL, SE))
+ BestOp.Idx = Idx;
+ }
+ break;
+ case ReorderingMode::Opcode:
+ // We accept both Instructions and Undefs, but with different scores.
+ if ((isa<Instruction>(Op) && isa<Instruction>(OpLastLane) &&
+ cast<Instruction>(Op)->getOpcode() ==
+ cast<Instruction>(OpLastLane)->getOpcode()) ||
+ (isa<UndefValue>(OpLastLane) && isa<Instruction>(Op)) ||
+ isa<UndefValue>(Op)) {
+ // An instruction has a higher score than an undef.
+ unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
+ if (Score > BestOp.Score) {
+ BestOp.Idx = Idx;
+ BestOp.Score = Score;
+ }
+ }
+ break;
+ case ReorderingMode::Constant:
+ if (isa<Constant>(Op)) {
+ unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
+ if (Score > BestOp.Score) {
+ BestOp.Idx = Idx;
+ BestOp.Score = Score;
+ }
+ }
+ break;
+ case ReorderingMode::Splat:
+ if (Op == OpLastLane)
+ BestOp.Idx = Idx;
+ break;
+ case ReorderingMode::Failed:
+ return None;
+ }
+ }
+
+ if (BestOp.Idx) {
+ getData(BestOp.Idx.getValue(), Lane).IsUsed = true;
+ return BestOp.Idx;
+ }
+ // If we could not find a good match return None.
+ return None;
+ }
+
+ /// Helper for reorder(). \returns the lane that we should start
+ /// reordering from. This is the one which has the least number of operands
+ /// that can freely move about.
+ unsigned getBestLaneToStartReordering() const {
+ unsigned BestLane = 0;
+ unsigned Min = UINT_MAX;
+ for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
+ ++Lane) {
+ unsigned NumFreeOps = getMaxNumOperandsThatCanBeReordered(Lane);
+ if (NumFreeOps < Min) {
+ Min = NumFreeOps;
+ BestLane = Lane;
+ }
+ }
+ return BestLane;
+ }
+
+ /// \returns the maximum number of operands that are allowed to be reordered
+ /// for \p Lane. This is used as a heuristic for selecting the first lane to
+ /// start operand reordering.
+ unsigned getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
+ unsigned CntTrue = 0;
+ unsigned NumOperands = getNumOperands();
+ // Operands with the same APO can be reordered. We therefore need to count
+ // how many of them we have for each APO, like this: Cnt[APO] = x.
+ // Since we only have two APOs, namely true and false, we can avoid using
+ // a map. Instead we can simply count the number of operands that
+ // correspond to one of them (in this case the 'true' APO), and calculate
+ // the other by subtracting it from the total number of operands.
+ for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx)
+ if (getData(OpIdx, Lane).APO)
+ ++CntTrue;
+ unsigned CntFalse = NumOperands - CntTrue;
+ return std::max(CntTrue, CntFalse);
+ }
+
+ /// Go through the instructions in VL and append their operands.
+ void appendOperandsOfVL(ArrayRef<Value *> VL) {
+ assert(!VL.empty() && "Bad VL");
+ assert((empty() || VL.size() == getNumLanes()) &&
+ "Expected same number of lanes");
+ assert(isa<Instruction>(VL[0]) && "Expected instruction");
+ unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
+ OpsVec.resize(NumOperands);
+ unsigned NumLanes = VL.size();
+ for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
+ OpsVec[OpIdx].resize(NumLanes);
+ for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+ assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
+ // Our tree has just 3 nodes: the root and two operands.
+ // It is therefore trivial to get the APO. We only need to check the
+ // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
+ // RHS operand. The LHS operand of both add and sub is never attached
+ // to an inverse operation in the linearized form, therefore its APO
+ // is false. The RHS is true only if VL[Lane] is an inverse operation.
+
+ // Since operand reordering is performed on groups of commutative
+ // operations or alternating sequences (e.g., +, -), we can safely
+ // tell the inverse operations by checking commutativity.
+ bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
+ bool APO = (OpIdx == 0) ? false : IsInverseOperation;
+ OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
+ APO, false};
+ }
+ }
+ }
+
+ /// \returns the number of operands.
+ unsigned getNumOperands() const { return OpsVec.size(); }
+
+ /// \returns the number of lanes.
+ unsigned getNumLanes() const { return OpsVec[0].size(); }
+
+ /// \returns the operand value at \p OpIdx and \p Lane.
+ Value *getValue(unsigned OpIdx, unsigned Lane) const {
+ return getData(OpIdx, Lane).V;
+ }
+ /// \returns true if the data structure is empty.
+ bool empty() const { return OpsVec.empty(); }
+
+ /// Clears the data.
+ void clear() { OpsVec.clear(); }
+
+ /// \returns true if there are enough operands identical to \p Op to fill
+ /// the whole vector.
+ /// Note: This modifies the 'IsUsed' flag, so a clearUsed() must follow.
+ bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
+ bool OpAPO = getData(OpIdx, Lane).APO;
+ for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
+ if (Ln == Lane)
+ continue;
+ // This is set to true if we found a candidate for broadcast at Lane.
+ bool FoundCandidate = false;
+ for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
+ OperandData &Data = getData(OpI, Ln);
+ if (Data.APO != OpAPO || Data.IsUsed)
+ continue;
+ if (Data.V == Op) {
+ FoundCandidate = true;
+ Data.IsUsed = true;
+ break;
+ }
+ }
+ if (!FoundCandidate)
+ return false;
+ }
+ return true;
+ }
+
+ public:
+ /// Initialize with all the operands of the instruction vector \p RootVL.
+ VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL,
+ ScalarEvolution &SE)
+ : DL(DL), SE(SE) {
+ // Append all the operands of RootVL.
+ appendOperandsOfVL(RootVL);
+ }
+
+ /// \returns a value vector with the operands across all lanes for the
+ /// operand at \p OpIdx.
+ ValueList getVL(unsigned OpIdx) const {
+ ValueList OpVL(OpsVec[OpIdx].size());
+ assert(OpsVec[OpIdx].size() == getNumLanes() &&
+ "Expected same num of lanes across all operands");
+ for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
+ OpVL[Lane] = OpsVec[OpIdx][Lane].V;
+ return OpVL;
+ }
+
+ // Performs operand reordering for 2 or more operands.
+ // The operands are reordered in place in OpsVec[OpIdx][Lane] and can be
+ // read back with getVL(OpIdx).
+ void reorder() {
+ unsigned NumOperands = getNumOperands();
+ unsigned NumLanes = getNumLanes();
+ // Each operand has its own mode. We are using this mode to help us select
+ // the instructions for each lane, so that they match best with the ones
+ // we have selected so far.
+ SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
+
+ // This is a greedy single-pass algorithm. We are going over each lane
+ // once and deciding on the best order right away with no back-tracking.
+ // However, in order to increase its effectiveness, we start with the lane
+ // that has operands that can move the least. For example, given the
+ // following lanes:
+ // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
+ // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
+ // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
+ // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
+ // we will start at Lane 1, since the operands of the subtraction cannot
+ // be reordered. Then we will visit the rest of the lanes in a circular
+ // fashion. That is, Lane 2, then Lane 0, and finally Lane 3.
+
+ // Find the first lane that we will start our search from.
+ unsigned FirstLane = getBestLaneToStartReordering();
+
+ // Initialize the modes.
+ for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
+ Value *OpLane0 = getValue(OpIdx, FirstLane);
+ // Keep track if we have instructions with all the same opcode on one
+ // side.
+ if (isa<LoadInst>(OpLane0))
+ ReorderingModes[OpIdx] = ReorderingMode::Load;
+ else if (isa<Instruction>(OpLane0)) {
+ // Check if OpLane0 should be broadcast.
+ if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
+ ReorderingModes[OpIdx] = ReorderingMode::Splat;
+ else
+ ReorderingModes[OpIdx] = ReorderingMode::Opcode;
+ }
+ else if (isa<Constant>(OpLane0))
+ ReorderingModes[OpIdx] = ReorderingMode::Constant;
+ else if (isa<Argument>(OpLane0))
+ // Our best hope is a Splat. It may save some cost in some cases.
+ ReorderingModes[OpIdx] = ReorderingMode::Splat;
+ else
+ // NOTE: This should be unreachable.
+ ReorderingModes[OpIdx] = ReorderingMode::Failed;
+ }
+
+ // If the initial strategy fails for any of the operand indexes, then we
+ // perform reordering again in a second pass. This helps avoid assigning
+ // high priority to the failed strategy, and should improve reordering for
+ // the non-failed operand indexes.
+ for (int Pass = 0; Pass != 2; ++Pass) {
+ // Skip the second pass if the first pass did not fail.
+ bool StrategyFailed = false;
+ // Mark all operand data as free to use.
+ clearUsed();
+ // We keep the original operand order for the FirstLane, so reorder the
+ // rest of the lanes. We are visiting the nodes in a circular fashion,
+ // using FirstLane as the center point and increasing the radius
+ // distance.
+ for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
+ // Visit the lane on the right and then the lane on the left.
+ for (int Direction : {+1, -1}) {
+ int Lane = FirstLane + Direction * Distance;
+ if (Lane < 0 || Lane >= (int)NumLanes)
+ continue;
+ int LastLane = Lane - Direction;
+ assert(LastLane >= 0 && LastLane < (int)NumLanes &&
+ "Out of bounds");
+ // Look for a good match for each operand.
+ for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
+ // Search for the operand that best matches the one at OpIdx in LastLane.
+ Optional<unsigned> BestIdx =
+ getBestOperand(OpIdx, Lane, LastLane, ReorderingModes);
+ // By not selecting a value, we allow the operands that follow to
+ // select a better matching value. We will get a non-null value in
+ // the next run of getBestOperand().
+ if (BestIdx) {
+ // Swap the current operand with the one returned by
+ // getBestOperand().
+ swap(OpIdx, BestIdx.getValue(), Lane);
+ } else {
+ // We failed to find a best operand, set mode to 'Failed'.
+ ReorderingModes[OpIdx] = ReorderingMode::Failed;
+ // Enable the second pass.
+ StrategyFailed = true;
+ }
+ }
+ }
+ }
+ // Skip second pass if the strategy did not fail.
+ if (!StrategyFailed)
+ break;
+ }
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
+ switch (RMode) {
+ case ReorderingMode::Load:
+ return "Load";
+ case ReorderingMode::Opcode:
+ return "Opcode";
+ case ReorderingMode::Constant:
+ return "Constant";
+ case ReorderingMode::Splat:
+ return "Splat";
+ case ReorderingMode::Failed:
+ return "Failed";
+ }
+ llvm_unreachable("Unimplemented Reordering Type");
+ }
+
+ LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
+ raw_ostream &OS) {
+ return OS << getModeStr(RMode);
+ }
+
+ /// Debug print.
+ LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
+ printMode(RMode, dbgs());
+ }
+
+ friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
+ return printMode(RMode, OS);
+ }
+
+ LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
+ const unsigned Indent = 2;
+ unsigned Cnt = 0;
+ for (const OperandDataVec &OpDataVec : OpsVec) {
+ OS << "Operand " << Cnt++ << "\n";
+ for (const OperandData &OpData : OpDataVec) {
+ OS.indent(Indent) << "{";
+ if (Value *V = OpData.V)
+ OS << *V;
+ else
+ OS << "null";
+ OS << ", APO:" << OpData.APO << "}\n";
+ }
+ OS << "\n";
+ }
+ return OS;
+ }
+
+ /// Debug print.
+ LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
+#endif
+ };
+
+private:
/// Checks if all users of \p I are the part of the vectorization tree.
bool areAllUsersVectorized(Instruction *I) const;
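A compact standalone model of the two ideas the VLOperands class above combines: each operand carries an APO bit saying whether it sits under an effective inverse operation ('-', '/') once the expression is left-linearized, and the greedy single-pass reordering may swap operands within a lane only when their APO bits agree. Everything here (Operand, Lane, reorder, the single-character matching rule) is an illustrative simplification of the real load/opcode/splat scoring, not the class itself:

    #include <cassert>
    #include <cstddef>
    #include <string>
    #include <utility>
    #include <vector>

    struct Operand {
      std::string V; // stand-in for llvm::Value*
      bool APO;      // true if the value sits under an effective '-'/'/' path
    };
    using Lane = std::vector<Operand>; // the two operands of one scalar

    // Greedy single pass over the lanes: swap the two operands of a lane if
    // that makes column 0 agree with column 0 of the previous lane, but only
    // when both operands carry the same APO (swapping across a '-' would
    // change the computed value).
    static void reorder(std::vector<Lane> &Lanes) {
      for (std::size_t L = 1; L < Lanes.size(); ++L) {
        Lane &Cur = Lanes[L];
        const Lane &Prev = Lanes[L - 1];
        bool Swappable = Cur[0].APO == Cur[1].APO;
        if (Swappable && Cur[0].V[0] != Prev[0].V[0] &&
            Cur[1].V[0] == Prev[0].V[0])
          std::swap(Cur[0], Cur[1]);
      }
    }

    int main() {
      // Four lanes of a[i] + b[i], except lane 2 was written as b[2] + a[2].
      std::vector<Lane> Lanes = {
          {{"a0", false}, {"b0", false}},
          {{"a1", false}, {"b1", false}},
          {{"b2", false}, {"a2", false}}, // addition commutes, so it may swap
          {{"a3", false}, {"b3", false}},
      };
      reorder(Lanes);
      for (const Lane &Ln : Lanes)
        assert(Ln[0].V[0] == 'a' && Ln[1].V[0] == 'b'); // columns now uniform
      return 0;
    }
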
@@ -613,7 +1125,8 @@ private:
int getEntryCost(TreeEntry *E);
/// This is the recursive part of buildTree.
- void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int);
+ void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
+ const EdgeInfo &EI);
/// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
/// be vectorized to use the original vector (or aggregate "bitcast" to a
@@ -631,12 +1144,12 @@ private:
/// \returns the scalarization cost for this type. Scalarization in this
/// context means the creation of vectors from a group of scalars.
- int getGatherCost(Type *Ty, const DenseSet<unsigned> &ShuffledIndices);
+ int getGatherCost(Type *Ty, const DenseSet<unsigned> &ShuffledIndices) const;
/// \returns the scalarization cost for this list of values. Assuming that
/// this subtree gets vectorized, we may need to extract the values from the
/// roots. This method calculates the cost of extracting the values.
- int getGatherCost(ArrayRef<Value *> VL);
+ int getGatherCost(ArrayRef<Value *> VL) const;
/// Set the Builder insert point to one after the last instruction in
/// the bundle
@@ -648,22 +1161,18 @@ private:
/// \returns whether the VectorizableTree is fully vectorizable and will
/// be beneficial even the tree height is tiny.
- bool isFullyVectorizableTinyTree();
+ bool isFullyVectorizableTinyTree() const;
- /// \reorder commutative operands in alt shuffle if they result in
- /// vectorized code.
- void reorderAltShuffleOperands(const InstructionsState &S,
- ArrayRef<Value *> VL,
- SmallVectorImpl<Value *> &Left,
- SmallVectorImpl<Value *> &Right);
-
- /// \reorder commutative operands to get better probability of
+ /// Reorder commutative or alt operands to get better probability of
/// generating vectorized code.
- void reorderInputsAccordingToOpcode(unsigned Opcode, ArrayRef<Value *> VL,
- SmallVectorImpl<Value *> &Left,
- SmallVectorImpl<Value *> &Right);
+ static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
+ SmallVectorImpl<Value *> &Left,
+ SmallVectorImpl<Value *> &Right,
+ const DataLayout &DL,
+ ScalarEvolution &SE);
struct TreeEntry {
- TreeEntry(std::vector<TreeEntry> &Container) : Container(Container) {}
+ using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
+ TreeEntry(VecTreeTy &Container) : Container(Container) {}
/// \returns true if the scalars in VL are equal to this entry.
bool isSame(ArrayRef<Value *> VL) const {
@@ -696,20 +1205,103 @@ private:
/// to be a pointer and needs to be able to initialize the child iterator.
/// Thus we need a reference back to the container to translate the indices
/// to entries.
- std::vector<TreeEntry> &Container;
+ VecTreeTy &Container;
/// The TreeEntry index containing the user of this entry. We can actually
/// have multiple users so the data structure is not truly a tree.
- SmallVector<int, 1> UserTreeIndices;
+ SmallVector<EdgeInfo, 1> UserTreeIndices;
+
+ /// The index of this treeEntry in VectorizableTree.
+ int Idx = -1;
+
+ private:
+ /// The operands of each instruction in each lane Operands[op_index][lane].
+ /// Note: This helps avoid the replication of the code that performs the
+ /// reordering of operands during buildTree_rec() and vectorizeTree().
+ SmallVector<ValueList, 2> Operands;
+
+ public:
+ /// Set this bundle's \p OpIdx'th operand to \p OpVL.
+ void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL,
+ ArrayRef<unsigned> ReuseShuffleIndices) {
+ if (Operands.size() < OpIdx + 1)
+ Operands.resize(OpIdx + 1);
+ assert(Operands[OpIdx].size() == 0 && "Already resized?");
+ Operands[OpIdx].resize(Scalars.size());
+ for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane)
+ Operands[OpIdx][Lane] = (!ReuseShuffleIndices.empty())
+ ? OpVL[ReuseShuffleIndices[Lane]]
+ : OpVL[Lane];
+ }
+
+ /// If there is a user TreeEntry, then set its operand.
+ void trySetUserTEOperand(const EdgeInfo &UserTreeIdx,
+ ArrayRef<Value *> OpVL,
+ ArrayRef<unsigned> ReuseShuffleIndices) {
+ if (UserTreeIdx.UserTE)
+ UserTreeIdx.UserTE->setOperand(UserTreeIdx.EdgeIdx, OpVL,
+ ReuseShuffleIndices);
+ }
+
+ /// \returns the \p OpIdx operand of this TreeEntry.
+ ValueList &getOperand(unsigned OpIdx) {
+ assert(OpIdx < Operands.size() && "Off bounds");
+ return Operands[OpIdx];
+ }
+
+ /// \return the single \p OpIdx operand.
+ Value *getSingleOperand(unsigned OpIdx) const {
+ assert(OpIdx < Operands.size() && "Off bounds");
+ assert(!Operands[OpIdx].empty() && "No operand available");
+ return Operands[OpIdx][0];
+ }
+
+#ifndef NDEBUG
+ /// Debug printer.
+ LLVM_DUMP_METHOD void dump() const {
+ dbgs() << Idx << ".\n";
+ for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
+ dbgs() << "Operand " << OpI << ":\n";
+ for (const Value *V : Operands[OpI])
+ dbgs().indent(2) << *V << "\n";
+ }
+ dbgs() << "Scalars: \n";
+ for (Value *V : Scalars)
+ dbgs().indent(2) << *V << "\n";
+ dbgs() << "NeedToGather: " << NeedToGather << "\n";
+ dbgs() << "VectorizedValue: ";
+ if (VectorizedValue)
+ dbgs() << *VectorizedValue;
+ else
+ dbgs() << "NULL";
+ dbgs() << "\n";
+ dbgs() << "ReuseShuffleIndices: ";
+ if (ReuseShuffleIndices.empty())
+ dbgs() << "Emtpy";
+ else
+ for (unsigned Idx : ReuseShuffleIndices)
+ dbgs() << Idx << ", ";
+ dbgs() << "\n";
+ dbgs() << "ReorderIndices: ";
+ for (unsigned Idx : ReorderIndices)
+ dbgs() << Idx << ", ";
+ dbgs() << "\n";
+ dbgs() << "UserTreeIndices: ";
+ for (const auto &EInfo : UserTreeIndices)
+ dbgs() << EInfo << ", ";
+ dbgs() << "\n";
+ }
+#endif
};
/// Create a new VectorizableTree entry.
- void newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, int &UserTreeIdx,
- ArrayRef<unsigned> ReuseShuffleIndices = None,
- ArrayRef<unsigned> ReorderIndices = None) {
- VectorizableTree.emplace_back(VectorizableTree);
- int idx = VectorizableTree.size() - 1;
- TreeEntry *Last = &VectorizableTree[idx];
+ TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
+ const EdgeInfo &UserTreeIdx,
+ ArrayRef<unsigned> ReuseShuffleIndices = None,
+ ArrayRef<unsigned> ReorderIndices = None) {
+ VectorizableTree.push_back(llvm::make_unique<TreeEntry>(VectorizableTree));
+ TreeEntry *Last = VectorizableTree.back().get();
+ Last->Idx = VectorizableTree.size() - 1;
Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
Last->NeedToGather = !Vectorized;
Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
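A small sketch of the indexing that TreeEntry::setOperand() above performs when ReuseShuffleIndices is non-empty: lane i of the operand column reads OpVL[ReuseShuffleIndices[i]] instead of OpVL[i]. The concrete mask and values below are made up for illustration, and how the mask is produced is not modelled:

    #include <cassert>
    #include <string>
    #include <vector>

    // Mirrors the indexing in setOperand(): lane i reads OpVL[Mask[i]] when a
    // reuse mask is present, and OpVL[i] otherwise.
    static std::vector<std::string>
    buildOperandColumn(const std::vector<std::string> &OpVL,
                       const std::vector<unsigned> &ReuseShuffleIndices,
                       unsigned NumLanes) {
      std::vector<std::string> Column(NumLanes);
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane)
        Column[Lane] = ReuseShuffleIndices.empty()
                           ? OpVL[Lane]
                           : OpVL[ReuseShuffleIndices[Lane]];
      return Column;
    }

    int main() {
      std::vector<std::string> OpVL = {"x", "y", "z", "w"};
      std::vector<unsigned> Mask = {0, 0, 1, 1}; // illustrative mask values
      assert(buildOperandColumn(OpVL, Mask, 4) ==
             (std::vector<std::string>{"x", "x", "y", "y"}));
      assert(buildOperandColumn(OpVL, {}, 4) == OpVL); // no mask: copy as-is
      return 0;
    }
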
@@ -718,25 +1310,44 @@ private:
if (Vectorized) {
for (int i = 0, e = VL.size(); i != e; ++i) {
assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
- ScalarToTreeEntry[VL[i]] = idx;
+ ScalarToTreeEntry[VL[i]] = Last->Idx;
}
} else {
MustGather.insert(VL.begin(), VL.end());
}
- if (UserTreeIdx >= 0)
+ if (UserTreeIdx.UserTE)
Last->UserTreeIndices.push_back(UserTreeIdx);
- UserTreeIdx = idx;
+
+ Last->trySetUserTEOperand(UserTreeIdx, VL, ReuseShuffleIndices);
+ return Last;
}
/// -- Vectorization State --
/// Holds all of the tree entries.
- std::vector<TreeEntry> VectorizableTree;
+ TreeEntry::VecTreeTy VectorizableTree;
+
+#ifndef NDEBUG
+ /// Debug printer.
+ LLVM_DUMP_METHOD void dumpVectorizableTree() const {
+ for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
+ VectorizableTree[Id]->dump();
+ dbgs() << "\n";
+ }
+ }
+#endif
TreeEntry *getTreeEntry(Value *V) {
auto I = ScalarToTreeEntry.find(V);
if (I != ScalarToTreeEntry.end())
- return &VectorizableTree[I->second];
+ return VectorizableTree[I->second].get();
+ return nullptr;
+ }
+
+ const TreeEntry *getTreeEntry(Value *V) const {
+ auto I = ScalarToTreeEntry.find(V);
+ if (I != ScalarToTreeEntry.end())
+ return VectorizableTree[I->second].get();
return nullptr;
}
@@ -1246,21 +1857,25 @@ template <> struct GraphTraits<BoUpSLP *> {
/// NodeRef has to be a pointer per the GraphWriter.
using NodeRef = TreeEntry *;
+ using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
+
/// Add the VectorizableTree to the index iterator to be able to return
/// TreeEntry pointers.
struct ChildIteratorType
- : public iterator_adaptor_base<ChildIteratorType,
- SmallVector<int, 1>::iterator> {
- std::vector<TreeEntry> &VectorizableTree;
+ : public iterator_adaptor_base<
+ ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
+ ContainerTy &VectorizableTree;
- ChildIteratorType(SmallVector<int, 1>::iterator W,
- std::vector<TreeEntry> &VT)
+ ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
+ ContainerTy &VT)
: ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
- NodeRef operator*() { return &VectorizableTree[*I]; }
+ NodeRef operator*() { return I->UserTE; }
};
- static NodeRef getEntryNode(BoUpSLP &R) { return &R.VectorizableTree[0]; }
+ static NodeRef getEntryNode(BoUpSLP &R) {
+ return R.VectorizableTree[0].get();
+ }
static ChildIteratorType child_begin(NodeRef N) {
return {N->UserTreeIndices.begin(), N->Container};
@@ -1272,7 +1887,19 @@ template <> struct GraphTraits<BoUpSLP *> {
/// For the node iterator we just need to turn the TreeEntry iterator into a
/// TreeEntry* iterator so that it dereferences to NodeRef.
- using nodes_iterator = pointer_iterator<std::vector<TreeEntry>::iterator>;
+ class nodes_iterator {
+ using ItTy = ContainerTy::iterator;
+ ItTy It;
+
+ public:
+ nodes_iterator(const ItTy &It2) : It(It2) {}
+ NodeRef operator*() { return It->get(); }
+ nodes_iterator operator++() {
+ ++It;
+ return *this;
+ }
+ bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
+ };
static nodes_iterator nodes_begin(BoUpSLP *R) {
return nodes_iterator(R->VectorizableTree.begin());
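A standalone sketch of why the custom nodes_iterator above replaces pointer_iterator: the tree container now holds std::unique_ptr<TreeEntry>, so the graph iterator has to hand out the raw pointer via It->get() rather than the address of the element. Node and node_iterator below are illustrative stand-ins, not the LLVM types:

    #include <cassert>
    #include <memory>
    #include <vector>

    struct Node { int Id; };
    using Container = std::vector<std::unique_ptr<Node>>;

    class node_iterator {
      Container::iterator It;
    public:
      explicit node_iterator(Container::iterator I) : It(I) {}
      Node *operator*() const { return It->get(); } // unwrap the unique_ptr
      node_iterator &operator++() { ++It; return *this; }
      bool operator!=(const node_iterator &O) const { return It != O.It; }
    };

    int main() {
      Container C;
      C.push_back(std::make_unique<Node>(Node{0}));
      C.push_back(std::make_unique<Node>(Node{1}));
      int Sum = 0;
      for (node_iterator I(C.begin()), E(C.end()); I != E; ++I)
        Sum += (*I)->Id;
      assert(Sum == 1);
      return 0;
    }
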
@@ -1331,11 +1958,11 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
UserIgnoreList = UserIgnoreLst;
if (!allSameType(Roots))
return;
- buildTree_rec(Roots, 0, -1);
+ buildTree_rec(Roots, 0, EdgeInfo());
// Collect the values that we need to extract from the tree.
- for (TreeEntry &EIdx : VectorizableTree) {
- TreeEntry *Entry = &EIdx;
+ for (auto &TEPtr : VectorizableTree) {
+ TreeEntry *Entry = TEPtr.get();
// No need to handle users of gathered values.
if (Entry->NeedToGather)
@@ -1393,7 +2020,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
}
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
- int UserTreeIdx) {
+ const EdgeInfo &UserTreeIdx) {
assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
InstructionsState S = getSameOpcode(VL);
@@ -1450,6 +2077,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
E->UserTreeIndices.push_back(UserTreeIdx);
LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
<< ".\n");
+ E->trySetUserTEOperand(UserTreeIdx, VL, None);
return;
}
@@ -1468,8 +2096,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// If any of the scalars is marked as a value that needs to stay scalar, then
// we need to gather the scalars.
+ // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
- if (MustGather.count(VL[i])) {
+ if (MustGather.count(VL[i]) || is_contained(UserIgnoreList, VL[i])) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
@@ -1548,7 +2177,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
}
- newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
@@ -1558,7 +2187,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
PH->getIncomingBlock(i)));
- buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+ buildTree_rec(Operands, Depth + 1, {TE, i});
}
return;
}
@@ -1571,6 +2200,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
++NumOpsWantToKeepOriginalOrder;
newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
ReuseShuffleIndicies);
+ // This is a special case, as it does not gather, but at the same time
+ // we are not extending buildTree_rec() towards the operands.
+ ValueList Op0;
+ Op0.assign(VL.size(), VL0->getOperand(0));
+ VectorizableTree.back()->setOperand(0, Op0, ReuseShuffleIndicies);
return;
}
if (!CurrentOrder.empty()) {
@@ -1588,6 +2222,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
++StoredCurrentOrderAndNum->getSecond();
newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies,
StoredCurrentOrderAndNum->getFirst());
+ // This is a special case, as it does not gather, but at the same time
+ // we are not extending buildTree_rec() towards the operands.
+ ValueList Op0;
+ Op0.assign(VL.size(), VL0->getOperand(0));
+ VectorizableTree.back()->setOperand(0, Op0, ReuseShuffleIndicies);
return;
}
LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
@@ -1693,7 +2332,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
@@ -1702,7 +2341,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+ buildTree_rec(Operands, Depth + 1, {TE, i});
}
return;
}
@@ -1710,10 +2349,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::FCmp: {
// Check that all of the compares have the same predicate.
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
+ CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
Type *ComparedTy = VL0->getOperand(0)->getType();
for (unsigned i = 1, e = VL.size(); i < e; ++i) {
CmpInst *Cmp = cast<CmpInst>(VL[i]);
- if (Cmp->getPredicate() != P0 ||
+ if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
Cmp->getOperand(0)->getType() != ComparedTy) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
@@ -1723,20 +2363,34 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
}
- newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
- for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
- ValueList Operands;
- // Prepare the operand vector.
- for (Value *j : VL)
- Operands.push_back(cast<Instruction>(j)->getOperand(i));
-
- buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+ ValueList Left, Right;
+ if (cast<CmpInst>(VL0)->isCommutative()) {
+ // Commutative predicate - collect + sort operands of the instructions
+ // so that each side is more likely to have the same opcode.
+ assert(P0 == SwapP0 && "Commutative Predicate mismatch");
+ reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
+ } else {
+ // Collect operands - commute if it uses the swapped predicate.
+ for (Value *V : VL) {
+ auto *Cmp = cast<CmpInst>(V);
+ Value *LHS = Cmp->getOperand(0);
+ Value *RHS = Cmp->getOperand(1);
+ if (Cmp->getPredicate() != P0)
+ std::swap(LHS, RHS);
+ Left.push_back(LHS);
+ Right.push_back(RHS);
+ }
}
+
+ buildTree_rec(Left, Depth + 1, {TE, 0});
+ buildTree_rec(Right, Depth + 1, {TE, 1});
return;
}
case Instruction::Select:
+ case Instruction::FNeg:
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
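A standalone sketch of the swapped-predicate handling in the ICmp/FCmp case above: a bundle may mix a predicate with its swapped form (x < y written as y > x), and the swapped lanes are canonicalized by commuting their operands so that every lane is expressed under the first lane's predicate. Pred, MockCmp and the two-predicate universe are illustrative simplifications:

    #include <cassert>
    #include <string>
    #include <utility>
    #include <vector>

    enum class Pred { SLT, SGT };
    static Pred swapped(Pred P) {
      return P == Pred::SLT ? Pred::SGT : Pred::SLT;
    }

    struct MockCmp {
      Pred P;
      std::string LHS, RHS;
    };

    // Express every lane under the predicate of the first lane (P0): lanes
    // that use the swapped predicate get their operands commuted.
    static std::vector<std::pair<std::string, std::string>>
    canonicalize(const std::vector<MockCmp> &Bundle, Pred P0) {
      std::vector<std::pair<std::string, std::string>> Out;
      for (const MockCmp &C : Bundle) {
        assert(C.P == P0 || C.P == swapped(P0));
        if (C.P == P0)
          Out.push_back({C.LHS, C.RHS});
        else
          Out.push_back({C.RHS, C.LHS}); // commute the swapped-predicate lane
      }
      return Out;
    }

    int main() {
      // Lane 0: a0 < b0    Lane 1: b1 > a1  (same comparison, swapped form)
      std::vector<MockCmp> Bundle = {{Pred::SLT, "a0", "b0"},
                                     {Pred::SGT, "b1", "a1"}};
      auto LR = canonicalize(Bundle, Pred::SLT);
      assert(LR[0].first == "a0" && LR[0].second == "b0");
      assert(LR[1].first == "a1" && LR[1].second == "b1"); // lane 1 commuted
      return 0;
    }
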
@@ -1754,17 +2408,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
- case Instruction::Xor:
- newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
+ case Instruction::Xor: {
+ auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
// Sort operands of the instructions so that each side is more likely to
// have the same opcode.
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
ValueList Left, Right;
- reorderInputsAccordingToOpcode(S.getOpcode(), VL, Left, Right);
- buildTree_rec(Left, Depth + 1, UserTreeIdx);
- buildTree_rec(Right, Depth + 1, UserTreeIdx);
+ reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
+ buildTree_rec(Left, Depth + 1, {TE, 0});
+ buildTree_rec(Right, Depth + 1, {TE, 1});
return;
}
@@ -1774,10 +2428,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+ buildTree_rec(Operands, Depth + 1, {TE, i});
}
return;
-
+ }
case Instruction::GetElementPtr: {
// We don't combine GEPs with complicated (nested) indexing.
for (unsigned j = 0; j < VL.size(); ++j) {
@@ -1815,7 +2469,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
}
- newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
for (unsigned i = 0, e = 2; i < e; ++i) {
ValueList Operands;
@@ -1823,7 +2477,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+ buildTree_rec(Operands, Depth + 1, {TE, i});
}
return;
}
@@ -1837,14 +2491,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return;
}
- newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
ValueList Operands;
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(0));
- buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+ buildTree_rec(Operands, Depth + 1, {TE, 0});
return;
}
case Instruction::Call: {
@@ -1860,9 +2514,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return;
}
Function *Int = CI->getCalledFunction();
- Value *A1I = nullptr;
- if (hasVectorInstrinsicScalarOpd(ID, 1))
- A1I = CI->getArgOperand(1);
+ unsigned NumArgs = CI->getNumArgOperands();
+ SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr);
+ for (unsigned j = 0; j != NumArgs; ++j)
+ if (hasVectorInstrinsicScalarOpd(ID, j))
+ ScalarArgs[j] = CI->getArgOperand(j);
for (unsigned i = 1, e = VL.size(); i != e; ++i) {
CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
if (!CI2 || CI2->getCalledFunction() != Int ||
@@ -1874,16 +2530,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
<< "\n");
return;
}
- // ctlz,cttz and powi are special intrinsics whose second argument
- // should be same in order for them to be vectorized.
- if (hasVectorInstrinsicScalarOpd(ID, 1)) {
- Value *A1J = CI2->getArgOperand(1);
- if (A1I != A1J) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
- << " argument " << A1I << "!=" << A1J << "\n");
- return;
+ // Some intrinsics have scalar arguments and these must be the same across
+ // all lanes in order for the calls to be vectorized.
+ for (unsigned j = 0; j != NumArgs; ++j) {
+ if (hasVectorInstrinsicScalarOpd(ID, j)) {
+ Value *A1J = CI2->getArgOperand(j);
+ if (ScalarArgs[j] != A1J) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
+ << " argument " << ScalarArgs[j] << "!=" << A1J
+ << "\n");
+ return;
+ }
}
}
// Verify that the bundle operands are identical between the two calls.
@@ -1899,7 +2558,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
}
- newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
ValueList Operands;
// Prepare the operand vector.
@@ -1907,11 +2566,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
CallInst *CI2 = dyn_cast<CallInst>(j);
Operands.push_back(CI2->getArgOperand(i));
}
- buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+ buildTree_rec(Operands, Depth + 1, {TE, i});
}
return;
}
- case Instruction::ShuffleVector:
+ case Instruction::ShuffleVector: {
// If this is not an alternate sequence of opcode like add-sub
// then do not vectorize this instruction.
if (!S.isAltShuffle()) {
@@ -1920,15 +2579,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
return;
}
- newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
// Reorder operands if reordering would enable vectorization.
if (isa<BinaryOperator>(VL0)) {
ValueList Left, Right;
- reorderAltShuffleOperands(S, VL, Left, Right);
- buildTree_rec(Left, Depth + 1, UserTreeIdx);
- buildTree_rec(Right, Depth + 1, UserTreeIdx);
+ reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
+ buildTree_rec(Left, Depth + 1, {TE, 0});
+ buildTree_rec(Right, Depth + 1, {TE, 1});
return;
}
@@ -1938,10 +2597,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+ buildTree_rec(Operands, Depth + 1, {TE, i});
}
return;
-
+ }
default:
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
@@ -2223,6 +2882,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
int VecCost = TTI->getCmpSelInstrCost(S.getOpcode(), VecTy, MaskTy, VL0);
return ReuseShuffleCost + VecCost - ScalarCost;
}
+ case Instruction::FNeg:
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
@@ -2260,7 +2920,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
ConstantInt *CInt0 = nullptr;
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
const Instruction *I = cast<Instruction>(VL[i]);
- ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(1));
+ unsigned OpIdx = isa<BinaryOperator>(I) ? 1 : 0;
+ ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(OpIdx));
if (!CInt) {
Op2VK = TargetTransformInfo::OK_AnyValue;
Op2VP = TargetTransformInfo::OP_None;
@@ -2413,31 +3074,31 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
}
}
-bool BoUpSLP::isFullyVectorizableTinyTree() {
+bool BoUpSLP::isFullyVectorizableTinyTree() const {
LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
<< VectorizableTree.size() << " is fully vectorizable .\n");
// We only handle trees of heights 1 and 2.
- if (VectorizableTree.size() == 1 && !VectorizableTree[0].NeedToGather)
+ if (VectorizableTree.size() == 1 && !VectorizableTree[0]->NeedToGather)
return true;
if (VectorizableTree.size() != 2)
return false;
// Handle splat and all-constants stores.
- if (!VectorizableTree[0].NeedToGather &&
- (allConstant(VectorizableTree[1].Scalars) ||
- isSplat(VectorizableTree[1].Scalars)))
+ if (!VectorizableTree[0]->NeedToGather &&
+ (allConstant(VectorizableTree[1]->Scalars) ||
+ isSplat(VectorizableTree[1]->Scalars)))
return true;
// Gathering cost would be too much for tiny trees.
- if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
+ if (VectorizableTree[0]->NeedToGather || VectorizableTree[1]->NeedToGather)
return false;
return true;
}
-bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() {
+bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const {
// We can vectorize the tree if its size is greater than or equal to the
// minimum size specified by the MinTreeSize command line option.
if (VectorizableTree.size() >= MinTreeSize)
@@ -2457,19 +3118,19 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() {
return true;
}
-int BoUpSLP::getSpillCost() {
+int BoUpSLP::getSpillCost() const {
// Walk from the bottom of the tree to the top, tracking which values are
// live. When we see a call instruction that is not part of our tree,
// query TTI to see if there is a cost to keeping values live over it
// (for example, if spills and fills are required).
- unsigned BundleWidth = VectorizableTree.front().Scalars.size();
+ unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
int Cost = 0;
SmallPtrSet<Instruction*, 4> LiveValues;
Instruction *PrevInst = nullptr;
- for (const auto &N : VectorizableTree) {
- Instruction *Inst = dyn_cast<Instruction>(N.Scalars[0]);
+ for (const auto &TEPtr : VectorizableTree) {
+ Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
if (!Inst)
continue;
@@ -2494,6 +3155,7 @@ int BoUpSLP::getSpillCost() {
});
// Now find the sequence of instructions between PrevInst and Inst.
+ unsigned NumCalls = 0;
BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
PrevInstIt =
PrevInst->getIterator().getReverse();
@@ -2506,16 +3168,19 @@ int BoUpSLP::getSpillCost() {
// Debug information doesn't impact spill cost.
if ((isa<CallInst>(&*PrevInstIt) &&
!isa<DbgInfoIntrinsic>(&*PrevInstIt)) &&
- &*PrevInstIt != PrevInst) {
- SmallVector<Type*, 4> V;
- for (auto *II : LiveValues)
- V.push_back(VectorType::get(II->getType(), BundleWidth));
- Cost += TTI->getCostOfKeepingLiveOverCall(V);
- }
+ &*PrevInstIt != PrevInst)
+ NumCalls++;
++PrevInstIt;
}
+ if (NumCalls) {
+ SmallVector<Type*, 4> V;
+ for (auto *II : LiveValues)
+ V.push_back(VectorType::get(II->getType(), BundleWidth));
+ Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
+ }
+
PrevInst = Inst;
}
@@ -2527,10 +3192,10 @@ int BoUpSLP::getTreeCost() {
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
<< VectorizableTree.size() << ".\n");
- unsigned BundleWidth = VectorizableTree[0].Scalars.size();
+ unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
- TreeEntry &TE = VectorizableTree[I];
+ TreeEntry &TE = *VectorizableTree[I].get();
// We create duplicate tree entries for gather sequences that have multiple
// uses. However, we should not compute the cost of duplicate sequences.
@@ -2545,10 +3210,11 @@ int BoUpSLP::getTreeCost() {
// existing heuristics based on tree size may yield different results.
//
if (TE.NeedToGather &&
- std::any_of(std::next(VectorizableTree.begin(), I + 1),
- VectorizableTree.end(), [TE](TreeEntry &Entry) {
- return Entry.NeedToGather && Entry.isSame(TE.Scalars);
- }))
+ std::any_of(
+ std::next(VectorizableTree.begin(), I + 1), VectorizableTree.end(),
+ [TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
+ return EntryPtr->NeedToGather && EntryPtr->isSame(TE.Scalars);
+ }))
continue;
int C = getEntryCost(&TE);
@@ -2575,7 +3241,7 @@ int BoUpSLP::getTreeCost() {
// extend the extracted value back to the original type. Here, we account
// for the extract and the added cost of the sign extend if needed.
auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
- auto *ScalarRoot = VectorizableTree[0].Scalars[0];
+ auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
if (MinBWs.count(ScalarRoot)) {
auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
auto Extend =
@@ -2608,17 +3274,17 @@ int BoUpSLP::getTreeCost() {
}
int BoUpSLP::getGatherCost(Type *Ty,
- const DenseSet<unsigned> &ShuffledIndices) {
+ const DenseSet<unsigned> &ShuffledIndices) const {
int Cost = 0;
for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
if (!ShuffledIndices.count(i))
Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
if (!ShuffledIndices.empty())
- Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
+ Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
return Cost;
}
-int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
+int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
// Find the type of the operands in VL.
Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
@@ -2638,221 +3304,19 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
return getGatherCost(VecTy, ShuffledElements);
}
-// Reorder commutative operations in alternate shuffle if the resulting vectors
-// are consecutive loads. This would allow us to vectorize the tree.
-// If we have something like-
-// load a[0] - load b[0]
-// load b[1] + load a[1]
-// load a[2] - load b[2]
-// load a[3] + load b[3]
-// Reordering the second load b[1] load a[1] would allow us to vectorize this
-// code.
-void BoUpSLP::reorderAltShuffleOperands(const InstructionsState &S,
- ArrayRef<Value *> VL,
- SmallVectorImpl<Value *> &Left,
- SmallVectorImpl<Value *> &Right) {
- // Push left and right operands of binary operation into Left and Right
- for (Value *V : VL) {
- auto *I = cast<Instruction>(V);
- assert(S.isOpcodeOrAlt(I) && "Incorrect instruction in vector");
- Left.push_back(I->getOperand(0));
- Right.push_back(I->getOperand(1));
- }
-
- // Reorder if we have a commutative operation and consecutive access
- // are on either side of the alternate instructions.
- for (unsigned j = 0; j < VL.size() - 1; ++j) {
- if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
- if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
- Instruction *VL1 = cast<Instruction>(VL[j]);
- Instruction *VL2 = cast<Instruction>(VL[j + 1]);
- if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
- std::swap(Left[j], Right[j]);
- continue;
- } else if (VL2->isCommutative() &&
- isConsecutiveAccess(L, L1, *DL, *SE)) {
- std::swap(Left[j + 1], Right[j + 1]);
- continue;
- }
- // else unchanged
- }
- }
- if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
- if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
- Instruction *VL1 = cast<Instruction>(VL[j]);
- Instruction *VL2 = cast<Instruction>(VL[j + 1]);
- if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
- std::swap(Left[j], Right[j]);
- continue;
- } else if (VL2->isCommutative() &&
- isConsecutiveAccess(L, L1, *DL, *SE)) {
- std::swap(Left[j + 1], Right[j + 1]);
- continue;
- }
- // else unchanged
- }
- }
- }
-}
-
-// Return true if I should be commuted before adding it's left and right
-// operands to the arrays Left and Right.
-//
-// The vectorizer is trying to either have all elements one side being
-// instruction with the same opcode to enable further vectorization, or having
-// a splat to lower the vectorizing cost.
-static bool shouldReorderOperands(
- int i, unsigned Opcode, Instruction &I, ArrayRef<Value *> Left,
- ArrayRef<Value *> Right, bool AllSameOpcodeLeft, bool AllSameOpcodeRight,
- bool SplatLeft, bool SplatRight, Value *&VLeft, Value *&VRight) {
- VLeft = I.getOperand(0);
- VRight = I.getOperand(1);
- // If we have "SplatRight", try to see if commuting is needed to preserve it.
- if (SplatRight) {
- if (VRight == Right[i - 1])
- // Preserve SplatRight
- return false;
- if (VLeft == Right[i - 1]) {
- // Commuting would preserve SplatRight, but we don't want to break
- // SplatLeft either, i.e. preserve the original order if possible.
- // (FIXME: why do we care?)
- if (SplatLeft && VLeft == Left[i - 1])
- return false;
- return true;
- }
- }
- // Symmetrically handle Right side.
- if (SplatLeft) {
- if (VLeft == Left[i - 1])
- // Preserve SplatLeft
- return false;
- if (VRight == Left[i - 1])
- return true;
- }
-
- Instruction *ILeft = dyn_cast<Instruction>(VLeft);
- Instruction *IRight = dyn_cast<Instruction>(VRight);
-
- // If we have "AllSameOpcodeRight", try to see if the left operands preserves
- // it and not the right, in this case we want to commute.
- if (AllSameOpcodeRight) {
- unsigned RightPrevOpcode = cast<Instruction>(Right[i - 1])->getOpcode();
- if (IRight && RightPrevOpcode == IRight->getOpcode())
- // Do not commute, a match on the right preserves AllSameOpcodeRight
- return false;
- if (ILeft && RightPrevOpcode == ILeft->getOpcode()) {
- // We have a match and may want to commute, but first check if there is
- // not also a match on the existing operands on the Left to preserve
- // AllSameOpcodeLeft, i.e. preserve the original order if possible.
- // (FIXME: why do we care?)
- if (AllSameOpcodeLeft && ILeft &&
- cast<Instruction>(Left[i - 1])->getOpcode() == ILeft->getOpcode())
- return false;
- return true;
- }
- }
- // Symmetrically handle Left side.
- if (AllSameOpcodeLeft) {
- unsigned LeftPrevOpcode = cast<Instruction>(Left[i - 1])->getOpcode();
- if (ILeft && LeftPrevOpcode == ILeft->getOpcode())
- return false;
- if (IRight && LeftPrevOpcode == IRight->getOpcode())
- return true;
- }
- return false;
-}
-
-void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode,
- ArrayRef<Value *> VL,
- SmallVectorImpl<Value *> &Left,
- SmallVectorImpl<Value *> &Right) {
- if (!VL.empty()) {
- // Peel the first iteration out of the loop since there's nothing
- // interesting to do anyway and it simplifies the checks in the loop.
- auto *I = cast<Instruction>(VL[0]);
- Value *VLeft = I->getOperand(0);
- Value *VRight = I->getOperand(1);
- if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
- // Favor having instruction to the right. FIXME: why?
- std::swap(VLeft, VRight);
- Left.push_back(VLeft);
- Right.push_back(VRight);
- }
-
- // Keep track if we have instructions with all the same opcode on one side.
- bool AllSameOpcodeLeft = isa<Instruction>(Left[0]);
- bool AllSameOpcodeRight = isa<Instruction>(Right[0]);
- // Keep track if we have one side with all the same value (broadcast).
- bool SplatLeft = true;
- bool SplatRight = true;
-
- for (unsigned i = 1, e = VL.size(); i != e; ++i) {
- Instruction *I = cast<Instruction>(VL[i]);
- assert(((I->getOpcode() == Opcode && I->isCommutative()) ||
- (I->getOpcode() != Opcode && Instruction::isCommutative(Opcode))) &&
- "Can only process commutative instruction");
- // Commute to favor either a splat or maximizing having the same opcodes on
- // one side.
- Value *VLeft;
- Value *VRight;
- if (shouldReorderOperands(i, Opcode, *I, Left, Right, AllSameOpcodeLeft,
- AllSameOpcodeRight, SplatLeft, SplatRight, VLeft,
- VRight)) {
- Left.push_back(VRight);
- Right.push_back(VLeft);
- } else {
- Left.push_back(VLeft);
- Right.push_back(VRight);
- }
- // Update Splat* and AllSameOpcode* after the insertion.
- SplatRight = SplatRight && (Right[i - 1] == Right[i]);
- SplatLeft = SplatLeft && (Left[i - 1] == Left[i]);
- AllSameOpcodeLeft = AllSameOpcodeLeft && isa<Instruction>(Left[i]) &&
- (cast<Instruction>(Left[i - 1])->getOpcode() ==
- cast<Instruction>(Left[i])->getOpcode());
- AllSameOpcodeRight = AllSameOpcodeRight && isa<Instruction>(Right[i]) &&
- (cast<Instruction>(Right[i - 1])->getOpcode() ==
- cast<Instruction>(Right[i])->getOpcode());
- }
-
- // If one operand end up being broadcast, return this operand order.
- if (SplatRight || SplatLeft)
+// Perform operand reordering on the instructions in VL and return the reordered
+// operands in Left and Right.
+void BoUpSLP::reorderInputsAccordingToOpcode(
+ ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left,
+ SmallVectorImpl<Value *> &Right, const DataLayout &DL,
+ ScalarEvolution &SE) {
+ if (VL.empty())
return;
-
- // Finally check if we can get longer vectorizable chain by reordering
- // without breaking the good operand order detected above.
- // E.g. If we have something like-
- // load a[0] load b[0]
- // load b[1] load a[1]
- // load a[2] load b[2]
- // load a[3] load b[3]
- // Reordering the second load b[1] load a[1] would allow us to vectorize
- // this code and we still retain AllSameOpcode property.
- // FIXME: This load reordering might break AllSameOpcode in some rare cases
- // such as-
- // add a[0],c[0] load b[0]
- // add a[1],c[2] load b[1]
- // b[2] load b[2]
- // add a[3],c[3] load b[3]
- for (unsigned j = 0, e = VL.size() - 1; j < e; ++j) {
- if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
- if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
- if (isConsecutiveAccess(L, L1, *DL, *SE)) {
- std::swap(Left[j + 1], Right[j + 1]);
- continue;
- }
- }
- }
- if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
- if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
- if (isConsecutiveAccess(L, L1, *DL, *SE)) {
- std::swap(Left[j + 1], Right[j + 1]);
- continue;
- }
- }
- }
- // else unchanged
- }
+ VLOperands Ops(VL, DL, SE);
+ // Reorder the operands in place.
+ Ops.reorder();
+ Left = Ops.getVL(0);
+ Right = Ops.getVL(1);
}
void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL,
@@ -3082,13 +3546,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
continue;
}
- // Prepare the operand vector.
- for (Value *V : E->Scalars)
- Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB));
-
Builder.SetInsertPoint(IBB->getTerminator());
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
- Value *Vec = vectorizeTree(Operands);
+ Value *Vec = vectorizeTree(E->getOperand(i));
NewPhi->addIncoming(Vec, IBB);
}
@@ -3099,7 +3559,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::ExtractElement: {
if (!E->NeedToGather) {
- Value *V = VL0->getOperand(0);
+ Value *V = E->getSingleOperand(0);
if (!E->ReorderIndices.empty()) {
OrdersType Mask;
inversePermutation(E->ReorderIndices, Mask);
@@ -3132,11 +3592,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
case Instruction::ExtractValue: {
if (!E->NeedToGather) {
- LoadInst *LI = cast<LoadInst>(VL0->getOperand(0));
+ LoadInst *LI = cast<LoadInst>(E->getSingleOperand(0));
Builder.SetInsertPoint(LI);
PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
- LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment());
+ LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlignment());
Value *NewV = propagateMetadata(V, E->Scalars);
if (!E->ReorderIndices.empty()) {
OrdersType Mask;
@@ -3177,13 +3637,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
- ValueList INVL;
- for (Value *V : E->Scalars)
- INVL.push_back(cast<Instruction>(V)->getOperand(0));
-
setInsertPointAfterBundle(E->Scalars, S);
- Value *InVec = vectorizeTree(INVL);
+ Value *InVec = vectorizeTree(E->getOperand(0));
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
@@ -3202,16 +3658,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
case Instruction::FCmp:
case Instruction::ICmp: {
- ValueList LHSV, RHSV;
- for (Value *V : E->Scalars) {
- LHSV.push_back(cast<Instruction>(V)->getOperand(0));
- RHSV.push_back(cast<Instruction>(V)->getOperand(1));
- }
-
setInsertPointAfterBundle(E->Scalars, S);
- Value *L = vectorizeTree(LHSV);
- Value *R = vectorizeTree(RHSV);
+ Value *L = vectorizeTree(E->getOperand(0));
+ Value *R = vectorizeTree(E->getOperand(1));
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
@@ -3235,31 +3685,49 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return V;
}
case Instruction::Select: {
- ValueList TrueVec, FalseVec, CondVec;
- for (Value *V : E->Scalars) {
- CondVec.push_back(cast<Instruction>(V)->getOperand(0));
- TrueVec.push_back(cast<Instruction>(V)->getOperand(1));
- FalseVec.push_back(cast<Instruction>(V)->getOperand(2));
+ setInsertPointAfterBundle(E->Scalars, S);
+
+ Value *Cond = vectorizeTree(E->getOperand(0));
+ Value *True = vectorizeTree(E->getOperand(1));
+ Value *False = vectorizeTree(E->getOperand(2));
+
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
}
+ Value *V = Builder.CreateSelect(Cond, True, False);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+ return V;
+ }
+ case Instruction::FNeg: {
setInsertPointAfterBundle(E->Scalars, S);
- Value *Cond = vectorizeTree(CondVec);
- Value *True = vectorizeTree(TrueVec);
- Value *False = vectorizeTree(FalseVec);
+ Value *Op = vectorizeTree(E->getOperand(0));
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
}
- Value *V = Builder.CreateSelect(Cond, True, False);
+ Value *V = Builder.CreateUnOp(
+ static_cast<Instruction::UnaryOps>(S.getOpcode()), Op);
+ propagateIRFlags(V, E->Scalars, VL0);
+ if (auto *I = dyn_cast<Instruction>(V))
+ V = propagateMetadata(I, E->Scalars);
+
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
}
E->VectorizedValue = V;
++NumVectorInstructions;
+
return V;
}
case Instruction::Add:
@@ -3280,21 +3748,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
- ValueList LHSVL, RHSVL;
- if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
- reorderInputsAccordingToOpcode(S.getOpcode(), E->Scalars, LHSVL,
- RHSVL);
- else
- for (Value *V : E->Scalars) {
- auto *I = cast<Instruction>(V);
- LHSVL.push_back(I->getOperand(0));
- RHSVL.push_back(I->getOperand(1));
- }
-
setInsertPointAfterBundle(E->Scalars, S);
- Value *LHS = vectorizeTree(LHSVL);
- Value *RHS = vectorizeTree(RHSVL);
+ Value *LHS = vectorizeTree(E->getOperand(0));
+ Value *RHS = vectorizeTree(E->getOperand(1));
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
@@ -3341,7 +3798,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));
unsigned Alignment = LI->getAlignment();
- LI = Builder.CreateLoad(VecPtr);
+ LI = Builder.CreateLoad(VecTy, VecPtr);
if (!Alignment) {
Alignment = DL->getABITypeAlignment(ScalarLoadTy);
}
@@ -3367,13 +3824,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
unsigned Alignment = SI->getAlignment();
unsigned AS = SI->getPointerAddressSpace();
- ValueList ScalarStoreValues;
- for (Value *V : E->Scalars)
- ScalarStoreValues.push_back(cast<StoreInst>(V)->getValueOperand());
-
setInsertPointAfterBundle(E->Scalars, S);
- Value *VecValue = vectorizeTree(ScalarStoreValues);
+ Value *VecValue = vectorizeTree(E->getOperand(0));
Value *ScalarPtr = SI->getPointerOperand();
Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS));
StoreInst *ST = Builder.CreateStore(VecValue, VecPtr);
@@ -3400,20 +3853,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::GetElementPtr: {
setInsertPointAfterBundle(E->Scalars, S);
- ValueList Op0VL;
- for (Value *V : E->Scalars)
- Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0));
-
- Value *Op0 = vectorizeTree(Op0VL);
+ Value *Op0 = vectorizeTree(E->getOperand(0));
std::vector<Value *> OpVecs;
for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
++j) {
- ValueList OpVL;
- for (Value *V : E->Scalars)
- OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j));
-
- Value *OpVec = vectorizeTree(OpVL);
+ Value *OpVec = vectorizeTree(E->getOperand(j));
OpVecs.push_back(OpVec);
}
@@ -3443,20 +3888,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
std::vector<Value *> OpVecs;
for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
ValueList OpVL;
- // ctlz,cttz and powi are special intrinsics whose second argument is
- // a scalar. This argument should not be vectorized.
- if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
+ // Some intrinsics have scalar arguments (e.g. the second argument of ctlz,
+ // cttz and powi). Such arguments should not be vectorized.
+ if (hasVectorInstrinsicScalarOpd(IID, j)) {
CallInst *CEI = cast<CallInst>(VL0);
ScalarArg = CEI->getArgOperand(j);
OpVecs.push_back(CEI->getArgOperand(j));
continue;
}
- for (Value *V : E->Scalars) {
- CallInst *CEI = cast<CallInst>(V);
- OpVL.push_back(CEI->getArgOperand(j));
- }
- Value *OpVec = vectorizeTree(OpVL);
+ Value *OpVec = vectorizeTree(E->getOperand(j));
LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
OpVecs.push_back(OpVec);
}
@@ -3485,7 +3926,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return V;
}
case Instruction::ShuffleVector: {
- ValueList LHSVL, RHSVL;
assert(S.isAltShuffle() &&
((Instruction::isBinaryOp(S.getOpcode()) &&
Instruction::isBinaryOp(S.getAltOpcode())) ||
@@ -3495,16 +3935,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *LHS, *RHS;
if (Instruction::isBinaryOp(S.getOpcode())) {
- reorderAltShuffleOperands(S, E->Scalars, LHSVL, RHSVL);
setInsertPointAfterBundle(E->Scalars, S);
- LHS = vectorizeTree(LHSVL);
- RHS = vectorizeTree(RHSVL);
+ LHS = vectorizeTree(E->getOperand(0));
+ RHS = vectorizeTree(E->getOperand(1));
} else {
- ValueList INVL;
- for (Value *V : E->Scalars)
- INVL.push_back(cast<Instruction>(V)->getOperand(0));
setInsertPointAfterBundle(E->Scalars, S);
- LHS = vectorizeTree(INVL);
+ LHS = vectorizeTree(E->getOperand(0));
}
if (E->VectorizedValue) {
@@ -3578,20 +4014,20 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
}
Builder.SetInsertPoint(&F->getEntryBlock().front());
- auto *VectorRoot = vectorizeTree(&VectorizableTree[0]);
+ auto *VectorRoot = vectorizeTree(VectorizableTree[0].get());
// If the vectorized tree can be rewritten in a smaller type, we truncate the
// vectorized root. InstCombine will then rewrite the entire expression. We
// sign extend the extracted values below.
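// (For example, if the tree computes i32 values that are known to fit in 8
// bits, the root may be truncated here to a vector of i8, and each extracted
// lane is extended back to i32 at its external uses.)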
- auto *ScalarRoot = VectorizableTree[0].Scalars[0];
+ auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
if (MinBWs.count(ScalarRoot)) {
if (auto *I = dyn_cast<Instruction>(VectorRoot))
Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
- auto BundleWidth = VectorizableTree[0].Scalars.size();
+ auto BundleWidth = VectorizableTree[0]->Scalars.size();
auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
auto *VecTy = VectorType::get(MinTy, BundleWidth);
auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
- VectorizableTree[0].VectorizedValue = Trunc;
+ VectorizableTree[0]->VectorizedValue = Trunc;
}
LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
@@ -3687,8 +4123,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
}
// For each vectorized value:
- for (TreeEntry &EIdx : VectorizableTree) {
- TreeEntry *Entry = &EIdx;
+ for (auto &TEPtr : VectorizableTree) {
+ TreeEntry *Entry = TEPtr.get();
// No need to handle users of gathered values.
if (Entry->NeedToGather)
@@ -3721,7 +4157,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
Builder.ClearInsertionPoint();
- return VectorizableTree[0].VectorizedValue;
+ return VectorizableTree[0]->VectorizedValue;
}
void BoUpSLP::optimizeGatherSequence() {
@@ -3767,10 +4203,10 @@ void BoUpSLP::optimizeGatherSequence() {
// Sort blocks by domination. This ensures we visit a block after all blocks
// dominating it are visited.
- std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
- [this](const DomTreeNode *A, const DomTreeNode *B) {
- return DT->properlyDominates(A, B);
- });
+ llvm::stable_sort(CSEWorkList,
+ [this](const DomTreeNode *A, const DomTreeNode *B) {
+ return DT->properlyDominates(A, B);
+ });
// Perform O(N^2) search over the gather sequences and merge identical
// instructions. TODO: We can further optimize this scan if we split the
@@ -3989,7 +4425,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
<< "\n");
return true;
}
- UpIter++;
+ ++UpIter;
}
if (DownIter != LowerEnd) {
if (&*DownIter == I) {
@@ -4003,7 +4439,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
<< "\n");
return true;
}
- DownIter++;
+ ++DownIter;
}
assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
"instruction not found in block");
@@ -4253,7 +4689,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
BS->ScheduleStart = nullptr;
}
-unsigned BoUpSLP::getVectorElementSize(Value *V) {
+unsigned BoUpSLP::getVectorElementSize(Value *V) const {
// If V is a store, just return the width of the stored value without
// traversing the expression tree. This is the common case.
if (auto *Store = dyn_cast<StoreInst>(V))
@@ -4390,7 +4826,7 @@ void BoUpSLP::computeMinimumValueSizes() {
return;
// We only attempt to truncate integer expressions.
- auto &TreeRoot = VectorizableTree[0].Scalars;
+ auto &TreeRoot = VectorizableTree[0]->Scalars;
auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
if (!TreeRootIT)
return;
@@ -4411,8 +4847,8 @@ void BoUpSLP::computeMinimumValueSizes() {
// Collect the scalar values of the vectorizable expression. We will use this
// context to determine which values can be demoted. If we see a truncation,
// we mark it as seeding another demotion.
- for (auto &Entry : VectorizableTree)
- Expr.insert(Entry.Scalars.begin(), Entry.Scalars.end());
+ for (auto &EntryPtr : VectorizableTree)
+ Expr.insert(EntryPtr->Scalars.begin(), EntryPtr->Scalars.end());
// Ensure the roots of the vectorizable tree don't form a cycle. They must
// have a single external user that is not in the vectorizable tree.
@@ -4746,38 +5182,29 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
BoUpSLP::ValueSet VectorizedStores;
bool Changed = false;
- // Do a quadratic search on all of the given stores in reverse order and find
- // all of the pairs of stores that follow each other.
- SmallVector<unsigned, 16> IndexQueue;
- unsigned E = Stores.size();
- IndexQueue.resize(E - 1);
- for (unsigned I = E; I > 0; --I) {
- unsigned Idx = I - 1;
- // If a store has multiple consecutive store candidates, search Stores
- // array according to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ...
- // This is because usually pairing with immediate succeeding or preceding
- // candidate create the best chance to find slp vectorization opportunity.
- unsigned Offset = 1;
- unsigned Cnt = 0;
- for (unsigned J = 0; J < E - 1; ++J, ++Offset) {
- if (Idx >= Offset) {
- IndexQueue[Cnt] = Idx - Offset;
- ++Cnt;
- }
- if (Idx + Offset < E) {
- IndexQueue[Cnt] = Idx + Offset;
- ++Cnt;
- }
- }
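+ // Checks whether Stores[K] and Stores[Idx] access consecutive memory; if so,
+ // records Stores[K] as a head whose chain successor is Stores[Idx].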
+ auto &&FindConsecutiveAccess =
+ [this, &Stores, &Heads, &Tails, &ConsecutiveChain] (int K, int Idx) {
+ if (!isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE))
+ return false;
- for (auto K : IndexQueue) {
- if (isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE)) {
Tails.insert(Stores[Idx]);
Heads.insert(Stores[K]);
ConsecutiveChain[Stores[K]] = Stores[Idx];
+ return true;
+ };
+
+ // Do a quadratic search on all of the given stores in reverse order and find
+ // all of the pairs of stores that follow each other.
+ int E = Stores.size();
+ for (int Idx = E - 1; Idx >= 0; --Idx) {
+ // If a store has multiple consecutive store candidates, search according
+ // to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ...
+ // This is because pairing with the immediately succeeding or preceding
+ // candidate usually creates the best chance of finding an SLP vectorization
+ // opportunity.
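+ // E.g., for Idx == 3 and E == 6 the candidates are tried in the order
+ // 2, 4, 1, 5, 0, stopping at the first consecutive match.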
+ for (int Offset = 1, F = std::max(E - Idx, Idx + 1); Offset < F; ++Offset)
+ if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) ||
+ (Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx)))
break;
- }
- }
}
// For stores that start but don't end a link in the chain:
@@ -5740,6 +6167,9 @@ public:
unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
Value *VectorizedTree = nullptr;
+
+ // FIXME: Fast-math-flags should be set based on the instructions in the
+ // reduction (not all of 'fast' are required).
IRBuilder<> Builder(cast<Instruction>(ReductionRoot));
FastMathFlags Unsafe;
Unsafe.setFast();
@@ -5929,10 +6359,14 @@ private:
assert(isPowerOf2_32(ReduxWidth) &&
"We only handle power-of-two reductions for now");
- if (!IsPairwiseReduction)
+ if (!IsPairwiseReduction) {
+ // FIXME: The builder should use an FMF guard. It should not be hard-coded
+ // to 'fast'.
+ assert(Builder.getFastMathFlags().isFast() && "Expected 'fast' FMF");
return createSimpleTargetReduction(
Builder, TTI, ReductionData.getOpcode(), VectorizedValue,
ReductionData.getFlags(), ReductionOps.back());
+ }
Value *TmpVec = VectorizedValue;
for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
@@ -6256,7 +6690,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
}
// Sort by type.
- std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);
+ llvm::stable_sort(Incoming, PhiTypeSorterFunc);
// Try to vectorize elements base on their type.
for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
@@ -6297,7 +6731,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
SmallVector<WeakVH, 8> PostProcessInstructions;
SmallDenseSet<Instruction *, 4> KeyNodes;
- for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
+ for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
// We may go through BB multiple times so skip the one we have checked.
if (!VisitedInstrs.insert(&*it).second) {
if (it->use_empty() && KeyNodes.count(&*it) > 0 &&