Diffstat (limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r-- | lib/Transforms/Vectorize/SLPVectorizer.cpp | 1362
1 file changed, 898 insertions, 464 deletions
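The patch below replaces the old reorderAltShuffleOperands() / shouldReorderOperands() heuristics with a per-lane VLOperands structure: for commutative (or alternating add/sub) bundles it greedily chooses, lane by lane, which operand goes to which side so that each operand column is more likely to vectorize (same opcode, a splat, a constant, or consecutive loads). As a rough stand-alone illustration of that idea only, in plain C++ with invented names (Operand, consecutive(), Lanes) rather than the pass's real types, and ignoring the APO bookkeeping that keeps inverse operations legal, the sketch swaps a lane's two operands whenever the swap keeps both columns running over consecutive loads, which is the transformation described in the removed reorderAltShuffleOperands() comment (load a[0] - load b[0]; load b[1] + load a[1]; ...):

// Toy model only (not LLVM code): each lane holds the two operands of one
// scalar instruction, and we want both operand columns to end up as runs of
// consecutive loads so that each column can become a single vector load.
#include <iostream>
#include <string>
#include <utility>
#include <vector>

struct Operand {
  std::string Name;
  bool IsLoad;   // stands in for "matches the Load reordering mode"
  int LoadIndex; // position of the load in memory, -1 if not a load
};

// Toy stand-in for isConsecutiveAccess(): B loads the element right after A.
static bool consecutive(const Operand &A, const Operand &B) {
  return A.IsLoad && B.IsLoad && B.LoadIndex == A.LoadIndex + 1;
}

int main() {
  // Operands as they appear in the scalar code:
  //   a[0] - b[0],  b[1] + a[1],  a[2] - b[2],  a[3] + b[3]
  // Lane 1 arrives with its (commutative) operands swapped.
  std::vector<std::pair<Operand, Operand>> Lanes = {
      {{"a0", true, 0}, {"b0", true, 10}},
      {{"b1", true, 11}, {"a1", true, 1}},
      {{"a2", true, 2}, {"b2", true, 12}},
      {{"a3", true, 3}, {"b3", true, 13}},
  };

  // Greedy single pass: keep lane 0 as-is, then swap a later lane's operands
  // whenever the swap (and only the swap) keeps both columns consecutive.
  for (size_t L = 1; L < Lanes.size(); ++L) {
    Operand &CurLeft = Lanes[L].first;
    Operand &CurRight = Lanes[L].second;
    bool KeepMatches = consecutive(Lanes[L - 1].first, CurLeft) &&
                       consecutive(Lanes[L - 1].second, CurRight);
    bool SwapMatches = consecutive(Lanes[L - 1].first, CurRight) &&
                       consecutive(Lanes[L - 1].second, CurLeft);
    if (!KeepMatches && SwapMatches)
      std::swap(CurLeft, CurRight);
  }

  // Prints a0 b0 / a1 b1 / a2 b2 / a3 b3: both columns are now consecutive.
  for (const auto &Lane : Lanes)
    std::cout << Lane.first.Name << "  " << Lane.second.Name << "\n";
  return 0;
}

In the pass itself the same per-lane decision is made by VLOperands::getBestOperand(), which also scores opcode matches, splats and constants, and reorder() retries in a second pass when an operand index ends up in the Failed mode.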
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 2e856a7e6802..27a86c0bca91 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1,9 +1,8 @@ //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -106,6 +105,10 @@ using namespace slpvectorizer; STATISTIC(NumVectorInstructions, "Number of vector instructions generated"); +cl::opt<bool> + llvm::RunSLPVectorization("vectorize-slp", cl::init(false), cl::Hidden, + cl::desc("Run the SLP vectorization passes")); + static cl::opt<int> SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " @@ -207,6 +210,13 @@ static bool isSplat(ArrayRef<Value *> VL) { return true; } +/// \returns True if \p I is commutative, handles CmpInst as well as Instruction. +static bool isCommutative(Instruction *I) { + if (auto *IC = dyn_cast<CmpInst>(I)) + return IC->isCommutative(); + return I->isCommutative(); +} + /// Checks if the vector of instructions can be represented as a shuffle, like: /// %x0 = extractelement <4 x i8> %x, i32 0 /// %x3 = extractelement <4 x i8> %x, i32 3 @@ -438,8 +448,9 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, case Instruction::Call: { CallInst *CI = cast<CallInst>(UserInst); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - if (hasVectorInstrinsicScalarOpd(ID, 1)) { - return (CI->getArgOperand(1) == Scalar); + for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { + if (hasVectorInstrinsicScalarOpd(ID, i)) + return (CI->getArgOperand(i) == Scalar); } LLVM_FALLTHROUGH; } @@ -474,6 +485,8 @@ namespace slpvectorizer { /// Bottom Up SLP Vectorizer. class BoUpSLP { + struct TreeEntry; + public: using ValueList = SmallVector<Value *, 8>; using InstrList = SmallVector<Instruction *, 16>; @@ -517,7 +530,7 @@ public: /// \returns the cost incurred by unwanted spills and fills, caused by /// holding live values over call sites. - int getSpillCost(); + int getSpillCost() const; /// \returns the vectorization cost of the subtree that starts at \p VL. /// A negative number means that this is profitable. @@ -576,7 +589,7 @@ public: /// the stored value. Otherwise, the size is the width of the largest loaded /// value reaching V. This method is used by the vectorizer to calculate /// vectorization factors. - unsigned getVectorElementSize(Value *V); + unsigned getVectorElementSize(Value *V) const; /// Compute the minimum type sizes required to represent the entries in a /// vectorizable tree. @@ -599,13 +612,512 @@ public: /// \returns True if the VectorizableTree is both tiny and not fully /// vectorizable. We do not vectorize such trees. - bool isTreeTinyAndNotFullyVectorizable(); + bool isTreeTinyAndNotFullyVectorizable() const; OptimizationRemarkEmitter *getORE() { return ORE; } -private: - struct TreeEntry; + /// This structure holds any data we need about the edges being traversed + /// during buildTree_rec(). 
We keep track of: + /// (i) the user TreeEntry index, and + /// (ii) the index of the edge. + struct EdgeInfo { + EdgeInfo() = default; + EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx) + : UserTE(UserTE), EdgeIdx(EdgeIdx) {} + /// The user TreeEntry. + TreeEntry *UserTE = nullptr; + /// The operand index of the use. + unsigned EdgeIdx = UINT_MAX; +#ifndef NDEBUG + friend inline raw_ostream &operator<<(raw_ostream &OS, + const BoUpSLP::EdgeInfo &EI) { + EI.dump(OS); + return OS; + } + /// Debug print. + void dump(raw_ostream &OS) const { + OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null") + << " EdgeIdx:" << EdgeIdx << "}"; + } + LLVM_DUMP_METHOD void dump() const { dump(dbgs()); } +#endif + }; + + /// A helper data structure to hold the operands of a vector of instructions. + /// This supports a fixed vector length for all operand vectors. + class VLOperands { + /// For each operand we need (i) the value, and (ii) the opcode that it + /// would be attached to if the expression was in a left-linearized form. + /// This is required to avoid illegal operand reordering. + /// For example: + /// \verbatim + /// 0 Op1 + /// |/ + /// Op1 Op2 Linearized + Op2 + /// \ / ----------> |/ + /// - - + /// + /// Op1 - Op2 (0 + Op1) - Op2 + /// \endverbatim + /// + /// Value Op1 is attached to a '+' operation, and Op2 to a '-'. + /// + /// Another way to think of this is to track all the operations across the + /// path from the operand all the way to the root of the tree and to + /// calculate the operation that corresponds to this path. For example, the + /// path from Op2 to the root crosses the RHS of the '-', therefore the + /// corresponding operation is a '-' (which matches the one in the + /// linearized tree, as shown above). + /// + /// For lack of a better term, we refer to this operation as Accumulated + /// Path Operation (APO). + struct OperandData { + OperandData() = default; + OperandData(Value *V, bool APO, bool IsUsed) + : V(V), APO(APO), IsUsed(IsUsed) {} + /// The operand value. + Value *V = nullptr; + /// TreeEntries only allow a single opcode, or an alternate sequence of + /// them (e.g, +, -). Therefore, we can safely use a boolean value for the + /// APO. It is set to 'true' if 'V' is attached to an inverse operation + /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise + /// (e.g., Add/Mul) + bool APO = false; + /// Helper data for the reordering function. + bool IsUsed = false; + }; + + /// During operand reordering, we are trying to select the operand at lane + /// that matches best with the operand at the neighboring lane. Our + /// selection is based on the type of value we are looking for. For example, + /// if the neighboring lane has a load, we need to look for a load that is + /// accessing a consecutive address. These strategies are summarized in the + /// 'ReorderingMode' enumerator. + enum class ReorderingMode { + Load, ///< Matching loads to consecutive memory addresses + Opcode, ///< Matching instructions based on opcode (same or alternate) + Constant, ///< Matching constants + Splat, ///< Matching the same instruction multiple times (broadcast) + Failed, ///< We failed to create a vectorizable group + }; + + using OperandDataVec = SmallVector<OperandData, 2>; + + /// A vector of operand vectors. + SmallVector<OperandDataVec, 4> OpsVec; + + const DataLayout &DL; + ScalarEvolution &SE; + + /// \returns the operand data at \p OpIdx and \p Lane. 
+ OperandData &getData(unsigned OpIdx, unsigned Lane) { + return OpsVec[OpIdx][Lane]; + } + + /// \returns the operand data at \p OpIdx and \p Lane. Const version. + const OperandData &getData(unsigned OpIdx, unsigned Lane) const { + return OpsVec[OpIdx][Lane]; + } + + /// Clears the used flag for all entries. + void clearUsed() { + for (unsigned OpIdx = 0, NumOperands = getNumOperands(); + OpIdx != NumOperands; ++OpIdx) + for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes; + ++Lane) + OpsVec[OpIdx][Lane].IsUsed = false; + } + + /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2. + void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) { + std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]); + } + + // Search all operands in Ops[*][Lane] for the one that matches best + // Ops[OpIdx][LastLane] and return its opreand index. + // If no good match can be found, return None. + Optional<unsigned> + getBestOperand(unsigned OpIdx, int Lane, int LastLane, + ArrayRef<ReorderingMode> ReorderingModes) { + unsigned NumOperands = getNumOperands(); + + // The operand of the previous lane at OpIdx. + Value *OpLastLane = getData(OpIdx, LastLane).V; + + // Our strategy mode for OpIdx. + ReorderingMode RMode = ReorderingModes[OpIdx]; + + // The linearized opcode of the operand at OpIdx, Lane. + bool OpIdxAPO = getData(OpIdx, Lane).APO; + + const unsigned BestScore = 2; + const unsigned GoodScore = 1; + + // The best operand index and its score. + // Sometimes we have more than one option (e.g., Opcode and Undefs), so we + // are using the score to differentiate between the two. + struct BestOpData { + Optional<unsigned> Idx = None; + unsigned Score = 0; + } BestOp; + + // Iterate through all unused operands and look for the best. + for (unsigned Idx = 0; Idx != NumOperands; ++Idx) { + // Get the operand at Idx and Lane. + OperandData &OpData = getData(Idx, Lane); + Value *Op = OpData.V; + bool OpAPO = OpData.APO; + + // Skip already selected operands. + if (OpData.IsUsed) + continue; + + // Skip if we are trying to move the operand to a position with a + // different opcode in the linearized tree form. This would break the + // semantics. + if (OpAPO != OpIdxAPO) + continue; + + // Look for an operand that matches the current mode. + switch (RMode) { + case ReorderingMode::Load: + if (isa<LoadInst>(Op)) { + // Figure out which is left and right, so that we can check for + // consecutive loads + bool LeftToRight = Lane > LastLane; + Value *OpLeft = (LeftToRight) ? OpLastLane : Op; + Value *OpRight = (LeftToRight) ? Op : OpLastLane; + if (isConsecutiveAccess(cast<LoadInst>(OpLeft), + cast<LoadInst>(OpRight), DL, SE)) + BestOp.Idx = Idx; + } + break; + case ReorderingMode::Opcode: + // We accept both Instructions and Undefs, but with different scores. + if ((isa<Instruction>(Op) && isa<Instruction>(OpLastLane) && + cast<Instruction>(Op)->getOpcode() == + cast<Instruction>(OpLastLane)->getOpcode()) || + (isa<UndefValue>(OpLastLane) && isa<Instruction>(Op)) || + isa<UndefValue>(Op)) { + // An instruction has a higher score than an undef. + unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore; + if (Score > BestOp.Score) { + BestOp.Idx = Idx; + BestOp.Score = Score; + } + } + break; + case ReorderingMode::Constant: + if (isa<Constant>(Op)) { + unsigned Score = (isa<UndefValue>(Op)) ? 
GoodScore : BestScore; + if (Score > BestOp.Score) { + BestOp.Idx = Idx; + BestOp.Score = Score; + } + } + break; + case ReorderingMode::Splat: + if (Op == OpLastLane) + BestOp.Idx = Idx; + break; + case ReorderingMode::Failed: + return None; + } + } + + if (BestOp.Idx) { + getData(BestOp.Idx.getValue(), Lane).IsUsed = true; + return BestOp.Idx; + } + // If we could not find a good match return None. + return None; + } + + /// Helper for reorderOperandVecs. \Returns the lane that we should start + /// reordering from. This is the one which has the least number of operands + /// that can freely move about. + unsigned getBestLaneToStartReordering() const { + unsigned BestLane = 0; + unsigned Min = UINT_MAX; + for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes; + ++Lane) { + unsigned NumFreeOps = getMaxNumOperandsThatCanBeReordered(Lane); + if (NumFreeOps < Min) { + Min = NumFreeOps; + BestLane = Lane; + } + } + return BestLane; + } + + /// \Returns the maximum number of operands that are allowed to be reordered + /// for \p Lane. This is used as a heuristic for selecting the first lane to + /// start operand reordering. + unsigned getMaxNumOperandsThatCanBeReordered(unsigned Lane) const { + unsigned CntTrue = 0; + unsigned NumOperands = getNumOperands(); + // Operands with the same APO can be reordered. We therefore need to count + // how many of them we have for each APO, like this: Cnt[APO] = x. + // Since we only have two APOs, namely true and false, we can avoid using + // a map. Instead we can simply count the number of operands that + // correspond to one of them (in this case the 'true' APO), and calculate + // the other by subtracting it from the total number of operands. + for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) + if (getData(OpIdx, Lane).APO) + ++CntTrue; + unsigned CntFalse = NumOperands - CntTrue; + return std::max(CntTrue, CntFalse); + } + + /// Go through the instructions in VL and append their operands. + void appendOperandsOfVL(ArrayRef<Value *> VL) { + assert(!VL.empty() && "Bad VL"); + assert((empty() || VL.size() == getNumLanes()) && + "Expected same number of lanes"); + assert(isa<Instruction>(VL[0]) && "Expected instruction"); + unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands(); + OpsVec.resize(NumOperands); + unsigned NumLanes = VL.size(); + for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { + OpsVec[OpIdx].resize(NumLanes); + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + assert(isa<Instruction>(VL[Lane]) && "Expected instruction"); + // Our tree has just 3 nodes: the root and two operands. + // It is therefore trivial to get the APO. We only need to check the + // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or + // RHS operand. The LHS operand of both add and sub is never attached + // to an inversese operation in the linearized form, therefore its APO + // is false. The RHS is true only if VL[Lane] is an inverse operation. + + // Since operand reordering is performed on groups of commutative + // operations or alternating sequences (e.g., +, -), we can safely + // tell the inverse operations by checking commutativity. + bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane])); + bool APO = (OpIdx == 0) ? false : IsInverseOperation; + OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx), + APO, false}; + } + } + } + + /// \returns the number of operands. + unsigned getNumOperands() const { return OpsVec.size(); } + + /// \returns the number of lanes. 
+ unsigned getNumLanes() const { return OpsVec[0].size(); } + + /// \returns the operand value at \p OpIdx and \p Lane. + Value *getValue(unsigned OpIdx, unsigned Lane) const { + return getData(OpIdx, Lane).V; + } + /// \returns true if the data structure is empty. + bool empty() const { return OpsVec.empty(); } + + /// Clears the data. + void clear() { OpsVec.clear(); } + + /// \Returns true if there are enough operands identical to \p Op to fill + /// the whole vector. + /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow. + bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) { + bool OpAPO = getData(OpIdx, Lane).APO; + for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) { + if (Ln == Lane) + continue; + // This is set to true if we found a candidate for broadcast at Lane. + bool FoundCandidate = false; + for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) { + OperandData &Data = getData(OpI, Ln); + if (Data.APO != OpAPO || Data.IsUsed) + continue; + if (Data.V == Op) { + FoundCandidate = true; + Data.IsUsed = true; + break; + } + } + if (!FoundCandidate) + return false; + } + return true; + } + + public: + /// Initialize with all the operands of the instruction vector \p RootVL. + VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL, + ScalarEvolution &SE) + : DL(DL), SE(SE) { + // Append all the operands of RootVL. + appendOperandsOfVL(RootVL); + } + + /// \Returns a value vector with the operands across all lanes for the + /// opearnd at \p OpIdx. + ValueList getVL(unsigned OpIdx) const { + ValueList OpVL(OpsVec[OpIdx].size()); + assert(OpsVec[OpIdx].size() == getNumLanes() && + "Expected same num of lanes across all operands"); + for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane) + OpVL[Lane] = OpsVec[OpIdx][Lane].V; + return OpVL; + } + + // Performs operand reordering for 2 or more operands. + // The original operands are in OrigOps[OpIdx][Lane]. + // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'. + void reorder() { + unsigned NumOperands = getNumOperands(); + unsigned NumLanes = getNumLanes(); + // Each operand has its own mode. We are using this mode to help us select + // the instructions for each lane, so that they match best with the ones + // we have selected so far. + SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands); + + // This is a greedy single-pass algorithm. We are going over each lane + // once and deciding on the best order right away with no back-tracking. + // However, in order to increase its effectiveness, we start with the lane + // that has operands that can move the least. For example, given the + // following lanes: + // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd + // Lane 1 : A[1] = C[1] - B[1] // Visited 1st + // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd + // Lane 3 : A[3] = C[3] - B[3] // Visited 4th + // we will start at Lane 1, since the operands of the subtraction cannot + // be reordered. Then we will visit the rest of the lanes in a circular + // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3. + + // Find the first lane that we will start our search from. + unsigned FirstLane = getBestLaneToStartReordering(); + + // Initialize the modes. + for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { + Value *OpLane0 = getValue(OpIdx, FirstLane); + // Keep track if we have instructions with all the same opcode on one + // side. 
+ if (isa<LoadInst>(OpLane0)) + ReorderingModes[OpIdx] = ReorderingMode::Load; + else if (isa<Instruction>(OpLane0)) { + // Check if OpLane0 should be broadcast. + if (shouldBroadcast(OpLane0, OpIdx, FirstLane)) + ReorderingModes[OpIdx] = ReorderingMode::Splat; + else + ReorderingModes[OpIdx] = ReorderingMode::Opcode; + } + else if (isa<Constant>(OpLane0)) + ReorderingModes[OpIdx] = ReorderingMode::Constant; + else if (isa<Argument>(OpLane0)) + // Our best hope is a Splat. It may save some cost in some cases. + ReorderingModes[OpIdx] = ReorderingMode::Splat; + else + // NOTE: This should be unreachable. + ReorderingModes[OpIdx] = ReorderingMode::Failed; + } + + // If the initial strategy fails for any of the operand indexes, then we + // perform reordering again in a second pass. This helps avoid assigning + // high priority to the failed strategy, and should improve reordering for + // the non-failed operand indexes. + for (int Pass = 0; Pass != 2; ++Pass) { + // Skip the second pass if the first pass did not fail. + bool StrategyFailed = false; + // Mark all operand data as free to use. + clearUsed(); + // We keep the original operand order for the FirstLane, so reorder the + // rest of the lanes. We are visiting the nodes in a circular fashion, + // using FirstLane as the center point and increasing the radius + // distance. + for (unsigned Distance = 1; Distance != NumLanes; ++Distance) { + // Visit the lane on the right and then the lane on the left. + for (int Direction : {+1, -1}) { + int Lane = FirstLane + Direction * Distance; + if (Lane < 0 || Lane >= (int)NumLanes) + continue; + int LastLane = Lane - Direction; + assert(LastLane >= 0 && LastLane < (int)NumLanes && + "Out of bounds"); + // Look for a good match for each operand. + for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { + // Search for the operand that matches SortedOps[OpIdx][Lane-1]. + Optional<unsigned> BestIdx = + getBestOperand(OpIdx, Lane, LastLane, ReorderingModes); + // By not selecting a value, we allow the operands that follow to + // select a better matching value. We will get a non-null value in + // the next run of getBestOperand(). + if (BestIdx) { + // Swap the current operand with the one returned by + // getBestOperand(). + swap(OpIdx, BestIdx.getValue(), Lane); + } else { + // We failed to find a best operand, set mode to 'Failed'. + ReorderingModes[OpIdx] = ReorderingMode::Failed; + // Enable the second pass. + StrategyFailed = true; + } + } + } + } + // Skip second pass if the strategy did not fail. + if (!StrategyFailed) + break; + } + } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) { + switch (RMode) { + case ReorderingMode::Load: + return "Load"; + case ReorderingMode::Opcode: + return "Opcode"; + case ReorderingMode::Constant: + return "Constant"; + case ReorderingMode::Splat: + return "Splat"; + case ReorderingMode::Failed: + return "Failed"; + } + llvm_unreachable("Unimplemented Reordering Type"); + } + + LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode, + raw_ostream &OS) { + return OS << getModeStr(RMode); + } + + /// Debug print. 
+ LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) { + printMode(RMode, dbgs()); + } + + friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) { + return printMode(RMode, OS); + } + + LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const { + const unsigned Indent = 2; + unsigned Cnt = 0; + for (const OperandDataVec &OpDataVec : OpsVec) { + OS << "Operand " << Cnt++ << "\n"; + for (const OperandData &OpData : OpDataVec) { + OS.indent(Indent) << "{"; + if (Value *V = OpData.V) + OS << *V; + else + OS << "null"; + OS << ", APO:" << OpData.APO << "}\n"; + } + OS << "\n"; + } + return OS; + } + + /// Debug print. + LLVM_DUMP_METHOD void dump() const { print(dbgs()); } +#endif + }; + +private: /// Checks if all users of \p I are the part of the vectorization tree. bool areAllUsersVectorized(Instruction *I) const; @@ -613,7 +1125,8 @@ private: int getEntryCost(TreeEntry *E); /// This is the recursive part of buildTree. - void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int); + void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, + const EdgeInfo &EI); /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can /// be vectorized to use the original vector (or aggregate "bitcast" to a @@ -631,12 +1144,12 @@ private: /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. - int getGatherCost(Type *Ty, const DenseSet<unsigned> &ShuffledIndices); + int getGatherCost(Type *Ty, const DenseSet<unsigned> &ShuffledIndices) const; /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the /// roots. This method calculates the cost of extracting the values. - int getGatherCost(ArrayRef<Value *> VL); + int getGatherCost(ArrayRef<Value *> VL) const; /// Set the Builder insert point to one after the last instruction in /// the bundle @@ -648,22 +1161,18 @@ private: /// \returns whether the VectorizableTree is fully vectorizable and will /// be beneficial even the tree height is tiny. - bool isFullyVectorizableTinyTree(); + bool isFullyVectorizableTinyTree() const; - /// \reorder commutative operands in alt shuffle if they result in - /// vectorized code. - void reorderAltShuffleOperands(const InstructionsState &S, - ArrayRef<Value *> VL, - SmallVectorImpl<Value *> &Left, - SmallVectorImpl<Value *> &Right); - - /// \reorder commutative operands to get better probability of + /// Reorder commutative or alt operands to get better probability of /// generating vectorized code. - void reorderInputsAccordingToOpcode(unsigned Opcode, ArrayRef<Value *> VL, - SmallVectorImpl<Value *> &Left, - SmallVectorImpl<Value *> &Right); + static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, + SmallVectorImpl<Value *> &Left, + SmallVectorImpl<Value *> &Right, + const DataLayout &DL, + ScalarEvolution &SE); struct TreeEntry { - TreeEntry(std::vector<TreeEntry> &Container) : Container(Container) {} + using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>; + TreeEntry(VecTreeTy &Container) : Container(Container) {} /// \returns true if the scalars in VL are equal to this entry. bool isSame(ArrayRef<Value *> VL) const { @@ -696,20 +1205,103 @@ private: /// to be a pointer and needs to be able to initialize the child iterator. /// Thus we need a reference back to the container to translate the indices /// to entries. 
- std::vector<TreeEntry> &Container; + VecTreeTy &Container; /// The TreeEntry index containing the user of this entry. We can actually /// have multiple users so the data structure is not truly a tree. - SmallVector<int, 1> UserTreeIndices; + SmallVector<EdgeInfo, 1> UserTreeIndices; + + /// The index of this treeEntry in VectorizableTree. + int Idx = -1; + + private: + /// The operands of each instruction in each lane Operands[op_index][lane]. + /// Note: This helps avoid the replication of the code that performs the + /// reordering of operands during buildTree_rec() and vectorizeTree(). + SmallVector<ValueList, 2> Operands; + + public: + /// Set this bundle's \p OpIdx'th operand to \p OpVL. + void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL, + ArrayRef<unsigned> ReuseShuffleIndices) { + if (Operands.size() < OpIdx + 1) + Operands.resize(OpIdx + 1); + assert(Operands[OpIdx].size() == 0 && "Already resized?"); + Operands[OpIdx].resize(Scalars.size()); + for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane) + Operands[OpIdx][Lane] = (!ReuseShuffleIndices.empty()) + ? OpVL[ReuseShuffleIndices[Lane]] + : OpVL[Lane]; + } + + /// If there is a user TreeEntry, then set its operand. + void trySetUserTEOperand(const EdgeInfo &UserTreeIdx, + ArrayRef<Value *> OpVL, + ArrayRef<unsigned> ReuseShuffleIndices) { + if (UserTreeIdx.UserTE) + UserTreeIdx.UserTE->setOperand(UserTreeIdx.EdgeIdx, OpVL, + ReuseShuffleIndices); + } + + /// \returns the \p OpIdx operand of this TreeEntry. + ValueList &getOperand(unsigned OpIdx) { + assert(OpIdx < Operands.size() && "Off bounds"); + return Operands[OpIdx]; + } + + /// \return the single \p OpIdx operand. + Value *getSingleOperand(unsigned OpIdx) const { + assert(OpIdx < Operands.size() && "Off bounds"); + assert(!Operands[OpIdx].empty() && "No operand available"); + return Operands[OpIdx][0]; + } + +#ifndef NDEBUG + /// Debug printer. + LLVM_DUMP_METHOD void dump() const { + dbgs() << Idx << ".\n"; + for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) { + dbgs() << "Operand " << OpI << ":\n"; + for (const Value *V : Operands[OpI]) + dbgs().indent(2) << *V << "\n"; + } + dbgs() << "Scalars: \n"; + for (Value *V : Scalars) + dbgs().indent(2) << *V << "\n"; + dbgs() << "NeedToGather: " << NeedToGather << "\n"; + dbgs() << "VectorizedValue: "; + if (VectorizedValue) + dbgs() << *VectorizedValue; + else + dbgs() << "NULL"; + dbgs() << "\n"; + dbgs() << "ReuseShuffleIndices: "; + if (ReuseShuffleIndices.empty()) + dbgs() << "Emtpy"; + else + for (unsigned Idx : ReuseShuffleIndices) + dbgs() << Idx << ", "; + dbgs() << "\n"; + dbgs() << "ReorderIndices: "; + for (unsigned Idx : ReorderIndices) + dbgs() << Idx << ", "; + dbgs() << "\n"; + dbgs() << "UserTreeIndices: "; + for (const auto &EInfo : UserTreeIndices) + dbgs() << EInfo << ", "; + dbgs() << "\n"; + } +#endif }; /// Create a new VectorizableTree entry. 
- void newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, int &UserTreeIdx, - ArrayRef<unsigned> ReuseShuffleIndices = None, - ArrayRef<unsigned> ReorderIndices = None) { - VectorizableTree.emplace_back(VectorizableTree); - int idx = VectorizableTree.size() - 1; - TreeEntry *Last = &VectorizableTree[idx]; + TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, + const EdgeInfo &UserTreeIdx, + ArrayRef<unsigned> ReuseShuffleIndices = None, + ArrayRef<unsigned> ReorderIndices = None) { + VectorizableTree.push_back(llvm::make_unique<TreeEntry>(VectorizableTree)); + TreeEntry *Last = VectorizableTree.back().get(); + Last->Idx = VectorizableTree.size() - 1; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); Last->NeedToGather = !Vectorized; Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), @@ -718,25 +1310,44 @@ private: if (Vectorized) { for (int i = 0, e = VL.size(); i != e; ++i) { assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); - ScalarToTreeEntry[VL[i]] = idx; + ScalarToTreeEntry[VL[i]] = Last->Idx; } } else { MustGather.insert(VL.begin(), VL.end()); } - if (UserTreeIdx >= 0) + if (UserTreeIdx.UserTE) Last->UserTreeIndices.push_back(UserTreeIdx); - UserTreeIdx = idx; + + Last->trySetUserTEOperand(UserTreeIdx, VL, ReuseShuffleIndices); + return Last; } /// -- Vectorization State -- /// Holds all of the tree entries. - std::vector<TreeEntry> VectorizableTree; + TreeEntry::VecTreeTy VectorizableTree; + +#ifndef NDEBUG + /// Debug printer. + LLVM_DUMP_METHOD void dumpVectorizableTree() const { + for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) { + VectorizableTree[Id]->dump(); + dbgs() << "\n"; + } + } +#endif TreeEntry *getTreeEntry(Value *V) { auto I = ScalarToTreeEntry.find(V); if (I != ScalarToTreeEntry.end()) - return &VectorizableTree[I->second]; + return VectorizableTree[I->second].get(); + return nullptr; + } + + const TreeEntry *getTreeEntry(Value *V) const { + auto I = ScalarToTreeEntry.find(V); + if (I != ScalarToTreeEntry.end()) + return VectorizableTree[I->second].get(); return nullptr; } @@ -1246,21 +1857,25 @@ template <> struct GraphTraits<BoUpSLP *> { /// NodeRef has to be a pointer per the GraphWriter. using NodeRef = TreeEntry *; + using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy; + /// Add the VectorizableTree to the index iterator to be able to return /// TreeEntry pointers. struct ChildIteratorType - : public iterator_adaptor_base<ChildIteratorType, - SmallVector<int, 1>::iterator> { - std::vector<TreeEntry> &VectorizableTree; + : public iterator_adaptor_base< + ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> { + ContainerTy &VectorizableTree; - ChildIteratorType(SmallVector<int, 1>::iterator W, - std::vector<TreeEntry> &VT) + ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W, + ContainerTy &VT) : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {} - NodeRef operator*() { return &VectorizableTree[*I]; } + NodeRef operator*() { return I->UserTE; } }; - static NodeRef getEntryNode(BoUpSLP &R) { return &R.VectorizableTree[0]; } + static NodeRef getEntryNode(BoUpSLP &R) { + return R.VectorizableTree[0].get(); + } static ChildIteratorType child_begin(NodeRef N) { return {N->UserTreeIndices.begin(), N->Container}; @@ -1272,7 +1887,19 @@ template <> struct GraphTraits<BoUpSLP *> { /// For the node iterator we just need to turn the TreeEntry iterator into a /// TreeEntry* iterator so that it dereferences to NodeRef. 
- using nodes_iterator = pointer_iterator<std::vector<TreeEntry>::iterator>; + class nodes_iterator { + using ItTy = ContainerTy::iterator; + ItTy It; + + public: + nodes_iterator(const ItTy &It2) : It(It2) {} + NodeRef operator*() { return It->get(); } + nodes_iterator operator++() { + ++It; + return *this; + } + bool operator!=(const nodes_iterator &N2) const { return N2.It != It; } + }; static nodes_iterator nodes_begin(BoUpSLP *R) { return nodes_iterator(R->VectorizableTree.begin()); @@ -1331,11 +1958,11 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, UserIgnoreList = UserIgnoreLst; if (!allSameType(Roots)) return; - buildTree_rec(Roots, 0, -1); + buildTree_rec(Roots, 0, EdgeInfo()); // Collect the values that we need to extract from the tree. - for (TreeEntry &EIdx : VectorizableTree) { - TreeEntry *Entry = &EIdx; + for (auto &TEPtr : VectorizableTree) { + TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. if (Entry->NeedToGather) @@ -1393,7 +2020,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, } void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, - int UserTreeIdx) { + const EdgeInfo &UserTreeIdx) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); InstructionsState S = getSameOpcode(VL); @@ -1450,6 +2077,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, E->UserTreeIndices.push_back(UserTreeIdx); LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue << ".\n"); + E->trySetUserTEOperand(UserTreeIdx, VL, None); return; } @@ -1468,8 +2096,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // If any of the scalars is marked as a value that needs to stay scalar, then // we need to gather the scalars. + // The reduction nodes (stored in UserIgnoreList) also should stay scalar. for (unsigned i = 0, e = VL.size(); i != e; ++i) { - if (MustGather.count(VL[i])) { + if (MustGather.count(VL[i]) || is_contained(UserIgnoreList, VL[i])) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); newTreeEntry(VL, false, UserTreeIdx); return; @@ -1548,7 +2177,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { @@ -1558,7 +2187,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock( PH->getIncomingBlock(i))); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(Operands, Depth + 1, {TE, i}); } return; } @@ -1571,6 +2200,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, ++NumOpsWantToKeepOriginalOrder; newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies); + // This is a special case, as it does not gather, but at the same time + // we are not extending buildTree_rec() towards the operands. 
+ ValueList Op0; + Op0.assign(VL.size(), VL0->getOperand(0)); + VectorizableTree.back()->setOperand(0, Op0, ReuseShuffleIndicies); return; } if (!CurrentOrder.empty()) { @@ -1588,6 +2222,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, ++StoredCurrentOrderAndNum->getSecond(); newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies, StoredCurrentOrderAndNum->getFirst()); + // This is a special case, as it does not gather, but at the same time + // we are not extending buildTree_rec() towards the operands. + ValueList Op0; + Op0.assign(VL.size(), VL0->getOperand(0)); + VectorizableTree.back()->setOperand(0, Op0, ReuseShuffleIndicies); return; } LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); @@ -1693,7 +2332,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1702,7 +2341,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (Value *j : VL) Operands.push_back(cast<Instruction>(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(Operands, Depth + 1, {TE, i}); } return; } @@ -1710,10 +2349,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, case Instruction::FCmp: { // Check that all of the compares have the same predicate. CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); + CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0); Type *ComparedTy = VL0->getOperand(0)->getType(); for (unsigned i = 1, e = VL.size(); i < e; ++i) { CmpInst *Cmp = cast<CmpInst>(VL[i]); - if (Cmp->getPredicate() != P0 || + if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) || Cmp->getOperand(0)->getType() != ComparedTy) { BS.cancelScheduling(VL, VL0); newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); @@ -1723,20 +2363,34 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n"); - for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { - ValueList Operands; - // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast<Instruction>(j)->getOperand(i)); - - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + ValueList Left, Right; + if (cast<CmpInst>(VL0)->isCommutative()) { + // Commutative predicate - collect + sort operands of the instructions + // so that each side is more likely to have the same opcode. + assert(P0 == SwapP0 && "Commutative Predicate mismatch"); + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + } else { + // Collect operands - commute if it uses the swapped predicate. 
+ for (Value *V : VL) { + auto *Cmp = cast<CmpInst>(V); + Value *LHS = Cmp->getOperand(0); + Value *RHS = Cmp->getOperand(1); + if (Cmp->getPredicate() != P0) + std::swap(LHS, RHS); + Left.push_back(LHS); + Right.push_back(RHS); + } } + + buildTree_rec(Left, Depth + 1, {TE, 0}); + buildTree_rec(Right, Depth + 1, {TE, 1}); return; } case Instruction::Select: + case Instruction::FNeg: case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: @@ -1754,17 +2408,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, case Instruction::AShr: case Instruction::And: case Instruction::Or: - case Instruction::Xor: - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); + case Instruction::Xor: { + auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n"); // Sort operands of the instructions so that each side is more likely to // have the same opcode. if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) { ValueList Left, Right; - reorderInputsAccordingToOpcode(S.getOpcode(), VL, Left, Right); - buildTree_rec(Left, Depth + 1, UserTreeIdx); - buildTree_rec(Right, Depth + 1, UserTreeIdx); + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + buildTree_rec(Left, Depth + 1, {TE, 0}); + buildTree_rec(Right, Depth + 1, {TE, 1}); return; } @@ -1774,10 +2428,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (Value *j : VL) Operands.push_back(cast<Instruction>(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(Operands, Depth + 1, {TE, i}); } return; - + } case Instruction::GetElementPtr: { // We don't combine GEPs with complicated (nested) indexing. 
for (unsigned j = 0; j < VL.size(); ++j) { @@ -1815,7 +2469,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; @@ -1823,7 +2477,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (Value *j : VL) Operands.push_back(cast<Instruction>(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(Operands, Depth + 1, {TE, i}); } return; } @@ -1837,14 +2491,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, return; } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; for (Value *j : VL) Operands.push_back(cast<Instruction>(j)->getOperand(0)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(Operands, Depth + 1, {TE, 0}); return; } case Instruction::Call: { @@ -1860,9 +2514,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, return; } Function *Int = CI->getCalledFunction(); - Value *A1I = nullptr; - if (hasVectorInstrinsicScalarOpd(ID, 1)) - A1I = CI->getArgOperand(1); + unsigned NumArgs = CI->getNumArgOperands(); + SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr); + for (unsigned j = 0; j != NumArgs; ++j) + if (hasVectorInstrinsicScalarOpd(ID, j)) + ScalarArgs[j] = CI->getArgOperand(j); for (unsigned i = 1, e = VL.size(); i != e; ++i) { CallInst *CI2 = dyn_cast<CallInst>(VL[i]); if (!CI2 || CI2->getCalledFunction() != Int || @@ -1874,16 +2530,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, << "\n"); return; } - // ctlz,cttz and powi are special intrinsics whose second argument - // should be same in order for them to be vectorized. - if (hasVectorInstrinsicScalarOpd(ID, 1)) { - Value *A1J = CI2->getArgOperand(1); - if (A1I != A1J) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI - << " argument " << A1I << "!=" << A1J << "\n"); - return; + // Some intrinsics have scalar arguments and should be same in order for + // them to be vectorized. + for (unsigned j = 0; j != NumArgs; ++j) { + if (hasVectorInstrinsicScalarOpd(ID, j)) { + Value *A1J = CI2->getArgOperand(j); + if (ScalarArgs[j] != A1J) { + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI + << " argument " << ScalarArgs[j] << "!=" << A1J + << "\n"); + return; + } } } // Verify that the bundle operands are identical between the two calls. @@ -1899,7 +2558,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. 
@@ -1907,11 +2566,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, CallInst *CI2 = dyn_cast<CallInst>(j); Operands.push_back(CI2->getArgOperand(i)); } - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(Operands, Depth + 1, {TE, i}); } return; } - case Instruction::ShuffleVector: + case Instruction::ShuffleVector: { // If this is not an alternate sequence of opcode like add-sub // then do not vectorize this instruction. if (!S.isAltShuffle()) { @@ -1920,15 +2579,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. if (isa<BinaryOperator>(VL0)) { ValueList Left, Right; - reorderAltShuffleOperands(S, VL, Left, Right); - buildTree_rec(Left, Depth + 1, UserTreeIdx); - buildTree_rec(Right, Depth + 1, UserTreeIdx); + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + buildTree_rec(Left, Depth + 1, {TE, 0}); + buildTree_rec(Right, Depth + 1, {TE, 1}); return; } @@ -1938,10 +2597,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (Value *j : VL) Operands.push_back(cast<Instruction>(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(Operands, Depth + 1, {TE, i}); } return; - + } default: BS.cancelScheduling(VL, VL0); newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); @@ -2223,6 +2882,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { int VecCost = TTI->getCmpSelInstrCost(S.getOpcode(), VecTy, MaskTy, VL0); return ReuseShuffleCost + VecCost - ScalarCost; } + case Instruction::FNeg: case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: @@ -2260,7 +2920,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { ConstantInt *CInt0 = nullptr; for (unsigned i = 0, e = VL.size(); i < e; ++i) { const Instruction *I = cast<Instruction>(VL[i]); - ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(1)); + unsigned OpIdx = isa<BinaryOperator>(I) ? 1 : 0; + ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(OpIdx)); if (!CInt) { Op2VK = TargetTransformInfo::OK_AnyValue; Op2VP = TargetTransformInfo::OP_None; @@ -2413,31 +3074,31 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } } -bool BoUpSLP::isFullyVectorizableTinyTree() { +bool BoUpSLP::isFullyVectorizableTinyTree() const { LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height " << VectorizableTree.size() << " is fully vectorizable .\n"); // We only handle trees of heights 1 and 2. - if (VectorizableTree.size() == 1 && !VectorizableTree[0].NeedToGather) + if (VectorizableTree.size() == 1 && !VectorizableTree[0]->NeedToGather) return true; if (VectorizableTree.size() != 2) return false; // Handle splat and all-constants stores. - if (!VectorizableTree[0].NeedToGather && - (allConstant(VectorizableTree[1].Scalars) || - isSplat(VectorizableTree[1].Scalars))) + if (!VectorizableTree[0]->NeedToGather && + (allConstant(VectorizableTree[1]->Scalars) || + isSplat(VectorizableTree[1]->Scalars))) return true; // Gathering cost would be too much for tiny trees. 
- if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather) + if (VectorizableTree[0]->NeedToGather || VectorizableTree[1]->NeedToGather) return false; return true; } -bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() { +bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const { // We can vectorize the tree if its size is greater than or equal to the // minimum size specified by the MinTreeSize command line option. if (VectorizableTree.size() >= MinTreeSize) @@ -2457,19 +3118,19 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() { return true; } -int BoUpSLP::getSpillCost() { +int BoUpSLP::getSpillCost() const { // Walk from the bottom of the tree to the top, tracking which values are // live. When we see a call instruction that is not part of our tree, // query TTI to see if there is a cost to keeping values live over it // (for example, if spills and fills are required). - unsigned BundleWidth = VectorizableTree.front().Scalars.size(); + unsigned BundleWidth = VectorizableTree.front()->Scalars.size(); int Cost = 0; SmallPtrSet<Instruction*, 4> LiveValues; Instruction *PrevInst = nullptr; - for (const auto &N : VectorizableTree) { - Instruction *Inst = dyn_cast<Instruction>(N.Scalars[0]); + for (const auto &TEPtr : VectorizableTree) { + Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]); if (!Inst) continue; @@ -2494,6 +3155,7 @@ int BoUpSLP::getSpillCost() { }); // Now find the sequence of instructions between PrevInst and Inst. + unsigned NumCalls = 0; BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(), PrevInstIt = PrevInst->getIterator().getReverse(); @@ -2506,16 +3168,19 @@ int BoUpSLP::getSpillCost() { // Debug informations don't impact spill cost. if ((isa<CallInst>(&*PrevInstIt) && !isa<DbgInfoIntrinsic>(&*PrevInstIt)) && - &*PrevInstIt != PrevInst) { - SmallVector<Type*, 4> V; - for (auto *II : LiveValues) - V.push_back(VectorType::get(II->getType(), BundleWidth)); - Cost += TTI->getCostOfKeepingLiveOverCall(V); - } + &*PrevInstIt != PrevInst) + NumCalls++; ++PrevInstIt; } + if (NumCalls) { + SmallVector<Type*, 4> V; + for (auto *II : LiveValues) + V.push_back(VectorType::get(II->getType(), BundleWidth)); + Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V); + } + PrevInst = Inst; } @@ -2527,10 +3192,10 @@ int BoUpSLP::getTreeCost() { LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << VectorizableTree.size() << ".\n"); - unsigned BundleWidth = VectorizableTree[0].Scalars.size(); + unsigned BundleWidth = VectorizableTree[0]->Scalars.size(); for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { - TreeEntry &TE = VectorizableTree[I]; + TreeEntry &TE = *VectorizableTree[I].get(); // We create duplicate tree entries for gather sequences that have multiple // uses. However, we should not compute the cost of duplicate sequences. @@ -2545,10 +3210,11 @@ int BoUpSLP::getTreeCost() { // existing heuristics based on tree size may yield different results. // if (TE.NeedToGather && - std::any_of(std::next(VectorizableTree.begin(), I + 1), - VectorizableTree.end(), [TE](TreeEntry &Entry) { - return Entry.NeedToGather && Entry.isSame(TE.Scalars); - })) + std::any_of( + std::next(VectorizableTree.begin(), I + 1), VectorizableTree.end(), + [TE](const std::unique_ptr<TreeEntry> &EntryPtr) { + return EntryPtr->NeedToGather && EntryPtr->isSame(TE.Scalars); + })) continue; int C = getEntryCost(&TE); @@ -2575,7 +3241,7 @@ int BoUpSLP::getTreeCost() { // extend the extracted value back to the original type. 
Here, we account // for the extract and the added cost of the sign extend if needed. auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth); - auto *ScalarRoot = VectorizableTree[0].Scalars[0]; + auto *ScalarRoot = VectorizableTree[0]->Scalars[0]; if (MinBWs.count(ScalarRoot)) { auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); auto Extend = @@ -2608,17 +3274,17 @@ int BoUpSLP::getTreeCost() { } int BoUpSLP::getGatherCost(Type *Ty, - const DenseSet<unsigned> &ShuffledIndices) { + const DenseSet<unsigned> &ShuffledIndices) const { int Cost = 0; for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i) if (!ShuffledIndices.count(i)) Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); if (!ShuffledIndices.empty()) - Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty); + Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty); return Cost; } -int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) { +int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const { // Find the type of the operands in VL. Type *ScalarTy = VL[0]->getType(); if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) @@ -2638,221 +3304,19 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) { return getGatherCost(VecTy, ShuffledElements); } -// Reorder commutative operations in alternate shuffle if the resulting vectors -// are consecutive loads. This would allow us to vectorize the tree. -// If we have something like- -// load a[0] - load b[0] -// load b[1] + load a[1] -// load a[2] - load b[2] -// load a[3] + load b[3] -// Reordering the second load b[1] load a[1] would allow us to vectorize this -// code. -void BoUpSLP::reorderAltShuffleOperands(const InstructionsState &S, - ArrayRef<Value *> VL, - SmallVectorImpl<Value *> &Left, - SmallVectorImpl<Value *> &Right) { - // Push left and right operands of binary operation into Left and Right - for (Value *V : VL) { - auto *I = cast<Instruction>(V); - assert(S.isOpcodeOrAlt(I) && "Incorrect instruction in vector"); - Left.push_back(I->getOperand(0)); - Right.push_back(I->getOperand(1)); - } - - // Reorder if we have a commutative operation and consecutive access - // are on either side of the alternate instructions. - for (unsigned j = 0; j < VL.size() - 1; ++j) { - if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) { - if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) { - Instruction *VL1 = cast<Instruction>(VL[j]); - Instruction *VL2 = cast<Instruction>(VL[j + 1]); - if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) { - std::swap(Left[j], Right[j]); - continue; - } else if (VL2->isCommutative() && - isConsecutiveAccess(L, L1, *DL, *SE)) { - std::swap(Left[j + 1], Right[j + 1]); - continue; - } - // else unchanged - } - } - if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) { - if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) { - Instruction *VL1 = cast<Instruction>(VL[j]); - Instruction *VL2 = cast<Instruction>(VL[j + 1]); - if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) { - std::swap(Left[j], Right[j]); - continue; - } else if (VL2->isCommutative() && - isConsecutiveAccess(L, L1, *DL, *SE)) { - std::swap(Left[j + 1], Right[j + 1]); - continue; - } - // else unchanged - } - } - } -} - -// Return true if I should be commuted before adding it's left and right -// operands to the arrays Left and Right. 
-// -// The vectorizer is trying to either have all elements one side being -// instruction with the same opcode to enable further vectorization, or having -// a splat to lower the vectorizing cost. -static bool shouldReorderOperands( - int i, unsigned Opcode, Instruction &I, ArrayRef<Value *> Left, - ArrayRef<Value *> Right, bool AllSameOpcodeLeft, bool AllSameOpcodeRight, - bool SplatLeft, bool SplatRight, Value *&VLeft, Value *&VRight) { - VLeft = I.getOperand(0); - VRight = I.getOperand(1); - // If we have "SplatRight", try to see if commuting is needed to preserve it. - if (SplatRight) { - if (VRight == Right[i - 1]) - // Preserve SplatRight - return false; - if (VLeft == Right[i - 1]) { - // Commuting would preserve SplatRight, but we don't want to break - // SplatLeft either, i.e. preserve the original order if possible. - // (FIXME: why do we care?) - if (SplatLeft && VLeft == Left[i - 1]) - return false; - return true; - } - } - // Symmetrically handle Right side. - if (SplatLeft) { - if (VLeft == Left[i - 1]) - // Preserve SplatLeft - return false; - if (VRight == Left[i - 1]) - return true; - } - - Instruction *ILeft = dyn_cast<Instruction>(VLeft); - Instruction *IRight = dyn_cast<Instruction>(VRight); - - // If we have "AllSameOpcodeRight", try to see if the left operands preserves - // it and not the right, in this case we want to commute. - if (AllSameOpcodeRight) { - unsigned RightPrevOpcode = cast<Instruction>(Right[i - 1])->getOpcode(); - if (IRight && RightPrevOpcode == IRight->getOpcode()) - // Do not commute, a match on the right preserves AllSameOpcodeRight - return false; - if (ILeft && RightPrevOpcode == ILeft->getOpcode()) { - // We have a match and may want to commute, but first check if there is - // not also a match on the existing operands on the Left to preserve - // AllSameOpcodeLeft, i.e. preserve the original order if possible. - // (FIXME: why do we care?) - if (AllSameOpcodeLeft && ILeft && - cast<Instruction>(Left[i - 1])->getOpcode() == ILeft->getOpcode()) - return false; - return true; - } - } - // Symmetrically handle Left side. - if (AllSameOpcodeLeft) { - unsigned LeftPrevOpcode = cast<Instruction>(Left[i - 1])->getOpcode(); - if (ILeft && LeftPrevOpcode == ILeft->getOpcode()) - return false; - if (IRight && LeftPrevOpcode == IRight->getOpcode()) - return true; - } - return false; -} - -void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode, - ArrayRef<Value *> VL, - SmallVectorImpl<Value *> &Left, - SmallVectorImpl<Value *> &Right) { - if (!VL.empty()) { - // Peel the first iteration out of the loop since there's nothing - // interesting to do anyway and it simplifies the checks in the loop. - auto *I = cast<Instruction>(VL[0]); - Value *VLeft = I->getOperand(0); - Value *VRight = I->getOperand(1); - if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft)) - // Favor having instruction to the right. FIXME: why? - std::swap(VLeft, VRight); - Left.push_back(VLeft); - Right.push_back(VRight); - } - - // Keep track if we have instructions with all the same opcode on one side. - bool AllSameOpcodeLeft = isa<Instruction>(Left[0]); - bool AllSameOpcodeRight = isa<Instruction>(Right[0]); - // Keep track if we have one side with all the same value (broadcast). 
- bool SplatLeft = true; - bool SplatRight = true; - - for (unsigned i = 1, e = VL.size(); i != e; ++i) { - Instruction *I = cast<Instruction>(VL[i]); - assert(((I->getOpcode() == Opcode && I->isCommutative()) || - (I->getOpcode() != Opcode && Instruction::isCommutative(Opcode))) && - "Can only process commutative instruction"); - // Commute to favor either a splat or maximizing having the same opcodes on - // one side. - Value *VLeft; - Value *VRight; - if (shouldReorderOperands(i, Opcode, *I, Left, Right, AllSameOpcodeLeft, - AllSameOpcodeRight, SplatLeft, SplatRight, VLeft, - VRight)) { - Left.push_back(VRight); - Right.push_back(VLeft); - } else { - Left.push_back(VLeft); - Right.push_back(VRight); - } - // Update Splat* and AllSameOpcode* after the insertion. - SplatRight = SplatRight && (Right[i - 1] == Right[i]); - SplatLeft = SplatLeft && (Left[i - 1] == Left[i]); - AllSameOpcodeLeft = AllSameOpcodeLeft && isa<Instruction>(Left[i]) && - (cast<Instruction>(Left[i - 1])->getOpcode() == - cast<Instruction>(Left[i])->getOpcode()); - AllSameOpcodeRight = AllSameOpcodeRight && isa<Instruction>(Right[i]) && - (cast<Instruction>(Right[i - 1])->getOpcode() == - cast<Instruction>(Right[i])->getOpcode()); - } - - // If one operand end up being broadcast, return this operand order. - if (SplatRight || SplatLeft) +// Perform operand reordering on the instructions in VL and return the reordered +// operands in Left and Right. +void BoUpSLP::reorderInputsAccordingToOpcode( + ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left, + SmallVectorImpl<Value *> &Right, const DataLayout &DL, + ScalarEvolution &SE) { + if (VL.empty()) return; - - // Finally check if we can get longer vectorizable chain by reordering - // without breaking the good operand order detected above. - // E.g. If we have something like- - // load a[0] load b[0] - // load b[1] load a[1] - // load a[2] load b[2] - // load a[3] load b[3] - // Reordering the second load b[1] load a[1] would allow us to vectorize - // this code and we still retain AllSameOpcode property. - // FIXME: This load reordering might break AllSameOpcode in some rare cases - // such as- - // add a[0],c[0] load b[0] - // add a[1],c[2] load b[1] - // b[2] load b[2] - // add a[3],c[3] load b[3] - for (unsigned j = 0, e = VL.size() - 1; j < e; ++j) { - if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) { - if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) { - if (isConsecutiveAccess(L, L1, *DL, *SE)) { - std::swap(Left[j + 1], Right[j + 1]); - continue; - } - } - } - if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) { - if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) { - if (isConsecutiveAccess(L, L1, *DL, *SE)) { - std::swap(Left[j + 1], Right[j + 1]); - continue; - } - } - } - // else unchanged - } + VLOperands Ops(VL, DL, SE); + // Reorder the operands in place. + Ops.reorder(); + Left = Ops.getVL(0); + Right = Ops.getVL(1); } void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, @@ -3082,13 +3546,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { continue; } - // Prepare the operand vector. 
- for (Value *V : E->Scalars) - Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB)); - Builder.SetInsertPoint(IBB->getTerminator()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); - Value *Vec = vectorizeTree(Operands); + Value *Vec = vectorizeTree(E->getOperand(i)); NewPhi->addIncoming(Vec, IBB); } @@ -3099,7 +3559,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::ExtractElement: { if (!E->NeedToGather) { - Value *V = VL0->getOperand(0); + Value *V = E->getSingleOperand(0); if (!E->ReorderIndices.empty()) { OrdersType Mask; inversePermutation(E->ReorderIndices, Mask); @@ -3132,11 +3592,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::ExtractValue: { if (!E->NeedToGather) { - LoadInst *LI = cast<LoadInst>(VL0->getOperand(0)); + LoadInst *LI = cast<LoadInst>(E->getSingleOperand(0)); Builder.SetInsertPoint(LI); PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy); - LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment()); + LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlignment()); Value *NewV = propagateMetadata(V, E->Scalars); if (!E->ReorderIndices.empty()) { OrdersType Mask; @@ -3177,13 +3637,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { - ValueList INVL; - for (Value *V : E->Scalars) - INVL.push_back(cast<Instruction>(V)->getOperand(0)); - setInsertPointAfterBundle(E->Scalars, S); - Value *InVec = vectorizeTree(INVL); + Value *InVec = vectorizeTree(E->getOperand(0)); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3202,16 +3658,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::FCmp: case Instruction::ICmp: { - ValueList LHSV, RHSV; - for (Value *V : E->Scalars) { - LHSV.push_back(cast<Instruction>(V)->getOperand(0)); - RHSV.push_back(cast<Instruction>(V)->getOperand(1)); - } - setInsertPointAfterBundle(E->Scalars, S); - Value *L = vectorizeTree(LHSV); - Value *R = vectorizeTree(RHSV); + Value *L = vectorizeTree(E->getOperand(0)); + Value *R = vectorizeTree(E->getOperand(1)); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3235,31 +3685,49 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::Select: { - ValueList TrueVec, FalseVec, CondVec; - for (Value *V : E->Scalars) { - CondVec.push_back(cast<Instruction>(V)->getOperand(0)); - TrueVec.push_back(cast<Instruction>(V)->getOperand(1)); - FalseVec.push_back(cast<Instruction>(V)->getOperand(2)); + setInsertPointAfterBundle(E->Scalars, S); + + Value *Cond = vectorizeTree(E->getOperand(0)); + Value *True = vectorizeTree(E->getOperand(1)); + Value *False = vectorizeTree(E->getOperand(2)); + + if (E->VectorizedValue) { + LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); + return E->VectorizedValue; } + Value *V = Builder.CreateSelect(Cond, True, False); + if (NeedToShuffleReuses) { + V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), + E->ReuseShuffleIndices, "shuffle"); + } + E->VectorizedValue = V; + ++NumVectorInstructions; + return V; + } + case Instruction::FNeg: { setInsertPointAfterBundle(E->Scalars, S); - Value *Cond = vectorizeTree(CondVec); - Value *True = vectorizeTree(TrueVec); - Value *False = vectorizeTree(FalseVec); + Value *Op = vectorizeTree(E->getOperand(0)); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << 
"SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } - Value *V = Builder.CreateSelect(Cond, True, False); + Value *V = Builder.CreateUnOp( + static_cast<Instruction::UnaryOps>(S.getOpcode()), Op); + propagateIRFlags(V, E->Scalars, VL0); + if (auto *I = dyn_cast<Instruction>(V)) + V = propagateMetadata(I, E->Scalars); + if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); } E->VectorizedValue = V; ++NumVectorInstructions; + return V; } case Instruction::Add: @@ -3280,21 +3748,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::And: case Instruction::Or: case Instruction::Xor: { - ValueList LHSVL, RHSVL; - if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) - reorderInputsAccordingToOpcode(S.getOpcode(), E->Scalars, LHSVL, - RHSVL); - else - for (Value *V : E->Scalars) { - auto *I = cast<Instruction>(V); - LHSVL.push_back(I->getOperand(0)); - RHSVL.push_back(I->getOperand(1)); - } - setInsertPointAfterBundle(E->Scalars, S); - Value *LHS = vectorizeTree(LHSVL); - Value *RHS = vectorizeTree(RHSVL); + Value *LHS = vectorizeTree(E->getOperand(0)); + Value *RHS = vectorizeTree(E->getOperand(1)); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3341,7 +3798,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0)); unsigned Alignment = LI->getAlignment(); - LI = Builder.CreateLoad(VecPtr); + LI = Builder.CreateLoad(VecTy, VecPtr); if (!Alignment) { Alignment = DL->getABITypeAlignment(ScalarLoadTy); } @@ -3367,13 +3824,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { unsigned Alignment = SI->getAlignment(); unsigned AS = SI->getPointerAddressSpace(); - ValueList ScalarStoreValues; - for (Value *V : E->Scalars) - ScalarStoreValues.push_back(cast<StoreInst>(V)->getValueOperand()); - setInsertPointAfterBundle(E->Scalars, S); - Value *VecValue = vectorizeTree(ScalarStoreValues); + Value *VecValue = vectorizeTree(E->getOperand(0)); Value *ScalarPtr = SI->getPointerOperand(); Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS)); StoreInst *ST = Builder.CreateStore(VecValue, VecPtr); @@ -3400,20 +3853,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::GetElementPtr: { setInsertPointAfterBundle(E->Scalars, S); - ValueList Op0VL; - for (Value *V : E->Scalars) - Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0)); - - Value *Op0 = vectorizeTree(Op0VL); + Value *Op0 = vectorizeTree(E->getOperand(0)); std::vector<Value *> OpVecs; for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e; ++j) { - ValueList OpVL; - for (Value *V : E->Scalars) - OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j)); - - Value *OpVec = vectorizeTree(OpVL); + Value *OpVec = vectorizeTree(E->getOperand(j)); OpVecs.push_back(OpVec); } @@ -3443,20 +3888,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { std::vector<Value *> OpVecs; for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) { ValueList OpVL; - // ctlz,cttz and powi are special intrinsics whose second argument is - // a scalar. This argument should not be vectorized. - if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) { + // Some intrinsics have scalar arguments. This argument should not be + // vectorized. 
+ if (hasVectorInstrinsicScalarOpd(IID, j)) { CallInst *CEI = cast<CallInst>(VL0); ScalarArg = CEI->getArgOperand(j); OpVecs.push_back(CEI->getArgOperand(j)); continue; } - for (Value *V : E->Scalars) { - CallInst *CEI = cast<CallInst>(V); - OpVL.push_back(CEI->getArgOperand(j)); - } - Value *OpVec = vectorizeTree(OpVL); + Value *OpVec = vectorizeTree(E->getOperand(j)); LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); OpVecs.push_back(OpVec); } @@ -3485,7 +3926,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::ShuffleVector: { - ValueList LHSVL, RHSVL; assert(S.isAltShuffle() && ((Instruction::isBinaryOp(S.getOpcode()) && Instruction::isBinaryOp(S.getAltOpcode())) || @@ -3495,16 +3935,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *LHS, *RHS; if (Instruction::isBinaryOp(S.getOpcode())) { - reorderAltShuffleOperands(S, E->Scalars, LHSVL, RHSVL); setInsertPointAfterBundle(E->Scalars, S); - LHS = vectorizeTree(LHSVL); - RHS = vectorizeTree(RHSVL); + LHS = vectorizeTree(E->getOperand(0)); + RHS = vectorizeTree(E->getOperand(1)); } else { - ValueList INVL; - for (Value *V : E->Scalars) - INVL.push_back(cast<Instruction>(V)->getOperand(0)); setInsertPointAfterBundle(E->Scalars, S); - LHS = vectorizeTree(INVL); + LHS = vectorizeTree(E->getOperand(0)); } if (E->VectorizedValue) { @@ -3578,20 +4014,20 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { } Builder.SetInsertPoint(&F->getEntryBlock().front()); - auto *VectorRoot = vectorizeTree(&VectorizableTree[0]); + auto *VectorRoot = vectorizeTree(VectorizableTree[0].get()); // If the vectorized tree can be rewritten in a smaller type, we truncate the // vectorized root. InstCombine will then rewrite the entire expression. We // sign extend the extracted values below. - auto *ScalarRoot = VectorizableTree[0].Scalars[0]; + auto *ScalarRoot = VectorizableTree[0]->Scalars[0]; if (MinBWs.count(ScalarRoot)) { if (auto *I = dyn_cast<Instruction>(VectorRoot)) Builder.SetInsertPoint(&*++BasicBlock::iterator(I)); - auto BundleWidth = VectorizableTree[0].Scalars.size(); + auto BundleWidth = VectorizableTree[0]->Scalars.size(); auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); auto *VecTy = VectorType::get(MinTy, BundleWidth); auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy); - VectorizableTree[0].VectorizedValue = Trunc; + VectorizableTree[0]->VectorizedValue = Trunc; } LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() @@ -3687,8 +4123,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { } // For each vectorized value: - for (TreeEntry &EIdx : VectorizableTree) { - TreeEntry *Entry = &EIdx; + for (auto &TEPtr : VectorizableTree) { + TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. if (Entry->NeedToGather) @@ -3721,7 +4157,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { Builder.ClearInsertionPoint(); - return VectorizableTree[0].VectorizedValue; + return VectorizableTree[0]->VectorizedValue; } void BoUpSLP::optimizeGatherSequence() { @@ -3767,10 +4203,10 @@ void BoUpSLP::optimizeGatherSequence() { // Sort blocks by domination. This ensures we visit a block after all blocks // dominating it are visited. 
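Most of the vectorizeTree hunks above follow one pattern: instead of regathering operand lists from E->Scalars inside every opcode handler, each handler now reads them back from the tree entry via getOperand(), and the tree itself appears to hold owning pointers (hence VectorizableTree[0].get() and TEPtr.get()). The sketch below is a simplified, self-contained model of that layout; TreeEntryModel and its members are hypothetical names, not the LLVM classes.

// Illustration only: per-entry operand storage plus owning-pointer storage
// for the tree nodes, so entry addresses stay stable as the tree grows.
#include <cassert>
#include <memory>
#include <string>
#include <utility>
#include <vector>

using Value = std::string;                 // stand-in for llvm::Value*
using ValueList = std::vector<Value>;

struct TreeEntryModel {
  ValueList Scalars;                       // the bundled scalar instructions
  std::vector<ValueList> Operands;         // Operands[I] = I-th operand of every lane

  void setOperand(unsigned Idx, ValueList Ops) {
    if (Operands.size() <= Idx)
      Operands.resize(Idx + 1);
    Operands[Idx] = std::move(Ops);
  }
  const ValueList &getOperand(unsigned Idx) const { return Operands[Idx]; }
};

int main() {
  std::vector<std::unique_ptr<TreeEntryModel>> Tree;
  Tree.push_back(std::make_unique<TreeEntryModel>());

  TreeEntryModel *Root = Tree[0].get();
  Root->Scalars = {"add1", "add2"};
  Root->setOperand(0, {"a0", "a1"});       // per-lane LHS, filled once at build time
  Root->setOperand(1, {"b0", "b1"});       // per-lane RHS

  // What the rewritten handlers consume instead of re-walking Scalars.
  assert(Root->getOperand(1)[1] == "b1");
  return 0;
}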
- std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(), - [this](const DomTreeNode *A, const DomTreeNode *B) { - return DT->properlyDominates(A, B); - }); + llvm::stable_sort(CSEWorkList, + [this](const DomTreeNode *A, const DomTreeNode *B) { + return DT->properlyDominates(A, B); + }); // Perform O(N^2) search over the gather sequences and merge identical // instructions. TODO: We can further optimize this scan if we split the @@ -3989,7 +4425,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, << "\n"); return true; } - UpIter++; + ++UpIter; } if (DownIter != LowerEnd) { if (&*DownIter == I) { @@ -4003,7 +4439,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, << "\n"); return true; } - DownIter++; + ++DownIter; } assert((UpIter != UpperEnd || DownIter != LowerEnd) && "instruction not found in block"); @@ -4253,7 +4689,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { BS->ScheduleStart = nullptr; } -unsigned BoUpSLP::getVectorElementSize(Value *V) { +unsigned BoUpSLP::getVectorElementSize(Value *V) const { // If V is a store, just return the width of the stored value without // traversing the expression tree. This is the common case. if (auto *Store = dyn_cast<StoreInst>(V)) @@ -4390,7 +4826,7 @@ void BoUpSLP::computeMinimumValueSizes() { return; // We only attempt to truncate integer expressions. - auto &TreeRoot = VectorizableTree[0].Scalars; + auto &TreeRoot = VectorizableTree[0]->Scalars; auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType()); if (!TreeRootIT) return; @@ -4411,8 +4847,8 @@ void BoUpSLP::computeMinimumValueSizes() { // Collect the scalar values of the vectorizable expression. We will use this // context to determine which values can be demoted. If we see a truncation, // we mark it as seeding another demotion. - for (auto &Entry : VectorizableTree) - Expr.insert(Entry.Scalars.begin(), Entry.Scalars.end()); + for (auto &EntryPtr : VectorizableTree) + Expr.insert(EntryPtr->Scalars.begin(), EntryPtr->Scalars.end()); // Ensure the roots of the vectorizable tree don't form a cycle. They must // have a single external user that is not in the vectorizable tree. @@ -4746,38 +5182,29 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, BoUpSLP::ValueSet VectorizedStores; bool Changed = false; - // Do a quadratic search on all of the given stores in reverse order and find - // all of the pairs of stores that follow each other. - SmallVector<unsigned, 16> IndexQueue; - unsigned E = Stores.size(); - IndexQueue.resize(E - 1); - for (unsigned I = E; I > 0; --I) { - unsigned Idx = I - 1; - // If a store has multiple consecutive store candidates, search Stores - // array according to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ... - // This is because usually pairing with immediate succeeding or preceding - // candidate create the best chance to find slp vectorization opportunity. 
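The CSE worklist sort above only needs the property stated in its comment: a block is visited after every block that dominates it; the change itself is just the range-based llvm::stable_sort spelling. A self-contained sketch of that ordering is below, using dominator-tree depth as the sort key (a proper dominator always has strictly smaller depth); the real code compares with DominatorTree::properlyDominates instead, and the block names and depths here are made up for the example.

// Illustration only: sort blocks so dominators come before dominated blocks.
#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

struct BlockInfo {
  std::string Name;
  int DomTreeDepth;   // depth of the block's node in the dominator tree
};

int main() {
  // A straight-line CFG: entry dominates header, header dominates body, ...
  std::vector<BlockInfo> Work = {
      {"body", 2}, {"exit", 3}, {"entry", 0}, {"header", 1}};

  // Any proper dominator sits at a strictly smaller dominator-tree depth, so
  // sorting by depth visits a block only after everything dominating it.
  // stable_sort keeps blocks that do not dominate each other in their
  // original relative order, as the worklist sort above does.
  std::stable_sort(Work.begin(), Work.end(),
                   [](const BlockInfo &A, const BlockInfo &B) {
                     return A.DomTreeDepth < B.DomTreeDepth;
                   });

  assert(Work.front().Name == "entry" && Work.back().Name == "exit");
  return 0;
}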
- unsigned Offset = 1; - unsigned Cnt = 0; - for (unsigned J = 0; J < E - 1; ++J, ++Offset) { - if (Idx >= Offset) { - IndexQueue[Cnt] = Idx - Offset; - ++Cnt; - } - if (Idx + Offset < E) { - IndexQueue[Cnt] = Idx + Offset; - ++Cnt; - } - } + auto &&FindConsecutiveAccess = + [this, &Stores, &Heads, &Tails, &ConsecutiveChain] (int K, int Idx) { + if (!isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE)) + return false; - for (auto K : IndexQueue) { - if (isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE)) { Tails.insert(Stores[Idx]); Heads.insert(Stores[K]); ConsecutiveChain[Stores[K]] = Stores[Idx]; + return true; + }; + + // Do a quadratic search on all of the given stores in reverse order and find + // all of the pairs of stores that follow each other. + int E = Stores.size(); + for (int Idx = E - 1; Idx >= 0; --Idx) { + // If a store has multiple consecutive store candidates, search according + // to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ... + // This is because usually pairing with immediate succeeding or preceding + // candidate create the best chance to find slp vectorization opportunity. + for (int Offset = 1, F = std::max(E - Idx, Idx + 1); Offset < F; ++Offset) + if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) || + (Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx))) break; - } - } } // For stores that start but don't end a link in the chain: @@ -5740,6 +6167,9 @@ public: unsigned ReduxWidth = PowerOf2Floor(NumReducedVals); Value *VectorizedTree = nullptr; + + // FIXME: Fast-math-flags should be set based on the instructions in the + // reduction (not all of 'fast' are required). IRBuilder<> Builder(cast<Instruction>(ReductionRoot)); FastMathFlags Unsafe; Unsafe.setFast(); @@ -5929,10 +6359,14 @@ private: assert(isPowerOf2_32(ReduxWidth) && "We only handle power-of-two reductions for now"); - if (!IsPairwiseReduction) + if (!IsPairwiseReduction) { + // FIXME: The builder should use an FMF guard. It should not be hard-coded + // to 'fast'. + assert(Builder.getFastMathFlags().isFast() && "Expected 'fast' FMF"); return createSimpleTargetReduction( Builder, TTI, ReductionData.getOpcode(), VectorizedValue, ReductionData.getFlags(), ReductionOps.back()); + } Value *TmpVec = VectorizedValue; for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) { @@ -6256,7 +6690,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { } // Sort by type. - std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc); + llvm::stable_sort(Incoming, PhiTypeSorterFunc); // Try to vectorize elements base on their type. for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(), @@ -6297,7 +6731,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { SmallVector<WeakVH, 8> PostProcessInstructions; SmallDenseSet<Instruction *, 4> KeyNodes; - for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) { + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { // We may go through BB multiple times so skip the one we have checked. if (!VisitedInstrs.insert(&*it).second) { if (it->use_empty() && KeyNodes.count(&*it) > 0 && |
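The rewritten vectorizeStores loop above keeps the old search order and folds the bookkeeping into the FindConsecutiveAccess lambda: for each store Idx it probes Idx-1, Idx+1, Idx-2, Idx+2, ... and stops at the first consecutive partner. The sketch below models that expanding search with stores as integer element addresses and a hypothetical isConsecutiveAccess; the Heads/Tails sets are omitted and only the ConsecutiveChain map is kept.

// Illustration only: expanding-offset search for consecutive store pairs.
#include <algorithm>
#include <cassert>
#include <map>
#include <vector>

// The store at K is consecutive to the store at Idx when it writes the
// element immediately before it.
static bool isConsecutiveAccess(int K, int Idx) { return Idx == K + 1; }

int main() {
  std::vector<int> Stores = {40, 10, 41, 11};     // two interleaved chains
  std::map<int, int> ConsecutiveChain;            // head address -> next address

  auto FindConsecutiveAccess = [&](int K, int Idx) {
    if (!isConsecutiveAccess(Stores[K], Stores[Idx]))
      return false;
    ConsecutiveChain[Stores[K]] = Stores[Idx];
    return true;
  };

  int E = static_cast<int>(Stores.size());
  for (int Idx = E - 1; Idx >= 0; --Idx) {
    // Probe Idx-1, Idx+1, Idx-2, Idx+2, ...: the immediate neighbours are the
    // most likely partners, so stop at the first match.
    for (int Offset = 1, F = std::max(E - Idx, Idx + 1); Offset < F; ++Offset)
      if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) ||
          (Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx)))
        break;
  }

  assert(ConsecutiveChain.at(40) == 41 && ConsecutiveChain.at(10) == 11);
  return 0;
}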