path: root/lib/Transforms/Vectorize/SLPVectorizer.cpp
Diffstat (limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r--  lib/Transforms/Vectorize/SLPVectorizer.cpp  2302
1 file changed, 1635 insertions, 667 deletions
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index dcbcab459a6b..76ba62f5d596 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6,6 +6,7 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
+//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct vectorizable tree using the use-def chains. If a profitable tree
@@ -15,39 +16,89 @@
// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//
+
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/DOTGraphTraits.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
#include <memory>
+#include <set>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
using namespace llvm;
+using namespace llvm::PatternMatch;
using namespace slpvectorizer;
#define SV_NAME "slp-vectorizer"
@@ -156,6 +207,119 @@ static bool isSplat(ArrayRef<Value *> VL) {
return true;
}
+/// Checks if the vector of instructions can be represented as a shuffle, like:
+/// %x0 = extractelement <4 x i8> %x, i32 0
+/// %x3 = extractelement <4 x i8> %x, i32 3
+/// %y1 = extractelement <4 x i8> %y, i32 1
+/// %y2 = extractelement <4 x i8> %y, i32 2
+/// %x0x0 = mul i8 %x0, %x0
+/// %x3x3 = mul i8 %x3, %x3
+/// %y1y1 = mul i8 %y1, %y1
+/// %y2y2 = mul i8 %y2, %y2
+/// %ins1 = insertelement <4 x i8> undef, i8 %x0x0, i32 0
+/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
+/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
+/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
+/// ret <4 x i8> %ins4
+/// can be transformed into:
+/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
+/// i32 6>
+/// %2 = mul <4 x i8> %1, %1
+/// ret <4 x i8> %2
+/// We convert this initially to something like:
+/// %x0 = extractelement <4 x i8> %x, i32 0
+/// %x3 = extractelement <4 x i8> %x, i32 3
+/// %y1 = extractelement <4 x i8> %y, i32 1
+/// %y2 = extractelement <4 x i8> %y, i32 2
+/// %1 = insertelement <4 x i8> undef, i8 %x0, i32 0
+/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
+/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
+/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
+/// %5 = mul <4 x i8> %4, %4
+/// %6 = extractelement <4 x i8> %5, i32 0
+/// %ins1 = insertelement <4 x i8> undef, i8 %6, i32 0
+/// %7 = extractelement <4 x i8> %5, i32 1
+/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
+/// %8 = extractelement <4 x i8> %5, i32 2
+/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2
+/// %9 = extractelement <4 x i8> %5, i32 3
+/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
+/// ret <4 x i8> %ins4
+/// InstCombiner transforms this into a shuffle and vector mul
+static Optional<TargetTransformInfo::ShuffleKind>
+isShuffle(ArrayRef<Value *> VL) {
+ auto *EI0 = cast<ExtractElementInst>(VL[0]);
+ unsigned Size = EI0->getVectorOperandType()->getVectorNumElements();
+ Value *Vec1 = nullptr;
+ Value *Vec2 = nullptr;
+ enum ShuffleMode {Unknown, FirstAlternate, SecondAlternate, Permute};
+ ShuffleMode CommonShuffleMode = Unknown;
+ for (unsigned I = 0, E = VL.size(); I < E; ++I) {
+ auto *EI = cast<ExtractElementInst>(VL[I]);
+ auto *Vec = EI->getVectorOperand();
+ // All vector operands must have the same number of vector elements.
+ if (Vec->getType()->getVectorNumElements() != Size)
+ return None;
+ auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
+ if (!Idx)
+ return None;
+ // Undefined behavior if Idx is negative or >= Size.
+ if (Idx->getValue().uge(Size))
+ continue;
+ unsigned IntIdx = Idx->getValue().getZExtValue();
+ // We can extractelement from undef vector.
+ if (isa<UndefValue>(Vec))
+ continue;
+ // For correct shuffling we have to have at most 2 different vector operands
+ // in all extractelement instructions.
+ if (Vec1 && Vec2 && Vec != Vec1 && Vec != Vec2)
+ return None;
+ if (CommonShuffleMode == Permute)
+ continue;
+ // If the extract index is not the same as the operation number, it is a
+ // permutation.
+ if (IntIdx != I) {
+ CommonShuffleMode = Permute;
+ continue;
+ }
+ // Check the shuffle mode for the current operation.
+ if (!Vec1)
+ Vec1 = Vec;
+ else if (Vec != Vec1)
+ Vec2 = Vec;
+ // Example: shufflevector A, B, <0,5,2,7>
+ // I is odd and IntIdx for A == I - FirstAlternate shuffle.
+ // I is even and IntIdx for B == I - FirstAlternate shuffle.
+ // Example: shufflevector A, B, <4,1,6,3>
+ // I is even and IntIdx for A == I - SecondAlternate shuffle.
+ // I is odd and IntIdx for B == I - SecondAlternate shuffle.
+ const bool IIsEven = I & 1;
+ const bool CurrVecIsA = Vec == Vec1;
+ const bool IIsOdd = !IIsEven;
+ const bool CurrVecIsB = !CurrVecIsA;
+ ShuffleMode CurrentShuffleMode =
+ ((IIsOdd && CurrVecIsA) || (IIsEven && CurrVecIsB)) ? FirstAlternate
+ : SecondAlternate;
+ // Common mode is not set or the same as the shuffle mode of the current
+ // operation - alternate.
+ if (CommonShuffleMode == Unknown)
+ CommonShuffleMode = CurrentShuffleMode;
+ // Common shuffle mode is not the same as the shuffle mode of the current
+ // operation - permutation.
+ if (CommonShuffleMode != CurrentShuffleMode)
+ CommonShuffleMode = Permute;
+ }
+ // If we're not crossing lanes in different vectors, consider it as blending.
+ if ((CommonShuffleMode == FirstAlternate ||
+ CommonShuffleMode == SecondAlternate) &&
+ Vec2)
+ return TargetTransformInfo::SK_Alternate;
+ // If Vec2 was never used, we have a permutation of a single vector, otherwise
+ // we have permutation of 2 vectors.
+ return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
+ : TargetTransformInfo::SK_PermuteSingleSrc;
+}
+
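As a rough standalone illustration of the classification described above (this is not part of the patch and uses invented names - Kind, classifyExtracts - rather than the TargetTransformInfo shuffle kinds), the rule is: at most two distinct source vectors; if two sources are used and every element stays in its own lane the result is a blend, otherwise a one- or two-source permutation.

#include <cstdio>
#include <utility>
#include <vector>

enum class Kind { SingleSrcPermute, TwoSrcBlend, TwoSrcPermute, NotAShuffle };

// Classify a bundle of extracts given as (source vector id, extracted lane).
static Kind classifyExtracts(const std::vector<std::pair<int, unsigned>> &VL) {
  int Vec1 = -1, Vec2 = -1;
  bool CrossedLanes = false;
  for (std::size_t I = 0; I < VL.size(); ++I) {
    int Src = VL[I].first;
    unsigned Lane = VL[I].second;
    if (Vec1 == -1)
      Vec1 = Src;
    else if (Src != Vec1 && Vec2 == -1)
      Vec2 = Src;
    else if (Src != Vec1 && Src != Vec2)
      return Kind::NotAShuffle;        // more than two distinct sources
    if (Lane != I)
      CrossedLanes = true;             // element leaves its own lane
  }
  if (Vec2 == -1)
    return Kind::SingleSrcPermute;     // only one source vector was seen
  return CrossedLanes ? Kind::TwoSrcPermute : Kind::TwoSrcBlend;
}

int main() {
  // <x0, y1, y2, x3>: two sources, lanes preserved -> blend (SK_Alternate-like).
  std::printf("%d\n", static_cast<int>(
                          classifyExtracts({{0, 0}, {1, 1}, {1, 2}, {0, 3}})));
  // <x3, x0, x2, x1>: one source, lanes crossed -> single-source permute.
  std::printf("%d\n", static_cast<int>(
                          classifyExtracts({{0, 3}, {0, 0}, {0, 2}, {0, 1}})));
}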
///\returns Opcode that can be clubbed with \p Op to create an alternate
/// sequence which can later be merged as a ShuffleVector instruction.
static unsigned getAltOpcode(unsigned Op) {
@@ -173,50 +337,107 @@ static unsigned getAltOpcode(unsigned Op) {
}
}
-/// true if the \p Value is odd, false otherwise.
static bool isOdd(unsigned Value) {
return Value & 1;
}
-///\returns bool representing if Opcode \p Op can be part
-/// of an alternate sequence which can later be merged as
-/// a ShuffleVector instruction.
-static bool canCombineAsAltInst(unsigned Op) {
- return Op == Instruction::FAdd || Op == Instruction::FSub ||
- Op == Instruction::Sub || Op == Instruction::Add;
+static bool sameOpcodeOrAlt(unsigned Opcode, unsigned AltOpcode,
+ unsigned CheckedOpcode) {
+ return Opcode == CheckedOpcode || AltOpcode == CheckedOpcode;
}
-/// \returns ShuffleVector instruction if instructions in \p VL have
-/// alternate fadd,fsub / fsub,fadd/add,sub/sub,add sequence.
-/// (i.e. e.g. opcodes of fadd,fsub,fadd,fsub...)
-static unsigned isAltInst(ArrayRef<Value *> VL) {
- Instruction *I0 = dyn_cast<Instruction>(VL[0]);
- unsigned Opcode = I0->getOpcode();
- unsigned AltOpcode = getAltOpcode(Opcode);
- for (int i = 1, e = VL.size(); i < e; i++) {
- Instruction *I = dyn_cast<Instruction>(VL[i]);
- if (!I || I->getOpcode() != (isOdd(i) ? AltOpcode : Opcode))
- return 0;
- }
- return Instruction::ShuffleVector;
+/// Chooses the correct key for scheduling data. If \p Op has the same (or
+/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
+/// OpValue.
+static Value *isOneOf(Value *OpValue, Value *Op) {
+ auto *I = dyn_cast<Instruction>(Op);
+ if (!I)
+ return OpValue;
+ auto *OpInst = cast<Instruction>(OpValue);
+ unsigned OpInstOpcode = OpInst->getOpcode();
+ unsigned IOpcode = I->getOpcode();
+ if (sameOpcodeOrAlt(OpInstOpcode, getAltOpcode(OpInstOpcode), IOpcode))
+ return Op;
+ return OpValue;
}
-/// \returns The opcode if all of the Instructions in \p VL have the same
-/// opcode, or zero.
-static unsigned getSameOpcode(ArrayRef<Value *> VL) {
- Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+namespace {
+
+/// Contains data for the instructions going to be vectorized.
+struct RawInstructionsData {
+ /// Main Opcode of the instructions going to be vectorized.
+ unsigned Opcode = 0;
+
+ /// True if the list contains instructions with alternate opcodes.
+ bool HasAltOpcodes = false;
+};
+
+} // end anonymous namespace
+
+/// Checks the list of the vectorized instructions \p VL and returns info about
+/// this list.
+static RawInstructionsData getMainOpcode(ArrayRef<Value *> VL) {
+ auto *I0 = dyn_cast<Instruction>(VL[0]);
if (!I0)
- return 0;
+ return {};
+ RawInstructionsData Res;
unsigned Opcode = I0->getOpcode();
- for (int i = 1, e = VL.size(); i < e; i++) {
- Instruction *I = dyn_cast<Instruction>(VL[i]);
- if (!I || Opcode != I->getOpcode()) {
- if (canCombineAsAltInst(Opcode) && i == 1)
- return isAltInst(VL);
- return 0;
+ // Walk through the list of instructions to be vectorized in order to
+ // check the structure described by RawInstructionsData.
+ for (unsigned Cnt = 0, E = VL.size(); Cnt != E; ++Cnt) {
+ auto *I = dyn_cast<Instruction>(VL[Cnt]);
+ if (!I)
+ return {};
+ if (Opcode != I->getOpcode())
+ Res.HasAltOpcodes = true;
+ }
+ Res.Opcode = Opcode;
+ return Res;
+}
+
+namespace {
+
+/// Main data required for vectorization of instructions.
+struct InstructionsState {
+ /// The very first instruction in the list with the main opcode.
+ Value *OpValue = nullptr;
+
+ /// The main opcode for the list of instructions.
+ unsigned Opcode = 0;
+
+ /// True if some of the instructions in the list have alternate opcodes.
+ bool IsAltShuffle = false;
+
+ InstructionsState() = default;
+ InstructionsState(Value *OpValue, unsigned Opcode, bool IsAltShuffle)
+ : OpValue(OpValue), Opcode(Opcode), IsAltShuffle(IsAltShuffle) {}
+};
+
+} // end anonymous namespace
+
+/// \returns analysis of the instructions in \p VL described in
+/// InstructionsState: the opcode with which we suppose the whole list
+/// could be vectorized, even if its structure is diverse.
+static InstructionsState getSameOpcode(ArrayRef<Value *> VL) {
+ auto Res = getMainOpcode(VL);
+ unsigned Opcode = Res.Opcode;
+ if (!Res.HasAltOpcodes)
+ return InstructionsState(VL[0], Opcode, false);
+ auto *OpInst = cast<Instruction>(VL[0]);
+ unsigned AltOpcode = getAltOpcode(Opcode);
+ // Examine each element in the list of instructions VL to determine
+ // whether some of the operations can be considered alternatives
+ // (for example, as subtraction relates to addition).
+ for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
+ auto *I = cast<Instruction>(VL[Cnt]);
+ unsigned InstOpcode = I->getOpcode();
+ if ((Res.HasAltOpcodes &&
+ InstOpcode != (isOdd(Cnt) ? AltOpcode : Opcode)) ||
+ (!Res.HasAltOpcodes && InstOpcode != Opcode)) {
+ return InstructionsState(OpInst, 0, false);
}
}
- return Opcode;
+ return InstructionsState(OpInst, Opcode, Res.HasAltOpcodes);
}
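As a minimal standalone sketch of the parity rule that getSameOpcode() enforces for mixed bundles (the string opcodes and the isAlternateSequence helper are invented for illustration and are not LLVM APIs): even positions must carry the main opcode and odd positions its alternate.

#include <cstdio>
#include <string>
#include <vector>

// True when positions 0, 2, 4, ... carry Main and positions 1, 3, 5, ... carry Alt.
static bool isAlternateSequence(const std::vector<std::string> &Ops,
                                const std::string &Main,
                                const std::string &Alt) {
  for (std::size_t I = 0; I < Ops.size(); ++I)
    if (Ops[I] != ((I & 1) ? Alt : Main))
      return false;
  return true;
}

int main() {
  // {fadd, fsub, fadd, fsub} is a valid alternate bundle -> prints 1.
  std::printf("%d\n", isAlternateSequence({"fadd", "fsub", "fadd", "fsub"},
                                          "fadd", "fsub"));
  // {fadd, fadd, fsub, fsub} violates the parity rule -> prints 0.
  std::printf("%d\n", isAlternateSequence({"fadd", "fadd", "fsub", "fsub"},
                                          "fadd", "fsub"));
}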
/// \returns true if all of the values in \p VL have the same type or false
@@ -247,7 +468,6 @@ static bool matchExtractIndex(Instruction *E, unsigned Idx, unsigned Opcode) {
/// possible scalar operand in vectorized instruction.
static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
TargetLibraryInfo *TLI) {
-
unsigned Opcode = UserInst->getOpcode();
switch (Opcode) {
case Instruction::Load: {
@@ -292,24 +512,25 @@ static bool isSimple(Instruction *I) {
}
namespace llvm {
+
namespace slpvectorizer {
+
/// Bottom Up SLP Vectorizer.
class BoUpSLP {
public:
- typedef SmallVector<Value *, 8> ValueList;
- typedef SmallVector<Instruction *, 16> InstrList;
- typedef SmallPtrSet<Value *, 16> ValueSet;
- typedef SmallVector<StoreInst *, 8> StoreList;
- typedef MapVector<Value *, SmallVector<Instruction *, 2>>
- ExtraValueToDebugLocsMap;
+ using ValueList = SmallVector<Value *, 8>;
+ using InstrList = SmallVector<Instruction *, 16>;
+ using ValueSet = SmallPtrSet<Value *, 16>;
+ using StoreList = SmallVector<StoreInst *, 8>;
+ using ExtraValueToDebugLocsMap =
+ MapVector<Value *, SmallVector<Instruction *, 2>>;
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
const DataLayout *DL, OptimizationRemarkEmitter *ORE)
- : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
- SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
- DL(DL), ORE(ORE), Builder(Se->getContext()) {
+ : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC),
+ DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
CodeMetrics::collectEphemeralValues(F, AC, EphValues);
// Use the vector register size specified by the target unless overridden
// by a command-line option.
@@ -331,6 +552,7 @@ public:
/// \brief Vectorize the tree that starts with the elements in \p VL.
/// Returns the vectorized root.
Value *vectorizeTree();
+
/// Vectorize the tree but with the list of externally used values \p
/// ExternallyUsedValues. Values in this MapVector can be replaced but the
/// generated extractvalue instructions.
@@ -348,6 +570,7 @@ public:
/// the purpose of scheduling and extraction in the \p UserIgnoreLst.
void buildTree(ArrayRef<Value *> Roots,
ArrayRef<Value *> UserIgnoreLst = None);
+
/// Construct a vectorizable tree that starts at \p Roots, ignoring users for
/// the purpose of scheduling and extraction in the \p UserIgnoreLst taking
/// into account (and updating it, if required) the list of externally used
@@ -374,7 +597,7 @@ public:
unsigned getTreeSize() const { return VectorizableTree.size(); }
/// \brief Perform LICM and CSE on the newly generated gather sequences.
- void optimizeGatherSequence();
+ void optimizeGatherSequence(Function &F);
/// \returns true if it is beneficial to reverse the vector order.
bool shouldReorder() const {
@@ -416,21 +639,30 @@ public:
private:
struct TreeEntry;
+ /// Checks if all users of \p I are part of the vectorization tree.
+ bool areAllUsersVectorized(Instruction *I) const;
+
/// \returns the cost of the vectorizable entry.
int getEntryCost(TreeEntry *E);
/// This is the recursive part of buildTree.
- void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int);
+ void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int UserIndx = -1,
+ int OpdNum = 0);
/// \returns True if the ExtractElement/ExtractValue instructions in VL can
/// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
- bool canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const;
+ bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const;
- /// Vectorize a single entry in the tree.
- Value *vectorizeTree(TreeEntry *E);
+ /// Vectorize a single entry in the tree. \p OpdNum indicates the ordinality
+ /// of the operand corresponding to this tree entry \p E for the user tree
+ /// entry indicated by \p UserIndx.
+ // In other words, "E == TreeEntry[UserIndx].getOperand(OpdNum)".
+ Value *vectorizeTree(TreeEntry *E, int OpdNum = 0, int UserIndx = -1);
- /// Vectorize a single entry in the tree, starting in \p VL.
- Value *vectorizeTree(ArrayRef<Value *> VL);
+ /// Vectorize a single entry in the tree, starting in \p VL. \p OpdNum
+ /// indicates the ordinality of the operand corresponding to the \p VL of
+ /// scalar values for the user indicated by \p UserIndx that this \p VL feeds into.
+ Value *vectorizeTree(ArrayRef<Value *> VL, int OpdNum = 0, int UserIndx = -1);
/// \returns the pointer to the vectorized value if \p VL is already
/// vectorized, or NULL. They may happen in cycles.
@@ -447,7 +679,7 @@ private:
/// \brief Set the Builder insert point to one after the last instruction in
/// the bundle
- void setInsertPointAfterBundle(ArrayRef<Value *> VL);
+ void setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue);
/// \returns a vector from a collection of scalars in \p VL.
Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
@@ -458,18 +690,17 @@ private:
/// \reorder commutative operands in alt shuffle if they result in
/// vectorized code.
- void reorderAltShuffleOperands(ArrayRef<Value *> VL,
+ void reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right);
+
/// \reorder commutative operands to get better probability of
/// generating vectorized code.
- void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
+ void reorderInputsAccordingToOpcode(unsigned Opcode, ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right);
struct TreeEntry {
- TreeEntry(std::vector<TreeEntry> &Container)
- : Scalars(), VectorizedValue(nullptr), NeedToGather(0),
- Container(Container) {}
+ TreeEntry(std::vector<TreeEntry> &Container) : Container(Container) {}
/// \returns true if the scalars in VL are equal to this entry.
bool isSame(ArrayRef<Value *> VL) const {
@@ -477,14 +708,32 @@ private:
return std::equal(VL.begin(), VL.end(), Scalars.begin());
}
+ /// \returns true if the scalars in \p VL, once sorted by memory address,
+ /// match the scalars in this tree entry.
+ bool isFoundJumbled(ArrayRef<Value *> VL, const DataLayout &DL,
+ ScalarEvolution &SE) const {
+ assert(VL.size() == Scalars.size() && "Invalid size");
+ SmallVector<Value *, 8> List;
+ if (!sortLoadAccesses(VL, DL, SE, List))
+ return false;
+ return std::equal(List.begin(), List.end(), Scalars.begin());
+ }
+
/// A vector of scalars.
ValueList Scalars;
/// The Scalars are vectorized into this value. It is initialized to Null.
- Value *VectorizedValue;
+ Value *VectorizedValue = nullptr;
/// Do we need to gather this sequence ?
- bool NeedToGather;
+ bool NeedToGather = false;
+
+ /// Records optional shuffle mask for the uses of jumbled memory accesses.
+ /// For example, a non-empty ShuffleMask[1] represents the permutation of
+ /// lanes that operand #1 of this vectorized instruction should undergo
+ /// before feeding this vectorized instruction, whereas an empty
+ /// ShuffleMask[0] indicates that the lanes of operand #0 of this vectorized
+ /// instruction need not be permuted at all.
+ SmallVector<SmallVector<unsigned, 4>, 2> ShuffleMask;
/// Points back to the VectorizableTree.
///
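To illustrate what a recorded per-operand mask does, here is a standalone sketch (invented names, not LLVM code), assuming the shufflevector convention that Mask[i] names the source lane feeding lane i; the exact convention stored by the patch may differ.

#include <cstdio>
#include <vector>

// Reorder the lanes of a produced vector according to a mask; an empty mask
// means the identity permutation (the ShuffleMask[0]-is-empty case above).
static std::vector<int> applyLaneMask(const std::vector<int> &Produced,
                                      const std::vector<unsigned> &Mask) {
  if (Mask.empty())
    return Produced;
  std::vector<int> Reordered(Mask.size());
  for (unsigned I = 0; I < Mask.size(); ++I)
    Reordered[I] = Produced[Mask[I]];
  return Reordered;
}

int main() {
  // Loads were vectorized in sorted order {a[0], a[1], a[2], a[3]}, but the
  // user expects the original jumbled order {a[1], a[0], a[3], a[2]}.
  std::vector<int> Sorted = {10, 11, 12, 13};
  for (int V : applyLaneMask(Sorted, {1, 0, 3, 2}))
    std::printf("%d ", V); // prints: 11 10 13 12
  std::printf("\n");
}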
@@ -501,12 +750,31 @@ private:
/// Create a new VectorizableTree entry.
TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
- int &UserTreeIdx) {
+ int &UserTreeIdx, const InstructionsState &S,
+ ArrayRef<unsigned> ShuffleMask = None,
+ int OpdNum = 0) {
+ assert((!Vectorized || S.Opcode != 0) &&
+ "Vectorized TreeEntry without opcode");
VectorizableTree.emplace_back(VectorizableTree);
+
int idx = VectorizableTree.size() - 1;
TreeEntry *Last = &VectorizableTree[idx];
Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
Last->NeedToGather = !Vectorized;
+
+ TreeEntry *UserTreeEntry = nullptr;
+ if (UserTreeIdx != -1)
+ UserTreeEntry = &VectorizableTree[UserTreeIdx];
+
+ if (UserTreeEntry && !ShuffleMask.empty()) {
+ if ((unsigned)OpdNum >= UserTreeEntry->ShuffleMask.size())
+ UserTreeEntry->ShuffleMask.resize(OpdNum + 1);
+ assert(UserTreeEntry->ShuffleMask[OpdNum].empty() &&
+ "Mask already present");
+ using Mask = SmallVector<unsigned, 4>;
+ UserTreeEntry->ShuffleMask[OpdNum] =
+ Mask(ShuffleMask.begin(), ShuffleMask.end());
+ }
if (Vectorized) {
for (int i = 0, e = VL.size(); i != e; ++i) {
assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
@@ -548,16 +816,19 @@ private:
/// This POD struct describes one external user in the vectorized tree.
struct ExternalUser {
- ExternalUser (Value *S, llvm::User *U, int L) :
- Scalar(S), User(U), Lane(L){}
+ ExternalUser(Value *S, llvm::User *U, int L)
+ : Scalar(S), User(U), Lane(L) {}
+
// Which scalar in our function.
Value *Scalar;
+
// Which user that uses the scalar.
llvm::User *User;
+
// Which lane does the scalar belong to.
int Lane;
};
- typedef SmallVector<ExternalUser, 16> UserList;
+ using UserList = SmallVector<ExternalUser, 16>;
/// Checks if two instructions may access the same memory.
///
@@ -565,7 +836,6 @@ private:
/// is invariant in the calling loop.
bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
Instruction *Inst2) {
-
// First check if the result is already in the cache.
AliasCacheKey key = std::make_pair(Inst1, Inst2);
Optional<bool> &result = AliasCache[key];
@@ -583,7 +853,7 @@ private:
return aliased;
}
- typedef std::pair<Instruction *, Instruction *> AliasCacheKey;
+ using AliasCacheKey = std::pair<Instruction *, Instruction *>;
/// Cache for alias results.
/// TODO: consider moving this to the AliasAnalysis itself.
@@ -616,6 +886,7 @@ private:
/// Holds all of the instructions that we gathered.
SetVector<Instruction *> GatherSeq;
+
/// A list of blocks that we are going to CSE.
SetVector<BasicBlock *> CSEBlocks;
@@ -624,18 +895,13 @@ private:
/// instruction bundle (= a group of instructions which is combined into a
/// vector instruction).
struct ScheduleData {
-
// The initial value for the dependency counters. It means that the
// dependencies are not calculated yet.
enum { InvalidDeps = -1 };
- ScheduleData()
- : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr),
- NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0),
- Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps),
- UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {}
+ ScheduleData() = default;
- void init(int BlockSchedulingRegionID) {
+ void init(int BlockSchedulingRegionID, Value *OpVal) {
FirstInBundle = this;
NextInBundle = nullptr;
NextLoadStore = nullptr;
@@ -643,6 +909,7 @@ private:
SchedulingRegionID = BlockSchedulingRegionID;
UnscheduledDepsInBundle = UnscheduledDeps;
clearDependencies();
+ OpValue = OpVal;
}
/// Returns true if the dependency information has been calculated.
@@ -702,19 +969,19 @@ private:
}
}
- Instruction *Inst;
+ Instruction *Inst = nullptr;
/// Points to the head in an instruction bundle (and always to this for
/// single instructions).
- ScheduleData *FirstInBundle;
+ ScheduleData *FirstInBundle = nullptr;
/// Single linked list of all instructions in a bundle. Null if it is a
/// single instruction.
- ScheduleData *NextInBundle;
+ ScheduleData *NextInBundle = nullptr;
/// Single linked list of all memory instructions (e.g. load, store, call)
/// in the block - until the end of the scheduling region.
- ScheduleData *NextLoadStore;
+ ScheduleData *NextLoadStore = nullptr;
/// The dependent memory instructions.
/// This list is derived on demand in calculateDependencies().
@@ -722,31 +989,33 @@ private:
/// This ScheduleData is in the current scheduling region if this matches
/// the current SchedulingRegionID of BlockScheduling.
- int SchedulingRegionID;
+ int SchedulingRegionID = 0;
/// Used for getting a "good" final ordering of instructions.
- int SchedulingPriority;
+ int SchedulingPriority = 0;
/// The number of dependencies. Constitutes of the number of users of the
/// instruction plus the number of dependent memory instructions (if any).
/// This value is calculated on demand.
/// If InvalidDeps, the number of dependencies is not calculated yet.
- ///
- int Dependencies;
+ int Dependencies = InvalidDeps;
/// The number of dependencies minus the number of dependencies of scheduled
/// instructions. As soon as this is zero, the instruction/bundle gets ready
/// for scheduling.
/// Note that this is negative as long as Dependencies is not calculated.
- int UnscheduledDeps;
+ int UnscheduledDeps = InvalidDeps;
/// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
/// single instructions.
- int UnscheduledDepsInBundle;
+ int UnscheduledDepsInBundle = InvalidDeps;
/// True if this instruction is scheduled (or considered as scheduled in the
/// dry-run).
- bool IsScheduled;
+ bool IsScheduled = false;
+
+ /// The value serving as the opcode key for this schedule data.
+ Value *OpValue = nullptr;
};
#ifndef NDEBUG
@@ -756,22 +1025,14 @@ private:
return os;
}
#endif
+
friend struct GraphTraits<BoUpSLP *>;
friend struct DOTGraphTraits<BoUpSLP *>;
/// Contains all scheduling data for a basic block.
- ///
struct BlockScheduling {
-
BlockScheduling(BasicBlock *BB)
- : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
- ScheduleStart(nullptr), ScheduleEnd(nullptr),
- FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
- ScheduleRegionSize(0),
- ScheduleRegionSizeLimit(ScheduleRegionSizeBudget),
- // Make sure that the initial SchedulingRegionID is greater than the
- // initial SchedulingRegionID in ScheduleData (which is 0).
- SchedulingRegionID(1) {}
+ : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
void clear() {
ReadyInsts.clear();
@@ -799,6 +1060,18 @@ private:
return nullptr;
}
+ ScheduleData *getScheduleData(Value *V, Value *Key) {
+ if (V == Key)
+ return getScheduleData(V);
+ auto I = ExtraScheduleDataMap.find(V);
+ if (I != ExtraScheduleDataMap.end()) {
+ ScheduleData *SD = I->second[Key];
+ if (SD && SD->SchedulingRegionID == SchedulingRegionID)
+ return SD;
+ }
+ return nullptr;
+ }
+
bool isInSchedulingRegion(ScheduleData *SD) {
return SD->SchedulingRegionID == SchedulingRegionID;
}
@@ -812,19 +1085,29 @@ private:
ScheduleData *BundleMember = SD;
while (BundleMember) {
+ if (BundleMember->Inst != BundleMember->OpValue) {
+ BundleMember = BundleMember->NextInBundle;
+ continue;
+ }
// Handle the def-use chain dependencies.
for (Use &U : BundleMember->Inst->operands()) {
- ScheduleData *OpDef = getScheduleData(U.get());
- if (OpDef && OpDef->hasValidDependencies() &&
- OpDef->incrementUnscheduledDeps(-1) == 0) {
- // There are no more unscheduled dependencies after decrementing,
- // so we can put the dependent instruction into the ready list.
- ScheduleData *DepBundle = OpDef->FirstInBundle;
- assert(!DepBundle->IsScheduled &&
- "already scheduled bundle gets ready");
- ReadyList.insert(DepBundle);
- DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n");
- }
+ auto *I = dyn_cast<Instruction>(U.get());
+ if (!I)
+ continue;
+ doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
+ if (OpDef && OpDef->hasValidDependencies() &&
+ OpDef->incrementUnscheduledDeps(-1) == 0) {
+ // There are no more unscheduled dependencies after
+ // decrementing, so we can put the dependent instruction
+ // into the ready list.
+ ScheduleData *DepBundle = OpDef->FirstInBundle;
+ assert(!DepBundle->IsScheduled &&
+ "already scheduled bundle gets ready");
+ ReadyList.insert(DepBundle);
+ DEBUG(dbgs()
+ << "SLP: gets ready (def): " << *DepBundle << "\n");
+ }
+ });
}
// Handle the memory dependencies.
for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
@@ -835,22 +1118,35 @@ private:
assert(!DepBundle->IsScheduled &&
"already scheduled bundle gets ready");
ReadyList.insert(DepBundle);
- DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle << "\n");
+ DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle
+ << "\n");
}
}
BundleMember = BundleMember->NextInBundle;
}
}
+ void doForAllOpcodes(Value *V,
+ function_ref<void(ScheduleData *SD)> Action) {
+ if (ScheduleData *SD = getScheduleData(V))
+ Action(SD);
+ auto I = ExtraScheduleDataMap.find(V);
+ if (I != ExtraScheduleDataMap.end())
+ for (auto &P : I->second)
+ if (P.second->SchedulingRegionID == SchedulingRegionID)
+ Action(P.second);
+ }
+
/// Put all instructions into the ReadyList which are ready for scheduling.
template <typename ReadyListType>
void initialFillReadyList(ReadyListType &ReadyList) {
for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
- ScheduleData *SD = getScheduleData(I);
- if (SD->isSchedulingEntity() && SD->isReady()) {
- ReadyList.insert(SD);
- DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n");
- }
+ doForAllOpcodes(I, [&](ScheduleData *SD) {
+ if (SD->isSchedulingEntity() && SD->isReady()) {
+ ReadyList.insert(SD);
+ DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n");
+ }
+ });
}
}
@@ -862,9 +1158,12 @@ private:
/// Un-bundles a group of instructions.
void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
+ /// Allocates schedule data chunk.
+ ScheduleData *allocateScheduleDataChunks();
+
/// Extends the scheduling region so that V is inside the region.
/// \returns true if the region size is within the limit.
- bool extendSchedulingRegion(Value *V);
+ bool extendSchedulingRegion(Value *V, Value *OpValue);
/// Initialize the ScheduleData structures for new instructions in the
/// scheduling region.
@@ -897,6 +1196,10 @@ private:
/// ScheduleData structures are recycled.
DenseMap<Value *, ScheduleData *> ScheduleDataMap;
+ /// Maps an instruction to its extra ScheduleData, keyed by the leading
+ /// (main opcode) value of its bundle.
+ DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
+ ExtraScheduleDataMap;
+
struct ReadyList : SmallVector<ScheduleData *, 8> {
void insert(ScheduleData *SD) { push_back(SD); }
};
@@ -905,28 +1208,30 @@ private:
ReadyList ReadyInsts;
/// The first instruction of the scheduling region.
- Instruction *ScheduleStart;
+ Instruction *ScheduleStart = nullptr;
/// The first instruction _after_ the scheduling region.
- Instruction *ScheduleEnd;
+ Instruction *ScheduleEnd = nullptr;
/// The first memory accessing instruction in the scheduling region
/// (can be null).
- ScheduleData *FirstLoadStoreInRegion;
+ ScheduleData *FirstLoadStoreInRegion = nullptr;
/// The last memory accessing instruction in the scheduling region
/// (can be null).
- ScheduleData *LastLoadStoreInRegion;
+ ScheduleData *LastLoadStoreInRegion = nullptr;
/// The current size of the scheduling region.
- int ScheduleRegionSize;
+ int ScheduleRegionSize = 0;
/// The maximum size allowed for the scheduling region.
- int ScheduleRegionSizeLimit;
+ int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
/// The ID of the scheduling region. For a new vectorization iteration this
/// is incremented which "removes" all ScheduleData from the region.
- int SchedulingRegionID;
+ // Make sure that the initial SchedulingRegionID is greater than the
+ // initial SchedulingRegionID in ScheduleData (which is 0).
+ int SchedulingRegionID = 1;
};
/// Attaches the BlockScheduling structures to basic blocks.
@@ -940,10 +1245,10 @@ private:
ArrayRef<Value *> UserIgnoreList;
// Number of load bundles that contain consecutive loads.
- int NumLoadsWantToKeepOrder;
+ int NumLoadsWantToKeepOrder = 0;
// Number of load bundles that contain consecutive loads in reversed order.
- int NumLoadsWantToChangeOrder;
+ int NumLoadsWantToChangeOrder = 0;
// Analysis and block reference.
Function *F;
@@ -960,6 +1265,7 @@ private:
unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
unsigned MinVecRegSize; // Set by cl::opt (default: 128).
+
/// Instruction builder to construct the vectorized tree.
IRBuilder<> Builder;
@@ -970,20 +1276,20 @@ private:
/// original width.
MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
};
+
} // end namespace slpvectorizer
template <> struct GraphTraits<BoUpSLP *> {
- typedef BoUpSLP::TreeEntry TreeEntry;
+ using TreeEntry = BoUpSLP::TreeEntry;
/// NodeRef has to be a pointer per the GraphWriter.
- typedef TreeEntry *NodeRef;
+ using NodeRef = TreeEntry *;
/// \brief Add the VectorizableTree to the index iterator to be able to return
/// TreeEntry pointers.
struct ChildIteratorType
: public iterator_adaptor_base<ChildIteratorType,
SmallVector<int, 1>::iterator> {
-
std::vector<TreeEntry> &VectorizableTree;
ChildIteratorType(SmallVector<int, 1>::iterator W,
@@ -998,17 +1304,19 @@ template <> struct GraphTraits<BoUpSLP *> {
static ChildIteratorType child_begin(NodeRef N) {
return {N->UserTreeIndices.begin(), N->Container};
}
+
static ChildIteratorType child_end(NodeRef N) {
return {N->UserTreeIndices.end(), N->Container};
}
/// For the node iterator we just need to turn the TreeEntry iterator into a
/// TreeEntry* iterator so that it dereferences to NodeRef.
- typedef pointer_iterator<std::vector<TreeEntry>::iterator> nodes_iterator;
+ using nodes_iterator = pointer_iterator<std::vector<TreeEntry>::iterator>;
static nodes_iterator nodes_begin(BoUpSLP *R) {
return nodes_iterator(R->VectorizableTree.begin());
}
+
static nodes_iterator nodes_end(BoUpSLP *R) {
return nodes_iterator(R->VectorizableTree.end());
}
@@ -1017,7 +1325,7 @@ template <> struct GraphTraits<BoUpSLP *> {
};
template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
- typedef BoUpSLP::TreeEntry TreeEntry;
+ using TreeEntry = BoUpSLP::TreeEntry;
DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
@@ -1054,6 +1362,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
ExtraValueToDebugLocsMap ExternallyUsedValues;
buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
}
+
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
ExtraValueToDebugLocsMap &ExternallyUsedValues,
ArrayRef<Value *> UserIgnoreLst) {
@@ -1118,44 +1427,34 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
}
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
- int UserTreeIdx) {
- bool isAltShuffle = false;
+ int UserTreeIdx, int OpdNum) {
assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
+ InstructionsState S = getSameOpcode(VL);
if (Depth == RecursionMaxDepth) {
DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
return;
}
// Don't handle vectors.
- if (VL[0]->getType()->isVectorTy()) {
+ if (S.OpValue->getType()->isVectorTy()) {
DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
return;
}
- if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+ if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
if (SI->getValueOperand()->getType()->isVectorTy()) {
DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
return;
}
- unsigned Opcode = getSameOpcode(VL);
-
- // Check that this shuffle vector refers to the alternate
- // sequence of opcodes.
- if (Opcode == Instruction::ShuffleVector) {
- Instruction *I0 = dyn_cast<Instruction>(VL[0]);
- unsigned Op = I0->getOpcode();
- if (Op != Instruction::ShuffleVector)
- isAltShuffle = true;
- }
// If all of the operands are identical or constant we have a simple solution.
- if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !Opcode) {
+ if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.Opcode) {
DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
return;
}
@@ -1167,87 +1466,92 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (EphValues.count(VL[i])) {
DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
") is ephemeral.\n");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
return;
}
}
// Check if this is a duplicate of another entry.
- if (TreeEntry *E = getTreeEntry(VL[0])) {
+ if (TreeEntry *E = getTreeEntry(S.OpValue)) {
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
if (E->Scalars[i] != VL[i]) {
DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
return;
}
}
// Record the reuse of the tree node. FIXME, currently this is only used to
// properly draw the graph rather than for the actual vectorization.
E->UserTreeIndices.push_back(UserTreeIdx);
- DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
+ DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue << ".\n");
return;
}
// Check that none of the instructions in the bundle are already in the tree.
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
- if (ScalarToTreeEntry.count(VL[i])) {
+ auto *I = dyn_cast<Instruction>(VL[i]);
+ if (!I)
+ continue;
+ if (getTreeEntry(I)) {
DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
") is already in tree.\n");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
return;
}
}
- // If any of the scalars is marked as a value that needs to stay scalar then
+ // If any of the scalars is marked as a value that needs to stay scalar, then
// we need to gather the scalars.
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
if (MustGather.count(VL[i])) {
DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
return;
}
}
// Check that all of the users of the scalars that we want to vectorize are
// schedulable.
- Instruction *VL0 = cast<Instruction>(VL[0]);
+ auto *VL0 = cast<Instruction>(S.OpValue);
BasicBlock *BB = VL0->getParent();
if (!DT->isReachableFromEntry(BB)) {
// Don't go into unreachable blocks. They may contain instructions with
// dependency cycles which confuse the final scheduling.
DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
return;
}
- // Check that every instructions appears once in this bundle.
+ // Check that every instruction appears once in this bundle.
for (unsigned i = 0, e = VL.size(); i < e; ++i)
- for (unsigned j = i+1; j < e; ++j)
+ for (unsigned j = i + 1; j < e; ++j)
if (VL[i] == VL[j]) {
DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
return;
}
auto &BSRef = BlocksSchedules[BB];
- if (!BSRef) {
+ if (!BSRef)
BSRef = llvm::make_unique<BlockScheduling>(BB);
- }
+
BlockScheduling &BS = *BSRef.get();
- if (!BS.tryScheduleBundle(VL, this, VL0)) {
+ if (!BS.tryScheduleBundle(VL, this, S.OpValue)) {
DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
- assert((!BS.getScheduleData(VL[0]) ||
- !BS.getScheduleData(VL[0])->isPartOfBundle()) &&
+ assert((!BS.getScheduleData(VL0) ||
+ !BS.getScheduleData(VL0)->isPartOfBundle()) &&
"tryScheduleBundle should cancelScheduling on failure");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
return;
}
DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
- switch (Opcode) {
+ unsigned ShuffleOrOp = S.IsAltShuffle ?
+ (unsigned) Instruction::ShuffleVector : S.Opcode;
+ switch (ShuffleOrOp) {
case Instruction::PHI: {
PHINode *PH = dyn_cast<PHINode>(VL0);
@@ -1259,12 +1563,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (Term) {
DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx);
+ newTreeEntry(VL, true, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
@@ -1274,35 +1578,34 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
PH->getIncomingBlock(i)));
- buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
}
return;
}
case Instruction::ExtractValue:
case Instruction::ExtractElement: {
- bool Reuse = canReuseExtract(VL, Opcode);
+ bool Reuse = canReuseExtract(VL, VL0);
if (Reuse) {
DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
} else {
BS.cancelScheduling(VL, VL0);
}
- newTreeEntry(VL, Reuse, UserTreeIdx);
+ newTreeEntry(VL, Reuse, UserTreeIdx, S);
return;
}
case Instruction::Load: {
// Check that a vectorized load would load the same memory as a scalar
- // load.
- // For example we don't want vectorize loads that are smaller than 8 bit.
- // Even though we have a packed struct {<i2, i2, i2, i2>} LLVM treats
- // loading/storing it as an i8 struct. If we vectorize loads/stores from
- // such a struct we read/write packed bits disagreeing with the
+ // load. For example, we don't want to vectorize loads that are smaller
+ // than 8 bits. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
+ // treats loading/storing it as an i8 struct. If we vectorize loads/stores
+ // from such a struct, we read/write packed bits disagreeing with the
// unvectorized version.
- Type *ScalarTy = VL[0]->getType();
+ Type *ScalarTy = VL0->getType();
if (DL->getTypeSizeInBits(ScalarTy) !=
DL->getTypeAllocSizeInBits(ScalarTy)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
return;
}
@@ -1313,15 +1616,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
LoadInst *L = cast<LoadInst>(VL[i]);
if (!L->isSimple()) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
return;
}
}
// Check if the loads are consecutive, reversed, or neither.
- // TODO: What we really want is to sort the loads, but for now, check
- // the two likely directions.
bool Consecutive = true;
bool ReverseConsecutive = true;
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
@@ -1335,7 +1636,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (Consecutive) {
++NumLoadsWantToKeepOrder;
- newTreeEntry(VL, true, UserTreeIdx);
+ newTreeEntry(VL, true, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: added a vector of loads.\n");
return;
}
@@ -1349,15 +1650,41 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
break;
}
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
-
if (ReverseConsecutive) {
- ++NumLoadsWantToChangeOrder;
DEBUG(dbgs() << "SLP: Gathering reversed loads.\n");
- } else {
- DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
+ ++NumLoadsWantToChangeOrder;
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx, S);
+ return;
+ }
+
+ if (VL.size() > 2) {
+ bool ShuffledLoads = true;
+ SmallVector<Value *, 8> Sorted;
+ SmallVector<unsigned, 4> Mask;
+ if (sortLoadAccesses(VL, *DL, *SE, Sorted, &Mask)) {
+ auto NewVL = makeArrayRef(Sorted.begin(), Sorted.end());
+ for (unsigned i = 0, e = NewVL.size() - 1; i < e; ++i) {
+ if (!isConsecutiveAccess(NewVL[i], NewVL[i + 1], *DL, *SE)) {
+ ShuffledLoads = false;
+ break;
+ }
+ }
+ // TODO: Tracking how many loads want to have an arbitrary shuffled order
+ // would be useful.
+ if (ShuffledLoads) {
+ DEBUG(dbgs() << "SLP: added a vector of loads which needs "
+ "permutation of loaded lanes.\n");
+ newTreeEntry(NewVL, true, UserTreeIdx, S,
+ makeArrayRef(Mask.begin(), Mask.end()), OpdNum);
+ return;
+ }
+ }
}
+
+ DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx, S);
return;
}
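The jumbled-load path above relies on sortLoadAccesses and isConsecutiveAccess; as a rough standalone sketch of the idea (invented helper, element offsets standing in for real pointers and SCEV): sort the accesses by address, then require every neighbouring pair to be exactly one element apart.

#include <algorithm>
#include <cstdio>
#include <vector>

// True when the accesses, once put in address order, form one contiguous run.
static bool consecutiveAfterSorting(std::vector<long> ElementOffsets) {
  std::sort(ElementOffsets.begin(), ElementOffsets.end());
  for (std::size_t I = 0; I + 1 < ElementOffsets.size(); ++I)
    if (ElementOffsets[I + 1] - ElementOffsets[I] != 1)
      return false;
  return true;
}

int main() {
  // a[1], a[0], a[3], a[2]: jumbled but contiguous -> vectorizable with a mask.
  std::printf("%d\n", consecutiveAfterSorting({1, 0, 3, 2})); // 1
  // a[0], a[2], a[4], a[6]: strided, not contiguous -> still gathered.
  std::printf("%d\n", consecutiveAfterSorting({0, 2, 4, 6})); // 0
}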
case Instruction::ZExt:
@@ -1377,12 +1704,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
if (Ty != SrcTy || !isValidElementType(Ty)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx);
+ newTreeEntry(VL, true, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: added a vector of casts.\n");
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
@@ -1391,7 +1718,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
}
return;
}
@@ -1399,19 +1726,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::FCmp: {
// Check that all of the compares have the same predicate.
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
- Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType();
+ Type *ComparedTy = VL0->getOperand(0)->getType();
for (unsigned i = 1, e = VL.size(); i < e; ++i) {
CmpInst *Cmp = cast<CmpInst>(VL[i]);
if (Cmp->getPredicate() != P0 ||
Cmp->getOperand(0)->getType() != ComparedTy) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx);
+ newTreeEntry(VL, true, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: added a vector of compares.\n");
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
@@ -1420,7 +1747,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
}
return;
}
@@ -1442,17 +1769,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
- case Instruction::Xor: {
- newTreeEntry(VL, true, UserTreeIdx);
+ case Instruction::Xor:
+ newTreeEntry(VL, true, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
// Sort operands of the instructions so that each side is more likely to
// have the same opcode.
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
ValueList Left, Right;
- reorderInputsAccordingToOpcode(VL, Left, Right);
+ reorderInputsAccordingToOpcode(S.Opcode, VL, Left, Right);
buildTree_rec(Left, Depth + 1, UserTreeIdx);
- buildTree_rec(Right, Depth + 1, UserTreeIdx);
+ buildTree_rec(Right, Depth + 1, UserTreeIdx, 1);
return;
}
@@ -1462,30 +1789,30 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
}
return;
- }
+
case Instruction::GetElementPtr: {
// We don't combine GEPs with complicated (nested) indexing.
for (unsigned j = 0; j < VL.size(); ++j) {
if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
return;
}
}
// We can't combine several GEPs into one vector if they operate on
// different types.
- Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
+ Type *Ty0 = VL0->getOperand(0)->getType();
for (unsigned j = 0; j < VL.size(); ++j) {
Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
if (Ty0 != CurTy) {
DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
return;
}
}
@@ -1497,12 +1824,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
DEBUG(
dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx);
+ newTreeEntry(VL, true, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
for (unsigned i = 0, e = 2; i < e; ++i) {
ValueList Operands;
@@ -1510,7 +1837,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
}
return;
}
@@ -1519,12 +1846,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
return;
}
- newTreeEntry(VL, true, UserTreeIdx);
+ newTreeEntry(VL, true, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: added a vector of stores.\n");
ValueList Operands;
@@ -1536,13 +1863,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
case Instruction::Call: {
// Check if the calls are all to the same vectorizable intrinsic.
- CallInst *CI = cast<CallInst>(VL[0]);
+ CallInst *CI = cast<CallInst>(VL0);
// Check if this is an Intrinsic call or something that can be
// represented by an intrinsic call
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
if (!isTriviallyVectorizable(ID)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
return;
}
@@ -1556,7 +1883,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
!CI->hasIdenticalOperandBundleSchema(*CI2)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
<< "\n");
return;
@@ -1567,7 +1894,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Value *A1J = CI2->getArgOperand(1);
if (A1I != A1J) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
<< " argument "<< A1I<<"!=" << A1J
<< "\n");
@@ -1580,14 +1907,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
CI->op_begin() + CI->getBundleOperandsEndIndex(),
CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="
<< *VL[i] << '\n');
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx);
+ newTreeEntry(VL, true, UserTreeIdx, S);
for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
ValueList Operands;
// Prepare the operand vector.
@@ -1595,28 +1922,28 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
CallInst *CI2 = dyn_cast<CallInst>(j);
Operands.push_back(CI2->getArgOperand(i));
}
- buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
}
return;
}
- case Instruction::ShuffleVector: {
+ case Instruction::ShuffleVector:
// If this is not an alternate sequence of opcode like add-sub
// then do not vectorize this instruction.
- if (!isAltShuffle) {
+ if (!S.IsAltShuffle) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
return;
}
- newTreeEntry(VL, true, UserTreeIdx);
+ newTreeEntry(VL, true, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
// Reorder operands if reordering would enable vectorization.
if (isa<BinaryOperator>(VL0)) {
ValueList Left, Right;
- reorderAltShuffleOperands(VL, Left, Right);
+ reorderAltShuffleOperands(S.Opcode, VL, Left, Right);
buildTree_rec(Left, Depth + 1, UserTreeIdx);
- buildTree_rec(Right, Depth + 1, UserTreeIdx);
+ buildTree_rec(Right, Depth + 1, UserTreeIdx, 1);
return;
}
@@ -1626,13 +1953,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
}
return;
- }
+
default:
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, S);
DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
return;
}
@@ -1663,19 +1990,18 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
return N;
}
-bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const {
- assert(Opcode == Instruction::ExtractElement ||
- Opcode == Instruction::ExtractValue);
- assert(Opcode == getSameOpcode(VL) && "Invalid opcode");
+bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const {
+ Instruction *E0 = cast<Instruction>(OpValue);
+ assert(E0->getOpcode() == Instruction::ExtractElement ||
+ E0->getOpcode() == Instruction::ExtractValue);
+ assert(E0->getOpcode() == getSameOpcode(VL).Opcode && "Invalid opcode");
// Check if all of the extracts come from the same vector and from the
// correct offset.
- Value *VL0 = VL[0];
- Instruction *E0 = cast<Instruction>(VL0);
Value *Vec = E0->getOperand(0);
// We have to extract from a vector/aggregate with the same number of elements.
unsigned NElts;
- if (Opcode == Instruction::ExtractValue) {
+ if (E0->getOpcode() == Instruction::ExtractValue) {
const DataLayout &DL = E0->getModule()->getDataLayout();
NElts = canMapToVector(Vec->getType(), DL);
if (!NElts)
@@ -1692,20 +2018,24 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const {
return false;
// Check that all of the indices extract from the correct offset.
- if (!matchExtractIndex(E0, 0, Opcode))
- return false;
-
- for (unsigned i = 1, e = VL.size(); i < e; ++i) {
- Instruction *E = cast<Instruction>(VL[i]);
- if (!matchExtractIndex(E, i, Opcode))
+ for (unsigned I = 0, E = VL.size(); I < E; ++I) {
+ Instruction *Inst = cast<Instruction>(VL[I]);
+ if (!matchExtractIndex(Inst, I, Inst->getOpcode()))
return false;
- if (E->getOperand(0) != Vec)
+ if (Inst->getOperand(0) != Vec)
return false;
}
return true;
}
+bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
+ return I->hasOneUse() ||
+ std::all_of(I->user_begin(), I->user_end(), [this](User *U) {
+ return ScalarToTreeEntry.count(U) > 0;
+ });
+}
+
int BoUpSLP::getEntryCost(TreeEntry *E) {
ArrayRef<Value*> VL = E->Scalars;
@@ -1728,28 +2058,47 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
if (isSplat(VL)) {
return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
}
+ if (getSameOpcode(VL).Opcode == Instruction::ExtractElement) {
+ Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL);
+ if (ShuffleKind.hasValue()) {
+ int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy);
+ for (auto *V : VL) {
+ // If all users of instruction are going to be vectorized and this
+ // instruction itself is not going to be vectorized, consider this
+ // instruction as dead and remove its cost from the final cost of the
+ // vectorized tree.
+ if (areAllUsersVectorized(cast<Instruction>(V)) &&
+ !ScalarToTreeEntry.count(V)) {
+ auto *IO = cast<ConstantInt>(
+ cast<ExtractElementInst>(V)->getIndexOperand());
+ Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
+ IO->getZExtValue());
+ }
+ }
+ return Cost;
+ }
+ }
return getGatherCost(E->Scalars);
}
- unsigned Opcode = getSameOpcode(VL);
- assert(Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
- Instruction *VL0 = cast<Instruction>(VL[0]);
- switch (Opcode) {
- case Instruction::PHI: {
+ InstructionsState S = getSameOpcode(VL);
+ assert(S.Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
+ Instruction *VL0 = cast<Instruction>(S.OpValue);
+ unsigned ShuffleOrOp = S.IsAltShuffle ?
+ (unsigned) Instruction::ShuffleVector : S.Opcode;
+ switch (ShuffleOrOp) {
+ case Instruction::PHI:
return 0;
- }
+
case Instruction::ExtractValue:
- case Instruction::ExtractElement: {
- if (canReuseExtract(VL, Opcode)) {
+ case Instruction::ExtractElement:
+ if (canReuseExtract(VL, S.OpValue)) {
int DeadCost = 0;
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
Instruction *E = cast<Instruction>(VL[i]);
// If all users are going to be vectorized, the instruction can be
// considered dead.
// Likewise, if it has only one user, it will be vectorized for sure.
- if (E->hasOneUse() ||
- std::all_of(E->user_begin(), E->user_end(), [this](User *U) {
- return ScalarToTreeEntry.count(U) > 0;
- }))
+ if (areAllUsersVectorized(E))
// Take credit for instruction that will become dead.
DeadCost +=
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
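A worked example of the dead-extract credit above, using hypothetical unit costs rather than numbers from any real target cost model:

// VL = {e0, e1, e2, e3}, four extractelements whose users are all in the
// tree, with getVectorInstrCost(ExtractElement, VecTy, i) == 1 for each lane:
//   DeadCost = 1 + 1 + 1 + 1 = 4
//   getEntryCost(E) returns -DeadCost == -4, i.e. the bundle is modelled as a
//   gain because the four scalar extracts become dead.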
@@ -1757,7 +2106,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
return -DeadCost;
}
return getGatherCost(VecTy);
- }
+
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
@@ -1786,8 +2135,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
// Calculate the cost of this instruction.
VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
int ScalarCost = VecTy->getNumElements() *
- TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty(), VL0);
- int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy, VL0);
+ TTI->getCmpSelInstrCost(S.Opcode, ScalarTy, Builder.getInt1Ty(), VL0);
+ int VecCost = TTI->getCmpSelInstrCost(S.Opcode, VecTy, MaskTy, VL0);
return VecCost - ScalarCost;
}
case Instruction::Add:
@@ -1848,9 +2197,9 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
SmallVector<const Value *, 4> Operands(VL0->operand_values());
int ScalarCost =
VecTy->getNumElements() *
- TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
+ TTI->getArithmeticInstrCost(S.Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
Op2VP, Operands);
- int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
+ int VecCost = TTI->getArithmeticInstrCost(S.Opcode, VecTy, Op1VK, Op2VK,
Op1VP, Op2VP, Operands);
return VecCost - ScalarCost;
}
@@ -1968,7 +2317,6 @@ bool BoUpSLP::isFullyVectorizableTinyTree() {
}
bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() {
-
// We can vectorize the tree if its size is greater than or equal to the
// minimum size specified by the MinTreeSize command line option.
if (VectorizableTree.size() >= MinTreeSize)
@@ -2139,13 +2487,18 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
// load a[3] + load b[3]
// Reordering the operands of the second element, load b[1] and load a[1],
// would allow us to vectorize this code.
-void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
+void BoUpSLP::reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right) {
// Push left and right operands of binary operation into Left and Right
- for (Value *i : VL) {
- Left.push_back(cast<Instruction>(i)->getOperand(0));
- Right.push_back(cast<Instruction>(i)->getOperand(1));
+ unsigned AltOpcode = getAltOpcode(Opcode);
+ (void)AltOpcode;
+ for (Value *V : VL) {
+ auto *I = cast<Instruction>(V);
+ assert(sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode()) &&
+ "Incorrect instruction in vector");
+ Left.push_back(I->getOperand(0));
+ Right.push_back(I->getOperand(1));
}
// Reorder if we have a commutative operation and consecutive access
@@ -2190,14 +2543,12 @@ void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
// The vectorizer tries either to have all elements on one side be instructions
// with the same opcode, to enable further vectorization, or to have a splat to
// lower the vectorization cost.
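// Editor's illustration (not from the patch): for the bundle
//   VL = { X + A, Y + A, Z + A }
// keeping A on the right-hand side yields the splat operand {A, A, A}, which
// is cheap to materialize, whereas splitting A across both sides would force
// an extra gather.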
-static bool shouldReorderOperands(int i, Instruction &I,
- SmallVectorImpl<Value *> &Left,
- SmallVectorImpl<Value *> &Right,
- bool AllSameOpcodeLeft,
- bool AllSameOpcodeRight, bool SplatLeft,
- bool SplatRight) {
- Value *VLeft = I.getOperand(0);
- Value *VRight = I.getOperand(1);
+static bool shouldReorderOperands(
+ int i, unsigned Opcode, Instruction &I, ArrayRef<Value *> Left,
+ ArrayRef<Value *> Right, bool AllSameOpcodeLeft, bool AllSameOpcodeRight,
+ bool SplatLeft, bool SplatRight, Value *&VLeft, Value *&VRight) {
+ VLeft = I.getOperand(0);
+ VRight = I.getOperand(1);
// If we have "SplatRight", try to see if commuting is needed to preserve it.
if (SplatRight) {
if (VRight == Right[i - 1])
@@ -2253,15 +2604,16 @@ static bool shouldReorderOperands(int i, Instruction &I,
return false;
}
-void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
+void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode,
+ ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right) {
-
- if (VL.size()) {
+ if (!VL.empty()) {
// Peel the first iteration out of the loop since there's nothing
// interesting to do anyway and it simplifies the checks in the loop.
- auto VLeft = cast<Instruction>(VL[0])->getOperand(0);
- auto VRight = cast<Instruction>(VL[0])->getOperand(1);
+ auto *I = cast<Instruction>(VL[0]);
+ Value *VLeft = I->getOperand(0);
+ Value *VRight = I->getOperand(1);
if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
// Favor having instruction to the right. FIXME: why?
std::swap(VLeft, VRight);
@@ -2278,16 +2630,21 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
for (unsigned i = 1, e = VL.size(); i != e; ++i) {
Instruction *I = cast<Instruction>(VL[i]);
- assert(I->isCommutative() && "Can only process commutative instruction");
+ assert(((I->getOpcode() == Opcode && I->isCommutative()) ||
+ (I->getOpcode() != Opcode && Instruction::isCommutative(Opcode))) &&
+ "Can only process commutative instruction");
// Commute to favor either a splat or maximizing having the same opcodes on
// one side.
- if (shouldReorderOperands(i, *I, Left, Right, AllSameOpcodeLeft,
- AllSameOpcodeRight, SplatLeft, SplatRight)) {
- Left.push_back(I->getOperand(1));
- Right.push_back(I->getOperand(0));
+ Value *VLeft;
+ Value *VRight;
+ if (shouldReorderOperands(i, Opcode, *I, Left, Right, AllSameOpcodeLeft,
+ AllSameOpcodeRight, SplatLeft, SplatRight, VLeft,
+ VRight)) {
+ Left.push_back(VRight);
+ Right.push_back(VLeft);
} else {
- Left.push_back(I->getOperand(0));
- Right.push_back(I->getOperand(1));
+ Left.push_back(VLeft);
+ Right.push_back(VRight);
}
// Update Splat* and AllSameOpcode* after the insertion.
SplatRight = SplatRight && (Right[i - 1] == Right[i]);
@@ -2340,14 +2697,17 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
}
}
-void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
-
+void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue) {
// Get the basic block this bundle is in. All instructions in the bundle
// should be in this block.
- auto *Front = cast<Instruction>(VL.front());
+ auto *Front = cast<Instruction>(OpValue);
auto *BB = Front->getParent();
- assert(all_of(make_range(VL.begin(), VL.end()), [&](Value *V) -> bool {
- return cast<Instruction>(V)->getParent() == BB;
+ const unsigned Opcode = cast<Instruction>(OpValue)->getOpcode();
+ const unsigned AltOpcode = getAltOpcode(Opcode);
+ assert(llvm::all_of(make_range(VL.begin(), VL.end()), [=](Value *V) -> bool {
+ return !sameOpcodeOrAlt(Opcode, AltOpcode,
+ cast<Instruction>(V)->getOpcode()) ||
+ cast<Instruction>(V)->getParent() == BB;
}));
// The last instruction in the bundle in program order.
@@ -2358,10 +2718,12 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
// VL.back() and iterate over schedule data until we reach the end of the
// bundle. The end of the bundle is marked by null ScheduleData.
if (BlocksSchedules.count(BB)) {
- auto *Bundle = BlocksSchedules[BB]->getScheduleData(VL.back());
+ auto *Bundle =
+ BlocksSchedules[BB]->getScheduleData(isOneOf(OpValue, VL.back()));
if (Bundle && Bundle->isPartOfBundle())
for (; Bundle; Bundle = Bundle->NextInBundle)
- LastInst = Bundle->Inst;
+ if (Bundle->OpValue == Bundle->Inst)
+ LastInst = Bundle->Inst;
}
// LastInst can still be null at this point if there's either not an entry
@@ -2385,7 +2747,7 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
if (!LastInst) {
SmallPtrSet<Value *, 16> Bundle(VL.begin(), VL.end());
for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
- if (Bundle.erase(&I))
+ if (Bundle.erase(&I) && sameOpcodeOrAlt(Opcode, AltOpcode, I.getOpcode()))
LastInst = &I;
if (Bundle.empty())
break;
@@ -2435,27 +2797,41 @@ Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const {
return nullptr;
}
-Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
- if (TreeEntry *E = getTreeEntry(VL[0]))
- if (E->isSame(VL))
- return vectorizeTree(E);
+Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, int OpdNum, int UserIndx) {
+ InstructionsState S = getSameOpcode(VL);
+ if (S.Opcode) {
+ if (TreeEntry *E = getTreeEntry(S.OpValue)) {
+ TreeEntry *UserTreeEntry = nullptr;
+ if (UserIndx != -1)
+ UserTreeEntry = &VectorizableTree[UserIndx];
+
+ if (E->isSame(VL) ||
+ (UserTreeEntry &&
+ (unsigned)OpdNum < UserTreeEntry->ShuffleMask.size() &&
+ !UserTreeEntry->ShuffleMask[OpdNum].empty() &&
+ E->isFoundJumbled(VL, *DL, *SE)))
+ return vectorizeTree(E, OpdNum, UserIndx);
+ }
+ }
- Type *ScalarTy = VL[0]->getType();
- if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+ Type *ScalarTy = S.OpValue->getType();
+ if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
ScalarTy = SI->getValueOperand()->getType();
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
return Gather(VL, VecTy);
}
-Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
+Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {
IRBuilder<>::InsertPointGuard Guard(Builder);
+ TreeEntry *UserTreeEntry = nullptr;
if (E->VectorizedValue) {
DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
return E->VectorizedValue;
}
+ InstructionsState S = getSameOpcode(E->Scalars);
Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
Type *ScalarTy = VL0->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
@@ -2463,15 +2839,19 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
if (E->NeedToGather) {
- setInsertPointAfterBundle(E->Scalars);
+ setInsertPointAfterBundle(E->Scalars, VL0);
auto *V = Gather(E->Scalars, VecTy);
E->VectorizedValue = V;
return V;
}
- unsigned Opcode = getSameOpcode(E->Scalars);
+ assert(ScalarToTreeEntry.count(E->Scalars[0]) &&
+ "Expected user tree entry, missing!");
+ int CurrIndx = ScalarToTreeEntry[E->Scalars[0]];
- switch (Opcode) {
+ unsigned ShuffleOrOp = S.IsAltShuffle ?
+ (unsigned) Instruction::ShuffleVector : S.Opcode;
+ switch (ShuffleOrOp) {
case Instruction::PHI: {
PHINode *PH = dyn_cast<PHINode>(VL0);
Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
@@ -2498,7 +2878,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Builder.SetInsertPoint(IBB->getTerminator());
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
- Value *Vec = vectorizeTree(Operands);
+ Value *Vec = vectorizeTree(Operands, i, CurrIndx);
NewPhi->addIncoming(Vec, IBB);
}
@@ -2508,18 +2888,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
case Instruction::ExtractElement: {
- if (canReuseExtract(E->Scalars, Instruction::ExtractElement)) {
+ if (canReuseExtract(E->Scalars, VL0)) {
Value *V = VL0->getOperand(0);
E->VectorizedValue = V;
return V;
}
- setInsertPointAfterBundle(E->Scalars);
+ setInsertPointAfterBundle(E->Scalars, VL0);
auto *V = Gather(E->Scalars, VecTy);
E->VectorizedValue = V;
return V;
}
case Instruction::ExtractValue: {
- if (canReuseExtract(E->Scalars, Instruction::ExtractValue)) {
+ if (canReuseExtract(E->Scalars, VL0)) {
LoadInst *LI = cast<LoadInst>(VL0->getOperand(0));
Builder.SetInsertPoint(LI);
PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
@@ -2528,7 +2908,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
E->VectorizedValue = V;
return propagateMetadata(V, E->Scalars);
}
- setInsertPointAfterBundle(E->Scalars);
+ setInsertPointAfterBundle(E->Scalars, VL0);
auto *V = Gather(E->Scalars, VecTy);
E->VectorizedValue = V;
return V;
@@ -2549,9 +2929,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
for (Value *V : E->Scalars)
INVL.push_back(cast<Instruction>(V)->getOperand(0));
- setInsertPointAfterBundle(E->Scalars);
+ setInsertPointAfterBundle(E->Scalars, VL0);
- Value *InVec = vectorizeTree(INVL);
+ Value *InVec = vectorizeTree(INVL, 0, CurrIndx);
if (Value *V = alreadyVectorized(E->Scalars, VL0))
return V;
@@ -2570,23 +2950,23 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
RHSV.push_back(cast<Instruction>(V)->getOperand(1));
}
- setInsertPointAfterBundle(E->Scalars);
+ setInsertPointAfterBundle(E->Scalars, VL0);
- Value *L = vectorizeTree(LHSV);
- Value *R = vectorizeTree(RHSV);
+ Value *L = vectorizeTree(LHSV, 0, CurrIndx);
+ Value *R = vectorizeTree(RHSV, 1, CurrIndx);
if (Value *V = alreadyVectorized(E->Scalars, VL0))
return V;
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
Value *V;
- if (Opcode == Instruction::FCmp)
+ if (S.Opcode == Instruction::FCmp)
V = Builder.CreateFCmp(P0, L, R);
else
V = Builder.CreateICmp(P0, L, R);
E->VectorizedValue = V;
- propagateIRFlags(E->VectorizedValue, E->Scalars);
+ propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
++NumVectorInstructions;
return V;
}
@@ -2598,11 +2978,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
FalseVec.push_back(cast<Instruction>(V)->getOperand(2));
}
- setInsertPointAfterBundle(E->Scalars);
+ setInsertPointAfterBundle(E->Scalars, VL0);
- Value *Cond = vectorizeTree(CondVec);
- Value *True = vectorizeTree(TrueVec);
- Value *False = vectorizeTree(FalseVec);
+ Value *Cond = vectorizeTree(CondVec, 0, CurrIndx);
+ Value *True = vectorizeTree(TrueVec, 1, CurrIndx);
+ Value *False = vectorizeTree(FalseVec, 2, CurrIndx);
if (Value *V = alreadyVectorized(E->Scalars, VL0))
return V;
@@ -2632,25 +3012,27 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::Xor: {
ValueList LHSVL, RHSVL;
if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
- reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
+ reorderInputsAccordingToOpcode(S.Opcode, E->Scalars, LHSVL,
+ RHSVL);
else
for (Value *V : E->Scalars) {
- LHSVL.push_back(cast<Instruction>(V)->getOperand(0));
- RHSVL.push_back(cast<Instruction>(V)->getOperand(1));
+ auto *I = cast<Instruction>(V);
+ LHSVL.push_back(I->getOperand(0));
+ RHSVL.push_back(I->getOperand(1));
}
- setInsertPointAfterBundle(E->Scalars);
+ setInsertPointAfterBundle(E->Scalars, VL0);
- Value *LHS = vectorizeTree(LHSVL);
- Value *RHS = vectorizeTree(RHSVL);
+ Value *LHS = vectorizeTree(LHSVL, 0, CurrIndx);
+ Value *RHS = vectorizeTree(RHSVL, 1, CurrIndx);
if (Value *V = alreadyVectorized(E->Scalars, VL0))
return V;
- BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
- Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
+ Value *V = Builder.CreateBinOp(
+ static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS);
E->VectorizedValue = V;
- propagateIRFlags(E->VectorizedValue, E->Scalars);
+ propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
++NumVectorInstructions;
if (Instruction *I = dyn_cast<Instruction>(V))
@@ -2661,9 +3043,22 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::Load: {
// Loads are inserted at the head of the tree because we don't want to
// sink them all the way down past store instructions.
- setInsertPointAfterBundle(E->Scalars);
+ setInsertPointAfterBundle(E->Scalars, VL0);
+
+ if (UserIndx != -1)
+ UserTreeEntry = &VectorizableTree[UserIndx];
+
+ bool isJumbled = false;
+ LoadInst *LI = nullptr;
+ if (UserTreeEntry &&
+ (unsigned)OpdNum < UserTreeEntry->ShuffleMask.size() &&
+ !UserTreeEntry->ShuffleMask[OpdNum].empty()) {
+ isJumbled = true;
+ LI = cast<LoadInst>(E->Scalars[0]);
+ } else {
+ LI = cast<LoadInst>(VL0);
+ }
- LoadInst *LI = cast<LoadInst>(VL0);
Type *ScalarLoadTy = LI->getType();
unsigned AS = LI->getPointerAddressSpace();
@@ -2685,47 +3080,60 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
LI->setAlignment(Alignment);
E->VectorizedValue = LI;
++NumVectorInstructions;
- return propagateMetadata(LI, E->Scalars);
+ propagateMetadata(LI, E->Scalars);
+
+ if (isJumbled) {
+ SmallVector<Constant *, 8> Mask;
+ for (unsigned LaneEntry : UserTreeEntry->ShuffleMask[OpdNum])
+ Mask.push_back(Builder.getInt32(LaneEntry));
+ // Generate shuffle for jumbled memory access
+ Value *Undef = UndefValue::get(VecTy);
+ Value *Shuf = Builder.CreateShuffleVector((Value *)LI, Undef,
+ ConstantVector::get(Mask));
+ E->VectorizedValue = Shuf;
+ ++NumVectorInstructions;
+ return Shuf;
+ }
+ return LI;
}
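// Editor's sketch (illustrative, not part of the patch): if the user entry
// recorded ShuffleMask[OpdNum] == {2, 0, 3, 1}, the code above emits the wide
// load followed by
//   Shuf = shufflevector LI, undef, <2, 0, 3, 1>
// so the lanes reach the user bundle in the order its scalars expect.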
case Instruction::Store: {
StoreInst *SI = cast<StoreInst>(VL0);
unsigned Alignment = SI->getAlignment();
unsigned AS = SI->getPointerAddressSpace();
- ValueList ValueOp;
+ ValueList ScalarStoreValues;
for (Value *V : E->Scalars)
- ValueOp.push_back(cast<StoreInst>(V)->getValueOperand());
+ ScalarStoreValues.push_back(cast<StoreInst>(V)->getValueOperand());
- setInsertPointAfterBundle(E->Scalars);
+ setInsertPointAfterBundle(E->Scalars, VL0);
- Value *VecValue = vectorizeTree(ValueOp);
- Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
- VecTy->getPointerTo(AS));
+ Value *VecValue = vectorizeTree(ScalarStoreValues, 0, CurrIndx);
+ Value *ScalarPtr = SI->getPointerOperand();
+ Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS));
StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
- // The pointer operand uses an in-tree scalar so we add the new BitCast to
- // ExternalUses list to make sure that an extract will be generated in the
+ // The pointer operand uses an in-tree scalar, so add the new BitCast to
+ // ExternalUses to make sure that an extract will be generated in the
// future.
- Value *PO = SI->getPointerOperand();
- if (getTreeEntry(PO))
- ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));
+ if (getTreeEntry(ScalarPtr))
+ ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0));
- if (!Alignment) {
+ if (!Alignment)
Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
- }
+
S->setAlignment(Alignment);
E->VectorizedValue = S;
++NumVectorInstructions;
return propagateMetadata(S, E->Scalars);
}
case Instruction::GetElementPtr: {
- setInsertPointAfterBundle(E->Scalars);
+ setInsertPointAfterBundle(E->Scalars, VL0);
ValueList Op0VL;
for (Value *V : E->Scalars)
Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0));
- Value *Op0 = vectorizeTree(Op0VL);
+ Value *Op0 = vectorizeTree(Op0VL, 0, CurrIndx);
std::vector<Value *> OpVecs;
for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
@@ -2734,7 +3142,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
for (Value *V : E->Scalars)
OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j));
- Value *OpVec = vectorizeTree(OpVL);
+ Value *OpVec = vectorizeTree(OpVL, j, CurrIndx);
OpVecs.push_back(OpVec);
}
@@ -2750,7 +3158,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
case Instruction::Call: {
CallInst *CI = cast<CallInst>(VL0);
- setInsertPointAfterBundle(E->Scalars);
+ setInsertPointAfterBundle(E->Scalars, VL0);
Function *FI;
Intrinsic::ID IID = Intrinsic::not_intrinsic;
Value *ScalarArg = nullptr;
@@ -2763,7 +3171,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
// ctlz, cttz and powi are special intrinsics whose second argument is
// a scalar. This argument should not be vectorized.
if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
- CallInst *CEI = cast<CallInst>(E->Scalars[0]);
+ CallInst *CEI = cast<CallInst>(VL0);
ScalarArg = CEI->getArgOperand(j);
OpVecs.push_back(CEI->getArgOperand(j));
continue;
@@ -2773,7 +3181,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
OpVL.push_back(CEI->getArgOperand(j));
}
- Value *OpVec = vectorizeTree(OpVL);
+ Value *OpVec = vectorizeTree(OpVL, j, CurrIndx);
DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
OpVecs.push_back(OpVec);
}
@@ -2793,30 +3201,31 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
E->VectorizedValue = V;
- propagateIRFlags(E->VectorizedValue, E->Scalars);
+ propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
++NumVectorInstructions;
return V;
}
case Instruction::ShuffleVector: {
ValueList LHSVL, RHSVL;
- assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand");
- reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL);
- setInsertPointAfterBundle(E->Scalars);
+ assert(Instruction::isBinaryOp(S.Opcode) &&
+ "Invalid Shuffle Vector Operand");
+ reorderAltShuffleOperands(S.Opcode, E->Scalars, LHSVL, RHSVL);
+ setInsertPointAfterBundle(E->Scalars, VL0);
- Value *LHS = vectorizeTree(LHSVL);
- Value *RHS = vectorizeTree(RHSVL);
+ Value *LHS = vectorizeTree(LHSVL, 0, CurrIndx);
+ Value *RHS = vectorizeTree(RHSVL, 1, CurrIndx);
if (Value *V = alreadyVectorized(E->Scalars, VL0))
return V;
// Create a vector of LHS op1 RHS
- BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
- Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);
+ Value *V0 = Builder.CreateBinOp(
+ static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS);
+ unsigned AltOpcode = getAltOpcode(S.Opcode);
// Create a vector of LHS op2 RHS
- Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
- BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
- Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
+ Value *V1 = Builder.CreateBinOp(
+ static_cast<Instruction::BinaryOps>(AltOpcode), LHS, RHS);
// Create shuffle to take alternate operations from the vector.
// Also, gather up odd and even scalar ops to propagate IR flags to
@@ -2859,7 +3268,6 @@ Value *BoUpSLP::vectorizeTree() {
Value *
BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
-
// All blocks must be scheduled before any instructions are inserted.
for (auto &BSIter : BlocksSchedules) {
scheduleBlock(BSIter.second.get());
@@ -2905,9 +3313,14 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
continue;
TreeEntry *E = getTreeEntry(Scalar);
assert(E && "Invalid scalar");
- assert(!E->NeedToGather && "Extracting from a gather list");
+ assert((!E->NeedToGather) && "Extracting from a gather list");
- Value *Vec = E->VectorizedValue;
+ Value *Vec = dyn_cast<ShuffleVectorInst>(E->VectorizedValue);
+ if (Vec && isa<LoadInst>(cast<Instruction>(Vec)->getOperand(0))) {
+ Vec = cast<Instruction>(E->VectorizedValue)->getOperand(0);
+ } else {
+ Vec = E->VectorizedValue;
+ }
assert(Vec && "Can't find vectorizable value");
Value *Lane = Builder.getInt32(ExternalUse.Lane);
@@ -2975,14 +3388,15 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
for (TreeEntry &EIdx : VectorizableTree) {
TreeEntry *Entry = &EIdx;
+ // No need to handle users of gathered values.
+ if (Entry->NeedToGather)
+ continue;
+
+ assert(Entry->VectorizedValue && "Can't find vectorizable value");
+
// For each lane:
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
Value *Scalar = Entry->Scalars[Lane];
- // No need to handle users of gathered values.
- if (Entry->NeedToGather)
- continue;
-
- assert(Entry->VectorizedValue && "Can't find vectorizable value");
Type *Ty = Scalar->getType();
if (!Ty->isVoidTy()) {
@@ -2990,9 +3404,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
for (User *U : Scalar->users()) {
DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
- assert((getTreeEntry(U) ||
- // It is legal to replace users in the ignorelist by undef.
- is_contained(UserIgnoreList, U)) &&
+ // It is legal to replace users in the ignorelist by undef.
+ assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&
"Replacing out-of-tree value with undef");
}
#endif
@@ -3009,7 +3422,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
return VectorizableTree[0].VectorizedValue;
}
-void BoUpSLP::optimizeGatherSequence() {
+void BoUpSLP::optimizeGatherSequence(Function &F) {
DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
<< " gather sequences instructions.\n");
// LICM InsertElementInst sequences.
@@ -3043,30 +3456,16 @@ void BoUpSLP::optimizeGatherSequence() {
Insert->moveBefore(PreHeader->getTerminator());
}
- // Make a list of all reachable blocks in our CSE queue.
- SmallVector<const DomTreeNode *, 8> CSEWorkList;
- CSEWorkList.reserve(CSEBlocks.size());
- for (BasicBlock *BB : CSEBlocks)
- if (DomTreeNode *N = DT->getNode(BB)) {
- assert(DT->isReachableFromEntry(N));
- CSEWorkList.push_back(N);
- }
-
- // Sort blocks by domination. This ensures we visit a block after all blocks
- // dominating it are visited.
- std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
- [this](const DomTreeNode *A, const DomTreeNode *B) {
- return DT->properlyDominates(A, B);
- });
-
// Perform O(N^2) search over the gather sequences and merge identical
// instructions. TODO: We can further optimize this scan if we split the
// instructions into different buckets based on the insert lane.
SmallVector<Instruction *, 16> Visited;
- for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
- assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
- "Worklist not sorted properly!");
- BasicBlock *BB = (*I)->getBlock();
+ ReversePostOrderTraversal<Function *> RPOT(&F);
+ for (auto BB : RPOT) {
+ // Only visit blocks that contain gather sequences, in reverse post-order.
+ if (!CSEBlocks.count(BB))
+ continue;
+
// For all instructions in blocks containing gather sequences:
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
Instruction *In = &*it++;
@@ -3111,7 +3510,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
// Make sure that the scheduling region contains all
// instructions of the bundle.
for (Value *V : VL) {
- if (!extendSchedulingRegion(V))
+ if (!extendSchedulingRegion(V, OpValue))
return false;
}
@@ -3148,8 +3547,9 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
// It is seldom that this needs to be done a second time after adding the
// initial bundle to the region.
for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
- ScheduleData *SD = getScheduleData(I);
- SD->clearDependencies();
+ doForAllOpcodes(I, [](ScheduleData *SD) {
+ SD->clearDependencies();
+ });
}
ReSchedule = true;
}
@@ -3210,17 +3610,43 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
}
}
-bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
- if (getScheduleData(V))
+BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
+ // Allocate a new ScheduleData for the instruction.
+ if (ChunkPos >= ChunkSize) {
+ ScheduleDataChunks.push_back(llvm::make_unique<ScheduleData[]>(ChunkSize));
+ ChunkPos = 0;
+ }
+ return &(ScheduleDataChunks.back()[ChunkPos++]);
+}
+
+bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
+ Value *OpValue) {
+ if (getScheduleData(V, isOneOf(OpValue, V)))
return true;
Instruction *I = dyn_cast<Instruction>(V);
assert(I && "bundle member must be an instruction");
assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
+ auto &&CheckSheduleForI = [this, OpValue](Instruction *I) -> bool {
+ ScheduleData *ISD = getScheduleData(I);
+ if (!ISD)
+ return false;
+ assert(isInSchedulingRegion(ISD) &&
+ "ScheduleData not in scheduling region");
+ ScheduleData *SD = allocateScheduleDataChunks();
+ SD->Inst = I;
+ SD->init(SchedulingRegionID, OpValue);
+ ExtraScheduleDataMap[I][OpValue] = SD;
+ return true;
+ };
+ if (CheckSheduleForI(I))
+ return true;
if (!ScheduleStart) {
// It's the first instruction in the new region.
initScheduleData(I, I->getNextNode(), nullptr, nullptr);
ScheduleStart = I;
ScheduleEnd = I->getNextNode();
+ if (isOneOf(OpValue, I) != I)
+ CheckSheduleForI(I);
assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
return true;
@@ -3232,7 +3658,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
BasicBlock::reverse_iterator UpperEnd = BB->rend();
BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
BasicBlock::iterator LowerEnd = BB->end();
- for (;;) {
+ while (true) {
if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
return false;
@@ -3242,6 +3668,8 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
if (&*UpIter == I) {
initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
ScheduleStart = I;
+ if (isOneOf(OpValue, I) != I)
+ CheckSheduleForI(I);
DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n");
return true;
}
@@ -3252,6 +3680,8 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
nullptr);
ScheduleEnd = I->getNextNode();
+ if (isOneOf(OpValue, I) != I)
+ CheckSheduleForI(I);
assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
return true;
@@ -3272,21 +3702,17 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
ScheduleData *SD = ScheduleDataMap[I];
if (!SD) {
- // Allocate a new ScheduleData for the instruction.
- if (ChunkPos >= ChunkSize) {
- ScheduleDataChunks.push_back(
- llvm::make_unique<ScheduleData[]>(ChunkSize));
- ChunkPos = 0;
- }
- SD = &(ScheduleDataChunks.back()[ChunkPos++]);
+ SD = allocateScheduleDataChunks();
ScheduleDataMap[I] = SD;
SD->Inst = I;
}
assert(!isInSchedulingRegion(SD) &&
"new ScheduleData already in scheduling region");
- SD->init(SchedulingRegionID);
+ SD->init(SchedulingRegionID, I);
- if (I->mayReadOrWriteMemory()) {
+ if (I->mayReadOrWriteMemory() &&
+ (!isa<IntrinsicInst>(I) ||
+ cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect)) {
// Update the linked list of memory accessing instructions.
if (CurrentLoadStore) {
CurrentLoadStore->NextLoadStore = SD;
@@ -3326,23 +3752,35 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
BundleMember->resetUnscheduledDeps();
// Handle def-use chain dependencies.
- for (User *U : BundleMember->Inst->users()) {
- if (isa<Instruction>(U)) {
- ScheduleData *UseSD = getScheduleData(U);
- if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+ if (BundleMember->OpValue != BundleMember->Inst) {
+ ScheduleData *UseSD = getScheduleData(BundleMember->Inst);
+ if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+ BundleMember->Dependencies++;
+ ScheduleData *DestBundle = UseSD->FirstInBundle;
+ if (!DestBundle->IsScheduled)
+ BundleMember->incrementUnscheduledDeps(1);
+ if (!DestBundle->hasValidDependencies())
+ WorkList.push_back(DestBundle);
+ }
+ } else {
+ for (User *U : BundleMember->Inst->users()) {
+ if (isa<Instruction>(U)) {
+ ScheduleData *UseSD = getScheduleData(U);
+ if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+ BundleMember->Dependencies++;
+ ScheduleData *DestBundle = UseSD->FirstInBundle;
+ if (!DestBundle->IsScheduled)
+ BundleMember->incrementUnscheduledDeps(1);
+ if (!DestBundle->hasValidDependencies())
+ WorkList.push_back(DestBundle);
+ }
+ } else {
+ // It is unclear whether this can ever happen, so be safe: leave the
+ // instruction/bundle permanently unscheduled, which eventually disables
+ // vectorization.
BundleMember->Dependencies++;
- ScheduleData *DestBundle = UseSD->FirstInBundle;
- if (!DestBundle->IsScheduled)
- BundleMember->incrementUnscheduledDeps(1);
- if (!DestBundle->hasValidDependencies())
- WorkList.push_back(DestBundle);
+ BundleMember->incrementUnscheduledDeps(1);
}
- } else {
- // I'm not sure if this can ever happen. But we need to be safe.
- // This lets the instruction/bundle never be scheduled and
- // eventually disable vectorization.
- BundleMember->Dependencies++;
- BundleMember->incrementUnscheduledDeps(1);
}
}
@@ -3419,16 +3857,17 @@ void BoUpSLP::BlockScheduling::resetSchedule() {
assert(ScheduleStart &&
"tried to reset schedule on block which has not been scheduled");
for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
- ScheduleData *SD = getScheduleData(I);
- assert(isInSchedulingRegion(SD));
- SD->IsScheduled = false;
- SD->resetUnscheduledDeps();
+ doForAllOpcodes(I, [&](ScheduleData *SD) {
+ assert(isInSchedulingRegion(SD) &&
+ "ScheduleData not in scheduling region");
+ SD->IsScheduled = false;
+ SD->resetUnscheduledDeps();
+ });
}
ReadyInsts.clear();
}
void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
-
if (!BS->ScheduleStart)
return;
@@ -3452,15 +3891,16 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
int NumToSchedule = 0;
for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
I = I->getNextNode()) {
- ScheduleData *SD = BS->getScheduleData(I);
- assert(
- SD->isPartOfBundle() == (getTreeEntry(SD->Inst) != nullptr) &&
- "scheduler and vectorizer have different opinion on what is a bundle");
- SD->FirstInBundle->SchedulingPriority = Idx++;
- if (SD->isSchedulingEntity()) {
- BS->calculateDependencies(SD, false, this);
- NumToSchedule++;
- }
+ BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
+ assert(SD->isPartOfBundle() ==
+ (getTreeEntry(SD->Inst) != nullptr) &&
+ "scheduler and vectorizer bundle mismatch");
+ SD->FirstInBundle->SchedulingPriority = Idx++;
+ if (SD->isSchedulingEntity()) {
+ BS->calculateDependencies(SD, false, this);
+ NumToSchedule++;
+ }
+ });
}
BS->initialFillReadyList(ReadyInsts);
@@ -3559,7 +3999,6 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) {
static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
SmallVectorImpl<Value *> &ToDemote,
SmallVectorImpl<Value *> &Roots) {
-
// We can always demote constants.
if (isa<Constant>(V)) {
ToDemote.push_back(V);
@@ -3702,7 +4141,7 @@ void BoUpSLP::computeMinimumValueSizes() {
// Determine if the sign bit of all the roots is known to be zero. If not,
// IsKnownPositive is set to False.
- IsKnownPositive = all_of(TreeRoot, [&](Value *R) {
+ IsKnownPositive = llvm::all_of(TreeRoot, [&](Value *R) {
KnownBits Known = computeKnownBits(R, *DL);
return Known.isNonNegative();
});
@@ -3710,7 +4149,7 @@ void BoUpSLP::computeMinimumValueSizes() {
// Determine the maximum number of bits required to store the scalar
// values.
for (auto *Scalar : ToDemote) {
- auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, 0, DT);
+ auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, nullptr, DT);
auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
}
@@ -3755,6 +4194,7 @@ void BoUpSLP::computeMinimumValueSizes() {
}
namespace {
+
/// The SLPVectorizer Pass.
struct SLPVectorizer : public FunctionPass {
SLPVectorizerPass Impl;
@@ -3766,7 +4206,6 @@ struct SLPVectorizer : public FunctionPass {
initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
}
-
bool doInitialization(Module &M) override {
return false;
}
@@ -3806,6 +4245,7 @@ struct SLPVectorizer : public FunctionPass {
AU.setPreservesCFG();
}
};
+
} // end anonymous namespace
PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
@@ -3893,7 +4333,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
}
if (Changed) {
- R.optimizeGatherSequence();
+ R.optimizeGatherSequence(F);
DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
DEBUG(verifyFunction(F));
}
@@ -3952,7 +4392,9 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
if (Cost < -SLPCostThreshold) {
DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
+
using namespace ore;
+
R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
cast<StoreInst>(Chain[i]))
<< "Stores SLP vectorized with cost " << NV("Cost", Cost)
@@ -3972,7 +4414,8 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
BoUpSLP &R) {
- SetVector<StoreInst *> Heads, Tails;
+ SetVector<StoreInst *> Heads;
+ SmallDenseSet<StoreInst *> Tails;
SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
// We may run into multiple chains that merge into a single chain. We mark the
@@ -3980,45 +4423,51 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
BoUpSLP::ValueSet VectorizedStores;
bool Changed = false;
- // Do a quadratic search on all of the given stores and find
+ // Do a quadratic search on all of the given stores in reverse order and find
// all of the pairs of stores that follow each other.
SmallVector<unsigned, 16> IndexQueue;
- for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
- IndexQueue.clear();
+ unsigned E = Stores.size();
+ IndexQueue.resize(E - 1);
+ for (unsigned I = E; I > 0; --I) {
+ unsigned Idx = I - 1;
// If a store has multiple consecutive store candidates, search Stores
- // array according to the sequence: from i+1 to e, then from i-1 to 0.
+ // array according to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ...
// This is because pairing with the immediately succeeding or preceding
// candidate usually creates the best chance to find an SLP vectorization
// opportunity.
- unsigned j = 0;
- for (j = i + 1; j < e; ++j)
- IndexQueue.push_back(j);
- for (j = i; j > 0; --j)
- IndexQueue.push_back(j - 1);
-
- for (auto &k : IndexQueue) {
- if (isConsecutiveAccess(Stores[i], Stores[k], *DL, *SE)) {
- Tails.insert(Stores[k]);
- Heads.insert(Stores[i]);
- ConsecutiveChain[Stores[i]] = Stores[k];
+ unsigned Offset = 1;
+ unsigned Cnt = 0;
+ for (unsigned J = 0; J < E - 1; ++J, ++Offset) {
+ if (Idx >= Offset) {
+ IndexQueue[Cnt] = Idx - Offset;
+ ++Cnt;
+ }
+ if (Idx + Offset < E) {
+ IndexQueue[Cnt] = Idx + Offset;
+ ++Cnt;
+ }
+ }
+
+ for (auto K : IndexQueue) {
+ if (isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE)) {
+ Tails.insert(Stores[Idx]);
+ Heads.insert(Stores[K]);
+ ConsecutiveChain[Stores[K]] = Stores[Idx];
break;
}
}
}
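A hypothetical trace of the candidate ordering above (editor's illustration): with E == 5 stores and Idx == 2, the inner loop fills IndexQueue as follows:

//   Offset 1 -> IndexQueue[0] = 1, IndexQueue[1] = 3
//   Offset 2 -> IndexQueue[2] = 0, IndexQueue[3] = 4
//   Offsets 3 and 4 are out of range on both sides, so nothing more is added.
// The candidates are therefore tried in the order 1, 3, 0, 4, i.e. nearest
// neighbours first.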
// For stores that start but don't end a link in the chain:
- for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
- it != e; ++it) {
- if (Tails.count(*it))
+ for (auto *SI : llvm::reverse(Heads)) {
+ if (Tails.count(SI))
continue;
// We found a store instr that starts a chain. Now follow the chain and try
// to vectorize it.
BoUpSLP::ValueList Operands;
- StoreInst *I = *it;
+ StoreInst *I = SI;
// Collect the chain into a list.
- while (Tails.count(I) || Heads.count(I)) {
- if (VectorizedStores.count(I))
- break;
+ while ((Tails.count(I) || Heads.count(I)) && !VectorizedStores.count(I)) {
Operands.push_back(I);
// Move to the next value in the chain.
I = ConsecutiveChain[I];
@@ -4041,7 +4490,6 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
}
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
-
// Initialize the collections. We will make a single pass over the block.
Stores.clear();
GEPs.clear();
@@ -4050,7 +4498,6 @@ void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
// Stores and GEPs according to the underlying objects of their pointer
// operands.
for (Instruction &I : *BB) {
-
// Ignore store instructions that are volatile or have a pointer operand
// that doesn't point to a scalar type.
if (auto *SI = dyn_cast<StoreInst>(&I)) {
@@ -4086,7 +4533,8 @@ bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
ArrayRef<Value *> BuildVector,
- bool AllowReorder) {
+ bool AllowReorder,
+ bool NeedExtraction) {
if (VL.size() < 2)
return false;
@@ -4103,19 +4551,51 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
unsigned Sz = R.getVectorElementSize(I0);
unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
- if (MaxVF < 2)
- return false;
+ if (MaxVF < 2) {
+ R.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(
+ SV_NAME, "SmallVF", I0)
+ << "Cannot SLP vectorize list: vectorization factor "
+ << "less than 2 is not supported";
+ });
+ return false;
+ }
for (Value *V : VL) {
Type *Ty = V->getType();
- if (!isValidElementType(Ty))
+ if (!isValidElementType(Ty)) {
+ // NOTE: the following gives the user the internal LLVM type name, which
+ // may not be useful.
+ R.getORE()->emit([&]() {
+ std::string type_str;
+ llvm::raw_string_ostream rso(type_str);
+ Ty->print(rso);
+ return OptimizationRemarkMissed(
+ SV_NAME, "UnsupportedType", I0)
+ << "Cannot SLP vectorize list: type "
+ << rso.str() + " is unsupported by vectorizer";
+ });
return false;
+ }
Instruction *Inst = dyn_cast<Instruction>(V);
- if (!Inst || Inst->getOpcode() != Opcode0)
+
+ if (!Inst)
+ return false;
+ if (Inst->getOpcode() != Opcode0) {
+ R.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(
+ SV_NAME, "InequableTypes", I0)
+ << "Cannot SLP vectorize list: not all of the "
+ << "parts of scalar instructions are of the same type: "
+ << ore::NV("Instruction1Opcode", I0) << " and "
+ << ore::NV("Instruction2Opcode", Inst);
+ });
return false;
+ }
}
bool Changed = false;
+ bool CandidateFound = false;
+ int MinCost = SLPCostThreshold;
// Keep track of values that were deleted by vectorizing in the loop below.
SmallVector<WeakTrackingVH, 8> TrackValues(VL.begin(), VL.end());
@@ -4148,11 +4628,12 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
<< "\n");
ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
+ ArrayRef<Value *> EmptyArray;
ArrayRef<Value *> BuildVectorSlice;
if (!BuildVector.empty())
BuildVectorSlice = BuildVector.slice(I, OpsWidth);
- R.buildTree(Ops, BuildVectorSlice);
+ R.buildTree(Ops, NeedExtraction ? EmptyArray : BuildVectorSlice);
// TODO: check if we can allow reordering for more cases.
if (AllowReorder && R.shouldReorder()) {
// Conceptually, there is nothing actually preventing us from trying to
@@ -4169,14 +4650,16 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
R.computeMinimumValueSizes();
int Cost = R.getTreeCost();
+ CandidateFound = true;
+ MinCost = std::min(MinCost, Cost);
if (Cost < -SLPCostThreshold) {
DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
- cast<Instruction>(Ops[0]))
- << "SLP vectorized with cost " << ore::NV("Cost", Cost)
- << " and with tree size "
- << ore::NV("TreeSize", R.getTreeSize()));
+ cast<Instruction>(Ops[0]))
+ << "SLP vectorized with cost " << ore::NV("Cost", Cost)
+ << " and with tree size "
+ << ore::NV("TreeSize", R.getTreeSize()));
Value *VectorizedRoot = R.vectorizeTree();
@@ -4199,8 +4682,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
cast<Instruction>(Builder.CreateExtractElement(
VectorizedRoot, Builder.getInt32(VecIdx++)));
I->setOperand(1, Extract);
- I->removeFromParent();
- I->insertAfter(Extract);
+ I->moveAfter(Extract);
InsertAfter = I;
}
}
@@ -4212,18 +4694,37 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
}
}
+ if (!Changed && CandidateFound) {
+ R.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(
+ SV_NAME, "NotBeneficial", I0)
+ << "List vectorization was possible but not beneficial with cost "
+ << ore::NV("Cost", MinCost) << " >= "
+ << ore::NV("Treshold", -SLPCostThreshold);
+ });
+ } else if (!Changed) {
+ R.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(
+ SV_NAME, "NotPossible", I0)
+ << "Cannot SLP vectorize list: vectorization was impossible"
+ << " with available vectorization factors";
+ });
+ }
return Changed;
}
-bool SLPVectorizerPass::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
- if (!V)
+bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
+ if (!I)
return false;
- Value *P = V->getParent();
+ if (!isa<BinaryOperator>(I) && !isa<CmpInst>(I))
+ return false;
+
+ Value *P = I->getParent();
// Vectorize in current basic block only.
- auto *Op0 = dyn_cast<Instruction>(V->getOperand(0));
- auto *Op1 = dyn_cast<Instruction>(V->getOperand(1));
+ auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
+ auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
return false;
@@ -4286,6 +4787,7 @@ static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
}
namespace {
+
/// Model horizontal reductions.
///
/// A horizontal reduction is a tree of reduction operations (currently add and
@@ -4314,17 +4816,375 @@ namespace {
/// *p =
///
class HorizontalReduction {
- SmallVector<Value *, 16> ReductionOps;
+ using ReductionOpsType = SmallVector<Value *, 16>;
+ using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
+ ReductionOpsListType ReductionOps;
SmallVector<Value *, 32> ReducedVals;
// Use map vector to make stable output.
MapVector<Instruction *, Value *> ExtraArgs;
- BinaryOperator *ReductionRoot = nullptr;
+ /// Kind of the reduction data.
+ enum ReductionKind {
+ RK_None, /// Not a reduction.
+ RK_Arithmetic, /// Binary reduction data.
+ RK_Min, /// Minimum reduction data.
+ RK_UMin, /// Unsigned minimum reduction data.
+ RK_Max, /// Maximum reduction data.
+ RK_UMax, /// Unsigned maximum reduction data.
+ };
+
+ /// Contains info about operation, like its opcode, left and right operands.
+ class OperationData {
+ /// Opcode of the instruction.
+ unsigned Opcode = 0;
+
+ /// Left operand of the reduction operation.
+ Value *LHS = nullptr;
+
+ /// Right operand of the reduction operation.
+ Value *RHS = nullptr;
+
+ /// Kind of the reduction operation.
+ ReductionKind Kind = RK_None;
+
+ /// True if a floating-point min/max reduction has no NaNs.
+ bool NoNaN = false;
+
+ /// Checks if the reduction operation can be vectorized.
+ bool isVectorizable() const {
+ return LHS && RHS &&
+ // We currently only support adds and min/max reductions.
+ ((Kind == RK_Arithmetic &&
+ (Opcode == Instruction::Add || Opcode == Instruction::FAdd)) ||
+ ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
+ (Kind == RK_Min || Kind == RK_Max)) ||
+ (Opcode == Instruction::ICmp &&
+ (Kind == RK_UMin || Kind == RK_UMax)));
+ }
+
+ /// Creates a reduction operation with the current opcode.
+ Value *createOp(IRBuilder<> &Builder, const Twine &Name) const {
+ assert(isVectorizable() &&
+ "Expected add|fadd or min/max reduction operation.");
+ Value *Cmp;
+ switch (Kind) {
+ case RK_Arithmetic:
+ return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, LHS, RHS,
+ Name);
+ case RK_Min:
+ Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSLT(LHS, RHS)
+ : Builder.CreateFCmpOLT(LHS, RHS);
+ break;
+ case RK_Max:
+ Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSGT(LHS, RHS)
+ : Builder.CreateFCmpOGT(LHS, RHS);
+ break;
+ case RK_UMin:
+ assert(Opcode == Instruction::ICmp && "Expected integer types.");
+ Cmp = Builder.CreateICmpULT(LHS, RHS);
+ break;
+ case RK_UMax:
+ assert(Opcode == Instruction::ICmp && "Expected integer types.");
+ Cmp = Builder.CreateICmpUGT(LHS, RHS);
+ break;
+ case RK_None:
+ llvm_unreachable("Unknown reduction operation.");
+ }
+ return Builder.CreateSelect(Cmp, LHS, RHS, Name);
+ }
+
+ public:
+ explicit OperationData() = default;
+
+ /// Constructor for reduced values. They are identified by opcode only and
+ /// don't have associated LHS/RHS values.
+ explicit OperationData(Value *V) {
+ if (auto *I = dyn_cast<Instruction>(V))
+ Opcode = I->getOpcode();
+ }
+
+ /// Constructor for reduction operations with opcode and its left and
+ /// right operands.
+ OperationData(unsigned Opcode, Value *LHS, Value *RHS, ReductionKind Kind,
+ bool NoNaN = false)
+ : Opcode(Opcode), LHS(LHS), RHS(RHS), Kind(Kind), NoNaN(NoNaN) {
+ assert(Kind != RK_None && "One of the reduction operations is expected.");
+ }
+
+ explicit operator bool() const { return Opcode; }
+
+ /// Get the index of the first operand.
+ unsigned getFirstOperandIndex() const {
+ assert(!!*this && "The opcode is not set.");
+ switch (Kind) {
+ case RK_Min:
+ case RK_UMin:
+ case RK_Max:
+ case RK_UMax:
+ return 1;
+ case RK_Arithmetic:
+ case RK_None:
+ break;
+ }
+ return 0;
+ }
+
+ /// Total number of operands in the reduction operation.
+ unsigned getNumberOfOperands() const {
+ assert(Kind != RK_None && !!*this && LHS && RHS &&
+ "Expected reduction operation.");
+ switch (Kind) {
+ case RK_Arithmetic:
+ return 2;
+ case RK_Min:
+ case RK_UMin:
+ case RK_Max:
+ case RK_UMax:
+ return 3;
+ case RK_None:
+ break;
+ }
+ llvm_unreachable("Reduction kind is not set");
+ }
+
+ /// Checks if the operation has the same parent as \p P.
+ bool hasSameParent(Instruction *I, Value *P, bool IsRedOp) const {
+ assert(Kind != RK_None && !!*this && LHS && RHS &&
+ "Expected reduction operation.");
+ if (!IsRedOp)
+ return I->getParent() == P;
+ switch (Kind) {
+ case RK_Arithmetic:
+ // Arithmetic reduction operation must be used once only.
+ return I->getParent() == P;
+ case RK_Min:
+ case RK_UMin:
+ case RK_Max:
+ case RK_UMax: {
+ // The SelectInst must be used twice, while the condition op must have a
+ // single use only.
+ auto *Cmp = cast<Instruction>(cast<SelectInst>(I)->getCondition());
+ return I->getParent() == P && Cmp && Cmp->getParent() == P;
+ }
+ case RK_None:
+ break;
+ }
+ llvm_unreachable("Reduction kind is not set");
+ }
+ /// Checks that \p I has the required number of uses for a reduction
+ /// operation or a reduced value.
+ bool hasRequiredNumberOfUses(Instruction *I, bool IsReductionOp) const {
+ assert(Kind != RK_None && !!*this && LHS && RHS &&
+ "Expected reduction operation.");
+ switch (Kind) {
+ case RK_Arithmetic:
+ return I->hasOneUse();
+ case RK_Min:
+ case RK_UMin:
+ case RK_Max:
+ case RK_UMax:
+ return I->hasNUses(2) &&
+ (!IsReductionOp ||
+ cast<SelectInst>(I)->getCondition()->hasOneUse());
+ case RK_None:
+ break;
+ }
+ llvm_unreachable("Reduction kind is not set");
+ }
+
+ /// Initializes the list of reduction operations.
+ void initReductionOps(ReductionOpsListType &ReductionOps) {
+ assert(Kind != RK_None && !!*this && LHS && RHS &&
+ "Expected reduction operation.");
+ switch (Kind) {
+ case RK_Arithmetic:
+ ReductionOps.assign(1, ReductionOpsType());
+ break;
+ case RK_Min:
+ case RK_UMin:
+ case RK_Max:
+ case RK_UMax:
+ ReductionOps.assign(2, ReductionOpsType());
+ break;
+ case RK_None:
+ llvm_unreachable("Reduction kind is not set");
+ }
+ }
+ /// Add all reduction operations for the reduction instruction \p I.
+ void addReductionOps(Instruction *I, ReductionOpsListType &ReductionOps) {
+ assert(Kind != RK_None && !!*this && LHS && RHS &&
+ "Expected reduction operation.");
+ switch (Kind) {
+ case RK_Arithmetic:
+ ReductionOps[0].emplace_back(I);
+ break;
+ case RK_Min:
+ case RK_UMin:
+ case RK_Max:
+ case RK_UMax:
+ ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
+ ReductionOps[1].emplace_back(I);
+ break;
+ case RK_None:
+ llvm_unreachable("Reduction kind is not set");
+ }
+ }
+
+ /// Checks if instruction is associative and can be vectorized.
+ bool isAssociative(Instruction *I) const {
+ assert(Kind != RK_None && *this && LHS && RHS &&
+ "Expected reduction operation.");
+ switch (Kind) {
+ case RK_Arithmetic:
+ return I->isAssociative();
+ case RK_Min:
+ case RK_Max:
+ return Opcode == Instruction::ICmp ||
+ cast<Instruction>(I->getOperand(0))->isFast();
+ case RK_UMin:
+ case RK_UMax:
+ assert(Opcode == Instruction::ICmp &&
+ "Only integer compare operation is expected.");
+ return true;
+ case RK_None:
+ break;
+ }
+ llvm_unreachable("Reduction kind is not set");
+ }
+
+ /// Checks if the reduction operation can be vectorized.
+ bool isVectorizable(Instruction *I) const {
+ return isVectorizable() && isAssociative(I);
+ }
+
+ /// Checks if two operation data are both a reduction op or both a reduced
+ /// value.
+ bool operator==(const OperationData &OD) {
+ assert(((Kind != OD.Kind) || ((!LHS == !OD.LHS) && (!RHS == !OD.RHS))) &&
+ "One of the comparing operations is incorrect.");
+ return this == &OD || (Kind == OD.Kind && Opcode == OD.Opcode);
+ }
+ bool operator!=(const OperationData &OD) { return !(*this == OD); }
+ void clear() {
+ Opcode = 0;
+ LHS = nullptr;
+ RHS = nullptr;
+ Kind = RK_None;
+ NoNaN = false;
+ }
+
+ /// Get the opcode of the reduction operation.
+ unsigned getOpcode() const {
+ assert(isVectorizable() && "Expected vectorizable operation.");
+ return Opcode;
+ }
+
+ /// Get kind of reduction data.
+ ReductionKind getKind() const { return Kind; }
+ Value *getLHS() const { return LHS; }
+ Value *getRHS() const { return RHS; }
+ Type *getConditionType() const {
+ switch (Kind) {
+ case RK_Arithmetic:
+ return nullptr;
+ case RK_Min:
+ case RK_Max:
+ case RK_UMin:
+ case RK_UMax:
+ return CmpInst::makeCmpResultType(LHS->getType());
+ case RK_None:
+ break;
+ }
+ llvm_unreachable("Reduction kind is not set");
+ }
+
+ /// Creates a reduction operation with the current opcode, propagating the
+ /// IR flags from \p ReductionOps.
+ Value *createOp(IRBuilder<> &Builder, const Twine &Name,
+ const ReductionOpsListType &ReductionOps) const {
+ assert(isVectorizable() &&
+ "Expected add|fadd or min/max reduction operation.");
+ auto *Op = createOp(Builder, Name);
+ switch (Kind) {
+ case RK_Arithmetic:
+ propagateIRFlags(Op, ReductionOps[0]);
+ return Op;
+ case RK_Min:
+ case RK_Max:
+ case RK_UMin:
+ case RK_UMax:
+ if (auto *SI = dyn_cast<SelectInst>(Op))
+ propagateIRFlags(SI->getCondition(), ReductionOps[0]);
+ propagateIRFlags(Op, ReductionOps[1]);
+ return Op;
+ case RK_None:
+ break;
+ }
+ llvm_unreachable("Unknown reduction operation.");
+ }
+ /// Creates a reduction operation with the current opcode, propagating the
+ /// IR flags from \p I.
+ Value *createOp(IRBuilder<> &Builder, const Twine &Name,
+ Instruction *I) const {
+ assert(isVectorizable() &&
+ "Expected add|fadd or min/max reduction operation.");
+ auto *Op = createOp(Builder, Name);
+ switch (Kind) {
+ case RK_Arithmetic:
+ propagateIRFlags(Op, I);
+ return Op;
+ case RK_Min:
+ case RK_Max:
+ case RK_UMin:
+ case RK_UMax:
+ if (auto *SI = dyn_cast<SelectInst>(Op)) {
+ propagateIRFlags(SI->getCondition(),
+ cast<SelectInst>(I)->getCondition());
+ }
+ propagateIRFlags(Op, I);
+ return Op;
+ case RK_None:
+ break;
+ }
+ llvm_unreachable("Unknown reduction operation.");
+ }
+
+ TargetTransformInfo::ReductionFlags getFlags() const {
+ TargetTransformInfo::ReductionFlags Flags;
+ Flags.NoNaN = NoNaN;
+ switch (Kind) {
+ case RK_Arithmetic:
+ break;
+ case RK_Min:
+ Flags.IsSigned = Opcode == Instruction::ICmp;
+ Flags.IsMaxOp = false;
+ break;
+ case RK_Max:
+ Flags.IsSigned = Opcode == Instruction::ICmp;
+ Flags.IsMaxOp = true;
+ break;
+ case RK_UMin:
+ Flags.IsSigned = false;
+ Flags.IsMaxOp = false;
+ break;
+ case RK_UMax:
+ Flags.IsSigned = false;
+ Flags.IsMaxOp = true;
+ break;
+ case RK_None:
+ llvm_unreachable("Reduction kind is not set");
+ }
+ return Flags;
+ }
+ };
+
+ Instruction *ReductionRoot = nullptr;
+
+ /// The operation data of the reduction operation.
+ OperationData ReductionData;
+
+ /// The operation data of the values we perform a reduction on.
+ OperationData ReducedValueData;
- /// The opcode of the reduction.
- Instruction::BinaryOps ReductionOpcode = Instruction::BinaryOpsEnd;
- /// The opcode of the values we perform a reduction on.
- unsigned ReducedValueOpcode = 0;
/// Should we model this reduction as a pairwise reduction tree or a tree that
/// splits the vector in halves and adds those halves.
bool IsPairwiseReduction = false;
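// Editor's illustration (not from the patch): reducing <a, b, c, d>
//   pairwise tree:       (a + b) and (c + d), then (a + b) + (c + d)
//   split-in-half tree:  (a + c) and (b + d), then (a + c) + (b + d)
// Both yield the same value; they differ only in the shuffle masks used to
// form the partial sums.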
@@ -4349,55 +5209,89 @@ class HorizontalReduction {
}
}
+ static OperationData getOperationData(Value *V) {
+ if (!V)
+ return OperationData();
+
+ Value *LHS;
+ Value *RHS;
+ if (m_BinOp(m_Value(LHS), m_Value(RHS)).match(V)) {
+ return OperationData(cast<BinaryOperator>(V)->getOpcode(), LHS, RHS,
+ RK_Arithmetic);
+ }
+ if (auto *Select = dyn_cast<SelectInst>(V)) {
+ // Look for a min/max pattern.
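+      // For example, %s = select (icmp ult %a, %b), %a, %b is recognized as
+      // an unsigned integer min (RK_UMin) reduction operation.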
+ if (m_UMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin);
+ } else if (m_SMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_Min);
+ } else if (m_OrdFMin(m_Value(LHS), m_Value(RHS)).match(Select) ||
+ m_UnordFMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
+ return OperationData(
+ Instruction::FCmp, LHS, RHS, RK_Min,
+ cast<Instruction>(Select->getCondition())->hasNoNaNs());
+ } else if (m_UMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax);
+ } else if (m_SMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_Max);
+ } else if (m_OrdFMax(m_Value(LHS), m_Value(RHS)).match(Select) ||
+ m_UnordFMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
+ return OperationData(
+ Instruction::FCmp, LHS, RHS, RK_Max,
+ cast<Instruction>(Select->getCondition())->hasNoNaNs());
+ }
+ }
+ return OperationData(V);
+ }
+
public:
HorizontalReduction() = default;
/// \brief Try to find a reduction tree.
- bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
+ bool matchAssociativeReduction(PHINode *Phi, Instruction *B) {
assert((!Phi || is_contained(Phi->operands(), B)) &&
"Thi phi needs to use the binary operator");
+ ReductionData = getOperationData(B);
+
    // We could have an initial reduction that is not an add.
// r *= v1 + v2 + v3 + v4
// In such a case start looking for a tree rooted in the first '+'.
if (Phi) {
- if (B->getOperand(0) == Phi) {
+ if (ReductionData.getLHS() == Phi) {
Phi = nullptr;
- B = dyn_cast<BinaryOperator>(B->getOperand(1));
- } else if (B->getOperand(1) == Phi) {
+ B = dyn_cast<Instruction>(ReductionData.getRHS());
+ ReductionData = getOperationData(B);
+ } else if (ReductionData.getRHS() == Phi) {
Phi = nullptr;
- B = dyn_cast<BinaryOperator>(B->getOperand(0));
+ B = dyn_cast<Instruction>(ReductionData.getLHS());
+ ReductionData = getOperationData(B);
}
}
- if (!B)
+ if (!ReductionData.isVectorizable(B))
return false;
Type *Ty = B->getType();
if (!isValidElementType(Ty))
return false;
- ReductionOpcode = B->getOpcode();
- ReducedValueOpcode = 0;
+ ReducedValueData.clear();
ReductionRoot = B;
- // We currently only support adds.
- if ((ReductionOpcode != Instruction::Add &&
- ReductionOpcode != Instruction::FAdd) ||
- !B->isAssociative())
- return false;
-
// Post order traverse the reduction tree starting at B. We only handle true
- // trees containing only binary operators or selects.
+ // trees containing only binary operators.
SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
- Stack.push_back(std::make_pair(B, 0));
+ Stack.push_back(std::make_pair(B, ReductionData.getFirstOperandIndex()));
+ ReductionData.initReductionOps(ReductionOps);
while (!Stack.empty()) {
Instruction *TreeN = Stack.back().first;
unsigned EdgeToVist = Stack.back().second++;
- bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
+ OperationData OpData = getOperationData(TreeN);
+ bool IsReducedValue = OpData != ReductionData;
      // Postorder visit.
- if (EdgeToVist == 2 || IsReducedValue) {
+ if (IsReducedValue || EdgeToVist == OpData.getNumberOfOperands()) {
if (IsReducedValue)
ReducedVals.push_back(TreeN);
else {
@@ -4415,7 +5309,7 @@ public:
markExtraArg(Stack[Stack.size() - 2], TreeN);
ExtraArgs.erase(TreeN);
} else
- ReductionOps.push_back(TreeN);
+ ReductionData.addReductionOps(TreeN, ReductionOps);
}
// Retract.
Stack.pop_back();
@@ -4426,45 +5320,50 @@ public:
Value *NextV = TreeN->getOperand(EdgeToVist);
if (NextV != Phi) {
auto *I = dyn_cast<Instruction>(NextV);
+ OpData = getOperationData(I);
// Continue analysis if the next operand is a reduction operation or
        // (possibly) a reduced value. If the reduced value data is not set yet,
        // the first operation encountered that differs from the reduction
        // operation is treated as the reduced value class.
- if (I && (!ReducedValueOpcode || I->getOpcode() == ReducedValueOpcode ||
- I->getOpcode() == ReductionOpcode)) {
+ if (I && (!ReducedValueData || OpData == ReducedValueData ||
+ OpData == ReductionData)) {
+ const bool IsReductionOperation = OpData == ReductionData;
// Only handle trees in the current basic block.
- if (I->getParent() != B->getParent()) {
+ if (!ReductionData.hasSameParent(I, B->getParent(),
+ IsReductionOperation)) {
// I is an extra argument for TreeN (its parent operation).
markExtraArg(Stack.back(), I);
continue;
}
- // Each tree node needs to have one user except for the ultimate
- // reduction.
- if (!I->hasOneUse() && I != B) {
+        // Each tree node needs to have a minimal number of users, except for
+        // the ultimate reduction.
+ if (!ReductionData.hasRequiredNumberOfUses(I,
+ OpData == ReductionData) &&
+ I != B) {
// I is an extra argument for TreeN (its parent operation).
markExtraArg(Stack.back(), I);
continue;
}
- if (I->getOpcode() == ReductionOpcode) {
+ if (IsReductionOperation) {
// We need to be able to reassociate the reduction operations.
- if (!I->isAssociative()) {
+ if (!OpData.isAssociative(I)) {
// I is an extra argument for TreeN (its parent operation).
markExtraArg(Stack.back(), I);
continue;
}
- } else if (ReducedValueOpcode &&
- ReducedValueOpcode != I->getOpcode()) {
+ } else if (ReducedValueData &&
+ ReducedValueData != OpData) {
// Make sure that the opcodes of the operations that we are going to
// reduce match.
// I is an extra argument for TreeN (its parent operation).
markExtraArg(Stack.back(), I);
continue;
- } else if (!ReducedValueOpcode)
- ReducedValueOpcode = I->getOpcode();
+ } else if (!ReducedValueData)
+ ReducedValueData = OpData;
- Stack.push_back(std::make_pair(I, 0));
+ Stack.push_back(std::make_pair(I, OpData.getFirstOperandIndex()));
continue;
}
}
@@ -4492,7 +5391,7 @@ public:
Value *VectorizedTree = nullptr;
IRBuilder<> Builder(ReductionRoot);
FastMathFlags Unsafe;
- Unsafe.setUnsafeAlgebra();
+ Unsafe.setFast();
Builder.setFastMathFlags(Unsafe);
unsigned i = 0;
@@ -4501,12 +5400,15 @@ public:
// to use it.
for (auto &Pair : ExtraArgs)
ExternallyUsedValues[Pair.second].push_back(Pair.first);
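+    // Collect all reduction operations into a single list so their uses of
+    // the reduced values can be ignored while building the tree.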
+ SmallVector<Value *, 16> IgnoreList;
+ for (auto &V : ReductionOps)
+ IgnoreList.append(V.begin(), V.end());
while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
- V.buildTree(VL, ExternallyUsedValues, ReductionOps);
+ V.buildTree(VL, ExternallyUsedValues, IgnoreList);
if (V.shouldReorder()) {
SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend());
- V.buildTree(Reversed, ExternallyUsedValues, ReductionOps);
+ V.buildTree(Reversed, ExternallyUsedValues, IgnoreList);
}
if (V.isTreeTinyAndNotFullyVectorizable())
break;
@@ -4516,17 +5418,27 @@ public:
// Estimate cost.
int Cost =
V.getTreeCost() + getReductionCost(TTI, ReducedVals[i], ReduxWidth);
- if (Cost >= -SLPCostThreshold)
- break;
+ if (Cost >= -SLPCostThreshold) {
+ V.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(
+ SV_NAME, "HorSLPNotBeneficial", cast<Instruction>(VL[0]))
+                 << "Vectorizing horizontal reduction is possible "
+ << "but not beneficial with cost "
+ << ore::NV("Cost", Cost) << " and threshold "
+ << ore::NV("Threshold", -SLPCostThreshold);
+ });
+ break;
+ }
DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
<< ". (HorRdx)\n");
- auto *I0 = cast<Instruction>(VL[0]);
- V.getORE()->emit(
- OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", I0)
+ V.getORE()->emit([&]() {
+ return OptimizationRemark(
+ SV_NAME, "VectorizedHorizontalReduction", cast<Instruction>(VL[0]))
<< "Vectorized horizontal reduction with cost "
<< ore::NV("Cost", Cost) << " and with tree size "
- << ore::NV("TreeSize", V.getTreeSize()));
+ << ore::NV("TreeSize", V.getTreeSize());
+ });
// Vectorize a tree.
DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
@@ -4534,12 +5446,14 @@ public:
// Emit a reduction.
Value *ReducedSubTree =
- emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps, TTI);
+ emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
if (VectorizedTree) {
Builder.SetCurrentDebugLocation(Loc);
- VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
- ReducedSubTree, "bin.rdx");
- propagateIRFlags(VectorizedTree, ReductionOps);
+ OperationData VectReductionData(ReductionData.getOpcode(),
+ VectorizedTree, ReducedSubTree,
+ ReductionData.getKind());
+ VectorizedTree =
+ VectReductionData.createOp(Builder, "op.rdx", ReductionOps);
} else
VectorizedTree = ReducedSubTree;
i += ReduxWidth;
@@ -4551,9 +5465,10 @@ public:
for (; i < NumReducedVals; ++i) {
auto *I = cast<Instruction>(ReducedVals[i]);
Builder.SetCurrentDebugLocation(I->getDebugLoc());
- VectorizedTree =
- Builder.CreateBinOp(ReductionOpcode, VectorizedTree, I);
- propagateIRFlags(VectorizedTree, ReductionOps);
+ OperationData VectReductionData(ReductionData.getOpcode(),
+ VectorizedTree, I,
+ ReductionData.getKind());
+ VectorizedTree = VectReductionData.createOp(Builder, "", ReductionOps);
}
for (auto &Pair : ExternallyUsedValues) {
assert(!Pair.second.empty() &&
@@ -4561,9 +5476,10 @@ public:
// Add each externally used value to the final reduction.
for (auto *I : Pair.second) {
Builder.SetCurrentDebugLocation(I->getDebugLoc());
- VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
- Pair.first, "bin.extra");
- propagateIRFlags(VectorizedTree, I);
+ OperationData VectReductionData(ReductionData.getOpcode(),
+ VectorizedTree, Pair.first,
+ ReductionData.getKind());
+ VectorizedTree = VectReductionData.createOp(Builder, "op.extra", I);
}
}
// Update users.
@@ -4583,15 +5499,58 @@ private:
Type *ScalarTy = FirstReducedVal->getType();
Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
- int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
- int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);
+ int PairwiseRdxCost;
+ int SplittingRdxCost;
+ switch (ReductionData.getKind()) {
+ case RK_Arithmetic:
+ PairwiseRdxCost =
+ TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,
+ /*IsPairwiseForm=*/true);
+ SplittingRdxCost =
+ TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,
+ /*IsPairwiseForm=*/false);
+ break;
+ case RK_Min:
+ case RK_Max:
+ case RK_UMin:
+ case RK_UMax: {
+ Type *VecCondTy = CmpInst::makeCmpResultType(VecTy);
+ bool IsUnsigned = ReductionData.getKind() == RK_UMin ||
+ ReductionData.getKind() == RK_UMax;
+ PairwiseRdxCost =
+ TTI->getMinMaxReductionCost(VecTy, VecCondTy,
+ /*IsPairwiseForm=*/true, IsUnsigned);
+ SplittingRdxCost =
+ TTI->getMinMaxReductionCost(VecTy, VecCondTy,
+ /*IsPairwiseForm=*/false, IsUnsigned);
+ break;
+ }
+ case RK_None:
+ llvm_unreachable("Expected arithmetic or min/max reduction operation");
+ }
IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
- int ScalarReduxCost =
- (ReduxWidth - 1) *
- TTI->getArithmeticInstrCost(ReductionOpcode, ScalarTy);
+ int ScalarReduxCost;
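+    // A scalar reduction of ReduxWidth values takes ReduxWidth - 1 scalar
+    // operations; compute the cost of a single such operation here.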
+ switch (ReductionData.getKind()) {
+ case RK_Arithmetic:
+ ScalarReduxCost =
+ TTI->getArithmeticInstrCost(ReductionData.getOpcode(), ScalarTy);
+ break;
+ case RK_Min:
+ case RK_Max:
+ case RK_UMin:
+ case RK_UMax:
+ ScalarReduxCost =
+ TTI->getCmpSelInstrCost(ReductionData.getOpcode(), ScalarTy) +
+ TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
+ CmpInst::makeCmpResultType(ScalarTy));
+ break;
+ case RK_None:
+ llvm_unreachable("Expected arithmetic or min/max reduction operation");
+ }
+ ScalarReduxCost *= (ReduxWidth - 1);
DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
<< " for reduction that starts with " << *FirstReducedVal
@@ -4604,16 +5563,15 @@ private:
/// \brief Emit a horizontal reduction of the vectorized value.
Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
- unsigned ReduxWidth, ArrayRef<Value *> RedOps,
- const TargetTransformInfo *TTI) {
+ unsigned ReduxWidth, const TargetTransformInfo *TTI) {
assert(VectorizedValue && "Need to have a vectorized tree node");
assert(isPowerOf2_32(ReduxWidth) &&
"We only handle power-of-two reductions for now");
if (!IsPairwiseReduction)
return createSimpleTargetReduction(
- Builder, TTI, ReductionOpcode, VectorizedValue,
- TargetTransformInfo::ReductionFlags(), RedOps);
+ Builder, TTI, ReductionData.getOpcode(), VectorizedValue,
+ ReductionData.getFlags(), ReductionOps.back());
Value *TmpVec = VectorizedValue;
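+    // Emit a log2(ReduxWidth) sequence of shuffles and reduction operations,
+    // halving the active vector width on each iteration.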
for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
@@ -4627,15 +5585,16 @@ private:
Value *RightShuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
"rdx.shuf.r");
- TmpVec =
- Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf, "bin.rdx");
- propagateIRFlags(TmpVec, RedOps);
+ OperationData VectReductionData(ReductionData.getOpcode(), LeftShuf,
+ RightShuf, ReductionData.getKind());
+ TmpVec = VectReductionData.createOp(Builder, "op.rdx", ReductionOps);
}
// The result is in the first element of the vector.
return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
}
};
+
} // end anonymous namespace
/// \brief Recognize construction of vectors like
@@ -4643,39 +5602,29 @@ private:
/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
+/// starting from the last insertelement instruction.
///
/// Returns true if it matches
-///
-static bool findBuildVector(InsertElementInst *FirstInsertElem,
+static bool findBuildVector(InsertElementInst *LastInsertElem,
SmallVectorImpl<Value *> &BuildVector,
SmallVectorImpl<Value *> &BuildVectorOpds) {
- if (!isa<UndefValue>(FirstInsertElem->getOperand(0)))
- return false;
-
- InsertElementInst *IE = FirstInsertElem;
- while (true) {
- BuildVector.push_back(IE);
- BuildVectorOpds.push_back(IE->getOperand(1));
-
- if (IE->use_empty())
- return false;
-
- InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back());
- if (!NextUse)
- return true;
-
- // If this isn't the final use, make sure the next insertelement is the only
- // use. It's OK if the final constructed vector is used multiple times
- if (!IE->hasOneUse())
+ Value *V = nullptr;
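+  // Walk the insertelement chain backwards through operand 0 until the
+  // initial undef vector is reached, collecting the inserted scalars along
+  // the way.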
+ do {
+ BuildVector.push_back(LastInsertElem);
+ BuildVectorOpds.push_back(LastInsertElem->getOperand(1));
+ V = LastInsertElem->getOperand(0);
+ if (isa<UndefValue>(V))
+ break;
+ LastInsertElem = dyn_cast<InsertElementInst>(V);
+ if (!LastInsertElem || !LastInsertElem->hasOneUse())
return false;
-
- IE = NextUse;
- }
-
- return false;
+ } while (true);
+ std::reverse(BuildVector.begin(), BuildVector.end());
+ std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
+ return true;
}
-/// \brief Like findBuildVector, but looks backwards for construction of aggregate.
+/// \brief Like findBuildVector, but looks for construction of an aggregate.
///
/// \return true if it matches.
static bool findBuildAggregate(InsertValueInst *IV,
@@ -4767,14 +5716,14 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
static bool tryToVectorizeHorReductionOrInstOperands(
PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
TargetTransformInfo *TTI,
- const function_ref<bool(BinaryOperator *, BoUpSLP &)> Vectorize) {
+ const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) {
if (!ShouldVectorizeHor)
return false;
if (!Root)
return false;
- if (Root->getParent() != BB)
+ if (Root->getParent() != BB || isa<PHINode>(Root))
return false;
  // Start the analysis from the Root instruction. If a horizontal reduction is
  // found, try to vectorize it. If it is not a horizontal reduction or
@@ -4795,11 +5744,13 @@ static bool tryToVectorizeHorReductionOrInstOperands(
if (!V)
continue;
auto *Inst = dyn_cast<Instruction>(V);
- if (!Inst || isa<PHINode>(Inst))
+ if (!Inst)
continue;
- if (auto *BI = dyn_cast<BinaryOperator>(Inst)) {
+ auto *BI = dyn_cast<BinaryOperator>(Inst);
+ auto *SI = dyn_cast<SelectInst>(Inst);
+ if (BI || SI) {
HorizontalReduction HorRdx;
- if (HorRdx.matchAssociativeReduction(P, BI)) {
+ if (HorRdx.matchAssociativeReduction(P, Inst)) {
if (HorRdx.tryToReduce(R, TTI)) {
Res = true;
// Set P to nullptr to avoid re-analysis of phi node in
@@ -4808,7 +5759,7 @@ static bool tryToVectorizeHorReductionOrInstOperands(
continue;
}
}
- if (P) {
+ if (P && BI) {
Inst = dyn_cast<Instruction>(BI->getOperand(0));
if (Inst == P)
Inst = dyn_cast<Instruction>(BI->getOperand(1));
@@ -4823,15 +5774,20 @@ static bool tryToVectorizeHorReductionOrInstOperands(
// Set P to nullptr to avoid re-analysis of phi node in
// matchAssociativeReduction function unless this is the root node.
P = nullptr;
- if (Vectorize(dyn_cast<BinaryOperator>(Inst), R)) {
+ if (Vectorize(Inst, R)) {
Res = true;
continue;
}
// Try to vectorize operands.
+    // Only continue the analysis for instructions from the same basic block,
+    // to save compile time.
if (++Level < RecursionMaxDepth)
for (auto *Op : Inst->operand_values())
- Stack.emplace_back(Op, Level);
+ if (VisitedInstrs.insert(Op).second)
+ if (auto *I = dyn_cast<Instruction>(Op))
+ if (!isa<PHINode>(I) && I->getParent() == BB)
+ Stack.emplace_back(Op, Level);
}
return Res;
}
@@ -4848,10 +5804,71 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
if (!isa<BinaryOperator>(I))
P = nullptr;
// Try to match and vectorize a horizontal reduction.
- return tryToVectorizeHorReductionOrInstOperands(
- P, I, BB, R, TTI, [this](BinaryOperator *BI, BoUpSLP &R) -> bool {
- return tryToVectorize(BI, R);
- });
+ auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool {
+ return tryToVectorize(I, R);
+ };
+ return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI,
+ ExtraVectorization);
+}
+
+bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
+ BasicBlock *BB, BoUpSLP &R) {
+ const DataLayout &DL = BB->getModule()->getDataLayout();
+ if (!R.canMapToVector(IVI->getType(), DL))
+ return false;
+
+ SmallVector<Value *, 16> BuildVector;
+ SmallVector<Value *, 16> BuildVectorOpds;
+ if (!findBuildAggregate(IVI, BuildVector, BuildVectorOpds))
+ return false;
+
+ DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
+  // An aggregate value is unlikely to be kept in a vector register; we need to
+  // extract the scalars into scalar registers, so NeedExtraction is set to
+  // true.
+ return tryToVectorizeList(BuildVectorOpds, R, BuildVector, false, true);
+}
+
+bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
+ BasicBlock *BB, BoUpSLP &R) {
+ SmallVector<Value *, 16> BuildVector;
+ SmallVector<Value *, 16> BuildVectorOpds;
+ if (!findBuildVector(IEI, BuildVector, BuildVectorOpds))
+ return false;
+
+  // Vectorize starting with the build vector operands, ignoring the
+  // BuildVector instructions for the purpose of scheduling and user
+  // extraction.
+ return tryToVectorizeList(BuildVectorOpds, R, BuildVector);
+}
+
+bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB,
+ BoUpSLP &R) {
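+  // First try to vectorize the compare's operands as a pair; if that fails,
+  // try to match a horizontal reduction (or vectorize the operand trees)
+  // rooted at each of the two operands.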
+ if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R))
+ return true;
+
+ bool OpsChanged = false;
+ for (int Idx = 0; Idx < 2; ++Idx) {
+ OpsChanged |=
+ vectorizeRootInstruction(nullptr, CI->getOperand(Idx), BB, R, TTI);
+ }
+ return OpsChanged;
+}
+
+bool SLPVectorizerPass::vectorizeSimpleInstructions(
+ SmallVectorImpl<WeakVH> &Instructions, BasicBlock *BB, BoUpSLP &R) {
+ bool OpsChanged = false;
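+  // Walk the collected instructions in reverse order so the ones appearing
+  // later in the basic block are processed first.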
+ for (auto &VH : reverse(Instructions)) {
+ auto *I = dyn_cast_or_null<Instruction>(VH);
+ if (!I)
+ continue;
+ if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I))
+ OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
+ else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I))
+ OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
+ else if (auto *CI = dyn_cast<CmpInst>(I))
+ OpsChanged |= vectorizeCmpInst(CI, BB, R);
+ }
+ Instructions.clear();
+ return OpsChanged;
}
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
@@ -4913,10 +5930,21 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
VisitedInstrs.clear();
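+  // KeyNodes collects instructions without users (terminators, stores, calls
+  // with ignored results); PostProcessInstructions gathers insertelement,
+  // insertvalue and cmp instructions that are vectorized once such a key node
+  // is reached.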
+ SmallVector<WeakVH, 8> PostProcessInstructions;
+ SmallDenseSet<Instruction *, 4> KeyNodes;
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
// We may go through BB multiple times so skip the one we have checked.
- if (!VisitedInstrs.insert(&*it).second)
+ if (!VisitedInstrs.insert(&*it).second) {
+ if (it->use_empty() && KeyNodes.count(&*it) > 0 &&
+ vectorizeSimpleInstructions(PostProcessInstructions, BB, R)) {
+        // We would like to start over since some instructions are deleted
+        // and the iterator may become invalid.
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ }
continue;
+ }
if (isa<DbgInfoIntrinsic>(it))
continue;
@@ -4938,96 +5966,37 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
continue;
}
- if (ShouldStartVectorizeHorAtStore) {
- if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
- // Try to match and vectorize a horizontal reduction.
- if (vectorizeRootInstruction(nullptr, SI->getValueOperand(), BB, R,
- TTI)) {
- Changed = true;
- it = BB->begin();
- e = BB->end();
- continue;
+    // We ran into an instruction without users, such as a terminator, a store,
+    // or a function call with an ignored return value. An instruction counts
+    // as unused based on its type being void, except for CallInst and
+    // InvokeInst, whose results may be ignored even when non-void.
+ if (it->use_empty() && (it->getType()->isVoidTy() || isa<CallInst>(it) ||
+ isa<InvokeInst>(it))) {
+ KeyNodes.insert(&*it);
+ bool OpsChanged = false;
+ if (ShouldStartVectorizeHorAtStore || !isa<StoreInst>(it)) {
+ for (auto *V : it->operand_values()) {
+ // Try to match and vectorize a horizontal reduction.
+ OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI);
}
}
- }
-
- // Try to vectorize horizontal reductions feeding into a return.
- if (ReturnInst *RI = dyn_cast<ReturnInst>(it)) {
- if (RI->getNumOperands() != 0) {
- // Try to match and vectorize a horizontal reduction.
- if (vectorizeRootInstruction(nullptr, RI->getOperand(0), BB, R, TTI)) {
- Changed = true;
- it = BB->begin();
- e = BB->end();
- continue;
- }
- }
- }
-
- // Try to vectorize trees that start at compare instructions.
- if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
- if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
- Changed = true;
+      // Start vectorizing the post-process list of instructions from the
+      // top-of-the-tree instructions, to try to vectorize as many instructions
+      // as possible.
+ OpsChanged |= vectorizeSimpleInstructions(PostProcessInstructions, BB, R);
+ if (OpsChanged) {
// We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
- it = BB->begin();
- e = BB->end();
- continue;
- }
-
- for (int I = 0; I < 2; ++I) {
- if (vectorizeRootInstruction(nullptr, CI->getOperand(I), BB, R, TTI)) {
- Changed = true;
- // We would like to start over since some instructions are deleted
- // and the iterator may become invalid value.
- it = BB->begin();
- e = BB->end();
- break;
- }
- }
- continue;
- }
-
- // Try to vectorize trees that start at insertelement instructions.
- if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
- SmallVector<Value *, 16> BuildVector;
- SmallVector<Value *, 16> BuildVectorOpds;
- if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
- continue;
-
- // Vectorize starting with the build vector operands ignoring the
- // BuildVector instructions for the purpose of scheduling and user
- // extraction.
- if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
Changed = true;
it = BB->begin();
e = BB->end();
+ continue;
}
-
- continue;
}
- // Try to vectorize trees that start at insertvalue instructions feeding into
- // a store.
- if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
- if (InsertValueInst *LastInsertValue = dyn_cast<InsertValueInst>(SI->getValueOperand())) {
- const DataLayout &DL = BB->getModule()->getDataLayout();
- if (R.canMapToVector(SI->getValueOperand()->getType(), DL)) {
- SmallVector<Value *, 16> BuildVector;
- SmallVector<Value *, 16> BuildVectorOpds;
- if (!findBuildAggregate(LastInsertValue, BuildVector, BuildVectorOpds))
- continue;
+ if (isa<InsertElementInst>(it) || isa<CmpInst>(it) ||
+ isa<InsertValueInst>(it))
+ PostProcessInstructions.push_back(&*it);
- DEBUG(dbgs() << "SLP: store of array mappable to vector: " << *SI << "\n");
- if (tryToVectorizeList(BuildVectorOpds, R, BuildVector, false)) {
- Changed = true;
- it = BB->begin();
- e = BB->end();
- }
- continue;
- }
- }
- }
}
return Changed;
@@ -5036,7 +6005,6 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
auto Changed = false;
for (auto &Entry : GEPs) {
-
// If the getelementptr list has fewer than two elements, there's nothing
// to do.
if (Entry.second.size() < 2)
@@ -5141,7 +6109,9 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
}
char SLPVectorizer::ID = 0;
+
static const char lv_name[] = "SLP Vectorizer";
+
INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
@@ -5152,6 +6122,4 @@ INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
-namespace llvm {
-Pass *createSLPVectorizerPass() { return new SLPVectorizer(); }
-}
+Pass *llvm::createSLPVectorizerPass() { return new SLPVectorizer(); }