Diffstat (limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r--  lib/Transforms/Vectorize/SLPVectorizer.cpp | 1926
1 file changed, 1196 insertions(+), 730 deletions(-)
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a7ccd3faec44..ac8c4f046c6f 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -161,7 +161,7 @@ static const unsigned MaxMemDepDistance = 160;
/// regions to be handled.
static const int MinScheduleRegionSize = 16;
-/// \brief Predicate for the element types that the SLP vectorizer supports.
+/// Predicate for the element types that the SLP vectorizer supports.
///
/// The most important things to filter here are types which are invalid in LLVM
/// vectors. We also filter target specific types which have absolutely no
@@ -246,13 +246,15 @@ static bool isSplat(ArrayRef<Value *> VL) {
/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
/// ret <4 x i8> %ins4
/// InstCombiner transforms this into a shuffle and vector mul
+/// TODO: Can we split off and reuse the shuffle mask detection from
+/// TargetTransformInfo::getInstructionThroughput?
static Optional<TargetTransformInfo::ShuffleKind>
isShuffle(ArrayRef<Value *> VL) {
auto *EI0 = cast<ExtractElementInst>(VL[0]);
unsigned Size = EI0->getVectorOperandType()->getVectorNumElements();
Value *Vec1 = nullptr;
Value *Vec2 = nullptr;
- enum ShuffleMode {Unknown, FirstAlternate, SecondAlternate, Permute};
+ enum ShuffleMode { Unknown, Select, Permute };
ShuffleMode CommonShuffleMode = Unknown;
for (unsigned I = 0, E = VL.size(); I < E; ++I) {
auto *EI = cast<ExtractElementInst>(VL[I]);
@@ -272,7 +274,11 @@ isShuffle(ArrayRef<Value *> VL) {
continue;
// For correct shuffling we have to have at most 2 different vector operands
// in all extractelement instructions.
- if (Vec1 && Vec2 && Vec != Vec1 && Vec != Vec2)
+ if (!Vec1 || Vec1 == Vec)
+ Vec1 = Vec;
+ else if (!Vec2 || Vec2 == Vec)
+ Vec2 = Vec;
+ else
return None;
if (CommonShuffleMode == Permute)
continue;
@@ -282,119 +288,17 @@ isShuffle(ArrayRef<Value *> VL) {
CommonShuffleMode = Permute;
continue;
}
- // Check the shuffle mode for the current operation.
- if (!Vec1)
- Vec1 = Vec;
- else if (Vec != Vec1)
- Vec2 = Vec;
- // Example: shufflevector A, B, <0,5,2,7>
- // I is odd and IntIdx for A == I - FirstAlternate shuffle.
- // I is even and IntIdx for B == I - FirstAlternate shuffle.
- // Example: shufflevector A, B, <4,1,6,3>
- // I is even and IntIdx for A == I - SecondAlternate shuffle.
- // I is odd and IntIdx for B == I - SecondAlternate shuffle.
- const bool IIsEven = I & 1;
- const bool CurrVecIsA = Vec == Vec1;
- const bool IIsOdd = !IIsEven;
- const bool CurrVecIsB = !CurrVecIsA;
- ShuffleMode CurrentShuffleMode =
- ((IIsOdd && CurrVecIsA) || (IIsEven && CurrVecIsB)) ? FirstAlternate
- : SecondAlternate;
- // Common mode is not set or the same as the shuffle mode of the current
- // operation - alternate.
- if (CommonShuffleMode == Unknown)
- CommonShuffleMode = CurrentShuffleMode;
- // Common shuffle mode is not the same as the shuffle mode of the current
- // operation - permutation.
- if (CommonShuffleMode != CurrentShuffleMode)
- CommonShuffleMode = Permute;
+ CommonShuffleMode = Select;
}
// If we're not crossing lanes in different vectors, consider it as blending.
- if ((CommonShuffleMode == FirstAlternate ||
- CommonShuffleMode == SecondAlternate) &&
- Vec2)
- return TargetTransformInfo::SK_Alternate;
+ if (CommonShuffleMode == Select && Vec2)
+ return TargetTransformInfo::SK_Select;
// If Vec2 was never used, we have a permutation of a single vector, otherwise
// we have permutation of 2 vectors.
return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
: TargetTransformInfo::SK_PermuteSingleSrc;
}
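
For illustration, a minimal standalone sketch of the classification rule this hunk adopts (plain C++ with hypothetical names, not the LLVM API, and omitting the one-source vs. two-source distinction): a lane I that extracts element I of some source keeps the bundle a Select (blend); any crossed lane turns the whole bundle into a Permute.

#include <cstdio>
#include <utility>
#include <vector>

enum ShuffleMode { Unknown, Select, Permute };

// Each lane: {source vector id, extracted element index}.
ShuffleMode classify(const std::vector<std::pair<int, unsigned>> &Lanes) {
  ShuffleMode Mode = Unknown;
  for (unsigned I = 0; I < Lanes.size(); ++I) {
    if (Lanes[I].second != I)
      return Permute; // lane crossing: a permutation, whatever the sources
    Mode = Select;    // in-lane pick, possibly from another source: a blend
  }
  return Mode;
}

int main() {
  // <a0, b1, a2, b3>: every lane stays in place -> Select.
  std::printf("%d\n", classify({{0, 0}, {1, 1}, {0, 2}, {1, 3}}) == Select);
  // <a1, b0>: lane 0 reads element 1 -> Permute.
  std::printf("%d\n", classify({{0, 1}, {1, 0}}) == Permute);
}
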
-///\returns Opcode that can be clubbed with \p Op to create an alternate
-/// sequence which can later be merged as a ShuffleVector instruction.
-static unsigned getAltOpcode(unsigned Op) {
- switch (Op) {
- case Instruction::FAdd:
- return Instruction::FSub;
- case Instruction::FSub:
- return Instruction::FAdd;
- case Instruction::Add:
- return Instruction::Sub;
- case Instruction::Sub:
- return Instruction::Add;
- default:
- return 0;
- }
-}
-
-static bool isOdd(unsigned Value) {
- return Value & 1;
-}
-
-static bool sameOpcodeOrAlt(unsigned Opcode, unsigned AltOpcode,
- unsigned CheckedOpcode) {
- return Opcode == CheckedOpcode || AltOpcode == CheckedOpcode;
-}
-
-/// Chooses the correct key for scheduling data. If \p Op has the same (or
-/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
-/// OpValue.
-static Value *isOneOf(Value *OpValue, Value *Op) {
- auto *I = dyn_cast<Instruction>(Op);
- if (!I)
- return OpValue;
- auto *OpInst = cast<Instruction>(OpValue);
- unsigned OpInstOpcode = OpInst->getOpcode();
- unsigned IOpcode = I->getOpcode();
- if (sameOpcodeOrAlt(OpInstOpcode, getAltOpcode(OpInstOpcode), IOpcode))
- return Op;
- return OpValue;
-}
-
-namespace {
-
-/// Contains data for the instructions going to be vectorized.
-struct RawInstructionsData {
- /// Main Opcode of the instructions going to be vectorized.
- unsigned Opcode = 0;
-
- /// The list of instructions have some instructions with alternate opcodes.
- bool HasAltOpcodes = false;
-};
-
-} // end anonymous namespace
-
-/// Checks the list of the vectorized instructions \p VL and returns info about
-/// this list.
-static RawInstructionsData getMainOpcode(ArrayRef<Value *> VL) {
- auto *I0 = dyn_cast<Instruction>(VL[0]);
- if (!I0)
- return {};
- RawInstructionsData Res;
- unsigned Opcode = I0->getOpcode();
- // Walk through the list of the vectorized instructions
- // in order to check its structure described by RawInstructionsData.
- for (unsigned Cnt = 0, E = VL.size(); Cnt != E; ++Cnt) {
- auto *I = dyn_cast<Instruction>(VL[Cnt]);
- if (!I)
- return {};
- if (Opcode != I->getOpcode())
- Res.HasAltOpcodes = true;
- }
- Res.Opcode = Opcode;
- return Res;
-}
-
namespace {
/// Main data required for vectorization of instructions.
@@ -402,42 +306,90 @@ struct InstructionsState {
/// The very first instruction in the list with the main opcode.
Value *OpValue = nullptr;
- /// The main opcode for the list of instructions.
- unsigned Opcode = 0;
+ /// The main/alternate instruction.
+ Instruction *MainOp = nullptr;
+ Instruction *AltOp = nullptr;
+
+ /// The main/alternate opcodes for the list of instructions.
+ unsigned getOpcode() const {
+ return MainOp ? MainOp->getOpcode() : 0;
+ }
+
+ unsigned getAltOpcode() const {
+ return AltOp ? AltOp->getOpcode() : 0;
+ }
/// Some of the instructions in the list have alternate opcodes.
- bool IsAltShuffle = false;
+ bool isAltShuffle() const { return getOpcode() != getAltOpcode(); }
+
+ bool isOpcodeOrAlt(Instruction *I) const {
+ unsigned CheckedOpcode = I->getOpcode();
+ return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
+ }
- InstructionsState() = default;
- InstructionsState(Value *OpValue, unsigned Opcode, bool IsAltShuffle)
- : OpValue(OpValue), Opcode(Opcode), IsAltShuffle(IsAltShuffle) {}
+ InstructionsState() = delete;
+ InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
+ : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
};
} // end anonymous namespace
+/// Chooses the correct key for scheduling data. If \p Op has the same (or
+/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
+/// OpValue.
+static Value *isOneOf(const InstructionsState &S, Value *Op) {
+ auto *I = dyn_cast<Instruction>(Op);
+ if (I && S.isOpcodeOrAlt(I))
+ return Op;
+ return S.OpValue;
+}
+
/// \returns analysis of the Instructions in \p VL described in
/// InstructionsState: the opcode under which we suppose the whole list
/// could be vectorized, even if its structure is diverse.
-static InstructionsState getSameOpcode(ArrayRef<Value *> VL) {
- auto Res = getMainOpcode(VL);
- unsigned Opcode = Res.Opcode;
- if (!Res.HasAltOpcodes)
- return InstructionsState(VL[0], Opcode, false);
- auto *OpInst = cast<Instruction>(VL[0]);
- unsigned AltOpcode = getAltOpcode(Opcode);
- // Examine each element in the list instructions VL to determine
- // if some operations there could be considered as an alternative
- // (for example as subtraction relates to addition operation).
+static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
+ unsigned BaseIndex = 0) {
+ // Make sure these are all Instructions.
+ if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
+ return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+
+ bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
+ bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
+ unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
+ unsigned AltOpcode = Opcode;
+ unsigned AltIndex = BaseIndex;
+
+ // Check for one alternate opcode from another BinaryOperator.
+ // TODO - generalize to support all operators (types, calls etc.).
for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
- auto *I = cast<Instruction>(VL[Cnt]);
- unsigned InstOpcode = I->getOpcode();
- if ((Res.HasAltOpcodes &&
- InstOpcode != (isOdd(Cnt) ? AltOpcode : Opcode)) ||
- (!Res.HasAltOpcodes && InstOpcode != Opcode)) {
- return InstructionsState(OpInst, 0, false);
- }
+ unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode();
+ if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) {
+ if (InstOpcode == Opcode || InstOpcode == AltOpcode)
+ continue;
+ if (Opcode == AltOpcode) {
+ AltOpcode = InstOpcode;
+ AltIndex = Cnt;
+ continue;
+ }
+ } else if (IsCastOp && isa<CastInst>(VL[Cnt])) {
+ Type *Ty0 = cast<Instruction>(VL[BaseIndex])->getOperand(0)->getType();
+ Type *Ty1 = cast<Instruction>(VL[Cnt])->getOperand(0)->getType();
+ if (Ty0 == Ty1) {
+ if (InstOpcode == Opcode || InstOpcode == AltOpcode)
+ continue;
+ if (Opcode == AltOpcode) {
+ AltOpcode = InstOpcode;
+ AltIndex = Cnt;
+ continue;
+ }
+ }
+ } else if (InstOpcode == Opcode || InstOpcode == AltOpcode)
+ continue;
+ return InstructionsState(VL[BaseIndex], nullptr, nullptr);
}
- return InstructionsState(OpInst, Opcode, Res.HasAltOpcodes);
+
+ return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
+ cast<Instruction>(VL[AltIndex]));
}
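
As a worked illustration of the rule getSameOpcode now implements (a hedged sketch with plain int opcodes, not llvm::Instruction codes, and without the BinaryOperator/CastInst type checks the real code performs): the first opcode becomes Main, the first different opcode claims the single Alt slot, and any third distinct opcode rejects the bundle.

#include <optional>
#include <utility>
#include <vector>

std::optional<std::pair<int, int>> // {MainOpcode, AltOpcode}
mainAndAltOpcode(const std::vector<int> &Opcodes) {
  if (Opcodes.empty())
    return std::nullopt;
  int Main = Opcodes[0], Alt = Main;
  for (int Op : Opcodes) {
    if (Op == Main || Op == Alt)
      continue;
    if (Main == Alt) { // the alternate slot is still free: claim it once
      Alt = Op;
      continue;
    }
    return std::nullopt; // a third distinct opcode: not vectorizable
  }
  return std::make_pair(Main, Alt);
}

// {Add, Sub, Add, Sub} -> {Add, Sub}; {Add, Sub, Mul} -> nullopt.
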
/// \returns true if all of the values in \p VL have the same type or false
@@ -452,16 +404,21 @@ static bool allSameType(ArrayRef<Value *> VL) {
}
/// \returns the index of the extracted element of an Extract{Value,Element}
/// instruction, or None if the index is not a known single constant.
-static bool matchExtractIndex(Instruction *E, unsigned Idx, unsigned Opcode) {
- assert(Opcode == Instruction::ExtractElement ||
- Opcode == Instruction::ExtractValue);
+static Optional<unsigned> getExtractIndex(Instruction *E) {
+ unsigned Opcode = E->getOpcode();
+ assert((Opcode == Instruction::ExtractElement ||
+ Opcode == Instruction::ExtractValue) &&
+ "Expected extractelement or extractvalue instruction.");
if (Opcode == Instruction::ExtractElement) {
- ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
- return CI && CI->getZExtValue() == Idx;
- } else {
- ExtractValueInst *EI = cast<ExtractValueInst>(E);
- return EI->getNumIndices() == 1 && *EI->idx_begin() == Idx;
+ auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
+ if (!CI)
+ return None;
+ return CI->getZExtValue();
}
+ ExtractValueInst *EI = cast<ExtractValueInst>(E);
+ if (EI->getNumIndices() != 1)
+ return None;
+ return *EI->idx_begin();
}
/// \returns True if in-tree use also needs extract. This refers to
@@ -549,7 +506,7 @@ public:
MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
}
- /// \brief Vectorize the tree that starts with the elements in \p VL.
+ /// Vectorize the tree that starts with the elements in \p VL.
/// Returns the vectorized root.
Value *vectorizeTree();
@@ -585,8 +542,8 @@ public:
ScalarToTreeEntry.clear();
MustGather.clear();
ExternalUses.clear();
- NumLoadsWantToKeepOrder = 0;
- NumLoadsWantToChangeOrder = 0;
+ NumOpsWantToKeepOrder.clear();
+ NumOpsWantToKeepOriginalOrder = 0;
for (auto &Iter : BlocksSchedules) {
BlockScheduling *BS = Iter.second.get();
BS->clear();
@@ -596,12 +553,22 @@ public:
unsigned getTreeSize() const { return VectorizableTree.size(); }
- /// \brief Perform LICM and CSE on the newly generated gather sequences.
- void optimizeGatherSequence(Function &F);
+ /// Perform LICM and CSE on the newly generated gather sequences.
+ void optimizeGatherSequence();
+
+ /// \returns The best order of instructions for vectorization.
+ Optional<ArrayRef<unsigned>> bestOrder() const {
+ auto I = std::max_element(
+ NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(),
+ [](const decltype(NumOpsWantToKeepOrder)::value_type &D1,
+ const decltype(NumOpsWantToKeepOrder)::value_type &D2) {
+ return D1.second < D2.second;
+ });
+ if (I == NumOpsWantToKeepOrder.end() ||
+ I->getSecond() <= NumOpsWantToKeepOriginalOrder)
+ return None;
- /// \returns true if it is beneficial to reverse the vector order.
- bool shouldReorder() const {
- return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
+ return makeArrayRef(I->getFirst());
}
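
A minimal model of the bestOrder() policy above, assuming a plain std::map in place of the DenseMap (hypothetical, for illustration only): the most frequent recorded order wins, but only if it strictly beats the count of bundles that prefer the original order.

#include <map>
#include <optional>
#include <vector>

using OrdersType = std::vector<unsigned>;

std::optional<OrdersType>
bestOrder(const std::map<OrdersType, unsigned> &NumOpsWantToKeepOrder,
          unsigned NumOpsWantToKeepOriginalOrder) {
  auto Best = NumOpsWantToKeepOrder.end();
  for (auto I = NumOpsWantToKeepOrder.begin();
       I != NumOpsWantToKeepOrder.end(); ++I)
    if (Best == NumOpsWantToKeepOrder.end() || I->second > Best->second)
      Best = I;
  if (Best == NumOpsWantToKeepOrder.end() ||
      Best->second <= NumOpsWantToKeepOriginalOrder)
    return std::nullopt; // the original order wins (or ties)
  return Best->first;
}

// {{1,0,3,2} -> 3 bundles} vs. 2 originals: returns {1,0,3,2};
// with 3 originals it returns nullopt and no reordering happens.
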
/// \return The vector element size in bits to use when vectorizing the
@@ -625,7 +592,7 @@ public:
return MinVecRegSize;
}
- /// \brief Check if ArrayType or StructType is isomorphic to some VectorType.
+ /// Check if ArrayType or StructType is isomorphic to some VectorType.
///
/// \returns number of elements in vector if isomorphism exists, 0 otherwise.
unsigned canMapToVector(Type *T, const DataLayout &DL) const;
@@ -648,9 +615,13 @@ private:
/// This is the recursive part of buildTree.
void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int);
- /// \returns True if the ExtractElement/ExtractValue instructions in VL can
- /// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
- bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const;
+ /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
+ /// be vectorized to use the original vector (or aggregate "bitcast" to a
+ /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
+ /// returns false, setting \p CurrentOrder to either an empty vector or a
+ /// non-identity permutation that allows reusing the extract instructions.
+ bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
+ SmallVectorImpl<unsigned> &CurrentOrder) const;
/// Vectorize a single entry in the tree.
Value *vectorizeTree(TreeEntry *E);
@@ -658,22 +629,19 @@ private:
/// Vectorize a single entry in the tree, starting in \p VL.
Value *vectorizeTree(ArrayRef<Value *> VL);
- /// \returns the pointer to the vectorized value if \p VL is already
- /// vectorized, or NULL. They may happen in cycles.
- Value *alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const;
-
/// \returns the scalarization cost for this type. Scalarization in this
/// context means the creation of vectors from a group of scalars.
- int getGatherCost(Type *Ty);
+ int getGatherCost(Type *Ty, const DenseSet<unsigned> &ShuffledIndices);
/// \returns the scalarization cost for this list of values. Assuming that
/// this subtree gets vectorized, we may need to extract the values from the
/// roots. This method calculates the cost of extracting the values.
int getGatherCost(ArrayRef<Value *> VL);
- /// \brief Set the Builder insert point to one after the last instruction in
+ /// Set the Builder insert point to one after the last instruction in
/// the bundle
- void setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue);
+ void setInsertPointAfterBundle(ArrayRef<Value *> VL,
+ const InstructionsState &S);
/// \returns a vector from a collection of scalars in \p VL.
Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
@@ -684,7 +652,8 @@ private:
/// Reorder commutative operands in alt shuffle if they result in
/// vectorized code.
- void reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL,
+ void reorderAltShuffleOperands(const InstructionsState &S,
+ ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right);
@@ -698,8 +667,12 @@ private:
/// \returns true if the scalars in VL are equal to this entry.
bool isSame(ArrayRef<Value *> VL) const {
- assert(VL.size() == Scalars.size() && "Invalid size");
- return std::equal(VL.begin(), VL.end(), Scalars.begin());
+ if (VL.size() == Scalars.size())
+ return std::equal(VL.begin(), VL.end(), Scalars.begin());
+ return VL.size() == ReuseShuffleIndices.size() &&
+ std::equal(
+ VL.begin(), VL.end(), ReuseShuffleIndices.begin(),
+ [this](Value *V, unsigned Idx) { return V == Scalars[Idx]; });
}
/// A vector of scalars.
@@ -711,6 +684,12 @@ private:
/// Do we need to gather this sequence?
bool NeedToGather = false;
+ /// Does this sequence require some shuffling?
+ SmallVector<unsigned, 4> ReuseShuffleIndices;
+
+ /// Does this entry require reordering?
+ ArrayRef<unsigned> ReorderIndices;
+
/// Points back to the VectorizableTree.
///
/// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
@@ -725,13 +704,17 @@ private:
};
/// Create a new VectorizableTree entry.
- TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
- int &UserTreeIdx) {
+ void newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, int &UserTreeIdx,
+ ArrayRef<unsigned> ReuseShuffleIndices = None,
+ ArrayRef<unsigned> ReorderIndices = None) {
VectorizableTree.emplace_back(VectorizableTree);
int idx = VectorizableTree.size() - 1;
TreeEntry *Last = &VectorizableTree[idx];
Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
Last->NeedToGather = !Vectorized;
+ Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
+ ReuseShuffleIndices.end());
+ Last->ReorderIndices = ReorderIndices;
if (Vectorized) {
for (int i = 0, e = VL.size(); i != e; ++i) {
assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
@@ -744,7 +727,6 @@ private:
if (UserTreeIdx >= 0)
Last->UserTreeIndices.push_back(UserTreeIdx);
UserTreeIdx = idx;
- return Last;
}
/// -- Vectorization State --
@@ -758,13 +740,6 @@ private:
return nullptr;
}
- const TreeEntry *getTreeEntry(Value *V) const {
- auto I = ScalarToTreeEntry.find(V);
- if (I != ScalarToTreeEntry.end())
- return &VectorizableTree[I->second];
- return nullptr;
- }
-
/// Maps a specific scalar to its tree entry.
SmallDenseMap<Value*, int> ScalarToTreeEntry;
@@ -1038,7 +1013,7 @@ private:
template <typename ReadyListType>
void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
SD->IsScheduled = true;
- DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
ScheduleData *BundleMember = SD;
while (BundleMember) {
@@ -1061,8 +1036,8 @@ private:
assert(!DepBundle->IsScheduled &&
"already scheduled bundle gets ready");
ReadyList.insert(DepBundle);
- DEBUG(dbgs()
- << "SLP: gets ready (def): " << *DepBundle << "\n");
+ LLVM_DEBUG(dbgs()
+ << "SLP: gets ready (def): " << *DepBundle << "\n");
}
});
}
@@ -1075,8 +1050,8 @@ private:
assert(!DepBundle->IsScheduled &&
"already scheduled bundle gets ready");
ReadyList.insert(DepBundle);
- DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle
- << "\n");
+ LLVM_DEBUG(dbgs()
+ << "SLP: gets ready (mem): " << *DepBundle << "\n");
}
}
BundleMember = BundleMember->NextInBundle;
@@ -1101,7 +1076,8 @@ private:
doForAllOpcodes(I, [&](ScheduleData *SD) {
if (SD->isSchedulingEntity() && SD->isReady()) {
ReadyList.insert(SD);
- DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n");
+ LLVM_DEBUG(dbgs()
+ << "SLP: initially in ready list: " << *I << "\n");
}
});
}
@@ -1110,7 +1086,8 @@ private:
/// Checks if a bundle of instructions can be scheduled, i.e. has no
/// cyclic dependencies. This is only a dry-run, no instructions are
/// actually moved at this stage.
- bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, Value *OpValue);
+ bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
+ const InstructionsState &S);
/// Un-bundles a group of instructions.
void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
@@ -1120,7 +1097,7 @@ private:
/// Extends the scheduling region so that V is inside the region.
/// \returns true if the region size is within the limit.
- bool extendSchedulingRegion(Value *V, Value *OpValue);
+ bool extendSchedulingRegion(Value *V, const InstructionsState &S);
/// Initialize the ScheduleData structures for new instructions in the
/// scheduling region.
@@ -1201,11 +1178,38 @@ private:
/// List of users to ignore during scheduling and that don't need extracting.
ArrayRef<Value *> UserIgnoreList;
- // Number of load bundles that contain consecutive loads.
- int NumLoadsWantToKeepOrder = 0;
+ using OrdersType = SmallVector<unsigned, 4>;
+ /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
+ /// sorted SmallVectors of unsigned.
+ struct OrdersTypeDenseMapInfo {
+ static OrdersType getEmptyKey() {
+ OrdersType V;
+ V.push_back(~1U);
+ return V;
+ }
+
+ static OrdersType getTombstoneKey() {
+ OrdersType V;
+ V.push_back(~2U);
+ return V;
+ }
+
+ static unsigned getHashValue(const OrdersType &V) {
+ return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
+ }
+
+ static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
+ return LHS == RHS;
+ }
+ };
- // Number of load bundles that contain consecutive loads in reversed order.
- int NumLoadsWantToChangeOrder = 0;
+ /// Contains orders of operations along with the number of bundles that have
+ /// operations in this order. It stores only those orders that require
+ /// reordering; if reordering is not required, it is counted using \a
+ /// NumOpsWantToKeepOriginalOrder.
+ DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> NumOpsWantToKeepOrder;
+ /// Number of bundles that do not require reordering.
+ unsigned NumOpsWantToKeepOriginalOrder = 0;
// Analysis and block reference.
Function *F;
@@ -1242,7 +1246,7 @@ template <> struct GraphTraits<BoUpSLP *> {
/// NodeRef has to be a pointer per the GraphWriter.
using NodeRef = TreeEntry *;
- /// \brief Add the VectorizableTree to the index iterator to be able to return
+ /// Add the VectorizableTree to the index iterator to be able to return
/// TreeEntry pointers.
struct ChildIteratorType
: public iterator_adaptor_base<ChildIteratorType,
@@ -1340,17 +1344,22 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
// For each lane:
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
Value *Scalar = Entry->Scalars[Lane];
+ int FoundLane = Lane;
+ if (!Entry->ReuseShuffleIndices.empty()) {
+ FoundLane =
+ std::distance(Entry->ReuseShuffleIndices.begin(),
+ llvm::find(Entry->ReuseShuffleIndices, FoundLane));
+ }
// Check if the scalar is externally used as an extra arg.
auto ExtI = ExternallyUsedValues.find(Scalar);
if (ExtI != ExternallyUsedValues.end()) {
- DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " <<
- Lane << " from " << *Scalar << ".\n");
- ExternalUses.emplace_back(Scalar, nullptr, Lane);
- continue;
+ LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
+ << Lane << " from " << *Scalar << ".\n");
+ ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
}
for (User *U : Scalar->users()) {
- DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
Instruction *UserInst = dyn_cast<Instruction>(U);
if (!UserInst)
@@ -1364,8 +1373,8 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
// be used.
if (UseScalar != U ||
!InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
- DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
- << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
+ << ".\n");
assert(!UseEntry->NeedToGather && "Bad state");
continue;
}
@@ -1375,9 +1384,9 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
if (is_contained(UserIgnoreList, UserInst))
continue;
- DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
- Lane << " from " << *Scalar << ".\n");
- ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
+ LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
+ << Lane << " from " << *Scalar << ".\n");
+ ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
}
}
}
@@ -1389,28 +1398,28 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
InstructionsState S = getSameOpcode(VL);
if (Depth == RecursionMaxDepth) {
- DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
// Don't handle vectors.
if (S.OpValue->getType()->isVectorTy()) {
- DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
if (SI->getValueOperand()->getType()->isVectorTy()) {
- DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
// If all of the operands are identical or constant we have a simple solution.
- if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.Opcode) {
- DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
+ if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
@@ -1421,8 +1430,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Don't vectorize ephemeral values.
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
if (EphValues.count(VL[i])) {
- DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
- ") is ephemeral.\n");
+ LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i]
+ << ") is ephemeral.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
@@ -1430,18 +1439,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Check if this is a duplicate of another entry.
if (TreeEntry *E = getTreeEntry(S.OpValue)) {
- for (unsigned i = 0, e = VL.size(); i != e; ++i) {
- DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
- if (E->Scalars[i] != VL[i]) {
- DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
- newTreeEntry(VL, false, UserTreeIdx);
- return;
- }
+ LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
+ if (!E->isSame(VL)) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
+ newTreeEntry(VL, false, UserTreeIdx);
+ return;
}
// Record the reuse of the tree node. FIXME, currently this is only used to
// properly draw the graph rather than for the actual vectorization.
E->UserTreeIndices.push_back(UserTreeIdx);
- DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
+ << ".\n");
return;
}
@@ -1451,8 +1459,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (!I)
continue;
if (getTreeEntry(I)) {
- DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
- ") is already in tree.\n");
+ LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i]
+ << ") is already in tree.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
@@ -1462,7 +1470,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// we need to gather the scalars.
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
if (MustGather.count(VL[i])) {
- DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
@@ -1476,19 +1484,32 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (!DT->isReachableFromEntry(BB)) {
// Don't go into unreachable blocks. They may contain instructions with
// dependency cycles which confuse the final scheduling.
- DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
+ LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
// Check that every instruction appears once in this bundle; duplicated
// scalars are vectorized once and then reused via a shuffle.
- for (unsigned i = 0, e = VL.size(); i < e; ++i)
- for (unsigned j = i + 1; j < e; ++j)
- if (VL[i] == VL[j]) {
- DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
- newTreeEntry(VL, false, UserTreeIdx);
- return;
- }
+ SmallVector<unsigned, 4> ReuseShuffleIndicies;
+ SmallVector<Value *, 4> UniqueValues;
+ DenseMap<Value *, unsigned> UniquePositions;
+ for (Value *V : VL) {
+ auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
+ ReuseShuffleIndicies.emplace_back(Res.first->second);
+ if (Res.second)
+ UniqueValues.emplace_back(V);
+ }
+ if (UniqueValues.size() == VL.size()) {
+ ReuseShuffleIndicies.clear();
+ } else {
+ LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
+ if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) {
+ LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
+ newTreeEntry(VL, false, UserTreeIdx);
+ return;
+ }
+ VL = UniqueValues;
+ }
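
The de-duplication step above can be pictured with a small standalone sketch (hypothetical helper, assuming hashable values; the power-of-two gate from the hunk is noted but not enforced here): each scalar records the position of its first occurrence, so duplicates become reuse-shuffle indices into the unique bundle.

#include <unordered_map>
#include <vector>

template <typename T>
std::vector<unsigned> reuseIndices(const std::vector<T> &VL,
                                   std::vector<T> &Unique) {
  std::vector<unsigned> Idx;
  std::unordered_map<T, unsigned> Pos;
  for (const T &V : VL) {
    auto R = Pos.emplace(V, Unique.size()); // position of first occurrence
    Idx.push_back(R.first->second);
    if (R.second)
      Unique.push_back(V);
  }
  if (Unique.size() == VL.size())
    Idx.clear(); // no duplicates: no reuse shuffle needed
  return Idx;
}

// VL = {a, b, a, b} -> Unique = {a, b}, Idx = {0, 1, 0, 1}; the real code
// additionally bails out unless Unique.size() is a power of two above one.
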
auto &BSRef = BlocksSchedules[BB];
if (!BSRef)
@@ -1496,18 +1517,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
BlockScheduling &BS = *BSRef.get();
- if (!BS.tryScheduleBundle(VL, this, S.OpValue)) {
- DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
+ if (!BS.tryScheduleBundle(VL, this, S)) {
+ LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
assert((!BS.getScheduleData(VL0) ||
!BS.getScheduleData(VL0)->isPartOfBundle()) &&
"tryScheduleBundle should cancelScheduling on failure");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
- DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
+ LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
- unsigned ShuffleOrOp = S.IsAltShuffle ?
- (unsigned) Instruction::ShuffleVector : S.Opcode;
+ unsigned ShuffleOrOp = S.isAltShuffle() ?
+ (unsigned) Instruction::ShuffleVector : S.getOpcode();
switch (ShuffleOrOp) {
case Instruction::PHI: {
PHINode *PH = dyn_cast<PHINode>(VL0);
@@ -1518,15 +1539,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
TerminatorInst *Term = dyn_cast<TerminatorInst>(
cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
if (Term) {
- DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx);
- DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
+ newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
ValueList Operands;
@@ -1541,13 +1564,35 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
case Instruction::ExtractValue:
case Instruction::ExtractElement: {
- bool Reuse = canReuseExtract(VL, VL0);
+ OrdersType CurrentOrder;
+ bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
if (Reuse) {
- DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
- } else {
- BS.cancelScheduling(VL, VL0);
+ LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
+ ++NumOpsWantToKeepOriginalOrder;
+ newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
+ ReuseShuffleIndicies);
+ return;
}
- newTreeEntry(VL, Reuse, UserTreeIdx);
+ if (!CurrentOrder.empty()) {
+ LLVM_DEBUG({
+ dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
+ "with order";
+ for (unsigned Idx : CurrentOrder)
+ dbgs() << " " << Idx;
+ dbgs() << "\n";
+ });
+ // Insert new order with initial value 0, if it does not exist,
+ // otherwise return the iterator to the existing one.
+ auto StoredCurrentOrderAndNum =
+ NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
+ ++StoredCurrentOrderAndNum->getSecond();
+ newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies,
+ StoredCurrentOrderAndNum->getFirst());
+ return;
+ }
+ LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
+ newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies);
+ BS.cancelScheduling(VL, VL0);
return;
}
case Instruction::Load: {
@@ -1562,62 +1607,67 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (DL->getTypeSizeInBits(ScalarTy) !=
DL->getTypeAllocSizeInBits(ScalarTy)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
return;
}
// Make sure all loads in the bundle are simple - we can't vectorize
// atomic or volatile loads.
- for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
- LoadInst *L = cast<LoadInst>(VL[i]);
+ SmallVector<Value *, 4> PointerOps(VL.size());
+ auto POIter = PointerOps.begin();
+ for (Value *V : VL) {
+ auto *L = cast<LoadInst>(V);
if (!L->isSimple()) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
return;
}
+ *POIter = L->getPointerOperand();
+ ++POIter;
}
- // Check if the loads are consecutive, reversed, or neither.
- // TODO: What we really want is to sort the loads, but for now, check
- // the two likely directions.
- bool Consecutive = true;
- bool ReverseConsecutive = true;
- for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
- if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
- Consecutive = false;
- break;
+ OrdersType CurrentOrder;
+ // Check the order of pointer operands.
+ if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
+ Value *Ptr0;
+ Value *PtrN;
+ if (CurrentOrder.empty()) {
+ Ptr0 = PointerOps.front();
+ PtrN = PointerOps.back();
} else {
- ReverseConsecutive = false;
+ Ptr0 = PointerOps[CurrentOrder.front()];
+ PtrN = PointerOps[CurrentOrder.back()];
}
- }
-
- if (Consecutive) {
- ++NumLoadsWantToKeepOrder;
- newTreeEntry(VL, true, UserTreeIdx);
- DEBUG(dbgs() << "SLP: added a vector of loads.\n");
- return;
- }
-
- // If none of the load pairs were consecutive when checked in order,
- // check the reverse order.
- if (ReverseConsecutive)
- for (unsigned i = VL.size() - 1; i > 0; --i)
- if (!isConsecutiveAccess(VL[i], VL[i - 1], *DL, *SE)) {
- ReverseConsecutive = false;
- break;
+ const SCEV *Scev0 = SE->getSCEV(Ptr0);
+ const SCEV *ScevN = SE->getSCEV(PtrN);
+ const auto *Diff =
+ dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
+ uint64_t Size = DL->getTypeAllocSize(ScalarTy);
+ // Check that the sorted loads are consecutive.
+ if (Diff && Diff->getAPInt().getZExtValue() == (VL.size() - 1) * Size) {
+ if (CurrentOrder.empty()) {
+ // Original loads are consecutive and do not require reordering.
+ ++NumOpsWantToKeepOriginalOrder;
+ newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
+ } else {
+ // Need to reorder.
+ auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
+ ++I->getSecond();
+ newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
+ ReuseShuffleIndicies, I->getFirst());
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
}
+ return;
+ }
+ }
+ LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
-
- if (ReverseConsecutive) {
- ++NumLoadsWantToChangeOrder;
- DEBUG(dbgs() << "SLP: Gathering reversed loads.\n");
- } else {
- DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
- }
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
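
A worked model of the consecutiveness test the Load case now performs on sorted pointers (plain integers standing in for SCEV expressions; hypothetical helper, not the LLVM API, and assuming the addresses are already distinct after de-duplication): after sorting, the span between the first and last pointer must be exactly (N - 1) * element size.

#include <algorithm>
#include <cstdint>
#include <vector>

bool sortedConsecutive(std::vector<uint64_t> Addrs, uint64_t EltSize) {
  std::sort(Addrs.begin(), Addrs.end()); // analogue of sortPtrAccesses
  return Addrs.back() - Addrs.front() == (Addrs.size() - 1) * EltSize;
}

// Loads at {p+8, p, p+24, p+16} with 8-byte elements: the sorted span is
// 24 == 3 * 8, so the bundle becomes a vector load plus a reorder shuffle.
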
case Instruction::ZExt:
@@ -1637,13 +1687,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
if (Ty != SrcTy || !isValidElementType(Ty)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs()
+ << "SLP: Gathering casts with different src types.\n");
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx);
- DEBUG(dbgs() << "SLP: added a vector of casts.\n");
+ newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
@@ -1665,14 +1716,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (Cmp->getPredicate() != P0 ||
Cmp->getOperand(0)->getType() != ComparedTy) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs()
+ << "SLP: Gathering cmp with different predicate.\n");
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx);
- DEBUG(dbgs() << "SLP: added a vector of compares.\n");
+ newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
@@ -1703,14 +1755,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
- newTreeEntry(VL, true, UserTreeIdx);
- DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
+ newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
// Sort operands of the instructions so that each side is more likely to
// have the same opcode.
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
ValueList Left, Right;
- reorderInputsAccordingToOpcode(S.Opcode, VL, Left, Right);
+ reorderInputsAccordingToOpcode(S.getOpcode(), VL, Left, Right);
buildTree_rec(Left, Depth + 1, UserTreeIdx);
buildTree_rec(Right, Depth + 1, UserTreeIdx);
return;
@@ -1730,9 +1782,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// We don't combine GEPs with complicated (nested) indexing.
for (unsigned j = 0; j < VL.size(); ++j) {
if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
- DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
+ LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
}
@@ -1743,9 +1795,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (unsigned j = 0; j < VL.size(); ++j) {
Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
if (Ty0 != CurTy) {
- DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
+ LLVM_DEBUG(dbgs()
+ << "SLP: not-vectorizable GEP (different types).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
}
@@ -1754,16 +1807,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (unsigned j = 0; j < VL.size(); ++j) {
auto Op = cast<Instruction>(VL[j])->getOperand(1);
if (!isa<ConstantInt>(Op)) {
- DEBUG(
- dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
+ LLVM_DEBUG(dbgs()
+ << "SLP: not-vectorizable GEP (non-constant indexes).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx);
- DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
+ newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
for (unsigned i = 0, e = 2; i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
@@ -1779,13 +1832,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
return;
}
- newTreeEntry(VL, true, UserTreeIdx);
- DEBUG(dbgs() << "SLP: added a vector of stores.\n");
+ newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
ValueList Operands;
for (Value *j : VL)
@@ -1802,8 +1855,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
if (!isTriviallyVectorizable(ID)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
return;
}
Function *Int = CI->getCalledFunction();
@@ -1816,9 +1869,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
!CI->hasIdenticalOperandBundleSchema(*CI2)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
- << "\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
+ << "\n");
return;
}
// ctlz,cttz and powi are special intrinsics whose second argument
@@ -1827,10 +1880,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Value *A1J = CI2->getArgOperand(1);
if (A1I != A1J) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
- << " argument "<< A1I<<"!=" << A1J
- << "\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
+ << " argument " << A1I << "!=" << A1J << "\n");
return;
}
}
@@ -1840,14 +1892,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
CI->op_begin() + CI->getBundleOperandsEndIndex(),
CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="
- << *VL[i] << '\n');
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:"
+ << *CI << "!=" << *VL[i] << '\n');
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx);
+ newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
ValueList Operands;
// Prepare the operand vector.
@@ -1862,19 +1914,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::ShuffleVector:
// If this is not an alternate sequence of opcodes like add-sub
// then do not vectorize this instruction.
- if (!S.IsAltShuffle) {
+ if (!S.isAltShuffle()) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
return;
}
- newTreeEntry(VL, true, UserTreeIdx);
- DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
+ newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
// Reorder operands if reordering would enable vectorization.
if (isa<BinaryOperator>(VL0)) {
ValueList Left, Right;
- reorderAltShuffleOperands(S.Opcode, VL, Left, Right);
+ reorderAltShuffleOperands(S, VL, Left, Right);
buildTree_rec(Left, Depth + 1, UserTreeIdx);
buildTree_rec(Right, Depth + 1, UserTreeIdx);
return;
@@ -1892,8 +1944,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
default:
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
return;
}
}
@@ -1923,15 +1975,18 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
return N;
}
-bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const {
+bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
+ SmallVectorImpl<unsigned> &CurrentOrder) const {
Instruction *E0 = cast<Instruction>(OpValue);
assert(E0->getOpcode() == Instruction::ExtractElement ||
E0->getOpcode() == Instruction::ExtractValue);
- assert(E0->getOpcode() == getSameOpcode(VL).Opcode && "Invalid opcode");
+ assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode");
// Check if all of the extracts come from the same vector and from the
// correct offset.
Value *Vec = E0->getOperand(0);
+ CurrentOrder.clear();
+
// We have to extract from a vector/aggregate with the same number of elements.
unsigned NElts;
if (E0->getOpcode() == Instruction::ExtractValue) {
@@ -1951,15 +2006,40 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const {
return false;
// Check that all of the indices extract from the correct offset.
- for (unsigned I = 0, E = VL.size(); I < E; ++I) {
- Instruction *Inst = cast<Instruction>(VL[I]);
- if (!matchExtractIndex(Inst, I, Inst->getOpcode()))
- return false;
+ bool ShouldKeepOrder = true;
+ unsigned E = VL.size();
+ // Assign to all items the initial value E + 1 so we can check if the extract
+ // instruction index was used already.
+ // Also, later we can check that all the indices are used and we have a
+ // consecutive access in the extract instructions, by checking that no
+ // element of CurrentOrder still has value E + 1.
+ CurrentOrder.assign(E, E + 1);
+ unsigned I = 0;
+ for (; I < E; ++I) {
+ auto *Inst = cast<Instruction>(VL[I]);
if (Inst->getOperand(0) != Vec)
- return false;
+ break;
+ Optional<unsigned> Idx = getExtractIndex(Inst);
+ if (!Idx)
+ break;
+ const unsigned ExtIdx = *Idx;
+ if (ExtIdx != I) {
+ if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1)
+ break;
+ ShouldKeepOrder = false;
+ CurrentOrder[ExtIdx] = I;
+ } else {
+ if (CurrentOrder[I] != E + 1)
+ break;
+ CurrentOrder[I] = I;
+ }
+ }
+ if (I < E) {
+ CurrentOrder.clear();
+ return false;
}
- return true;
+ return ShouldKeepOrder;
}
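
The order computation above admits a compact standalone sketch (hypothetical driver over raw extract indices, not the LLVM API): every index must land in a previously unused slot below E, with E + 1 serving as the "unused" sentinel; a duplicate or out-of-range index aborts the whole attempt.

#include <vector>

// Returns true if the extracts are already in identity order; fills Order
// with the permutation otherwise; cleared Order plus false means "give up".
bool computeOrder(const std::vector<unsigned> &ExtIdx,
                  std::vector<unsigned> &Order) {
  const unsigned E = ExtIdx.size();
  bool KeepOrder = true;
  Order.assign(E, E + 1); // E + 1 marks a slot as not used yet
  for (unsigned I = 0; I < E; ++I) {
    unsigned Idx = ExtIdx[I];
    if (Idx >= E || Order[Idx] != E + 1) { // out of range or slot reused
      Order.clear();
      return false;
    }
    if (Idx != I)
      KeepOrder = false;
    Order[Idx] = I;
  }
  return KeepOrder;
}

// ExtIdx {0,1,2,3} -> true (identity); {1,0,3,2} -> false with
// Order = {1,0,3,2}; {0,0,1,2} -> false with Order cleared.
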
bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
@@ -1985,13 +2065,22 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
VecTy = VectorType::get(
IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
+ unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
+ bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
+ int ReuseShuffleCost = 0;
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost =
+ TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+ }
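
As a hedged numeric illustration of how ReuseShuffleCost is used throughout getEntryCost: if 8 requested scalars de-duplicate to 4 unique values (ReuseShuffleNumbers = 8, VL.size() = 4) and a scalar operation costs 1, each per-opcode case below refunds (8 - 4) * 1 = 4 scalar operations that are never emitted, while this block charges one SK_PermuteSingleSrc shuffle to rebuild the 8-lane result from the 4-lane vector.
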
if (E->NeedToGather) {
if (allConstant(VL))
return 0;
if (isSplat(VL)) {
- return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
+ return ReuseShuffleCost +
+ TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
}
- if (getSameOpcode(VL).Opcode == Instruction::ExtractElement) {
+ if (getSameOpcode(VL).getOpcode() == Instruction::ExtractElement &&
+ allSameType(VL) && allSameBlock(VL)) {
Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL);
if (ShuffleKind.hasValue()) {
int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy);
@@ -2008,37 +2097,86 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
IO->getZExtValue());
}
}
- return Cost;
+ return ReuseShuffleCost + Cost;
}
}
- return getGatherCost(E->Scalars);
+ return ReuseShuffleCost + getGatherCost(VL);
}
InstructionsState S = getSameOpcode(VL);
- assert(S.Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
+ assert(S.getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
Instruction *VL0 = cast<Instruction>(S.OpValue);
- unsigned ShuffleOrOp = S.IsAltShuffle ?
- (unsigned) Instruction::ShuffleVector : S.Opcode;
+ unsigned ShuffleOrOp = S.isAltShuffle() ?
+ (unsigned) Instruction::ShuffleVector : S.getOpcode();
switch (ShuffleOrOp) {
case Instruction::PHI:
return 0;
case Instruction::ExtractValue:
case Instruction::ExtractElement:
- if (canReuseExtract(VL, S.OpValue)) {
- int DeadCost = 0;
+ if (NeedToShuffleReuses) {
+ unsigned Idx = 0;
+ for (unsigned I : E->ReuseShuffleIndices) {
+ if (ShuffleOrOp == Instruction::ExtractElement) {
+ auto *IO = cast<ConstantInt>(
+ cast<ExtractElementInst>(VL[I])->getIndexOperand());
+ Idx = IO->getZExtValue();
+ ReuseShuffleCost -= TTI->getVectorInstrCost(
+ Instruction::ExtractElement, VecTy, Idx);
+ } else {
+ ReuseShuffleCost -= TTI->getVectorInstrCost(
+ Instruction::ExtractElement, VecTy, Idx);
+ ++Idx;
+ }
+ }
+ Idx = ReuseShuffleNumbers;
+ for (Value *V : VL) {
+ if (ShuffleOrOp == Instruction::ExtractElement) {
+ auto *IO = cast<ConstantInt>(
+ cast<ExtractElementInst>(V)->getIndexOperand());
+ Idx = IO->getZExtValue();
+ } else {
+ --Idx;
+ }
+ ReuseShuffleCost +=
+ TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
+ }
+ }
+ if (!E->NeedToGather) {
+ int DeadCost = ReuseShuffleCost;
+ if (!E->ReorderIndices.empty()) {
+ // TODO: Merge this shuffle with the ReuseShuffleCost.
+ DeadCost += TTI->getShuffleCost(
+ TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+ }
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
Instruction *E = cast<Instruction>(VL[i]);
// If all users are going to be vectorized, the instruction can be
// considered dead.
// Likewise, if it has only one user, it will be vectorized for sure.
- if (areAllUsersVectorized(E))
+ if (areAllUsersVectorized(E)) {
// Take credit for instruction that will become dead.
- DeadCost +=
+ if (E->hasOneUse()) {
+ Instruction *Ext = E->user_back();
+ if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
+ all_of(Ext->users(),
+ [](User *U) { return isa<GetElementPtrInst>(U); })) {
+ // Use getExtractWithExtendCost() to calculate the cost of
+ // extractelement/ext pair.
+ DeadCost -= TTI->getExtractWithExtendCost(
+ Ext->getOpcode(), Ext->getType(), VecTy, i);
+ // Add back the cost of s|zext which is subtracted separately.
+ DeadCost += TTI->getCastInstrCost(
+ Ext->getOpcode(), Ext->getType(), E->getType(), Ext);
+ continue;
+ }
+ }
+ DeadCost -=
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
+ }
}
- return -DeadCost;
+ return DeadCost;
}
- return getGatherCost(VecTy);
+ return ReuseShuffleCost + getGatherCost(VL);
case Instruction::ZExt:
case Instruction::SExt:
@@ -2053,24 +2191,37 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
case Instruction::FPTrunc:
case Instruction::BitCast: {
Type *SrcTy = VL0->getOperand(0)->getType();
+ int ScalarEltCost =
+ TTI->getCastInstrCost(S.getOpcode(), ScalarTy, SrcTy, VL0);
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
// Calculate the cost of this instruction.
- int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
- VL0->getType(), SrcTy, VL0);
+ int ScalarCost = VL.size() * ScalarEltCost;
VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
- int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0);
+ int VecCost = 0;
+ // Check if the values are candidates to demote.
+ if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
+ VecCost = ReuseShuffleCost +
+ TTI->getCastInstrCost(S.getOpcode(), VecTy, SrcVecTy, VL0);
+ }
return VecCost - ScalarCost;
}
case Instruction::FCmp:
case Instruction::ICmp:
case Instruction::Select: {
// Calculate the cost of this instruction.
+ int ScalarEltCost = TTI->getCmpSelInstrCost(S.getOpcode(), ScalarTy,
+ Builder.getInt1Ty(), VL0);
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
- int ScalarCost = VecTy->getNumElements() *
- TTI->getCmpSelInstrCost(S.Opcode, ScalarTy, Builder.getInt1Ty(), VL0);
- int VecCost = TTI->getCmpSelInstrCost(S.Opcode, VecTy, MaskTy, VL0);
- return VecCost - ScalarCost;
+ int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
+ int VecCost = TTI->getCmpSelInstrCost(S.getOpcode(), VecTy, MaskTy, VL0);
+ return ReuseShuffleCost + VecCost - ScalarCost;
}
case Instruction::Add:
case Instruction::FAdd:
@@ -2099,42 +2250,43 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
TargetTransformInfo::OperandValueProperties Op1VP =
TargetTransformInfo::OP_None;
TargetTransformInfo::OperandValueProperties Op2VP =
- TargetTransformInfo::OP_None;
+ TargetTransformInfo::OP_PowerOf2;
// If all operands are exactly the same ConstantInt then set the
// operand kind to OK_UniformConstantValue.
// If instead not all operands are constants, then set the operand kind
// to OK_AnyValue. If all operands are constants but not the same,
// then set the operand kind to OK_NonUniformConstantValue.
- ConstantInt *CInt = nullptr;
- for (unsigned i = 0; i < VL.size(); ++i) {
+ ConstantInt *CInt0 = nullptr;
+ for (unsigned i = 0, e = VL.size(); i < e; ++i) {
const Instruction *I = cast<Instruction>(VL[i]);
- if (!isa<ConstantInt>(I->getOperand(1))) {
+ ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(1));
+ if (!CInt) {
Op2VK = TargetTransformInfo::OK_AnyValue;
+ Op2VP = TargetTransformInfo::OP_None;
break;
}
+ if (Op2VP == TargetTransformInfo::OP_PowerOf2 &&
+ !CInt->getValue().isPowerOf2())
+ Op2VP = TargetTransformInfo::OP_None;
if (i == 0) {
- CInt = cast<ConstantInt>(I->getOperand(1));
+ CInt0 = CInt;
continue;
}
- if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
- CInt != cast<ConstantInt>(I->getOperand(1)))
+ if (CInt0 != CInt)
Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
}
- // FIXME: Currently cost of model modification for division by power of
- // 2 is handled for X86 and AArch64. Add support for other targets.
- if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
- CInt->getValue().isPowerOf2())
- Op2VP = TargetTransformInfo::OP_PowerOf2;
SmallVector<const Value *, 4> Operands(VL0->operand_values());
- int ScalarCost =
- VecTy->getNumElements() *
- TTI->getArithmeticInstrCost(S.Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
- Op2VP, Operands);
- int VecCost = TTI->getArithmeticInstrCost(S.Opcode, VecTy, Op1VK, Op2VK,
- Op1VP, Op2VP, Operands);
- return VecCost - ScalarCost;
+ int ScalarEltCost = TTI->getArithmeticInstrCost(
+ S.getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands);
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
+ int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
+ int VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy, Op1VK,
+ Op2VK, Op1VP, Op2VP, Operands);
+ return ReuseShuffleCost + VecCost - ScalarCost;
}
case Instruction::GetElementPtr: {
TargetTransformInfo::OperandValueKind Op1VK =
@@ -2142,83 +2294,119 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
TargetTransformInfo::OperandValueKind Op2VK =
TargetTransformInfo::OK_UniformConstantValue;
- int ScalarCost =
- VecTy->getNumElements() *
+ int ScalarEltCost =
TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
+ int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
int VecCost =
TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
-
- return VecCost - ScalarCost;
+ return ReuseShuffleCost + VecCost - ScalarCost;
}
case Instruction::Load: {
// Cost of wide load - cost of scalar loads.
- unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment();
- int ScalarLdCost = VecTy->getNumElements() *
+ unsigned alignment = cast<LoadInst>(VL0)->getAlignment();
+ int ScalarEltCost =
TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
- int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
- VecTy, alignment, 0, VL0);
- return VecLdCost - ScalarLdCost;
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
+ int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
+ int VecLdCost =
+ TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, VL0);
+ if (!E->ReorderIndices.empty()) {
+ // TODO: Merge this shuffle with the ReuseShuffleCost.
+ VecLdCost += TTI->getShuffleCost(
+ TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+ }
+ return ReuseShuffleCost + VecLdCost - ScalarLdCost;
}
case Instruction::Store: {
// We know that we can merge the stores. Calculate the cost.
- unsigned alignment = dyn_cast<StoreInst>(VL0)->getAlignment();
- int ScalarStCost = VecTy->getNumElements() *
+ unsigned alignment = cast<StoreInst>(VL0)->getAlignment();
+ int ScalarEltCost =
TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0);
- int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
- VecTy, alignment, 0, VL0);
- return VecStCost - ScalarStCost;
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
+ int ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
+ int VecStCost =
+ TTI->getMemoryOpCost(Instruction::Store, VecTy, alignment, 0, VL0);
+ return ReuseShuffleCost + VecStCost - ScalarStCost;
}
case Instruction::Call: {
CallInst *CI = cast<CallInst>(VL0);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// Calculate the cost of the scalar and vector calls.
- SmallVector<Type*, 4> ScalarTys;
- for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op)
+ SmallVector<Type *, 4> ScalarTys;
+ for (unsigned op = 0, opc = CI->getNumArgOperands(); op != opc; ++op)
ScalarTys.push_back(CI->getArgOperand(op)->getType());
FastMathFlags FMF;
if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
FMF = FPMO->getFastMathFlags();
- int ScalarCallCost = VecTy->getNumElements() *
+ int ScalarEltCost =
TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
+ int ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
SmallVector<Value *, 4> Args(CI->arg_operands());
int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF,
VecTy->getNumElements());
- DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
- << " (" << VecCallCost << "-" << ScalarCallCost << ")"
- << " for " << *CI << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
+ << " (" << VecCallCost << "-" << ScalarCallCost << ")"
+ << " for " << *CI << "\n");
- return VecCallCost - ScalarCallCost;
+ return ReuseShuffleCost + VecCallCost - ScalarCallCost;
}
case Instruction::ShuffleVector: {
- TargetTransformInfo::OperandValueKind Op1VK =
- TargetTransformInfo::OK_AnyValue;
- TargetTransformInfo::OperandValueKind Op2VK =
- TargetTransformInfo::OK_AnyValue;
+ assert(S.isAltShuffle() &&
+ ((Instruction::isBinaryOp(S.getOpcode()) &&
+ Instruction::isBinaryOp(S.getAltOpcode())) ||
+ (Instruction::isCast(S.getOpcode()) &&
+ Instruction::isCast(S.getAltOpcode()))) &&
+ "Invalid Shuffle Vector Operand");
int ScalarCost = 0;
- int VecCost = 0;
+ if (NeedToShuffleReuses) {
+ for (unsigned Idx : E->ReuseShuffleIndices) {
+ Instruction *I = cast<Instruction>(VL[Idx]);
+ ReuseShuffleCost -= TTI->getInstructionCost(
+ I, TargetTransformInfo::TCK_RecipThroughput);
+ }
+ for (Value *V : VL) {
+ Instruction *I = cast<Instruction>(V);
+ ReuseShuffleCost += TTI->getInstructionCost(
+ I, TargetTransformInfo::TCK_RecipThroughput);
+ }
+ }
for (Value *i : VL) {
Instruction *I = cast<Instruction>(i);
- if (!I)
- break;
- ScalarCost +=
- TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
+ assert(S.isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
+ ScalarCost += TTI->getInstructionCost(
+ I, TargetTransformInfo::TCK_RecipThroughput);
}
// VecCost is equal to sum of the cost of creating 2 vectors
// and the cost of creating shuffle.
- Instruction *I0 = cast<Instruction>(VL[0]);
- VecCost =
- TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
- Instruction *I1 = cast<Instruction>(VL[1]);
- VecCost +=
- TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
- VecCost +=
- TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
- return VecCost - ScalarCost;
+ int VecCost = 0;
+ if (Instruction::isBinaryOp(S.getOpcode())) {
+ VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy);
+ VecCost += TTI->getArithmeticInstrCost(S.getAltOpcode(), VecTy);
+ } else {
+ Type *Src0SclTy = S.MainOp->getOperand(0)->getType();
+ Type *Src1SclTy = S.AltOp->getOperand(0)->getType();
+ VectorType *Src0Ty = VectorType::get(Src0SclTy, VL.size());
+ VectorType *Src1Ty = VectorType::get(Src1SclTy, VL.size());
+ VecCost = TTI->getCastInstrCost(S.getOpcode(), VecTy, Src0Ty);
+ VecCost += TTI->getCastInstrCost(S.getAltOpcode(), VecTy, Src1Ty);
+ }
+ VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0);
+ return ReuseShuffleCost + VecCost - ScalarCost;
}
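For the alternate-opcode case the vector cost is modeled as: run the main opcode across the whole vector, run the alternate opcode across the whole vector, then blend the two results with a select-style shuffle that keeps each lane's own opcode. A hedged sketch of the mask such a blend uses (the equivalent IR appears in the comment; buildSelectMask is an illustrative helper, not part of this file):

    #include <cstdint>
    #include <vector>

    // Lane I reads V0[I] for the main opcode or V1[I] for the alternate
    // one. For <add, sub, add, sub> this yields <0, 5, 2, 7>, i.e.:
    //   %v0 = add <4 x i32> %a, %b
    //   %v1 = sub <4 x i32> %a, %b
    //   %r  = shufflevector <4 x i32> %v0, <4 x i32> %v1,
    //                       <4 x i32> <i32 0, i32 5, i32 2, i32 7>
    std::vector<uint32_t> buildSelectMask(const std::vector<bool> &IsAlt) {
      std::vector<uint32_t> Mask(IsAlt.size());
      const uint32_t E = Mask.size();
      for (uint32_t I = 0; I < E; ++I)
        Mask[I] = IsAlt[I] ? E + I : I;
      return Mask;
    }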
default:
llvm_unreachable("Unknown instruction");
@@ -2226,8 +2414,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
}
bool BoUpSLP::isFullyVectorizableTinyTree() {
- DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
- VectorizableTree.size() << " is fully vectorizable .\n");
+ LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
+ << VectorizableTree.size() << " is fully vectorizable.\n");
// We only handle trees of heights 1 and 2.
if (VectorizableTree.size() == 1 && !VectorizableTree[0].NeedToGather)
@@ -2297,13 +2485,13 @@ int BoUpSLP::getSpillCost() {
LiveValues.insert(cast<Instruction>(&*J));
}
- DEBUG(
+ LLVM_DEBUG({
dbgs() << "SLP: #LV: " << LiveValues.size();
for (auto *X : LiveValues)
dbgs() << " " << X->getName();
dbgs() << ", Looking at ";
Inst->dump();
- );
+ });
// Now find the sequence of instructions between PrevInst and Inst.
BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
@@ -2315,7 +2503,10 @@ int BoUpSLP::getSpillCost() {
continue;
}
- if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
+ // Debug information doesn't impact spill cost.
+ if ((isa<CallInst>(&*PrevInstIt) &&
+ !isa<DbgInfoIntrinsic>(&*PrevInstIt)) &&
+ &*PrevInstIt != PrevInst) {
SmallVector<Type*, 4> V;
for (auto *II : LiveValues)
V.push_back(VectorType::get(II->getType(), BundleWidth));
@@ -2333,19 +2524,41 @@ int BoUpSLP::getSpillCost() {
int BoUpSLP::getTreeCost() {
int Cost = 0;
- DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
- VectorizableTree.size() << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
+ << VectorizableTree.size() << ".\n");
unsigned BundleWidth = VectorizableTree[0].Scalars.size();
- for (TreeEntry &TE : VectorizableTree) {
+ for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
+ TreeEntry &TE = VectorizableTree[I];
+
+ // We create duplicate tree entries for gather sequences that have multiple
+ // uses. However, we should not compute the cost of duplicate sequences.
+ // For example, if we have a build vector (i.e., insertelement sequence)
+ // that is used by more than one vector instruction, we only need to
+ // compute the cost of the insertelement instructions once. The redundant
+ // instructions will be eliminated by CSE.
+ //
+ // We should consider not creating duplicate tree entries for gather
+ // sequences, and instead add additional edges to the tree representing
+ // their uses. Since such an approach results in fewer total entries,
+ // existing heuristics based on tree size may yield different results.
+ //
+ if (TE.NeedToGather &&
+ std::any_of(std::next(VectorizableTree.begin(), I + 1),
+ VectorizableTree.end(), [TE](TreeEntry &Entry) {
+ return Entry.NeedToGather && Entry.isSame(TE.Scalars);
+ }))
+ continue;
+
int C = getEntryCost(&TE);
- DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
- << *TE.Scalars[0] << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
+ << " for bundle that starts with " << *TE.Scalars[0]
+ << ".\n");
Cost += C;
}
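The skip above charges each distinct gather bundle once: an entry is ignored when an identical gather appears later in the tree, so the last copy carries the cost and CSE later folds the duplicated code. A standalone sketch with hypothetical stand-in types (not the real TreeEntry):

    #include <algorithm>
    #include <cstddef>
    #include <string>
    #include <vector>

    struct Entry {
      bool NeedToGather;
      std::vector<std::string> Scalars; // stand-in for the scalar bundle
      int Cost;
    };

    int totalCost(const std::vector<Entry> &Tree) {
      int Cost = 0;
      for (std::size_t I = 0, E = Tree.size(); I < E; ++I) {
        const Entry &TE = Tree[I];
        // Skip a gather whose exact bundle appears again later on.
        if (TE.NeedToGather &&
            std::any_of(Tree.begin() + I + 1, Tree.end(),
                        [&TE](const Entry &Other) {
                          return Other.NeedToGather &&
                                 Other.Scalars == TE.Scalars;
                        }))
          continue;
        Cost += TE.Cost;
      }
      return Cost;
    }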
- SmallSet<Value *, 16> ExtractCostCalculated;
+ SmallPtrSet<Value *, 16> ExtractCostCalculated;
int ExtractCost = 0;
for (ExternalUser &EU : ExternalUses) {
// We only add extract cost once for the same scalar.
@@ -2386,7 +2599,7 @@ int BoUpSLP::getTreeCost() {
<< "SLP: Extract Cost = " << ExtractCost << ".\n"
<< "SLP: Total Cost = " << Cost << ".\n";
}
- DEBUG(dbgs() << Str);
+ LLVM_DEBUG(dbgs() << Str);
if (ViewSLPTree)
ViewGraph(this, "SLP" + F->getName(), false, Str);
@@ -2394,10 +2607,14 @@ int BoUpSLP::getTreeCost() {
return Cost;
}
-int BoUpSLP::getGatherCost(Type *Ty) {
+int BoUpSLP::getGatherCost(Type *Ty,
+ const DenseSet<unsigned> &ShuffledIndices) {
int Cost = 0;
for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
- Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+ if (!ShuffledIndices.count(i))
+ Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+ if (!ShuffledIndices.empty())
+ Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
return Cost;
}
@@ -2408,7 +2625,17 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
ScalarTy = SI->getValueOperand()->getType();
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
// Find the cost of inserting/extracting values from the vector.
- return getGatherCost(VecTy);
+ // Check if the same elements are inserted several times and count them as
+ // shuffle candidates.
+ DenseSet<unsigned> ShuffledElements;
+ DenseSet<Value *> UniqueElements;
+ // Iterate in reverse order so that, among duplicates, the kept
+ // insertelement is the one with the higher insertion cost.
+ for (unsigned I = VL.size(); I > 0; --I) {
+ unsigned Idx = I - 1;
+ if (!UniqueElements.insert(VL[Idx]).second)
+ ShuffledElements.insert(Idx);
+ }
+ return getGatherCost(VecTy, ShuffledElements);
}
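A standalone sketch of that duplicate detection: walking the bundle from the back keeps the last occurrence of each value as the real insert (the one whose insertelement cost stays charged) and marks every earlier duplicate as producible by the final shuffle. For VL = {a, b, a, b} this marks lanes 0 and 1 as shuffled:

    #include <string>
    #include <unordered_set>
    #include <vector>

    std::unordered_set<unsigned>
    findShuffledLanes(const std::vector<std::string> &VL) {
      std::unordered_set<std::string> Unique;
      std::unordered_set<unsigned> Shuffled;
      for (unsigned I = VL.size(); I > 0; --I) {
        unsigned Idx = I - 1;
        // A failed insertion means this value reappears later in VL.
        if (!Unique.insert(VL[Idx]).second)
          Shuffled.insert(Idx);
      }
      return Shuffled;
    }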
// Reorder commutative operations in alternate shuffle if the resulting vectors
@@ -2420,16 +2647,14 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
// load a[3] + load b[3]
// Reordering the second load b[1] load a[1] would allow us to vectorize this
// code.
-void BoUpSLP::reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL,
+void BoUpSLP::reorderAltShuffleOperands(const InstructionsState &S,
+ ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right) {
// Push left and right operands of binary operation into Left and Right
- unsigned AltOpcode = getAltOpcode(Opcode);
- (void)AltOpcode;
for (Value *V : VL) {
auto *I = cast<Instruction>(V);
- assert(sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode()) &&
- "Incorrect instruction in vector");
+ assert(S.isOpcodeOrAlt(I) && "Incorrect instruction in vector");
Left.push_back(I->getOperand(0));
Right.push_back(I->getOperand(1));
}
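The reordering the comment describes amounts to walking the two operand columns and swapping a row whenever the swap lines a load up with the load above it. A simplified standalone sketch with integer "addresses" in place of load pointers (the real code instead queries isConsecutiveAccess on LoadInsts):

    #include <cstddef>
    #include <utility>
    #include <vector>

    // -1 means "not a load". E.g. Left = {a0, b1}, Right = {b0, a1}
    // becomes Left = {a0, a1}, Right = {b0, b1}.
    void reorderForConsecutiveLoads(std::vector<int> &Left,
                                    std::vector<int> &Right) {
      auto Consecutive = [](int A, int B) { return A >= 0 && B == A + 1; };
      for (std::size_t J = 0; J + 1 < Left.size(); ++J)
        if (Consecutive(Right[J], Left[J + 1]) &&
            !Consecutive(Right[J], Right[J + 1]))
          std::swap(Left[J + 1], Right[J + 1]);
    }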
@@ -2609,7 +2834,7 @@ void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode,
// add a[1],c[2] load b[1]
// b[2] load b[2]
// add a[3],c[3] load b[3]
- for (unsigned j = 0; j < VL.size() - 1; ++j) {
+ for (unsigned j = 0, e = VL.size() - 1; j < e; ++j) {
if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
if (isConsecutiveAccess(L, L1, *DL, *SE)) {
@@ -2630,17 +2855,15 @@ void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode,
}
}
-void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue) {
+void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL,
+ const InstructionsState &S) {
// Get the basic block this bundle is in. All instructions in the bundle
// should be in this block.
- auto *Front = cast<Instruction>(OpValue);
+ auto *Front = cast<Instruction>(S.OpValue);
auto *BB = Front->getParent();
- const unsigned Opcode = cast<Instruction>(OpValue)->getOpcode();
- const unsigned AltOpcode = getAltOpcode(Opcode);
assert(llvm::all_of(make_range(VL.begin(), VL.end()), [=](Value *V) -> bool {
- return !sameOpcodeOrAlt(Opcode, AltOpcode,
- cast<Instruction>(V)->getOpcode()) ||
- cast<Instruction>(V)->getParent() == BB;
+ auto *I = cast<Instruction>(V);
+ return !S.isOpcodeOrAlt(I) || I->getParent() == BB;
}));
// The last instruction in the bundle in program order.
@@ -2652,7 +2875,7 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue) {
// bundle. The end of the bundle is marked by null ScheduleData.
if (BlocksSchedules.count(BB)) {
auto *Bundle =
- BlocksSchedules[BB]->getScheduleData(isOneOf(OpValue, VL.back()));
+ BlocksSchedules[BB]->getScheduleData(isOneOf(S, VL.back()));
if (Bundle && Bundle->isPartOfBundle())
for (; Bundle; Bundle = Bundle->NextInBundle)
if (Bundle->OpValue == Bundle->Inst)
@@ -2680,7 +2903,7 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue) {
if (!LastInst) {
SmallPtrSet<Value *, 16> Bundle(VL.begin(), VL.end());
for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
- if (Bundle.erase(&I) && sameOpcodeOrAlt(Opcode, AltOpcode, I.getOpcode()))
+ if (Bundle.erase(&I) && S.isOpcodeOrAlt(&I))
LastInst = &I;
if (Bundle.empty())
break;
@@ -2706,7 +2929,7 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
if (TreeEntry *E = getTreeEntry(VL[i])) {
// Find which lane we need to extract.
int FoundLane = -1;
- for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
+ for (unsigned Lane = 0, LE = E->Scalars.size(); Lane != LE; ++Lane) {
// Is this the lane of the scalar that we are looking for ?
if (E->Scalars[Lane] == VL[i]) {
FoundLane = Lane;
@@ -2714,6 +2937,11 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
}
}
assert(FoundLane >= 0 && "Could not find the correct lane");
+ if (!E->ReuseShuffleIndices.empty()) {
+ FoundLane =
+ std::distance(E->ReuseShuffleIndices.begin(),
+ llvm::find(E->ReuseShuffleIndices, FoundLane));
+ }
ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
}
}
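When the entry was deduplicated, the value visible to external users is the reshuffled vector, so the recorded lane is the position at which FoundLane first appears in the reuse mask. For ReuseShuffleIndices = {0, 1, 0, 1} and FoundLane = 1, the extract reads output lane 1. A tiny standalone equivalent (std::find in place of llvm::find):

    #include <algorithm>
    #include <iterator>
    #include <vector>

    int remapLane(const std::vector<unsigned> &ReuseShuffleIndices,
                  unsigned FoundLane) {
      // First output lane of the shuffle that reads unique lane FoundLane.
      return std::distance(ReuseShuffleIndices.begin(),
                           std::find(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end(), FoundLane));
    }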
@@ -2722,66 +2950,128 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
return Vec;
}
-Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const {
- if (const TreeEntry *En = getTreeEntry(OpValue)) {
- if (En->isSame(VL) && En->VectorizedValue)
- return En->VectorizedValue;
- }
- return nullptr;
-}
-
Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
InstructionsState S = getSameOpcode(VL);
- if (S.Opcode) {
+ if (S.getOpcode()) {
if (TreeEntry *E = getTreeEntry(S.OpValue)) {
- if (E->isSame(VL))
- return vectorizeTree(E);
+ if (E->isSame(VL)) {
+ Value *V = vectorizeTree(E);
+ if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) {
+ // We need to get the vectorized value, but without the shuffle.
+ if (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
+ V = SV->getOperand(0);
+ } else {
+ // Reshuffle to get only unique values.
+ SmallVector<unsigned, 4> UniqueIdxs;
+ SmallSet<unsigned, 4> UsedIdxs;
+ for (unsigned Idx : E->ReuseShuffleIndices)
+ if (UsedIdxs.insert(Idx).second)
+ UniqueIdxs.emplace_back(Idx);
+ V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
+ UniqueIdxs);
+ }
+ }
+ return V;
+ }
}
}
Type *ScalarTy = S.OpValue->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
ScalarTy = SI->getValueOperand()->getType();
+
+ // Check that every instruction appears once in this bundle.
+ SmallVector<unsigned, 4> ReuseShuffleIndicies;
+ SmallVector<Value *, 4> UniqueValues;
+ if (VL.size() > 2) {
+ DenseMap<Value *, unsigned> UniquePositions;
+ for (Value *V : VL) {
+ auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
+ ReuseShuffleIndicies.emplace_back(Res.first->second);
+ if (Res.second || isa<Constant>(V))
+ UniqueValues.emplace_back(V);
+ }
+ // Do not shuffle a single element, or when the number of unique values
+ // is not a power of 2.
+ if (UniqueValues.size() == VL.size() || UniqueValues.size() <= 1 ||
+ !llvm::isPowerOf2_32(UniqueValues.size()))
+ ReuseShuffleIndicies.clear();
+ else
+ VL = UniqueValues;
+ }
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
- return Gather(VL, VecTy);
+ Value *V = Gather(VL, VecTy);
+ if (!ReuseShuffleIndicies.empty()) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ ReuseShuffleIndicies, "shuffle");
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ GatherSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ }
+ return V;
+}
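The dedup above turns a bundle with repeats into a shorter unique bundle plus a reuse mask: VL = {a, b, a, b} yields UniqueValues = {a, b} and indices {0, 1, 0, 1}, and gathering the two unique values then shuffling with that mask reproduces all four lanes. A standalone sketch of the mask construction (a plain map in place of DenseMap; the special-casing of constants above is omitted):

    #include <string>
    #include <unordered_map>
    #include <vector>

    struct DedupResult {
      std::vector<std::string> UniqueValues;
      std::vector<unsigned> ReuseIndices; // empty when not profitable
    };

    DedupResult dedupBundle(const std::vector<std::string> &VL) {
      DedupResult R;
      std::unordered_map<std::string, unsigned> Pos;
      for (const std::string &V : VL) {
        auto It = Pos.try_emplace(V, R.UniqueValues.size()).first;
        R.ReuseIndices.push_back(It->second);
        if (It->second == R.UniqueValues.size()) // first occurrence
          R.UniqueValues.push_back(V);
      }
      // Profitable only if something was removed and the unique count is
      // a power of two greater than one.
      unsigned N = R.UniqueValues.size();
      if (N == VL.size() || N <= 1 || (N & (N - 1)) != 0)
        R.ReuseIndices.clear();
      return R;
    }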
+
+static void inversePermutation(ArrayRef<unsigned> Indices,
+ SmallVectorImpl<unsigned> &Mask) {
+ Mask.clear();
+ const unsigned E = Indices.size();
+ Mask.resize(E);
+ for (unsigned I = 0; I < E; ++I)
+ Mask[Indices[I]] = I;
}
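One way to read inversePermutation: if Indices records which source lane feeds each position, the resulting Mask records where each source lane ends up, which is the form a shufflevector mask wants. For Indices = {2, 0, 3, 1} the result is Mask = {1, 3, 0, 2}. A standalone check:

    #include <cassert>
    #include <vector>

    std::vector<unsigned> inverse(const std::vector<unsigned> &Indices) {
      std::vector<unsigned> Mask(Indices.size());
      for (unsigned I = 0, E = Indices.size(); I < E; ++I)
        Mask[Indices[I]] = I;
      return Mask;
    }

    int main() {
      assert((inverse({2, 0, 3, 1}) == std::vector<unsigned>{1, 3, 0, 2}));
    }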
Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
IRBuilder<>::InsertPointGuard Guard(Builder);
if (E->VectorizedValue) {
- DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
return E->VectorizedValue;
}
InstructionsState S = getSameOpcode(E->Scalars);
- Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
+ Instruction *VL0 = cast<Instruction>(S.OpValue);
Type *ScalarTy = VL0->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
ScalarTy = SI->getValueOperand()->getType();
VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
+ bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
+
if (E->NeedToGather) {
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
auto *V = Gather(E->Scalars, VecTy);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ GatherSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ }
E->VectorizedValue = V;
return V;
}
- unsigned ShuffleOrOp = S.IsAltShuffle ?
- (unsigned) Instruction::ShuffleVector : S.Opcode;
+ unsigned ShuffleOrOp = S.isAltShuffle() ?
+ (unsigned) Instruction::ShuffleVector : S.getOpcode();
switch (ShuffleOrOp) {
case Instruction::PHI: {
PHINode *PH = dyn_cast<PHINode>(VL0);
Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
- E->VectorizedValue = NewPhi;
+ Value *V = NewPhi;
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
+ E->VectorizedValue = V;
// PHINodes may have multiple entries from the same block. We want to
// visit every block once.
- SmallSet<BasicBlock*, 4> VisitedBBs;
+ SmallPtrSet<BasicBlock*, 4> VisitedBBs;
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
ValueList Operands;
@@ -2804,32 +3094,74 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
"Invalid number of incoming values");
- return NewPhi;
+ return V;
}
case Instruction::ExtractElement: {
- if (canReuseExtract(E->Scalars, VL0)) {
+ if (!E->NeedToGather) {
Value *V = VL0->getOperand(0);
+ if (!E->ReorderIndices.empty()) {
+ OrdersType Mask;
+ inversePermutation(E->ReorderIndices, Mask);
+ Builder.SetInsertPoint(VL0);
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), Mask,
+ "reorder_shuffle");
+ }
+ if (NeedToShuffleReuses) {
+ // TODO: Merge this shuffle with the ReorderShuffleMask.
+ if (!E->ReorderIndices.empty())
+ Builder.SetInsertPoint(VL0);
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
E->VectorizedValue = V;
return V;
}
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
auto *V = Gather(E->Scalars, VecTy);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ GatherSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ }
E->VectorizedValue = V;
return V;
}
case Instruction::ExtractValue: {
- if (canReuseExtract(E->Scalars, VL0)) {
+ if (!E->NeedToGather) {
LoadInst *LI = cast<LoadInst>(VL0->getOperand(0));
Builder.SetInsertPoint(LI);
PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment());
- E->VectorizedValue = V;
- return propagateMetadata(V, E->Scalars);
+ Value *NewV = propagateMetadata(V, E->Scalars);
+ if (!E->ReorderIndices.empty()) {
+ OrdersType Mask;
+ inversePermutation(E->ReorderIndices, Mask);
+ NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), Mask,
+ "reorder_shuffle");
+ }
+ if (NeedToShuffleReuses) {
+ // TODO: Merge this shuffle with the ReorderShuffleMask.
+ NewV = Builder.CreateShuffleVector(
+ NewV, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle");
+ }
+ E->VectorizedValue = NewV;
+ return NewV;
}
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
auto *V = Gather(E->Scalars, VecTy);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ GatherSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ }
E->VectorizedValue = V;
return V;
}
@@ -2849,15 +3181,21 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
for (Value *V : E->Scalars)
INVL.push_back(cast<Instruction>(V)->getOperand(0));
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
Value *InVec = vectorizeTree(INVL);
- if (Value *V = alreadyVectorized(E->Scalars, VL0))
- return V;
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
CastInst *CI = dyn_cast<CastInst>(VL0);
Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
@@ -2870,23 +3208,29 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
RHSV.push_back(cast<Instruction>(V)->getOperand(1));
}
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
Value *L = vectorizeTree(LHSV);
Value *R = vectorizeTree(RHSV);
- if (Value *V = alreadyVectorized(E->Scalars, VL0))
- return V;
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
Value *V;
- if (S.Opcode == Instruction::FCmp)
+ if (S.getOpcode() == Instruction::FCmp)
V = Builder.CreateFCmp(P0, L, R);
else
V = Builder.CreateICmp(P0, L, R);
+ propagateIRFlags(V, E->Scalars, VL0);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
E->VectorizedValue = V;
- propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
++NumVectorInstructions;
return V;
}
@@ -2898,16 +3242,22 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
FalseVec.push_back(cast<Instruction>(V)->getOperand(2));
}
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
Value *Cond = vectorizeTree(CondVec);
Value *True = vectorizeTree(TrueVec);
Value *False = vectorizeTree(FalseVec);
- if (Value *V = alreadyVectorized(E->Scalars, VL0))
- return V;
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
Value *V = Builder.CreateSelect(Cond, True, False);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
@@ -2932,7 +3282,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::Xor: {
ValueList LHSVL, RHSVL;
if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
- reorderInputsAccordingToOpcode(S.Opcode, E->Scalars, LHSVL,
+ reorderInputsAccordingToOpcode(S.getOpcode(), E->Scalars, LHSVL,
RHSVL);
else
for (Value *V : E->Scalars) {
@@ -2941,29 +3291,40 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
RHSVL.push_back(I->getOperand(1));
}
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
Value *LHS = vectorizeTree(LHSVL);
Value *RHS = vectorizeTree(RHSVL);
- if (Value *V = alreadyVectorized(E->Scalars, VL0))
- return V;
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
Value *V = Builder.CreateBinOp(
- static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS);
+ static_cast<Instruction::BinaryOps>(S.getOpcode()), LHS, RHS);
+ propagateIRFlags(V, E->Scalars, VL0);
+ if (auto *I = dyn_cast<Instruction>(V))
+ V = propagateMetadata(I, E->Scalars);
+
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
E->VectorizedValue = V;
- propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
++NumVectorInstructions;
- if (Instruction *I = dyn_cast<Instruction>(V))
- return propagateMetadata(I, E->Scalars);
-
return V;
}
case Instruction::Load: {
// Loads are inserted at the head of the tree because we don't want to
// sink them all the way down past store instructions.
- setInsertPointAfterBundle(E->Scalars, VL0);
+ bool IsReorder = !E->ReorderIndices.empty();
+ if (IsReorder) {
+ S = getSameOpcode(E->Scalars, E->ReorderIndices.front());
+ VL0 = cast<Instruction>(S.OpValue);
+ }
+ setInsertPointAfterBundle(E->Scalars, S);
LoadInst *LI = cast<LoadInst>(VL0);
Type *ScalarLoadTy = LI->getType();
@@ -2985,9 +3346,21 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Alignment = DL->getABITypeAlignment(ScalarLoadTy);
}
LI->setAlignment(Alignment);
- E->VectorizedValue = LI;
+ Value *V = propagateMetadata(LI, E->Scalars);
+ if (IsReorder) {
+ OrdersType Mask;
+ inversePermutation(E->ReorderIndices, Mask);
+ V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
+ Mask, "reorder_shuffle");
+ }
+ if (NeedToShuffleReuses) {
+ // TODO: Merge this shuffle with the ReorderShuffleMask.
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
+ E->VectorizedValue = V;
++NumVectorInstructions;
- return propagateMetadata(LI, E->Scalars);
+ return V;
}
case Instruction::Store: {
StoreInst *SI = cast<StoreInst>(VL0);
@@ -2998,12 +3371,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
for (Value *V : E->Scalars)
ScalarStoreValues.push_back(cast<StoreInst>(V)->getValueOperand());
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
Value *VecValue = vectorizeTree(ScalarStoreValues);
Value *ScalarPtr = SI->getPointerOperand();
Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS));
- StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
+ StoreInst *ST = Builder.CreateStore(VecValue, VecPtr);
// The pointer operand uses an in-tree scalar, so add the new BitCast to
// ExternalUses to make sure that an extract will be generated in the
@@ -3014,13 +3387,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (!Alignment)
Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
- S->setAlignment(Alignment);
- E->VectorizedValue = S;
+ ST->setAlignment(Alignment);
+ Value *V = propagateMetadata(ST, E->Scalars);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
+ E->VectorizedValue = V;
++NumVectorInstructions;
- return propagateMetadata(S, E->Scalars);
+ return V;
}
case Instruction::GetElementPtr: {
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
ValueList Op0VL;
for (Value *V : E->Scalars)
@@ -3041,17 +3419,21 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *V = Builder.CreateGEP(
cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ V = propagateMetadata(I, E->Scalars);
+
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
E->VectorizedValue = V;
++NumVectorInstructions;
- if (Instruction *I = dyn_cast<Instruction>(V))
- return propagateMetadata(I, E->Scalars);
-
return V;
}
case Instruction::Call: {
CallInst *CI = cast<CallInst>(VL0);
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
Function *FI;
Intrinsic::ID IID = Intrinsic::not_intrinsic;
Value *ScalarArg = nullptr;
@@ -3075,7 +3457,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
Value *OpVec = vectorizeTree(OpVL);
- DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
OpVecs.push_back(OpVec);
}
@@ -3093,58 +3475,87 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (ScalarArg && getTreeEntry(ScalarArg))
ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
+ propagateIRFlags(V, E->Scalars, VL0);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
E->VectorizedValue = V;
- propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
++NumVectorInstructions;
return V;
}
case Instruction::ShuffleVector: {
ValueList LHSVL, RHSVL;
- assert(Instruction::isBinaryOp(S.Opcode) &&
+ assert(S.isAltShuffle() &&
+ ((Instruction::isBinaryOp(S.getOpcode()) &&
+ Instruction::isBinaryOp(S.getAltOpcode())) ||
+ (Instruction::isCast(S.getOpcode()) &&
+ Instruction::isCast(S.getAltOpcode()))) &&
"Invalid Shuffle Vector Operand");
- reorderAltShuffleOperands(S.Opcode, E->Scalars, LHSVL, RHSVL);
- setInsertPointAfterBundle(E->Scalars, VL0);
- Value *LHS = vectorizeTree(LHSVL);
- Value *RHS = vectorizeTree(RHSVL);
-
- if (Value *V = alreadyVectorized(E->Scalars, VL0))
- return V;
+ Value *LHS, *RHS;
+ if (Instruction::isBinaryOp(S.getOpcode())) {
+ reorderAltShuffleOperands(S, E->Scalars, LHSVL, RHSVL);
+ setInsertPointAfterBundle(E->Scalars, S);
+ LHS = vectorizeTree(LHSVL);
+ RHS = vectorizeTree(RHSVL);
+ } else {
+ ValueList INVL;
+ for (Value *V : E->Scalars)
+ INVL.push_back(cast<Instruction>(V)->getOperand(0));
+ setInsertPointAfterBundle(E->Scalars, S);
+ LHS = vectorizeTree(INVL);
+ }
- // Create a vector of LHS op1 RHS
- Value *V0 = Builder.CreateBinOp(
- static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS);
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
- unsigned AltOpcode = getAltOpcode(S.Opcode);
- // Create a vector of LHS op2 RHS
- Value *V1 = Builder.CreateBinOp(
- static_cast<Instruction::BinaryOps>(AltOpcode), LHS, RHS);
+ Value *V0, *V1;
+ if (Instruction::isBinaryOp(S.getOpcode())) {
+ V0 = Builder.CreateBinOp(
+ static_cast<Instruction::BinaryOps>(S.getOpcode()), LHS, RHS);
+ V1 = Builder.CreateBinOp(
+ static_cast<Instruction::BinaryOps>(S.getAltOpcode()), LHS, RHS);
+ } else {
+ V0 = Builder.CreateCast(
+ static_cast<Instruction::CastOps>(S.getOpcode()), LHS, VecTy);
+ V1 = Builder.CreateCast(
+ static_cast<Instruction::CastOps>(S.getAltOpcode()), LHS, VecTy);
+ }
// Create shuffle to take alternate operations from the vector.
- // Also, gather up odd and even scalar ops to propagate IR flags to
+ // Also, gather up main and alt scalar ops to propagate IR flags to
// each vector operation.
- ValueList OddScalars, EvenScalars;
+ ValueList OpScalars, AltScalars;
unsigned e = E->Scalars.size();
SmallVector<Constant *, 8> Mask(e);
for (unsigned i = 0; i < e; ++i) {
- if (isOdd(i)) {
+ auto *OpInst = cast<Instruction>(E->Scalars[i]);
+ assert(S.isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode");
+ if (OpInst->getOpcode() == S.getAltOpcode()) {
Mask[i] = Builder.getInt32(e + i);
- OddScalars.push_back(E->Scalars[i]);
+ AltScalars.push_back(E->Scalars[i]);
} else {
Mask[i] = Builder.getInt32(i);
- EvenScalars.push_back(E->Scalars[i]);
+ OpScalars.push_back(E->Scalars[i]);
}
}
Value *ShuffleMask = ConstantVector::get(Mask);
- propagateIRFlags(V0, EvenScalars);
- propagateIRFlags(V1, OddScalars);
+ propagateIRFlags(V0, OpScalars);
+ propagateIRFlags(V1, AltScalars);
Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ V = propagateMetadata(I, E->Scalars);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
E->VectorizedValue = V;
++NumVectorInstructions;
- if (Instruction *I = dyn_cast<Instruction>(V))
- return propagateMetadata(I, E->Scalars);
return V;
}
@@ -3183,7 +3594,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
VectorizableTree[0].VectorizedValue = Trunc;
}
- DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
+ LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
+ << " values .\n");
// If necessary, sign-extend or zero-extend ScalarRoot to the larger type
// specified by ScalarType.
@@ -3260,7 +3672,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
Ex = extend(ScalarRoot, Ex, Scalar->getType());
CSEBlocks.insert(cast<Instruction>(User)->getParent());
User->replaceUsesOfWith(Scalar, Ex);
- }
+ }
} else {
Builder.SetInsertPoint(&F->getEntryBlock().front());
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
@@ -3269,7 +3681,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
User->replaceUsesOfWith(Scalar, Ex);
}
- DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
}
// For each vectorized value:
@@ -3290,7 +3702,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
if (!Ty->isVoidTy()) {
#ifndef NDEBUG
for (User *U : Scalar->users()) {
- DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
// It is legal to replace users in the ignorelist by undef.
assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&
@@ -3300,7 +3712,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
Value *Undef = UndefValue::get(Ty);
Scalar->replaceAllUsesWith(Undef);
}
- DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
eraseInstruction(cast<Instruction>(Scalar));
}
}
@@ -3310,18 +3722,16 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
return VectorizableTree[0].VectorizedValue;
}
-void BoUpSLP::optimizeGatherSequence(Function &F) {
- DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
- << " gather sequences instructions.\n");
+void BoUpSLP::optimizeGatherSequence() {
+ LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
+ << " gather sequences instructions.\n");
// LICM InsertElementInst sequences.
- for (Instruction *it : GatherSeq) {
- InsertElementInst *Insert = dyn_cast<InsertElementInst>(it);
-
- if (!Insert)
+ for (Instruction *I : GatherSeq) {
+ if (!isa<InsertElementInst>(I) && !isa<ShuffleVectorInst>(I))
continue;
// Check if this block is inside a loop.
- Loop *L = LI->getLoopFor(Insert->getParent());
+ Loop *L = LI->getLoopFor(I->getParent());
if (!L)
continue;
@@ -3333,27 +3743,41 @@ void BoUpSLP::optimizeGatherSequence(Function &F) {
// If the vector or the element that we insert into it are
// instructions that are defined in this basic block then we can't
// hoist this instruction.
- Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
- Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
- if (CurrVec && L->contains(CurrVec))
+ auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
+ auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
+ if (Op0 && L->contains(Op0))
continue;
- if (NewElem && L->contains(NewElem))
+ if (Op1 && L->contains(Op1))
continue;
// We can hoist this instruction. Move it to the pre-header.
- Insert->moveBefore(PreHeader->getTerminator());
+ I->moveBefore(PreHeader->getTerminator());
}
+ // Make a list of all reachable blocks in our CSE queue.
+ SmallVector<const DomTreeNode *, 8> CSEWorkList;
+ CSEWorkList.reserve(CSEBlocks.size());
+ for (BasicBlock *BB : CSEBlocks)
+ if (DomTreeNode *N = DT->getNode(BB)) {
+ assert(DT->isReachableFromEntry(N));
+ CSEWorkList.push_back(N);
+ }
+
+ // Sort blocks by domination. This ensures we visit a block after all blocks
+ // dominating it are visited.
+ std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
+ [this](const DomTreeNode *A, const DomTreeNode *B) {
+ return DT->properlyDominates(A, B);
+ });
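The sort guarantees that a block is visited only after every block that properly dominates it, so an instruction's potential CSE partner has already been recorded by the time the instruction itself is reached. Since dominance is only a partial order, one safe way to picture a total sort key is dominator-tree depth, as in this toy sketch (a parent-array tree in place of the real DomTree API):

    #include <algorithm>
    #include <vector>

    // Parent[N] is N's immediate dominator; the root is its own parent.
    int depth(const std::vector<int> &Parent, int N) {
      int D = 0;
      for (; Parent[N] != N; N = Parent[N])
        ++D;
      return D;
    }

    // Every dominator of a block has strictly smaller depth, so sorting
    // by depth yields a valid visitation order.
    void sortByDomination(const std::vector<int> &Parent,
                          std::vector<int> &Work) {
      std::stable_sort(Work.begin(), Work.end(), [&Parent](int A, int B) {
        return depth(Parent, A) < depth(Parent, B);
      });
    }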
+
// Perform O(N^2) search over the gather sequences and merge identical
// instructions. TODO: We can further optimize this scan if we split the
// instructions into different buckets based on the insert lane.
SmallVector<Instruction *, 16> Visited;
- ReversePostOrderTraversal<Function *> RPOT(&F);
- for (auto BB : RPOT) {
- // Traverse CSEBlocks by RPOT order.
- if (!CSEBlocks.count(BB))
- continue;
-
+ for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
+ assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
+ "Worklist not sorted properly!");
+ BasicBlock *BB = (*I)->getBlock();
// For all instructions in blocks containing gather sequences:
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
Instruction *In = &*it++;
@@ -3384,8 +3808,9 @@ void BoUpSLP::optimizeGatherSequence(Function &F) {
// Groups the instructions to a bundle (which is then a single scheduling entity)
// and schedules instructions until the bundle gets ready.
bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
- BoUpSLP *SLP, Value *OpValue) {
- if (isa<PHINode>(OpValue))
+ BoUpSLP *SLP,
+ const InstructionsState &S) {
+ if (isa<PHINode>(S.OpValue))
return true;
// Initialize the instruction bundle.
@@ -3393,12 +3818,12 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
ScheduleData *PrevInBundle = nullptr;
ScheduleData *Bundle = nullptr;
bool ReSchedule = false;
- DEBUG(dbgs() << "SLP: bundle: " << *OpValue << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
// Make sure that the scheduling region contains all
// instructions of the bundle.
for (Value *V : VL) {
- if (!extendSchedulingRegion(V, OpValue))
+ if (!extendSchedulingRegion(V, S))
return false;
}
@@ -3410,8 +3835,8 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
// A bundle member was scheduled as single instruction before and now
// needs to be scheduled as part of the bundle. We just get rid of the
// existing schedule.
- DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
- << " was already scheduled\n");
+ LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
+ << " was already scheduled\n");
ReSchedule = true;
}
assert(BundleMember->isSchedulingEntity() &&
@@ -3446,8 +3871,8 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
initialFillReadyList(ReadyInsts);
}
- DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
- << BB->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
+ << BB->getName() << "\n");
calculateDependencies(Bundle, true, SLP);
@@ -3465,7 +3890,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
}
}
if (!Bundle->isReady()) {
- cancelScheduling(VL, OpValue);
+ cancelScheduling(VL, S.OpValue);
return false;
}
return true;
@@ -3477,7 +3902,7 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
return;
ScheduleData *Bundle = getScheduleData(OpValue);
- DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
assert(!Bundle->IsScheduled &&
"Can't cancel bundle which is already scheduled");
assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
@@ -3508,13 +3933,13 @@ BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
}
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
- Value *OpValue) {
- if (getScheduleData(V, isOneOf(OpValue, V)))
+ const InstructionsState &S) {
+ if (getScheduleData(V, isOneOf(S, V)))
return true;
Instruction *I = dyn_cast<Instruction>(V);
assert(I && "bundle member must be an instruction");
assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
- auto &&CheckSheduleForI = [this, OpValue](Instruction *I) -> bool {
+ auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool {
ScheduleData *ISD = getScheduleData(I);
if (!ISD)
return false;
@@ -3522,8 +3947,8 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
"ScheduleData not in scheduling region");
ScheduleData *SD = allocateScheduleDataChunks();
SD->Inst = I;
- SD->init(SchedulingRegionID, OpValue);
- ExtraScheduleDataMap[I][OpValue] = SD;
+ SD->init(SchedulingRegionID, S.OpValue);
+ ExtraScheduleDataMap[I][S.OpValue] = SD;
return true;
};
if (CheckSheduleForI(I))
@@ -3533,10 +3958,10 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
initScheduleData(I, I->getNextNode(), nullptr, nullptr);
ScheduleStart = I;
ScheduleEnd = I->getNextNode();
- if (isOneOf(OpValue, I) != I)
+ if (isOneOf(S, I) != I)
CheckSheduleForI(I);
assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
- DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
return true;
}
// Search up and down at the same time, because we don't know if the new
@@ -3548,7 +3973,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
BasicBlock::iterator LowerEnd = BB->end();
while (true) {
if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
- DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
+ LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
return false;
}
@@ -3556,9 +3981,10 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
if (&*UpIter == I) {
initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
ScheduleStart = I;
- if (isOneOf(OpValue, I) != I)
+ if (isOneOf(S, I) != I)
CheckSheduleForI(I);
- DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
+ << "\n");
return true;
}
UpIter++;
@@ -3568,10 +3994,11 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
nullptr);
ScheduleEnd = I->getNextNode();
- if (isOneOf(OpValue, I) != I)
+ if (isOneOf(S, I) != I)
CheckSheduleForI(I);
assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
- DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I
+ << "\n");
return true;
}
DownIter++;
@@ -3635,7 +4062,8 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
assert(isInSchedulingRegion(BundleMember));
if (!BundleMember->hasValidDependencies()) {
- DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
+ << "\n");
BundleMember->Dependencies = 0;
BundleMember->resetUnscheduledDeps();
@@ -3727,7 +4155,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
// i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
// and we can abort this loop at i6.
if (DistToSrc >= 2 * MaxMemDepDistance)
- break;
+ break;
DistToSrc++;
}
}
@@ -3736,7 +4164,8 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
}
if (InsertInReadyList && SD->isReady()) {
ReadyInsts.push_back(SD);
- DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
+ << "\n");
}
}
}
@@ -3759,7 +4188,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
if (!BS->ScheduleStart)
return;
- DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
BS->resetSchedule();
@@ -4025,7 +4454,11 @@ void BoUpSLP::computeMinimumValueSizes() {
// We start by looking at each entry that can be demoted. We compute the
// maximum bit width required to store the scalar by using ValueTracking to
// compute the number of high-order bits we can truncate.
- if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType())) {
+ if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) &&
+ llvm::all_of(TreeRoot, [](Value *R) {
+ assert(R->hasOneUse() && "Root should have only one use!");
+ return isa<GetElementPtrInst>(R->user_back());
+ })) {
MaxBitWidth = 8u;
// Determine if the sign bit of all the roots is known to be zero. If not,
@@ -4188,7 +4621,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
if (F.hasFnAttribute(Attribute::NoImplicitFloat))
return false;
- DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
// Use the bottom up slp vectorizer to construct chains that start with
// store instructions.
@@ -4203,8 +4636,8 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
// Vectorize trees that end at stores.
if (!Stores.empty()) {
- DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
- << " underlying objects.\n");
+ LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
+ << " underlying objects.\n");
Changed |= vectorizeStoreChains(R);
}
@@ -4215,21 +4648,21 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
// is primarily intended to catch gather-like idioms ending at
// non-consecutive loads.
if (!GEPs.empty()) {
- DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
- << " underlying objects.\n");
+ LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
+ << " underlying objects.\n");
Changed |= vectorizeGEPIndices(BB, R);
}
}
if (Changed) {
- R.optimizeGatherSequence(F);
- DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
- DEBUG(verifyFunction(F));
+ R.optimizeGatherSequence();
+ LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
+ LLVM_DEBUG(verifyFunction(F));
}
return Changed;
}
-/// \brief Check that the Values in the slice in VL array are still existent in
+/// Check that the values in the given slice of the VL array still exist in
/// the WeakTrackingVH array.
/// Vectorization of part of the VL array may cause later values in the VL array
/// to become invalid. We track when this has happened in the WeakTrackingVH
@@ -4244,30 +4677,28 @@ static bool hasValueBeenRAUWed(ArrayRef<Value *> VL,
bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
unsigned VecRegSize) {
- unsigned ChainLen = Chain.size();
- DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
- << "\n");
- unsigned Sz = R.getVectorElementSize(Chain[0]);
- unsigned VF = VecRegSize / Sz;
+ const unsigned ChainLen = Chain.size();
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
+ << "\n");
+ const unsigned Sz = R.getVectorElementSize(Chain[0]);
+ const unsigned VF = VecRegSize / Sz;
if (!isPowerOf2_32(Sz) || VF < 2)
return false;
// Keep track of values that were deleted by vectorizing in the loop below.
- SmallVector<WeakTrackingVH, 8> TrackValues(Chain.begin(), Chain.end());
+ const SmallVector<WeakTrackingVH, 8> TrackValues(Chain.begin(), Chain.end());
bool Changed = false;
// Look for profitable vectorizable trees at all offsets, starting at zero.
- for (unsigned i = 0, e = ChainLen; i < e; ++i) {
- if (i + VF > e)
- break;
+ for (unsigned i = 0, e = ChainLen; i + VF <= e; ++i) {
// Check that a previous iteration of this loop did not delete the Value.
if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
continue;
- DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
- << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
+ << "\n");
ArrayRef<Value *> Operands = Chain.slice(i, VF);
R.buildTree(Operands);
@@ -4278,9 +4709,10 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
int Cost = R.getTreeCost();
- DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF
+ << "\n");
if (Cost < -SLPCostThreshold) {
- DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
using namespace ore;
@@ -4417,66 +4849,48 @@ bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
if (!A || !B)
return false;
Value *VL[] = { A, B };
- return tryToVectorizeList(VL, R, None, true);
+ return tryToVectorizeList(VL, R, /*UserCost=*/0, true);
}
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
- ArrayRef<Value *> BuildVector,
- bool AllowReorder,
- bool NeedExtraction) {
+ int UserCost, bool AllowReorder) {
if (VL.size() < 2)
return false;
- DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = " << VL.size()
- << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
+ << VL.size() << ".\n");
- // Check that all of the parts are scalar instructions of the same type.
- Instruction *I0 = dyn_cast<Instruction>(VL[0]);
- if (!I0)
+ // Check that all of the parts are scalar instructions of the same type,
+ // we permit an alternate opcode via InstructionsState.
+ InstructionsState S = getSameOpcode(VL);
+ if (!S.getOpcode())
return false;
- unsigned Opcode0 = I0->getOpcode();
-
+ Instruction *I0 = cast<Instruction>(S.OpValue);
unsigned Sz = R.getVectorElementSize(I0);
unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
if (MaxVF < 2) {
- R.getORE()->emit([&]() {
- return OptimizationRemarkMissed(
- SV_NAME, "SmallVF", I0)
- << "Cannot SLP vectorize list: vectorization factor "
- << "less than 2 is not supported";
- });
- return false;
+ R.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
+ << "Cannot SLP vectorize list: vectorization factor "
+ << "less than 2 is not supported";
+ });
+ return false;
}
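As a concrete reading of the bounds: with 32-bit elements and a 128-bit minimum vector register, MinVF = max(2, 128/32) = 4; a list of 6 values gives MaxVF = max(PowerOf2Floor(6), 4) = 4, so only width-4 slices are attempted below. The arithmetic, with a plain helper in place of PowerOf2Floor:

    #include <algorithm>

    unsigned powerOf2Floor(unsigned N) {
      unsigned P = 1;
      while (P * 2 <= N)
        P *= 2;
      return P;
    }

    void vfBounds(unsigned Sz, unsigned MinVecRegSize, unsigned NumValues,
                  unsigned &MinVF, unsigned &MaxVF) {
      MinVF = std::max(2u, MinVecRegSize / Sz); // Sz = scalar size in bits
      MaxVF = std::max(powerOf2Floor(NumValues), MinVF);
    }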
for (Value *V : VL) {
Type *Ty = V->getType();
if (!isValidElementType(Ty)) {
- // NOTE: the following will give user internal llvm type name, which may not be useful
+ // NOTE: the following will give the user an internal llvm type name,
+ // which may not be useful.
R.getORE()->emit([&]() {
- std::string type_str;
- llvm::raw_string_ostream rso(type_str);
- Ty->print(rso);
- return OptimizationRemarkMissed(
- SV_NAME, "UnsupportedType", I0)
- << "Cannot SLP vectorize list: type "
- << rso.str() + " is unsupported by vectorizer";
- });
- return false;
- }
- Instruction *Inst = dyn_cast<Instruction>(V);
-
- if (!Inst)
- return false;
- if (Inst->getOpcode() != Opcode0) {
- R.getORE()->emit([&]() {
- return OptimizationRemarkMissed(
- SV_NAME, "InequableTypes", I0)
- << "Cannot SLP vectorize list: not all of the "
- << "parts of scalar instructions are of the same type: "
- << ore::NV("Instruction1Opcode", I0) << " and "
- << ore::NV("Instruction2Opcode", Inst);
+ std::string type_str;
+ llvm::raw_string_ostream rso(type_str);
+ Ty->print(rso);
+ return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
+ << "Cannot SLP vectorize list: type "
+ << rso.str() + " is unsupported by vectorizer";
});
return false;
}
@@ -4513,24 +4927,20 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
if (hasValueBeenRAUWed(VL, TrackValues, I, OpsWidth))
continue;
- DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
- << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
+ << "\n");
ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
- ArrayRef<Value *> EmptyArray;
- ArrayRef<Value *> BuildVectorSlice;
- if (!BuildVector.empty())
- BuildVectorSlice = BuildVector.slice(I, OpsWidth);
-
- R.buildTree(Ops, NeedExtraction ? EmptyArray : BuildVectorSlice);
+ R.buildTree(Ops);
+ Optional<ArrayRef<unsigned>> Order = R.bestOrder();
// TODO: check if we can allow reordering for more cases.
- if (AllowReorder && R.shouldReorder()) {
+ if (AllowReorder && Order) {
+ // TODO: reorder tree nodes without tree rebuilding.
// Conceptually, there is nothing actually preventing us from trying to
// reorder a larger list. In fact, we do exactly this when vectorizing
// reductions. However, at this point, we only expect to get here when
// there are exactly two operations.
assert(Ops.size() == 2);
- assert(BuildVectorSlice.empty());
Value *ReorderedOps[] = {Ops[1], Ops[0]};
R.buildTree(ReorderedOps, None);
}
@@ -4538,43 +4948,19 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
continue;
R.computeMinimumValueSizes();
- int Cost = R.getTreeCost();
+ int Cost = R.getTreeCost() - UserCost;
CandidateFound = true;
MinCost = std::min(MinCost, Cost);
if (Cost < -SLPCostThreshold) {
- DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
cast<Instruction>(Ops[0]))
<< "SLP vectorized with cost " << ore::NV("Cost", Cost)
<< " and with tree size "
<< ore::NV("TreeSize", R.getTreeSize()));
- Value *VectorizedRoot = R.vectorizeTree();
-
- // Reconstruct the build vector by extracting the vectorized root. This
- // way we handle the case where some elements of the vector are
- // undefined.
- // (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2))
- if (!BuildVectorSlice.empty()) {
- // The insert point is the last build vector instruction. The
- // vectorized root will precede it. This guarantees that we get an
- // instruction. The vectorized tree could have been constant folded.
- Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
- unsigned VecIdx = 0;
- for (auto &V : BuildVectorSlice) {
- IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
- ++BasicBlock::iterator(InsertAfter));
- Instruction *I = cast<Instruction>(V);
- assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
- Instruction *Extract =
- cast<Instruction>(Builder.CreateExtractElement(
- VectorizedRoot, Builder.getInt32(VecIdx++)));
- I->setOperand(1, Extract);
- I->moveAfter(Extract);
- InsertAfter = I;
- }
- }
+ R.vectorizeTree();
// Move to the next bundle.
I += VF - 1;
NextInst = I + 1;
@@ -4585,18 +4971,16 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
if (!Changed && CandidateFound) {
R.getORE()->emit([&]() {
- return OptimizationRemarkMissed(
- SV_NAME, "NotBeneficial", I0)
- << "List vectorization was possible but not beneficial with cost "
- << ore::NV("Cost", MinCost) << " >= "
- << ore::NV("Treshold", -SLPCostThreshold);
+ return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
+ << "List vectorization was possible but not beneficial with cost "
+ << ore::NV("Cost", MinCost) << " >= "
+ << ore::NV("Treshold", -SLPCostThreshold);
});
} else if (!Changed) {
R.getORE()->emit([&]() {
- return OptimizationRemarkMissed(
- SV_NAME, "NotPossible", I0)
- << "Cannot SLP vectorize list: vectorization was impossible"
- << " with available vectorization factors";
+ return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
+ << "Cannot SLP vectorize list: vectorization was impossible"
+ << " with available vectorization factors";
});
}
return Changed;
@@ -4645,7 +5029,7 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
return false;
}
-/// \brief Generate a shuffle mask to be used in a reduction tree.
+/// Generate a shuffle mask to be used in a reduction tree.
///
/// \param VecLen The length of the vector to be reduced.
/// \param NumEltsToRdx The number of elements that should be reduced in the
@@ -5128,6 +5512,77 @@ class HorizontalReduction {
return OperationData(
Instruction::FCmp, LHS, RHS, RK_Max,
cast<Instruction>(Select->getCondition())->hasNoNaNs());
+ } else {
+ // Try harder: look for a min/max pattern based on instructions that
+ // produce the same values, such as:
+ //   select ((cmp Inst1, Inst2), Inst1, Inst2).
+ // During the intermediate stages of SLP, it is very common to have
+ // a pattern like this (since optimizeGatherSequence is run only once
+ // at the end):
+ // %1 = extractelement <2 x i32> %a, i32 0
+ // %2 = extractelement <2 x i32> %a, i32 1
+ // %cond = icmp sgt i32 %1, %2
+ // %3 = extractelement <2 x i32> %a, i32 0
+ // %4 = extractelement <2 x i32> %a, i32 1
+ // %select = select i1 %cond, i32 %3, i32 %4
+ CmpInst::Predicate Pred;
+ Instruction *L1;
+ Instruction *L2;
+
+ LHS = Select->getTrueValue();
+ RHS = Select->getFalseValue();
+ Value *Cond = Select->getCondition();
+
+ // TODO: Support inverse predicates.
+ if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
+ if (!isa<ExtractElementInst>(RHS) ||
+ !L2->isIdenticalTo(cast<Instruction>(RHS)))
+ return OperationData(V);
+ } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
+ if (!isa<ExtractElementInst>(LHS) ||
+ !L1->isIdenticalTo(cast<Instruction>(LHS)))
+ return OperationData(V);
+ } else {
+ if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
+ return OperationData(V);
+ if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
+ !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
+ !L2->isIdenticalTo(cast<Instruction>(RHS)))
+ return OperationData(V);
+ }
+ switch (Pred) {
+ default:
+ return OperationData(V);
+
+ case CmpInst::ICMP_ULT:
+ case CmpInst::ICMP_ULE:
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin);
+
+ case CmpInst::ICMP_SLT:
+ case CmpInst::ICMP_SLE:
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_Min);
+
+ case CmpInst::FCMP_OLT:
+ case CmpInst::FCMP_OLE:
+ case CmpInst::FCMP_ULT:
+ case CmpInst::FCMP_ULE:
+ return OperationData(Instruction::FCmp, LHS, RHS, RK_Min,
+ cast<Instruction>(Cond)->hasNoNaNs());
+
+ case CmpInst::ICMP_UGT:
+ case CmpInst::ICMP_UGE:
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax);
+
+ case CmpInst::ICMP_SGT:
+ case CmpInst::ICMP_SGE:
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_Max);
+
+ case CmpInst::FCMP_OGT:
+ case CmpInst::FCMP_OGE:
+ case CmpInst::FCMP_UGT:
+ case CmpInst::FCMP_UGE:
+ return OperationData(Instruction::FCmp, LHS, RHS, RK_Max,
+ cast<Instruction>(Cond)->hasNoNaNs());
+ }
}
}
return OperationData(V);
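A hedged, self-contained sketch of the "try harder" match above: accept select(cmp(X1, Y1), X2, Y2) as a signed max when X1/X2 and Y1/Y2 are distinct but instruction-identical extractelements. The function name is made up, and the patch folds this logic into getOperationData and handles every predicate, not just SGT/SGE:

    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/PatternMatch.h"
    using namespace llvm;
    using namespace llvm::PatternMatch;

    static bool matchSMaxWithDuplicatedExtracts(SelectInst *Sel, Value *&A,
                                                Value *&B) {
      CmpInst::Predicate Pred;
      Instruction *L1, *L2;
      A = Sel->getTrueValue();
      B = Sel->getFalseValue();
      if (!isa<ExtractElementInst>(A) || !isa<ExtractElementInst>(B))
        return false;
      // The compare may feed from its own copies of the extracts; require
      // each copy to be instruction-identical to the select operand it
      // mirrors.
      if (!match(Sel->getCondition(),
                 m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
          !L1->isIdenticalTo(cast<Instruction>(A)) ||
          !L2->isIdenticalTo(cast<Instruction>(B)))
        return false;
      return Pred == CmpInst::ICMP_SGT || Pred == CmpInst::ICMP_SGE;
    }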
@@ -5136,7 +5591,7 @@ class HorizontalReduction {
public:
HorizontalReduction() = default;
- /// \brief Try to find a reduction tree.
+ /// Try to find a reduction tree.
bool matchAssociativeReduction(PHINode *Phi, Instruction *B) {
assert((!Phi || is_contained(Phi->operands(), B)) &&
"Thi phi needs to use the binary operator");
@@ -5164,6 +5619,8 @@ public:
Type *Ty = B->getType();
if (!isValidElementType(Ty))
return false;
+ if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy())
+ return false;
ReducedValueData.clear();
ReductionRoot = B;
@@ -5262,7 +5719,7 @@ public:
return true;
}
- /// \brief Attempt to vectorize the tree found by
+ /// Attempt to vectorize the tree found by
/// matchAssociativeReduction.
bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
if (ReducedVals.empty())
@@ -5295,9 +5752,14 @@ public:
while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
V.buildTree(VL, ExternallyUsedValues, IgnoreList);
- if (V.shouldReorder()) {
- SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend());
- V.buildTree(Reversed, ExternallyUsedValues, IgnoreList);
+ Optional<ArrayRef<unsigned>> Order = V.bestOrder();
+ // TODO: Handle orders of size less than the number of elements in the vector.
+ if (Order && Order->size() == VL.size()) {
+ // TODO: Reorder the tree nodes without rebuilding the tree.
+ SmallVector<Value *, 4> ReorderedOps(VL.size());
+ llvm::transform(*Order, ReorderedOps.begin(),
+ [VL](const unsigned Idx) { return VL[Idx]; });
+ V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList);
}
if (V.isTreeTinyAndNotFullyVectorizable())
break;
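Since bestOrder() replaces the old boolean shouldReorder(), the permutation convention is worth pinning down: ReorderedOps[i] = VL[Order[i]], and the old full reversal is just the special case Order = {n-1, ..., 1, 0}. A tiny self-contained illustration (identifiers made up):

    #include <algorithm>
    #include <array>
    #include <cassert>

    int main() {
      std::array<char, 4> VL = {'a', 'b', 'c', 'd'};
      std::array<unsigned, 4> Order = {1, 0, 3, 2};
      std::array<char, 4> Reordered;
      // Element i of the result comes from lane Order[i] of the input.
      std::transform(Order.begin(), Order.end(), Reordered.begin(),
                     [&](unsigned Idx) { return VL[Idx]; });
      assert((Reordered == std::array<char, 4>{'b', 'a', 'd', 'c'}));
      return 0;
    }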
@@ -5305,8 +5767,9 @@ public:
V.computeMinimumValueSizes();
// Estimate cost.
- int Cost =
- V.getTreeCost() + getReductionCost(TTI, ReducedVals[i], ReduxWidth);
+ int TreeCost = V.getTreeCost();
+ int ReductionCost = getReductionCost(TTI, ReducedVals[i], ReduxWidth);
+ int Cost = TreeCost + ReductionCost;
if (Cost >= -SLPCostThreshold) {
V.getORE()->emit([&]() {
return OptimizationRemarkMissed(
@@ -5319,8 +5782,8 @@ public:
break;
}
- DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
- << ". (HorRdx)\n");
+ LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
+ << Cost << ". (HorRdx)\n");
V.getORE()->emit([&]() {
return OptimizationRemark(
SV_NAME, "VectorizedHorizontalReduction", cast<Instruction>(VL[0]))
@@ -5382,7 +5845,7 @@ public:
}
private:
- /// \brief Calculate the cost of a reduction.
+ /// Calculate the cost of a reduction.
int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal,
unsigned ReduxWidth) {
Type *ScalarTy = FirstReducedVal->getType();
@@ -5441,16 +5904,16 @@ private:
}
ScalarReduxCost *= (ReduxWidth - 1);
- DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
- << " for reduction that starts with " << *FirstReducedVal
- << " (It is a "
- << (IsPairwiseReduction ? "pairwise" : "splitting")
- << " reduction)\n");
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
+ << " for reduction that starts with " << *FirstReducedVal
+ << " (It is a "
+ << (IsPairwiseReduction ? "pairwise" : "splitting")
+ << " reduction)\n");
return VecReduxCost - ScalarReduxCost;
}
- /// \brief Emit a horizontal reduction of the vectorized value.
+ /// Emit a horizontal reduction of the vectorized value.
Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
unsigned ReduxWidth, const TargetTransformInfo *TTI) {
assert(VectorizedValue && "Need to have a vectorized tree node");
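For the non-pairwise ("splitting") strategy, emitReduction repeatedly halves the live width with a shuffle, combines the halves with the reduction opcode, and finally reads lane 0. A hedged sketch for an integer add reduction, reusing the illustrative mask helper sketched earlier:

    static Value *emitSplittingAddRdx(Value *Vec, unsigned ReduxWidth,
                                      IRBuilder<> &Builder) {
      Value *Tmp = Vec;
      for (unsigned Width = ReduxWidth / 2; Width != 0; Width /= 2) {
        Value *Mask = buildRdxShuffleMask(ReduxWidth, Width,
                                          /*IsPairwise=*/false,
                                          /*IsLeft=*/false, Builder);
        // Shift the upper Width lanes down, then add them into the lower lanes.
        Value *Shuf = Builder.CreateShuffleVector(
            Tmp, UndefValue::get(Tmp->getType()), Mask, "rdx.shuf");
        Tmp = Builder.CreateAdd(Tmp, Shuf, "bin.rdx");
      }
      return Builder.CreateExtractElement(Tmp, Builder.getInt32(0));
    }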
@@ -5486,7 +5949,7 @@ private:
} // end anonymous namespace
-/// \brief Recognize construction of vectors like
+/// Recognize construction of vectors like
/// %ra = insertelement <4 x float> undef, float %s0, i32 0
/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
@@ -5495,11 +5958,17 @@ private:
///
/// Returns true if it matches
static bool findBuildVector(InsertElementInst *LastInsertElem,
- SmallVectorImpl<Value *> &BuildVector,
- SmallVectorImpl<Value *> &BuildVectorOpds) {
+ TargetTransformInfo *TTI,
+ SmallVectorImpl<Value *> &BuildVectorOpds,
+ int &UserCost) {
+ UserCost = 0;
Value *V = nullptr;
do {
- BuildVector.push_back(LastInsertElem);
+ if (auto *CI = dyn_cast<ConstantInt>(LastInsertElem->getOperand(2))) {
+ UserCost += TTI->getVectorInstrCost(Instruction::InsertElement,
+ LastInsertElem->getType(),
+ CI->getZExtValue());
+ }
BuildVectorOpds.push_back(LastInsertElem->getOperand(1));
V = LastInsertElem->getOperand(0);
if (isa<UndefValue>(V))
@@ -5508,20 +5977,17 @@ static bool findBuildVector(InsertElementInst *LastInsertElem,
if (!LastInsertElem || !LastInsertElem->hasOneUse())
return false;
} while (true);
- std::reverse(BuildVector.begin(), BuildVector.end());
std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
return true;
}
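What the accumulated UserCost represents can be shown in isolation: it is the sum of the target's per-lane insertelement costs over the whole build sequence, which the caller can later credit against the vectorization cost. A minimal sketch (the 4 x i32 type and lane count are arbitrary):

    static int sumBuildVectorCost(TargetTransformInfo *TTI, LLVMContext &Ctx) {
      Type *VecTy = VectorType::get(Type::getInt32Ty(Ctx), 4);
      int UserCost = 0;
      // One insertelement per lane, each priced by the target.
      for (unsigned Lane = 0; Lane != 4; ++Lane)
        UserCost += TTI->getVectorInstrCost(Instruction::InsertElement, VecTy,
                                            Lane);
      return UserCost;
    }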
-/// \brief Like findBuildVector, but looks for construction of aggregate.
+/// Like findBuildVector, but looks for construction of aggregate.
///
/// \return true if it matches.
static bool findBuildAggregate(InsertValueInst *IV,
- SmallVectorImpl<Value *> &BuildVector,
SmallVectorImpl<Value *> &BuildVectorOpds) {
Value *V;
do {
- BuildVector.push_back(IV);
BuildVectorOpds.push_back(IV->getInsertedValueOperand());
V = IV->getAggregateOperand();
if (isa<UndefValue>(V))
@@ -5530,7 +5996,6 @@ static bool findBuildAggregate(InsertValueInst *IV,
if (!IV || !IV->hasOneUse())
return false;
} while (true);
- std::reverse(BuildVector.begin(), BuildVector.end());
std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
return true;
}
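For instance (illustrative IR, not taken from the patch), findBuildAggregate walks a chain such as

    %agg0 = insertvalue {float, float} undef, float %s0, 0
    %agg1 = insertvalue {float, float} %agg0, float %s1, 1

from the last insertvalue back to the undef base and collects %s0 and %s1 into BuildVectorOpds.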
@@ -5539,7 +6004,7 @@ static bool PhiTypeSorterFunc(Value *V, Value *V2) {
return V->getType() < V2->getType();
}
-/// \brief Try and get a reduction value from a phi node.
+/// Try and get a reduction value from a phi node.
///
/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
/// if they come from either \p ParentBB or a containing loop latch.
@@ -5552,9 +6017,8 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
// reduction phi. Vectorizing such cases has been reported to cause
// miscompiles. See PR25787.
auto DominatedReduxValue = [&](Value *R) {
- return (
- dyn_cast<Instruction>(R) &&
- DT->dominates(P->getParent(), dyn_cast<Instruction>(R)->getParent()));
+ return isa<Instruction>(R) &&
+ DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
};
Value *Rdx = nullptr;
@@ -5624,7 +6088,7 @@ static bool tryToVectorizeHorReductionOrInstOperands(
// Interrupt the process if the Root instruction itself was vectorized or all
// sub-trees no higher than RecursionMaxDepth were analyzed/vectorized.
SmallVector<std::pair<WeakTrackingVH, unsigned>, 8> Stack(1, {Root, 0});
- SmallSet<Value *, 8> VisitedInstrs;
+ SmallPtrSet<Value *, 8> VisitedInstrs;
bool Res = false;
while (!Stack.empty()) {
Value *V;
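The surrounding loop is an explicit, depth-bounded worklist rather than recursion. A hedged sketch of that traversal shape (the real loop also attempts horizontal-reduction matching at each step before descending into operands):

    #include <tuple>
    #include "llvm/ADT/SmallPtrSet.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/ValueHandle.h"
    using namespace llvm;

    static void visitOperandsIteratively(Instruction *Root,
                                         unsigned RecursionMaxDepth) {
      SmallVector<std::pair<WeakTrackingVH, unsigned>, 8> Stack(1, {Root, 0});
      SmallPtrSet<Value *, 8> Visited;
      while (!Stack.empty()) {
        Value *V;
        unsigned Depth;
        std::tie(V, Depth) = Stack.pop_back_val();
        auto *I = dyn_cast_or_null<Instruction>(V);
        // Skip deleted values, already-seen nodes, and anything too deep.
        if (!I || !Visited.insert(I).second || Depth >= RecursionMaxDepth)
          continue;
        for (Value *Op : I->operands())
          Stack.push_back({Op, Depth + 1});
      }
    }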
@@ -5706,27 +6170,29 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
if (!R.canMapToVector(IVI->getType(), DL))
return false;
- SmallVector<Value *, 16> BuildVector;
SmallVector<Value *, 16> BuildVectorOpds;
- if (!findBuildAggregate(IVI, BuildVector, BuildVectorOpds))
+ if (!findBuildAggregate(IVI, BuildVectorOpds))
return false;
- DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
// An aggregate value is unlikely to be processed in a vector register; we
// need to extract the scalars into scalar registers anyway.
- return tryToVectorizeList(BuildVectorOpds, R, BuildVector, false, true);
+ return tryToVectorizeList(BuildVectorOpds, R);
}
bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
BasicBlock *BB, BoUpSLP &R) {
- SmallVector<Value *, 16> BuildVector;
+ int UserCost;
SmallVector<Value *, 16> BuildVectorOpds;
- if (!findBuildVector(IEI, BuildVector, BuildVectorOpds))
+ if (!findBuildVector(IEI, TTI, BuildVectorOpds, UserCost) ||
+ (llvm::all_of(BuildVectorOpds,
+ [](Value *V) { return isa<ExtractElementInst>(V); }) &&
+ isShuffle(BuildVectorOpds)))
return false;
// Vectorize starting with the build-vector operands, ignoring the
// insertelement instructions themselves for the purposes of scheduling
// and user extraction.
- return tryToVectorizeList(BuildVectorOpds, R, BuildVector);
+ return tryToVectorizeList(BuildVectorOpds, R, UserCost);
}
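The new early bail-out is easiest to see on an example: when every collected operand is an extractelement, the insert chain is already expressible as a single shufflevector, e.g. (illustrative IR)

    %e0 = extractelement <4 x i32> %a, i32 2
    %e1 = extractelement <4 x i32> %a, i32 3
    %i0 = insertelement <2 x i32> undef, i32 %e0, i32 0
    %i1 = insertelement <2 x i32> %i0, i32 %e1, i32 1

so when isShuffle recognizes the pattern, SLP declines and leaves the fold to the existing shuffle combines instead of building a tree.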
bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB,
@@ -5763,7 +6229,7 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions(
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
bool Changed = false;
SmallVector<Value *, 4> Incoming;
- SmallSet<Value *, 16> VisitedInstrs;
+ SmallPtrSet<Value *, 16> VisitedInstrs;
bool HaveVectorizedPhiNodes = true;
while (HaveVectorizedPhiNodes) {
@@ -5798,14 +6264,15 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// Try to vectorize them.
unsigned NumElts = (SameTypeIt - IncIt);
- DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
+ LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at PHIs ("
+ << NumElts << ")\n");
// The order in which the phi nodes appear in the program does not matter,
// so allow tryToVectorizeList to reorder them if that is beneficial. This
// is done only when there are exactly two elements, since
// tryToVectorizeList asserts that there are only two values when
// AllowReorder is true.
bool AllowReorder = NumElts == 2;
if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R,
- None, AllowReorder)) {
+ /*UserCost=*/0, AllowReorder)) {
// Success: start over, because instructions might have been changed.
HaveVectorizedPhiNodes = true;
Changed = true;
@@ -5885,7 +6352,6 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
if (isa<InsertElementInst>(it) || isa<CmpInst>(it) ||
isa<InsertValueInst>(it))
PostProcessInstructions.push_back(&*it);
-
}
return Changed;
@@ -5899,8 +6365,8 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
if (Entry.second.size() < 2)
continue;
- DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
- << Entry.second.size() << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
+ << Entry.second.size() << ".\n");
// We process the getelementptr list in chunks of 16 (like we do for
// stores) to minimize compile-time.
@@ -5982,14 +6448,14 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
if (it->second.size() < 2)
continue;
- DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
- << it->second.size() << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
+ << it->second.size() << ".\n");
// Process the stores in chunks of 16.
// TODO: The limit of 16 inhibits greater vectorization factors.
// For example, AVX2 supports v32i8. Increasing this limit, however,
// may cause a significant compile-time increase.
- for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
+ for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI += 16) {
unsigned Len = std::min<unsigned>(CE - CI, 16);
Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), R);
}