summaryrefslogtreecommitdiff
path: root/lib/Transforms/Vectorize/SLPVectorizer.cpp
diff options
context:
space:
mode:
authorDimitry Andric <dim@FreeBSD.org>2017-01-02 19:17:04 +0000
committerDimitry Andric <dim@FreeBSD.org>2017-01-02 19:17:04 +0000
commitb915e9e0fc85ba6f398b3fab0db6a81a8913af94 (patch)
tree98b8f811c7aff2547cab8642daf372d6c59502fb /lib/Transforms/Vectorize/SLPVectorizer.cpp
parent6421cca32f69ac849537a3cff78c352195e99f1b (diff)
Notes
Diffstat (limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r--lib/Transforms/Vectorize/SLPVectorizer.cpp716
1 files changed, 476 insertions, 240 deletions
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index fb94f7924ee0..bcaa8439cffa 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -115,22 +115,22 @@ static bool isValidElementType(Type *Ty) {
!Ty->isPPC_FP128Ty();
}
-/// \returns the parent basic block if all of the instructions in \p VL
-/// are in the same block or null otherwise.
-static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
+/// \returns true if all of the instructions in \p VL are in the same block or
+/// false otherwise.
+static bool allSameBlock(ArrayRef<Value *> VL) {
Instruction *I0 = dyn_cast<Instruction>(VL[0]);
if (!I0)
- return nullptr;
+ return false;
BasicBlock *BB = I0->getParent();
for (int i = 1, e = VL.size(); i < e; i++) {
Instruction *I = dyn_cast<Instruction>(VL[i]);
if (!I)
- return nullptr;
+ return false;
if (BB != I->getParent())
- return nullptr;
+ return false;
}
- return BB;
+ return true;
}
/// \returns True if all of the values in \p VL are constants.
@@ -211,12 +211,12 @@ static unsigned getSameOpcode(ArrayRef<Value *> VL) {
/// of each scalar operation (VL) that will be converted into a vector (I).
/// Flag set: NSW, NUW, exact, and all of fast-math.
static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
- if (auto *VecOp = dyn_cast<BinaryOperator>(I)) {
- if (auto *Intersection = dyn_cast<BinaryOperator>(VL[0])) {
+ if (auto *VecOp = dyn_cast<Instruction>(I)) {
+ if (auto *Intersection = dyn_cast<Instruction>(VL[0])) {
// Intersection is initialized to the 0th scalar,
// so start counting from index '1'.
for (int i = 1, e = VL.size(); i < e; ++i) {
- if (auto *Scalar = dyn_cast<BinaryOperator>(VL[i]))
+ if (auto *Scalar = dyn_cast<Instruction>(VL[i]))
Intersection->andIRFlags(Scalar);
}
VecOp->copyIRFlags(Intersection);
@@ -224,15 +224,15 @@ static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
}
}
-/// \returns The type that all of the values in \p VL have or null if there
-/// are different types.
-static Type* getSameType(ArrayRef<Value *> VL) {
+/// \returns true if all of the values in \p VL have the same type or false
+/// otherwise.
+static bool allSameType(ArrayRef<Value *> VL) {
Type *Ty = VL[0]->getType();
for (int i = 1, e = VL.size(); i < e; i++)
if (VL[i]->getType() != Ty)
- return nullptr;
+ return false;
- return Ty;
+ return true;
}
/// \returns True if Extract{Value,Element} instruction extracts element Idx.
@@ -393,6 +393,10 @@ public:
/// \returns number of elements in vector if isomorphism exists, 0 otherwise.
unsigned canMapToVector(Type *T, const DataLayout &DL) const;
+ /// \returns True if the VectorizableTree is both tiny and not fully
+ /// vectorizable. We do not vectorize such trees.
+ bool isTreeTinyAndNotFullyVectorizable();
+
private:
struct TreeEntry;
@@ -883,10 +887,10 @@ private:
/// List of users to ignore during scheduling and that don't need extracting.
ArrayRef<Value *> UserIgnoreList;
- // Number of load-bundles, which contain consecutive loads.
+ // Number of load bundles that contain consecutive loads.
int NumLoadsWantToKeepOrder;
- // Number of load-bundles of size 2, which are consecutive loads if reversed.
+ // Number of load bundles that contain consecutive loads in reversed order.
int NumLoadsWantToChangeOrder;
// Analysis and block reference.
@@ -906,8 +910,11 @@ private:
IRBuilder<> Builder;
/// A map of scalar integer values to the smallest bit width with which they
- /// can legally be represented.
- MapVector<Value *, uint64_t> MinBWs;
+ /// can legally be represented. The values map to (width, signed) pairs,
+ /// where "width" indicates the minimum bit width and "signed" is True if the
+ /// value must be signed-extended, rather than zero-extended, back to its
+ /// original width.
+ MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
};
} // end namespace llvm
@@ -917,7 +924,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
ArrayRef<Value *> UserIgnoreLst) {
deleteTree();
UserIgnoreList = UserIgnoreLst;
- if (!getSameType(Roots))
+ if (!allSameType(Roots))
return;
buildTree_rec(Roots, 0);
@@ -958,8 +965,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
}
// Ignore users in the user ignore list.
- if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) !=
- UserIgnoreList.end())
+ if (is_contained(UserIgnoreList, UserInst))
continue;
DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
@@ -972,9 +978,8 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
- bool SameTy = allConstant(VL) || getSameType(VL); (void)SameTy;
bool isAltShuffle = false;
- assert(SameTy && "Invalid types!");
+ assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
if (Depth == RecursionMaxDepth) {
DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
@@ -1007,7 +1012,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
}
// If all of the operands are identical or constant we have a simple solution.
- if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) {
+ if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !Opcode) {
DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
newTreeEntry(VL, false);
return;
@@ -1159,7 +1164,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
return;
}
- // Check if the loads are consecutive or of we need to swizzle them.
+
+ // Make sure all loads in the bundle are simple - we can't vectorize
+ // atomic or volatile loads.
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
LoadInst *L = cast<LoadInst>(VL[i]);
if (!L->isSimple()) {
@@ -1168,20 +1175,47 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
return;
}
+ }
+ // Check if the loads are consecutive, reversed, or neither.
+ // TODO: What we really want is to sort the loads, but for now, check
+ // the two likely directions.
+ bool Consecutive = true;
+ bool ReverseConsecutive = true;
+ for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
- if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], *DL, *SE)) {
- ++NumLoadsWantToChangeOrder;
- }
- BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
- DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
- return;
+ Consecutive = false;
+ break;
+ } else {
+ ReverseConsecutive = false;
}
}
- ++NumLoadsWantToKeepOrder;
- newTreeEntry(VL, true);
- DEBUG(dbgs() << "SLP: added a vector of loads.\n");
+
+ if (Consecutive) {
+ ++NumLoadsWantToKeepOrder;
+ newTreeEntry(VL, true);
+ DEBUG(dbgs() << "SLP: added a vector of loads.\n");
+ return;
+ }
+
+ // If none of the load pairs were consecutive when checked in order,
+ // check the reverse order.
+ if (ReverseConsecutive)
+ for (unsigned i = VL.size() - 1; i > 0; --i)
+ if (!isConsecutiveAccess(VL[i], VL[i - 1], *DL, *SE)) {
+ ReverseConsecutive = false;
+ break;
+ }
+
+ BS.cancelScheduling(VL);
+ newTreeEntry(VL, false);
+
+ if (ReverseConsecutive) {
+ ++NumLoadsWantToChangeOrder;
+ DEBUG(dbgs() << "SLP: Gathering reversed loads.\n");
+ } else {
+ DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
+ }
return;
}
case Instruction::ZExt:
@@ -1541,8 +1575,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
// If we have computed a smaller type for the expression, update VecTy so
// that the costs will be accurate.
if (MinBWs.count(VL[0]))
- VecTy = VectorType::get(IntegerType::get(F->getContext(), MinBWs[VL[0]]),
- VL.size());
+ VecTy = VectorType::get(
+ IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
if (E->NeedToGather) {
if (allConstant(VL))
@@ -1553,7 +1587,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
return getGatherCost(E->Scalars);
}
unsigned Opcode = getSameOpcode(VL);
- assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL");
+ assert(Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
Instruction *VL0 = cast<Instruction>(VL[0]);
switch (Opcode) {
case Instruction::PHI: {
@@ -1762,7 +1796,10 @@ bool BoUpSLP::isFullyVectorizableTinyTree() {
DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
VectorizableTree.size() << " is fully vectorizable .\n");
- // We only handle trees of height 2.
+ // We only handle trees of heights 1 and 2.
+ if (VectorizableTree.size() == 1 && !VectorizableTree[0].NeedToGather)
+ return true;
+
if (VectorizableTree.size() != 2)
return false;
@@ -1779,6 +1816,27 @@ bool BoUpSLP::isFullyVectorizableTinyTree() {
return true;
}
+bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() {
+
+ // We can vectorize the tree if its size is greater than or equal to the
+ // minimum size specified by the MinTreeSize command line option.
+ if (VectorizableTree.size() >= MinTreeSize)
+ return false;
+
+ // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
+ // can vectorize it if we can prove it fully vectorizable.
+ if (isFullyVectorizableTinyTree())
+ return false;
+
+ assert(VectorizableTree.empty()
+ ? ExternalUses.empty()
+ : true && "We shouldn't have any external users");
+
+ // Otherwise, we can't vectorize the tree. It is both tiny and not fully
+ // vectorizable.
+ return true;
+}
+
int BoUpSLP::getSpillCost() {
// Walk from the bottom of the tree to the top, tracking which values are
// live. When we see a call instruction that is not part of our tree,
@@ -1816,9 +1874,9 @@ int BoUpSLP::getSpillCost() {
);
// Now find the sequence of instructions between PrevInst and Inst.
- BasicBlock::reverse_iterator InstIt(Inst->getIterator()),
- PrevInstIt(PrevInst->getIterator());
- --PrevInstIt;
+ BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
+ PrevInstIt =
+ PrevInst->getIterator().getReverse();
while (InstIt != PrevInstIt) {
if (PrevInstIt == PrevInst->getParent()->rend()) {
PrevInstIt = Inst->getParent()->rbegin();
@@ -1846,14 +1904,6 @@ int BoUpSLP::getTreeCost() {
DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
VectorizableTree.size() << ".\n");
- // We only vectorize tiny trees if it is fully vectorizable.
- if (VectorizableTree.size() < MinTreeSize && !isFullyVectorizableTinyTree()) {
- if (VectorizableTree.empty()) {
- assert(!ExternalUses.size() && "We should not have any external users");
- }
- return INT_MAX;
- }
-
unsigned BundleWidth = VectorizableTree[0].Scalars.size();
for (TreeEntry &TE : VectorizableTree) {
@@ -1882,10 +1932,12 @@ int BoUpSLP::getTreeCost() {
auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
auto *ScalarRoot = VectorizableTree[0].Scalars[0];
if (MinBWs.count(ScalarRoot)) {
- auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot]);
+ auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
+ auto Extend =
+ MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
VecTy = VectorType::get(MinTy, BundleWidth);
- ExtractCost += TTI->getExtractWithExtendCost(
- Instruction::SExt, EU.Scalar->getType(), VecTy, EU.Lane);
+ ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
+ VecTy, EU.Lane);
} else {
ExtractCost +=
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
@@ -2182,7 +2234,7 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
// Set the insertion point after the last instruction in the bundle. Set the
// debug location to Front.
- Builder.SetInsertPoint(BB, next(BasicBlock::iterator(LastInst)));
+ Builder.SetInsertPoint(BB, ++LastInst->getIterator());
Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
@@ -2383,6 +2435,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
V = Builder.CreateICmp(P0, L, R);
E->VectorizedValue = V;
+ propagateIRFlags(E->VectorizedValue, E->Scalars);
++NumVectorInstructions;
return V;
}
@@ -2593,6 +2646,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
E->VectorizedValue = V;
+ propagateIRFlags(E->VectorizedValue, E->Scalars);
++NumVectorInstructions;
return V;
}
@@ -2669,7 +2723,7 @@ Value *BoUpSLP::vectorizeTree() {
if (auto *I = dyn_cast<Instruction>(VectorRoot))
Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
auto BundleWidth = VectorizableTree[0].Scalars.size();
- auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot]);
+ auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
auto *VecTy = VectorType::get(MinTy, BundleWidth);
auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
VectorizableTree[0].VectorizedValue = Trunc;
@@ -2677,6 +2731,16 @@ Value *BoUpSLP::vectorizeTree() {
DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
+ // If necessary, sign-extend or zero-extend ScalarRoot to the larger type
+ // specified by ScalarType.
+ auto extend = [&](Value *ScalarRoot, Value *Ex, Type *ScalarType) {
+ if (!MinBWs.count(ScalarRoot))
+ return Ex;
+ if (MinBWs[ScalarRoot].second)
+ return Builder.CreateSExt(Ex, ScalarType);
+ return Builder.CreateZExt(Ex, ScalarType);
+ };
+
// Extract all of the elements with the external uses.
for (const auto &ExternalUse : ExternalUses) {
Value *Scalar = ExternalUse.Scalar;
@@ -2684,8 +2748,7 @@ Value *BoUpSLP::vectorizeTree() {
// Skip users that we already RAUW. This happens when one instruction
// has multiple uses of the same value.
- if (std::find(Scalar->user_begin(), Scalar->user_end(), User) ==
- Scalar->user_end())
+ if (!is_contained(Scalar->users(), User))
continue;
assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");
@@ -2712,8 +2775,7 @@ Value *BoUpSLP::vectorizeTree() {
Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
}
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
- if (MinBWs.count(ScalarRoot))
- Ex = Builder.CreateSExt(Ex, Scalar->getType());
+ Ex = extend(ScalarRoot, Ex, Scalar->getType());
CSEBlocks.insert(PH->getIncomingBlock(i));
PH->setOperand(i, Ex);
}
@@ -2721,16 +2783,14 @@ Value *BoUpSLP::vectorizeTree() {
} else {
Builder.SetInsertPoint(cast<Instruction>(User));
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
- if (MinBWs.count(ScalarRoot))
- Ex = Builder.CreateSExt(Ex, Scalar->getType());
+ Ex = extend(ScalarRoot, Ex, Scalar->getType());
CSEBlocks.insert(cast<Instruction>(User)->getParent());
User->replaceUsesOfWith(Scalar, Ex);
}
} else {
Builder.SetInsertPoint(&F->getEntryBlock().front());
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
- if (MinBWs.count(ScalarRoot))
- Ex = Builder.CreateSExt(Ex, Scalar->getType());
+ Ex = extend(ScalarRoot, Ex, Scalar->getType());
CSEBlocks.insert(&F->getEntryBlock());
User->replaceUsesOfWith(Scalar, Ex);
}
@@ -2759,8 +2819,7 @@ Value *BoUpSLP::vectorizeTree() {
assert((ScalarToTreeEntry.count(U) ||
// It is legal to replace users in the ignorelist by undef.
- (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) !=
- UserIgnoreList.end())) &&
+ is_contained(UserIgnoreList, U)) &&
"Replacing out-of-tree value with undef");
}
#endif
@@ -2853,7 +2912,7 @@ void BoUpSLP::optimizeGatherSequence() {
}
}
if (In) {
- assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end());
+ assert(!is_contained(Visited, In));
Visited.push_back(In);
}
}
@@ -2994,9 +3053,10 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
}
// Search up and down at the same time, because we don't know if the new
// instruction is above or below the existing scheduling region.
- BasicBlock::reverse_iterator UpIter(ScheduleStart->getIterator());
+ BasicBlock::reverse_iterator UpIter =
+ ++ScheduleStart->getIterator().getReverse();
BasicBlock::reverse_iterator UpperEnd = BB->rend();
- BasicBlock::iterator DownIter(ScheduleEnd);
+ BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
BasicBlock::iterator LowerEnd = BB->end();
for (;;) {
if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
@@ -3451,6 +3511,11 @@ void BoUpSLP::computeMinimumValueSizes() {
Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
}
+ // True if the roots can be zero-extended back to their original type, rather
+ // than sign-extended. We know that if the leading bits are not demanded, we
+ // can safely zero-extend. So we initialize IsKnownPositive to True.
+ bool IsKnownPositive = true;
+
// If all the bits of the roots are demanded, we can try a little harder to
// compute a narrower type. This can happen, for example, if the roots are
// getelementptr indices. InstCombine promotes these indices to the pointer
@@ -3462,11 +3527,41 @@ void BoUpSLP::computeMinimumValueSizes() {
// compute the number of high-order bits we can truncate.
if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType())) {
MaxBitWidth = 8u;
+
+ // Determine if the sign bit of all the roots is known to be zero. If not,
+ // IsKnownPositive is set to False.
+ IsKnownPositive = all_of(TreeRoot, [&](Value *R) {
+ bool KnownZero = false;
+ bool KnownOne = false;
+ ComputeSignBit(R, KnownZero, KnownOne, *DL);
+ return KnownZero;
+ });
+
+ // Determine the maximum number of bits required to store the scalar
+ // values.
for (auto *Scalar : ToDemote) {
auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, 0, DT);
auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
}
+
+ // If we can't prove that the sign bit is zero, we must add one to the
+ // maximum bit width to account for the unknown sign bit. This preserves
+ // the existing sign bit so we can safely sign-extend the root back to the
+ // original type. Otherwise, if we know the sign bit is zero, we will
+ // zero-extend the root instead.
+ //
+ // FIXME: This is somewhat suboptimal, as there will be cases where adding
+ // one to the maximum bit width will yield a larger-than-necessary
+ // type. In general, we need to add an extra bit only if we can't
+ // prove that the upper bit of the original type is equal to the
+ // upper bit of the proposed smaller type. If these two bits are the
+ // same (either zero or one) we know that sign-extending from the
+ // smaller type will result in the same value. Here, since we can't
+ // yet prove this, we are just making the proposed smaller type
+ // larger to ensure correctness.
+ if (!IsKnownPositive)
+ ++MaxBitWidth;
}
// Round MaxBitWidth up to the next power-of-two.
@@ -3486,7 +3581,7 @@ void BoUpSLP::computeMinimumValueSizes() {
// Finally, map the values we can demote to the maximum bit with we computed.
for (auto *Scalar : ToDemote)
- MinBWs[Scalar] = MaxBitWidth;
+ MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive);
}
namespace {
@@ -3642,8 +3737,7 @@ static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH,
return !std::equal(VL.begin(), VL.end(), VH.begin());
}
-bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain,
- int CostThreshold, BoUpSLP &R,
+bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
unsigned VecRegSize) {
unsigned ChainLen = Chain.size();
DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
@@ -3672,12 +3766,15 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain,
ArrayRef<Value *> Operands = Chain.slice(i, VF);
R.buildTree(Operands);
+ if (R.isTreeTinyAndNotFullyVectorizable())
+ continue;
+
R.computeMinimumValueSizes();
int Cost = R.getTreeCost();
DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
- if (Cost < CostThreshold) {
+ if (Cost < -SLPCostThreshold) {
DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
R.vectorizeTree();
@@ -3691,7 +3788,7 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain,
}
bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
- int costThreshold, BoUpSLP &R) {
+ BoUpSLP &R) {
SetVector<StoreInst *> Heads, Tails;
SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
@@ -3746,8 +3843,9 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
// FIXME: Is division-by-2 the correct step? Should we assert that the
// register size is a power-of-2?
- for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize(); Size /= 2) {
- if (vectorizeStoreChain(Operands, costThreshold, R, Size)) {
+ for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize();
+ Size /= 2) {
+ if (vectorizeStoreChain(Operands, R, Size)) {
// Mark the vectorized stores so that we don't vectorize them again.
VectorizedStores.insert(Operands.begin(), Operands.end());
Changed = true;
@@ -3805,11 +3903,12 @@ bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
ArrayRef<Value *> BuildVector,
- bool allowReorder) {
+ bool AllowReorder) {
if (VL.size() < 2)
return false;
- DEBUG(dbgs() << "SLP: Vectorizing a list of length = " << VL.size() << ".\n");
+ DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = " << VL.size()
+ << ".\n");
// Check that all of the parts are scalar instructions of the same type.
Instruction *I0 = dyn_cast<Instruction>(VL[0]);
@@ -3818,10 +3917,11 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
unsigned Opcode0 = I0->getOpcode();
- // FIXME: Register size should be a parameter to this function, so we can
- // try different vectorization factors.
unsigned Sz = R.getVectorElementSize(I0);
- unsigned VF = R.getMinVecRegSize() / Sz;
+ unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
+ unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
+ if (MaxVF < 2)
+ return false;
for (Value *V : VL) {
Type *Ty = V->getType();
@@ -3837,70 +3937,89 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
// Keep track of values that were deleted by vectorizing in the loop below.
SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());
- for (unsigned i = 0, e = VL.size(); i < e; ++i) {
- unsigned OpsWidth = 0;
+ unsigned NextInst = 0, MaxInst = VL.size();
+ for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
+ VF /= 2) {
+ // No actual vectorization should happen, if number of parts is the same as
+ // provided vectorization factor (i.e. the scalar type is used for vector
+ // code during codegen).
+ auto *VecTy = VectorType::get(VL[0]->getType(), VF);
+ if (TTI->getNumberOfParts(VecTy) == VF)
+ continue;
+ for (unsigned I = NextInst; I < MaxInst; ++I) {
+ unsigned OpsWidth = 0;
- if (i + VF > e)
- OpsWidth = e - i;
- else
- OpsWidth = VF;
+ if (I + VF > MaxInst)
+ OpsWidth = MaxInst - I;
+ else
+ OpsWidth = VF;
- if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
- break;
+ if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
+ break;
- // Check that a previous iteration of this loop did not delete the Value.
- if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth))
- continue;
+ // Check that a previous iteration of this loop did not delete the Value.
+ if (hasValueBeenRAUWed(VL, TrackValues, I, OpsWidth))
+ continue;
- DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
- << "\n");
- ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);
-
- ArrayRef<Value *> BuildVectorSlice;
- if (!BuildVector.empty())
- BuildVectorSlice = BuildVector.slice(i, OpsWidth);
-
- R.buildTree(Ops, BuildVectorSlice);
- // TODO: check if we can allow reordering also for other cases than
- // tryToVectorizePair()
- if (allowReorder && R.shouldReorder()) {
- assert(Ops.size() == 2);
- assert(BuildVectorSlice.empty());
- Value *ReorderedOps[] = { Ops[1], Ops[0] };
- R.buildTree(ReorderedOps, None);
- }
- R.computeMinimumValueSizes();
- int Cost = R.getTreeCost();
+ DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
+ << "\n");
+ ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
+
+ ArrayRef<Value *> BuildVectorSlice;
+ if (!BuildVector.empty())
+ BuildVectorSlice = BuildVector.slice(I, OpsWidth);
+
+ R.buildTree(Ops, BuildVectorSlice);
+ // TODO: check if we can allow reordering for more cases.
+ if (AllowReorder && R.shouldReorder()) {
+ // Conceptually, there is nothing actually preventing us from trying to
+ // reorder a larger list. In fact, we do exactly this when vectorizing
+ // reductions. However, at this point, we only expect to get here from
+ // tryToVectorizePair().
+ assert(Ops.size() == 2);
+ assert(BuildVectorSlice.empty());
+ Value *ReorderedOps[] = {Ops[1], Ops[0]};
+ R.buildTree(ReorderedOps, None);
+ }
+ if (R.isTreeTinyAndNotFullyVectorizable())
+ continue;
- if (Cost < -SLPCostThreshold) {
- DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
- Value *VectorizedRoot = R.vectorizeTree();
-
- // Reconstruct the build vector by extracting the vectorized root. This
- // way we handle the case where some elements of the vector are undefined.
- // (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2))
- if (!BuildVectorSlice.empty()) {
- // The insert point is the last build vector instruction. The vectorized
- // root will precede it. This guarantees that we get an instruction. The
- // vectorized tree could have been constant folded.
- Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
- unsigned VecIdx = 0;
- for (auto &V : BuildVectorSlice) {
- IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
- ++BasicBlock::iterator(InsertAfter));
- Instruction *I = cast<Instruction>(V);
- assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
- Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
- VectorizedRoot, Builder.getInt32(VecIdx++)));
- I->setOperand(1, Extract);
- I->removeFromParent();
- I->insertAfter(Extract);
- InsertAfter = I;
+ R.computeMinimumValueSizes();
+ int Cost = R.getTreeCost();
+
+ if (Cost < -SLPCostThreshold) {
+ DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
+ Value *VectorizedRoot = R.vectorizeTree();
+
+ // Reconstruct the build vector by extracting the vectorized root. This
+ // way we handle the case where some elements of the vector are
+ // undefined.
+ // (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2))
+ if (!BuildVectorSlice.empty()) {
+ // The insert point is the last build vector instruction. The
+ // vectorized root will precede it. This guarantees that we get an
+ // instruction. The vectorized tree could have been constant folded.
+ Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
+ unsigned VecIdx = 0;
+ for (auto &V : BuildVectorSlice) {
+ IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
+ ++BasicBlock::iterator(InsertAfter));
+ Instruction *I = cast<Instruction>(V);
+ assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
+ Instruction *Extract =
+ cast<Instruction>(Builder.CreateExtractElement(
+ VectorizedRoot, Builder.getInt32(VecIdx++)));
+ I->setOperand(1, Extract);
+ I->removeFromParent();
+ I->insertAfter(Extract);
+ InsertAfter = I;
+ }
}
+ // Move to the next bundle.
+ I += VF - 1;
+ NextInst = I + 1;
+ Changed = true;
}
- // Move to the next bundle.
- i += VF - 1;
- Changed = true;
}
}
@@ -3911,36 +4030,40 @@ bool SLPVectorizerPass::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
if (!V)
return false;
+ Value *P = V->getParent();
+
+ // Vectorize in current basic block only.
+ auto *Op0 = dyn_cast<Instruction>(V->getOperand(0));
+ auto *Op1 = dyn_cast<Instruction>(V->getOperand(1));
+ if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
+ return false;
+
// Try to vectorize V.
- if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
+ if (tryToVectorizePair(Op0, Op1, R))
return true;
- BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
- BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
+ auto *A = dyn_cast<BinaryOperator>(Op0);
+ auto *B = dyn_cast<BinaryOperator>(Op1);
// Try to skip B.
if (B && B->hasOneUse()) {
- BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
- BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
- if (tryToVectorizePair(A, B0, R)) {
+ auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
+ auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
+ if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R))
return true;
- }
- if (tryToVectorizePair(A, B1, R)) {
+ if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R))
return true;
- }
}
// Try to skip A.
if (A && A->hasOneUse()) {
- BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
- BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
- if (tryToVectorizePair(A0, B, R)) {
+ auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
+ auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
+ if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R))
return true;
- }
- if (tryToVectorizePair(A1, B, R)) {
+ if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R))
return true;
- }
}
- return 0;
+ return false;
}
/// \brief Generate a shuffle mask to be used in a reduction tree.
@@ -3973,7 +4096,7 @@ static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
return ConstantVector::get(ShuffleMask);
}
-
+namespace {
/// Model horizontal reductions.
///
/// A horizontal reduction is a tree of reduction operations (currently add and
@@ -4006,7 +4129,14 @@ class HorizontalReduction {
SmallVector<Value *, 32> ReducedVals;
BinaryOperator *ReductionRoot;
- PHINode *ReductionPHI;
+ // After successfull horizontal reduction vectorization attempt for PHI node
+ // vectorizer tries to update root binary op by combining vectorized tree and
+ // the ReductionPHI node. But during vectorization this ReductionPHI can be
+ // vectorized itself and replaced by the undef value, while the instruction
+ // itself is marked for deletion. This 'marked for deletion' PHI node then can
+ // be used in new binary operation, causing "Use still stuck around after Def
+ // is destroyed" crash upon PHI node deletion.
+ WeakVH ReductionPHI;
/// The opcode of the reduction.
unsigned ReductionOpcode;
@@ -4025,14 +4155,13 @@ public:
unsigned MinVecRegSize;
HorizontalReduction(unsigned MinVecRegSize)
- : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
- ReducedValueOpcode(0), IsPairwiseReduction(false), ReduxWidth(0),
+ : ReductionRoot(nullptr), ReductionOpcode(0), ReducedValueOpcode(0),
+ IsPairwiseReduction(false), ReduxWidth(0),
MinVecRegSize(MinVecRegSize) {}
/// \brief Try to find a reduction tree.
bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
- assert((!Phi ||
- std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
+ assert((!Phi || is_contained(Phi->operands(), B)) &&
"Thi phi needs to use the binary operator");
// We could have a initial reductions that is not an add.
@@ -4113,12 +4242,21 @@ public:
// Visit left or right.
Value *NextV = TreeN->getOperand(EdgeToVist);
- // We currently only allow BinaryOperator's and SelectInst's as reduction
- // values in our tree.
- if (isa<BinaryOperator>(NextV) || isa<SelectInst>(NextV))
- Stack.push_back(std::make_pair(cast<Instruction>(NextV), 0));
- else if (NextV != Phi)
+ if (NextV != Phi) {
+ auto *I = dyn_cast<Instruction>(NextV);
+ // Continue analysis if the next operand is a reduction operation or
+ // (possibly) a reduced value. If the reduced value opcode is not set,
+ // the first met operation != reduction operation is considered as the
+ // reduced value class.
+ if (I && (!ReducedValueOpcode || I->getOpcode() == ReducedValueOpcode ||
+ I->getOpcode() == ReductionOpcode)) {
+ if (!ReducedValueOpcode && I->getOpcode() != ReductionOpcode)
+ ReducedValueOpcode = I->getOpcode();
+ Stack.push_back(std::make_pair(I, 0));
+ continue;
+ }
return false;
+ }
}
return true;
}
@@ -4141,7 +4279,15 @@ public:
unsigned i = 0;
for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
- V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);
+ auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
+ V.buildTree(VL, ReductionOps);
+ if (V.shouldReorder()) {
+ SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend());
+ V.buildTree(Reversed, ReductionOps);
+ }
+ if (V.isTreeTinyAndNotFullyVectorizable())
+ continue;
+
V.computeMinimumValueSizes();
// Estimate cost.
@@ -4175,7 +4321,7 @@ public:
ReducedVals[i]);
}
// Update users.
- if (ReductionPHI) {
+ if (ReductionPHI && !isa<UndefValue>(ReductionPHI)) {
assert(ReductionRoot && "Need a reduction operation");
ReductionRoot->setOperand(0, VectorizedTree);
ReductionRoot->setOperand(1, ReductionPHI);
@@ -4202,7 +4348,8 @@ private:
int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
int ScalarReduxCost =
- ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy);
+ (ReduxWidth - 1) *
+ TTI->getArithmeticInstrCost(ReductionOpcode, ScalarTy);
DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
<< " for reduction that starts with " << *FirstReducedVal
@@ -4254,6 +4401,7 @@ private:
return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
}
};
+} // end anonymous namespace
/// \brief Recognize construction of vectors like
/// %ra = insertelement <4 x float> undef, float %s0, i32 0
@@ -4354,7 +4502,7 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
return nullptr;
// There is a loop latch, return the incoming value if it comes from
- // that. This reduction pattern occassionaly turns up.
+ // that. This reduction pattern occasionally turns up.
if (P->getIncomingBlock(0) == BBLatch) {
Rdx = P->getIncomingValue(0);
} else if (P->getIncomingBlock(1) == BBLatch) {
@@ -4367,29 +4515,143 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
return nullptr;
}
+namespace {
+/// Tracks instructons and its children.
+class WeakVHWithLevel final : public CallbackVH {
+ /// Operand index of the instruction currently beeing analized.
+ unsigned Level = 0;
+ /// Is this the instruction that should be vectorized, or are we now
+ /// processing children (i.e. operands of this instruction) for potential
+ /// vectorization?
+ bool IsInitial = true;
+
+public:
+ explicit WeakVHWithLevel() = default;
+ WeakVHWithLevel(Value *V) : CallbackVH(V){};
+ /// Restart children analysis each time it is repaced by the new instruction.
+ void allUsesReplacedWith(Value *New) override {
+ setValPtr(New);
+ Level = 0;
+ IsInitial = true;
+ }
+ /// Check if the instruction was not deleted during vectorization.
+ bool isValid() const { return !getValPtr(); }
+ /// Is the istruction itself must be vectorized?
+ bool isInitial() const { return IsInitial; }
+ /// Try to vectorize children.
+ void clearInitial() { IsInitial = false; }
+ /// Are all children processed already?
+ bool isFinal() const {
+ assert(getValPtr() &&
+ (isa<Instruction>(getValPtr()) &&
+ cast<Instruction>(getValPtr())->getNumOperands() >= Level));
+ return getValPtr() &&
+ cast<Instruction>(getValPtr())->getNumOperands() == Level;
+ }
+ /// Get next child operation.
+ Value *nextOperand() {
+ assert(getValPtr() && isa<Instruction>(getValPtr()) &&
+ cast<Instruction>(getValPtr())->getNumOperands() > Level);
+ return cast<Instruction>(getValPtr())->getOperand(Level++);
+ }
+ virtual ~WeakVHWithLevel() = default;
+};
+} // namespace
+
/// \brief Attempt to reduce a horizontal reduction.
/// If it is legal to match a horizontal reduction feeding
-/// the phi node P with reduction operators BI, then check if it
-/// can be done.
+/// the phi node P with reduction operators Root in a basic block BB, then check
+/// if it can be done.
/// \returns true if a horizontal reduction was matched and reduced.
/// \returns false if a horizontal reduction was not matched.
-static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI,
- BoUpSLP &R, TargetTransformInfo *TTI,
- unsigned MinRegSize) {
+static bool canBeVectorized(
+ PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
+ TargetTransformInfo *TTI,
+ const function_ref<bool(BinaryOperator *, BoUpSLP &)> Vectorize) {
if (!ShouldVectorizeHor)
return false;
- HorizontalReduction HorRdx(MinRegSize);
- if (!HorRdx.matchAssociativeReduction(P, BI))
+ if (!Root)
return false;
- // If there is a sufficient number of reduction values, reduce
- // to a nearby power-of-2. Can safely generate oversized
- // vectors and rely on the backend to split them to legal sizes.
- HorRdx.ReduxWidth =
- std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues()));
+ if (Root->getParent() != BB)
+ return false;
+ SmallVector<WeakVHWithLevel, 8> Stack(1, Root);
+ SmallSet<Value *, 8> VisitedInstrs;
+ bool Res = false;
+ while (!Stack.empty()) {
+ Value *V = Stack.back();
+ if (!V) {
+ Stack.pop_back();
+ continue;
+ }
+ auto *Inst = dyn_cast<Instruction>(V);
+ if (!Inst || isa<PHINode>(Inst)) {
+ Stack.pop_back();
+ continue;
+ }
+ if (Stack.back().isInitial()) {
+ Stack.back().clearInitial();
+ if (auto *BI = dyn_cast<BinaryOperator>(Inst)) {
+ HorizontalReduction HorRdx(R.getMinVecRegSize());
+ if (HorRdx.matchAssociativeReduction(P, BI)) {
+ // If there is a sufficient number of reduction values, reduce
+ // to a nearby power-of-2. Can safely generate oversized
+ // vectors and rely on the backend to split them to legal sizes.
+ HorRdx.ReduxWidth =
+ std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues()));
+
+ if (HorRdx.tryToReduce(R, TTI)) {
+ Res = true;
+ P = nullptr;
+ continue;
+ }
+ }
+ if (P) {
+ Inst = dyn_cast<Instruction>(BI->getOperand(0));
+ if (Inst == P)
+ Inst = dyn_cast<Instruction>(BI->getOperand(1));
+ if (!Inst) {
+ P = nullptr;
+ continue;
+ }
+ }
+ }
+ P = nullptr;
+ if (Vectorize(dyn_cast<BinaryOperator>(Inst), R)) {
+ Res = true;
+ continue;
+ }
+ }
+ if (Stack.back().isFinal()) {
+ Stack.pop_back();
+ continue;
+ }
- return HorRdx.tryToReduce(R, TTI);
+ if (auto *NextV = dyn_cast<Instruction>(Stack.back().nextOperand()))
+ if (NextV->getParent() == BB && VisitedInstrs.insert(NextV).second &&
+ Stack.size() < RecursionMaxDepth)
+ Stack.push_back(NextV);
+ }
+ return Res;
+}
+
+bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
+ BasicBlock *BB, BoUpSLP &R,
+ TargetTransformInfo *TTI) {
+ if (!V)
+ return false;
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return false;
+
+ if (!isa<BinaryOperator>(I))
+ P = nullptr;
+ // Try to match and vectorize a horizontal reduction.
+ return canBeVectorized(P, I, BB, R, TTI,
+ [this](BinaryOperator *BI, BoUpSLP &R) -> bool {
+ return tryToVectorize(BI, R);
+ });
}
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
@@ -4459,65 +4721,42 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
if (P->getNumIncomingValues() != 2)
return Changed;
- Value *Rdx = getReductionValue(DT, P, BB, LI);
-
- // Check if this is a Binary Operator.
- BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
- if (!BI)
- continue;
-
// Try to match and vectorize a horizontal reduction.
- if (canMatchHorizontalReduction(P, BI, R, TTI, R.getMinVecRegSize())) {
- Changed = true;
- it = BB->begin();
- e = BB->end();
- continue;
- }
-
- Value *Inst = BI->getOperand(0);
- if (Inst == P)
- Inst = BI->getOperand(1);
-
- if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
- // We would like to start over since some instructions are deleted
- // and the iterator may become invalid value.
+ if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
+ TTI)) {
Changed = true;
it = BB->begin();
e = BB->end();
continue;
}
-
continue;
}
- if (ShouldStartVectorizeHorAtStore)
- if (StoreInst *SI = dyn_cast<StoreInst>(it))
- if (BinaryOperator *BinOp =
- dyn_cast<BinaryOperator>(SI->getValueOperand())) {
- if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
- R.getMinVecRegSize()) ||
- tryToVectorize(BinOp, R)) {
- Changed = true;
- it = BB->begin();
- e = BB->end();
- continue;
- }
+ if (ShouldStartVectorizeHorAtStore) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
+ // Try to match and vectorize a horizontal reduction.
+ if (vectorizeRootInstruction(nullptr, SI->getValueOperand(), BB, R,
+ TTI)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
}
+ }
+ }
// Try to vectorize horizontal reductions feeding into a return.
- if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
- if (RI->getNumOperands() != 0)
- if (BinaryOperator *BinOp =
- dyn_cast<BinaryOperator>(RI->getOperand(0))) {
- DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
- if (tryToVectorizePair(BinOp->getOperand(0),
- BinOp->getOperand(1), R)) {
- Changed = true;
- it = BB->begin();
- e = BB->end();
- continue;
- }
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(it)) {
+ if (RI->getNumOperands() != 0) {
+ // Try to match and vectorize a horizontal reduction.
+ if (vectorizeRootInstruction(nullptr, RI->getOperand(0), BB, R, TTI)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
}
+ }
+ }
// Try to vectorize trees that start at compare instructions.
if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
@@ -4530,16 +4769,14 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
continue;
}
- for (int i = 0; i < 2; ++i) {
- if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
- if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
- Changed = true;
- // We would like to start over since some instructions are deleted
- // and the iterator may become invalid value.
- it = BB->begin();
- e = BB->end();
- break;
- }
+ for (int I = 0; I < 2; ++I) {
+ if (vectorizeRootInstruction(nullptr, CI->getOperand(I), BB, R, TTI)) {
+ Changed = true;
+ // We would like to start over since some instructions are deleted
+ // and the iterator may become invalid value.
+ it = BB->begin();
+ e = BB->end();
+ break;
}
}
continue;
@@ -4690,8 +4927,7 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
// may cause a significant compile-time increase.
for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
unsigned Len = std::min<unsigned>(CE - CI, 16);
- Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),
- -SLPCostThreshold, R);
+ Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), R);
}
}
return Changed;