Diffstat (limited to 'llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r--  llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 730
1 file changed, 443 insertions(+), 287 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index aabd974cd73e..5bc35aa4695f 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -47,6 +47,7 @@
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
@@ -85,6 +86,7 @@
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
@@ -107,9 +109,8 @@ using namespace slpvectorizer;
STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
-cl::opt<bool>
- llvm::RunSLPVectorization("vectorize-slp", cl::init(false), cl::Hidden,
- cl::desc("Run the SLP vectorization passes"));
+cl::opt<bool> RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
+ cl::desc("Run the SLP vectorization passes"));
static cl::opt<int>
SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
@@ -284,7 +285,7 @@ static bool isCommutative(Instruction *I) {
static Optional<TargetTransformInfo::ShuffleKind>
isShuffle(ArrayRef<Value *> VL) {
auto *EI0 = cast<ExtractElementInst>(VL[0]);
- unsigned Size = EI0->getVectorOperandType()->getVectorNumElements();
+ unsigned Size = EI0->getVectorOperandType()->getNumElements();
Value *Vec1 = nullptr;
Value *Vec2 = nullptr;
enum ShuffleMode { Unknown, Select, Permute };
@@ -293,7 +294,7 @@ isShuffle(ArrayRef<Value *> VL) {
auto *EI = cast<ExtractElementInst>(VL[I]);
auto *Vec = EI->getVectorOperand();
// All vector operands must have the same number of vector elements.
- if (Vec->getType()->getVectorNumElements() != Size)
+ if (cast<VectorType>(Vec->getType())->getNumElements() != Size)
return None;
auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
if (!Idx)
@@ -377,6 +378,18 @@ static Value *isOneOf(const InstructionsState &S, Value *Op) {
return S.OpValue;
}
+/// \returns true if \p Opcode is allowed as part of the main/alternate
+/// instruction for SLP vectorization.
+///
+/// An example of an unsupported opcode is SDIV, which can potentially cause
+/// UB if a "shuffled out" lane would result in division by zero.
+static bool isValidForAlternation(unsigned Opcode) {
+ if (Instruction::isIntDivRem(Opcode))
+ return false;
+
+ return true;
+}
+
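To make the new helper concrete, here is a minimal sketch (not part of the patch, and assuming the includes and namespace context of this file) of how it classifies opcodes; getSameOpcode() below relies on exactly this distinction:

// Illustrative checks only: integer division/remainder is rejected because a
// "shuffled out" lane would be executed speculatively and could divide by zero.
assert(isValidForAlternation(Instruction::Add));   // add/sub alternation is fine
assert(isValidForAlternation(Instruction::FDiv));  // FP division has no such UB
assert(!isValidForAlternation(Instruction::SDiv)); // speculative lane may be x/0
assert(!isValidForAlternation(Instruction::URem)); // same reasoning for remainders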
/// \returns analysis of the Instructions in \p VL described in
/// InstructionsState, the Opcode that we suppose the whole list
/// could be vectorized even if its structure is diverse.
@@ -399,7 +412,8 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) {
if (InstOpcode == Opcode || InstOpcode == AltOpcode)
continue;
- if (Opcode == AltOpcode) {
+ if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
+ isValidForAlternation(Opcode)) {
AltOpcode = InstOpcode;
AltIndex = Cnt;
continue;
@@ -411,6 +425,9 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
if (InstOpcode == Opcode || InstOpcode == AltOpcode)
continue;
if (Opcode == AltOpcode) {
+ assert(isValidForAlternation(Opcode) &&
+ isValidForAlternation(InstOpcode) &&
+ "Cast isn't safe for alternation, logic needs to be updated!");
AltOpcode = InstOpcode;
AltIndex = Cnt;
continue;
@@ -613,7 +630,7 @@ public:
/// the stored value. Otherwise, the size is the width of the largest loaded
/// value reaching V. This method is used by the vectorizer to calculate
/// vectorization factors.
- unsigned getVectorElementSize(Value *V) const;
+ unsigned getVectorElementSize(Value *V);
/// Compute the minimum type sizes required to represent the entries in a
/// vectorizable tree.
@@ -650,6 +667,15 @@ public:
/// may not be necessary.
bool isLoadCombineReductionCandidate(unsigned ReductionOpcode) const;
+ /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
+ /// can be load combined in the backend. Load combining may not be allowed in
+ /// the IR optimizer, so we do not want to alter the pattern. For example,
+ /// partially transforming a scalar bswap() pattern into vector code is
+ /// effectively impossible for the backend to undo.
+ /// TODO: If load combining is allowed in the IR optimizer, this analysis
+ /// may not be necessary.
+ bool isLoadCombineCandidate() const;
+
OptimizationRemarkEmitter *getORE() { return ORE; }
/// This structure holds any data we need about the edges being traversed
@@ -816,13 +842,12 @@ public:
// Extracts from consecutive indexes of the same vector better score as
// the extracts could be optimized away.
- auto *Ex1 = dyn_cast<ExtractElementInst>(V1);
- auto *Ex2 = dyn_cast<ExtractElementInst>(V2);
- if (Ex1 && Ex2 && Ex1->getVectorOperand() == Ex2->getVectorOperand() &&
- cast<ConstantInt>(Ex1->getIndexOperand())->getZExtValue() + 1 ==
- cast<ConstantInt>(Ex2->getIndexOperand())->getZExtValue()) {
+ Value *EV;
+ ConstantInt *Ex1Idx, *Ex2Idx;
+ if (match(V1, m_ExtractElt(m_Value(EV), m_ConstantInt(Ex1Idx))) &&
+ match(V2, m_ExtractElt(m_Deferred(EV), m_ConstantInt(Ex2Idx))) &&
+ Ex1Idx->getZExtValue() + 1 == Ex2Idx->getZExtValue())
return VLOperands::ScoreConsecutiveExtracts;
- }
auto *I1 = dyn_cast<Instruction>(V1);
auto *I2 = dyn_cast<Instruction>(V2);
@@ -852,7 +877,7 @@ public:
int getExternalUsesCost(const std::pair<Value *, int> &LHS,
const std::pair<Value *, int> &RHS) {
int Cost = 0;
- SmallVector<std::pair<Value *, int>, 2> Values = {LHS, RHS};
+ std::array<std::pair<Value *, int>, 2> Values = {{LHS, RHS}};
for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) {
Value *V = Values[Idx].first;
// Calculate the absolute lane, using the minimum relative lane of LHS
@@ -1385,7 +1410,8 @@ private:
/// \returns the scalarization cost for this type. Scalarization in this
/// context means the creation of vectors from a group of scalars.
- int getGatherCost(Type *Ty, const DenseSet<unsigned> &ShuffledIndices) const;
+ int getGatherCost(VectorType *Ty,
+ const DenseSet<unsigned> &ShuffledIndices) const;
/// \returns the scalarization cost for this list of values. Assuming that
/// this subtree gets vectorized, we may need to extract the values from the
@@ -1422,7 +1448,7 @@ private:
return VL.size() == ReuseShuffleIndices.size() &&
std::equal(
VL.begin(), VL.end(), ReuseShuffleIndices.begin(),
- [this](Value *V, unsigned Idx) { return V == Scalars[Idx]; });
+ [this](Value *V, int Idx) { return V == Scalars[Idx]; });
}
/// A vector of scalars.
@@ -1436,7 +1462,7 @@ private:
EntryState State;
/// Does this sequence require some shuffling?
- SmallVector<unsigned, 4> ReuseShuffleIndices;
+ SmallVector<int, 4> ReuseShuffleIndices;
/// Does this entry require reordering?
ArrayRef<unsigned> ReorderIndices;
@@ -1690,6 +1716,9 @@ private:
/// Maps a specific scalar to its tree entry.
SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry;
+ /// Maps a value to the proposed vectorizable size.
+ SmallDenseMap<Value *, unsigned> InstrElementSize;
+
/// A list of scalars that we found that we need to keep as scalars.
ValueSet MustGather;
@@ -2001,6 +2030,20 @@ private:
if (TreeEntry *TE = BundleMember->TE) {
int Lane = BundleMember->Lane;
assert(Lane >= 0 && "Lane not set");
+
+ // Since the vectorization tree is being built recursively, this assertion
+ // ensures that the tree entry has all of its operands set before reaching
+ // this code. A couple of exceptions known at the moment are extracts,
+ // where the second (immediate) operand is not added. Since immediates do
+ // not affect scheduler behavior, this is considered okay.
+ auto *In = TE->getMainOp();
+ assert(In &&
+ (isa<ExtractValueInst>(In) || isa<ExtractElementInst>(In) ||
+ In->getNumOperands() == TE->getNumOperands()) &&
+ "Missed TreeEntry operands?");
+ (void)In; // fake use to avoid build failure when assertions disabled
+
for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
OpIdx != NumOperands; ++OpIdx)
if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
@@ -2323,6 +2366,7 @@ BoUpSLP::~BoUpSLP() {
"trying to erase instruction with users.");
Pair.getFirst()->eraseFromParent();
}
+ assert(!verifyFunction(*F, &dbgs()));
}
void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) {
@@ -2978,19 +3022,24 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return;
}
case Instruction::Call: {
- // Check if the calls are all to the same vectorizable intrinsic.
+ // Check if the calls are all to the same vectorizable intrinsic or
+ // library function.
CallInst *CI = cast<CallInst>(VL0);
- // Check if this is an Intrinsic call or something that can be
- // represented by an intrinsic call
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- if (!isTriviallyVectorizable(ID)) {
+
+ VFShape Shape = VFShape::get(
+ *CI, {static_cast<unsigned int>(VL.size()), false /*Scalable*/},
+ false /*HasGlobalPred*/);
+ Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+
+ if (!VecFunc && !isTriviallyVectorizable(ID)) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
return;
}
- Function *Int = CI->getCalledFunction();
+ Function *F = CI->getCalledFunction();
unsigned NumArgs = CI->getNumArgOperands();
SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr);
for (unsigned j = 0; j != NumArgs; ++j)
@@ -2998,8 +3047,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
ScalarArgs[j] = CI->getArgOperand(j);
for (Value *V : VL) {
CallInst *CI2 = dyn_cast<CallInst>(V);
- if (!CI2 || CI2->getCalledFunction() != Int ||
+ if (!CI2 || CI2->getCalledFunction() != F ||
getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
+ (VecFunc &&
+ VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
!CI->hasIdenticalOperandBundleSchema(*CI2)) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
@@ -3101,7 +3152,8 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
unsigned N = 1;
Type *EltTy = T;
- while (isa<CompositeType>(EltTy)) {
+ while (isa<StructType>(EltTy) || isa<ArrayType>(EltTy) ||
+ isa<VectorType>(EltTy)) {
if (auto *ST = dyn_cast<StructType>(EltTy)) {
// Check that struct is homogeneous.
for (const auto *Ty : ST->elements())
@@ -3109,16 +3161,19 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
return 0;
N *= ST->getNumElements();
EltTy = *ST->element_begin();
+ } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
+ N *= AT->getNumElements();
+ EltTy = AT->getElementType();
} else {
- auto *SeqT = cast<SequentialType>(EltTy);
- N *= SeqT->getNumElements();
- EltTy = SeqT->getElementType();
+ auto *VT = cast<VectorType>(EltTy);
+ N *= VT->getNumElements();
+ EltTy = VT->getElementType();
}
}
if (!isValidElementType(EltTy))
return 0;
- uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N));
+ uint64_t VTSize = DL.getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N));
if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T))
return 0;
return N;
@@ -3148,7 +3203,7 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
return false;
} else {
- NElts = Vec->getType()->getVectorNumElements();
+ NElts = cast<VectorType>(Vec->getType())->getNumElements();
}
if (NElts != VL.size())
@@ -3198,6 +3253,35 @@ bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
});
}
+static std::pair<unsigned, unsigned>
+getVectorCallCosts(CallInst *CI, VectorType *VecTy, TargetTransformInfo *TTI,
+ TargetLibraryInfo *TLI) {
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+ // Calculate the cost of the scalar and vector calls.
+ IntrinsicCostAttributes CostAttrs(ID, *CI, VecTy->getNumElements());
+ int IntrinsicCost =
+ TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
+
+ auto Shape =
+ VFShape::get(*CI, {static_cast<unsigned>(VecTy->getNumElements()), false},
+ false /*HasGlobalPred*/);
+ Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+ int LibCost = IntrinsicCost;
+ if (!CI->isNoBuiltin() && VecFunc) {
+ // Calculate the cost of the vector library call.
+ SmallVector<Type *, 4> VecTys;
+ for (Use &Arg : CI->args())
+ VecTys.push_back(
+ FixedVectorType::get(Arg->getType(), VecTy->getNumElements()));
+
+ // If the corresponding vector call is cheaper, return its cost.
+ LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys,
+ TTI::TCK_RecipThroughput);
+ }
+ return {IntrinsicCost, LibCost};
+}
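A minimal usage sketch (assuming the surrounding file's declarations; not itself part of the patch) of how this pair is consumed later in the patch, both when costing a call bundle in getEntryCost() and when deciding which callee to emit in vectorizeTree():

// first = cost of the vector intrinsic, second = cost of the vector library
// call (falls back to the intrinsic cost when no TLI mapping exists).
auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
int VecCallCost = std::min(VecCallCosts.first, VecCallCosts.second);
bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
                    VecCallCosts.first <= VecCallCosts.second;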
+
int BoUpSLP::getEntryCost(TreeEntry *E) {
ArrayRef<Value*> VL = E->Scalars;
@@ -3206,12 +3290,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
ScalarTy = SI->getValueOperand()->getType();
else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
ScalarTy = CI->getOperand(0)->getType();
- VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+ auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
// If we have computed a smaller type for the expression, update VecTy so
// that the costs will be accurate.
if (MinBWs.count(VL[0]))
- VecTy = VectorType::get(
+ VecTy = FixedVectorType::get(
IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
@@ -3251,6 +3336,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
}
return ReuseShuffleCost + getGatherCost(VL);
}
+ assert(E->State == TreeEntry::Vectorize && "Unhandled state");
assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
Instruction *VL0 = E->getMainOp();
unsigned ShuffleOrOp =
@@ -3260,7 +3346,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
return 0;
case Instruction::ExtractValue:
- case Instruction::ExtractElement:
+ case Instruction::ExtractElement: {
if (NeedToShuffleReuses) {
unsigned Idx = 0;
for (unsigned I : E->ReuseShuffleIndices) {
@@ -3289,43 +3375,41 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
}
}
- if (E->State == TreeEntry::Vectorize) {
- int DeadCost = ReuseShuffleCost;
- if (!E->ReorderIndices.empty()) {
- // TODO: Merge this shuffle with the ReuseShuffleCost.
- DeadCost += TTI->getShuffleCost(
- TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
- }
- for (unsigned i = 0, e = VL.size(); i < e; ++i) {
- Instruction *E = cast<Instruction>(VL[i]);
- // If all users are going to be vectorized, instruction can be
- // considered as dead.
- // The same, if have only one user, it will be vectorized for sure.
- if (areAllUsersVectorized(E)) {
- // Take credit for instruction that will become dead.
- if (E->hasOneUse()) {
- Instruction *Ext = E->user_back();
- if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
- all_of(Ext->users(),
- [](User *U) { return isa<GetElementPtrInst>(U); })) {
- // Use getExtractWithExtendCost() to calculate the cost of
- // extractelement/ext pair.
- DeadCost -= TTI->getExtractWithExtendCost(
- Ext->getOpcode(), Ext->getType(), VecTy, i);
- // Add back the cost of s|zext which is subtracted separately.
- DeadCost += TTI->getCastInstrCost(
- Ext->getOpcode(), Ext->getType(), E->getType(), Ext);
- continue;
- }
+ int DeadCost = ReuseShuffleCost;
+ if (!E->ReorderIndices.empty()) {
+ // TODO: Merge this shuffle with the ReuseShuffleCost.
+ DeadCost += TTI->getShuffleCost(
+ TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+ }
+ for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+ Instruction *E = cast<Instruction>(VL[i]);
+ // If all users are going to be vectorized, instruction can be
+ // considered as dead.
+ // The same, if have only one user, it will be vectorized for sure.
+ if (areAllUsersVectorized(E)) {
+ // Take credit for instruction that will become dead.
+ if (E->hasOneUse()) {
+ Instruction *Ext = E->user_back();
+ if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
+ all_of(Ext->users(),
+ [](User *U) { return isa<GetElementPtrInst>(U); })) {
+ // Use getExtractWithExtendCost() to calculate the cost of
+ // extractelement/ext pair.
+ DeadCost -= TTI->getExtractWithExtendCost(
+ Ext->getOpcode(), Ext->getType(), VecTy, i);
+ // Add back the cost of s|zext which is subtracted separately.
+ DeadCost += TTI->getCastInstrCost(
+ Ext->getOpcode(), Ext->getType(), E->getType(), CostKind,
+ Ext);
+ continue;
}
- DeadCost -=
- TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
}
+ DeadCost -=
+ TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
}
- return DeadCost;
}
- return ReuseShuffleCost + getGatherCost(VL);
-
+ return DeadCost;
+ }
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
@@ -3340,7 +3424,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
case Instruction::BitCast: {
Type *SrcTy = VL0->getOperand(0)->getType();
int ScalarEltCost =
- TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, VL0);
+ TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, CostKind,
+ VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
@@ -3348,12 +3433,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
// Calculate the cost of this instruction.
int ScalarCost = VL.size() * ScalarEltCost;
- VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
+ auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size());
int VecCost = 0;
// Check if the values are candidates to demote.
if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
VecCost = ReuseShuffleCost +
- TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy, VL0);
+ TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy,
+ CostKind, VL0);
}
return VecCost - ScalarCost;
}
@@ -3362,13 +3448,15 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
case Instruction::Select: {
// Calculate the cost of this instruction.
int ScalarEltCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
- Builder.getInt1Ty(), VL0);
+ Builder.getInt1Ty(),
+ CostKind, VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
- VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
+ auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
- int VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VL0);
+ int VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
+ CostKind, VL0);
return ReuseShuffleCost + VecCost - ScalarCost;
}
case Instruction::FNeg:
@@ -3429,13 +3517,15 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
SmallVector<const Value *, 4> Operands(VL0->operand_values());
int ScalarEltCost = TTI->getArithmeticInstrCost(
- E->getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0);
+ E->getOpcode(), ScalarTy, CostKind, Op1VK, Op2VK, Op1VP, Op2VP,
+ Operands, VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
int VecCost = TTI->getArithmeticInstrCost(
- E->getOpcode(), VecTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0);
+ E->getOpcode(), VecTy, CostKind, Op1VK, Op2VK, Op1VP, Op2VP,
+ Operands, VL0);
return ReuseShuffleCost + VecCost - ScalarCost;
}
case Instruction::GetElementPtr: {
@@ -3445,26 +3535,30 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
TargetTransformInfo::OK_UniformConstantValue;
int ScalarEltCost =
- TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
+ TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, CostKind,
+ Op1VK, Op2VK);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
int VecCost =
- TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
+ TTI->getArithmeticInstrCost(Instruction::Add, VecTy, CostKind,
+ Op1VK, Op2VK);
return ReuseShuffleCost + VecCost - ScalarCost;
}
case Instruction::Load: {
// Cost of wide load - cost of scalar loads.
- MaybeAlign alignment(cast<LoadInst>(VL0)->getAlignment());
+ Align alignment = cast<LoadInst>(VL0)->getAlign();
int ScalarEltCost =
- TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
+ TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0,
+ CostKind, VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
int VecLdCost =
- TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, VL0);
+ TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0,
+ CostKind, VL0);
if (!E->ReorderIndices.empty()) {
// TODO: Merge this shuffle with the ReuseShuffleCost.
VecLdCost += TTI->getShuffleCost(
@@ -3477,14 +3571,15 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
bool IsReorder = !E->ReorderIndices.empty();
auto *SI =
cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
- MaybeAlign Alignment(SI->getAlignment());
+ Align Alignment = SI->getAlign();
int ScalarEltCost =
- TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0, VL0);
+ TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0,
+ CostKind, VL0);
if (NeedToShuffleReuses)
ReuseShuffleCost = -(ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
int ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
- VecTy, Alignment, 0, VL0);
+ VecTy, Alignment, 0, CostKind, VL0);
if (IsReorder) {
// TODO: Merge this shuffle with the ReuseShuffleCost.
VecStCost += TTI->getShuffleCost(
@@ -3497,24 +3592,15 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// Calculate the cost of the scalar and vector calls.
- SmallVector<Type *, 4> ScalarTys;
- for (unsigned op = 0, opc = CI->getNumArgOperands(); op != opc; ++op)
- ScalarTys.push_back(CI->getArgOperand(op)->getType());
-
- FastMathFlags FMF;
- if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
- FMF = FPMO->getFastMathFlags();
-
- int ScalarEltCost =
- TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
+ IntrinsicCostAttributes CostAttrs(ID, *CI, 1, 1);
+ int ScalarEltCost = TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
int ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
- SmallVector<Value *, 4> Args(CI->arg_operands());
- int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF,
- VecTy->getNumElements());
+ auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
+ int VecCallCost = std::min(VecCallCosts.first, VecCallCosts.second);
LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
<< " (" << VecCallCost << "-" << ScalarCallCost << ")"
@@ -3533,34 +3619,34 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
if (NeedToShuffleReuses) {
for (unsigned Idx : E->ReuseShuffleIndices) {
Instruction *I = cast<Instruction>(VL[Idx]);
- ReuseShuffleCost -= TTI->getInstructionCost(
- I, TargetTransformInfo::TCK_RecipThroughput);
+ ReuseShuffleCost -= TTI->getInstructionCost(I, CostKind);
}
for (Value *V : VL) {
Instruction *I = cast<Instruction>(V);
- ReuseShuffleCost += TTI->getInstructionCost(
- I, TargetTransformInfo::TCK_RecipThroughput);
+ ReuseShuffleCost += TTI->getInstructionCost(I, CostKind);
}
}
for (Value *V : VL) {
Instruction *I = cast<Instruction>(V);
assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
- ScalarCost += TTI->getInstructionCost(
- I, TargetTransformInfo::TCK_RecipThroughput);
+ ScalarCost += TTI->getInstructionCost(I, CostKind);
}
// VecCost is equal to sum of the cost of creating 2 vectors
// and the cost of creating shuffle.
int VecCost = 0;
if (Instruction::isBinaryOp(E->getOpcode())) {
- VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy);
- VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy);
+ VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
+ VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy,
+ CostKind);
} else {
Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType();
Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType();
- VectorType *Src0Ty = VectorType::get(Src0SclTy, VL.size());
- VectorType *Src1Ty = VectorType::get(Src1SclTy, VL.size());
- VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty);
- VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty);
+ auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size());
+ auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size());
+ VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty,
+ CostKind);
+ VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,
+ CostKind);
}
VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0);
return ReuseShuffleCost + VecCost - ScalarCost;
@@ -3596,24 +3682,20 @@ bool BoUpSLP::isFullyVectorizableTinyTree() const {
return true;
}
-bool BoUpSLP::isLoadCombineReductionCandidate(unsigned RdxOpcode) const {
- if (RdxOpcode != Instruction::Or)
- return false;
-
- unsigned NumElts = VectorizableTree[0]->Scalars.size();
- Value *FirstReduced = VectorizableTree[0]->Scalars[0];
-
- // Look past the reduction to find a source value. Arbitrarily follow the
+static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
+ TargetTransformInfo *TTI) {
+ // Look past the root to find a source value. Arbitrarily follow the
// path through operand 0 of any 'or'. Also, peek through optional
// shift-left-by-constant.
- Value *ZextLoad = FirstReduced;
- while (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
- match(ZextLoad, m_Shl(m_Value(), m_Constant())))
+ Value *ZextLoad = Root;
+ while (!isa<ConstantExpr>(ZextLoad) &&
+ (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
+ match(ZextLoad, m_Shl(m_Value(), m_Constant()))))
ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0);
- // Check if the input to the reduction is an extended load.
+ // Check if the input is an extended load of the required or/shift expression.
Value *LoadPtr;
- if (!match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
+ if (ZextLoad == Root || !match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
return false;
// Require that the total load bit width is a legal integer type.
@@ -3621,15 +3703,36 @@ bool BoUpSLP::isLoadCombineReductionCandidate(unsigned RdxOpcode) const {
// But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
Type *SrcTy = LoadPtr->getType()->getPointerElementType();
unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
- LLVMContext &Context = FirstReduced->getContext();
- if (!TTI->isTypeLegal(IntegerType::get(Context, LoadBitWidth)))
+ if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
return false;
// Everything matched - assume that we can fold the whole sequence using
// load combining.
- LLVM_DEBUG(dbgs() << "SLP: Assume load combining for scalar reduction of "
- << *(cast<Instruction>(FirstReduced)) << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
+ << *(cast<Instruction>(Root)) << "\n");
+
+ return true;
+}
+
+bool BoUpSLP::isLoadCombineReductionCandidate(unsigned RdxOpcode) const {
+ if (RdxOpcode != Instruction::Or)
+ return false;
+ unsigned NumElts = VectorizableTree[0]->Scalars.size();
+ Value *FirstReduced = VectorizableTree[0]->Scalars[0];
+ return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI);
+}
+
+bool BoUpSLP::isLoadCombineCandidate() const {
+ // Peek through a final sequence of stores and check if all operations are
+ // likely to be load-combined.
+ unsigned NumElts = VectorizableTree[0]->Scalars.size();
+ for (Value *Scalar : VectorizableTree[0]->Scalars) {
+ Value *X;
+ if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
+ !isLoadCombineCandidateImpl(X, NumElts, TTI))
+ return false;
+ }
return true;
}
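For reference, a hedged C++ illustration (the function below is hypothetical, not taken from LLVM, and assumes <cstdint>) of the scalar shape isLoadCombineCandidateImpl() is guarding: an or/shl chain over zero-extended loads that the backend can fold into a single wide load, which is why SLP declines to vectorize it:

// Little-endian 32-bit value assembled from byte loads; every term is a
// zext(load), optionally shifted left by a constant, combined with 'or'.
uint32_t load_le32(const uint8_t *P) {
  return (uint32_t)P[0] | ((uint32_t)P[1] << 8) |
         ((uint32_t)P[2] << 16) | ((uint32_t)P[3] << 24);
}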
@@ -3712,7 +3815,7 @@ int BoUpSLP::getSpillCost() const {
if (NumCalls) {
SmallVector<Type*, 4> V;
for (auto *II : LiveValues)
- V.push_back(VectorType::get(II->getType(), BundleWidth));
+ V.push_back(FixedVectorType::get(II->getType(), BundleWidth));
Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
}
@@ -3776,13 +3879,13 @@ int BoUpSLP::getTreeCost() {
// If we plan to rewrite the tree in a smaller type, we will need to sign
// extend the extracted value back to the original type. Here, we account
// for the extract and the added cost of the sign extend if needed.
- auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
+ auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);
auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
if (MinBWs.count(ScalarRoot)) {
auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
auto Extend =
MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
- VecTy = VectorType::get(MinTy, BundleWidth);
+ VecTy = FixedVectorType::get(MinTy, BundleWidth);
ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
VecTy, EU.Lane);
} else {
@@ -3809,12 +3912,15 @@ int BoUpSLP::getTreeCost() {
return Cost;
}
-int BoUpSLP::getGatherCost(Type *Ty,
+int BoUpSLP::getGatherCost(VectorType *Ty,
const DenseSet<unsigned> &ShuffledIndices) const {
- int Cost = 0;
- for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
+ unsigned NumElts = Ty->getNumElements();
+ APInt DemandedElts = APInt::getNullValue(NumElts);
+ for (unsigned i = 0; i < NumElts; ++i)
if (!ShuffledIndices.count(i))
- Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+ DemandedElts.setBit(i);
+ int Cost = TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true,
+ /*Extract*/ false);
if (!ShuffledIndices.empty())
Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
return Cost;
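A worked example (values assumed) of the DemandedElts logic above: gathering a 4-element vector where only index 2 was seen before (ShuffledIndices = {2}) charges insert cost for the three remaining lanes plus one single-source permute:

// NumElts = 4, ShuffledIndices = {2}  =>  DemandedElts = 0b1011 (lanes 0, 1, 3)
// Cost = getScalarizationOverhead(Ty, 0b1011, /*Insert=*/true, /*Extract=*/false)
//        + getShuffleCost(SK_PermuteSingleSrc, Ty)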
@@ -3825,7 +3931,7 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
ScalarTy = SI->getValueOperand()->getType();
- VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+ auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
// Find the cost of inserting/extracting values from the vector.
// Check if the same elements are inserted several times and count them as
// shuffle candidates.
@@ -3965,9 +4071,9 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
V = SV->getOperand(0);
} else {
// Reshuffle to get only unique values.
- SmallVector<unsigned, 4> UniqueIdxs;
- SmallSet<unsigned, 4> UsedIdxs;
- for(unsigned Idx : E->ReuseShuffleIndices)
+ SmallVector<int, 4> UniqueIdxs;
+ SmallSet<int, 4> UsedIdxs;
+ for (int Idx : E->ReuseShuffleIndices)
if (UsedIdxs.insert(Idx).second)
UniqueIdxs.emplace_back(Idx);
V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
@@ -3984,7 +4090,7 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
ScalarTy = SI->getValueOperand()->getType();
// Check that every instruction appears once in this bundle.
- SmallVector<unsigned, 4> ReuseShuffleIndicies;
+ SmallVector<int, 4> ReuseShuffleIndicies;
SmallVector<Value *, 4> UniqueValues;
if (VL.size() > 2) {
DenseMap<Value *, unsigned> UniquePositions;
@@ -4002,7 +4108,7 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
else
VL = UniqueValues;
}
- VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+ auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
Value *V = Gather(VL, VecTy);
if (!ReuseShuffleIndicies.empty()) {
@@ -4017,7 +4123,7 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
}
static void inversePermutation(ArrayRef<unsigned> Indices,
- SmallVectorImpl<unsigned> &Mask) {
+ SmallVectorImpl<int> &Mask) {
Mask.clear();
const unsigned E = Indices.size();
Mask.resize(E);
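A small worked example, assuming the usual inverse-permutation loop body Mask[Indices[I]] = I (elided from this hunk): the mask produced for ReorderIndices = {2, 0, 1} undoes that reordering when passed to CreateShuffleVector:

// Indices = {2, 0, 1}  =>  Mask[2] = 0, Mask[0] = 1, Mask[1] = 2
// Mask    = {1, 2, 0}   // shuffling with this mask restores the original order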
@@ -4037,7 +4143,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Type *ScalarTy = VL0->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
ScalarTy = SI->getValueOperand()->getType();
- VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
+ auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
@@ -4056,6 +4162,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return V;
}
+ assert(E->State == TreeEntry::Vectorize && "Unhandled state");
unsigned ShuffleOrOp =
E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
switch (ShuffleOrOp) {
@@ -4096,72 +4203,45 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
case Instruction::ExtractElement: {
- if (E->State == TreeEntry::Vectorize) {
- Value *V = E->getSingleOperand(0);
- if (!E->ReorderIndices.empty()) {
- OrdersType Mask;
- inversePermutation(E->ReorderIndices, Mask);
- Builder.SetInsertPoint(VL0);
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), Mask,
- "reorder_shuffle");
- }
- if (NeedToShuffleReuses) {
- // TODO: Merge this shuffle with the ReorderShuffleMask.
- if (E->ReorderIndices.empty())
- Builder.SetInsertPoint(VL0);
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
- E->ReuseShuffleIndices, "shuffle");
- }
- E->VectorizedValue = V;
- return V;
+ Value *V = E->getSingleOperand(0);
+ if (!E->ReorderIndices.empty()) {
+ SmallVector<int, 4> Mask;
+ inversePermutation(E->ReorderIndices, Mask);
+ Builder.SetInsertPoint(VL0);
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), Mask,
+ "reorder_shuffle");
}
- setInsertPointAfterBundle(E);
- auto *V = Gather(E->Scalars, VecTy);
if (NeedToShuffleReuses) {
+ // TODO: Merge this shuffle with the ReorderShuffleMask.
+ if (E->ReorderIndices.empty())
+ Builder.SetInsertPoint(VL0);
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
- if (auto *I = dyn_cast<Instruction>(V)) {
- GatherSeq.insert(I);
- CSEBlocks.insert(I->getParent());
- }
}
E->VectorizedValue = V;
return V;
}
case Instruction::ExtractValue: {
- if (E->State == TreeEntry::Vectorize) {
- LoadInst *LI = cast<LoadInst>(E->getSingleOperand(0));
- Builder.SetInsertPoint(LI);
- PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
- Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
- LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlignment());
- Value *NewV = propagateMetadata(V, E->Scalars);
- if (!E->ReorderIndices.empty()) {
- OrdersType Mask;
- inversePermutation(E->ReorderIndices, Mask);
- NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), Mask,
- "reorder_shuffle");
- }
- if (NeedToShuffleReuses) {
- // TODO: Merge this shuffle with the ReorderShuffleMask.
- NewV = Builder.CreateShuffleVector(
- NewV, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle");
- }
- E->VectorizedValue = NewV;
- return NewV;
+ LoadInst *LI = cast<LoadInst>(E->getSingleOperand(0));
+ Builder.SetInsertPoint(LI);
+ PointerType *PtrTy =
+ PointerType::get(VecTy, LI->getPointerAddressSpace());
+ Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
+ LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
+ Value *NewV = propagateMetadata(V, E->Scalars);
+ if (!E->ReorderIndices.empty()) {
+ SmallVector<int, 4> Mask;
+ inversePermutation(E->ReorderIndices, Mask);
+ NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), Mask,
+ "reorder_shuffle");
}
- setInsertPointAfterBundle(E);
- auto *V = Gather(E->Scalars, VecTy);
if (NeedToShuffleReuses) {
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
- E->ReuseShuffleIndices, "shuffle");
- if (auto *I = dyn_cast<Instruction>(V)) {
- GatherSeq.insert(I);
- CSEBlocks.insert(I->getParent());
- }
+ // TODO: Merge this shuffle with the ReorderShuffleMask.
+ NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
}
- E->VectorizedValue = V;
- return V;
+ E->VectorizedValue = NewV;
+ return NewV;
}
case Instruction::ZExt:
case Instruction::SExt:
@@ -4207,12 +4287,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
- Value *V;
- if (E->getOpcode() == Instruction::FCmp)
- V = Builder.CreateFCmp(P0, L, R);
- else
- V = Builder.CreateICmp(P0, L, R);
-
+ Value *V = Builder.CreateCmp(P0, L, R);
propagateIRFlags(V, E->Scalars, VL0);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
@@ -4321,7 +4396,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
setInsertPointAfterBundle(E);
LoadInst *LI = cast<LoadInst>(VL0);
- Type *ScalarLoadTy = LI->getType();
unsigned AS = LI->getPointerAddressSpace();
Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
@@ -4334,14 +4408,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (getTreeEntry(PO))
ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));
- MaybeAlign Alignment = MaybeAlign(LI->getAlignment());
- LI = Builder.CreateLoad(VecTy, VecPtr);
- if (!Alignment)
- Alignment = MaybeAlign(DL->getABITypeAlignment(ScalarLoadTy));
- LI->setAlignment(Alignment);
+ LI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign());
Value *V = propagateMetadata(LI, E->Scalars);
if (IsReorder) {
- OrdersType Mask;
+ SmallVector<int, 4> Mask;
inversePermutation(E->ReorderIndices, Mask);
V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
Mask, "reorder_shuffle");
@@ -4359,23 +4429,23 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
bool IsReorder = !E->ReorderIndices.empty();
auto *SI = cast<StoreInst>(
IsReorder ? E->Scalars[E->ReorderIndices.front()] : VL0);
- unsigned Alignment = SI->getAlignment();
unsigned AS = SI->getPointerAddressSpace();
setInsertPointAfterBundle(E);
Value *VecValue = vectorizeTree(E->getOperand(0));
if (IsReorder) {
- OrdersType Mask;
- inversePermutation(E->ReorderIndices, Mask);
+ SmallVector<int, 4> Mask(E->ReorderIndices.begin(),
+ E->ReorderIndices.end());
VecValue = Builder.CreateShuffleVector(
- VecValue, UndefValue::get(VecValue->getType()), E->ReorderIndices,
+ VecValue, UndefValue::get(VecValue->getType()), Mask,
"reorder_shuffle");
}
Value *ScalarPtr = SI->getPointerOperand();
Value *VecPtr = Builder.CreateBitCast(
ScalarPtr, VecValue->getType()->getPointerTo(AS));
- StoreInst *ST = Builder.CreateStore(VecValue, VecPtr);
+ StoreInst *ST = Builder.CreateAlignedStore(VecValue, VecPtr,
+ SI->getAlign());
// The pointer operand uses an in-tree scalar, so add the new BitCast to
// ExternalUses to make sure that an extract will be generated in the
@@ -4383,10 +4453,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (getTreeEntry(ScalarPtr))
ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0));
- if (!Alignment)
- Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
-
- ST->setAlignment(Align(Alignment));
Value *V = propagateMetadata(ST, E->Scalars);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
@@ -4445,13 +4511,19 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (Function *FI = CI->getCalledFunction())
IID = FI->getIntrinsicID();
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+ auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
+ bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
+ VecCallCosts.first <= VecCallCosts.second;
+
Value *ScalarArg = nullptr;
std::vector<Value *> OpVecs;
for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
ValueList OpVL;
// Some intrinsics have scalar arguments. This argument should not be
// vectorized.
- if (hasVectorInstrinsicScalarOpd(IID, j)) {
+ if (UseIntrinsic && hasVectorInstrinsicScalarOpd(IID, j)) {
CallInst *CEI = cast<CallInst>(VL0);
ScalarArg = CEI->getArgOperand(j);
OpVecs.push_back(CEI->getArgOperand(j));
@@ -4463,10 +4535,17 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
OpVecs.push_back(OpVec);
}
- Module *M = F->getParent();
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
- Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
+ Function *CF;
+ if (!UseIntrinsic) {
+ VFShape Shape = VFShape::get(
+ *CI, {static_cast<unsigned>(VecTy->getNumElements()), false},
+ false /*HasGlobalPred*/);
+ CF = VFDatabase(*CI).getVectorizedFunction(Shape);
+ } else {
+ Type *Tys[] = {FixedVectorType::get(CI->getType(), E->Scalars.size())};
+ CF = Intrinsic::getDeclaration(F->getParent(), ID, Tys);
+ }
+
SmallVector<OperandBundleDef, 1> OpBundles;
CI->getOperandBundlesAsDefs(OpBundles);
Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
@@ -4527,24 +4606,23 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
// each vector operation.
ValueList OpScalars, AltScalars;
unsigned e = E->Scalars.size();
- SmallVector<Constant *, 8> Mask(e);
+ SmallVector<int, 8> Mask(e);
for (unsigned i = 0; i < e; ++i) {
auto *OpInst = cast<Instruction>(E->Scalars[i]);
assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode");
if (OpInst->getOpcode() == E->getAltOpcode()) {
- Mask[i] = Builder.getInt32(e + i);
+ Mask[i] = e + i;
AltScalars.push_back(E->Scalars[i]);
} else {
- Mask[i] = Builder.getInt32(i);
+ Mask[i] = i;
OpScalars.push_back(E->Scalars[i]);
}
}
- Value *ShuffleMask = ConstantVector::get(Mask);
propagateIRFlags(V0, OpScalars);
propagateIRFlags(V1, AltScalars);
- Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
+ Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
if (Instruction *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
if (NeedToShuffleReuses) {
@@ -4586,7 +4664,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
auto BundleWidth = VectorizableTree[0]->Scalars.size();
auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
- auto *VecTy = VectorType::get(MinTy, BundleWidth);
+ auto *VecTy = FixedVectorType::get(MinTy, BundleWidth);
auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
VectorizableTree[0]->VectorizedValue = Trunc;
}
@@ -4715,6 +4793,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
}
Builder.ClearInsertionPoint();
+ InstrElementSize.clear();
return VectorizableTree[0]->VectorizedValue;
}
@@ -5251,20 +5330,26 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
BS->ScheduleStart = nullptr;
}
-unsigned BoUpSLP::getVectorElementSize(Value *V) const {
+unsigned BoUpSLP::getVectorElementSize(Value *V) {
// If V is a store, just return the width of the stored value without
// traversing the expression tree. This is the common case.
if (auto *Store = dyn_cast<StoreInst>(V))
return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
+ auto E = InstrElementSize.find(V);
+ if (E != InstrElementSize.end())
+ return E->second;
+
// If V is not a store, we can traverse the expression tree to find loads
// that feed it. The type of the loaded value may indicate a more suitable
// width than V's type. We want to base the vector element size on the width
// of memory operations where possible.
SmallVector<Instruction *, 16> Worklist;
SmallPtrSet<Instruction *, 16> Visited;
- if (auto *I = dyn_cast<Instruction>(V))
+ if (auto *I = dyn_cast<Instruction>(V)) {
Worklist.push_back(I);
+ Visited.insert(I);
+ }
// Traverse the expression tree in bottom-up order looking for loads. If we
// encounter an instruction we don't yet handle, we give up.
@@ -5272,7 +5357,6 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) const {
auto FoundUnknownInst = false;
while (!Worklist.empty() && !FoundUnknownInst) {
auto *I = Worklist.pop_back_val();
- Visited.insert(I);
// We should only be looking at scalar instructions here. If the current
// instruction has a vector type, give up.
@@ -5292,7 +5376,7 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) const {
isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
for (Use &U : I->operands())
if (auto *J = dyn_cast<Instruction>(U.get()))
- if (!Visited.count(J))
+ if (Visited.insert(J).second)
Worklist.push_back(J);
}
@@ -5301,13 +5385,17 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) const {
FoundUnknownInst = true;
}
+ int Width = MaxWidth;
// If we didn't encounter a memory access in the expression tree, or if we
- // gave up for some reason, just return the width of V.
+ // gave up for some reason, just return the width of V. Otherwise, return the
+ // maximum width we found.
if (!MaxWidth || FoundUnknownInst)
- return DL->getTypeSizeInBits(V->getType());
+ Width = DL->getTypeSizeInBits(V->getType());
- // Otherwise, return the maximum width we found.
- return MaxWidth;
+ for (Instruction *I : Visited)
+ InstrElementSize[I] = Width;
+
+ return Width;
}
// Determine if a value V in a vectorizable expression Expr can be demoted to a
@@ -5560,6 +5648,7 @@ struct SLPVectorizer : public FunctionPass {
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<DemandedBitsWrapperPass>();
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addRequired<InjectTLIMappingsLegacy>();
AU.addPreserved<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
@@ -5598,6 +5687,8 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
LoopInfo *LI_, DominatorTree *DT_,
AssumptionCache *AC_, DemandedBits *DB_,
OptimizationRemarkEmitter *ORE_) {
+ if (!RunSLPVectorization)
+ return false;
SE = SE_;
TTI = TTI_;
TLI = TLI_;
@@ -5657,7 +5748,6 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
if (Changed) {
R.optimizeGatherSequence();
LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
- LLVM_DEBUG(verifyFunction(F));
}
return Changed;
}
@@ -5688,6 +5778,8 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
}
if (R.isTreeTinyAndNotFullyVectorizable())
return false;
+ if (R.isLoadCombineCandidate())
+ return false;
R.computeMinimumValueSizes();
@@ -5841,37 +5933,28 @@ void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
if (!A || !B)
return false;
- Value *VL[] = { A, B };
- return tryToVectorizeList(VL, R, /*UserCost=*/0, true);
+ Value *VL[] = {A, B};
+ return tryToVectorizeList(VL, R, /*AllowReorder=*/true);
}
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
- int UserCost, bool AllowReorder) {
+ bool AllowReorder,
+ ArrayRef<Value *> InsertUses) {
if (VL.size() < 2)
return false;
LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
<< VL.size() << ".\n");
- // Check that all of the parts are scalar instructions of the same type,
+ // Check that all of the parts are instructions of the same type,
// we permit an alternate opcode via InstructionsState.
InstructionsState S = getSameOpcode(VL);
if (!S.getOpcode())
return false;
Instruction *I0 = cast<Instruction>(S.OpValue);
- unsigned Sz = R.getVectorElementSize(I0);
- unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
- unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
- if (MaxVF < 2) {
- R.getORE()->emit([&]() {
- return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
- << "Cannot SLP vectorize list: vectorization factor "
- << "less than 2 is not supported";
- });
- return false;
- }
-
+ // Make sure invalid types (including vector type) are rejected before
+ // determining vectorization factor for scalar instructions.
for (Value *V : VL) {
Type *Ty = V->getType();
if (!isValidElementType(Ty)) {
@@ -5889,16 +5972,35 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
}
}
+ unsigned Sz = R.getVectorElementSize(I0);
+ unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
+ unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
+ if (MaxVF < 2) {
+ R.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
+ << "Cannot SLP vectorize list: vectorization factor "
+ << "less than 2 is not supported";
+ });
+ return false;
+ }
+
bool Changed = false;
bool CandidateFound = false;
int MinCost = SLPCostThreshold;
+ bool CompensateUseCost =
+ !InsertUses.empty() && llvm::all_of(InsertUses, [](const Value *V) {
+ return V && isa<InsertElementInst>(V);
+ });
+ assert((!CompensateUseCost || InsertUses.size() == VL.size()) &&
+ "Each scalar expected to have an associated InsertElement user.");
+
unsigned NextInst = 0, MaxInst = VL.size();
for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
// No actual vectorization should happen, if number of parts is the same as
// provided vectorization factor (i.e. the scalar type is used for vector
// code during codegen).
- auto *VecTy = VectorType::get(VL[0]->getType(), VF);
+ auto *VecTy = FixedVectorType::get(VL[0]->getType(), VF);
if (TTI->getNumberOfParts(VecTy) == VF)
continue;
for (unsigned I = NextInst; I < MaxInst; ++I) {
@@ -5940,8 +6042,48 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
continue;
R.computeMinimumValueSizes();
- int Cost = R.getTreeCost() - UserCost;
+ int Cost = R.getTreeCost();
CandidateFound = true;
+ if (CompensateUseCost) {
+ // TODO: Use TTI's getScalarizationOverhead for the sequence of inserts
+ // rather than the sum of single inserts, as the latter may overestimate
+ // the cost. This work should also improve cost estimation for the
+ // extracts added for external (to the vectorization tree) users, i.e.
+ // that part should switch to the same interface.
+ // For example, the following case is projected code after SLP:
+ // %4 = extractelement <4 x i64> %3, i32 0
+ // %v0 = insertelement <4 x i64> undef, i64 %4, i32 0
+ // %5 = extractelement <4 x i64> %3, i32 1
+ // %v1 = insertelement <4 x i64> %v0, i64 %5, i32 1
+ // %6 = extractelement <4 x i64> %3, i32 2
+ // %v2 = insertelement <4 x i64> %v1, i64 %6, i32 2
+ // %7 = extractelement <4 x i64> %3, i32 3
+ // %v3 = insertelement <4 x i64> %v2, i64 %7, i32 3
+ //
+ // The extracts here are added by SLP in order to feed the users (the
+ // inserts) of the original scalars, and they contribute to "ExtractCost"
+ // during cost evaluation. The inserts in turn form a sequence that builds
+ // an aggregate, which is detected by the findBuildAggregate routine.
+ // SLP assumes that such a sequence will be optimized away later (by
+ // instcombine), so it tries to compensate ExtractCost with the cost of
+ // the insert sequence.
+ // The current per-element cost calculation approach is not quite accurate
+ // and tends to create a bias toward favoring vectorization.
+ // Switching to the TTI interface might help a bit.
+ // An alternative solution could be to pattern-match and detect a no-op or
+ // a shuffle.
+ unsigned UserCost = 0;
+ for (unsigned Lane = 0; Lane < OpsWidth; Lane++) {
+ auto *IE = cast<InsertElementInst>(InsertUses[I + Lane]);
+ if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2)))
+ UserCost += TTI->getVectorInstrCost(
+ Instruction::InsertElement, IE->getType(), CI->getZExtValue());
+ }
+ LLVM_DEBUG(dbgs() << "SLP: Compensate cost of users by: " << UserCost
+ << ".\n");
+ Cost -= UserCost;
+ }
+
MinCost = std::min(MinCost, Cost);
if (Cost < -SLPCostThreshold) {
@@ -6031,24 +6173,23 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
/// <0,2,...> or <1,3,..> while a splitting reduction will generate
/// <2,3, undef,undef> for a vector of 4 and NumElts = 2.
/// \param IsLeft True will generate a mask of even elements, odd otherwise.
-static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
- bool IsPairwise, bool IsLeft,
- IRBuilder<> &Builder) {
+static SmallVector<int, 32> createRdxShuffleMask(unsigned VecLen,
+ unsigned NumEltsToRdx,
+ bool IsPairwise, bool IsLeft) {
assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
- SmallVector<Constant *, 32> ShuffleMask(
- VecLen, UndefValue::get(Builder.getInt32Ty()));
+ SmallVector<int, 32> ShuffleMask(VecLen, -1);
if (IsPairwise)
// Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
for (unsigned i = 0; i != NumEltsToRdx; ++i)
- ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
+ ShuffleMask[i] = 2 * i + !IsLeft;
else
// Move the upper half of the vector to the lower half.
for (unsigned i = 0; i != NumEltsToRdx; ++i)
- ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
+ ShuffleMask[i] = NumEltsToRdx + i;
- return ConstantVector::get(ShuffleMask);
+ return ShuffleMask;
}
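The resulting masks, derived from the code above purely as an illustration, for VecLen = 4 and NumEltsToRdx = 2 (-1 denotes an undef lane):

// Pairwise,  IsLeft = true  : {0, 2, -1, -1}   // 2*i + !IsLeft with !IsLeft == 0
// Pairwise,  IsLeft = false : {1, 3, -1, -1}   // 2*i + 1
// Splitting (IsPairwise = false): {2, 3, -1, -1}   // NumEltsToRdx + i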
namespace {
@@ -6840,7 +6981,7 @@ private:
int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal,
unsigned ReduxWidth) {
Type *ScalarTy = FirstReducedVal->getType();
- Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
+ auto *VecTy = FixedVectorType::get(ScalarTy, ReduxWidth);
int PairwiseRdxCost;
int SplittingRdxCost;
@@ -6857,7 +6998,7 @@ private:
case RK_Max:
case RK_UMin:
case RK_UMax: {
- Type *VecCondTy = CmpInst::makeCmpResultType(VecTy);
+ auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VecTy));
bool IsUnsigned = ReductionData.getKind() == RK_UMin ||
ReductionData.getKind() == RK_UMax;
PairwiseRdxCost =
@@ -6922,10 +7063,8 @@ private:
Value *TmpVec = VectorizedValue;
for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
- Value *LeftMask =
- createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
- Value *RightMask =
- createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
+ auto LeftMask = createRdxShuffleMask(ReduxWidth, i, true, true);
+ auto RightMask = createRdxShuffleMask(ReduxWidth, i, true, false);
Value *LeftShuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
@@ -6960,20 +7099,16 @@ private:
/// \return true if it matches.
static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI,
SmallVectorImpl<Value *> &BuildVectorOpds,
- int &UserCost) {
+ SmallVectorImpl<Value *> &InsertElts) {
assert((isa<InsertElementInst>(LastInsertInst) ||
isa<InsertValueInst>(LastInsertInst)) &&
"Expected insertelement or insertvalue instruction!");
- UserCost = 0;
do {
Value *InsertedOperand;
- if (auto *IE = dyn_cast<InsertElementInst>(LastInsertInst)) {
+ auto *IE = dyn_cast<InsertElementInst>(LastInsertInst);
+ if (IE) {
InsertedOperand = IE->getOperand(1);
LastInsertInst = IE->getOperand(0);
- if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {
- UserCost += TTI->getVectorInstrCost(Instruction::InsertElement,
- IE->getType(), CI->getZExtValue());
- }
} else {
auto *IV = cast<InsertValueInst>(LastInsertInst);
InsertedOperand = IV->getInsertedValueOperand();
@@ -6981,16 +7116,17 @@ static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI,
}
if (isa<InsertElementInst>(InsertedOperand) ||
isa<InsertValueInst>(InsertedOperand)) {
- int TmpUserCost;
SmallVector<Value *, 8> TmpBuildVectorOpds;
+ SmallVector<Value *, 8> TmpInsertElts;
if (!findBuildAggregate(InsertedOperand, TTI, TmpBuildVectorOpds,
- TmpUserCost))
+ TmpInsertElts))
return false;
BuildVectorOpds.append(TmpBuildVectorOpds.rbegin(),
TmpBuildVectorOpds.rend());
- UserCost += TmpUserCost;
+ InsertElts.append(TmpInsertElts.rbegin(), TmpInsertElts.rend());
} else {
BuildVectorOpds.push_back(InsertedOperand);
+ InsertElts.push_back(IE);
}
if (isa<UndefValue>(LastInsertInst))
break;
@@ -7000,6 +7136,7 @@ static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI,
return false;
} while (true);
std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
+ std::reverse(InsertElts.begin(), InsertElts.end());
return true;
}
@@ -7164,26 +7301,29 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
BasicBlock *BB, BoUpSLP &R) {
- int UserCost = 0;
const DataLayout &DL = BB->getModule()->getDataLayout();
if (!R.canMapToVector(IVI->getType(), DL))
return false;
SmallVector<Value *, 16> BuildVectorOpds;
- if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, UserCost))
+ SmallVector<Value *, 16> BuildVectorInsts;
+ if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts) ||
+ BuildVectorOpds.size() < 2)
return false;
LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
// Aggregate value is unlikely to be processed in vector register, we need to
// extract scalars into scalar registers, so NeedExtraction is set true.
- return tryToVectorizeList(BuildVectorOpds, R, UserCost);
+ return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false,
+ BuildVectorInsts);
}
bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
BasicBlock *BB, BoUpSLP &R) {
- int UserCost;
+ SmallVector<Value *, 16> BuildVectorInsts;
SmallVector<Value *, 16> BuildVectorOpds;
- if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, UserCost) ||
+ if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
+ BuildVectorOpds.size() < 2 ||
(llvm::all_of(BuildVectorOpds,
[](Value *V) { return isa<ExtractElementInst>(V); }) &&
isShuffle(BuildVectorOpds)))
@@ -7191,7 +7331,8 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
// Vectorize starting with the build vector operands ignoring the BuildVector
// instructions for the purpose of scheduling and user extraction.
- return tryToVectorizeList(BuildVectorOpds, R, UserCost);
+ return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false,
+ BuildVectorInsts);
}
bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB,
@@ -7228,6 +7369,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
bool Changed = false;
SmallVector<Value *, 4> Incoming;
SmallPtrSet<Value *, 16> VisitedInstrs;
+ unsigned MaxVecRegSize = R.getMaxVecRegSize();
bool HaveVectorizedPhiNodes = true;
while (HaveVectorizedPhiNodes) {
@@ -7254,8 +7396,18 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// Look for the next elements with the same type.
SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
+ Type *EltTy = (*IncIt)->getType();
+ unsigned EltSize = EltTy->isSized() ? DL->getTypeSizeInBits(EltTy)
+ : MaxVecRegSize;
+ unsigned MaxNumElts = MaxVecRegSize / EltSize;
+ if (MaxNumElts < 2) {
+ ++IncIt;
+ continue;
+ }
+
while (SameTypeIt != E &&
- (*SameTypeIt)->getType() == (*IncIt)->getType()) {
+ (*SameTypeIt)->getType() == EltTy &&
+ (SameTypeIt - IncIt) < MaxNumElts) {
VisitedInstrs.insert(*SameTypeIt);
++SameTypeIt;
}
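A quick arithmetic illustration (register width assumed) of the new MaxNumElts bound on PHI grouping: with a 128-bit MaxVecRegSize, i64 PHIs are grouped at most two at a time, and an element type wider than the register is skipped outright:

// MaxVecRegSize = 128, EltTy = i64  : EltSize = 64,  MaxNumElts = 128/64 = 2
// MaxVecRegSize = 128, EltTy = i256 : EltSize = 256, MaxNumElts = 0 < 2 -> skip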
@@ -7269,8 +7421,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// is done when there are exactly two elements since tryToVectorizeList
// asserts that there are only two values when AllowReorder is true.
bool AllowReorder = NumElts == 2;
- if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R,
- /*UserCost=*/0, AllowReorder)) {
+ if (NumElts > 1 &&
+ tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, AllowReorder)) {
// Success start over because instructions might have been changed.
HaveVectorizedPhiNodes = true;
Changed = true;
@@ -7370,9 +7522,12 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
<< Entry.second.size() << ".\n");
// Process the GEP list in chunks suitable for the target's supported
- // vector size. If a vector register can't hold 1 element, we are done.
+ // vector size. If a vector register can't hold 1 element, we are done. We
+ // are trying to vectorize the index computations, so the maximum number of
+ // elements is based on the size of the index expression, rather than the
+ // size of the GEP itself (the target's pointer size).
unsigned MaxVecRegSize = R.getMaxVecRegSize();
- unsigned EltSize = R.getVectorElementSize(Entry.second[0]);
+ unsigned EltSize = R.getVectorElementSize(*Entry.second[0]->idx_begin());
if (MaxVecRegSize < EltSize)
continue;
@@ -7475,6 +7630,7 @@ INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
Pass *llvm::createSLPVectorizerPass() { return new SLPVectorizer(); }