Diffstat (limited to 'llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r-- | llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 730
1 file changed, 443 insertions, 287 deletions
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index aabd974cd73e..5bc35aa4695f 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -47,6 +47,7 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" @@ -85,6 +86,7 @@ #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/InjectTLIMappings.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Vectorize.h" #include <algorithm> @@ -107,9 +109,8 @@ using namespace slpvectorizer; STATISTIC(NumVectorInstructions, "Number of vector instructions generated"); -cl::opt<bool> - llvm::RunSLPVectorization("vectorize-slp", cl::init(false), cl::Hidden, - cl::desc("Run the SLP vectorization passes")); +cl::opt<bool> RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, + cl::desc("Run the SLP vectorization passes")); static cl::opt<int> SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, @@ -284,7 +285,7 @@ static bool isCommutative(Instruction *I) { static Optional<TargetTransformInfo::ShuffleKind> isShuffle(ArrayRef<Value *> VL) { auto *EI0 = cast<ExtractElementInst>(VL[0]); - unsigned Size = EI0->getVectorOperandType()->getVectorNumElements(); + unsigned Size = EI0->getVectorOperandType()->getNumElements(); Value *Vec1 = nullptr; Value *Vec2 = nullptr; enum ShuffleMode { Unknown, Select, Permute }; @@ -293,7 +294,7 @@ isShuffle(ArrayRef<Value *> VL) { auto *EI = cast<ExtractElementInst>(VL[I]); auto *Vec = EI->getVectorOperand(); // All vector operands must have the same number of vector elements. - if (Vec->getType()->getVectorNumElements() != Size) + if (cast<VectorType>(Vec->getType())->getNumElements() != Size) return None; auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand()); if (!Idx) @@ -377,6 +378,18 @@ static Value *isOneOf(const InstructionsState &S, Value *Op) { return S.OpValue; } +/// \returns true if \p Opcode is allowed as part of of the main/alternate +/// instruction for SLP vectorization. +/// +/// Example of unsupported opcode is SDIV that can potentially cause UB if the +/// "shuffled out" lane would result in division by zero. +static bool isValidForAlternation(unsigned Opcode) { + if (Instruction::isIntDivRem(Opcode)) + return false; + + return true; +} + /// \returns analysis of the Instructions in \p VL described in /// InstructionsState, the Opcode that we suppose the whole list /// could be vectorized even if its structure is diverse. 
@@ -399,7 +412,8 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL, if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) { if (InstOpcode == Opcode || InstOpcode == AltOpcode) continue; - if (Opcode == AltOpcode) { + if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) && + isValidForAlternation(Opcode)) { AltOpcode = InstOpcode; AltIndex = Cnt; continue; @@ -411,6 +425,9 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL, if (InstOpcode == Opcode || InstOpcode == AltOpcode) continue; if (Opcode == AltOpcode) { + assert(isValidForAlternation(Opcode) && + isValidForAlternation(InstOpcode) && + "Cast isn't safe for alternation, logic needs to be updated!"); AltOpcode = InstOpcode; AltIndex = Cnt; continue; @@ -613,7 +630,7 @@ public: /// the stored value. Otherwise, the size is the width of the largest loaded /// value reaching V. This method is used by the vectorizer to calculate /// vectorization factors. - unsigned getVectorElementSize(Value *V) const; + unsigned getVectorElementSize(Value *V); /// Compute the minimum type sizes required to represent the entries in a /// vectorizable tree. @@ -650,6 +667,15 @@ public: /// may not be necessary. bool isLoadCombineReductionCandidate(unsigned ReductionOpcode) const; + /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values + /// can be load combined in the backend. Load combining may not be allowed in + /// the IR optimizer, so we do not want to alter the pattern. For example, + /// partially transforming a scalar bswap() pattern into vector code is + /// effectively impossible for the backend to undo. + /// TODO: If load combining is allowed in the IR optimizer, this analysis + /// may not be necessary. + bool isLoadCombineCandidate() const; + OptimizationRemarkEmitter *getORE() { return ORE; } /// This structure holds any data we need about the edges being traversed @@ -816,13 +842,12 @@ public: // Extracts from consecutive indexes of the same vector better score as // the extracts could be optimized away. - auto *Ex1 = dyn_cast<ExtractElementInst>(V1); - auto *Ex2 = dyn_cast<ExtractElementInst>(V2); - if (Ex1 && Ex2 && Ex1->getVectorOperand() == Ex2->getVectorOperand() && - cast<ConstantInt>(Ex1->getIndexOperand())->getZExtValue() + 1 == - cast<ConstantInt>(Ex2->getIndexOperand())->getZExtValue()) { + Value *EV; + ConstantInt *Ex1Idx, *Ex2Idx; + if (match(V1, m_ExtractElt(m_Value(EV), m_ConstantInt(Ex1Idx))) && + match(V2, m_ExtractElt(m_Deferred(EV), m_ConstantInt(Ex2Idx))) && + Ex1Idx->getZExtValue() + 1 == Ex2Idx->getZExtValue()) return VLOperands::ScoreConsecutiveExtracts; - } auto *I1 = dyn_cast<Instruction>(V1); auto *I2 = dyn_cast<Instruction>(V2); @@ -852,7 +877,7 @@ public: int getExternalUsesCost(const std::pair<Value *, int> &LHS, const std::pair<Value *, int> &RHS) { int Cost = 0; - SmallVector<std::pair<Value *, int>, 2> Values = {LHS, RHS}; + std::array<std::pair<Value *, int>, 2> Values = {{LHS, RHS}}; for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) { Value *V = Values[Idx].first; // Calculate the absolute lane, using the minimum relative lane of LHS @@ -1385,7 +1410,8 @@ private: /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. - int getGatherCost(Type *Ty, const DenseSet<unsigned> &ShuffledIndices) const; + int getGatherCost(VectorType *Ty, + const DenseSet<unsigned> &ShuffledIndices) const; /// \returns the scalarization cost for this list of values. 
Assuming that /// this subtree gets vectorized, we may need to extract the values from the @@ -1422,7 +1448,7 @@ private: return VL.size() == ReuseShuffleIndices.size() && std::equal( VL.begin(), VL.end(), ReuseShuffleIndices.begin(), - [this](Value *V, unsigned Idx) { return V == Scalars[Idx]; }); + [this](Value *V, int Idx) { return V == Scalars[Idx]; }); } /// A vector of scalars. @@ -1436,7 +1462,7 @@ private: EntryState State; /// Does this sequence require some shuffling? - SmallVector<unsigned, 4> ReuseShuffleIndices; + SmallVector<int, 4> ReuseShuffleIndices; /// Does this entry require reordering? ArrayRef<unsigned> ReorderIndices; @@ -1690,6 +1716,9 @@ private: /// Maps a specific scalar to its tree entry. SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry; + /// Maps a value to the proposed vectorizable size. + SmallDenseMap<Value *, unsigned> InstrElementSize; + /// A list of scalars that we found that we need to keep as scalars. ValueSet MustGather; @@ -2001,6 +2030,20 @@ private: if (TreeEntry *TE = BundleMember->TE) { int Lane = BundleMember->Lane; assert(Lane >= 0 && "Lane not set"); + + // Since vectorization tree is being built recursively this assertion + // ensures that the tree entry has all operands set before reaching + // this code. Couple of exceptions known at the moment are extracts + // where their second (immediate) operand is not added. Since + // immediates do not affect scheduler behavior this is considered + // okay. + auto *In = TE->getMainOp(); + assert(In && + (isa<ExtractValueInst>(In) || isa<ExtractElementInst>(In) || + In->getNumOperands() == TE->getNumOperands()) && + "Missed TreeEntry operands?"); + (void)In; // fake use to avoid build failure when assertions disabled + for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands(); OpIdx != NumOperands; ++OpIdx) if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane])) @@ -2323,6 +2366,7 @@ BoUpSLP::~BoUpSLP() { "trying to erase instruction with users."); Pair.getFirst()->eraseFromParent(); } + assert(!verifyFunction(*F, &dbgs())); } void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) { @@ -2978,19 +3022,24 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, return; } case Instruction::Call: { - // Check if the calls are all to the same vectorizable intrinsic. + // Check if the calls are all to the same vectorizable intrinsic or + // library function. 
CallInst *CI = cast<CallInst>(VL0); - // Check if this is an Intrinsic call or something that can be - // represented by an intrinsic call Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - if (!isTriviallyVectorizable(ID)) { + + VFShape Shape = VFShape::get( + *CI, {static_cast<unsigned int>(VL.size()), false /*Scalable*/}, + false /*HasGlobalPred*/); + Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); + + if (!VecFunc && !isTriviallyVectorizable(ID)) { BS.cancelScheduling(VL, VL0); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } - Function *Int = CI->getCalledFunction(); + Function *F = CI->getCalledFunction(); unsigned NumArgs = CI->getNumArgOperands(); SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr); for (unsigned j = 0; j != NumArgs; ++j) @@ -2998,8 +3047,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, ScalarArgs[j] = CI->getArgOperand(j); for (Value *V : VL) { CallInst *CI2 = dyn_cast<CallInst>(V); - if (!CI2 || CI2->getCalledFunction() != Int || + if (!CI2 || CI2->getCalledFunction() != F || getVectorIntrinsicIDForCall(CI2, TLI) != ID || + (VecFunc && + VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) || !CI->hasIdenticalOperandBundleSchema(*CI2)) { BS.cancelScheduling(VL, VL0); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, @@ -3101,7 +3152,8 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { unsigned N = 1; Type *EltTy = T; - while (isa<CompositeType>(EltTy)) { + while (isa<StructType>(EltTy) || isa<ArrayType>(EltTy) || + isa<VectorType>(EltTy)) { if (auto *ST = dyn_cast<StructType>(EltTy)) { // Check that struct is homogeneous. for (const auto *Ty : ST->elements()) @@ -3109,16 +3161,19 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { return 0; N *= ST->getNumElements(); EltTy = *ST->element_begin(); + } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) { + N *= AT->getNumElements(); + EltTy = AT->getElementType(); } else { - auto *SeqT = cast<SequentialType>(EltTy); - N *= SeqT->getNumElements(); - EltTy = SeqT->getElementType(); + auto *VT = cast<VectorType>(EltTy); + N *= VT->getNumElements(); + EltTy = VT->getElementType(); } } if (!isValidElementType(EltTy)) return 0; - uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N)); + uint64_t VTSize = DL.getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N)); if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T)) return 0; return N; @@ -3148,7 +3203,7 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size())) return false; } else { - NElts = Vec->getType()->getVectorNumElements(); + NElts = cast<VectorType>(Vec->getType())->getNumElements(); } if (NElts != VL.size()) @@ -3198,6 +3253,35 @@ bool BoUpSLP::areAllUsersVectorized(Instruction *I) const { }); } +static std::pair<unsigned, unsigned> +getVectorCallCosts(CallInst *CI, VectorType *VecTy, TargetTransformInfo *TTI, + TargetLibraryInfo *TLI) { + Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); + + // Calculate the cost of the scalar and vector calls. 
+ IntrinsicCostAttributes CostAttrs(ID, *CI, VecTy->getNumElements()); + int IntrinsicCost = + TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput); + + auto Shape = + VFShape::get(*CI, {static_cast<unsigned>(VecTy->getNumElements()), false}, + false /*HasGlobalPred*/); + Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); + int LibCost = IntrinsicCost; + if (!CI->isNoBuiltin() && VecFunc) { + // Calculate the cost of the vector library call. + SmallVector<Type *, 4> VecTys; + for (Use &Arg : CI->args()) + VecTys.push_back( + FixedVectorType::get(Arg->getType(), VecTy->getNumElements())); + + // If the corresponding vector call is cheaper, return its cost. + LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys, + TTI::TCK_RecipThroughput); + } + return {IntrinsicCost, LibCost}; +} + int BoUpSLP::getEntryCost(TreeEntry *E) { ArrayRef<Value*> VL = E->Scalars; @@ -3206,12 +3290,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { ScalarTy = SI->getValueOperand()->getType(); else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0])) ScalarTy = CI->getOperand(0)->getType(); - VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; // If we have computed a smaller type for the expression, update VecTy so // that the costs will be accurate. if (MinBWs.count(VL[0])) - VecTy = VectorType::get( + VecTy = FixedVectorType::get( IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size(); @@ -3251,6 +3336,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } return ReuseShuffleCost + getGatherCost(VL); } + assert(E->State == TreeEntry::Vectorize && "Unhandled state"); assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); Instruction *VL0 = E->getMainOp(); unsigned ShuffleOrOp = @@ -3260,7 +3346,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { return 0; case Instruction::ExtractValue: - case Instruction::ExtractElement: + case Instruction::ExtractElement: { if (NeedToShuffleReuses) { unsigned Idx = 0; for (unsigned I : E->ReuseShuffleIndices) { @@ -3289,43 +3375,41 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx); } } - if (E->State == TreeEntry::Vectorize) { - int DeadCost = ReuseShuffleCost; - if (!E->ReorderIndices.empty()) { - // TODO: Merge this shuffle with the ReuseShuffleCost. - DeadCost += TTI->getShuffleCost( - TargetTransformInfo::SK_PermuteSingleSrc, VecTy); - } - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - Instruction *E = cast<Instruction>(VL[i]); - // If all users are going to be vectorized, instruction can be - // considered as dead. - // The same, if have only one user, it will be vectorized for sure. - if (areAllUsersVectorized(E)) { - // Take credit for instruction that will become dead. - if (E->hasOneUse()) { - Instruction *Ext = E->user_back(); - if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) && - all_of(Ext->users(), - [](User *U) { return isa<GetElementPtrInst>(U); })) { - // Use getExtractWithExtendCost() to calculate the cost of - // extractelement/ext pair. - DeadCost -= TTI->getExtractWithExtendCost( - Ext->getOpcode(), Ext->getType(), VecTy, i); - // Add back the cost of s|zext which is subtracted separately. 
- DeadCost += TTI->getCastInstrCost( - Ext->getOpcode(), Ext->getType(), E->getType(), Ext); - continue; - } + int DeadCost = ReuseShuffleCost; + if (!E->ReorderIndices.empty()) { + // TODO: Merge this shuffle with the ReuseShuffleCost. + DeadCost += TTI->getShuffleCost( + TargetTransformInfo::SK_PermuteSingleSrc, VecTy); + } + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + Instruction *E = cast<Instruction>(VL[i]); + // If all users are going to be vectorized, instruction can be + // considered as dead. + // The same, if have only one user, it will be vectorized for sure. + if (areAllUsersVectorized(E)) { + // Take credit for instruction that will become dead. + if (E->hasOneUse()) { + Instruction *Ext = E->user_back(); + if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) && + all_of(Ext->users(), + [](User *U) { return isa<GetElementPtrInst>(U); })) { + // Use getExtractWithExtendCost() to calculate the cost of + // extractelement/ext pair. + DeadCost -= TTI->getExtractWithExtendCost( + Ext->getOpcode(), Ext->getType(), VecTy, i); + // Add back the cost of s|zext which is subtracted separately. + DeadCost += TTI->getCastInstrCost( + Ext->getOpcode(), Ext->getType(), E->getType(), CostKind, + Ext); + continue; } - DeadCost -= - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); } + DeadCost -= + TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); } - return DeadCost; } - return ReuseShuffleCost + getGatherCost(VL); - + return DeadCost; + } case Instruction::ZExt: case Instruction::SExt: case Instruction::FPToUI: @@ -3340,7 +3424,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); int ScalarEltCost = - TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, VL0); + TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, CostKind, + VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } @@ -3348,12 +3433,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { // Calculate the cost of this instruction. int ScalarCost = VL.size() * ScalarEltCost; - VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); + auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size()); int VecCost = 0; // Check if the values are candidates to demote. if (!MinBWs.count(VL0) || VecTy != SrcVecTy) { VecCost = ReuseShuffleCost + - TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy, VL0); + TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy, + CostKind, VL0); } return VecCost - ScalarCost; } @@ -3362,13 +3448,15 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { case Instruction::Select: { // Calculate the cost of this instruction. 
int ScalarEltCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, - Builder.getInt1Ty(), VL0); + Builder.getInt1Ty(), + CostKind, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } - VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); + auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size()); int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VL0); + int VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, + CostKind, VL0); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::FNeg: @@ -3429,13 +3517,15 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { SmallVector<const Value *, 4> Operands(VL0->operand_values()); int ScalarEltCost = TTI->getArithmeticInstrCost( - E->getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0); + E->getOpcode(), ScalarTy, CostKind, Op1VK, Op2VK, Op1VP, Op2VP, + Operands, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } int ScalarCost = VecTy->getNumElements() * ScalarEltCost; int VecCost = TTI->getArithmeticInstrCost( - E->getOpcode(), VecTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0); + E->getOpcode(), VecTy, CostKind, Op1VK, Op2VK, Op1VP, Op2VP, + Operands, VL0); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::GetElementPtr: { @@ -3445,26 +3535,30 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { TargetTransformInfo::OK_UniformConstantValue; int ScalarEltCost = - TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK); + TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, CostKind, + Op1VK, Op2VK); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } int ScalarCost = VecTy->getNumElements() * ScalarEltCost; int VecCost = - TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK); + TTI->getArithmeticInstrCost(Instruction::Add, VecTy, CostKind, + Op1VK, Op2VK); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::Load: { // Cost of wide load - cost of scalar loads. - MaybeAlign alignment(cast<LoadInst>(VL0)->getAlignment()); + Align alignment = cast<LoadInst>(VL0)->getAlign(); int ScalarEltCost = - TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0); + TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, + CostKind, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost; int VecLdCost = - TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, VL0); + TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, + CostKind, VL0); if (!E->ReorderIndices.empty()) { // TODO: Merge this shuffle with the ReuseShuffleCost. VecLdCost += TTI->getShuffleCost( @@ -3477,14 +3571,15 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { bool IsReorder = !E->ReorderIndices.empty(); auto *SI = cast<StoreInst>(IsReorder ? 
VL[E->ReorderIndices.front()] : VL0); - MaybeAlign Alignment(SI->getAlignment()); + Align Alignment = SI->getAlign(); int ScalarEltCost = - TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0, VL0); + TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0, + CostKind, VL0); if (NeedToShuffleReuses) ReuseShuffleCost = -(ReuseShuffleNumbers - VL.size()) * ScalarEltCost; int ScalarStCost = VecTy->getNumElements() * ScalarEltCost; int VecStCost = TTI->getMemoryOpCost(Instruction::Store, - VecTy, Alignment, 0, VL0); + VecTy, Alignment, 0, CostKind, VL0); if (IsReorder) { // TODO: Merge this shuffle with the ReuseShuffleCost. VecStCost += TTI->getShuffleCost( @@ -3497,24 +3592,15 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); // Calculate the cost of the scalar and vector calls. - SmallVector<Type *, 4> ScalarTys; - for (unsigned op = 0, opc = CI->getNumArgOperands(); op != opc; ++op) - ScalarTys.push_back(CI->getArgOperand(op)->getType()); - - FastMathFlags FMF; - if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) - FMF = FPMO->getFastMathFlags(); - - int ScalarEltCost = - TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF); + IntrinsicCostAttributes CostAttrs(ID, *CI, 1, 1); + int ScalarEltCost = TTI->getIntrinsicInstrCost(CostAttrs, CostKind); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } int ScalarCallCost = VecTy->getNumElements() * ScalarEltCost; - SmallVector<Value *, 4> Args(CI->arg_operands()); - int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF, - VecTy->getNumElements()); + auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI); + int VecCallCost = std::min(VecCallCosts.first, VecCallCosts.second); LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost << " (" << VecCallCost << "-" << ScalarCallCost << ")" @@ -3533,34 +3619,34 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { if (NeedToShuffleReuses) { for (unsigned Idx : E->ReuseShuffleIndices) { Instruction *I = cast<Instruction>(VL[Idx]); - ReuseShuffleCost -= TTI->getInstructionCost( - I, TargetTransformInfo::TCK_RecipThroughput); + ReuseShuffleCost -= TTI->getInstructionCost(I, CostKind); } for (Value *V : VL) { Instruction *I = cast<Instruction>(V); - ReuseShuffleCost += TTI->getInstructionCost( - I, TargetTransformInfo::TCK_RecipThroughput); + ReuseShuffleCost += TTI->getInstructionCost(I, CostKind); } } for (Value *V : VL) { Instruction *I = cast<Instruction>(V); assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); - ScalarCost += TTI->getInstructionCost( - I, TargetTransformInfo::TCK_RecipThroughput); + ScalarCost += TTI->getInstructionCost(I, CostKind); } // VecCost is equal to sum of the cost of creating 2 vectors // and the cost of creating shuffle. 
int VecCost = 0; if (Instruction::isBinaryOp(E->getOpcode())) { - VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy); - VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy); + VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind); + VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, + CostKind); } else { Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType(); Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType(); - VectorType *Src0Ty = VectorType::get(Src0SclTy, VL.size()); - VectorType *Src1Ty = VectorType::get(Src1SclTy, VL.size()); - VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty); - VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty); + auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size()); + auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size()); + VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty, + CostKind); + VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty, + CostKind); } VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0); return ReuseShuffleCost + VecCost - ScalarCost; @@ -3596,24 +3682,20 @@ bool BoUpSLP::isFullyVectorizableTinyTree() const { return true; } -bool BoUpSLP::isLoadCombineReductionCandidate(unsigned RdxOpcode) const { - if (RdxOpcode != Instruction::Or) - return false; - - unsigned NumElts = VectorizableTree[0]->Scalars.size(); - Value *FirstReduced = VectorizableTree[0]->Scalars[0]; - - // Look past the reduction to find a source value. Arbitrarily follow the +static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, + TargetTransformInfo *TTI) { + // Look past the root to find a source value. Arbitrarily follow the // path through operand 0 of any 'or'. Also, peek through optional // shift-left-by-constant. - Value *ZextLoad = FirstReduced; - while (match(ZextLoad, m_Or(m_Value(), m_Value())) || - match(ZextLoad, m_Shl(m_Value(), m_Constant()))) + Value *ZextLoad = Root; + while (!isa<ConstantExpr>(ZextLoad) && + (match(ZextLoad, m_Or(m_Value(), m_Value())) || + match(ZextLoad, m_Shl(m_Value(), m_Constant())))) ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0); - // Check if the input to the reduction is an extended load. + // Check if the input is an extended load of the required or/shift expression. Value *LoadPtr; - if (!match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr))))) + if (ZextLoad == Root || !match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr))))) return false; // Require that the total load bit width is a legal integer type. @@ -3621,15 +3703,36 @@ bool BoUpSLP::isLoadCombineReductionCandidate(unsigned RdxOpcode) const { // But <16 x i8> --> i128 is not, so the backend probably can't reduce it. Type *SrcTy = LoadPtr->getType()->getPointerElementType(); unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts; - LLVMContext &Context = FirstReduced->getContext(); - if (!TTI->isTypeLegal(IntegerType::get(Context, LoadBitWidth))) + if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth))) return false; // Everything matched - assume that we can fold the whole sequence using // load combining. 
- LLVM_DEBUG(dbgs() << "SLP: Assume load combining for scalar reduction of " - << *(cast<Instruction>(FirstReduced)) << "\n"); + LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at " + << *(cast<Instruction>(Root)) << "\n"); + + return true; +} + +bool BoUpSLP::isLoadCombineReductionCandidate(unsigned RdxOpcode) const { + if (RdxOpcode != Instruction::Or) + return false; + unsigned NumElts = VectorizableTree[0]->Scalars.size(); + Value *FirstReduced = VectorizableTree[0]->Scalars[0]; + return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI); +} + +bool BoUpSLP::isLoadCombineCandidate() const { + // Peek through a final sequence of stores and check if all operations are + // likely to be load-combined. + unsigned NumElts = VectorizableTree[0]->Scalars.size(); + for (Value *Scalar : VectorizableTree[0]->Scalars) { + Value *X; + if (!match(Scalar, m_Store(m_Value(X), m_Value())) || + !isLoadCombineCandidateImpl(X, NumElts, TTI)) + return false; + } return true; } @@ -3712,7 +3815,7 @@ int BoUpSLP::getSpillCost() const { if (NumCalls) { SmallVector<Type*, 4> V; for (auto *II : LiveValues) - V.push_back(VectorType::get(II->getType(), BundleWidth)); + V.push_back(FixedVectorType::get(II->getType(), BundleWidth)); Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V); } @@ -3776,13 +3879,13 @@ int BoUpSLP::getTreeCost() { // If we plan to rewrite the tree in a smaller type, we will need to sign // extend the extracted value back to the original type. Here, we account // for the extract and the added cost of the sign extend if needed. - auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth); + auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth); auto *ScalarRoot = VectorizableTree[0]->Scalars[0]; if (MinBWs.count(ScalarRoot)) { auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); auto Extend = MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt; - VecTy = VectorType::get(MinTy, BundleWidth); + VecTy = FixedVectorType::get(MinTy, BundleWidth); ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), VecTy, EU.Lane); } else { @@ -3809,12 +3912,15 @@ int BoUpSLP::getTreeCost() { return Cost; } -int BoUpSLP::getGatherCost(Type *Ty, +int BoUpSLP::getGatherCost(VectorType *Ty, const DenseSet<unsigned> &ShuffledIndices) const { - int Cost = 0; - for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i) + unsigned NumElts = Ty->getNumElements(); + APInt DemandedElts = APInt::getNullValue(NumElts); + for (unsigned i = 0; i < NumElts; ++i) if (!ShuffledIndices.count(i)) - Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); + DemandedElts.setBit(i); + int Cost = TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true, + /*Extract*/ false); if (!ShuffledIndices.empty()) Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty); return Cost; @@ -3825,7 +3931,7 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const { Type *ScalarTy = VL[0]->getType(); if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) ScalarTy = SI->getValueOperand()->getType(); - VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); // Find the cost of inserting/extracting values from the vector. // Check if the same elements are inserted several times and count them as // shuffle candidates. 
@@ -3965,9 +4071,9 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { V = SV->getOperand(0); } else { // Reshuffle to get only unique values. - SmallVector<unsigned, 4> UniqueIdxs; - SmallSet<unsigned, 4> UsedIdxs; - for(unsigned Idx : E->ReuseShuffleIndices) + SmallVector<int, 4> UniqueIdxs; + SmallSet<int, 4> UsedIdxs; + for (int Idx : E->ReuseShuffleIndices) if (UsedIdxs.insert(Idx).second) UniqueIdxs.emplace_back(Idx); V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), @@ -3984,7 +4090,7 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { ScalarTy = SI->getValueOperand()->getType(); // Check that every instruction appears once in this bundle. - SmallVector<unsigned, 4> ReuseShuffleIndicies; + SmallVector<int, 4> ReuseShuffleIndicies; SmallVector<Value *, 4> UniqueValues; if (VL.size() > 2) { DenseMap<Value *, unsigned> UniquePositions; @@ -4002,7 +4108,7 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { else VL = UniqueValues; } - VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); Value *V = Gather(VL, VecTy); if (!ReuseShuffleIndicies.empty()) { @@ -4017,7 +4123,7 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { } static void inversePermutation(ArrayRef<unsigned> Indices, - SmallVectorImpl<unsigned> &Mask) { + SmallVectorImpl<int> &Mask) { Mask.clear(); const unsigned E = Indices.size(); Mask.resize(E); @@ -4037,7 +4143,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Type *ScalarTy = VL0->getType(); if (StoreInst *SI = dyn_cast<StoreInst>(VL0)) ScalarTy = SI->getValueOperand()->getType(); - VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size()); + auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size()); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); @@ -4056,6 +4162,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } + assert(E->State == TreeEntry::Vectorize && "Unhandled state"); unsigned ShuffleOrOp = E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); switch (ShuffleOrOp) { @@ -4096,72 +4203,45 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::ExtractElement: { - if (E->State == TreeEntry::Vectorize) { - Value *V = E->getSingleOperand(0); - if (!E->ReorderIndices.empty()) { - OrdersType Mask; - inversePermutation(E->ReorderIndices, Mask); - Builder.SetInsertPoint(VL0); - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), Mask, - "reorder_shuffle"); - } - if (NeedToShuffleReuses) { - // TODO: Merge this shuffle with the ReorderShuffleMask. - if (E->ReorderIndices.empty()) - Builder.SetInsertPoint(VL0); - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); - } - E->VectorizedValue = V; - return V; + Value *V = E->getSingleOperand(0); + if (!E->ReorderIndices.empty()) { + SmallVector<int, 4> Mask; + inversePermutation(E->ReorderIndices, Mask); + Builder.SetInsertPoint(VL0); + V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), Mask, + "reorder_shuffle"); } - setInsertPointAfterBundle(E); - auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { + // TODO: Merge this shuffle with the ReorderShuffleMask. 
+ if (E->ReorderIndices.empty()) + Builder.SetInsertPoint(VL0); V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); - if (auto *I = dyn_cast<Instruction>(V)) { - GatherSeq.insert(I); - CSEBlocks.insert(I->getParent()); - } } E->VectorizedValue = V; return V; } case Instruction::ExtractValue: { - if (E->State == TreeEntry::Vectorize) { - LoadInst *LI = cast<LoadInst>(E->getSingleOperand(0)); - Builder.SetInsertPoint(LI); - PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); - Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy); - LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlignment()); - Value *NewV = propagateMetadata(V, E->Scalars); - if (!E->ReorderIndices.empty()) { - OrdersType Mask; - inversePermutation(E->ReorderIndices, Mask); - NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), Mask, - "reorder_shuffle"); - } - if (NeedToShuffleReuses) { - // TODO: Merge this shuffle with the ReorderShuffleMask. - NewV = Builder.CreateShuffleVector( - NewV, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); - } - E->VectorizedValue = NewV; - return NewV; + LoadInst *LI = cast<LoadInst>(E->getSingleOperand(0)); + Builder.SetInsertPoint(LI); + PointerType *PtrTy = + PointerType::get(VecTy, LI->getPointerAddressSpace()); + Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy); + LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign()); + Value *NewV = propagateMetadata(V, E->Scalars); + if (!E->ReorderIndices.empty()) { + SmallVector<int, 4> Mask; + inversePermutation(E->ReorderIndices, Mask); + NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), Mask, + "reorder_shuffle"); } - setInsertPointAfterBundle(E); - auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); - if (auto *I = dyn_cast<Instruction>(V)) { - GatherSeq.insert(I); - CSEBlocks.insert(I->getParent()); - } + // TODO: Merge this shuffle with the ReorderShuffleMask. 
+ NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), + E->ReuseShuffleIndices, "shuffle"); } - E->VectorizedValue = V; - return V; + E->VectorizedValue = NewV; + return NewV; } case Instruction::ZExt: case Instruction::SExt: @@ -4207,12 +4287,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); - Value *V; - if (E->getOpcode() == Instruction::FCmp) - V = Builder.CreateFCmp(P0, L, R); - else - V = Builder.CreateICmp(P0, L, R); - + Value *V = Builder.CreateCmp(P0, L, R); propagateIRFlags(V, E->Scalars, VL0); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -4321,7 +4396,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { setInsertPointAfterBundle(E); LoadInst *LI = cast<LoadInst>(VL0); - Type *ScalarLoadTy = LI->getType(); unsigned AS = LI->getPointerAddressSpace(); Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(), @@ -4334,14 +4408,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (getTreeEntry(PO)) ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0)); - MaybeAlign Alignment = MaybeAlign(LI->getAlignment()); - LI = Builder.CreateLoad(VecTy, VecPtr); - if (!Alignment) - Alignment = MaybeAlign(DL->getABITypeAlignment(ScalarLoadTy)); - LI->setAlignment(Alignment); + LI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign()); Value *V = propagateMetadata(LI, E->Scalars); if (IsReorder) { - OrdersType Mask; + SmallVector<int, 4> Mask; inversePermutation(E->ReorderIndices, Mask); V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), Mask, "reorder_shuffle"); @@ -4359,23 +4429,23 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { bool IsReorder = !E->ReorderIndices.empty(); auto *SI = cast<StoreInst>( IsReorder ? 
E->Scalars[E->ReorderIndices.front()] : VL0); - unsigned Alignment = SI->getAlignment(); unsigned AS = SI->getPointerAddressSpace(); setInsertPointAfterBundle(E); Value *VecValue = vectorizeTree(E->getOperand(0)); if (IsReorder) { - OrdersType Mask; - inversePermutation(E->ReorderIndices, Mask); + SmallVector<int, 4> Mask(E->ReorderIndices.begin(), + E->ReorderIndices.end()); VecValue = Builder.CreateShuffleVector( - VecValue, UndefValue::get(VecValue->getType()), E->ReorderIndices, + VecValue, UndefValue::get(VecValue->getType()), Mask, "reorder_shuffle"); } Value *ScalarPtr = SI->getPointerOperand(); Value *VecPtr = Builder.CreateBitCast( ScalarPtr, VecValue->getType()->getPointerTo(AS)); - StoreInst *ST = Builder.CreateStore(VecValue, VecPtr); + StoreInst *ST = Builder.CreateAlignedStore(VecValue, VecPtr, + SI->getAlign()); // The pointer operand uses an in-tree scalar, so add the new BitCast to // ExternalUses to make sure that an extract will be generated in the @@ -4383,10 +4453,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (getTreeEntry(ScalarPtr)) ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0)); - if (!Alignment) - Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); - - ST->setAlignment(Align(Alignment)); Value *V = propagateMetadata(ST, E->Scalars); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -4445,13 +4511,19 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (Function *FI = CI->getCalledFunction()) IID = FI->getIntrinsicID(); + Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); + + auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI); + bool UseIntrinsic = ID != Intrinsic::not_intrinsic && + VecCallCosts.first <= VecCallCosts.second; + Value *ScalarArg = nullptr; std::vector<Value *> OpVecs; for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) { ValueList OpVL; // Some intrinsics have scalar arguments. This argument should not be // vectorized. - if (hasVectorInstrinsicScalarOpd(IID, j)) { + if (UseIntrinsic && hasVectorInstrinsicScalarOpd(IID, j)) { CallInst *CEI = cast<CallInst>(VL0); ScalarArg = CEI->getArgOperand(j); OpVecs.push_back(CEI->getArgOperand(j)); @@ -4463,10 +4535,17 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { OpVecs.push_back(OpVec); } - Module *M = F->getParent(); - Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) }; - Function *CF = Intrinsic::getDeclaration(M, ID, Tys); + Function *CF; + if (!UseIntrinsic) { + VFShape Shape = VFShape::get( + *CI, {static_cast<unsigned>(VecTy->getNumElements()), false}, + false /*HasGlobalPred*/); + CF = VFDatabase(*CI).getVectorizedFunction(Shape); + } else { + Type *Tys[] = {FixedVectorType::get(CI->getType(), E->Scalars.size())}; + CF = Intrinsic::getDeclaration(F->getParent(), ID, Tys); + } + SmallVector<OperandBundleDef, 1> OpBundles; CI->getOperandBundlesAsDefs(OpBundles); Value *V = Builder.CreateCall(CF, OpVecs, OpBundles); @@ -4527,24 +4606,23 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { // each vector operation. 
ValueList OpScalars, AltScalars; unsigned e = E->Scalars.size(); - SmallVector<Constant *, 8> Mask(e); + SmallVector<int, 8> Mask(e); for (unsigned i = 0; i < e; ++i) { auto *OpInst = cast<Instruction>(E->Scalars[i]); assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); if (OpInst->getOpcode() == E->getAltOpcode()) { - Mask[i] = Builder.getInt32(e + i); + Mask[i] = e + i; AltScalars.push_back(E->Scalars[i]); } else { - Mask[i] = Builder.getInt32(i); + Mask[i] = i; OpScalars.push_back(E->Scalars[i]); } } - Value *ShuffleMask = ConstantVector::get(Mask); propagateIRFlags(V0, OpScalars); propagateIRFlags(V1, AltScalars); - Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); + Value *V = Builder.CreateShuffleVector(V0, V1, Mask); if (Instruction *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); if (NeedToShuffleReuses) { @@ -4586,7 +4664,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { Builder.SetInsertPoint(&*++BasicBlock::iterator(I)); auto BundleWidth = VectorizableTree[0]->Scalars.size(); auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); - auto *VecTy = VectorType::get(MinTy, BundleWidth); + auto *VecTy = FixedVectorType::get(MinTy, BundleWidth); auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy); VectorizableTree[0]->VectorizedValue = Trunc; } @@ -4715,6 +4793,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { } Builder.ClearInsertionPoint(); + InstrElementSize.clear(); return VectorizableTree[0]->VectorizedValue; } @@ -5251,20 +5330,26 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { BS->ScheduleStart = nullptr; } -unsigned BoUpSLP::getVectorElementSize(Value *V) const { +unsigned BoUpSLP::getVectorElementSize(Value *V) { // If V is a store, just return the width of the stored value without // traversing the expression tree. This is the common case. if (auto *Store = dyn_cast<StoreInst>(V)) return DL->getTypeSizeInBits(Store->getValueOperand()->getType()); + auto E = InstrElementSize.find(V); + if (E != InstrElementSize.end()) + return E->second; + // If V is not a store, we can traverse the expression tree to find loads // that feed it. The type of the loaded value may indicate a more suitable // width than V's type. We want to base the vector element size on the width // of memory operations where possible. SmallVector<Instruction *, 16> Worklist; SmallPtrSet<Instruction *, 16> Visited; - if (auto *I = dyn_cast<Instruction>(V)) + if (auto *I = dyn_cast<Instruction>(V)) { Worklist.push_back(I); + Visited.insert(I); + } // Traverse the expression tree in bottom-up order looking for loads. If we // encounter an instruction we don't yet handle, we give up. @@ -5272,7 +5357,6 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) const { auto FoundUnknownInst = false; while (!Worklist.empty() && !FoundUnknownInst) { auto *I = Worklist.pop_back_val(); - Visited.insert(I); // We should only be looking at scalar instructions here. If the current // instruction has a vector type, give up. 
@@ -5292,7 +5376,7 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) const { isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) { for (Use &U : I->operands()) if (auto *J = dyn_cast<Instruction>(U.get())) - if (!Visited.count(J)) + if (Visited.insert(J).second) Worklist.push_back(J); } @@ -5301,13 +5385,17 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) const { FoundUnknownInst = true; } + int Width = MaxWidth; // If we didn't encounter a memory access in the expression tree, or if we - // gave up for some reason, just return the width of V. + // gave up for some reason, just return the width of V. Otherwise, return the + // maximum width we found. if (!MaxWidth || FoundUnknownInst) - return DL->getTypeSizeInBits(V->getType()); + Width = DL->getTypeSizeInBits(V->getType()); - // Otherwise, return the maximum width we found. - return MaxWidth; + for (Instruction *I : Visited) + InstrElementSize[I] = Width; + + return Width; } // Determine if a value V in a vectorizable expression Expr can be demoted to a @@ -5560,6 +5648,7 @@ struct SLPVectorizer : public FunctionPass { AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<DemandedBitsWrapperPass>(); AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); + AU.addRequired<InjectTLIMappingsLegacy>(); AU.addPreserved<LoopInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<AAResultsWrapperPass>(); @@ -5598,6 +5687,8 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_) { + if (!RunSLPVectorization) + return false; SE = SE_; TTI = TTI_; TLI = TLI_; @@ -5657,7 +5748,6 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, if (Changed) { R.optimizeGatherSequence(); LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n"); - LLVM_DEBUG(verifyFunction(F)); } return Changed; } @@ -5688,6 +5778,8 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, } if (R.isTreeTinyAndNotFullyVectorizable()) return false; + if (R.isLoadCombineCandidate()) + return false; R.computeMinimumValueSizes(); @@ -5841,37 +5933,28 @@ void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) { bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { if (!A || !B) return false; - Value *VL[] = { A, B }; - return tryToVectorizeList(VL, R, /*UserCost=*/0, true); + Value *VL[] = {A, B}; + return tryToVectorizeList(VL, R, /*AllowReorder=*/true); } bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, - int UserCost, bool AllowReorder) { + bool AllowReorder, + ArrayRef<Value *> InsertUses) { if (VL.size() < 2) return false; LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = " << VL.size() << ".\n"); - // Check that all of the parts are scalar instructions of the same type, + // Check that all of the parts are instructions of the same type, // we permit an alternate opcode via InstructionsState. 
InstructionsState S = getSameOpcode(VL); if (!S.getOpcode()) return false; Instruction *I0 = cast<Instruction>(S.OpValue); - unsigned Sz = R.getVectorElementSize(I0); - unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz); - unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF); - if (MaxVF < 2) { - R.getORE()->emit([&]() { - return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0) - << "Cannot SLP vectorize list: vectorization factor " - << "less than 2 is not supported"; - }); - return false; - } - + // Make sure invalid types (including vector type) are rejected before + // determining vectorization factor for scalar instructions. for (Value *V : VL) { Type *Ty = V->getType(); if (!isValidElementType(Ty)) { @@ -5889,16 +5972,35 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, } } + unsigned Sz = R.getVectorElementSize(I0); + unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz); + unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF); + if (MaxVF < 2) { + R.getORE()->emit([&]() { + return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0) + << "Cannot SLP vectorize list: vectorization factor " + << "less than 2 is not supported"; + }); + return false; + } + bool Changed = false; bool CandidateFound = false; int MinCost = SLPCostThreshold; + bool CompensateUseCost = + !InsertUses.empty() && llvm::all_of(InsertUses, [](const Value *V) { + return V && isa<InsertElementInst>(V); + }); + assert((!CompensateUseCost || InsertUses.size() == VL.size()) && + "Each scalar expected to have an associated InsertElement user."); + unsigned NextInst = 0, MaxInst = VL.size(); for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) { // No actual vectorization should happen, if number of parts is the same as // provided vectorization factor (i.e. the scalar type is used for vector // code during codegen). - auto *VecTy = VectorType::get(VL[0]->getType(), VF); + auto *VecTy = FixedVectorType::get(VL[0]->getType(), VF); if (TTI->getNumberOfParts(VecTy) == VF) continue; for (unsigned I = NextInst; I < MaxInst; ++I) { @@ -5940,8 +6042,48 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, continue; R.computeMinimumValueSizes(); - int Cost = R.getTreeCost() - UserCost; + int Cost = R.getTreeCost(); CandidateFound = true; + if (CompensateUseCost) { + // TODO: Use TTI's getScalarizationOverhead for sequence of inserts + // rather than sum of single inserts as the latter may overestimate + // cost. This work should imply improving cost estimation for extracts + // that added in for external (for vectorization tree) users,i.e. that + // part should also switch to same interface. + // For example, the following case is projected code after SLP: + // %4 = extractelement <4 x i64> %3, i32 0 + // %v0 = insertelement <4 x i64> undef, i64 %4, i32 0 + // %5 = extractelement <4 x i64> %3, i32 1 + // %v1 = insertelement <4 x i64> %v0, i64 %5, i32 1 + // %6 = extractelement <4 x i64> %3, i32 2 + // %v2 = insertelement <4 x i64> %v1, i64 %6, i32 2 + // %7 = extractelement <4 x i64> %3, i32 3 + // %v3 = insertelement <4 x i64> %v2, i64 %7, i32 3 + // + // Extracts here added by SLP in order to feed users (the inserts) of + // original scalars and contribute to "ExtractCost" at cost evaluation. + // The inserts in turn form sequence to build an aggregate that + // detected by findBuildAggregate routine. 
+ // SLP makes an assumption that such sequence will be optimized away + // later (instcombine) so it tries to compensate ExctractCost with + // cost of insert sequence. + // Current per element cost calculation approach is not quite accurate + // and tends to create bias toward favoring vectorization. + // Switching to the TTI interface might help a bit. + // Alternative solution could be pattern-match to detect a no-op or + // shuffle. + unsigned UserCost = 0; + for (unsigned Lane = 0; Lane < OpsWidth; Lane++) { + auto *IE = cast<InsertElementInst>(InsertUses[I + Lane]); + if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) + UserCost += TTI->getVectorInstrCost( + Instruction::InsertElement, IE->getType(), CI->getZExtValue()); + } + LLVM_DEBUG(dbgs() << "SLP: Compensate cost of users by: " << UserCost + << ".\n"); + Cost -= UserCost; + } + MinCost = std::min(MinCost, Cost); if (Cost < -SLPCostThreshold) { @@ -6031,24 +6173,23 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) { /// <0,2,...> or <1,3,..> while a splitting reduction will generate /// <2,3, undef,undef> for a vector of 4 and NumElts = 2. /// \param IsLeft True will generate a mask of even elements, odd otherwise. -static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx, - bool IsPairwise, bool IsLeft, - IRBuilder<> &Builder) { +static SmallVector<int, 32> createRdxShuffleMask(unsigned VecLen, + unsigned NumEltsToRdx, + bool IsPairwise, bool IsLeft) { assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask"); - SmallVector<Constant *, 32> ShuffleMask( - VecLen, UndefValue::get(Builder.getInt32Ty())); + SmallVector<int, 32> ShuffleMask(VecLen, -1); if (IsPairwise) // Build a mask of 0, 2, ... (left) or 1, 3, ... (right). for (unsigned i = 0; i != NumEltsToRdx; ++i) - ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft); + ShuffleMask[i] = 2 * i + !IsLeft; else // Move the upper half of the vector to the lower half. for (unsigned i = 0; i != NumEltsToRdx; ++i) - ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i); + ShuffleMask[i] = NumEltsToRdx + i; - return ConstantVector::get(ShuffleMask); + return ShuffleMask; } namespace { @@ -6840,7 +6981,7 @@ private: int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal, unsigned ReduxWidth) { Type *ScalarTy = FirstReducedVal->getType(); - Type *VecTy = VectorType::get(ScalarTy, ReduxWidth); + auto *VecTy = FixedVectorType::get(ScalarTy, ReduxWidth); int PairwiseRdxCost; int SplittingRdxCost; @@ -6857,7 +6998,7 @@ private: case RK_Max: case RK_UMin: case RK_UMax: { - Type *VecCondTy = CmpInst::makeCmpResultType(VecTy); + auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VecTy)); bool IsUnsigned = ReductionData.getKind() == RK_UMin || ReductionData.getKind() == RK_UMax; PairwiseRdxCost = @@ -6922,10 +7063,8 @@ private: Value *TmpVec = VectorizedValue; for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) { - Value *LeftMask = - createRdxShuffleMask(ReduxWidth, i, true, true, Builder); - Value *RightMask = - createRdxShuffleMask(ReduxWidth, i, true, false, Builder); + auto LeftMask = createRdxShuffleMask(ReduxWidth, i, true, true); + auto RightMask = createRdxShuffleMask(ReduxWidth, i, true, false); Value *LeftShuf = Builder.CreateShuffleVector( TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l"); @@ -6960,20 +7099,16 @@ private: /// \return true if it matches. 
static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl<Value *> &BuildVectorOpds, - int &UserCost) { + SmallVectorImpl<Value *> &InsertElts) { assert((isa<InsertElementInst>(LastInsertInst) || isa<InsertValueInst>(LastInsertInst)) && "Expected insertelement or insertvalue instruction!"); - UserCost = 0; do { Value *InsertedOperand; - if (auto *IE = dyn_cast<InsertElementInst>(LastInsertInst)) { + auto *IE = dyn_cast<InsertElementInst>(LastInsertInst); + if (IE) { InsertedOperand = IE->getOperand(1); LastInsertInst = IE->getOperand(0); - if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) { - UserCost += TTI->getVectorInstrCost(Instruction::InsertElement, - IE->getType(), CI->getZExtValue()); - } } else { auto *IV = cast<InsertValueInst>(LastInsertInst); InsertedOperand = IV->getInsertedValueOperand(); @@ -6981,16 +7116,17 @@ static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI, } if (isa<InsertElementInst>(InsertedOperand) || isa<InsertValueInst>(InsertedOperand)) { - int TmpUserCost; SmallVector<Value *, 8> TmpBuildVectorOpds; + SmallVector<Value *, 8> TmpInsertElts; if (!findBuildAggregate(InsertedOperand, TTI, TmpBuildVectorOpds, - TmpUserCost)) + TmpInsertElts)) return false; BuildVectorOpds.append(TmpBuildVectorOpds.rbegin(), TmpBuildVectorOpds.rend()); - UserCost += TmpUserCost; + InsertElts.append(TmpInsertElts.rbegin(), TmpInsertElts.rend()); } else { BuildVectorOpds.push_back(InsertedOperand); + InsertElts.push_back(IE); } if (isa<UndefValue>(LastInsertInst)) break; @@ -7000,6 +7136,7 @@ static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI, return false; } while (true); std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end()); + std::reverse(InsertElts.begin(), InsertElts.end()); return true; } @@ -7164,26 +7301,29 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V, bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, BasicBlock *BB, BoUpSLP &R) { - int UserCost = 0; const DataLayout &DL = BB->getModule()->getDataLayout(); if (!R.canMapToVector(IVI->getType(), DL)) return false; SmallVector<Value *, 16> BuildVectorOpds; - if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, UserCost)) + SmallVector<Value *, 16> BuildVectorInsts; + if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts) || + BuildVectorOpds.size() < 2) return false; LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n"); // Aggregate value is unlikely to be processed in vector register, we need to // extract scalars into scalar registers, so NeedExtraction is set true. - return tryToVectorizeList(BuildVectorOpds, R, UserCost); + return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false, + BuildVectorInsts); } bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, BasicBlock *BB, BoUpSLP &R) { - int UserCost; + SmallVector<Value *, 16> BuildVectorInsts; SmallVector<Value *, 16> BuildVectorOpds; - if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, UserCost) || + if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) || + BuildVectorOpds.size() < 2 || (llvm::all_of(BuildVectorOpds, [](Value *V) { return isa<ExtractElementInst>(V); }) && isShuffle(BuildVectorOpds))) @@ -7191,7 +7331,8 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, // Vectorize starting with the build vector operands ignoring the BuildVector // instructions for the purpose of scheduling and user extraction. 
- return tryToVectorizeList(BuildVectorOpds, R, UserCost); + return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false, + BuildVectorInsts); } bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB, @@ -7228,6 +7369,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { bool Changed = false; SmallVector<Value *, 4> Incoming; SmallPtrSet<Value *, 16> VisitedInstrs; + unsigned MaxVecRegSize = R.getMaxVecRegSize(); bool HaveVectorizedPhiNodes = true; while (HaveVectorizedPhiNodes) { @@ -7254,8 +7396,18 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // Look for the next elements with the same type. SmallVector<Value *, 4>::iterator SameTypeIt = IncIt; + Type *EltTy = (*IncIt)->getType(); + unsigned EltSize = EltTy->isSized() ? DL->getTypeSizeInBits(EltTy) + : MaxVecRegSize; + unsigned MaxNumElts = MaxVecRegSize / EltSize; + if (MaxNumElts < 2) { + ++IncIt; + continue; + } + while (SameTypeIt != E && - (*SameTypeIt)->getType() == (*IncIt)->getType()) { + (*SameTypeIt)->getType() == EltTy && + (SameTypeIt - IncIt) < MaxNumElts) { VisitedInstrs.insert(*SameTypeIt); ++SameTypeIt; } @@ -7269,8 +7421,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // is done when there are exactly two elements since tryToVectorizeList // asserts that there are only two values when AllowReorder is true. bool AllowReorder = NumElts == 2; - if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, - /*UserCost=*/0, AllowReorder)) { + if (NumElts > 1 && + tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, AllowReorder)) { // Success start over because instructions might have been changed. HaveVectorizedPhiNodes = true; Changed = true; @@ -7370,9 +7522,12 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) { << Entry.second.size() << ".\n"); // Process the GEP list in chunks suitable for the target's supported - // vector size. If a vector register can't hold 1 element, we are done. + // vector size. If a vector register can't hold 1 element, we are done. We + // are trying to vectorize the index computations, so the maximum number of + // elements is based on the size of the index expression, rather than the + // size of the GEP itself (the target's pointer size). unsigned MaxVecRegSize = R.getMaxVecRegSize(); - unsigned EltSize = R.getVectorElementSize(Entry.second[0]); + unsigned EltSize = R.getVectorElementSize(*Entry.second[0]->idx_begin()); if (MaxVecRegSize < EltSize) continue; @@ -7475,6 +7630,7 @@ INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) +INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false) Pass *llvm::createSLPVectorizerPass() { return new SLPVectorizer(); } |
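
The central functional change in this patch is that the SLP vectorizer now considers TLI-mapped vector library functions (via VFDatabase) in addition to trivially vectorizable intrinsics when deciding whether a bundle of identical scalar calls can be widened, and then lets getVectorCallCosts pick the cheaper of the two forms. Below is a minimal sketch of that legality check only; the helper name canWidenCall and its signature are illustrative assumptions, not upstream code — in the patch the check is performed inline in buildTree_rec and the cost comparison lives in getVectorCallCosts.

```cpp
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Sketch (hypothetical helper): a call bundle of width VF is vectorizable if
// either a vector intrinsic or a vector library variant of the callee exists.
static bool canWidenCall(CallInst *CI, unsigned VF,
                         const TargetLibraryInfo *TLI) {
  // Intrinsic ID for the call, or Intrinsic::not_intrinsic if none applies.
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

  // Query the VFDatabase (populated by the inject-tli-mappings pass) for a
  // fixed-width, unpredicated vector variant of this call, e.g. a libmvec or
  // SVML mapping for sinf with VF lanes.
  VFShape Shape = VFShape::get(*CI, {VF, /*Scalable=*/false},
                               /*HasGlobalPred=*/false);
  Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);

  // If neither form exists, the bundle has to be gathered instead.
  return VecFunc || isTriviallyVectorizable(ID);
}
```

This is also why the patch adds the InjectTLIMappingsLegacy pass dependency and the InjectTLIMappings.h include: without the vector-function ABI attributes that pass attaches, VFDatabase has no library mappings to report.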